// SPDX-License-Identifier: GPL-2.0-only
/*
 * Stand-alone page-table allocator for hyp stage-1 and guest stage-2.
 * No bombay mix was harmed in the writing of this file.
 *
 * Copyright (C) 2020 Google LLC
 * Author: Will Deacon <will@kernel.org>
 */

#include <linux/bitfield.h>
#include <asm/kvm_pgtable.h>
#include <asm/stage2_pgtable.h>

#define KVM_PTE_TYPE			BIT(1)
#define KVM_PTE_TYPE_BLOCK		0
#define KVM_PTE_TYPE_PAGE		1
#define KVM_PTE_TYPE_TABLE		1

#define KVM_PTE_LEAF_ATTR_LO		GENMASK(11, 2)

#define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX	GENMASK(4, 2)
#define KVM_PTE_LEAF_ATTR_LO_S1_AP	GENMASK(7, 6)
#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO	\
	({ cpus_have_final_cap(ARM64_KVM_HVHE) ? 2 : 3; })
#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW	\
	({ cpus_have_final_cap(ARM64_KVM_HVHE) ? 0 : 1; })
#define KVM_PTE_LEAF_ATTR_LO_S1_SH	GENMASK(9, 8)
#define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS	3
#define KVM_PTE_LEAF_ATTR_LO_S1_AF	BIT(10)

#define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR	GENMASK(5, 2)
#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R	BIT(6)
#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W	BIT(7)
#define KVM_PTE_LEAF_ATTR_LO_S2_SH	GENMASK(9, 8)
#define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS	3
#define KVM_PTE_LEAF_ATTR_LO_S2_AF	BIT(10)

#define KVM_PTE_LEAF_ATTR_HI		GENMASK(63, 50)

#define KVM_PTE_LEAF_ATTR_HI_SW		GENMASK(58, 55)

#define KVM_PTE_LEAF_ATTR_HI_S1_XN	BIT(54)

#define KVM_PTE_LEAF_ATTR_HI_S2_XN	BIT(54)

#define KVM_PTE_LEAF_ATTR_HI_S1_GP	BIT(50)

#define KVM_PTE_LEAF_ATTR_S2_PERMS	(KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \
					 KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \
					 KVM_PTE_LEAF_ATTR_HI_S2_XN)

#define KVM_INVALID_PTE_OWNER_MASK	GENMASK(9, 2)
#define KVM_MAX_OWNER_ID		1
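/*
 * Worked example (illustration only): kvm_init_invalid_leaf_owner(1), defined
 * below, yields the invalid PTE value 0x4 - the valid bit is clear and
 * owner-id 1 is encoded in bits [9:2] - which stage2_pte_is_counted() still
 * counts against the containing table page.
 */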
/*
 * Used to indicate a pte for which a 'break-before-make' sequence is in
 * progress.
 */
#define KVM_INVALID_PTE_LOCKED		BIT(10)

struct kvm_pgtable_walk_data {
	struct kvm_pgtable_walker	*walker;

	const u64			start;
	u64				addr;
	const u64			end;
};

static bool kvm_pgtable_walk_skip_bbm_tlbi(const struct kvm_pgtable_visit_ctx *ctx)
{
	return unlikely(ctx->flags & KVM_PGTABLE_WALK_SKIP_BBM_TLBI);
}

static bool kvm_pgtable_walk_skip_cmo(const struct kvm_pgtable_visit_ctx *ctx)
{
	return unlikely(ctx->flags & KVM_PGTABLE_WALK_SKIP_CMO);
}

static bool kvm_phys_is_valid(u64 phys)
{
	return phys < BIT(id_aa64mmfr0_parange_to_phys_shift(ID_AA64MMFR0_EL1_PARANGE_MAX));
}

static bool kvm_block_mapping_supported(const struct kvm_pgtable_visit_ctx *ctx, u64 phys)
{
	u64 granule = kvm_granule_size(ctx->level);

	if (!kvm_level_supports_block_mapping(ctx->level))
		return false;

	if (granule > (ctx->end - ctx->addr))
		return false;

	if (kvm_phys_is_valid(phys) && !IS_ALIGNED(phys, granule))
		return false;

	return IS_ALIGNED(ctx->addr, granule);
}

static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, u32 level)
{
	u64 shift = kvm_granule_shift(level);
	u64 mask = BIT(PAGE_SHIFT - 3) - 1;

	return (data->addr >> shift) & mask;
}

static u32 kvm_pgd_page_idx(struct kvm_pgtable *pgt, u64 addr)
{
	u64 shift = kvm_granule_shift(pgt->start_level - 1); /* May underflow */
	u64 mask = BIT(pgt->ia_bits) - 1;

	return (addr & mask) >> shift;
}

static u32 kvm_pgd_pages(u32 ia_bits, u32 start_level)
{
	struct kvm_pgtable pgt = {
		.ia_bits	= ia_bits,
		.start_level	= start_level,
	};

	return kvm_pgd_page_idx(&pgt, -1ULL) + 1;
}
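/*
 * Worked example (assuming 4KiB pages, so PAGE_SHIFT == 12): for a stage-2
 * configuration with ia_bits == 40 and start_level == 1, kvm_granule_shift(0)
 * is 39, so kvm_pgd_page_idx() selects the PGD page using IPA bit 39 and
 * kvm_pgd_pages() returns (((1ULL << 40) - 1) >> 39) + 1 == 2 concatenated
 * pages at the start level. Within each table, kvm_pgtable_idx() then
 * consumes PAGE_SHIFT - 3 == 9 index bits per level.
 */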
static bool kvm_pte_table(kvm_pte_t pte, u32 level)
{
	if (level == KVM_PGTABLE_MAX_LEVELS - 1)
		return false;

	if (!kvm_pte_valid(pte))
		return false;

	return FIELD_GET(KVM_PTE_TYPE, pte) == KVM_PTE_TYPE_TABLE;
}

static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte, struct kvm_pgtable_mm_ops *mm_ops)
{
	return mm_ops->phys_to_virt(kvm_pte_to_phys(pte));
}

static void kvm_clear_pte(kvm_pte_t *ptep)
{
	WRITE_ONCE(*ptep, 0);
}

static kvm_pte_t kvm_init_table_pte(kvm_pte_t *childp, struct kvm_pgtable_mm_ops *mm_ops)
{
	kvm_pte_t pte = kvm_phys_to_pte(mm_ops->virt_to_phys(childp));

	pte |= FIELD_PREP(KVM_PTE_TYPE, KVM_PTE_TYPE_TABLE);
	pte |= KVM_PTE_VALID;
	return pte;
}

static kvm_pte_t kvm_init_valid_leaf_pte(u64 pa, kvm_pte_t attr, u32 level)
{
	kvm_pte_t pte = kvm_phys_to_pte(pa);
	u64 type = (level == KVM_PGTABLE_MAX_LEVELS - 1) ? KVM_PTE_TYPE_PAGE :
							   KVM_PTE_TYPE_BLOCK;

	pte |= attr & (KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI);
	pte |= FIELD_PREP(KVM_PTE_TYPE, type);
	pte |= KVM_PTE_VALID;

	return pte;
}

static kvm_pte_t kvm_init_invalid_leaf_owner(u8 owner_id)
{
	return FIELD_PREP(KVM_INVALID_PTE_OWNER_MASK, owner_id);
}

static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data,
				  const struct kvm_pgtable_visit_ctx *ctx,
				  enum kvm_pgtable_walk_flags visit)
{
	struct kvm_pgtable_walker *walker = data->walker;

	/* Ensure the appropriate lock is held (e.g. RCU lock for stage-2 MMU) */
	WARN_ON_ONCE(kvm_pgtable_walk_shared(ctx) && !kvm_pgtable_walk_lock_held());
	return walker->cb(ctx, visit);
}

static bool kvm_pgtable_walk_continue(const struct kvm_pgtable_walker *walker,
				      int r)
{
	/*
	 * Visitor callbacks return EAGAIN when the conditions that led to a
	 * fault are no longer reflected in the page tables due to a race to
	 * update a PTE. In the context of a fault handler this is interpreted
	 * as a signal to retry guest execution.
	 *
	 * Ignore the return code altogether for walkers outside a fault handler
	 * (e.g. write protecting a range of memory) and chug along with the
	 * page table walk.
	 */
	if (r == -EAGAIN)
		return !(walker->flags & KVM_PGTABLE_WALK_HANDLE_FAULT);

	return !r;
}

static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
			      struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, u32 level);

static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data,
				      struct kvm_pgtable_mm_ops *mm_ops,
				      kvm_pteref_t pteref, u32 level)
{
	enum kvm_pgtable_walk_flags flags = data->walker->flags;
	kvm_pte_t *ptep = kvm_dereference_pteref(data->walker, pteref);
	struct kvm_pgtable_visit_ctx ctx = {
		.ptep	= ptep,
		.old	= READ_ONCE(*ptep),
		.arg	= data->walker->arg,
		.mm_ops	= mm_ops,
		.start	= data->start,
		.addr	= data->addr,
		.end	= data->end,
		.level	= level,
		.flags	= flags,
	};
	int ret = 0;
	bool reload = false;
	kvm_pteref_t childp;
	bool table = kvm_pte_table(ctx.old, level);

	if (table && (ctx.flags & KVM_PGTABLE_WALK_TABLE_PRE)) {
		ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_TABLE_PRE);
		reload = true;
	}

	if (!table && (ctx.flags & KVM_PGTABLE_WALK_LEAF)) {
		ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_LEAF);
		reload = true;
	}

	/*
	 * Reload the page table after invoking the walker callback for leaf
	 * entries or after pre-order traversal, to allow the walker to descend
	 * into a newly installed or replaced table.
	 */
	if (reload) {
		ctx.old = READ_ONCE(*ptep);
		table = kvm_pte_table(ctx.old, level);
	}

	if (!kvm_pgtable_walk_continue(data->walker, ret))
		goto out;

	if (!table) {
		data->addr = ALIGN_DOWN(data->addr, kvm_granule_size(level));
		data->addr += kvm_granule_size(level);
		goto out;
	}

	childp = (kvm_pteref_t)kvm_pte_follow(ctx.old, mm_ops);
	ret = __kvm_pgtable_walk(data, mm_ops, childp, level + 1);
	if (!kvm_pgtable_walk_continue(data->walker, ret))
		goto out;

	if (ctx.flags & KVM_PGTABLE_WALK_TABLE_POST)
		ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_TABLE_POST);

out:
	if (kvm_pgtable_walk_continue(data->walker, ret))
		return 0;

	return ret;
}

static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
			      struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, u32 level)
{
	u32 idx;
	int ret = 0;

	if (WARN_ON_ONCE(level >= KVM_PGTABLE_MAX_LEVELS))
		return -EINVAL;

	for (idx = kvm_pgtable_idx(data, level); idx < PTRS_PER_PTE; ++idx) {
		kvm_pteref_t pteref = &pgtable[idx];

		if (data->addr >= data->end)
			break;

		ret = __kvm_pgtable_visit(data, mm_ops, pteref, level);
		if (ret)
			break;
	}

	return ret;
}

static int _kvm_pgtable_walk(struct kvm_pgtable *pgt, struct kvm_pgtable_walk_data *data)
{
	u32 idx;
	int ret = 0;
	u64 limit = BIT(pgt->ia_bits);

	if (data->addr > limit || data->end > limit)
		return -ERANGE;

	if (!pgt->pgd)
		return -EINVAL;

	for (idx = kvm_pgd_page_idx(pgt, data->addr); data->addr < data->end; ++idx) {
		kvm_pteref_t pteref = &pgt->pgd[idx * PTRS_PER_PTE];

		ret = __kvm_pgtable_walk(data, pgt->mm_ops, pteref, pgt->start_level);
		if (ret)
			break;
	}

	return ret;
}

int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
		     struct kvm_pgtable_walker *walker)
{
	struct kvm_pgtable_walk_data walk_data = {
		.start	= ALIGN_DOWN(addr, PAGE_SIZE),
		.addr	= ALIGN_DOWN(addr, PAGE_SIZE),
		.end	= PAGE_ALIGN(walk_data.addr + size),
		.walker	= walker,
	};
	int r;

	r = kvm_pgtable_walk_begin(walker);
	if (r)
		return r;

	r = _kvm_pgtable_walk(pgt, &walk_data);
	kvm_pgtable_walk_end(walker);

	return r;
}

struct leaf_walk_data {
	kvm_pte_t	pte;
	u32		level;
};

static int leaf_walker(const struct kvm_pgtable_visit_ctx *ctx,
		       enum kvm_pgtable_walk_flags visit)
{
	struct leaf_walk_data *data = ctx->arg;

	data->pte   = ctx->old;
	data->level = ctx->level;

	return 0;
}

int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr,
			 kvm_pte_t *ptep, u32 *level)
{
	struct leaf_walk_data data;
	struct kvm_pgtable_walker walker = {
		.cb	= leaf_walker,
		.flags	= KVM_PGTABLE_WALK_LEAF,
		.arg	= &data,
	};
	int ret;

	ret = kvm_pgtable_walk(pgt, ALIGN_DOWN(addr, PAGE_SIZE),
			       PAGE_SIZE, &walker);
	if (!ret) {
		if (ptep)
			*ptep  = data.pte;
		if (level)
			*level = data.level;
	}

	return ret;
}
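/*
 * kvm_pgtable_get_leaf() above doubles as a template for using the generic
 * walker. A minimal sketch (illustration only; count_valid_walker is a
 * hypothetical caller-defined callback, not part of this file):
 *
 *	static int count_valid_walker(const struct kvm_pgtable_visit_ctx *ctx,
 *				      enum kvm_pgtable_walk_flags visit)
 *	{
 *		u64 *count = ctx->arg;
 *
 *		if (kvm_pte_valid(ctx->old))
 *			(*count)++;
 *		return 0;
 *	}
 *
 *	u64 count = 0;
 *	struct kvm_pgtable_walker walker = {
 *		.cb	= count_valid_walker,
 *		.arg	= &count,
 *		.flags	= KVM_PGTABLE_WALK_LEAF,
 *	};
 *
 *	ret = kvm_pgtable_walk(pgt, addr, size, &walker);
 */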
struct hyp_map_data {
	const u64	phys;
	kvm_pte_t	attr;
};

static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, kvm_pte_t *ptep)
{
	bool device = prot & KVM_PGTABLE_PROT_DEVICE;
	u32 mtype = device ? MT_DEVICE_nGnRE : MT_NORMAL;
	kvm_pte_t attr = FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX, mtype);
	u32 sh = KVM_PTE_LEAF_ATTR_LO_S1_SH_IS;
	u32 ap = (prot & KVM_PGTABLE_PROT_W) ? KVM_PTE_LEAF_ATTR_LO_S1_AP_RW :
					       KVM_PTE_LEAF_ATTR_LO_S1_AP_RO;

	if (!(prot & KVM_PGTABLE_PROT_R))
		return -EINVAL;

	if (prot & KVM_PGTABLE_PROT_X) {
		if (prot & KVM_PGTABLE_PROT_W)
			return -EINVAL;

		if (device)
			return -EINVAL;

		if (IS_ENABLED(CONFIG_ARM64_BTI_KERNEL) && system_supports_bti())
			attr |= KVM_PTE_LEAF_ATTR_HI_S1_GP;
	} else {
		attr |= KVM_PTE_LEAF_ATTR_HI_S1_XN;
	}

	attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_AP, ap);
	attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH, sh);
	attr |= KVM_PTE_LEAF_ATTR_LO_S1_AF;
	attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW;
	*ptep = attr;

	return 0;
}

enum kvm_pgtable_prot kvm_pgtable_hyp_pte_prot(kvm_pte_t pte)
{
	enum kvm_pgtable_prot prot = pte & KVM_PTE_LEAF_ATTR_HI_SW;
	u32 ap;

	if (!kvm_pte_valid(pte))
		return prot;

	if (!(pte & KVM_PTE_LEAF_ATTR_HI_S1_XN))
		prot |= KVM_PGTABLE_PROT_X;

	ap = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S1_AP, pte);
	if (ap == KVM_PTE_LEAF_ATTR_LO_S1_AP_RO)
		prot |= KVM_PGTABLE_PROT_R;
	else if (ap == KVM_PTE_LEAF_ATTR_LO_S1_AP_RW)
		prot |= KVM_PGTABLE_PROT_RW;

	return prot;
}

static bool hyp_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx,
				    struct hyp_map_data *data)
{
	u64 phys = data->phys + (ctx->addr - ctx->start);
	kvm_pte_t new;

	if (!kvm_block_mapping_supported(ctx, phys))
		return false;

	new = kvm_init_valid_leaf_pte(phys, data->attr, ctx->level);
	if (ctx->old == new)
		return true;
	if (!kvm_pte_valid(ctx->old))
		ctx->mm_ops->get_page(ctx->ptep);
	else if (WARN_ON((ctx->old ^ new) & ~KVM_PTE_LEAF_ATTR_HI_SW))
		return false;

	smp_store_release(ctx->ptep, new);
	return true;
}

static int hyp_map_walker(const struct kvm_pgtable_visit_ctx *ctx,
			  enum kvm_pgtable_walk_flags visit)
{
	kvm_pte_t *childp, new;
	struct hyp_map_data *data = ctx->arg;
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;

	if (hyp_map_walker_try_leaf(ctx, data))
		return 0;

	if (WARN_ON(ctx->level == KVM_PGTABLE_MAX_LEVELS - 1))
		return -EINVAL;

	childp = (kvm_pte_t *)mm_ops->zalloc_page(NULL);
	if (!childp)
		return -ENOMEM;

	new = kvm_init_table_pte(childp, mm_ops);
	mm_ops->get_page(ctx->ptep);
	smp_store_release(ctx->ptep, new);

	return 0;
}

int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
			enum kvm_pgtable_prot prot)
{
	int ret;
	struct hyp_map_data map_data = {
		.phys	= ALIGN_DOWN(phys, PAGE_SIZE),
	};
	struct kvm_pgtable_walker walker = {
		.cb	= hyp_map_walker,
		.flags	= KVM_PGTABLE_WALK_LEAF,
		.arg	= &map_data,
	};

	ret = hyp_set_prot_attr(prot, &map_data.attr);
	if (ret)
		return ret;

	ret = kvm_pgtable_walk(pgt, addr, size, &walker);
	dsb(ishst);
	isb();
	return ret;
}
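/*
 * Note on the barriers above (a summary, not new behaviour): the hyp map path
 * never breaks an existing valid mapping (see hyp_map_walker_try_leaf()), so
 * no TLB invalidation is needed here; the dsb(ishst) makes the new entries
 * visible to the EL2 page-table walker and the isb() synchronises the CPU
 * before the caller relies on the new mapping.
 */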
static int hyp_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx,
			    enum kvm_pgtable_walk_flags visit)
{
	kvm_pte_t *childp = NULL;
	u64 granule = kvm_granule_size(ctx->level);
	u64 *unmapped = ctx->arg;
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;

	if (!kvm_pte_valid(ctx->old))
		return -EINVAL;

	if (kvm_pte_table(ctx->old, ctx->level)) {
		childp = kvm_pte_follow(ctx->old, mm_ops);

		if (mm_ops->page_count(childp) != 1)
			return 0;

		kvm_clear_pte(ctx->ptep);
		dsb(ishst);
		__tlbi_level(vae2is, __TLBI_VADDR(ctx->addr, 0), ctx->level);
	} else {
		if (ctx->end - ctx->addr < granule)
			return -EINVAL;

		kvm_clear_pte(ctx->ptep);
		dsb(ishst);
		__tlbi_level(vale2is, __TLBI_VADDR(ctx->addr, 0), ctx->level);
		*unmapped += granule;
	}

	dsb(ish);
	isb();
	mm_ops->put_page(ctx->ptep);

	if (childp)
		mm_ops->put_page(childp);

	return 0;
}

u64 kvm_pgtable_hyp_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
{
	u64 unmapped = 0;
	struct kvm_pgtable_walker walker = {
		.cb	= hyp_unmap_walker,
		.arg	= &unmapped,
		.flags	= KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
	};

	if (!pgt->mm_ops->page_count)
		return 0;

	kvm_pgtable_walk(pgt, addr, size, &walker);
	return unmapped;
}

int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits,
			 struct kvm_pgtable_mm_ops *mm_ops)
{
	u64 levels = ARM64_HW_PGTABLE_LEVELS(va_bits);

	pgt->pgd = (kvm_pteref_t)mm_ops->zalloc_page(NULL);
	if (!pgt->pgd)
		return -ENOMEM;

	pgt->ia_bits		= va_bits;
	pgt->start_level	= KVM_PGTABLE_MAX_LEVELS - levels;
	pgt->mm_ops		= mm_ops;
	pgt->mmu		= NULL;
	pgt->force_pte_cb	= NULL;

	return 0;
}

static int hyp_free_walker(const struct kvm_pgtable_visit_ctx *ctx,
			   enum kvm_pgtable_walk_flags visit)
{
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;

	if (!kvm_pte_valid(ctx->old))
		return 0;

	mm_ops->put_page(ctx->ptep);

	if (kvm_pte_table(ctx->old, ctx->level))
		mm_ops->put_page(kvm_pte_follow(ctx->old, mm_ops));

	return 0;
}

void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt)
{
	struct kvm_pgtable_walker walker = {
		.cb	= hyp_free_walker,
		.flags	= KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
	};

	WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
	pgt->mm_ops->put_page(kvm_dereference_pteref(&walker, pgt->pgd));
	pgt->pgd = NULL;
}
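/*
 * Putting the hyp (stage-1) API together - an illustrative sequence only,
 * with error handling elided and 'mm_ops', 'hyp_va_bits', 'va' and 'pa'
 * standing in for whatever the real caller provides:
 *
 *	struct kvm_pgtable pgt;
 *
 *	kvm_pgtable_hyp_init(&pgt, hyp_va_bits, mm_ops);
 *	kvm_pgtable_hyp_map(&pgt, va, PAGE_SIZE, pa,
 *			    KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W);
 *	...
 *	kvm_pgtable_hyp_unmap(&pgt, va, PAGE_SIZE);
 *	kvm_pgtable_hyp_destroy(&pgt);
 */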
struct stage2_map_data {
	const u64			phys;
	kvm_pte_t			attr;
	u8				owner_id;

	kvm_pte_t			*anchor;
	kvm_pte_t			*childp;

	struct kvm_s2_mmu		*mmu;
	void				*memcache;

	/* Force mappings to page granularity */
	bool				force_pte;
};

u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift)
{
	u64 vtcr = VTCR_EL2_FLAGS;
	u8 lvls;

	vtcr |= kvm_get_parange(mmfr0) << VTCR_EL2_PS_SHIFT;
	vtcr |= VTCR_EL2_T0SZ(phys_shift);
	/*
	 * Use a minimum 2 level page table to prevent splitting
	 * host PMD huge pages at stage2.
	 */
	lvls = stage2_pgtable_levels(phys_shift);
	if (lvls < 2)
		lvls = 2;
	vtcr |= VTCR_EL2_LVLS_TO_SL0(lvls);

#ifdef CONFIG_ARM64_HW_AFDBM
	/*
	 * Enable the Hardware Access Flag management, unconditionally
	 * on all CPUs. In systems that have asymmetric support for the feature
	 * this allows KVM to leverage hardware support on the subset of cores
	 * that implement the feature.
	 *
	 * The architecture requires VTCR_EL2.HA to be RES0 (thus ignored by
	 * hardware) on implementations that do not advertise support for the
	 * feature. As such, setting HA unconditionally is safe, unless you
	 * happen to be running on a design that has unadvertised support for
	 * HAFDBS. Here be dragons.
	 */
	if (!cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38))
		vtcr |= VTCR_EL2_HA;
#endif /* CONFIG_ARM64_HW_AFDBM */

	/* Set the vmid bits */
	vtcr |= (get_vmid_bits(mmfr1) == 16) ?
		VTCR_EL2_VS_16BIT :
		VTCR_EL2_VS_8BIT;

	return vtcr;
}

static bool stage2_has_fwb(struct kvm_pgtable *pgt)
{
	if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
		return false;

	return !(pgt->flags & KVM_PGTABLE_S2_NOFWB);
}

void kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu,
			      phys_addr_t addr, size_t size)
{
	unsigned long pages, inval_pages;

	if (!system_supports_tlb_range()) {
		kvm_call_hyp(__kvm_tlb_flush_vmid, mmu);
		return;
	}

	pages = size >> PAGE_SHIFT;
	while (pages > 0) {
		inval_pages = min(pages, MAX_TLBI_RANGE_PAGES);
		kvm_call_hyp(__kvm_tlb_flush_vmid_range, mmu, addr, inval_pages);

		addr += inval_pages << PAGE_SHIFT;
		pages -= inval_pages;
	}
}

#define KVM_S2_MEMATTR(pgt, attr) PAGE_S2_MEMATTR(attr, stage2_has_fwb(pgt))

static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot prot,
				kvm_pte_t *ptep)
{
	bool device = prot & KVM_PGTABLE_PROT_DEVICE;
	kvm_pte_t attr = device ? KVM_S2_MEMATTR(pgt, DEVICE_nGnRE) :
				  KVM_S2_MEMATTR(pgt, NORMAL);
	u32 sh = KVM_PTE_LEAF_ATTR_LO_S2_SH_IS;

	if (!(prot & KVM_PGTABLE_PROT_X))
		attr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;
	else if (device)
		return -EINVAL;

	if (prot & KVM_PGTABLE_PROT_R)
		attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R;

	if (prot & KVM_PGTABLE_PROT_W)
		attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;

	attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh);
	attr |= KVM_PTE_LEAF_ATTR_LO_S2_AF;
	attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW;
	*ptep = attr;

	return 0;
}

enum kvm_pgtable_prot kvm_pgtable_stage2_pte_prot(kvm_pte_t pte)
{
	enum kvm_pgtable_prot prot = pte & KVM_PTE_LEAF_ATTR_HI_SW;

	if (!kvm_pte_valid(pte))
		return prot;

	if (pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R)
		prot |= KVM_PGTABLE_PROT_R;
	if (pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W)
		prot |= KVM_PGTABLE_PROT_W;
	if (!(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN))
		prot |= KVM_PGTABLE_PROT_X;

	return prot;
}

static bool stage2_pte_needs_update(kvm_pte_t old, kvm_pte_t new)
{
	if (!kvm_pte_valid(old) || !kvm_pte_valid(new))
		return true;

	return ((old ^ new) & (~KVM_PTE_LEAF_ATTR_S2_PERMS));
}

static bool stage2_pte_is_counted(kvm_pte_t pte)
{
	/*
	 * The refcount tracks valid entries as well as invalid entries if they
	 * encode ownership of a page to another entity than the page-table
	 * owner, whose id is 0.
	 */
	return !!pte;
}

static bool stage2_pte_is_locked(kvm_pte_t pte)
{
	return !kvm_pte_valid(pte) && (pte & KVM_INVALID_PTE_LOCKED);
}

static bool stage2_try_set_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t new)
{
	if (!kvm_pgtable_walk_shared(ctx)) {
		WRITE_ONCE(*ctx->ptep, new);
		return true;
	}

	return cmpxchg(ctx->ptep, ctx->old, new) == ctx->old;
}
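/*
 * Note on stage2_try_set_pte(): exclusive walkers always "win" and use a
 * plain WRITE_ONCE(), whereas KVM_PGTABLE_WALK_SHARED walkers can lose the
 * cmpxchg() against a concurrent update. Callers turn that failure into
 * -EAGAIN, which fault handlers interpret as "retry the guest" (see
 * kvm_pgtable_walk_continue()).
 */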
/**
 * stage2_try_break_pte() - Invalidates a pte according to the
 *			    'break-before-make' requirements of the
 *			    architecture.
 *
 * @ctx: context of the visited pte.
 * @mmu: stage-2 mmu
 *
 * Returns: true if the pte was successfully broken.
 *
 * If the removed pte was valid, performs the necessary serialization and TLB
 * invalidation for the old value. For counted ptes, drops the reference count
 * on the containing table page.
 */
static bool stage2_try_break_pte(const struct kvm_pgtable_visit_ctx *ctx,
				 struct kvm_s2_mmu *mmu)
{
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;

	if (stage2_pte_is_locked(ctx->old)) {
		/*
		 * Should never occur if this walker has exclusive access to the
		 * page tables.
		 */
		WARN_ON(!kvm_pgtable_walk_shared(ctx));
		return false;
	}

	if (!stage2_try_set_pte(ctx, KVM_INVALID_PTE_LOCKED))
		return false;

	if (!kvm_pgtable_walk_skip_bbm_tlbi(ctx)) {
		/*
		 * Perform the appropriate TLB invalidation based on the
		 * evicted pte value (if any).
		 */
		if (kvm_pte_table(ctx->old, ctx->level)) {
			u64 size = kvm_granule_size(ctx->level);
			u64 addr = ALIGN_DOWN(ctx->addr, size);

			kvm_tlb_flush_vmid_range(mmu, addr, size);
		} else if (kvm_pte_valid(ctx->old)) {
			kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu,
				     ctx->addr, ctx->level);
		}
	}

	if (stage2_pte_is_counted(ctx->old))
		mm_ops->put_page(ctx->ptep);

	return true;
}

static void stage2_make_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t new)
{
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;

	WARN_ON(!stage2_pte_is_locked(*ctx->ptep));

	if (stage2_pte_is_counted(new))
		mm_ops->get_page(ctx->ptep);

	smp_store_release(ctx->ptep, new);
}

static bool stage2_unmap_defer_tlb_flush(struct kvm_pgtable *pgt)
{
	/*
	 * If FEAT_TLBIRANGE is implemented, defer the individual
	 * TLB invalidations until the entire walk is finished, and
	 * then use the range-based TLBI instructions to do the
	 * invalidations. Condition deferred TLB invalidation on the
	 * system supporting FWB as the optimization is entirely
	 * pointless when the unmap walker needs to perform CMOs.
	 */
	return system_supports_tlb_range() && stage2_has_fwb(pgt);
}

static void stage2_unmap_put_pte(const struct kvm_pgtable_visit_ctx *ctx,
				 struct kvm_s2_mmu *mmu,
				 struct kvm_pgtable_mm_ops *mm_ops)
{
	struct kvm_pgtable *pgt = ctx->arg;

	/*
	 * Clear the existing PTE, and perform break-before-make if it was
	 * valid. Depending on the system support, defer the TLB maintenance
	 * for the same until the entire unmap walk is completed.
	 */
	if (kvm_pte_valid(ctx->old)) {
		kvm_clear_pte(ctx->ptep);

		if (!stage2_unmap_defer_tlb_flush(pgt))
			kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu,
				     ctx->addr, ctx->level);
	}

	mm_ops->put_page(ctx->ptep);
}

static bool stage2_pte_cacheable(struct kvm_pgtable *pgt, kvm_pte_t pte)
{
	u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR;
	return memattr == KVM_S2_MEMATTR(pgt, NORMAL);
}

static bool stage2_pte_executable(kvm_pte_t pte)
{
	return !(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN);
}

static u64 stage2_map_walker_phys_addr(const struct kvm_pgtable_visit_ctx *ctx,
				       const struct stage2_map_data *data)
{
	u64 phys = data->phys;

	/*
	 * Stage-2 walks to update ownership data are communicated to the map
	 * walker using an invalid PA. Avoid offsetting an already invalid PA,
	 * which could overflow and make the address valid again.
	 */
	if (!kvm_phys_is_valid(phys))
		return phys;

	/*
	 * Otherwise, work out the correct PA based on how far the walk has
	 * gotten.
	 */
	return phys + (ctx->addr - ctx->start);
}
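/*
 * Worked example (illustration only): for a mapping whose walk started at
 * IPA 0x80000000 backed by PA 0x100000000, a visit at ctx->addr ==
 * 0x80200000 resolves to 0x100000000 + 0x200000 == 0x100200000. An
 * ownership-annotation walk passes KVM_PHYS_INVALID instead and gets the
 * invalid PA back unchanged.
 */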
static bool stage2_leaf_mapping_allowed(const struct kvm_pgtable_visit_ctx *ctx,
					struct stage2_map_data *data)
{
	u64 phys = stage2_map_walker_phys_addr(ctx, data);

	if (data->force_pte && (ctx->level < (KVM_PGTABLE_MAX_LEVELS - 1)))
		return false;

	return kvm_block_mapping_supported(ctx, phys);
}

static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx,
				      struct stage2_map_data *data)
{
	kvm_pte_t new;
	u64 phys = stage2_map_walker_phys_addr(ctx, data);
	u64 granule = kvm_granule_size(ctx->level);
	struct kvm_pgtable *pgt = data->mmu->pgt;
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;

	if (!stage2_leaf_mapping_allowed(ctx, data))
		return -E2BIG;

	if (kvm_phys_is_valid(phys))
		new = kvm_init_valid_leaf_pte(phys, data->attr, ctx->level);
	else
		new = kvm_init_invalid_leaf_owner(data->owner_id);

	/*
	 * Skip updating the PTE if we are trying to recreate the exact
	 * same mapping or only change the access permissions. Instead,
	 * the vCPU will exit one more time from guest if still needed
	 * and then go through the path of relaxing permissions.
	 */
	if (!stage2_pte_needs_update(ctx->old, new))
		return -EAGAIN;

	if (!stage2_try_break_pte(ctx, data->mmu))
		return -EAGAIN;

	/* Perform CMOs before installation of the guest stage-2 PTE */
	if (!kvm_pgtable_walk_skip_cmo(ctx) && mm_ops->dcache_clean_inval_poc &&
	    stage2_pte_cacheable(pgt, new))
		mm_ops->dcache_clean_inval_poc(kvm_pte_follow(new, mm_ops),
					       granule);

	if (!kvm_pgtable_walk_skip_cmo(ctx) && mm_ops->icache_inval_pou &&
	    stage2_pte_executable(new))
		mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), granule);

	stage2_make_pte(ctx, new);

	return 0;
}

static int stage2_map_walk_table_pre(const struct kvm_pgtable_visit_ctx *ctx,
				     struct stage2_map_data *data)
{
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
	kvm_pte_t *childp = kvm_pte_follow(ctx->old, mm_ops);
	int ret;

	if (!stage2_leaf_mapping_allowed(ctx, data))
		return 0;

	ret = stage2_map_walker_try_leaf(ctx, data);
	if (ret)
		return ret;

	mm_ops->free_unlinked_table(childp, ctx->level);
	return 0;
}

static int stage2_map_walk_leaf(const struct kvm_pgtable_visit_ctx *ctx,
				struct stage2_map_data *data)
{
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
	kvm_pte_t *childp, new;
	int ret;

	ret = stage2_map_walker_try_leaf(ctx, data);
	if (ret != -E2BIG)
		return ret;

	if (WARN_ON(ctx->level == KVM_PGTABLE_MAX_LEVELS - 1))
		return -EINVAL;

	if (!data->memcache)
		return -ENOMEM;

	childp = mm_ops->zalloc_page(data->memcache);
	if (!childp)
		return -ENOMEM;

	if (!stage2_try_break_pte(ctx, data->mmu)) {
		mm_ops->put_page(childp);
		return -EAGAIN;
	}

	/*
	 * If we've run into an existing block mapping then replace it with
	 * a table. Accesses beyond 'end' that fall within the new table
	 * will be mapped lazily.
	 */
	new = kvm_init_table_pte(childp, mm_ops);
	stage2_make_pte(ctx, new);

	return 0;
}
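/*
 * Error-code convention used above (a summary, not new behaviour):
 * stage2_map_walker_try_leaf() returns -E2BIG when no block/page mapping is
 * possible at this level, prompting stage2_map_walk_leaf() to install a table
 * and let the walk descend; it returns -EAGAIN when the existing PTE is
 * already suitable or was changed under a shared walk, which fault handlers
 * treat as "let the vCPU retry".
 */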
/*
 * The TABLE_PRE callback runs for table entries on the way down, looking
 * for table entries which we could conceivably replace with a block entry
 * for this mapping. If it finds one it replaces the entry and calls
 * kvm_pgtable_mm_ops::free_unlinked_table() to tear down the detached table.
 *
 * Otherwise, the LEAF callback performs the mapping at the existing leaves
 * instead.
 */
static int stage2_map_walker(const struct kvm_pgtable_visit_ctx *ctx,
			     enum kvm_pgtable_walk_flags visit)
{
	struct stage2_map_data *data = ctx->arg;

	switch (visit) {
	case KVM_PGTABLE_WALK_TABLE_PRE:
		return stage2_map_walk_table_pre(ctx, data);
	case KVM_PGTABLE_WALK_LEAF:
		return stage2_map_walk_leaf(ctx, data);
	default:
		return -EINVAL;
	}
}

int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
			   u64 phys, enum kvm_pgtable_prot prot,
			   void *mc, enum kvm_pgtable_walk_flags flags)
{
	int ret;
	struct stage2_map_data map_data = {
		.phys		= ALIGN_DOWN(phys, PAGE_SIZE),
		.mmu		= pgt->mmu,
		.memcache	= mc,
		.force_pte	= pgt->force_pte_cb && pgt->force_pte_cb(addr, addr + size, prot),
	};
	struct kvm_pgtable_walker walker = {
		.cb		= stage2_map_walker,
		.flags		= flags |
				  KVM_PGTABLE_WALK_TABLE_PRE |
				  KVM_PGTABLE_WALK_LEAF,
		.arg		= &map_data,
	};

	if (WARN_ON((pgt->flags & KVM_PGTABLE_S2_IDMAP) && (addr != phys)))
		return -EINVAL;

	ret = stage2_set_prot_attr(pgt, prot, &map_data.attr);
	if (ret)
		return ret;

	ret = kvm_pgtable_walk(pgt, addr, size, &walker);
	dsb(ishst);
	return ret;
}

int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size,
				 void *mc, u8 owner_id)
{
	int ret;
	struct stage2_map_data map_data = {
		.phys		= KVM_PHYS_INVALID,
		.mmu		= pgt->mmu,
		.memcache	= mc,
		.owner_id	= owner_id,
		.force_pte	= true,
	};
	struct kvm_pgtable_walker walker = {
		.cb		= stage2_map_walker,
		.flags		= KVM_PGTABLE_WALK_TABLE_PRE |
				  KVM_PGTABLE_WALK_LEAF,
		.arg		= &map_data,
	};

	if (owner_id > KVM_MAX_OWNER_ID)
		return -EINVAL;

	ret = kvm_pgtable_walk(pgt, addr, size, &walker);
	return ret;
}
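/*
 * Typical use of kvm_pgtable_stage2_map() - a sketch only; the real callers
 * live in the stage-2 fault handling code and in the pKVM hypervisor, and
 * 'fault_ipa', 'pfn' and 'memcache' are placeholders:
 *
 *	ret = kvm_pgtable_stage2_map(pgt, fault_ipa, PAGE_SIZE,
 *				     pfn << PAGE_SHIFT,
 *				     KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W,
 *				     memcache,
 *				     KVM_PGTABLE_WALK_HANDLE_FAULT |
 *				     KVM_PGTABLE_WALK_SHARED);
 *
 * -EAGAIN from this call means another vCPU changed the PTE under our feet;
 * the faulting vCPU simply re-executes the instruction.
 */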
static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx,
			       enum kvm_pgtable_walk_flags visit)
{
	struct kvm_pgtable *pgt = ctx->arg;
	struct kvm_s2_mmu *mmu = pgt->mmu;
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
	kvm_pte_t *childp = NULL;
	bool need_flush = false;

	if (!kvm_pte_valid(ctx->old)) {
		if (stage2_pte_is_counted(ctx->old)) {
			kvm_clear_pte(ctx->ptep);
			mm_ops->put_page(ctx->ptep);
		}
		return 0;
	}

	if (kvm_pte_table(ctx->old, ctx->level)) {
		childp = kvm_pte_follow(ctx->old, mm_ops);

		if (mm_ops->page_count(childp) != 1)
			return 0;
	} else if (stage2_pte_cacheable(pgt, ctx->old)) {
		need_flush = !stage2_has_fwb(pgt);
	}

	/*
	 * This is similar to the map() path in that we unmap the entire
	 * block entry and rely on the remaining portions being faulted
	 * back lazily.
	 */
	stage2_unmap_put_pte(ctx, mmu, mm_ops);

	if (need_flush && mm_ops->dcache_clean_inval_poc)
		mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops),
					       kvm_granule_size(ctx->level));

	if (childp)
		mm_ops->put_page(childp);

	return 0;
}

int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
{
	int ret;
	struct kvm_pgtable_walker walker = {
		.cb	= stage2_unmap_walker,
		.arg	= pgt,
		.flags	= KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
	};

	ret = kvm_pgtable_walk(pgt, addr, size, &walker);
	if (stage2_unmap_defer_tlb_flush(pgt))
		/* Perform the deferred TLB invalidations */
		kvm_tlb_flush_vmid_range(pgt->mmu, addr, size);

	return ret;
}

struct stage2_attr_data {
	kvm_pte_t	attr_set;
	kvm_pte_t	attr_clr;
	kvm_pte_t	pte;
	u32		level;
};

static int stage2_attr_walker(const struct kvm_pgtable_visit_ctx *ctx,
			      enum kvm_pgtable_walk_flags visit)
{
	kvm_pte_t pte = ctx->old;
	struct stage2_attr_data *data = ctx->arg;
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;

	if (!kvm_pte_valid(ctx->old))
		return -EAGAIN;

	data->level = ctx->level;
	data->pte = pte;
	pte &= ~data->attr_clr;
	pte |= data->attr_set;

	/*
	 * We may race with the CPU trying to set the access flag here,
	 * but worst-case the access flag update gets lost and will be
	 * set on the next access instead.
	 */
	if (data->pte != pte) {
		/*
		 * Invalidate instruction cache before updating the guest
		 * stage-2 PTE if we are going to add executable permission.
		 */
		if (mm_ops->icache_inval_pou &&
		    stage2_pte_executable(pte) && !stage2_pte_executable(ctx->old))
			mm_ops->icache_inval_pou(kvm_pte_follow(pte, mm_ops),
						 kvm_granule_size(ctx->level));

		if (!stage2_try_set_pte(ctx, pte))
			return -EAGAIN;
	}

	return 0;
}

static int stage2_update_leaf_attrs(struct kvm_pgtable *pgt, u64 addr,
				    u64 size, kvm_pte_t attr_set,
				    kvm_pte_t attr_clr, kvm_pte_t *orig_pte,
				    u32 *level, enum kvm_pgtable_walk_flags flags)
{
	int ret;
	kvm_pte_t attr_mask = KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI;
	struct stage2_attr_data data = {
		.attr_set	= attr_set & attr_mask,
		.attr_clr	= attr_clr & attr_mask,
	};
	struct kvm_pgtable_walker walker = {
		.cb		= stage2_attr_walker,
		.arg		= &data,
		.flags		= flags | KVM_PGTABLE_WALK_LEAF,
	};

	ret = kvm_pgtable_walk(pgt, addr, size, &walker);
	if (ret)
		return ret;

	if (orig_pte)
		*orig_pte = data.pte;

	if (level)
		*level = data.level;
	return 0;
}

int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size)
{
	return stage2_update_leaf_attrs(pgt, addr, size, 0,
					KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W,
					NULL, NULL, 0);
}

kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr)
{
	kvm_pte_t pte = 0;
	int ret;

	ret = stage2_update_leaf_attrs(pgt, addr, 1, KVM_PTE_LEAF_ATTR_LO_S2_AF, 0,
				       &pte, NULL,
				       KVM_PGTABLE_WALK_HANDLE_FAULT |
				       KVM_PGTABLE_WALK_SHARED);
	if (!ret)
		dsb(ishst);

	return pte;
}

struct stage2_age_data {
	bool	mkold;
	bool	young;
};
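/*
 * How the two halves fit together: kvm_pgtable_stage2_mkyoung() above sets
 * the Access Flag on a single PTE in response to an access fault
 * (stage2_update_leaf_attrs() with attr_set == KVM_PTE_LEAF_ATTR_LO_S2_AF),
 * while stage2_age_walker() below reports - and, when 'mkold' is set,
 * clears - the flag as the MMU notifiers test and age a range.
 */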
static int stage2_age_walker(const struct kvm_pgtable_visit_ctx *ctx,
			     enum kvm_pgtable_walk_flags visit)
{
	kvm_pte_t new = ctx->old & ~KVM_PTE_LEAF_ATTR_LO_S2_AF;
	struct stage2_age_data *data = ctx->arg;

	if (!kvm_pte_valid(ctx->old) || new == ctx->old)
		return 0;

	data->young = true;

	/*
	 * stage2_age_walker() is always called while holding the MMU lock for
	 * write, so this will always succeed. Nonetheless, this deliberately
	 * follows the race detection pattern of the other stage-2 walkers in
	 * case the locking mechanics of the MMU notifiers is ever changed.
	 */
	if (data->mkold && !stage2_try_set_pte(ctx, new))
		return -EAGAIN;

	/*
	 * "But where's the TLBI?!", you scream.
	 * "Over in the core code", I sigh.
	 *
	 * See the '->clear_flush_young()' callback on the KVM mmu notifier.
	 */
	return 0;
}

bool kvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr,
					 u64 size, bool mkold)
{
	struct stage2_age_data data = {
		.mkold		= mkold,
	};
	struct kvm_pgtable_walker walker = {
		.cb		= stage2_age_walker,
		.arg		= &data,
		.flags		= KVM_PGTABLE_WALK_LEAF,
	};

	WARN_ON(kvm_pgtable_walk(pgt, addr, size, &walker));
	return data.young;
}

int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
				   enum kvm_pgtable_prot prot)
{
	int ret;
	u32 level;
	kvm_pte_t set = 0, clr = 0;

	if (prot & KVM_PTE_LEAF_ATTR_HI_SW)
		return -EINVAL;

	if (prot & KVM_PGTABLE_PROT_R)
		set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R;

	if (prot & KVM_PGTABLE_PROT_W)
		set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;

	if (prot & KVM_PGTABLE_PROT_X)
		clr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;

	ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level,
				       KVM_PGTABLE_WALK_HANDLE_FAULT |
				       KVM_PGTABLE_WALK_SHARED);
	if (!ret)
		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa_nsh, pgt->mmu, addr, level);
	return ret;
}

static int stage2_flush_walker(const struct kvm_pgtable_visit_ctx *ctx,
			       enum kvm_pgtable_walk_flags visit)
{
	struct kvm_pgtable *pgt = ctx->arg;
	struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops;

	if (!kvm_pte_valid(ctx->old) || !stage2_pte_cacheable(pgt, ctx->old))
		return 0;

	if (mm_ops->dcache_clean_inval_poc)
		mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops),
					       kvm_granule_size(ctx->level));
	return 0;
}

int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size)
{
	struct kvm_pgtable_walker walker = {
		.cb	= stage2_flush_walker,
		.flags	= KVM_PGTABLE_WALK_LEAF,
		.arg	= pgt,
	};

	if (stage2_has_fwb(pgt))
		return 0;

	return kvm_pgtable_walk(pgt, addr, size, &walker);
}
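/*
 * Why the early return above: with FEAT_S2FWB the stage-2 descriptors force
 * guest accesses to be cacheable, so there is nothing to clean and the walk
 * is skipped. Without FWB, data the host wrote through a cacheable alias
 * must be cleaned and invalidated to the PoC before a guest mapping it with
 * weaker attributes can be relied upon to see it, which is what
 * stage2_flush_walker() does.
 */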
kvm_pte_t *kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt,
					      u64 phys, u32 level,
					      enum kvm_pgtable_prot prot,
					      void *mc, bool force_pte)
{
	struct stage2_map_data map_data = {
		.phys		= phys,
		.mmu		= pgt->mmu,
		.memcache	= mc,
		.force_pte	= force_pte,
	};
	struct kvm_pgtable_walker walker = {
		.cb		= stage2_map_walker,
		.flags		= KVM_PGTABLE_WALK_LEAF |
				  KVM_PGTABLE_WALK_SKIP_BBM_TLBI |
				  KVM_PGTABLE_WALK_SKIP_CMO,
		.arg		= &map_data,
	};
	/*
	 * The input address (.addr) is irrelevant for walking an
	 * unlinked table. Construct an ambiguous IA range to map
	 * kvm_granule_size(level) worth of memory.
	 */
	struct kvm_pgtable_walk_data data = {
		.walker	= &walker,
		.addr	= 0,
		.end	= kvm_granule_size(level),
	};
	struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops;
	kvm_pte_t *pgtable;
	int ret;

	if (!IS_ALIGNED(phys, kvm_granule_size(level)))
		return ERR_PTR(-EINVAL);

	ret = stage2_set_prot_attr(pgt, prot, &map_data.attr);
	if (ret)
		return ERR_PTR(ret);

	pgtable = mm_ops->zalloc_page(mc);
	if (!pgtable)
		return ERR_PTR(-ENOMEM);

	ret = __kvm_pgtable_walk(&data, mm_ops, (kvm_pteref_t)pgtable,
				 level + 1);
	if (ret) {
		kvm_pgtable_stage2_free_unlinked(mm_ops, pgtable, level);
		mm_ops->put_page(pgtable);
		return ERR_PTR(ret);
	}

	return pgtable;
}

/*
 * Get the number of page-tables needed to replace a block with a
 * fully populated tree up to the PTE entries. Note that @level is
 * interpreted as in "level @level entry".
 */
static int stage2_block_get_nr_page_tables(u32 level)
{
	switch (level) {
	case 1:
		return PTRS_PER_PTE + 1;
	case 2:
		return 1;
	case 3:
		return 0;
	default:
		WARN_ON_ONCE(level < KVM_PGTABLE_MIN_BLOCK_LEVEL ||
			     level >= KVM_PGTABLE_MAX_LEVELS);
		return -EINVAL;
	};
}
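/*
 * Worked example (assuming 4KiB pages, PTRS_PER_PTE == 512): splitting a
 * level-1 block (1GiB) down to pages needs one level-2 table plus 512
 * level-3 tables, i.e. PTRS_PER_PTE + 1 == 513 pages; a level-2 block (2MiB)
 * needs a single page of PTEs; a level-3 entry is already a page, so 0.
 */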
static int stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx,
			       enum kvm_pgtable_walk_flags visit)
{
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
	struct kvm_mmu_memory_cache *mc = ctx->arg;
	struct kvm_s2_mmu *mmu;
	kvm_pte_t pte = ctx->old, new, *childp;
	enum kvm_pgtable_prot prot;
	u32 level = ctx->level;
	bool force_pte;
	int nr_pages;
	u64 phys;

	/* No huge-pages exist at the last level */
	if (level == KVM_PGTABLE_MAX_LEVELS - 1)
		return 0;

	/* We only split valid block mappings */
	if (!kvm_pte_valid(pte))
		return 0;

	nr_pages = stage2_block_get_nr_page_tables(level);
	if (nr_pages < 0)
		return nr_pages;

	if (mc->nobjs >= nr_pages) {
		/* Build a tree mapped down to the PTE granularity. */
		force_pte = true;
	} else {
		/*
		 * Don't force PTEs, so create_unlinked() below does
		 * not populate the tree up to the PTE level. The
		 * consequence is that the call will require a single
		 * page of level 2 entries at level 1, or a single
		 * page of PTEs at level 2. If we are at level 1, the
		 * PTEs will be created recursively.
		 */
		force_pte = false;
		nr_pages = 1;
	}

	if (mc->nobjs < nr_pages)
		return -ENOMEM;

	mmu = container_of(mc, struct kvm_s2_mmu, split_page_cache);
	phys = kvm_pte_to_phys(pte);
	prot = kvm_pgtable_stage2_pte_prot(pte);

	childp = kvm_pgtable_stage2_create_unlinked(mmu->pgt, phys,
						    level, prot, mc, force_pte);
	if (IS_ERR(childp))
		return PTR_ERR(childp);

	if (!stage2_try_break_pte(ctx, mmu)) {
		kvm_pgtable_stage2_free_unlinked(mm_ops, childp, level);
		mm_ops->put_page(childp);
		return -EAGAIN;
	}

	/*
	 * Note, the contents of the page table are guaranteed to be made
	 * visible before the new PTE is assigned because stage2_make_pte()
	 * writes the PTE using smp_store_release().
	 */
	new = kvm_init_table_pte(childp, mm_ops);
	stage2_make_pte(ctx, new);
	dsb(ishst);
	return 0;
}

int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
			     struct kvm_mmu_memory_cache *mc)
{
	struct kvm_pgtable_walker walker = {
		.cb	= stage2_split_walker,
		.flags	= KVM_PGTABLE_WALK_LEAF,
		.arg	= mc,
	};

	return kvm_pgtable_walk(pgt, addr, size, &walker);
}

int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
			      struct kvm_pgtable_mm_ops *mm_ops,
			      enum kvm_pgtable_stage2_flags flags,
			      kvm_pgtable_force_pte_cb_t force_pte_cb)
{
	size_t pgd_sz;
	u64 vtcr = mmu->arch->vtcr;
	u32 ia_bits = VTCR_EL2_IPA(vtcr);
	u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr);
	u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0;

	pgd_sz = kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE;
	pgt->pgd = (kvm_pteref_t)mm_ops->zalloc_pages_exact(pgd_sz);
	if (!pgt->pgd)
		return -ENOMEM;

	pgt->ia_bits		= ia_bits;
	pgt->start_level	= start_level;
	pgt->mm_ops		= mm_ops;
	pgt->mmu		= mmu;
	pgt->flags		= flags;
	pgt->force_pte_cb	= force_pte_cb;

	/* Ensure zeroed PGD pages are visible to the hardware walker */
	dsb(ishst);
	return 0;
}

size_t kvm_pgtable_stage2_pgd_size(u64 vtcr)
{
	u32 ia_bits = VTCR_EL2_IPA(vtcr);
	u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr);
	u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0;

	return kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE;
}

static int stage2_free_walker(const struct kvm_pgtable_visit_ctx *ctx,
			      enum kvm_pgtable_walk_flags visit)
{
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;

	if (!stage2_pte_is_counted(ctx->old))
		return 0;

	mm_ops->put_page(ctx->ptep);

	if (kvm_pte_table(ctx->old, ctx->level))
		mm_ops->put_page(kvm_pte_follow(ctx->old, mm_ops));

	return 0;
}

void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
{
	size_t pgd_sz;
	struct kvm_pgtable_walker walker = {
		.cb	= stage2_free_walker,
		.flags	= KVM_PGTABLE_WALK_LEAF |
			  KVM_PGTABLE_WALK_TABLE_POST,
	};

	WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
	pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE;
	pgt->mm_ops->free_pages_exact(kvm_dereference_pteref(&walker, pgt->pgd), pgd_sz);
	pgt->pgd = NULL;
}

void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, u32 level)
{
	kvm_pteref_t ptep = (kvm_pteref_t)pgtable;
	struct kvm_pgtable_walker walker = {
		.cb	= stage2_free_walker,
		.flags	= KVM_PGTABLE_WALK_LEAF |
			  KVM_PGTABLE_WALK_TABLE_POST,
	};
	struct kvm_pgtable_walk_data data = {
		.walker	= &walker,

		/*
		 * At this point the IPA really doesn't matter, as the page
		 * table being traversed has already been removed from the stage
		 * 2. Set an appropriate range to cover the entire page table.
		 */
		.addr	= 0,
		.end	= kvm_granule_size(level),
	};

	WARN_ON(__kvm_pgtable_walk(&data, mm_ops, ptep, level + 1));

	WARN_ON(mm_ops->page_count(pgtable) != 1);
	mm_ops->put_page(pgtable);
}
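/*
 * End-to-end sketch of the stage-2 API (illustrative only; error handling is
 * elided and 'mmu', 'mm_ops', 'ipa', 'pa', 'mc' stand in for what the real
 * callers provide). Write-protection would typically be followed by
 * kvm_pgtable_stage2_split() when dirty logging wants PTE-granule mappings:
 *
 *	struct kvm_pgtable pgt;
 *
 *	__kvm_pgtable_stage2_init(&pgt, mmu, mm_ops, 0, NULL);
 *	kvm_pgtable_stage2_map(&pgt, ipa, PAGE_SIZE, pa,
 *			       KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W |
 *			       KVM_PGTABLE_PROT_X, mc, 0);
 *	kvm_pgtable_stage2_wrprotect(&pgt, ipa, PAGE_SIZE);
 *	kvm_pgtable_stage2_unmap(&pgt, ipa, PAGE_SIZE);
 *	kvm_pgtable_stage2_destroy(&pgt);
 */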