// SPDX-License-Identifier: GPL-2.0-only
/*
 * Stand-alone page-table allocator for hyp stage-1 and guest stage-2.
 * No bombay mix was harmed in the writing of this file.
 *
 * Copyright (C) 2020 Google LLC
 * Author: Will Deacon <will@kernel.org>
 */

#include <linux/bitfield.h>
#include <asm/kvm_pgtable.h>
#include <asm/stage2_pgtable.h>


#define KVM_PTE_TYPE			BIT(1)
#define KVM_PTE_TYPE_BLOCK		0
#define KVM_PTE_TYPE_PAGE		1
#define KVM_PTE_TYPE_TABLE		1

#define KVM_PTE_LEAF_ATTR_LO		GENMASK(11, 2)

#define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX	GENMASK(4, 2)
#define KVM_PTE_LEAF_ATTR_LO_S1_AP	GENMASK(7, 6)
#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO		\
	({ cpus_have_final_cap(ARM64_KVM_HVHE) ? 2 : 3; })
#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW		\
	({ cpus_have_final_cap(ARM64_KVM_HVHE) ? 0 : 1; })
#define KVM_PTE_LEAF_ATTR_LO_S1_SH	GENMASK(9, 8)
#define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS	3
#define KVM_PTE_LEAF_ATTR_LO_S1_AF	BIT(10)

#define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR	GENMASK(5, 2)
#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R	BIT(6)
#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W	BIT(7)
#define KVM_PTE_LEAF_ATTR_LO_S2_SH	GENMASK(9, 8)
#define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS	3
#define KVM_PTE_LEAF_ATTR_LO_S2_AF	BIT(10)

#define KVM_PTE_LEAF_ATTR_HI		GENMASK(63, 50)

#define KVM_PTE_LEAF_ATTR_HI_SW		GENMASK(58, 55)

#define KVM_PTE_LEAF_ATTR_HI_S1_XN	BIT(54)

#define KVM_PTE_LEAF_ATTR_HI_S2_XN	BIT(54)

#define KVM_PTE_LEAF_ATTR_HI_S1_GP	BIT(50)

#define KVM_PTE_LEAF_ATTR_S2_PERMS	(KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \
					 KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \
					 KVM_PTE_LEAF_ATTR_HI_S2_XN)

#define KVM_INVALID_PTE_OWNER_MASK	GENMASK(9, 2)
#define KVM_MAX_OWNER_ID		1

/*
 * Used to indicate a pte for which a 'break-before-make' sequence is in
 * progress.
 */
#define KVM_INVALID_PTE_LOCKED		BIT(10)

struct kvm_pgtable_walk_data {
	struct kvm_pgtable_walker	*walker;

	const u64			start;
	u64				addr;
	const u64			end;
};

static bool kvm_pgtable_walk_skip_bbm_tlbi(const struct kvm_pgtable_visit_ctx *ctx)
{
	return unlikely(ctx->flags & KVM_PGTABLE_WALK_SKIP_BBM_TLBI);
}

static bool kvm_pgtable_walk_skip_cmo(const struct kvm_pgtable_visit_ctx *ctx)
{
	return unlikely(ctx->flags & KVM_PGTABLE_WALK_SKIP_CMO);
}

static bool kvm_phys_is_valid(u64 phys)
{
	return phys < BIT(id_aa64mmfr0_parange_to_phys_shift(ID_AA64MMFR0_EL1_PARANGE_MAX));
}

static bool kvm_block_mapping_supported(const struct kvm_pgtable_visit_ctx *ctx, u64 phys)
{
	u64 granule = kvm_granule_size(ctx->level);

	if (!kvm_level_supports_block_mapping(ctx->level))
		return false;

	if (granule > (ctx->end - ctx->addr))
		return false;

	if (kvm_phys_is_valid(phys) && !IS_ALIGNED(phys, granule))
		return false;

	return IS_ALIGNED(ctx->addr, granule);
}

static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, u32 level)
{
	u64 shift = kvm_granule_shift(level);
	u64 mask = BIT(PAGE_SHIFT - 3) - 1;

	return (data->addr >> shift) & mask;
}

static u32 kvm_pgd_page_idx(struct kvm_pgtable *pgt, u64 addr)
{
	u64 shift = kvm_granule_shift(pgt->start_level - 1); /* May underflow */
	u64 mask = BIT(pgt->ia_bits) - 1;

	return (addr & mask) >> shift;
}

static u32 kvm_pgd_pages(u32 ia_bits, u32 start_level)
{
	struct kvm_pgtable pgt = {
		.ia_bits	= ia_bits,
		.start_level	= start_level,
	};

	return kvm_pgd_page_idx(&pgt, -1ULL) + 1;
}
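/*
 * Worked example (illustrative; assumes a 4KiB granule): with ia_bits = 40
 * and start_level = 1, kvm_pgd_page_idx() shifts by kvm_granule_shift(0) = 39
 * under a 40-bit mask, so the highest index is (2^40 - 1) >> 39 = 1 and
 * kvm_pgd_pages() returns 2, i.e. two concatenated pages at the start level.
 */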

static bool kvm_pte_table(kvm_pte_t pte, u32 level)
{
	if (level == KVM_PGTABLE_MAX_LEVELS - 1)
		return false;

	if (!kvm_pte_valid(pte))
		return false;

	return FIELD_GET(KVM_PTE_TYPE, pte) == KVM_PTE_TYPE_TABLE;
}

static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte, struct kvm_pgtable_mm_ops *mm_ops)
{
	return mm_ops->phys_to_virt(kvm_pte_to_phys(pte));
}

static void kvm_clear_pte(kvm_pte_t *ptep)
{
	WRITE_ONCE(*ptep, 0);
}

static kvm_pte_t kvm_init_table_pte(kvm_pte_t *childp, struct kvm_pgtable_mm_ops *mm_ops)
{
	kvm_pte_t pte = kvm_phys_to_pte(mm_ops->virt_to_phys(childp));

	pte |= FIELD_PREP(KVM_PTE_TYPE, KVM_PTE_TYPE_TABLE);
	pte |= KVM_PTE_VALID;
	return pte;
}

static kvm_pte_t kvm_init_valid_leaf_pte(u64 pa, kvm_pte_t attr, u32 level)
{
	kvm_pte_t pte = kvm_phys_to_pte(pa);
	u64 type = (level == KVM_PGTABLE_MAX_LEVELS - 1) ? KVM_PTE_TYPE_PAGE :
							   KVM_PTE_TYPE_BLOCK;

	pte |= attr & (KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI);
	pte |= FIELD_PREP(KVM_PTE_TYPE, type);
	pte |= KVM_PTE_VALID;

	return pte;
}

static kvm_pte_t kvm_init_invalid_leaf_owner(u8 owner_id)
{
	return FIELD_PREP(KVM_INVALID_PTE_OWNER_MASK, owner_id);
}

static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data,
				  const struct kvm_pgtable_visit_ctx *ctx,
				  enum kvm_pgtable_walk_flags visit)
{
	struct kvm_pgtable_walker *walker = data->walker;

	/* Ensure the appropriate lock is held (e.g. RCU lock for stage-2 MMU) */
	WARN_ON_ONCE(kvm_pgtable_walk_shared(ctx) && !kvm_pgtable_walk_lock_held());
	return walker->cb(ctx, visit);
}

static bool kvm_pgtable_walk_continue(const struct kvm_pgtable_walker *walker,
				      int r)
{
	/*
	 * Visitor callbacks return EAGAIN when the conditions that led to a
	 * fault are no longer reflected in the page tables due to a race to
	 * update a PTE. In the context of a fault handler this is interpreted
	 * as a signal to retry guest execution.
	 *
	 * Ignore the return code altogether for walkers outside a fault handler
	 * (e.g. write protecting a range of memory) and chug along with the
	 * page table walk.
	 */
	if (r == -EAGAIN)
		return !(walker->flags & KVM_PGTABLE_WALK_HANDLE_FAULT);

	return !r;
}

static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
			      struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, u32 level);

static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data,
				      struct kvm_pgtable_mm_ops *mm_ops,
				      kvm_pteref_t pteref, u32 level)
{
	enum kvm_pgtable_walk_flags flags = data->walker->flags;
	kvm_pte_t *ptep = kvm_dereference_pteref(data->walker, pteref);
	struct kvm_pgtable_visit_ctx ctx = {
		.ptep	= ptep,
		.old	= READ_ONCE(*ptep),
		.arg	= data->walker->arg,
		.mm_ops	= mm_ops,
		.start	= data->start,
		.addr	= data->addr,
		.end	= data->end,
		.level	= level,
		.flags	= flags,
	};
	int ret = 0;
	bool reload = false;
	kvm_pteref_t childp;
	bool table = kvm_pte_table(ctx.old, level);

	if (table && (ctx.flags & KVM_PGTABLE_WALK_TABLE_PRE)) {
		ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_TABLE_PRE);
		reload = true;
	}

	if (!table && (ctx.flags & KVM_PGTABLE_WALK_LEAF)) {
		ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_LEAF);
		reload = true;
	}

	/*
	 * Reload the page table after invoking the walker callback for leaf
	 * entries or after pre-order traversal, to allow the walker to descend
	 * into a newly installed or replaced table.
	 */
	if (reload) {
		ctx.old = READ_ONCE(*ptep);
		table = kvm_pte_table(ctx.old, level);
	}

	if (!kvm_pgtable_walk_continue(data->walker, ret))
		goto out;

	if (!table) {
		data->addr = ALIGN_DOWN(data->addr, kvm_granule_size(level));
		data->addr += kvm_granule_size(level);
		goto out;
	}

	childp = (kvm_pteref_t)kvm_pte_follow(ctx.old, mm_ops);
	ret = __kvm_pgtable_walk(data, mm_ops, childp, level + 1);
	if (!kvm_pgtable_walk_continue(data->walker, ret))
		goto out;

	if (ctx.flags & KVM_PGTABLE_WALK_TABLE_POST)
		ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_TABLE_POST);

out:
	if (kvm_pgtable_walk_continue(data->walker, ret))
		return 0;

	return ret;
}

static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
			      struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, u32 level)
{
	u32 idx;
	int ret = 0;

	if (WARN_ON_ONCE(level >= KVM_PGTABLE_MAX_LEVELS))
		return -EINVAL;

	for (idx = kvm_pgtable_idx(data, level); idx < PTRS_PER_PTE; ++idx) {
		kvm_pteref_t pteref = &pgtable[idx];

		if (data->addr >= data->end)
			break;

		ret = __kvm_pgtable_visit(data, mm_ops, pteref, level);
		if (ret)
			break;
	}

	return ret;
}

static int _kvm_pgtable_walk(struct kvm_pgtable *pgt, struct kvm_pgtable_walk_data *data)
{
	u32 idx;
	int ret = 0;
	u64 limit = BIT(pgt->ia_bits);

	if (data->addr > limit || data->end > limit)
		return -ERANGE;

	if (!pgt->pgd)
		return -EINVAL;

	for (idx = kvm_pgd_page_idx(pgt, data->addr); data->addr < data->end; ++idx) {
		kvm_pteref_t pteref = &pgt->pgd[idx * PTRS_PER_PTE];

		ret = __kvm_pgtable_walk(data, pgt->mm_ops, pteref, pgt->start_level);
		if (ret)
			break;
	}

	return ret;
}

int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
		     struct kvm_pgtable_walker *walker)
{
	struct kvm_pgtable_walk_data walk_data = {
		.start	= ALIGN_DOWN(addr, PAGE_SIZE),
		.addr	= ALIGN_DOWN(addr, PAGE_SIZE),
		.end	= PAGE_ALIGN(walk_data.addr + size),
		.walker	= walker,
	};
	int r;

	r = kvm_pgtable_walk_begin(walker);
	if (r)
		return r;

	r = _kvm_pgtable_walk(pgt, &walk_data);
	kvm_pgtable_walk_end(walker);

	return r;
}

struct leaf_walk_data {
	kvm_pte_t	pte;
	u32		level;
};

static int leaf_walker(const struct kvm_pgtable_visit_ctx *ctx,
		       enum kvm_pgtable_walk_flags visit)
{
	struct leaf_walk_data *data = ctx->arg;

	data->pte   = ctx->old;
	data->level = ctx->level;

	return 0;
}

int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr,
			 kvm_pte_t *ptep, u32 *level)
{
	struct leaf_walk_data data;
	struct kvm_pgtable_walker walker = {
		.cb	= leaf_walker,
		.flags	= KVM_PGTABLE_WALK_LEAF,
		.arg	= &data,
	};
	int ret;

	ret = kvm_pgtable_walk(pgt, ALIGN_DOWN(addr, PAGE_SIZE),
			       PAGE_SIZE, &walker);
	if (!ret) {
		if (ptep)
			*ptep  = data.pte;
		if (level)
			*level = data.level;
	}

	return ret;
}
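/*
 * Illustrative sketch of the walker API (hypothetical callback and variable
 * names, mirroring leaf_walker() above): a caller counts the valid leaf
 * entries in a range by threading a private counter through @arg.
 *
 *	static int count_valid_cb(const struct kvm_pgtable_visit_ctx *ctx,
 *				  enum kvm_pgtable_walk_flags visit)
 *	{
 *		size_t *nr_valid = ctx->arg;
 *
 *		if (kvm_pte_valid(ctx->old))
 *			(*nr_valid)++;
 *		return 0;
 *	}
 *
 *	size_t nr_valid = 0;
 *	struct kvm_pgtable_walker walker = {
 *		.cb	= count_valid_cb,
 *		.flags	= KVM_PGTABLE_WALK_LEAF,
 *		.arg	= &nr_valid,
 *	};
 *	int ret = kvm_pgtable_walk(pgt, addr, size, &walker);
 */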

struct hyp_map_data {
	const u64		phys;
	kvm_pte_t		attr;
};

static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, kvm_pte_t *ptep)
{
	bool device = prot & KVM_PGTABLE_PROT_DEVICE;
	u32 mtype = device ? MT_DEVICE_nGnRE : MT_NORMAL;
	kvm_pte_t attr = FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX, mtype);
	u32 sh = KVM_PTE_LEAF_ATTR_LO_S1_SH_IS;
	u32 ap = (prot & KVM_PGTABLE_PROT_W) ? KVM_PTE_LEAF_ATTR_LO_S1_AP_RW :
					       KVM_PTE_LEAF_ATTR_LO_S1_AP_RO;

	if (!(prot & KVM_PGTABLE_PROT_R))
		return -EINVAL;

	if (prot & KVM_PGTABLE_PROT_X) {
		if (prot & KVM_PGTABLE_PROT_W)
			return -EINVAL;

		if (device)
			return -EINVAL;

		if (IS_ENABLED(CONFIG_ARM64_BTI_KERNEL) && system_supports_bti())
			attr |= KVM_PTE_LEAF_ATTR_HI_S1_GP;
	} else {
		attr |= KVM_PTE_LEAF_ATTR_HI_S1_XN;
	}

	attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_AP, ap);
	attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH, sh);
	attr |= KVM_PTE_LEAF_ATTR_LO_S1_AF;
	attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW;
	*ptep = attr;

	return 0;
}

enum kvm_pgtable_prot kvm_pgtable_hyp_pte_prot(kvm_pte_t pte)
{
	enum kvm_pgtable_prot prot = pte & KVM_PTE_LEAF_ATTR_HI_SW;
	u32 ap;

	if (!kvm_pte_valid(pte))
		return prot;

	if (!(pte & KVM_PTE_LEAF_ATTR_HI_S1_XN))
		prot |= KVM_PGTABLE_PROT_X;

	ap = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S1_AP, pte);
	if (ap == KVM_PTE_LEAF_ATTR_LO_S1_AP_RO)
		prot |= KVM_PGTABLE_PROT_R;
	else if (ap == KVM_PTE_LEAF_ATTR_LO_S1_AP_RW)
		prot |= KVM_PGTABLE_PROT_RW;

	return prot;
}

static bool hyp_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx,
				    struct hyp_map_data *data)
{
	u64 phys = data->phys + (ctx->addr - ctx->start);
	kvm_pte_t new;

	if (!kvm_block_mapping_supported(ctx, phys))
		return false;

	new = kvm_init_valid_leaf_pte(phys, data->attr, ctx->level);
	if (ctx->old == new)
		return true;
	if (!kvm_pte_valid(ctx->old))
		ctx->mm_ops->get_page(ctx->ptep);
	else if (WARN_ON((ctx->old ^ new) & ~KVM_PTE_LEAF_ATTR_HI_SW))
		return false;

	smp_store_release(ctx->ptep, new);
	return true;
}

static int hyp_map_walker(const struct kvm_pgtable_visit_ctx *ctx,
			  enum kvm_pgtable_walk_flags visit)
{
	kvm_pte_t *childp, new;
	struct hyp_map_data *data = ctx->arg;
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;

	if (hyp_map_walker_try_leaf(ctx, data))
		return 0;

	if (WARN_ON(ctx->level == KVM_PGTABLE_MAX_LEVELS - 1))
		return -EINVAL;

	childp = (kvm_pte_t *)mm_ops->zalloc_page(NULL);
	if (!childp)
		return -ENOMEM;

	new = kvm_init_table_pte(childp, mm_ops);
	mm_ops->get_page(ctx->ptep);
	smp_store_release(ctx->ptep, new);

	return 0;
}

int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
			enum kvm_pgtable_prot prot)
{
	int ret;
	struct hyp_map_data map_data = {
		.phys	= ALIGN_DOWN(phys, PAGE_SIZE),
	};
	struct kvm_pgtable_walker walker = {
		.cb	= hyp_map_walker,
		.flags	= KVM_PGTABLE_WALK_LEAF,
		.arg	= &map_data,
	};

	ret = hyp_set_prot_attr(prot, &map_data.attr);
	if (ret)
		return ret;

	ret = kvm_pgtable_walk(pgt, addr, size, &walker);
	dsb(ishst);
	isb();
	return ret;
}

static int hyp_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx,
			    enum kvm_pgtable_walk_flags visit)
{
	kvm_pte_t *childp = NULL;
	u64 granule = kvm_granule_size(ctx->level);
	u64 *unmapped = ctx->arg;
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;

	if (!kvm_pte_valid(ctx->old))
		return -EINVAL;

	if (kvm_pte_table(ctx->old, ctx->level)) {
		childp = kvm_pte_follow(ctx->old, mm_ops);

		if (mm_ops->page_count(childp) != 1)
			return 0;

		kvm_clear_pte(ctx->ptep);
		dsb(ishst);
		__tlbi_level(vae2is, __TLBI_VADDR(ctx->addr, 0), ctx->level);
	} else {
		if (ctx->end - ctx->addr < granule)
			return -EINVAL;

		kvm_clear_pte(ctx->ptep);
		dsb(ishst);
		__tlbi_level(vale2is, __TLBI_VADDR(ctx->addr, 0), ctx->level);
		*unmapped += granule;
	}

	dsb(ish);
	isb();
	mm_ops->put_page(ctx->ptep);

	if (childp)
		mm_ops->put_page(childp);

	return 0;
}

u64 kvm_pgtable_hyp_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
{
	u64 unmapped = 0;
	struct kvm_pgtable_walker walker = {
		.cb	= hyp_unmap_walker,
		.arg	= &unmapped,
		.flags	= KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
	};

	if (!pgt->mm_ops->page_count)
		return 0;

	kvm_pgtable_walk(pgt, addr, size, &walker);
	return unmapped;
}

int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits,
			 struct kvm_pgtable_mm_ops *mm_ops)
{
	u64 levels = ARM64_HW_PGTABLE_LEVELS(va_bits);

	pgt->pgd = (kvm_pteref_t)mm_ops->zalloc_page(NULL);
	if (!pgt->pgd)
		return -ENOMEM;

	pgt->ia_bits		= va_bits;
	pgt->start_level	= KVM_PGTABLE_MAX_LEVELS - levels;
	pgt->mm_ops		= mm_ops;
	pgt->mmu		= NULL;
	pgt->force_pte_cb	= NULL;

	return 0;
}

static int hyp_free_walker(const struct kvm_pgtable_visit_ctx *ctx,
			   enum kvm_pgtable_walk_flags visit)
{
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;

	if (!kvm_pte_valid(ctx->old))
		return 0;

	mm_ops->put_page(ctx->ptep);

	if (kvm_pte_table(ctx->old, ctx->level))
		mm_ops->put_page(kvm_pte_follow(ctx->old, mm_ops));

	return 0;
}

void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt)
{
	struct kvm_pgtable_walker walker = {
		.cb	= hyp_free_walker,
		.flags	= KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
	};

	WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
	pgt->mm_ops->put_page(kvm_dereference_pteref(&walker, pgt->pgd));
	pgt->pgd = NULL;
}

struct stage2_map_data {
	const u64			phys;
	kvm_pte_t			attr;
	u8				owner_id;

	kvm_pte_t			*anchor;
	kvm_pte_t			*childp;

	struct kvm_s2_mmu		*mmu;
	void				*memcache;

	/* Force mappings to page granularity */
	bool				force_pte;
};

u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift)
{
	u64 vtcr = VTCR_EL2_FLAGS;
	u8 lvls;

	vtcr |= kvm_get_parange(mmfr0) << VTCR_EL2_PS_SHIFT;
	vtcr |= VTCR_EL2_T0SZ(phys_shift);
	/*
	 * Use a minimum 2 level page table to prevent splitting
	 * host PMD huge pages at stage2.
	 */
	lvls = stage2_pgtable_levels(phys_shift);
	if (lvls < 2)
		lvls = 2;
	vtcr |= VTCR_EL2_LVLS_TO_SL0(lvls);

#ifdef CONFIG_ARM64_HW_AFDBM
	/*
	 * Enable the Hardware Access Flag management, unconditionally
	 * on all CPUs. In systems that have asymmetric support for the feature
	 * this allows KVM to leverage hardware support on the subset of cores
	 * that implement the feature.
	 *
	 * The architecture requires VTCR_EL2.HA to be RES0 (thus ignored by
	 * hardware) on implementations that do not advertise support for the
	 * feature. As such, setting HA unconditionally is safe, unless you
	 * happen to be running on a design that has unadvertised support for
	 * HAFDBS. Here be dragons.
	 */
	if (!cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38))
		vtcr |= VTCR_EL2_HA;
#endif /* CONFIG_ARM64_HW_AFDBM */

	/* Set the vmid bits */
	vtcr |= (get_vmid_bits(mmfr1) == 16) ?
		VTCR_EL2_VS_16BIT :
		VTCR_EL2_VS_8BIT;

	return vtcr;
}

static bool stage2_has_fwb(struct kvm_pgtable *pgt)
{
	if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB))
		return false;

	return !(pgt->flags & KVM_PGTABLE_S2_NOFWB);
}

#define KVM_S2_MEMATTR(pgt, attr) PAGE_S2_MEMATTR(attr, stage2_has_fwb(pgt))

static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot prot,
				kvm_pte_t *ptep)
{
	bool device = prot & KVM_PGTABLE_PROT_DEVICE;
	kvm_pte_t attr = device ? KVM_S2_MEMATTR(pgt, DEVICE_nGnRE) :
			    KVM_S2_MEMATTR(pgt, NORMAL);
	u32 sh = KVM_PTE_LEAF_ATTR_LO_S2_SH_IS;

	if (!(prot & KVM_PGTABLE_PROT_X))
		attr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;
	else if (device)
		return -EINVAL;

	if (prot & KVM_PGTABLE_PROT_R)
		attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R;

	if (prot & KVM_PGTABLE_PROT_W)
		attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;

	attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh);
	attr |= KVM_PTE_LEAF_ATTR_LO_S2_AF;
	attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW;
	*ptep = attr;

	return 0;
}

enum kvm_pgtable_prot kvm_pgtable_stage2_pte_prot(kvm_pte_t pte)
{
	enum kvm_pgtable_prot prot = pte & KVM_PTE_LEAF_ATTR_HI_SW;

	if (!kvm_pte_valid(pte))
		return prot;

	if (pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R)
		prot |= KVM_PGTABLE_PROT_R;
	if (pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W)
		prot |= KVM_PGTABLE_PROT_W;
	if (!(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN))
		prot |= KVM_PGTABLE_PROT_X;

	return prot;
}

static bool stage2_pte_needs_update(kvm_pte_t old, kvm_pte_t new)
{
	if (!kvm_pte_valid(old) || !kvm_pte_valid(new))
		return true;

	return ((old ^ new) & (~KVM_PTE_LEAF_ATTR_S2_PERMS));
}

static bool stage2_pte_is_counted(kvm_pte_t pte)
{
	/*
	 * The refcount tracks valid entries as well as invalid entries if they
	 * encode ownership of a page to another entity than the page-table
	 * owner, whose id is 0.
	 */
	return !!pte;
}

static bool stage2_pte_is_locked(kvm_pte_t pte)
{
	return !kvm_pte_valid(pte) && (pte & KVM_INVALID_PTE_LOCKED);
}

static bool stage2_try_set_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t new)
{
	if (!kvm_pgtable_walk_shared(ctx)) {
		WRITE_ONCE(*ctx->ptep, new);
		return true;
	}

	return cmpxchg(ctx->ptep, ctx->old, new) == ctx->old;
}

/**
 * stage2_try_break_pte() - Invalidates a pte according to the
 *			    'break-before-make' requirements of the
 *			    architecture.
 *
 * @ctx: context of the visited pte.
 * @mmu: stage-2 mmu
 *
 * Returns: true if the pte was successfully broken.
 *
 * If the removed pte was valid, performs the necessary serialization and TLB
 * invalidation for the old value. For counted ptes, drops the reference count
 * on the containing table page.
 */
static bool stage2_try_break_pte(const struct kvm_pgtable_visit_ctx *ctx,
				 struct kvm_s2_mmu *mmu)
{
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;

	if (stage2_pte_is_locked(ctx->old)) {
		/*
		 * Should never occur if this walker has exclusive access to the
		 * page tables.
		 */
		WARN_ON(!kvm_pgtable_walk_shared(ctx));
		return false;
	}

	if (!stage2_try_set_pte(ctx, KVM_INVALID_PTE_LOCKED))
		return false;

	if (!kvm_pgtable_walk_skip_bbm_tlbi(ctx)) {
		/*
		 * Perform the appropriate TLB invalidation based on the
		 * evicted pte value (if any).
		 */
		if (kvm_pte_table(ctx->old, ctx->level))
			kvm_call_hyp(__kvm_tlb_flush_vmid, mmu);
		else if (kvm_pte_valid(ctx->old))
			kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu,
				     ctx->addr, ctx->level);
	}

	if (stage2_pte_is_counted(ctx->old))
		mm_ops->put_page(ctx->ptep);

	return true;
}

static void stage2_make_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t new)
{
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;

	WARN_ON(!stage2_pte_is_locked(*ctx->ptep));

	if (stage2_pte_is_counted(new))
		mm_ops->get_page(ctx->ptep);

	smp_store_release(ctx->ptep, new);
}
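/*
 * Typical break-before-make sequence for a stage-2 walker, as used by the
 * map and split walkers below (sketch only; error handling elided):
 *
 *	if (!stage2_try_break_pte(ctx, data->mmu))
 *		return -EAGAIN;			(lost the race; retry the fault)
 *
 *	(build the replacement entry 'new', perform any CMOs)
 *
 *	stage2_make_pte(ctx, new);
 */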

static void stage2_put_pte(const struct kvm_pgtable_visit_ctx *ctx, struct kvm_s2_mmu *mmu,
			   struct kvm_pgtable_mm_ops *mm_ops)
{
	/*
	 * Clear the existing PTE, and perform break-before-make with
	 * TLB maintenance if it was valid.
	 */
	if (kvm_pte_valid(ctx->old)) {
		kvm_clear_pte(ctx->ptep);
		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, ctx->level);
	}

	mm_ops->put_page(ctx->ptep);
}

static bool stage2_pte_cacheable(struct kvm_pgtable *pgt, kvm_pte_t pte)
{
	u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR;
	return memattr == KVM_S2_MEMATTR(pgt, NORMAL);
}

static bool stage2_pte_executable(kvm_pte_t pte)
{
	return !(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN);
}

static u64 stage2_map_walker_phys_addr(const struct kvm_pgtable_visit_ctx *ctx,
				       const struct stage2_map_data *data)
{
	u64 phys = data->phys;

	/*
	 * Stage-2 walks to update ownership data are communicated to the map
	 * walker using an invalid PA. Avoid offsetting an already invalid PA,
	 * which could overflow and make the address valid again.
	 */
	if (!kvm_phys_is_valid(phys))
		return phys;

	/*
	 * Otherwise, work out the correct PA based on how far the walk has
	 * gotten.
	 */
	return phys + (ctx->addr - ctx->start);
}

static bool stage2_leaf_mapping_allowed(const struct kvm_pgtable_visit_ctx *ctx,
					struct stage2_map_data *data)
{
	u64 phys = stage2_map_walker_phys_addr(ctx, data);

	if (data->force_pte && (ctx->level < (KVM_PGTABLE_MAX_LEVELS - 1)))
		return false;

	return kvm_block_mapping_supported(ctx, phys);
}

static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx,
				      struct stage2_map_data *data)
{
	kvm_pte_t new;
	u64 phys = stage2_map_walker_phys_addr(ctx, data);
	u64 granule = kvm_granule_size(ctx->level);
	struct kvm_pgtable *pgt = data->mmu->pgt;
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;

	if (!stage2_leaf_mapping_allowed(ctx, data))
		return -E2BIG;

	if (kvm_phys_is_valid(phys))
		new = kvm_init_valid_leaf_pte(phys, data->attr, ctx->level);
	else
		new = kvm_init_invalid_leaf_owner(data->owner_id);

	/*
	 * Skip updating the PTE if we are trying to recreate the exact
	 * same mapping or only change the access permissions. Instead,
	 * the vCPU will exit one more time from guest if still needed
	 * and then go through the path of relaxing permissions.
	 */
	if (!stage2_pte_needs_update(ctx->old, new))
		return -EAGAIN;

	if (!stage2_try_break_pte(ctx, data->mmu))
		return -EAGAIN;

	/* Perform CMOs before installation of the guest stage-2 PTE */
	if (!kvm_pgtable_walk_skip_cmo(ctx) && mm_ops->dcache_clean_inval_poc &&
	    stage2_pte_cacheable(pgt, new))
		mm_ops->dcache_clean_inval_poc(kvm_pte_follow(new, mm_ops),
					       granule);

	if (!kvm_pgtable_walk_skip_cmo(ctx) && mm_ops->icache_inval_pou &&
	    stage2_pte_executable(new))
		mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), granule);

	stage2_make_pte(ctx, new);

	return 0;
}

static int stage2_map_walk_table_pre(const struct kvm_pgtable_visit_ctx *ctx,
				     struct stage2_map_data *data)
{
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
	kvm_pte_t *childp = kvm_pte_follow(ctx->old, mm_ops);
	int ret;

	if (!stage2_leaf_mapping_allowed(ctx, data))
		return 0;

	ret = stage2_map_walker_try_leaf(ctx, data);
	if (ret)
		return ret;

	mm_ops->free_unlinked_table(childp, ctx->level);
	return 0;
}

static int stage2_map_walk_leaf(const struct kvm_pgtable_visit_ctx *ctx,
				struct stage2_map_data *data)
{
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
	kvm_pte_t *childp, new;
	int ret;

	ret = stage2_map_walker_try_leaf(ctx, data);
	if (ret != -E2BIG)
		return ret;

	if (WARN_ON(ctx->level == KVM_PGTABLE_MAX_LEVELS - 1))
		return -EINVAL;

	if (!data->memcache)
		return -ENOMEM;

	childp = mm_ops->zalloc_page(data->memcache);
	if (!childp)
		return -ENOMEM;

	if (!stage2_try_break_pte(ctx, data->mmu)) {
		mm_ops->put_page(childp);
		return -EAGAIN;
	}

	/*
	 * If we've run into an existing block mapping then replace it with
	 * a table. Accesses beyond 'end' that fall within the new table
	 * will be mapped lazily.
	 */
	new = kvm_init_table_pte(childp, mm_ops);
	stage2_make_pte(ctx, new);

	return 0;
}

/*
 * The TABLE_PRE callback runs for table entries on the way down, looking
 * for table entries which we could conceivably replace with a block entry
 * for this mapping. If it finds one it replaces the entry and calls
 * kvm_pgtable_mm_ops::free_unlinked_table() to tear down the detached table.
 *
 * Otherwise, the LEAF callback performs the mapping at the existing leaves
 * instead.
 */
static int stage2_map_walker(const struct kvm_pgtable_visit_ctx *ctx,
			     enum kvm_pgtable_walk_flags visit)
{
	struct stage2_map_data *data = ctx->arg;

	switch (visit) {
	case KVM_PGTABLE_WALK_TABLE_PRE:
		return stage2_map_walk_table_pre(ctx, data);
	case KVM_PGTABLE_WALK_LEAF:
		return stage2_map_walk_leaf(ctx, data);
	default:
		return -EINVAL;
	}
}

int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
			   u64 phys, enum kvm_pgtable_prot prot,
			   void *mc, enum kvm_pgtable_walk_flags flags)
{
	int ret;
	struct stage2_map_data map_data = {
		.phys		= ALIGN_DOWN(phys, PAGE_SIZE),
		.mmu		= pgt->mmu,
		.memcache	= mc,
		.force_pte	= pgt->force_pte_cb && pgt->force_pte_cb(addr, addr + size, prot),
	};
	struct kvm_pgtable_walker walker = {
		.cb		= stage2_map_walker,
		.flags		= flags |
				  KVM_PGTABLE_WALK_TABLE_PRE |
				  KVM_PGTABLE_WALK_LEAF,
		.arg		= &map_data,
	};

	if (WARN_ON((pgt->flags & KVM_PGTABLE_S2_IDMAP) && (addr != phys)))
		return -EINVAL;

	ret = stage2_set_prot_attr(pgt, prot, &map_data.attr);
	if (ret)
		return ret;

	ret = kvm_pgtable_walk(pgt, addr, size, &walker);
	dsb(ishst);
	return ret;
}

int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size,
				 void *mc, u8 owner_id)
{
	int ret;
	struct stage2_map_data map_data = {
		.phys		= KVM_PHYS_INVALID,
		.mmu		= pgt->mmu,
		.memcache	= mc,
		.owner_id	= owner_id,
		.force_pte	= true,
	};
	struct kvm_pgtable_walker walker = {
		.cb		= stage2_map_walker,
		.flags		= KVM_PGTABLE_WALK_TABLE_PRE |
				  KVM_PGTABLE_WALK_LEAF,
		.arg		= &map_data,
	};

	if (owner_id > KVM_MAX_OWNER_ID)
		return -EINVAL;

	ret = kvm_pgtable_walk(pgt, addr, size, &walker);
	return ret;
}

static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx,
			       enum kvm_pgtable_walk_flags visit)
{
	struct kvm_pgtable *pgt = ctx->arg;
	struct kvm_s2_mmu *mmu = pgt->mmu;
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
	kvm_pte_t *childp = NULL;
	bool need_flush = false;

	if (!kvm_pte_valid(ctx->old)) {
		if (stage2_pte_is_counted(ctx->old)) {
			kvm_clear_pte(ctx->ptep);
			mm_ops->put_page(ctx->ptep);
		}
		return 0;
	}

	if (kvm_pte_table(ctx->old, ctx->level)) {
		childp = kvm_pte_follow(ctx->old, mm_ops);

		if (mm_ops->page_count(childp) != 1)
			return 0;
	} else if (stage2_pte_cacheable(pgt, ctx->old)) {
		need_flush = !stage2_has_fwb(pgt);
	}

	/*
	 * This is similar to the map() path in that we unmap the entire
	 * block entry and rely on the remaining portions being faulted
	 * back lazily.
	 */
	stage2_put_pte(ctx, mmu, mm_ops);

	if (need_flush && mm_ops->dcache_clean_inval_poc)
		mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops),
					       kvm_granule_size(ctx->level));

	if (childp)
		mm_ops->put_page(childp);

	return 0;
}

int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
{
	struct kvm_pgtable_walker walker = {
		.cb	= stage2_unmap_walker,
		.arg	= pgt,
		.flags	= KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
	};

	return kvm_pgtable_walk(pgt, addr, size, &walker);
}

struct stage2_attr_data {
	kvm_pte_t			attr_set;
	kvm_pte_t			attr_clr;
	kvm_pte_t			pte;
	u32				level;
};

static int stage2_attr_walker(const struct kvm_pgtable_visit_ctx *ctx,
			      enum kvm_pgtable_walk_flags visit)
{
	kvm_pte_t pte = ctx->old;
	struct stage2_attr_data *data = ctx->arg;
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;

	if (!kvm_pte_valid(ctx->old))
		return -EAGAIN;

	data->level = ctx->level;
	data->pte = pte;
	pte &= ~data->attr_clr;
	pte |= data->attr_set;

	/*
	 * We may race with the CPU trying to set the access flag here,
	 * but worst-case the access flag update gets lost and will be
	 * set on the next access instead.
	 */
	if (data->pte != pte) {
		/*
		 * Invalidate instruction cache before updating the guest
		 * stage-2 PTE if we are going to add executable permission.
		 */
		if (mm_ops->icache_inval_pou &&
		    stage2_pte_executable(pte) && !stage2_pte_executable(ctx->old))
			mm_ops->icache_inval_pou(kvm_pte_follow(pte, mm_ops),
						 kvm_granule_size(ctx->level));

		if (!stage2_try_set_pte(ctx, pte))
			return -EAGAIN;
	}

	return 0;
}

static int stage2_update_leaf_attrs(struct kvm_pgtable *pgt, u64 addr,
				    u64 size, kvm_pte_t attr_set,
				    kvm_pte_t attr_clr, kvm_pte_t *orig_pte,
				    u32 *level, enum kvm_pgtable_walk_flags flags)
{
	int ret;
	kvm_pte_t attr_mask = KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI;
	struct stage2_attr_data data = {
		.attr_set	= attr_set & attr_mask,
		.attr_clr	= attr_clr & attr_mask,
	};
	struct kvm_pgtable_walker walker = {
		.cb		= stage2_attr_walker,
		.arg		= &data,
		.flags		= flags | KVM_PGTABLE_WALK_LEAF,
	};

	ret = kvm_pgtable_walk(pgt, addr, size, &walker);
	if (ret)
		return ret;

	if (orig_pte)
		*orig_pte = data.pte;

	if (level)
		*level = data.level;
	return 0;
}

int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size)
{
	return stage2_update_leaf_attrs(pgt, addr, size, 0,
					KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W,
					NULL, NULL, 0);
}

kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr)
{
	kvm_pte_t pte = 0;
	int ret;

	ret = stage2_update_leaf_attrs(pgt, addr, 1, KVM_PTE_LEAF_ATTR_LO_S2_AF, 0,
				       &pte, NULL,
				       KVM_PGTABLE_WALK_HANDLE_FAULT |
				       KVM_PGTABLE_WALK_SHARED);
	if (!ret)
		dsb(ishst);

	return pte;
}

struct stage2_age_data {
	bool	mkold;
	bool	young;
};

static int stage2_age_walker(const struct kvm_pgtable_visit_ctx *ctx,
			     enum kvm_pgtable_walk_flags visit)
{
	kvm_pte_t new = ctx->old & ~KVM_PTE_LEAF_ATTR_LO_S2_AF;
	struct stage2_age_data *data = ctx->arg;

	if (!kvm_pte_valid(ctx->old) || new == ctx->old)
		return 0;

	data->young = true;

	/*
	 * stage2_age_walker() is always called while holding the MMU lock for
	 * write, so this will always succeed. Nonetheless, this deliberately
	 * follows the race detection pattern of the other stage-2 walkers in
	 * case the locking mechanics of the MMU notifiers is ever changed.
	 */
	if (data->mkold && !stage2_try_set_pte(ctx, new))
		return -EAGAIN;

	/*
	 * "But where's the TLBI?!", you scream.
	 * "Over in the core code", I sigh.
	 *
	 * See the '->clear_flush_young()' callback on the KVM mmu notifier.
	 */
	return 0;
}

bool kvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr,
					 u64 size, bool mkold)
{
	struct stage2_age_data data = {
		.mkold		= mkold,
	};
	struct kvm_pgtable_walker walker = {
		.cb		= stage2_age_walker,
		.arg		= &data,
		.flags		= KVM_PGTABLE_WALK_LEAF,
	};

	WARN_ON(kvm_pgtable_walk(pgt, addr, size, &walker));
	return data.young;
}

int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
				   enum kvm_pgtable_prot prot)
{
	int ret;
	u32 level;
	kvm_pte_t set = 0, clr = 0;

	if (prot & KVM_PTE_LEAF_ATTR_HI_SW)
		return -EINVAL;

	if (prot & KVM_PGTABLE_PROT_R)
		set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R;

	if (prot & KVM_PGTABLE_PROT_W)
		set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;

	if (prot & KVM_PGTABLE_PROT_X)
		clr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;

	ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level,
				       KVM_PGTABLE_WALK_HANDLE_FAULT |
				       KVM_PGTABLE_WALK_SHARED);
	if (!ret)
		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa_nsh, pgt->mmu, addr, level);
	return ret;
}

static int stage2_flush_walker(const struct kvm_pgtable_visit_ctx *ctx,
			       enum kvm_pgtable_walk_flags visit)
{
	struct kvm_pgtable *pgt = ctx->arg;
	struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops;

	if (!kvm_pte_valid(ctx->old) || !stage2_pte_cacheable(pgt, ctx->old))
		return 0;

	if (mm_ops->dcache_clean_inval_poc)
		mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops),
					       kvm_granule_size(ctx->level));
	return 0;
}

int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size)
{
	struct kvm_pgtable_walker walker = {
		.cb	= stage2_flush_walker,
		.flags	= KVM_PGTABLE_WALK_LEAF,
		.arg	= pgt,
	};

	if (stage2_has_fwb(pgt))
		return 0;

	return kvm_pgtable_walk(pgt, addr, size, &walker);
}

kvm_pte_t *kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt,
					      u64 phys, u32 level,
					      enum kvm_pgtable_prot prot,
					      void *mc, bool force_pte)
{
	struct stage2_map_data map_data = {
		.phys		= phys,
		.mmu		= pgt->mmu,
		.memcache	= mc,
		.force_pte	= force_pte,
	};
	struct kvm_pgtable_walker walker = {
		.cb		= stage2_map_walker,
		.flags		= KVM_PGTABLE_WALK_LEAF |
				  KVM_PGTABLE_WALK_SKIP_BBM_TLBI |
				  KVM_PGTABLE_WALK_SKIP_CMO,
		.arg		= &map_data,
	};
	/*
	 * The input address (.addr) is irrelevant for walking an
	 * unlinked table. Construct an ambiguous IA range to map
	 * kvm_granule_size(level) worth of memory.
	 */
	struct kvm_pgtable_walk_data data = {
		.walker	= &walker,
		.addr	= 0,
		.end	= kvm_granule_size(level),
	};
	struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops;
	kvm_pte_t *pgtable;
	int ret;

	if (!IS_ALIGNED(phys, kvm_granule_size(level)))
		return ERR_PTR(-EINVAL);

	ret = stage2_set_prot_attr(pgt, prot, &map_data.attr);
	if (ret)
		return ERR_PTR(ret);

	pgtable = mm_ops->zalloc_page(mc);
	if (!pgtable)
		return ERR_PTR(-ENOMEM);

	ret = __kvm_pgtable_walk(&data, mm_ops, (kvm_pteref_t)pgtable,
				 level + 1);
	if (ret) {
		kvm_pgtable_stage2_free_unlinked(mm_ops, pgtable, level);
		mm_ops->put_page(pgtable);
		return ERR_PTR(ret);
	}

	return pgtable;
}

/*
 * Get the number of page-tables needed to replace a block with a
 * fully populated tree up to the PTE entries. Note that @level is
 * interpreted as in "level @level entry".
 */
static int stage2_block_get_nr_page_tables(u32 level)
{
	switch (level) {
	case 1:
		return PTRS_PER_PTE + 1;
	case 2:
		return 1;
	case 3:
		return 0;
	default:
		WARN_ON_ONCE(level < KVM_PGTABLE_MIN_BLOCK_LEVEL ||
			     level >= KVM_PGTABLE_MAX_LEVELS);
		return -EINVAL;
	};
}
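/*
 * Worked example (assuming a 4KiB granule, PTRS_PER_PTE == 512): splitting a
 * level-1 block (1GiB) down to PTEs needs one level-2 table plus 512 level-3
 * tables, i.e. PTRS_PER_PTE + 1 = 513 pages; a level-2 block (2MiB) needs a
 * single page of PTEs; a level-3 entry is already page-sized, so no tables
 * are needed.
 */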

static int stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx,
			       enum kvm_pgtable_walk_flags visit)
{
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
	struct kvm_mmu_memory_cache *mc = ctx->arg;
	struct kvm_s2_mmu *mmu;
	kvm_pte_t pte = ctx->old, new, *childp;
	enum kvm_pgtable_prot prot;
	u32 level = ctx->level;
	bool force_pte;
	int nr_pages;
	u64 phys;

	/* No huge-pages exist at the last level */
	if (level == KVM_PGTABLE_MAX_LEVELS - 1)
		return 0;

	/* We only split valid block mappings */
	if (!kvm_pte_valid(pte))
		return 0;

	nr_pages = stage2_block_get_nr_page_tables(level);
	if (nr_pages < 0)
		return nr_pages;

	if (mc->nobjs >= nr_pages) {
		/* Build a tree mapped down to the PTE granularity. */
		force_pte = true;
	} else {
		/*
		 * Don't force PTEs, so create_unlinked() below does
		 * not populate the tree up to the PTE level. The
		 * consequence is that the call will require a single
		 * page of level 2 entries at level 1, or a single
		 * page of PTEs at level 2. If we are at level 1, the
		 * PTEs will be created recursively.
		 */
		force_pte = false;
		nr_pages = 1;
	}

	if (mc->nobjs < nr_pages)
		return -ENOMEM;

	mmu = container_of(mc, struct kvm_s2_mmu, split_page_cache);
	phys = kvm_pte_to_phys(pte);
	prot = kvm_pgtable_stage2_pte_prot(pte);

	childp = kvm_pgtable_stage2_create_unlinked(mmu->pgt, phys,
						    level, prot, mc, force_pte);
	if (IS_ERR(childp))
		return PTR_ERR(childp);

	if (!stage2_try_break_pte(ctx, mmu)) {
		kvm_pgtable_stage2_free_unlinked(mm_ops, childp, level);
		mm_ops->put_page(childp);
		return -EAGAIN;
	}

	/*
	 * Note, the contents of the page table are guaranteed to be made
	 * visible before the new PTE is assigned because stage2_make_pte()
	 * writes the PTE using smp_store_release().
	 */
	new = kvm_init_table_pte(childp, mm_ops);
	stage2_make_pte(ctx, new);
	dsb(ishst);
	return 0;
}

int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
			     struct kvm_mmu_memory_cache *mc)
{
	struct kvm_pgtable_walker walker = {
		.cb	= stage2_split_walker,
		.flags	= KVM_PGTABLE_WALK_LEAF,
		.arg	= mc,
	};

	return kvm_pgtable_walk(pgt, addr, size, &walker);
}

int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
			      struct kvm_pgtable_mm_ops *mm_ops,
			      enum kvm_pgtable_stage2_flags flags,
			      kvm_pgtable_force_pte_cb_t force_pte_cb)
{
	size_t pgd_sz;
	u64 vtcr = mmu->arch->vtcr;
	u32 ia_bits = VTCR_EL2_IPA(vtcr);
	u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr);
	u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0;

	pgd_sz = kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE;
	pgt->pgd = (kvm_pteref_t)mm_ops->zalloc_pages_exact(pgd_sz);
	if (!pgt->pgd)
		return -ENOMEM;

	pgt->ia_bits		= ia_bits;
	pgt->start_level	= start_level;
	pgt->mm_ops		= mm_ops;
	pgt->mmu		= mmu;
	pgt->flags		= flags;
	pgt->force_pte_cb	= force_pte_cb;

	/* Ensure zeroed PGD pages are visible to the hardware walker */
	dsb(ishst);
	return 0;
}

size_t kvm_pgtable_stage2_pgd_size(u64 vtcr)
{
	u32 ia_bits = VTCR_EL2_IPA(vtcr);
	u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr);
	u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0;

	return kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE;
}

static int stage2_free_walker(const struct kvm_pgtable_visit_ctx *ctx,
			      enum kvm_pgtable_walk_flags visit)
{
	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;

	if (!stage2_pte_is_counted(ctx->old))
		return 0;

	mm_ops->put_page(ctx->ptep);

	if (kvm_pte_table(ctx->old, ctx->level))
		mm_ops->put_page(kvm_pte_follow(ctx->old, mm_ops));

	return 0;
}

void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
{
	size_t pgd_sz;
	struct kvm_pgtable_walker walker = {
		.cb	= stage2_free_walker,
		.flags	= KVM_PGTABLE_WALK_LEAF |
			  KVM_PGTABLE_WALK_TABLE_POST,
	};

	WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
	pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE;
	pgt->mm_ops->free_pages_exact(kvm_dereference_pteref(&walker, pgt->pgd), pgd_sz);
	pgt->pgd = NULL;
}

void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, u32 level)
{
	kvm_pteref_t ptep = (kvm_pteref_t)pgtable;
	struct kvm_pgtable_walker walker = {
		.cb	= stage2_free_walker,
		.flags	= KVM_PGTABLE_WALK_LEAF |
			  KVM_PGTABLE_WALK_TABLE_POST,
	};
	struct kvm_pgtable_walk_data data = {
		.walker	= &walker,

		/*
		 * At this point the IPA really doesn't matter, as the page
		 * table being traversed has already been removed from the stage
		 * 2. Set an appropriate range to cover the entire page table.
		 */
		.addr	= 0,
		.end	= kvm_granule_size(level),
	};

	WARN_ON(__kvm_pgtable_walk(&data, mm_ops, ptep, level + 1));

	WARN_ON(mm_ops->page_count(pgtable) != 1);
	mm_ops->put_page(pgtable);
}