Lines Matching +full:chg +full:- +full:status

1 // SPDX-License-Identifier: GPL-2.0-only
60 return cma_pages_valid(hugetlb_cma[folio_nid(folio)], &folio->page,
104 if (spool->count)
106 if (spool->max_hpages != -1)
107 return spool->used_hpages == 0;
108 if (spool->min_hpages != -1)
109 return spool->rsv_hpages == spool->min_hpages;
117 spin_unlock_irqrestore(&spool->lock, irq_flags);
123 if (spool->min_hpages != -1)
124 hugetlb_acct_memory(spool->hstate,
125 -spool->min_hpages);
139 spin_lock_init(&spool->lock);
140 spool->count = 1;
141 spool->max_hpages = max_hpages;
142 spool->hstate = h;
143 spool->min_hpages = min_hpages;
145 if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) {
149 spool->rsv_hpages = min_hpages;
158 spin_lock_irqsave(&spool->lock, flags);
159 BUG_ON(!spool->count);
160 spool->count--;
166 * Return -ENOMEM if there are not enough resources to satisfy the
180 spin_lock_irq(&spool->lock);
182 if (spool->max_hpages != -1) { /* maximum size accounting */
183 if ((spool->used_hpages + delta) <= spool->max_hpages)
184 spool->used_hpages += delta;
186 ret = -ENOMEM;
192 if (spool->min_hpages != -1 && spool->rsv_hpages) {
193 if (delta > spool->rsv_hpages) {
198 ret = delta - spool->rsv_hpages;
199 spool->rsv_hpages = 0;
202 spool->rsv_hpages -= delta;
207 spin_unlock_irq(&spool->lock);
226 spin_lock_irqsave(&spool->lock, flags);
228 if (spool->max_hpages != -1) /* maximum size accounting */
229 spool->used_hpages -= delta;
232 if (spool->min_hpages != -1 && spool->used_hpages < spool->min_hpages) {
233 if (spool->rsv_hpages + delta <= spool->min_hpages)
236 ret = spool->rsv_hpages + delta - spool->min_hpages;
238 spool->rsv_hpages += delta;
239 if (spool->rsv_hpages > spool->min_hpages)
240 spool->rsv_hpages = spool->min_hpages;
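
The hugepage_subpool_get_pages()/hugepage_subpool_put_pages() lines above implement min/max accounting for a hugetlbfs subpool: a get charges used_hpages against max_hpages and satisfies as much of the request as it can from rsv_hpages, returning only the remainder that still has to be charged to the global pool; a put refills rsv_hpages back up to min_hpages and returns how many pages may really be released. A minimal user-space model of that arithmetic (invented names, no locking, no hstate handling) is sketched below:

#include <errno.h>

/* Simplified stand-in for struct hugepage_subpool: only the counters
 * the excerpt above manipulates. */
struct spool_model {
	long max_hpages;	/* -1 means "no maximum" */
	long min_hpages;	/* -1 means "no minimum" */
	long used_hpages;	/* pages handed out so far */
	long rsv_hpages;	/* pages still held back for the minimum */
};

/* Take delta pages from the pool.  Returns how many of them must still
 * be charged to the global pool, or -ENOMEM if max_hpages would be
 * exceeded. */
static long spool_model_get(struct spool_model *s, long delta)
{
	long ret = delta;

	if (s->max_hpages != -1) {			/* maximum size accounting */
		if (s->used_hpages + delta > s->max_hpages)
			return -ENOMEM;
		s->used_hpages += delta;
	}

	if (s->min_hpages != -1 && s->rsv_hpages) {	/* minimum size accounting */
		if (delta > s->rsv_hpages) {
			ret = delta - s->rsv_hpages;	/* reserve covers only part */
			s->rsv_hpages = 0;
		} else {
			ret = 0;			/* fully covered by the reserve */
			s->rsv_hpages -= delta;
		}
	}
	return ret;
}

/* Give delta pages back.  Returns how many pages the caller may release
 * from the global pool once the minimum reserve is topped back up. */
static long spool_model_put(struct spool_model *s, long delta)
{
	long ret = delta;

	if (s->max_hpages != -1)			/* maximum size accounting */
		s->used_hpages -= delta;

	if (s->min_hpages != -1 && s->used_hpages < s->min_hpages) {
		if (s->rsv_hpages + delta <= s->min_hpages)
			ret = 0;
		else
			ret = s->rsv_hpages + delta - s->min_hpages;

		s->rsv_hpages += delta;
		if (s->rsv_hpages > s->min_hpages)
			s->rsv_hpages = s->min_hpages;
	}
	return ret;
}

In the kernel both helpers run under spool->lock and their return values feed hugetlb_acct_memory(), which adjusts the global reservation count.
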
254 return HUGETLBFS_SB(inode->i_sb)->spool;
259 return subpool_inode(file_inode(vma->vm_file));
268 struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
270 down_read(&vma_lock->rw_sema);
274 down_read(&resv_map->rw_sema);
281 struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
283 up_read(&vma_lock->rw_sema);
287 up_read(&resv_map->rw_sema);
294 struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
296 down_write(&vma_lock->rw_sema);
300 down_write(&resv_map->rw_sema);
307 struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
309 up_write(&vma_lock->rw_sema);
313 up_write(&resv_map->rw_sema);
321 struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
323 return down_write_trylock(&vma_lock->rw_sema);
327 return down_write_trylock(&resv_map->rw_sema);
336 struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
338 lockdep_assert_held(&vma_lock->rw_sema);
342 lockdep_assert_held(&resv_map->rw_sema);
356 struct vm_area_struct *vma = vma_lock->vma;
361 * Semaphore synchronizes access to vma_lock->vma field.
363 vma_lock->vma = NULL;
364 vma->vm_private_data = NULL;
365 up_write(&vma_lock->rw_sema);
366 kref_put(&vma_lock->refs, hugetlb_vma_lock_release);
372 struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
379 up_write(&resv_map->rw_sema);
391 if (vma->vm_private_data) {
392 struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
394 down_write(&vma_lock->rw_sema);
404 if (!vma || !(vma->vm_flags & VM_MAYSHARE))
407 /* Should never get here with non-NULL vm_private_data */
408 if (vma->vm_private_data)
427 kref_init(&vma_lock->refs);
428 init_rwsem(&vma_lock->rw_sema);
429 vma_lock->vma = vma;
430 vma->vm_private_data = vma_lock;
441 VM_BUG_ON(resv->region_cache_count <= 0);
443 resv->region_cache_count--;
444 nrg = list_first_entry(&resv->region_cache, struct file_region, link);
445 list_del(&nrg->link);
447 nrg->from = from;
448 nrg->to = to;
457 nrg->reservation_counter = rg->reservation_counter;
458 nrg->css = rg->css;
459 if (rg->css)
460 css_get(rg->css);
472 nrg->reservation_counter =
473 &h_cg->rsvd_hugepage[hstate_index(h)];
474 nrg->css = &h_cg->css;
476 * The caller will hold exactly one h_cg->css reference for the
481 * exactly one h_cg->css reference, we should do css_get for
485 css_get(&h_cg->css);
486 if (!resv->pages_per_hpage)
487 resv->pages_per_hpage = pages_per_huge_page(h);
491 VM_BUG_ON(resv->pages_per_hpage != pages_per_huge_page(h));
493 nrg->reservation_counter = NULL;
494 nrg->css = NULL;
502 if (rg->css)
503 css_put(rg->css);
511 return rg->reservation_counter == org->reservation_counter &&
512 rg->css == org->css;
524 if (&prg->link != &resv->regions && prg->to == rg->from &&
526 prg->to = rg->to;
528 list_del(&rg->link);
536 if (&nrg->link != &resv->regions && nrg->from == rg->to &&
538 nrg->from = rg->from;
540 list_del(&rg->link);
556 list_add(&nrg->link, rg);
561 return to - from;
565 * Must be called with resv->lock held.
577 struct list_head *head = &resv->regions;
586 * [last_accounted_offset, iter->from), at every iteration, with some
591 if (iter->from < f) {
595 if (iter->to > last_accounted_offset)
596 last_accounted_offset = iter->to;
603 if (iter->from >= t) {
604 rg = iter->link.prev;
608 /* Add an entry for last_accounted_offset -> iter->from, and
611 if (iter->from > last_accounted_offset)
612 add += hugetlb_resv_map_add(resv, iter->link.prev,
614 iter->from, h, h_cg,
617 last_accounted_offset = iter->to;
624 rg = head->prev;
632 /* Must be called with resv->lock acquired. Will drop lock to allocate entries.
636 __must_hold(&resv->lock)
653 while (resv->region_cache_count <
654 (resv->adds_in_progress + regions_needed)) {
655 to_allocate = resv->adds_in_progress + regions_needed -
656 resv->region_cache_count;
662 VM_BUG_ON(resv->region_cache_count < resv->adds_in_progress);
664 spin_unlock(&resv->lock);
669 list_add(&trg->link, &allocated_regions);
672 spin_lock(&resv->lock);
674 list_splice(&allocated_regions, &resv->region_cache);
675 resv->region_cache_count += to_allocate;
682 list_del(&rg->link);
685 return -ENOMEM;
700 * this operation and we were not able to allocate, it returns -ENOMEM.
711 spin_lock(&resv->lock);
728 resv->region_cache_count <
729 resv->adds_in_progress +
730 (actual_regions_needed - in_regions_needed)) {
734 VM_BUG_ON(t - f <= 1);
737 resv, actual_regions_needed - in_regions_needed)) {
738 return -ENOMEM;
746 resv->adds_in_progress -= in_regions_needed;
748 spin_unlock(&resv->lock);
764 * resv->adds_in_progress. This value needs to be provided to a follow up call
769 * zero. -ENOMEM is returned if a new file_region structure or cache entry
775 long chg = 0;
777 spin_lock(&resv->lock);
780 chg = add_reservation_in_range(resv, f, t, NULL, NULL,
787 return -ENOMEM;
789 resv->adds_in_progress += *out_regions_needed;
791 spin_unlock(&resv->lock);
792 return chg;
811 spin_lock(&resv->lock);
812 VM_BUG_ON(!resv->region_cache_count);
813 resv->adds_in_progress -= regions_needed;
814 spin_unlock(&resv->lock);
826 * be allocated. If the allocation fails, -ENOMEM will be returned.
828 * a region and possibly return -ENOMEM. Callers specifying
829 * t == LONG_MAX do not need to check for -ENOMEM error.
833 struct list_head *head = &resv->regions;
839 spin_lock(&resv->lock);
848 if (rg->to <= f && (rg->to != rg->from || rg->to != f))
851 if (rg->from >= t)
854 if (f > rg->from && t < rg->to) { /* Must split region */
860 resv->region_cache_count > resv->adds_in_progress) {
861 nrg = list_first_entry(&resv->region_cache,
864 list_del(&nrg->link);
865 resv->region_cache_count--;
869 spin_unlock(&resv->lock);
872 return -ENOMEM;
876 del += t - f;
878 resv, rg, t - f, false);
881 nrg->from = t;
882 nrg->to = rg->to;
886 INIT_LIST_HEAD(&nrg->link);
889 rg->to = f;
891 list_add(&nrg->link, &rg->link);
896 if (f <= rg->from && t >= rg->to) { /* Remove entire region */
897 del += rg->to - rg->from;
899 rg->to - rg->from, true);
900 list_del(&rg->link);
905 if (f <= rg->from) { /* Trim beginning of region */
907 t - rg->from, false);
909 del += t - rg->from;
910 rg->from = t;
913 rg->to - f, false);
915 del += rg->to - f;
916 rg->to = f;
920 spin_unlock(&resv->lock);
960 struct list_head *head = &resv->regions;
962 long chg = 0;
964 spin_lock(&resv->lock);
970 if (rg->to <= f)
972 if (rg->from >= t)
975 seg_from = max(rg->from, f);
976 seg_to = min(rg->to, t);
978 chg += seg_to - seg_from;
980 spin_unlock(&resv->lock);
982 return chg;
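
The region_count() lines just above (ending at "return chg;" on line 982) walk the sorted resv->regions list and add up how many pages of [f, t) are already covered by existing file_region entries. A stand-alone sketch of the same overlap arithmetic, over an array instead of the kernel's linked list (names invented here):

/* One reserved range [from, to) in huge-page units, mirroring the
 * from/to fields of struct file_region used above. */
struct region_model {
	long from;
	long to;
};

/* Count how many pages in [f, t) the sorted, non-overlapping regions
 * already cover -- the quantity the excerpt accumulates in 'chg'. */
static long count_covered(const struct region_model *rg, int nr, long f, long t)
{
	long chg = 0;
	int i;

	for (i = 0; i < nr; i++) {
		long seg_from, seg_to;

		if (rg[i].to <= f)	/* region ends at or before [f, t) */
			continue;
		if (rg[i].from >= t)	/* region starts at or after [f, t); list is sorted */
			break;

		seg_from = rg[i].from > f ? rg[i].from : f;
		seg_to = rg[i].to < t ? rg[i].to : t;
		chg += seg_to - seg_from;
	}
	return chg;
}

For regions {[0, 4), [10, 12)} and a query of [2, 11) this returns 3. The region_chg()/add_reservation_in_range() lines above compute the complementary figure, (t - f) minus the covered pages, which is the chg the caller still has to reserve.
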
992 return ((address - vma->vm_start) >> huge_page_shift(h)) +
993 (vma->vm_pgoff >> huge_page_order(h));
1004 * vma_kernel_pagesize - Page size granularity for this VMA.
1014 if (vma->vm_ops && vma->vm_ops->pagesize)
1015 return vma->vm_ops->pagesize(vma);
1023 * architectures where it differs, an architecture-specific 'strong'
1061 return (unsigned long)vma->vm_private_data;
1067 vma->vm_private_data = (void *)value;
1077 resv_map->reservation_counter = NULL;
1078 resv_map->pages_per_hpage = 0;
1079 resv_map->css = NULL;
1081 resv_map->reservation_counter =
1082 &h_cg->rsvd_hugepage[hstate_index(h)];
1083 resv_map->pages_per_hpage = pages_per_huge_page(h);
1084 resv_map->css = &h_cg->css;
1100 kref_init(&resv_map->refs);
1101 spin_lock_init(&resv_map->lock);
1102 INIT_LIST_HEAD(&resv_map->regions);
1103 init_rwsem(&resv_map->rw_sema);
1105 resv_map->adds_in_progress = 0;
1109 * re-initialized to the proper values, to indicate that hugetlb cgroup
1110 * reservations are to be un-charged from here.
1114 INIT_LIST_HEAD(&resv_map->region_cache);
1115 list_add(&rg->link, &resv_map->region_cache);
1116 resv_map->region_cache_count = 1;
1124 struct list_head *head = &resv_map->region_cache;
1132 list_del(&rg->link);
1136 VM_BUG_ON(resv_map->adds_in_progress);
1148 * The VERY common case is inode->mapping == &inode->i_data but,
1151 return (struct resv_map *)(&inode->i_data)->private_data;
1157 if (vma->vm_flags & VM_MAYSHARE) {
1158 struct address_space *mapping = vma->vm_file->f_mapping;
1159 struct inode *inode = mapping->host;
1172 VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
1180 VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
1194 return !(vma->vm_flags & VM_MAYSHARE) &&
1204 * - For shared mappings this is a per-vma semaphore that may be
1210 * - For MAP_PRIVATE mappings, this is the reserve map which does
1212 * not guaranteed to succeed, even if read-only.
1214 if (vma->vm_flags & VM_MAYSHARE) {
1215 struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
1217 if (vma_lock && vma_lock->vma != vma)
1218 vma->vm_private_data = NULL;
1220 vma->vm_private_data = NULL;
1225 * Called with mm->mmap_lock writer semaphore held.
1248 kref_put(&reservations->refs, resv_map_release);
1255 static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
1257 if (vma->vm_flags & VM_NORESERVE) {
1259 * This address is already reserved by another process (chg == 0),
1265 * properly, so add work-around here.
1267 if (vma->vm_flags & VM_MAYSHARE && chg == 0)
1274 if (vma->vm_flags & VM_MAYSHARE) {
1280 * use. This situation is indicated if chg != 0.
1282 if (chg)
1296 * Examine the value of chg to determine if reserves
1298 * Very Subtle - The value of chg comes from a previous
1303 * account. Therefore, the meaning of chg is the same
1308 if (chg)
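
The vma_has_reserves() fragment above (the comments around "Very Subtle - The value of chg comes from a previous ...") boils down to a small decision on the chg value returned earlier by vma_needs_reservation(). A condensed, stand-alone restatement of that decision, with flag arguments standing in for the vma->vm_flags tests and the HPAGE_RESV_OWNER check (names invented here):

/* Hypothetical condensation of the quoted logic: may a fault at this
 * index consume a reserved huge page? */
static int has_reserves_model(int vm_noreserve, int vm_mayshare,
			      int resv_owner, long chg)
{
	if (vm_noreserve)
		return vm_mayshare && chg == 0;	/* reserve was made by another task */
	if (vm_mayshare)
		return chg == 0;	/* chg != 0: hole punched, no reserve left */
	if (resv_owner)
		return chg == 0;	/* private owner: reserve exists iff chg == 0 */
	return 0;			/* private non-owner never has reserves */
}

In every branch chg == 0 means "a reservation exists for this page index and may be consumed", which is the point of the "the meaning of chg is the same" comment above.
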
1324 list_move(&folio->lru, &h->hugepage_freelists[nid]);
1325 h->free_huge_pages++;
1326 h->free_huge_pages_node[nid]++;
1334 bool pin = !!(current->flags & PF_MEMALLOC_PIN);
1337 list_for_each_entry(folio, &h->hugepage_freelists[nid], lru) {
1344 list_move(&folio->lru, &h->hugepage_activelist);
1347 h->free_huge_pages--;
1348 h->free_huge_pages_node[nid]--;
1393 return h->free_huge_pages - h->resv_huge_pages;
1399 long chg)
1412 if (!vma_has_reserves(vma, chg) && !available_huge_pages(h))
1434 if (folio && !avoid_reserve && vma_has_reserves(vma, chg)) {
1436 h->resv_huge_pages--;
1449 * nodes_allowed previously, so h->next_node_to_{alloc|free} might
1481 nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
1482 h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
1488 * helper for remove_pool_huge_page() - return the previously saved
1499 nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
1500 h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
1509 nr_nodes--)
1515 nr_nodes--)
1517 /* used to demote non-gigantic_huge pages as well */
1525 atomic_set(&folio->_entire_mapcount, 0);
1526 atomic_set(&folio->_nr_pages_mapped, 0);
1527 atomic_set(&folio->_pincount, 0);
1531 p->flags &= ~PAGE_FLAGS_CHECK_AT_FREE;
1532 p->mapping = NULL;
1563 if (cma_release(hugetlb_cma[nid], &folio->page, 1 << order))
1659 list_del(&folio->lru);
1662 h->free_huge_pages--;
1663 h->free_huge_pages_node[nid]--;
1666 h->surplus_huge_pages--;
1667 h->surplus_huge_pages_node[nid]--;
1685 h->nr_huge_pages--;
1686 h->nr_huge_pages_node[nid]--;
1711 INIT_LIST_HEAD(&folio->lru);
1712 h->nr_huge_pages++;
1713 h->nr_huge_pages_node[nid]++;
1716 h->surplus_huge_pages++;
1717 h->surplus_huge_pages_node[nid]++;
1743 arch_clear_hugepage_flags(&folio->page);
1760 if (hugetlb_vmemmap_restore(h, &folio->page)) {
1790 * Non-gigantic pages demoted from CMA allocated gigantic pages
1798 INIT_LIST_HEAD(&folio->_deferred_list);
1799 __free_pages(&folio->page, huge_page_order(h));
1810 * freed and frees them one-by-one. As the page->mapping pointer is going
1828 node = node->next;
1829 page->mapping = NULL;
1866 if (llist_add((struct llist_node *)&folio->mapping, &hpage_freelist))
1910 __ClearPageAnonExclusive(&folio->page);
1911 folio->mapping = NULL;
1941 h->resv_huge_pages++;
1947 } else if (h->surplus_huge_pages_node[nid]) {
1953 arch_clear_hugepage_flags(&folio->page);
1965 h->nr_huge_pages++;
1966 h->nr_huge_pages_node[nid]++;
1971 hugetlb_vmemmap_optimize(h, &folio->page);
1972 INIT_LIST_HEAD(&folio->lru);
2000 * boot, it's safer to be consistent with the not-gigantic
2038 set_compound_head(p, &folio->page);
2043 atomic_set(&folio->_entire_mapcount, -1);
2044 atomic_set(&folio->_nr_pages_mapped, 0);
2045 atomic_set(&folio->_pincount, 0);
2103 compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
2105 compound_idx = page - page_head;
2199 * Free pages and try again - ONCE!
2258 if ((!acct_surplus || h->surplus_huge_pages_node[node]) &&
2259 !list_empty(&h->hugepage_freelists[node])) {
2260 page = list_entry(h->hugepage_freelists[node].next,
2273 * nothing for in-use hugepages and non-hugepages.
2276 * -ENOMEM: failed to allocate vmemmap pages to free the freed hugepages
2280 * -EBUSY: failed to dissolve free hugepages or the hugepage is in-use
2287 int rc = -EBUSY;
2315 * Theoretically, we should return -EBUSY when we
2326 h->max_huge_pages--;
2337 rc = hugetlb_vmemmap_restore(h, &folio->page);
2343 h->max_huge_pages++;
2399 if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages)
2415 if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
2422 h->surplus_huge_pages++;
2423 h->surplus_huge_pages_node[folio_nid(folio)]++;
2531 if (mpol->mode == MPOL_BIND &&
2533 cpuset_nodemask_valid_mems_allowed(&mpol->nodes)))
2534 return &mpol->nodes;
2556 needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
2558 h->resv_huge_pages += delta;
2564 ret = -ENOMEM;
2581 list_add(&folio->lru, &surplus_list);
2591 needed = (h->resv_huge_pages + delta) -
2592 (h->free_huge_pages + allocated);
2612 h->resv_huge_pages += delta;
2617 if ((--needed) < 0)
2653 h->resv_huge_pages -= unused_resv_pages;
2660 * by pre-allocated pages. Only free surplus pages.
2662 nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
2670 * on-line nodes with memory and will handle the hstate accounting.
2672 while (nr_pages--) {
2677 list_add(&page->lru, &page_list);
2756 if (vma->vm_flags & VM_MAYSHARE) {
2766 if (vma->vm_flags & VM_MAYSHARE) {
2779 if (vma->vm_flags & VM_MAYSHARE || mode == VMA_DEL_RESV)
2788 * Subtle - The reserve map for private mappings has the
2908 if (!(vma->vm_flags & VM_MAYSHARE))
2927 * alloc_and_dissolve_hugetlb_folio - Allocate a new folio and dissolve
2951 return -ENOMEM;
2966 * Fail with -EBUSY if not possible.
2970 ret = isolated ? 0 : -EBUSY;
3021 int ret = -EBUSY;
3039 * alloc_contig_range and them. Return -ENOMEM as this has the effect
3043 return -ENOMEM;
3073 return ERR_PTR(-ENOMEM);
3086 return ERR_PTR(-ENOSPC);
3130 h->resv_huge_pages--;
3132 list_add(&folio->lru, &h->hugepage_activelist);
3164 hugetlb_acct_memory(h, -rsv_adjust);
3184 return ERR_PTR(-ENOSPC);
3219 INIT_LIST_HEAD(&m->list);
3220 list_add(&m->list, &huge_boot_pages);
3221 m->hstate = h;
3236 struct hstate *h = m->hstate;
3252 * other side-effects, like CommitLimit going negative.
3263 for (i = 0; i < h->max_huge_pages_node[nid]; ++i) {
3279 if (i == h->max_huge_pages_node[nid])
3284 h->max_huge_pages_node[nid], buf, nid, i);
3285 h->max_huge_pages -= (h->max_huge_pages_node[nid] - i);
3286 h->max_huge_pages_node[nid] = i;
3303 if (h->max_huge_pages_node[i] > 0) {
3315 * Bit mask controlling how hard we retry per-node allocations.
3327 /* bit mask controlling how hard we retry per-node allocations */
3331 for (i = 0; i < h->max_huge_pages; ++i) {
3341 if (i < h->max_huge_pages) {
3346 h->max_huge_pages, buf, i);
3347 h->max_huge_pages = i;
3363 * h->demote_order is initially 0.
3364 * - We can not demote gigantic pages if runtime freeing
3366 * - If CMA allocation is possible, we can not demote
3371 if (hugetlb_cma_size && h->order <= HUGETLB_PAGE_ORDER)
3376 if (h2->order < h->order &&
3377 h2->order > h->demote_order)
3378 h->demote_order = h2->order;
3391 pr_info("HugeTLB: registered %s page size, pre-allocated %ld pages\n",
3392 buf, h->free_huge_pages);
3414 struct list_head *freel = &h->hugepage_freelists[i];
3416 if (count >= h->nr_huge_pages)
3421 list_add(&page->lru, &page_list);
3438 * Increment or decrement surplus_huge_pages. Keep node-specific counters
3439 * balanced by operating on them in a round-robin fashion.
3448 VM_BUG_ON(delta != -1 && delta != 1);
3452 if (h->surplus_huge_pages_node[node])
3457 if (h->surplus_huge_pages_node[node] <
3458 h->nr_huge_pages_node[node])
3465 h->surplus_huge_pages += delta;
3466 h->surplus_huge_pages_node[node] += delta;
3470 #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
3480 * Bit mask controlling how hard we retry per-node allocations.
3487 return -ENOMEM;
3493 mutex_lock(&h->resize_lock);
3506 count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
3527 mutex_unlock(&h->resize_lock);
3529 return -EINVAL;
3545 while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
3546 if (!adjust_pool_surplus(h, nodes_allowed, -1))
3567 /* Bail for signals. Probably ctrl-c from user */
3587 min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
3599 list_add(&page->lru, &page_list);
3612 h->max_huge_pages = persistent_huge_pages(h);
3614 mutex_unlock(&h->resize_lock);
3629 target_hstate = size_to_hstate(PAGE_SIZE << h->demote_order);
3634 rc = hugetlb_vmemmap_restore(h, &folio->page);
3654 * Note that we already hold h->resize_lock. To prevent deadlock,
3657 mutex_lock(&target_hstate->resize_lock);
3664 target_hstate->order);
3666 prep_compound_page(subpage, target_hstate->order);
3671 mutex_unlock(&target_hstate->resize_lock);
3679 h->max_huge_pages--;
3680 target_hstate->max_huge_pages +=
3695 if (!h->demote_order) {
3697 return -EINVAL; /* internal error */
3701 list_for_each_entry(folio, &h->hugepage_freelists[node], lru) {
3710 * Return -EBUSY so that caller will not retry.
3712 return -EBUSY;
3752 nr_huge_pages = h->nr_huge_pages;
3754 nr_huge_pages = h->nr_huge_pages_node[nid];
3767 return -EINVAL;
3825 * hstate attribute for optionally mempolicy-based constraint on persistent
3848 return sysfs_emit(buf, "%lu\n", h->nr_overcommit_huge_pages);
3859 return -EINVAL;
3866 h->nr_overcommit_huge_pages = input;
3882 free_huge_pages = h->free_huge_pages;
3884 free_huge_pages = h->free_huge_pages_node[nid];
3894 return sysfs_emit(buf, "%lu\n", h->resv_huge_pages);
3907 surplus_huge_pages = h->surplus_huge_pages;
3909 surplus_huge_pages = h->surplus_huge_pages_node[nid];
3938 mutex_lock(&h->resize_lock);
3947 nr_available = h->free_huge_pages_node[nid];
3949 nr_available = h->free_huge_pages;
3950 nr_available -= h->resv_huge_pages;
3958 nr_demote--;
3962 mutex_unlock(&h->resize_lock);
3974 unsigned long demote_size = (PAGE_SIZE << h->demote_order) / SZ_1K;
3991 return -EINVAL;
3992 demote_order = demote_hstate->order;
3994 return -EINVAL;
3998 if (demote_order >= h->order)
3999 return -EINVAL;
4002 mutex_lock(&h->resize_lock);
4003 h->demote_order = demote_order;
4004 mutex_unlock(&h->resize_lock);
4043 hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
4045 return -ENOMEM;
4054 if (h->demote_order) {
4058 pr_warn("HugeTLB unable to create demote interfaces for %s\n", h->name);
4073 * node_hstate/s - associate per node hstate attributes, via their kobjects,
4100 * kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj.
4101 * Returns node id via non-NULL nidp.
4111 if (nhs->hstate_kobjs[i] == kobj) {
4124 * No-op if no hstate attributes attached.
4129 struct node_hstate *nhs = &node_hstates[node->dev.id];
4131 if (!nhs->hugepages_kobj)
4136 struct kobject *hstate_kobj = nhs->hstate_kobjs[idx];
4140 if (h->demote_order)
4144 nhs->hstate_kobjs[idx] = NULL;
4147 kobject_put(nhs->hugepages_kobj);
4148 nhs->hugepages_kobj = NULL;
4154 * No-op if attributes already registered.
4159 struct node_hstate *nhs = &node_hstates[node->dev.id];
4165 if (nhs->hugepages_kobj)
4168 nhs->hugepages_kobj = kobject_create_and_add("hugepages",
4169 &node->dev.kobj);
4170 if (!nhs->hugepages_kobj)
4174 err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj,
4175 nhs->hstate_kobjs,
4179 h->name, node->dev.id);
4188 * devices of nodes that have memory. All on-line nodes should have
4204 *nidp = -1;
4233 pr_err("HugeTLB: Unable to add hstate %s", h->name);
4257 pr_warn("HugeTLB: huge pages not supported, ignoring associated command-line parameters\n");
4338 __mutex_init(&h->resize_lock, "resize mutex", &h->resize_key);
4339 h->order = order;
4340 h->mask = ~(huge_page_size(h) - 1);
4342 INIT_LIST_HEAD(&h->hugepage_freelists[i]);
4343 INIT_LIST_HEAD(&h->hugepage_activelist);
4344 h->next_nid_to_alloc = first_memory_node;
4345 h->next_nid_to_free = first_memory_node;
4346 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
4364 parsed_hstate->max_huge_pages = 0;
4365 memset(parsed_hstate->max_huge_pages_node, 0,
4366 sizeof(parsed_hstate->max_huge_pages_node));
4401 mhp = &parsed_hstate->max_huge_pages;
4428 parsed_hstate->max_huge_pages_node[node] = tmp;
4507 hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
4535 hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
4566 unsigned int *array = h->free_huge_pages_node;
4600 unsigned long tmp = h->max_huge_pages;
4604 return -EOPNOTSUPP;
4643 return -EOPNOTSUPP;
4645 tmp = h->nr_overcommit_huge_pages;
4648 return -EINVAL;
4657 h->nr_overcommit_huge_pages = tmp;
4713 unsigned long count = h->nr_huge_pages;
4725 h->free_huge_pages,
4726 h->resv_huge_pages,
4727 h->surplus_huge_pages,
4745 nid, h->nr_huge_pages_node[nid],
4746 nid, h->free_huge_pages_node[nid],
4747 nid, h->surplus_huge_pages_node[nid]);
4760 h->nr_huge_pages_node[nid],
4761 h->free_huge_pages_node[nid],
4762 h->surplus_huge_pages_node[nid],
4769 K(atomic_long_read(&mm->hugetlb_usage)));
4779 nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h);
4785 int ret = -ENOMEM;
4826 return_unused_surplus_pages(h, (unsigned long) -delta);
4848 kref_get(&resv->refs);
4857 if (vma->vm_flags & VM_MAYSHARE) {
4858 struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
4861 if (vma_lock->vma != vma) {
4862 vma->vm_private_data = NULL;
4885 start = vma_hugecache_offset(h, vma, vma->vm_start);
4886 end = vma_hugecache_offset(h, vma, vma->vm_end);
4888 reserve = (end - start) - region_count(resv, start, end);
4896 hugetlb_acct_memory(h, -gbl_reserve);
4899 kref_put(&resv->refs, resv_map_release);
4905 return -EINVAL;
4908 * PMD sharing is only possible for PUD_SIZE-aligned address ranges
4921 if (floor >= vma->vm_start && ceil <= vma->vm_end)
4935 * handle_mm_fault() to try to instantiate regular-sized pages in the
4968 vma->vm_page_prot)));
4971 vma->vm_page_prot));
4974 entry = arch_make_huge_pte(entry, shift, vma->vm_flags);
5019 pte_t newpte = make_huge_pte(vma, &new_folio->page, 1);
5025 set_huge_pte_at(vma->vm_mm, addr, ptep, newpte, sz);
5026 hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm);
5037 bool cow = is_cow_mapping(src_vma->vm_flags);
5047 src_vma->vm_start,
5048 src_vma->vm_end);
5051 raw_write_seqcount_begin(&src->write_protect_seq);
5063 for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) {
5072 ret = -ENOMEM;
5140 * When pre-allocating the page or copying data, we
5145 page_dup_file_rmap(&pte_folio->page, true);
5146 } else if (page_try_dup_anon_rmap(&pte_folio->page,
5211 raw_write_seqcount_end(&src->write_protect_seq);
5225 struct mm_struct *mm = vma->vm_mm;
5253 struct address_space *mapping = vma->vm_file->f_mapping;
5255 struct mm_struct *mm = vma->vm_mm;
5303 flush_hugetlb_tlb_range(vma, old_end - len, old_end);
5308 return len + old_addr - old_end;
5315 struct mm_struct *mm = vma->vm_mm;
5367 * If the pte was wr-protected by uffd-wp in any of the
5369 * drop the uffd-wp bit in this zap, then replace the
5406 /* Leave a uffd-wp pte marker if needed */
5445 if (!vma->vm_file) /* hugetlbfs_file_mmap error */
5450 if (vma->vm_file)
5451 i_mmap_lock_write(vma->vm_file->f_mapping);
5457 zap_flags_t zap_flags = details ? details->zap_flags : 0;
5459 if (!vma->vm_file) /* hugetlbfs_file_mmap error */
5477 if (vma->vm_file)
5478 i_mmap_unlock_write(vma->vm_file->f_mapping);
5488 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
5492 tlb_gather_mmu(&tlb, vma->vm_mm);
5519 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) +
5520 vma->vm_pgoff;
5521 mapping = vma->vm_file->f_mapping;
5529 vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) {
5539 if (iter_vma->vm_flags & VM_MAYSHARE)
5545 * areas. This is because a future no-page fault on this VMA
5577 * Never handle CoW for uffd-wp protected pages. It should be only
5578 * handled when the uffd-wp protection is removed.
5582 * uffd-wp bit first.
5588 * hugetlb does not support FOLL_FORCE-style write faults that keep the
5591 if (WARN_ON_ONCE(!unshare && !(vma->vm_flags & VM_WRITE)))
5595 if (vma->vm_flags & VM_MAYSHARE) {
5606 * If no-one else is actually using this page, we're the exclusive
5610 if (!PageAnonExclusive(&old_folio->page))
5611 page_move_anon_rmap(&old_folio->page, vma);
5619 PageAnonExclusive(&old_folio->page), &old_folio->page);
5652 struct address_space *mapping = vma->vm_file->f_mapping;
5671 unmap_ref_private(mm, vma, &old_folio->page, haddr);
5681 * race occurs while re-acquiring page table
5718 pte_t newpte = make_huge_pte(vma, &new_folio->page, !unshare);
5722 page_remove_rmap(&old_folio->page, vma, true);
5756 struct address_space *mapping = vma->vm_file->f_mapping;
5770 struct inode *inode = mapping->host;
5785 * by non-hugetlbfs specific code paths.
5789 spin_lock(&inode->i_lock);
5790 inode->i_blocks += blocks_per_huge_page(h);
5791 spin_unlock(&inode->i_lock);
5872 current->pid);
5883 size = i_size_read(mapping->host) >> huge_page_shift(h);
5890 * without pgtable lock, we need to re-test under
5893 * either changed or during-changing ptes and retry
5935 clear_huge_page(&folio->page, address, pages_per_huge_page(h));
5939 if (vma->vm_flags & VM_MAYSHARE) {
5943 * err can't be -EEXIST which implies someone
5995 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
6013 page_dup_file_rmap(&folio->page, true);
6014 new_pte = make_huge_pte(vma, &folio->page, ((vma->vm_flags & VM_WRITE)
6015 && (vma->vm_flags & VM_SHARED)));
6017 * If this pte was previously wr-protected, keep it wr-protected even
6025 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
6068 return hash & (num_fault_mutexes - 1);
6107 mapping = vma->vm_file->f_mapping;
6182 !(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(entry)) {
6201 /* Handle userfault-wp first, before trying to lock more pages */
6286 struct mm_struct *dst_mm = dst_vma->vm_mm;
6290 struct address_space *mapping = dst_vma->vm_file->f_mapping;
6293 int vm_shared = dst_vma->vm_flags & VM_SHARED;
6296 int ret = -ENOMEM;
6307 return -EEXIST;
6314 /* No need to invalidate - it was non-present before */
6322 ret = -EFAULT;
6329 * a non-missing case. Return -EEXIST.
6333 ret = -EEXIST;
6339 ret = -ENOMEM;
6348 ret = -ENOENT;
6360 ret = -ENOMEM;
6374 ret = -EEXIST;
6382 ret = -ENOMEM;
6404 size = i_size_read(mapping->host) >> huge_page_shift(h);
6405 ret = -EFAULT;
6423 ret = -EIO;
6429 * registered, we firstly wr-protect a none pte which has no page cache
6432 ret = -EEXIST;
6437 page_dup_file_rmap(&folio->page, true);
6442 * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY
6448 writable = dst_vma->vm_flags & VM_WRITE;
6450 _dst_pte = make_huge_pte(dst_vma, &folio->page, writable);
6467 /* No need to invalidate - it was non-present before */
6495 struct mm_struct *mm = vma->vm_mm;
6520 page = ERR_PTR(-EMLINK);
6528 * Note that page may be a sub-page, and with vmemmap
6543 *page_mask = (1U << huge_page_order(h)) - 1;
6556 page = ERR_PTR(-EFAULT);
6565 struct mm_struct *mm = vma->vm_mm;
6591 i_mmap_lock_write(vma->vm_file->f_mapping);
6602 * Userfaultfd wr-protect requires pgtable
6603 * pre-allocations to install pte markers.
6607 pages = -ENOMEM;
6614 * When uffd-wp is enabled on the vma, unshare
6658 /* Safe to modify directly (non-present->none). */
6666 pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
6676 /* Safe to modify directly (none->non-present). */
6701 i_mmap_unlock_write(vma->vm_file->f_mapping);
6705 return pages > 0 ? (pages << h->order) : pages;
6714 long chg = -1, add = -1;
6744 * to reserve the full area even if read-only as mprotect() may be
6745 * called to make the mapping read-write. Assume !vma is a shm mapping
6747 if (!vma || vma->vm_flags & VM_MAYSHARE) {
6755 chg = region_chg(resv_map, from, to, &regions_needed);
6762 chg = to - from;
6768 if (chg < 0)
6772 chg * pages_per_huge_page(h), &h_cg) < 0)
6775 if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) {
6787 gbl_reserve = hugepage_subpool_get_pages(spool, chg);
6803 * the reservation was consumed. Private mappings are per-VMA and
6809 if (!vma || vma->vm_flags & VM_MAYSHARE) {
6813 hugetlb_acct_memory(h, -gbl_reserve);
6815 } else if (unlikely(chg > add)) {
6827 * reference to h_cg->css. See comment below for detail.
6831 (chg - add) * pages_per_huge_page(h), h_cg);
6834 chg - add);
6835 hugetlb_acct_memory(h, -rsv_adjust);
6839 * h_cg->css. So we should release the reference held
6849 /* put back original number of pages, chg */
6850 (void)hugepage_subpool_put_pages(spool, chg);
6853 chg * pages_per_huge_page(h), h_cg);
6856 if (!vma || vma->vm_flags & VM_MAYSHARE)
6860 if (chg >= 0 && add < 0)
6863 kref_put(&resv_map->refs, resv_map_release);
6874 long chg = 0;
6883 chg = region_del(resv_map, start, end);
6889 if (chg < 0)
6890 return chg;
6893 spin_lock(&inode->i_lock);
6894 inode->i_blocks -= (blocks_per_huge_page(h) * freed);
6895 spin_unlock(&inode->i_lock);
6901 * Note that !resv_map implies freed == 0. So (chg - freed)
6904 gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
6905 hugetlb_acct_memory(h, -gbl_reserve);
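
The hugetlb_reserve_pages()/hugetlb_unreserve_pages() lines above tie the pieces together at mmap and truncate time: region_chg() reports chg, the subpool absorbs what its reserve can, hugetlb_acct_memory() charges the remainder (gbl_reserve) globally, and if region_add() later commits fewer pages than chg the excess is pushed back through hugepage_subpool_put_pages() and a negative hugetlb_acct_memory() call. A deliberately tiny numeric walk-through of that arithmetic (numbers invented; the subpool is assumed to have neither min= nor max= set, so it passes deltas straight through):

#include <stdio.h>

int main(void)
{
	long chg = 8;	/* pages region_chg() reported as not yet reserved  */
	long add = 6;	/* pages region_add() later committed; a racing task
			 * reserved 2 pages of the range in between         */
	long global = 0;

	global += chg;			/* hugetlb_acct_memory(h, gbl_reserve) */
	printf("charged %ld pages\n", global);

	if (add < chg) {
		global -= chg - add;	/* hugetlb_acct_memory(h, -rsv_adjust) */
		printf("returned %ld pages, %ld remain reserved\n",
		       chg - add, global);
	}
	return 0;
}

With a min= subpool, part of chg would instead be absorbed by rsv_hpages, as in the earlier subpool sketch.
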
6915 unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
6916 svma->vm_start;
6921 unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED_MASK;
6922 unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED_MASK;
6933 !svma->vm_private_data)
6951 if (!(vma->vm_flags & VM_MAYSHARE))
6953 if (!vma->vm_private_data) /* vma lock required for sharing */
6968 unsigned long v_start = ALIGN(vma->vm_start, PUD_SIZE),
6969 v_end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
6975 if (!(vma->vm_flags & VM_MAYSHARE) || !(v_end > v_start) ||
6992 * pud has to be populated inside the same i_mmap_rwsem section - otherwise
6999 struct address_space *mapping = vma->vm_file->f_mapping;
7000 pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
7001 vma->vm_pgoff;
7008 vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
7026 spin_lock(&mm->page_table_lock);
7034 spin_unlock(&mm->page_table_lock);
7057 i_mmap_assert_write_locked(vma->vm_file->f_mapping);
7131 * huge_pte_offset() - Walk the page table to resolve the hugepage
7156 /* must be pud huge, non-present or none */
7163 /* must be pmd huge, non-present or none */
7169 * page in a page table page mapping size. Used to skip non-present
7179 return P4D_SIZE - PUD_SIZE;
7181 return PUD_SIZE - PMD_SIZE;
7193 return PUD_SIZE - PMD_SIZE;
7216 list_move_tail(&folio->lru, list);
7235 ret = -EBUSY;
7256 list_move_tail(&folio->lru, &(folio_hstate(folio))->hugepage_activelist);
7266 set_page_owner_migrate_reason(&new_folio->page, reason);
7272 * the temporary status.
7274 * Also note that we have to transfer the per-node surplus state
7276 * the per-node's.
7287 * There is no need to transfer the per-node surplus state
7293 if (h->surplus_huge_pages_node[old_nid]) {
7294 h->surplus_huge_pages_node[old_nid]--;
7295 h->surplus_huge_pages_node[new_nid]++;
7307 struct mm_struct *mm = vma->vm_mm;
7313 if (!(vma->vm_flags & VM_MAYSHARE))
7328 i_mmap_lock_write(vma->vm_file->f_mapping);
7338 i_mmap_unlock_write(vma->vm_file->f_mapping);
7353 hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE),
7354 ALIGN_DOWN(vma->vm_end, PUD_SIZE));
7416 hugetlb_cma_size -= hugetlb_cma_size_in_node[nid];
7424 hugetlb_cma_size -= hugetlb_cma_size_in_node[nid];
7463 size = min(per_node, hugetlb_cma_size - reserved);