1 #define _GNU_SOURCE
2 #include <ctype.h>
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <limits.h>
6 #include <dirent.h>
7 #include <signal.h>
8 #include <stdio.h>
9 #include <stdlib.h>
10 #include <stdbool.h>
11 #include <string.h>
12 #include <unistd.h>
13 
14 #include <linux/mman.h>
15 #include <sys/mman.h>
16 #include <sys/wait.h>
17 #include <sys/types.h>
18 #include <sys/stat.h>
19 #include <sys/sysmacros.h>
20 #include <sys/vfs.h>
21 
22 #include "linux/magic.h"
23 
24 #include "vm_util.h"
25 
/* Address (1 GiB) at which the test mappings in this file are requested. */
#define BASE_ADDR ((void *)(1UL << 30))
/*
 * Sizes used throughout the tests. None of them is initialised in this
 * part of the file; they are expected to be assigned before the tests run.
 */
static unsigned long hpage_pmd_size;
static unsigned long page_size;
static int hpage_pmd_nr;

/* Prefix prepended to every THP sysfs knob name by the read/write helpers. */
#define THP_SYSFS "/sys/kernel/mm/transparent_hugepage/"
/* smaps file scanned by check_swap(). */
#define PID_SMAPS "/proc/self/smaps"
/* Name of the file created inside the user-supplied directory. */
#define TEST_FILE "collapse_test_file"

/* Size of the line and pattern buffers used when scanning smaps. */
#define MAX_LINE_LENGTH 500
36 
/*
 * Kind of memory backing a test mapping. get_finfo() stores VMA_SHMEM in
 * finfo.type when the test directory is on tmpfs and VMA_FILE otherwise.
 */
enum vma_type {
	VMA_ANON,
	VMA_FILE,
	VMA_SHMEM,
};
42 
/*
 * Operations that let the same test body run against different kinds of
 * memory (see the __anon_ops, __file_ops and __shmem_ops tables).
 */
struct mem_ops {
	/* Create a mapping large enough for nr_hpages huge pages. */
	void *(*setup_area)(int nr_hpages);
	/* Release a mapping of @size bytes obtained from setup_area(). */
	void (*cleanup_area)(void *p, unsigned long size);
	/* Populate the byte offsets [start, end) of the mapping at @p. */
	void (*fault)(void *p, unsigned long start, unsigned long end);
	/*
	 * Huge-page check for @addr; implemented by the vm_util helpers.
	 * NOTE(review): presumably true when exactly nr_hpages huge pages
	 * back the area - confirm against vm_util.
	 */
	bool (*check_huge)(void *addr, int nr_hpages);
	/* Short label for this memory type. */
	const char *name;
};
50 
/*
 * Selected memory operations. They are not assigned in this part of the
 * file; file_ops is tested for NULL before the read-ahead setting is
 * saved or written.
 */
static struct mem_ops *file_ops;
static struct mem_ops *anon_ops;
static struct mem_ops *shmem_ops;
54 
/* Describes one way of triggering a collapse (khugepaged or madvise). */
struct collapse_context {
	/*
	 * Attempt to collapse nr_hpages huge pages at @p and report via
	 * success()/fail() whether the outcome matched @expect.
	 */
	void (*collapse)(const char *msg, char *p, int nr_hpages,
			 struct mem_ops *ops, bool expect);
	/*
	 * When false, tests that exceed a max_ptes_* limit still expect the
	 * collapse to succeed.
	 */
	bool enforce_pte_scan_limits;
	/* Short label for this context. */
	const char *name;
};
61 
/* Selected collapse contexts; not assigned in this part of the file. */
static struct collapse_context *khugepaged_context;
static struct collapse_context *madvise_context;
64 
/* State of the file used by the file-backed and shmem tests. */
struct file_info {
	/* Directory passed to get_finfo(). */
	const char *dir;
	/* "<dir>/" TEST_FILE, built by get_finfo(). */
	char path[PATH_MAX];
	/* VMA_SHMEM when @dir is on tmpfs, VMA_FILE otherwise. */
	enum vma_type type;
	/* Descriptor of the currently mapped file or memfd. */
	int fd;
	/* sysfs read_ahead_kb path of the owning block device (VMA_FILE only). */
	char dev_queue_read_ahead_path[PATH_MAX];
};

/* Single global instance filled in by get_finfo() and the setup helpers. */
static struct file_info finfo;
74 
/* Values of the "enabled" THP sysfs knob. */
enum thp_enabled {
	THP_ALWAYS,
	THP_MADVISE,
	THP_NEVER,
};

/*
 * String form of each enum thp_enabled value, indexed by the enum.
 * NULL-terminated so read_string() can walk it.
 */
static const char *thp_enabled_strings[] = {
	"always",
	"madvise",
	"never",
	NULL
};
87 
/* Values of the "defrag" THP sysfs knob. */
enum thp_defrag {
	THP_DEFRAG_ALWAYS,
	THP_DEFRAG_DEFER,
	THP_DEFRAG_DEFER_MADVISE,
	THP_DEFRAG_MADVISE,
	THP_DEFRAG_NEVER,
};

/*
 * String form of each enum thp_defrag value, indexed by the enum.
 * NULL-terminated so read_string() can walk it.
 */
static const char *thp_defrag_strings[] = {
	"always",
	"defer",
	"defer+madvise",
	"madvise",
	"never",
	NULL
};
104 
/* Values of the "shmem_enabled" THP sysfs knob. */
enum shmem_enabled {
	SHMEM_ALWAYS,
	SHMEM_WITHIN_SIZE,
	SHMEM_ADVISE,
	SHMEM_NEVER,
	SHMEM_DENY,
	SHMEM_FORCE,
};

/*
 * String form of each enum shmem_enabled value, indexed by the enum.
 * NULL-terminated so read_string() can walk it.
 */
static const char *shmem_enabled_strings[] = {
	"always",
	"within_size",
	"advise",
	"never",
	"deny",
	"force",
	NULL
};
123 
/*
 * Snapshot of the knobs under THP_SYSFS "khugepaged/". Each field is read
 * from and written to the sysfs file of the same name.
 */
struct khugepaged_settings {
	bool defrag;
	unsigned int alloc_sleep_millisecs;
	unsigned int scan_sleep_millisecs;
	unsigned int max_ptes_none;
	unsigned int max_ptes_swap;
	unsigned int max_ptes_shared;
	unsigned long pages_to_scan;
};
133 
/* Complete set of tunables that write_settings() applies. */
struct settings {
	enum thp_enabled thp_enabled;
	enum thp_defrag thp_defrag;
	enum shmem_enabled shmem_enabled;
	bool use_zero_page;
	struct khugepaged_settings khugepaged;
	/* Block device read-ahead; only used for VMA_FILE with file_ops set. */
	unsigned long read_ahead_kb;
};
142 
/* Settings captured by save_settings() and written back on exit. */
static struct settings saved_settings;
/* When true, restore_settings() exits without writing saved_settings. */
static bool skip_settings_restore;

/* Incremented by fail(); used as the exit code in restore_settings(). */
static int exit_status;
147 
/* Print @msg in green (ANSI colour 32) on stdout, followed by a newline. */
static void success(const char *msg)
{
	fprintf(stdout, " \e[32m%s\e[0m\n", msg);
}
152 
fail(const char * msg)153 static void fail(const char *msg)
154 {
155 	printf(" \e[31m%s\e[0m\n", msg);
156 	exit_status++;
157 }
158 
/* Print @msg in yellow (ANSI colour 33) on stdout, followed by a newline. */
static void skip(const char *msg)
{
	fprintf(stdout, " \e[33m%s\e[0m\n", msg);
}
163 
/*
 * Read at most buflen - 1 bytes from the file at @path into @buf and
 * NUL-terminate the result.
 *
 * Returns the number of bytes read, or 0 when the file cannot be opened or
 * read() yields fewer than one byte. Never returns a negative value.
 */
static int read_file(const char *path, char *buf, size_t buflen)
{
	ssize_t nbytes;
	int fd = open(path, O_RDONLY);

	if (fd == -1)
		return 0;

	nbytes = read(fd, buf, buflen - 1);
	close(fd);
	if (nbytes < 1)
		return 0;

	buf[nbytes] = '\0';
	return (unsigned int) nbytes;
}
184 
/*
 * Write the first buflen - 1 bytes of @buf to the existing file at @path
 * with a single write() call.
 *
 * Returns the number of bytes written. Prints a message and exits the
 * process if the file cannot be opened or fewer than one byte is written.
 */
static int write_file(const char *path, const char *buf, size_t buflen)
{
	ssize_t nbytes;
	int fd = open(path, O_WRONLY);

	if (fd == -1) {
		printf("open(%s)\n", path);
		exit(EXIT_FAILURE);
	}

	nbytes = write(fd, buf, buflen - 1);
	close(fd);
	if (nbytes < 1) {
		printf("write(%s)\n", buf);
		exit(EXIT_FAILURE);
	}

	return (unsigned int) nbytes;
}
207 
read_string(const char * name,const char * strings[])208 static int read_string(const char *name, const char *strings[])
209 {
210 	char path[PATH_MAX];
211 	char buf[256];
212 	char *c;
213 	int ret;
214 
215 	ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
216 	if (ret >= PATH_MAX) {
217 		printf("%s: Pathname is too long\n", __func__);
218 		exit(EXIT_FAILURE);
219 	}
220 
221 	if (!read_file(path, buf, sizeof(buf))) {
222 		perror(path);
223 		exit(EXIT_FAILURE);
224 	}
225 
226 	c = strchr(buf, '[');
227 	if (!c) {
228 		printf("%s: Parse failure\n", __func__);
229 		exit(EXIT_FAILURE);
230 	}
231 
232 	c++;
233 	memmove(buf, c, sizeof(buf) - (c - buf));
234 
235 	c = strchr(buf, ']');
236 	if (!c) {
237 		printf("%s: Parse failure\n", __func__);
238 		exit(EXIT_FAILURE);
239 	}
240 	*c = '\0';
241 
242 	ret = 0;
243 	while (strings[ret]) {
244 		if (!strcmp(strings[ret], buf))
245 			return ret;
246 		ret++;
247 	}
248 
249 	printf("Failed to parse %s\n", name);
250 	exit(EXIT_FAILURE);
251 }
252 
write_string(const char * name,const char * val)253 static void write_string(const char *name, const char *val)
254 {
255 	char path[PATH_MAX];
256 	int ret;
257 
258 	ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
259 	if (ret >= PATH_MAX) {
260 		printf("%s: Pathname is too long\n", __func__);
261 		exit(EXIT_FAILURE);
262 	}
263 
264 	if (!write_file(path, val, strlen(val) + 1)) {
265 		perror(path);
266 		exit(EXIT_FAILURE);
267 	}
268 }
269 
/*
 * Read the file at @path and parse its contents as a base-10 unsigned long.
 * Exits the process if the file cannot be read.
 */
static const unsigned long _read_num(const char *path)
{
	char buf[21];

	/*
	 * read_file() signals failure by returning 0, never a negative value,
	 * so test for zero; otherwise strtoul() would parse an uninitialised
	 * buffer.
	 */
	if (!read_file(path, buf, sizeof(buf))) {
		perror("read_file(read_num)");
		exit(EXIT_FAILURE);
	}

	return strtoul(buf, NULL, 10);
}
281 
read_num(const char * name)282 static const unsigned long read_num(const char *name)
283 {
284 	char path[PATH_MAX];
285 	int ret;
286 
287 	ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
288 	if (ret >= PATH_MAX) {
289 		printf("%s: Pathname is too long\n", __func__);
290 		exit(EXIT_FAILURE);
291 	}
292 	return _read_num(path);
293 }
294 
/*
 * Write @num in decimal to the file at @path.
 * Exits the process if the write fails.
 */
static void _write_num(const char *path, unsigned long num)
{
	/* 20 decimal digits cover a 64-bit unsigned long, plus the NUL. */
	char buf[21];

	/* %lu matches the unsigned argument; snprintf bounds the write. */
	snprintf(buf, sizeof(buf), "%lu", num);
	if (!write_file(path, buf, strlen(buf) + 1)) {
		perror(path);
		exit(EXIT_FAILURE);
	}
}
305 
write_num(const char * name,unsigned long num)306 static void write_num(const char *name, unsigned long num)
307 {
308 	char path[PATH_MAX];
309 	int ret;
310 
311 	ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
312 	if (ret >= PATH_MAX) {
313 		printf("%s: Pathname is too long\n", __func__);
314 		exit(EXIT_FAILURE);
315 	}
316 	_write_num(path, num);
317 }
318 
write_settings(struct settings * settings)319 static void write_settings(struct settings *settings)
320 {
321 	struct khugepaged_settings *khugepaged = &settings->khugepaged;
322 
323 	write_string("enabled", thp_enabled_strings[settings->thp_enabled]);
324 	write_string("defrag", thp_defrag_strings[settings->thp_defrag]);
325 	write_string("shmem_enabled",
326 			shmem_enabled_strings[settings->shmem_enabled]);
327 	write_num("use_zero_page", settings->use_zero_page);
328 
329 	write_num("khugepaged/defrag", khugepaged->defrag);
330 	write_num("khugepaged/alloc_sleep_millisecs",
331 			khugepaged->alloc_sleep_millisecs);
332 	write_num("khugepaged/scan_sleep_millisecs",
333 			khugepaged->scan_sleep_millisecs);
334 	write_num("khugepaged/max_ptes_none", khugepaged->max_ptes_none);
335 	write_num("khugepaged/max_ptes_swap", khugepaged->max_ptes_swap);
336 	write_num("khugepaged/max_ptes_shared", khugepaged->max_ptes_shared);
337 	write_num("khugepaged/pages_to_scan", khugepaged->pages_to_scan);
338 
339 	if (file_ops && finfo.type == VMA_FILE)
340 		_write_num(finfo.dev_queue_read_ahead_path,
341 			   settings->read_ahead_kb);
342 }
343 
/* Capacity of the settings stack used by push_settings()/pop_settings(). */
#define MAX_SETTINGS_DEPTH 4
static struct settings settings_stack[MAX_SETTINGS_DEPTH];
/* Number of entries currently on settings_stack; the top is at index - 1. */
static int settings_index;
347 
current_settings(void)348 static struct settings *current_settings(void)
349 {
350 	if (!settings_index) {
351 		printf("Fail: No settings set");
352 		exit(EXIT_FAILURE);
353 	}
354 	return settings_stack + settings_index - 1;
355 }
356 
push_settings(struct settings * settings)357 static void push_settings(struct settings *settings)
358 {
359 	if (settings_index >= MAX_SETTINGS_DEPTH) {
360 		printf("Fail: Settings stack exceeded");
361 		exit(EXIT_FAILURE);
362 	}
363 	settings_stack[settings_index++] = *settings;
364 	write_settings(current_settings());
365 }
366 
pop_settings(void)367 static void pop_settings(void)
368 {
369 	if (settings_index <= 0) {
370 		printf("Fail: Settings stack empty");
371 		exit(EXIT_FAILURE);
372 	}
373 	--settings_index;
374 	write_settings(current_settings());
375 }
376 
restore_settings(int sig)377 static void restore_settings(int sig)
378 {
379 	if (skip_settings_restore)
380 		goto out;
381 
382 	printf("Restore THP and khugepaged settings...");
383 	write_settings(&saved_settings);
384 	success("OK");
385 	if (sig)
386 		exit(EXIT_FAILURE);
387 out:
388 	exit(exit_status);
389 }
390 
save_settings(void)391 static void save_settings(void)
392 {
393 	printf("Save THP and khugepaged settings...");
394 	saved_settings = (struct settings) {
395 		.thp_enabled = read_string("enabled", thp_enabled_strings),
396 		.thp_defrag = read_string("defrag", thp_defrag_strings),
397 		.shmem_enabled =
398 			read_string("shmem_enabled", shmem_enabled_strings),
399 		.use_zero_page = read_num("use_zero_page"),
400 	};
401 	saved_settings.khugepaged = (struct khugepaged_settings) {
402 		.defrag = read_num("khugepaged/defrag"),
403 		.alloc_sleep_millisecs =
404 			read_num("khugepaged/alloc_sleep_millisecs"),
405 		.scan_sleep_millisecs =
406 			read_num("khugepaged/scan_sleep_millisecs"),
407 		.max_ptes_none = read_num("khugepaged/max_ptes_none"),
408 		.max_ptes_swap = read_num("khugepaged/max_ptes_swap"),
409 		.max_ptes_shared = read_num("khugepaged/max_ptes_shared"),
410 		.pages_to_scan = read_num("khugepaged/pages_to_scan"),
411 	};
412 	if (file_ops && finfo.type == VMA_FILE)
413 		saved_settings.read_ahead_kb =
414 				_read_num(finfo.dev_queue_read_ahead_path);
415 
416 	success("OK");
417 
418 	signal(SIGTERM, restore_settings);
419 	signal(SIGINT, restore_settings);
420 	signal(SIGHUP, restore_settings);
421 	signal(SIGQUIT, restore_settings);
422 }
423 
get_finfo(const char * dir)424 static void get_finfo(const char *dir)
425 {
426 	struct stat path_stat;
427 	struct statfs fs;
428 	char buf[1 << 10];
429 	char path[PATH_MAX];
430 	char *str, *end;
431 
432 	finfo.dir = dir;
433 	stat(finfo.dir, &path_stat);
434 	if (!S_ISDIR(path_stat.st_mode)) {
435 		printf("%s: Not a directory (%s)\n", __func__, finfo.dir);
436 		exit(EXIT_FAILURE);
437 	}
438 	if (snprintf(finfo.path, sizeof(finfo.path), "%s/" TEST_FILE,
439 		     finfo.dir) >= sizeof(finfo.path)) {
440 		printf("%s: Pathname is too long\n", __func__);
441 		exit(EXIT_FAILURE);
442 	}
443 	if (statfs(finfo.dir, &fs)) {
444 		perror("statfs()");
445 		exit(EXIT_FAILURE);
446 	}
447 	finfo.type = fs.f_type == TMPFS_MAGIC ? VMA_SHMEM : VMA_FILE;
448 	if (finfo.type == VMA_SHMEM)
449 		return;
450 
451 	/* Find owning device's queue/read_ahead_kb control */
452 	if (snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/uevent",
453 		     major(path_stat.st_dev), minor(path_stat.st_dev))
454 	    >= sizeof(path)) {
455 		printf("%s: Pathname is too long\n", __func__);
456 		exit(EXIT_FAILURE);
457 	}
458 	if (read_file(path, buf, sizeof(buf)) < 0) {
459 		perror("read_file(read_num)");
460 		exit(EXIT_FAILURE);
461 	}
462 	if (strstr(buf, "DEVTYPE=disk")) {
463 		/* Found it */
464 		if (snprintf(finfo.dev_queue_read_ahead_path,
465 			     sizeof(finfo.dev_queue_read_ahead_path),
466 			     "/sys/dev/block/%d:%d/queue/read_ahead_kb",
467 			     major(path_stat.st_dev), minor(path_stat.st_dev))
468 		    >= sizeof(finfo.dev_queue_read_ahead_path)) {
469 			printf("%s: Pathname is too long\n", __func__);
470 			exit(EXIT_FAILURE);
471 		}
472 		return;
473 	}
474 	if (!strstr(buf, "DEVTYPE=partition")) {
475 		printf("%s: Unknown device type: %s\n", __func__, path);
476 		exit(EXIT_FAILURE);
477 	}
478 	/*
479 	 * Partition of block device - need to find actual device.
480 	 * Using naming convention that devnameN is partition of
481 	 * device devname.
482 	 */
483 	str = strstr(buf, "DEVNAME=");
484 	if (!str) {
485 		printf("%s: Could not read: %s", __func__, path);
486 		exit(EXIT_FAILURE);
487 	}
488 	str += 8;
489 	end = str;
490 	while (*end) {
491 		if (isdigit(*end)) {
492 			*end = '\0';
493 			if (snprintf(finfo.dev_queue_read_ahead_path,
494 				     sizeof(finfo.dev_queue_read_ahead_path),
495 				     "/sys/block/%s/queue/read_ahead_kb",
496 				     str) >= sizeof(finfo.dev_queue_read_ahead_path)) {
497 				printf("%s: Pathname is too long\n", __func__);
498 				exit(EXIT_FAILURE);
499 			}
500 			return;
501 		}
502 		++end;
503 	}
504 	printf("%s: Could not read: %s\n", __func__, path);
505 	exit(EXIT_FAILURE);
506 }
507 
check_swap(void * addr,unsigned long size)508 static bool check_swap(void *addr, unsigned long size)
509 {
510 	bool swap = false;
511 	int ret;
512 	FILE *fp;
513 	char buffer[MAX_LINE_LENGTH];
514 	char addr_pattern[MAX_LINE_LENGTH];
515 
516 	ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-",
517 		       (unsigned long) addr);
518 	if (ret >= MAX_LINE_LENGTH) {
519 		printf("%s: Pattern is too long\n", __func__);
520 		exit(EXIT_FAILURE);
521 	}
522 
523 
524 	fp = fopen(PID_SMAPS, "r");
525 	if (!fp) {
526 		printf("%s: Failed to open file %s\n", __func__, PID_SMAPS);
527 		exit(EXIT_FAILURE);
528 	}
529 	if (!check_for_pattern(fp, addr_pattern, buffer, sizeof(buffer)))
530 		goto err_out;
531 
532 	ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "Swap:%19ld kB",
533 		       size >> 10);
534 	if (ret >= MAX_LINE_LENGTH) {
535 		printf("%s: Pattern is too long\n", __func__);
536 		exit(EXIT_FAILURE);
537 	}
538 	/*
539 	 * Fetch the Swap: in the same block and check whether it got
540 	 * the expected number of hugeepages next.
541 	 */
542 	if (!check_for_pattern(fp, "Swap:", buffer, sizeof(buffer)))
543 		goto err_out;
544 
545 	if (strncmp(buffer, addr_pattern, strlen(addr_pattern)))
546 		goto err_out;
547 
548 	swap = true;
549 err_out:
550 	fclose(fp);
551 	return swap;
552 }
553 
alloc_mapping(int nr)554 static void *alloc_mapping(int nr)
555 {
556 	void *p;
557 
558 	p = mmap(BASE_ADDR, nr * hpage_pmd_size, PROT_READ | PROT_WRITE,
559 		 MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
560 	if (p != BASE_ADDR) {
561 		printf("Failed to allocate VMA at %p\n", BASE_ADDR);
562 		exit(EXIT_FAILURE);
563 	}
564 
565 	return p;
566 }
567 
fill_memory(int * p,unsigned long start,unsigned long end)568 static void fill_memory(int *p, unsigned long start, unsigned long end)
569 {
570 	int i;
571 
572 	for (i = start / page_size; i < end / page_size; i++)
573 		p[i * page_size / sizeof(*p)] = i + 0xdead0000;
574 }
575 
576 /*
577  * MADV_COLLAPSE is a best-effort request and may fail if an internal
578  * resource is temporarily unavailable, in which case it will set errno to
579  * EAGAIN.  In such a case, immediately reattempt the operation one more
580  * time.
581  */
madvise_collapse_retry(void * p,unsigned long size)582 static int madvise_collapse_retry(void *p, unsigned long size)
583 {
584 	bool retry = true;
585 	int ret;
586 
587 retry:
588 	ret = madvise(p, size, MADV_COLLAPSE);
589 	if (ret && errno == EAGAIN && retry) {
590 		retry = false;
591 		goto retry;
592 	}
593 	return ret;
594 }
595 
596 /*
597  * Returns pmd-mapped hugepage in VMA marked VM_HUGEPAGE, filled with
598  * validate_memory()'able contents.
599  */
alloc_hpage(struct mem_ops * ops)600 static void *alloc_hpage(struct mem_ops *ops)
601 {
602 	void *p = ops->setup_area(1);
603 
604 	ops->fault(p, 0, hpage_pmd_size);
605 
606 	/*
607 	 * VMA should be neither VM_HUGEPAGE nor VM_NOHUGEPAGE.
608 	 * The latter is ineligible for collapse by MADV_COLLAPSE
609 	 * while the former might cause MADV_COLLAPSE to race with
610 	 * khugepaged on low-load system (like a test machine), which
611 	 * would cause MADV_COLLAPSE to fail with EAGAIN.
612 	 */
613 	printf("Allocate huge page...");
614 	if (madvise_collapse_retry(p, hpage_pmd_size)) {
615 		perror("madvise(MADV_COLLAPSE)");
616 		exit(EXIT_FAILURE);
617 	}
618 	if (!ops->check_huge(p, 1)) {
619 		perror("madvise(MADV_COLLAPSE)");
620 		exit(EXIT_FAILURE);
621 	}
622 	if (madvise(p, hpage_pmd_size, MADV_HUGEPAGE)) {
623 		perror("madvise(MADV_HUGEPAGE)");
624 		exit(EXIT_FAILURE);
625 	}
626 	success("OK");
627 	return p;
628 }
629 
validate_memory(int * p,unsigned long start,unsigned long end)630 static void validate_memory(int *p, unsigned long start, unsigned long end)
631 {
632 	int i;
633 
634 	for (i = start / page_size; i < end / page_size; i++) {
635 		if (p[i * page_size / sizeof(*p)] != i + 0xdead0000) {
636 			printf("Page %d is corrupted: %#x\n",
637 					i, p[i * page_size / sizeof(*p)]);
638 			exit(EXIT_FAILURE);
639 		}
640 	}
641 }
642 
/* mem_ops.setup_area for anonymous memory: a mapping from alloc_mapping(). */
static void *anon_setup_area(int nr_hpages)
{
	void *area = alloc_mapping(nr_hpages);

	return area;
}
647 
/* mem_ops.cleanup_area for anonymous memory: unmap @size bytes at @p. */
static void anon_cleanup_area(void *p, unsigned long size)
{
	(void)munmap(p, size);
}
652 
/*
 * mem_ops.fault for writable memory: populate the pages covering the byte
 * offsets [start, end) of @p by writing the fill_memory() pattern.
 */
static void anon_fault(void *p, unsigned long start, unsigned long end)
{
	int *words = p;

	fill_memory(words, start, end);
}
657 
anon_check_huge(void * addr,int nr_hpages)658 static bool anon_check_huge(void *addr, int nr_hpages)
659 {
660 	return check_huge_anon(addr, nr_hpages, hpage_pmd_size);
661 }
662 
file_setup_area(int nr_hpages)663 static void *file_setup_area(int nr_hpages)
664 {
665 	int fd;
666 	void *p;
667 	unsigned long size;
668 
669 	unlink(finfo.path);  /* Cleanup from previous failed tests */
670 	printf("Creating %s for collapse%s...", finfo.path,
671 	       finfo.type == VMA_SHMEM ? " (tmpfs)" : "");
672 	fd = open(finfo.path, O_DSYNC | O_CREAT | O_RDWR | O_TRUNC | O_EXCL,
673 		  777);
674 	if (fd < 0) {
675 		perror("open()");
676 		exit(EXIT_FAILURE);
677 	}
678 
679 	size = nr_hpages * hpage_pmd_size;
680 	p = alloc_mapping(nr_hpages);
681 	fill_memory(p, 0, size);
682 	write(fd, p, size);
683 	close(fd);
684 	munmap(p, size);
685 	success("OK");
686 
687 	printf("Opening %s read only for collapse...", finfo.path);
688 	finfo.fd = open(finfo.path, O_RDONLY, 777);
689 	if (finfo.fd < 0) {
690 		perror("open()");
691 		exit(EXIT_FAILURE);
692 	}
693 	p = mmap(BASE_ADDR, size, PROT_READ | PROT_EXEC,
694 		 MAP_PRIVATE, finfo.fd, 0);
695 	if (p == MAP_FAILED || p != BASE_ADDR) {
696 		perror("mmap()");
697 		exit(EXIT_FAILURE);
698 	}
699 
700 	/* Drop page cache */
701 	write_file("/proc/sys/vm/drop_caches", "3", 2);
702 	success("OK");
703 	return p;
704 }
705 
file_cleanup_area(void * p,unsigned long size)706 static void file_cleanup_area(void *p, unsigned long size)
707 {
708 	munmap(p, size);
709 	close(finfo.fd);
710 	unlink(finfo.path);
711 }
712 
file_fault(void * p,unsigned long start,unsigned long end)713 static void file_fault(void *p, unsigned long start, unsigned long end)
714 {
715 	if (madvise(((char *)p) + start, end - start, MADV_POPULATE_READ)) {
716 		perror("madvise(MADV_POPULATE_READ");
717 		exit(EXIT_FAILURE);
718 	}
719 }
720 
file_check_huge(void * addr,int nr_hpages)721 static bool file_check_huge(void *addr, int nr_hpages)
722 {
723 	switch (finfo.type) {
724 	case VMA_FILE:
725 		return check_huge_file(addr, nr_hpages, hpage_pmd_size);
726 	case VMA_SHMEM:
727 		return check_huge_shmem(addr, nr_hpages, hpage_pmd_size);
728 	default:
729 		exit(EXIT_FAILURE);
730 		return false;
731 	}
732 }
733 
shmem_setup_area(int nr_hpages)734 static void *shmem_setup_area(int nr_hpages)
735 {
736 	void *p;
737 	unsigned long size = nr_hpages * hpage_pmd_size;
738 
739 	finfo.fd = memfd_create("khugepaged-selftest-collapse-shmem", 0);
740 	if (finfo.fd < 0)  {
741 		perror("memfd_create()");
742 		exit(EXIT_FAILURE);
743 	}
744 	if (ftruncate(finfo.fd, size)) {
745 		perror("ftruncate()");
746 		exit(EXIT_FAILURE);
747 	}
748 	p = mmap(BASE_ADDR, size, PROT_READ | PROT_WRITE, MAP_SHARED, finfo.fd,
749 		 0);
750 	if (p != BASE_ADDR) {
751 		perror("mmap()");
752 		exit(EXIT_FAILURE);
753 	}
754 	return p;
755 }
756 
shmem_cleanup_area(void * p,unsigned long size)757 static void shmem_cleanup_area(void *p, unsigned long size)
758 {
759 	munmap(p, size);
760 	close(finfo.fd);
761 }
762 
shmem_check_huge(void * addr,int nr_hpages)763 static bool shmem_check_huge(void *addr, int nr_hpages)
764 {
765 	return check_huge_shmem(addr, nr_hpages, hpage_pmd_size);
766 }
767 
/* Memory operations for private anonymous mappings. */
static struct mem_ops __anon_ops = {
	.setup_area = &anon_setup_area,
	.cleanup_area = &anon_cleanup_area,
	.fault = &anon_fault,
	.check_huge = &anon_check_huge,
	.name = "anon",
};
775 
/*
 * Memory operations for the test file at finfo.path. check_huge dispatches
 * on finfo.type, so this table also serves a test file located on tmpfs.
 */
static struct mem_ops __file_ops = {
	.setup_area = &file_setup_area,
	.cleanup_area = &file_cleanup_area,
	.fault = &file_fault,
	.check_huge = &file_check_huge,
	.name = "file",
};
783 
/*
 * Memory operations for memfd-backed shared memory. The mapping is
 * writable, so faulting reuses anon_fault().
 */
static struct mem_ops __shmem_ops = {
	.setup_area = &shmem_setup_area,
	.cleanup_area = &shmem_cleanup_area,
	.fault = &anon_fault,
	.check_huge = &shmem_check_huge,
	.name = "shmem",
};
791 
__madvise_collapse(const char * msg,char * p,int nr_hpages,struct mem_ops * ops,bool expect)792 static void __madvise_collapse(const char *msg, char *p, int nr_hpages,
793 			       struct mem_ops *ops, bool expect)
794 {
795 	int ret;
796 	struct settings settings = *current_settings();
797 
798 	printf("%s...", msg);
799 
800 	/*
801 	 * Prevent khugepaged interference and tests that MADV_COLLAPSE
802 	 * ignores /sys/kernel/mm/transparent_hugepage/enabled
803 	 */
804 	settings.thp_enabled = THP_NEVER;
805 	settings.shmem_enabled = SHMEM_NEVER;
806 	push_settings(&settings);
807 
808 	/* Clear VM_NOHUGEPAGE */
809 	madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE);
810 	ret = madvise_collapse_retry(p, nr_hpages * hpage_pmd_size);
811 	if (((bool)ret) == expect)
812 		fail("Fail: Bad return value");
813 	else if (!ops->check_huge(p, expect ? nr_hpages : 0))
814 		fail("Fail: check_huge()");
815 	else
816 		success("OK");
817 
818 	pop_settings();
819 }
820 
madvise_collapse(const char * msg,char * p,int nr_hpages,struct mem_ops * ops,bool expect)821 static void madvise_collapse(const char *msg, char *p, int nr_hpages,
822 			     struct mem_ops *ops, bool expect)
823 {
824 	/* Sanity check */
825 	if (!ops->check_huge(p, 0)) {
826 		printf("Unexpected huge page\n");
827 		exit(EXIT_FAILURE);
828 	}
829 	__madvise_collapse(msg, p, nr_hpages, ops, expect);
830 }
831 
832 #define TICK 500000
wait_for_scan(const char * msg,char * p,int nr_hpages,struct mem_ops * ops)833 static bool wait_for_scan(const char *msg, char *p, int nr_hpages,
834 			  struct mem_ops *ops)
835 {
836 	int full_scans;
837 	int timeout = 6; /* 3 seconds */
838 
839 	/* Sanity check */
840 	if (!ops->check_huge(p, 0)) {
841 		printf("Unexpected huge page\n");
842 		exit(EXIT_FAILURE);
843 	}
844 
845 	madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE);
846 
847 	/* Wait until the second full_scan completed */
848 	full_scans = read_num("khugepaged/full_scans") + 2;
849 
850 	printf("%s...", msg);
851 	while (timeout--) {
852 		if (ops->check_huge(p, nr_hpages))
853 			break;
854 		if (read_num("khugepaged/full_scans") >= full_scans)
855 			break;
856 		printf(".");
857 		usleep(TICK);
858 	}
859 
860 	madvise(p, nr_hpages * hpage_pmd_size, MADV_NOHUGEPAGE);
861 
862 	return timeout == -1;
863 }
864 
khugepaged_collapse(const char * msg,char * p,int nr_hpages,struct mem_ops * ops,bool expect)865 static void khugepaged_collapse(const char *msg, char *p, int nr_hpages,
866 				struct mem_ops *ops, bool expect)
867 {
868 	if (wait_for_scan(msg, p, nr_hpages, ops)) {
869 		if (expect)
870 			fail("Timeout");
871 		else
872 			success("OK");
873 		return;
874 	}
875 
876 	/*
877 	 * For file and shmem memory, khugepaged only retracts pte entries after
878 	 * putting the new hugepage in the page cache. The hugepage must be
879 	 * subsequently refaulted to install the pmd mapping for the mm.
880 	 */
881 	if (ops != &__anon_ops)
882 		ops->fault(p, 0, nr_hpages * hpage_pmd_size);
883 
884 	if (ops->check_huge(p, expect ? nr_hpages : 0))
885 		success("OK");
886 	else
887 		fail("Fail");
888 }
889 
/* Collapse via khugepaged; tests expect the max_ptes_* limits to apply. */
static struct collapse_context __khugepaged_context = {
	.collapse = &khugepaged_collapse,
	.enforce_pte_scan_limits = true,
	.name = "khugepaged",
};
895 
/*
 * Collapse via MADV_COLLAPSE; tests expect collapse to succeed even when a
 * max_ptes_* limit is exceeded.
 */
static struct collapse_context __madvise_context = {
	.collapse = &madvise_collapse,
	.enforce_pte_scan_limits = false,
	.name = "madvise",
};
901 
is_tmpfs(struct mem_ops * ops)902 static bool is_tmpfs(struct mem_ops *ops)
903 {
904 	return ops == &__file_ops && finfo.type == VMA_SHMEM;
905 }
906 
alloc_at_fault(void)907 static void alloc_at_fault(void)
908 {
909 	struct settings settings = *current_settings();
910 	char *p;
911 
912 	settings.thp_enabled = THP_ALWAYS;
913 	push_settings(&settings);
914 
915 	p = alloc_mapping(1);
916 	*p = 1;
917 	printf("Allocate huge page on fault...");
918 	if (check_huge_anon(p, 1, hpage_pmd_size))
919 		success("OK");
920 	else
921 		fail("Fail");
922 
923 	pop_settings();
924 
925 	madvise(p, page_size, MADV_DONTNEED);
926 	printf("Split huge PMD on MADV_DONTNEED...");
927 	if (check_huge_anon(p, 0, hpage_pmd_size))
928 		success("OK");
929 	else
930 		fail("Fail");
931 	munmap(p, hpage_pmd_size);
932 }
933 
collapse_full(struct collapse_context * c,struct mem_ops * ops)934 static void collapse_full(struct collapse_context *c, struct mem_ops *ops)
935 {
936 	void *p;
937 	int nr_hpages = 4;
938 	unsigned long size = nr_hpages * hpage_pmd_size;
939 
940 	p = ops->setup_area(nr_hpages);
941 	ops->fault(p, 0, size);
942 	c->collapse("Collapse multiple fully populated PTE table", p, nr_hpages,
943 		    ops, true);
944 	validate_memory(p, 0, size);
945 	ops->cleanup_area(p, size);
946 }
947 
collapse_empty(struct collapse_context * c,struct mem_ops * ops)948 static void collapse_empty(struct collapse_context *c, struct mem_ops *ops)
949 {
950 	void *p;
951 
952 	p = ops->setup_area(1);
953 	c->collapse("Do not collapse empty PTE table", p, 1, ops, false);
954 	ops->cleanup_area(p, hpage_pmd_size);
955 }
956 
collapse_single_pte_entry(struct collapse_context * c,struct mem_ops * ops)957 static void collapse_single_pte_entry(struct collapse_context *c, struct mem_ops *ops)
958 {
959 	void *p;
960 
961 	p = ops->setup_area(1);
962 	ops->fault(p, 0, page_size);
963 	c->collapse("Collapse PTE table with single PTE entry present", p,
964 		    1, ops, true);
965 	ops->cleanup_area(p, hpage_pmd_size);
966 }
967 
collapse_max_ptes_none(struct collapse_context * c,struct mem_ops * ops)968 static void collapse_max_ptes_none(struct collapse_context *c, struct mem_ops *ops)
969 {
970 	int max_ptes_none = hpage_pmd_nr / 2;
971 	struct settings settings = *current_settings();
972 	void *p;
973 
974 	settings.khugepaged.max_ptes_none = max_ptes_none;
975 	push_settings(&settings);
976 
977 	p = ops->setup_area(1);
978 
979 	if (is_tmpfs(ops)) {
980 		/* shmem pages always in the page cache */
981 		printf("tmpfs...");
982 		skip("Skip");
983 		goto skip;
984 	}
985 
986 	ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size);
987 	c->collapse("Maybe collapse with max_ptes_none exceeded", p, 1,
988 		    ops, !c->enforce_pte_scan_limits);
989 	validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size);
990 
991 	if (c->enforce_pte_scan_limits) {
992 		ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size);
993 		c->collapse("Collapse with max_ptes_none PTEs empty", p, 1, ops,
994 			    true);
995 		validate_memory(p, 0,
996 				(hpage_pmd_nr - max_ptes_none) * page_size);
997 	}
998 skip:
999 	ops->cleanup_area(p, hpage_pmd_size);
1000 	pop_settings();
1001 }
1002 
collapse_swapin_single_pte(struct collapse_context * c,struct mem_ops * ops)1003 static void collapse_swapin_single_pte(struct collapse_context *c, struct mem_ops *ops)
1004 {
1005 	void *p;
1006 
1007 	p = ops->setup_area(1);
1008 	ops->fault(p, 0, hpage_pmd_size);
1009 
1010 	printf("Swapout one page...");
1011 	if (madvise(p, page_size, MADV_PAGEOUT)) {
1012 		perror("madvise(MADV_PAGEOUT)");
1013 		exit(EXIT_FAILURE);
1014 	}
1015 	if (check_swap(p, page_size)) {
1016 		success("OK");
1017 	} else {
1018 		fail("Fail");
1019 		goto out;
1020 	}
1021 
1022 	c->collapse("Collapse with swapping in single PTE entry", p, 1, ops,
1023 		    true);
1024 	validate_memory(p, 0, hpage_pmd_size);
1025 out:
1026 	ops->cleanup_area(p, hpage_pmd_size);
1027 }
1028 
collapse_max_ptes_swap(struct collapse_context * c,struct mem_ops * ops)1029 static void collapse_max_ptes_swap(struct collapse_context *c, struct mem_ops *ops)
1030 {
1031 	int max_ptes_swap = read_num("khugepaged/max_ptes_swap");
1032 	void *p;
1033 
1034 	p = ops->setup_area(1);
1035 	ops->fault(p, 0, hpage_pmd_size);
1036 
1037 	printf("Swapout %d of %d pages...", max_ptes_swap + 1, hpage_pmd_nr);
1038 	if (madvise(p, (max_ptes_swap + 1) * page_size, MADV_PAGEOUT)) {
1039 		perror("madvise(MADV_PAGEOUT)");
1040 		exit(EXIT_FAILURE);
1041 	}
1042 	if (check_swap(p, (max_ptes_swap + 1) * page_size)) {
1043 		success("OK");
1044 	} else {
1045 		fail("Fail");
1046 		goto out;
1047 	}
1048 
1049 	c->collapse("Maybe collapse with max_ptes_swap exceeded", p, 1, ops,
1050 		    !c->enforce_pte_scan_limits);
1051 	validate_memory(p, 0, hpage_pmd_size);
1052 
1053 	if (c->enforce_pte_scan_limits) {
1054 		ops->fault(p, 0, hpage_pmd_size);
1055 		printf("Swapout %d of %d pages...", max_ptes_swap,
1056 		       hpage_pmd_nr);
1057 		if (madvise(p, max_ptes_swap * page_size, MADV_PAGEOUT)) {
1058 			perror("madvise(MADV_PAGEOUT)");
1059 			exit(EXIT_FAILURE);
1060 		}
1061 		if (check_swap(p, max_ptes_swap * page_size)) {
1062 			success("OK");
1063 		} else {
1064 			fail("Fail");
1065 			goto out;
1066 		}
1067 
1068 		c->collapse("Collapse with max_ptes_swap pages swapped out", p,
1069 			    1, ops, true);
1070 		validate_memory(p, 0, hpage_pmd_size);
1071 	}
1072 out:
1073 	ops->cleanup_area(p, hpage_pmd_size);
1074 }
1075 
collapse_single_pte_entry_compound(struct collapse_context * c,struct mem_ops * ops)1076 static void collapse_single_pte_entry_compound(struct collapse_context *c, struct mem_ops *ops)
1077 {
1078 	void *p;
1079 
1080 	p = alloc_hpage(ops);
1081 
1082 	if (is_tmpfs(ops)) {
1083 		/* MADV_DONTNEED won't evict tmpfs pages */
1084 		printf("tmpfs...");
1085 		skip("Skip");
1086 		goto skip;
1087 	}
1088 
1089 	madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
1090 	printf("Split huge page leaving single PTE mapping compound page...");
1091 	madvise(p + page_size, hpage_pmd_size - page_size, MADV_DONTNEED);
1092 	if (ops->check_huge(p, 0))
1093 		success("OK");
1094 	else
1095 		fail("Fail");
1096 
1097 	c->collapse("Collapse PTE table with single PTE mapping compound page",
1098 		    p, 1, ops, true);
1099 	validate_memory(p, 0, page_size);
1100 skip:
1101 	ops->cleanup_area(p, hpage_pmd_size);
1102 }
1103 
collapse_full_of_compound(struct collapse_context * c,struct mem_ops * ops)1104 static void collapse_full_of_compound(struct collapse_context *c, struct mem_ops *ops)
1105 {
1106 	void *p;
1107 
1108 	p = alloc_hpage(ops);
1109 	printf("Split huge page leaving single PTE page table full of compound pages...");
1110 	madvise(p, page_size, MADV_NOHUGEPAGE);
1111 	madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
1112 	if (ops->check_huge(p, 0))
1113 		success("OK");
1114 	else
1115 		fail("Fail");
1116 
1117 	c->collapse("Collapse PTE table full of compound pages", p, 1, ops,
1118 		    true);
1119 	validate_memory(p, 0, hpage_pmd_size);
1120 	ops->cleanup_area(p, hpage_pmd_size);
1121 }
1122 
collapse_compound_extreme(struct collapse_context * c,struct mem_ops * ops)1123 static void collapse_compound_extreme(struct collapse_context *c, struct mem_ops *ops)
1124 {
1125 	void *p;
1126 	int i;
1127 
1128 	p = ops->setup_area(1);
1129 	for (i = 0; i < hpage_pmd_nr; i++) {
1130 		printf("\rConstruct PTE page table full of different PTE-mapped compound pages %3d/%d...",
1131 				i + 1, hpage_pmd_nr);
1132 
1133 		madvise(BASE_ADDR, hpage_pmd_size, MADV_HUGEPAGE);
1134 		ops->fault(BASE_ADDR, 0, hpage_pmd_size);
1135 		if (!ops->check_huge(BASE_ADDR, 1)) {
1136 			printf("Failed to allocate huge page\n");
1137 			exit(EXIT_FAILURE);
1138 		}
1139 		madvise(BASE_ADDR, hpage_pmd_size, MADV_NOHUGEPAGE);
1140 
1141 		p = mremap(BASE_ADDR - i * page_size,
1142 				i * page_size + hpage_pmd_size,
1143 				(i + 1) * page_size,
1144 				MREMAP_MAYMOVE | MREMAP_FIXED,
1145 				BASE_ADDR + 2 * hpage_pmd_size);
1146 		if (p == MAP_FAILED) {
1147 			perror("mremap+unmap");
1148 			exit(EXIT_FAILURE);
1149 		}
1150 
1151 		p = mremap(BASE_ADDR + 2 * hpage_pmd_size,
1152 				(i + 1) * page_size,
1153 				(i + 1) * page_size + hpage_pmd_size,
1154 				MREMAP_MAYMOVE | MREMAP_FIXED,
1155 				BASE_ADDR - (i + 1) * page_size);
1156 		if (p == MAP_FAILED) {
1157 			perror("mremap+alloc");
1158 			exit(EXIT_FAILURE);
1159 		}
1160 	}
1161 
1162 	ops->cleanup_area(BASE_ADDR, hpage_pmd_size);
1163 	ops->fault(p, 0, hpage_pmd_size);
1164 	if (!ops->check_huge(p, 1))
1165 		success("OK");
1166 	else
1167 		fail("Fail");
1168 
1169 	c->collapse("Collapse PTE table full of different compound pages", p, 1,
1170 		    ops, true);
1171 
1172 	validate_memory(p, 0, hpage_pmd_size);
1173 	ops->cleanup_area(p, hpage_pmd_size);
1174 }
1175 
collapse_fork(struct collapse_context * c,struct mem_ops * ops)1176 static void collapse_fork(struct collapse_context *c, struct mem_ops *ops)
1177 {
1178 	int wstatus;
1179 	void *p;
1180 
1181 	p = ops->setup_area(1);
1182 
1183 	printf("Allocate small page...");
1184 	ops->fault(p, 0, page_size);
1185 	if (ops->check_huge(p, 0))
1186 		success("OK");
1187 	else
1188 		fail("Fail");
1189 
1190 	printf("Share small page over fork()...");
1191 	if (!fork()) {
1192 		/* Do not touch settings on child exit */
1193 		skip_settings_restore = true;
1194 		exit_status = 0;
1195 
1196 		if (ops->check_huge(p, 0))
1197 			success("OK");
1198 		else
1199 			fail("Fail");
1200 
1201 		ops->fault(p, page_size, 2 * page_size);
1202 		c->collapse("Collapse PTE table with single page shared with parent process",
1203 			    p, 1, ops, true);
1204 
1205 		validate_memory(p, 0, page_size);
1206 		ops->cleanup_area(p, hpage_pmd_size);
1207 		exit(exit_status);
1208 	}
1209 
1210 	wait(&wstatus);
1211 	exit_status += WEXITSTATUS(wstatus);
1212 
1213 	printf("Check if parent still has small page...");
1214 	if (ops->check_huge(p, 0))
1215 		success("OK");
1216 	else
1217 		fail("Fail");
1218 	validate_memory(p, 0, page_size);
1219 	ops->cleanup_area(p, hpage_pmd_size);
1220 }
1221 
collapse_fork_compound(struct collapse_context * c,struct mem_ops * ops)1222 static void collapse_fork_compound(struct collapse_context *c, struct mem_ops *ops)
1223 {
1224 	int wstatus;
1225 	void *p;
1226 
1227 	p = alloc_hpage(ops);
1228 	printf("Share huge page over fork()...");
1229 	if (!fork()) {
1230 		/* Do not touch settings on child exit */
1231 		skip_settings_restore = true;
1232 		exit_status = 0;
1233 
1234 		if (ops->check_huge(p, 1))
1235 			success("OK");
1236 		else
1237 			fail("Fail");
1238 
1239 		printf("Split huge page PMD in child process...");
1240 		madvise(p, page_size, MADV_NOHUGEPAGE);
1241 		madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
1242 		if (ops->check_huge(p, 0))
1243 			success("OK");
1244 		else
1245 			fail("Fail");
1246 		ops->fault(p, 0, page_size);
1247 
1248 		write_num("khugepaged/max_ptes_shared", hpage_pmd_nr - 1);
1249 		c->collapse("Collapse PTE table full of compound pages in child",
1250 			    p, 1, ops, true);
1251 		write_num("khugepaged/max_ptes_shared",
1252 			  current_settings()->khugepaged.max_ptes_shared);
1253 
1254 		validate_memory(p, 0, hpage_pmd_size);
1255 		ops->cleanup_area(p, hpage_pmd_size);
1256 		exit(exit_status);
1257 	}
1258 
1259 	wait(&wstatus);
1260 	exit_status += WEXITSTATUS(wstatus);
1261 
1262 	printf("Check if parent still has huge page...");
1263 	if (ops->check_huge(p, 1))
1264 		success("OK");
1265 	else
1266 		fail("Fail");
1267 	validate_memory(p, 0, hpage_pmd_size);
1268 	ops->cleanup_area(p, hpage_pmd_size);
1269 }
1270 
collapse_max_ptes_shared(struct collapse_context * c,struct mem_ops * ops)1271 static void collapse_max_ptes_shared(struct collapse_context *c, struct mem_ops *ops)
1272 {
1273 	int max_ptes_shared = read_num("khugepaged/max_ptes_shared");
1274 	int wstatus;
1275 	void *p;
1276 
1277 	p = alloc_hpage(ops);
1278 	printf("Share huge page over fork()...");
1279 	if (!fork()) {
1280 		/* Do not touch settings on child exit */
1281 		skip_settings_restore = true;
1282 		exit_status = 0;
1283 
1284 		if (ops->check_huge(p, 1))
1285 			success("OK");
1286 		else
1287 			fail("Fail");
1288 
1289 		printf("Trigger CoW on page %d of %d...",
1290 				hpage_pmd_nr - max_ptes_shared - 1, hpage_pmd_nr);
1291 		ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size);
1292 		if (ops->check_huge(p, 0))
1293 			success("OK");
1294 		else
1295 			fail("Fail");
1296 
1297 		c->collapse("Maybe collapse with max_ptes_shared exceeded", p,
1298 			    1, ops, !c->enforce_pte_scan_limits);
1299 
1300 		if (c->enforce_pte_scan_limits) {
1301 			printf("Trigger CoW on page %d of %d...",
1302 			       hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr);
1303 			ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared) *
1304 				    page_size);
1305 			if (ops->check_huge(p, 0))
1306 				success("OK");
1307 			else
1308 				fail("Fail");
1309 
1310 			c->collapse("Collapse with max_ptes_shared PTEs shared",
1311 				    p, 1, ops, true);
1312 		}
1313 
1314 		validate_memory(p, 0, hpage_pmd_size);
1315 		ops->cleanup_area(p, hpage_pmd_size);
1316 		exit(exit_status);
1317 	}
1318 
1319 	wait(&wstatus);
1320 	exit_status += WEXITSTATUS(wstatus);
1321 
1322 	printf("Check if parent still has huge page...");
1323 	if (ops->check_huge(p, 1))
1324 		success("OK");
1325 	else
1326 		fail("Fail");
1327 	validate_memory(p, 0, hpage_pmd_size);
1328 	ops->cleanup_area(p, hpage_pmd_size);
1329 }
1330 
madvise_collapse_existing_thps(struct collapse_context * c,struct mem_ops * ops)1331 static void madvise_collapse_existing_thps(struct collapse_context *c,
1332 					   struct mem_ops *ops)
1333 {
1334 	void *p;
1335 
1336 	p = ops->setup_area(1);
1337 	ops->fault(p, 0, hpage_pmd_size);
1338 	c->collapse("Collapse fully populated PTE table...", p, 1, ops, true);
1339 	validate_memory(p, 0, hpage_pmd_size);
1340 
1341 	/* c->collapse() will find a hugepage and complain - call directly. */
1342 	__madvise_collapse("Re-collapse PMD-mapped hugepage", p, 1, ops, true);
1343 	validate_memory(p, 0, hpage_pmd_size);
1344 	ops->cleanup_area(p, hpage_pmd_size);
1345 }
1346 
1347 /*
1348  * Test race with khugepaged where page tables have been retracted and
1349  * pmd cleared.
1350  */
madvise_retracted_page_tables(struct collapse_context * c,struct mem_ops * ops)1351 static void madvise_retracted_page_tables(struct collapse_context *c,
1352 					  struct mem_ops *ops)
1353 {
1354 	void *p;
1355 	int nr_hpages = 1;
1356 	unsigned long size = nr_hpages * hpage_pmd_size;
1357 
1358 	p = ops->setup_area(nr_hpages);
1359 	ops->fault(p, 0, size);
1360 
1361 	/* Let khugepaged collapse and leave pmd cleared */
1362 	if (wait_for_scan("Collapse and leave PMD cleared", p, nr_hpages,
1363 			  ops)) {
1364 		fail("Timeout");
1365 		return;
1366 	}
1367 	success("OK");
1368 	c->collapse("Install huge PMD from page cache", p, nr_hpages, ops,
1369 		    true);
1370 	validate_memory(p, 0, size);
1371 	ops->cleanup_area(p, size);
1372 }
1373 
/*
 * Print the command-line help to stderr and terminate with exit status 1.
 * This function does not return.
 */
static void usage(void)
{
	static const char help[] =
		"\nUsage: ./khugepaged <test type> [dir]\n\n"
		"\t<test type>\t: <context>:<mem_type>\n"
		"\t<context>\t: [all|khugepaged|madvise]\n"
		"\t<mem_type>\t: [all|anon|file|shmem]\n"
		"\n\t\"file,all\" mem_type requires [dir] argument\n"
		"\n\t\"file,all\" mem_type requires kernel built with\n"
		"\tCONFIG_READ_ONLY_THP_FOR_FS=y\n"
		"\n\tif [dir] is a (sub)directory of a tmpfs mount, tmpfs must be\n"
		"\tmounted with huge=madvise option for khugepaged tests to work\n";

	fputs(help, stderr);
	exit(1);
}
1387 
parse_test_type(int argc,const char ** argv)1388 static void parse_test_type(int argc, const char **argv)
1389 {
1390 	char *buf;
1391 	const char *token;
1392 
1393 	if (argc == 1) {
1394 		/* Backwards compatibility */
1395 		khugepaged_context =  &__khugepaged_context;
1396 		madvise_context =  &__madvise_context;
1397 		anon_ops = &__anon_ops;
1398 		return;
1399 	}
1400 
1401 	buf = strdup(argv[1]);
1402 	token = strsep(&buf, ":");
1403 
1404 	if (!strcmp(token, "all")) {
1405 		khugepaged_context =  &__khugepaged_context;
1406 		madvise_context =  &__madvise_context;
1407 	} else if (!strcmp(token, "khugepaged")) {
1408 		khugepaged_context =  &__khugepaged_context;
1409 	} else if (!strcmp(token, "madvise")) {
1410 		madvise_context =  &__madvise_context;
1411 	} else {
1412 		usage();
1413 	}
1414 
1415 	if (!buf)
1416 		usage();
1417 
1418 	if (!strcmp(buf, "all")) {
1419 		file_ops =  &__file_ops;
1420 		anon_ops = &__anon_ops;
1421 		shmem_ops = &__shmem_ops;
1422 	} else if (!strcmp(buf, "anon")) {
1423 		anon_ops = &__anon_ops;
1424 	} else if (!strcmp(buf, "file")) {
1425 		file_ops =  &__file_ops;
1426 	} else if (!strcmp(buf, "shmem")) {
1427 		shmem_ops = &__shmem_ops;
1428 	} else {
1429 		usage();
1430 	}
1431 
1432 	if (!file_ops)
1433 		return;
1434 
1435 	if (argc != 3)
1436 		usage();
1437 }
1438 
main(int argc,const char ** argv)1439 int main(int argc, const char **argv)
1440 {
1441 	struct settings default_settings = {
1442 		.thp_enabled = THP_MADVISE,
1443 		.thp_defrag = THP_DEFRAG_ALWAYS,
1444 		.shmem_enabled = SHMEM_ADVISE,
1445 		.use_zero_page = 0,
1446 		.khugepaged = {
1447 			.defrag = 1,
1448 			.alloc_sleep_millisecs = 10,
1449 			.scan_sleep_millisecs = 10,
1450 		},
1451 		/*
1452 		 * When testing file-backed memory, the collapse path
1453 		 * looks at how many pages are found in the page cache, not
1454 		 * what pages are mapped. Disable read ahead optimization so
1455 		 * pages don't find their way into the page cache unless
1456 		 * we mem_ops->fault() them in.
1457 		 */
1458 		.read_ahead_kb = 0,
1459 	};
1460 
1461 	parse_test_type(argc, argv);
1462 
1463 	if (file_ops)
1464 		get_finfo(argv[2]);
1465 
1466 	setbuf(stdout, NULL);
1467 
1468 	page_size = getpagesize();
1469 	hpage_pmd_size = read_pmd_pagesize();
1470 	if (!hpage_pmd_size) {
1471 		printf("Reading PMD pagesize failed");
1472 		exit(EXIT_FAILURE);
1473 	}
1474 	hpage_pmd_nr = hpage_pmd_size / page_size;
1475 
1476 	default_settings.khugepaged.max_ptes_none = hpage_pmd_nr - 1;
1477 	default_settings.khugepaged.max_ptes_swap = hpage_pmd_nr / 8;
1478 	default_settings.khugepaged.max_ptes_shared = hpage_pmd_nr / 2;
1479 	default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8;
1480 
1481 	save_settings();
1482 	push_settings(&default_settings);
1483 
1484 	alloc_at_fault();
1485 
1486 #define TEST(t, c, o) do { \
1487 	if (c && o) { \
1488 		printf("\nRun test: " #t " (%s:%s)\n", c->name, o->name); \
1489 		t(c, o); \
1490 	} \
1491 	} while (0)
1492 
1493 	TEST(collapse_full, khugepaged_context, anon_ops);
1494 	TEST(collapse_full, khugepaged_context, file_ops);
1495 	TEST(collapse_full, khugepaged_context, shmem_ops);
1496 	TEST(collapse_full, madvise_context, anon_ops);
1497 	TEST(collapse_full, madvise_context, file_ops);
1498 	TEST(collapse_full, madvise_context, shmem_ops);
1499 
1500 	TEST(collapse_empty, khugepaged_context, anon_ops);
1501 	TEST(collapse_empty, madvise_context, anon_ops);
1502 
1503 	TEST(collapse_single_pte_entry, khugepaged_context, anon_ops);
1504 	TEST(collapse_single_pte_entry, khugepaged_context, file_ops);
1505 	TEST(collapse_single_pte_entry, khugepaged_context, shmem_ops);
1506 	TEST(collapse_single_pte_entry, madvise_context, anon_ops);
1507 	TEST(collapse_single_pte_entry, madvise_context, file_ops);
1508 	TEST(collapse_single_pte_entry, madvise_context, shmem_ops);
1509 
1510 	TEST(collapse_max_ptes_none, khugepaged_context, anon_ops);
1511 	TEST(collapse_max_ptes_none, khugepaged_context, file_ops);
1512 	TEST(collapse_max_ptes_none, madvise_context, anon_ops);
1513 	TEST(collapse_max_ptes_none, madvise_context, file_ops);
1514 
1515 	TEST(collapse_single_pte_entry_compound, khugepaged_context, anon_ops);
1516 	TEST(collapse_single_pte_entry_compound, khugepaged_context, file_ops);
1517 	TEST(collapse_single_pte_entry_compound, madvise_context, anon_ops);
1518 	TEST(collapse_single_pte_entry_compound, madvise_context, file_ops);
1519 
1520 	TEST(collapse_full_of_compound, khugepaged_context, anon_ops);
1521 	TEST(collapse_full_of_compound, khugepaged_context, file_ops);
1522 	TEST(collapse_full_of_compound, khugepaged_context, shmem_ops);
1523 	TEST(collapse_full_of_compound, madvise_context, anon_ops);
1524 	TEST(collapse_full_of_compound, madvise_context, file_ops);
1525 	TEST(collapse_full_of_compound, madvise_context, shmem_ops);
1526 
1527 	TEST(collapse_compound_extreme, khugepaged_context, anon_ops);
1528 	TEST(collapse_compound_extreme, madvise_context, anon_ops);
1529 
1530 	TEST(collapse_swapin_single_pte, khugepaged_context, anon_ops);
1531 	TEST(collapse_swapin_single_pte, madvise_context, anon_ops);
1532 
1533 	TEST(collapse_max_ptes_swap, khugepaged_context, anon_ops);
1534 	TEST(collapse_max_ptes_swap, madvise_context, anon_ops);
1535 
1536 	TEST(collapse_fork, khugepaged_context, anon_ops);
1537 	TEST(collapse_fork, madvise_context, anon_ops);
1538 
1539 	TEST(collapse_fork_compound, khugepaged_context, anon_ops);
1540 	TEST(collapse_fork_compound, madvise_context, anon_ops);
1541 
1542 	TEST(collapse_max_ptes_shared, khugepaged_context, anon_ops);
1543 	TEST(collapse_max_ptes_shared, madvise_context, anon_ops);
1544 
1545 	TEST(madvise_collapse_existing_thps, madvise_context, anon_ops);
1546 	TEST(madvise_collapse_existing_thps, madvise_context, file_ops);
1547 	TEST(madvise_collapse_existing_thps, madvise_context, shmem_ops);
1548 
1549 	TEST(madvise_retracted_page_tables, madvise_context, file_ops);
1550 	TEST(madvise_retracted_page_tables, madvise_context, shmem_ops);
1551 
1552 	restore_settings(0);
1553 }
1554