1 #define _GNU_SOURCE
2 #include <ctype.h>
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <limits.h>
6 #include <dirent.h>
7 #include <signal.h>
8 #include <stdio.h>
9 #include <stdlib.h>
10 #include <stdbool.h>
11 #include <string.h>
12 #include <unistd.h>
13 
14 #include <sys/mman.h>
15 #include <sys/wait.h>
16 #include <sys/types.h>
17 #include <sys/stat.h>
18 #include <sys/sysmacros.h>
19 #include <sys/vfs.h>
20 
21 #include "linux/magic.h"
22 
23 #include "vm_util.h"
24 
/* Fixed placement hint (1 GiB) used for every test mapping. */
#define BASE_ADDR ((void *)(1UL << 30))
static unsigned long hpage_pmd_size;	/* PMD hugepage size in bytes; presumably set during startup (not in view) */
static unsigned long page_size;		/* base page size in bytes; presumably set during startup (not in view) */
static int hpage_pmd_nr;		/* base pages per PMD hugepage; presumably set during startup (not in view) */

#define THP_SYSFS "/sys/kernel/mm/transparent_hugepage/"
#define PID_SMAPS "/proc/self/smaps"
#define TEST_FILE "collapse_test_file"

#define MAX_LINE_LENGTH 500
35 
/* Kind of memory backing a test area. */
enum vma_type {
	VMA_ANON,	/* private anonymous memory */
	VMA_FILE,	/* read-only mapping of a regular file */
	VMA_SHMEM,	/* shmem/tmpfs-backed memory */
};

/*
 * Backend-specific hooks so the same collapse tests can run against
 * anon, file and shmem memory.
 */
struct mem_ops {
	/* Map an area large enough for nr_hpages PMD hugepages at BASE_ADDR. */
	void *(*setup_area)(int nr_hpages);
	/* Tear down whatever setup_area() created. */
	void (*cleanup_area)(void *p, unsigned long size);
	/* Populate offsets [start, end) of the area at p. */
	void (*fault)(void *p, unsigned long start, unsigned long end);
	/* True if addr is backed by exactly nr_hpages PMD hugepages. */
	bool (*check_huge)(void *addr, int nr_hpages);
	const char *name;	/* label used in test output */
};

/* Active backends; presumably assigned during setup (not in this view). */
static struct mem_ops *file_ops;
static struct mem_ops *anon_ops;
static struct mem_ops *shmem_ops;

/* One way of triggering collapse: khugepaged scan or MADV_COLLAPSE. */
struct collapse_context {
	/* Attempt collapse of nr_hpages at p; expect says whether it should succeed. */
	void (*collapse)(const char *msg, char *p, int nr_hpages,
			 struct mem_ops *ops, bool expect);
	/* True if this path honors khugepaged's max_ptes_* scan limits. */
	bool enforce_pte_scan_limits;
	const char *name;
};

/* Active contexts; presumably assigned during setup (not in this view). */
static struct collapse_context *khugepaged_context;
static struct collapse_context *madvise_context;

/* State describing the file/directory used by the file-backed tests. */
struct file_info {
	const char *dir;	/* directory supplied by the caller */
	char path[PATH_MAX];	/* dir + "/" TEST_FILE */
	enum vma_type type;	/* VMA_SHMEM if dir is on tmpfs, else VMA_FILE */
	int fd;			/* fd of the mapped test file or memfd */
	char dev_queue_read_ahead_path[PATH_MAX]; /* sysfs read_ahead_kb control of owning disk */
};

static struct file_info finfo;
73 
/* Values of /sys/kernel/mm/transparent_hugepage/enabled. */
enum thp_enabled {
	THP_ALWAYS,
	THP_MADVISE,
	THP_NEVER,
};

/* Sysfs tokens matching enum thp_enabled; NULL-terminated for read_string(). */
static const char *thp_enabled_strings[] = {
	"always",
	"madvise",
	"never",
	NULL
};

/* Values of /sys/kernel/mm/transparent_hugepage/defrag. */
enum thp_defrag {
	THP_DEFRAG_ALWAYS,
	THP_DEFRAG_DEFER,
	THP_DEFRAG_DEFER_MADVISE,
	THP_DEFRAG_MADVISE,
	THP_DEFRAG_NEVER,
};

/* Sysfs tokens matching enum thp_defrag; NULL-terminated for read_string(). */
static const char *thp_defrag_strings[] = {
	"always",
	"defer",
	"defer+madvise",
	"madvise",
	"never",
	NULL
};

/* Values of /sys/kernel/mm/transparent_hugepage/shmem_enabled. */
enum shmem_enabled {
	SHMEM_ALWAYS,
	SHMEM_WITHIN_SIZE,
	SHMEM_ADVISE,
	SHMEM_NEVER,
	SHMEM_DENY,
	SHMEM_FORCE,
};

/* Sysfs tokens matching enum shmem_enabled; NULL-terminated for read_string(). */
static const char *shmem_enabled_strings[] = {
	"always",
	"within_size",
	"advise",
	"never",
	"deny",
	"force",
	NULL
};

/* Tunables under /sys/kernel/mm/transparent_hugepage/khugepaged/. */
struct khugepaged_settings {
	bool defrag;
	unsigned int alloc_sleep_millisecs;
	unsigned int scan_sleep_millisecs;
	unsigned int max_ptes_none;
	unsigned int max_ptes_swap;
	unsigned int max_ptes_shared;
	unsigned long pages_to_scan;
};

/* Full snapshot of the THP sysfs state manipulated by these tests. */
struct settings {
	enum thp_enabled thp_enabled;
	enum thp_defrag thp_defrag;
	enum shmem_enabled shmem_enabled;
	bool use_zero_page;
	struct khugepaged_settings khugepaged;
	unsigned long read_ahead_kb;	/* block-device read-ahead (file-backed tests only) */
};

/* System state captured by save_settings() and restored on exit. */
static struct settings saved_settings;
/* Set in forked children so they do not restore settings when exiting. */
static bool skip_settings_restore;

/* Count of failed checks; becomes the process exit code. */
static int exit_status;
146 
/* Report a passing test step: print msg in green. */
static void success(const char *msg)
{
	fputs(" \e[32m", stdout);
	fputs(msg, stdout);
	fputs("\e[0m\n", stdout);
}
151 
152 static void fail(const char *msg)
153 {
154 	printf(" \e[31m%s\e[0m\n", msg);
155 	exit_status++;
156 }
157 
/* Report a skipped test step: print msg in yellow (does not affect exit code). */
static void skip(const char *msg)
{
	fputs(" \e[33m", stdout);
	fputs(msg, stdout);
	fputs("\e[0m\n", stdout);
}
162 
/*
 * Read up to buflen - 1 bytes from path into buf and NUL-terminate it.
 * Returns the number of bytes read, or 0 on open/empty/read failure.
 * Note the return is never negative: callers must test for zero.
 */
static int read_file(const char *path, char *buf, size_t buflen)
{
	ssize_t nread;
	int fd = open(path, O_RDONLY);

	if (fd == -1)
		return 0;

	nread = read(fd, buf, buflen - 1);
	close(fd);
	if (nread < 1)
		return 0;

	buf[nread] = '\0';
	return (unsigned int) nread;
}
183 
/*
 * Write buflen - 1 bytes of buf (i.e. a NUL-terminated string without
 * its terminator) to an existing file at path.  Returns the number of
 * bytes written.  Any failure is fatal: the process exits, so a zero
 * return can never actually be observed by callers.
 *
 * Fix: removed the unreachable "return 0" statements that followed the
 * exit() calls (dead code).
 */
static int write_file(const char *path, const char *buf, size_t buflen)
{
	int fd;
	ssize_t numwritten;

	fd = open(path, O_WRONLY);
	if (fd == -1) {
		printf("open(%s)\n", path);
		exit(EXIT_FAILURE);
	}

	numwritten = write(fd, buf, buflen - 1);
	close(fd);
	if (numwritten < 1) {
		printf("write(%s)\n", buf);
		exit(EXIT_FAILURE);
	}

	return (unsigned int) numwritten;
}
206 
207 static int read_string(const char *name, const char *strings[])
208 {
209 	char path[PATH_MAX];
210 	char buf[256];
211 	char *c;
212 	int ret;
213 
214 	ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
215 	if (ret >= PATH_MAX) {
216 		printf("%s: Pathname is too long\n", __func__);
217 		exit(EXIT_FAILURE);
218 	}
219 
220 	if (!read_file(path, buf, sizeof(buf))) {
221 		perror(path);
222 		exit(EXIT_FAILURE);
223 	}
224 
225 	c = strchr(buf, '[');
226 	if (!c) {
227 		printf("%s: Parse failure\n", __func__);
228 		exit(EXIT_FAILURE);
229 	}
230 
231 	c++;
232 	memmove(buf, c, sizeof(buf) - (c - buf));
233 
234 	c = strchr(buf, ']');
235 	if (!c) {
236 		printf("%s: Parse failure\n", __func__);
237 		exit(EXIT_FAILURE);
238 	}
239 	*c = '\0';
240 
241 	ret = 0;
242 	while (strings[ret]) {
243 		if (!strcmp(strings[ret], buf))
244 			return ret;
245 		ret++;
246 	}
247 
248 	printf("Failed to parse %s\n", name);
249 	exit(EXIT_FAILURE);
250 }
251 
252 static void write_string(const char *name, const char *val)
253 {
254 	char path[PATH_MAX];
255 	int ret;
256 
257 	ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
258 	if (ret >= PATH_MAX) {
259 		printf("%s: Pathname is too long\n", __func__);
260 		exit(EXIT_FAILURE);
261 	}
262 
263 	if (!write_file(path, val, strlen(val) + 1)) {
264 		perror(path);
265 		exit(EXIT_FAILURE);
266 	}
267 }
268 
/*
 * Read an unsigned decimal value from the file at path; exits on I/O
 * failure.
 *
 * Fix: read_file() returns 0 on failure and never a negative value, so
 * the old "< 0" check could never fire and I/O errors were silently
 * parsed as 0.  Also dropped the meaningless top-level const qualifier
 * from the return type.
 */
static unsigned long _read_num(const char *path)
{
	char buf[21];

	if (!read_file(path, buf, sizeof(buf))) {
		perror("read_file(read_num)");
		exit(EXIT_FAILURE);
	}

	return strtoul(buf, NULL, 10);
}
280 
281 static const unsigned long read_num(const char *name)
282 {
283 	char path[PATH_MAX];
284 	int ret;
285 
286 	ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
287 	if (ret >= PATH_MAX) {
288 		printf("%s: Pathname is too long\n", __func__);
289 		exit(EXIT_FAILURE);
290 	}
291 	return _read_num(path);
292 }
293 
/*
 * Write num as a decimal string to the file at path; exits on failure.
 *
 * Fix: num is unsigned long, so the conversion must be "%lu" — the old
 * "%ld" has undefined behavior and misformats values above LONG_MAX.
 */
static void _write_num(const char *path, unsigned long num)
{
	char buf[21];	/* fits 2^64 - 1 plus the NUL terminator */

	sprintf(buf, "%lu", num);
	if (!write_file(path, buf, strlen(buf) + 1)) {
		perror(path);
		exit(EXIT_FAILURE);
	}
}
304 
305 static void write_num(const char *name, unsigned long num)
306 {
307 	char path[PATH_MAX];
308 	int ret;
309 
310 	ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
311 	if (ret >= PATH_MAX) {
312 		printf("%s: Pathname is too long\n", __func__);
313 		exit(EXIT_FAILURE);
314 	}
315 	_write_num(path, num);
316 }
317 
/*
 * Push every field of *settings out to the THP sysfs files (and, for
 * file-backed tests on a real block device, the device's read_ahead_kb
 * control).  Any write failure is fatal via write_string()/write_num().
 */
static void write_settings(struct settings *settings)
{
	struct khugepaged_settings *khugepaged = &settings->khugepaged;

	write_string("enabled", thp_enabled_strings[settings->thp_enabled]);
	write_string("defrag", thp_defrag_strings[settings->thp_defrag]);
	write_string("shmem_enabled",
			shmem_enabled_strings[settings->shmem_enabled]);
	write_num("use_zero_page", settings->use_zero_page);

	write_num("khugepaged/defrag", khugepaged->defrag);
	write_num("khugepaged/alloc_sleep_millisecs",
			khugepaged->alloc_sleep_millisecs);
	write_num("khugepaged/scan_sleep_millisecs",
			khugepaged->scan_sleep_millisecs);
	write_num("khugepaged/max_ptes_none", khugepaged->max_ptes_none);
	write_num("khugepaged/max_ptes_swap", khugepaged->max_ptes_swap);
	write_num("khugepaged/max_ptes_shared", khugepaged->max_ptes_shared);
	write_num("khugepaged/pages_to_scan", khugepaged->pages_to_scan);

	/* read_ahead_kb lives under the block device, not THP_SYSFS */
	if (file_ops && finfo.type == VMA_FILE)
		_write_num(finfo.dev_queue_read_ahead_path,
			   settings->read_ahead_kb);
}
342 
/* Maximum nesting of temporary settings overrides (push_settings()). */
#define MAX_SETTINGS_DEPTH 4
static struct settings settings_stack[MAX_SETTINGS_DEPTH];
static int settings_index;

/* Return the settings currently in effect (top of stack); exits if empty. */
static struct settings *current_settings(void)
{
	if (!settings_index) {
		printf("Fail: No settings set");
		exit(EXIT_FAILURE);
	}
	return settings_stack + settings_index - 1;
}
355 
356 static void push_settings(struct settings *settings)
357 {
358 	if (settings_index >= MAX_SETTINGS_DEPTH) {
359 		printf("Fail: Settings stack exceeded");
360 		exit(EXIT_FAILURE);
361 	}
362 	settings_stack[settings_index++] = *settings;
363 	write_settings(current_settings());
364 }
365 
366 static void pop_settings(void)
367 {
368 	if (settings_index <= 0) {
369 		printf("Fail: Settings stack empty");
370 		exit(EXIT_FAILURE);
371 	}
372 	--settings_index;
373 	write_settings(current_settings());
374 }
375 
/*
 * Restore the sysfs state captured by save_settings() and terminate.
 * Installed as a signal handler (sig != 0) and also usable at normal
 * exit with sig == 0.  Forked children set skip_settings_restore so
 * only the parent touches the system state.
 *
 * NOTE(review): printf()/exit() are not async-signal-safe; tolerable
 * for a selftest but worth confirming this matches upstream intent.
 */
static void restore_settings(int sig)
{
	if (skip_settings_restore)
		goto out;

	printf("Restore THP and khugepaged settings...");
	write_settings(&saved_settings);
	success("OK");
	if (sig)
		exit(EXIT_FAILURE);
out:
	exit(exit_status);
}
389 
/*
 * Snapshot the current THP/khugepaged sysfs state into saved_settings
 * and install restore_settings() as the handler for common termination
 * signals, so the system is put back how we found it.
 */
static void save_settings(void)
{
	printf("Save THP and khugepaged settings...");
	saved_settings = (struct settings) {
		.thp_enabled = read_string("enabled", thp_enabled_strings),
		.thp_defrag = read_string("defrag", thp_defrag_strings),
		.shmem_enabled =
			read_string("shmem_enabled", shmem_enabled_strings),
		.use_zero_page = read_num("use_zero_page"),
	};
	saved_settings.khugepaged = (struct khugepaged_settings) {
		.defrag = read_num("khugepaged/defrag"),
		.alloc_sleep_millisecs =
			read_num("khugepaged/alloc_sleep_millisecs"),
		.scan_sleep_millisecs =
			read_num("khugepaged/scan_sleep_millisecs"),
		.max_ptes_none = read_num("khugepaged/max_ptes_none"),
		.max_ptes_swap = read_num("khugepaged/max_ptes_swap"),
		.max_ptes_shared = read_num("khugepaged/max_ptes_shared"),
		.pages_to_scan = read_num("khugepaged/pages_to_scan"),
	};
	/* read_ahead_kb only exists for real block devices, not tmpfs */
	if (file_ops && finfo.type == VMA_FILE)
		saved_settings.read_ahead_kb =
				_read_num(finfo.dev_queue_read_ahead_path);

	success("OK");

	signal(SIGTERM, restore_settings);
	signal(SIGINT, restore_settings);
	signal(SIGHUP, restore_settings);
	signal(SIGQUIT, restore_settings);
}
422 
423 static void get_finfo(const char *dir)
424 {
425 	struct stat path_stat;
426 	struct statfs fs;
427 	char buf[1 << 10];
428 	char path[PATH_MAX];
429 	char *str, *end;
430 
431 	finfo.dir = dir;
432 	stat(finfo.dir, &path_stat);
433 	if (!S_ISDIR(path_stat.st_mode)) {
434 		printf("%s: Not a directory (%s)\n", __func__, finfo.dir);
435 		exit(EXIT_FAILURE);
436 	}
437 	if (snprintf(finfo.path, sizeof(finfo.path), "%s/" TEST_FILE,
438 		     finfo.dir) >= sizeof(finfo.path)) {
439 		printf("%s: Pathname is too long\n", __func__);
440 		exit(EXIT_FAILURE);
441 	}
442 	if (statfs(finfo.dir, &fs)) {
443 		perror("statfs()");
444 		exit(EXIT_FAILURE);
445 	}
446 	finfo.type = fs.f_type == TMPFS_MAGIC ? VMA_SHMEM : VMA_FILE;
447 	if (finfo.type == VMA_SHMEM)
448 		return;
449 
450 	/* Find owning device's queue/read_ahead_kb control */
451 	if (snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/uevent",
452 		     major(path_stat.st_dev), minor(path_stat.st_dev))
453 	    >= sizeof(path)) {
454 		printf("%s: Pathname is too long\n", __func__);
455 		exit(EXIT_FAILURE);
456 	}
457 	if (read_file(path, buf, sizeof(buf)) < 0) {
458 		perror("read_file(read_num)");
459 		exit(EXIT_FAILURE);
460 	}
461 	if (strstr(buf, "DEVTYPE=disk")) {
462 		/* Found it */
463 		if (snprintf(finfo.dev_queue_read_ahead_path,
464 			     sizeof(finfo.dev_queue_read_ahead_path),
465 			     "/sys/dev/block/%d:%d/queue/read_ahead_kb",
466 			     major(path_stat.st_dev), minor(path_stat.st_dev))
467 		    >= sizeof(finfo.dev_queue_read_ahead_path)) {
468 			printf("%s: Pathname is too long\n", __func__);
469 			exit(EXIT_FAILURE);
470 		}
471 		return;
472 	}
473 	if (!strstr(buf, "DEVTYPE=partition")) {
474 		printf("%s: Unknown device type: %s\n", __func__, path);
475 		exit(EXIT_FAILURE);
476 	}
477 	/*
478 	 * Partition of block device - need to find actual device.
479 	 * Using naming convention that devnameN is partition of
480 	 * device devname.
481 	 */
482 	str = strstr(buf, "DEVNAME=");
483 	if (!str) {
484 		printf("%s: Could not read: %s", __func__, path);
485 		exit(EXIT_FAILURE);
486 	}
487 	str += 8;
488 	end = str;
489 	while (*end) {
490 		if (isdigit(*end)) {
491 			*end = '\0';
492 			if (snprintf(finfo.dev_queue_read_ahead_path,
493 				     sizeof(finfo.dev_queue_read_ahead_path),
494 				     "/sys/block/%s/queue/read_ahead_kb",
495 				     str) >= sizeof(finfo.dev_queue_read_ahead_path)) {
496 				printf("%s: Pathname is too long\n", __func__);
497 				exit(EXIT_FAILURE);
498 			}
499 			return;
500 		}
501 		++end;
502 	}
503 	printf("%s: Could not read: %s\n", __func__, path);
504 	exit(EXIT_FAILURE);
505 }
506 
/*
 * Return true if the smaps entry whose range starts at addr reports
 * exactly size bytes swapped out.  Locates the "<addr>-" range line in
 * /proc/self/smaps, then the following "Swap:" field of that entry
 * (check_for_pattern() comes from vm_util and is assumed to scan
 * forward line by line — behavior not visible here).
 */
static bool check_swap(void *addr, unsigned long size)
{
	bool swap = false;
	int ret;
	FILE *fp;
	char buffer[MAX_LINE_LENGTH];
	char addr_pattern[MAX_LINE_LENGTH];

	ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-",
		       (unsigned long) addr);
	if (ret >= MAX_LINE_LENGTH) {
		printf("%s: Pattern is too long\n", __func__);
		exit(EXIT_FAILURE);
	}


	fp = fopen(PID_SMAPS, "r");
	if (!fp) {
		printf("%s: Failed to open file %s\n", __func__, PID_SMAPS);
		exit(EXIT_FAILURE);
	}
	if (!check_for_pattern(fp, addr_pattern, buffer, sizeof(buffer)))
		goto err_out;

	/* smaps prints the kB value right-justified: "Swap:%19ld kB" */
	ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "Swap:%19ld kB",
		       size >> 10);
	if (ret >= MAX_LINE_LENGTH) {
		printf("%s: Pattern is too long\n", __func__);
		exit(EXIT_FAILURE);
	}
	/*
	 * Fetch the Swap: in the same block and check whether it got
	 * the expected number of hugepages next.
	 */
	if (!check_for_pattern(fp, "Swap:", buffer, sizeof(buffer)))
		goto err_out;

	if (strncmp(buffer, addr_pattern, strlen(addr_pattern)))
		goto err_out;

	swap = true;
err_out:
	fclose(fp);
	return swap;
}
552 
553 static void *alloc_mapping(int nr)
554 {
555 	void *p;
556 
557 	p = mmap(BASE_ADDR, nr * hpage_pmd_size, PROT_READ | PROT_WRITE,
558 		 MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
559 	if (p != BASE_ADDR) {
560 		printf("Failed to allocate VMA at %p\n", BASE_ADDR);
561 		exit(EXIT_FAILURE);
562 	}
563 
564 	return p;
565 }
566 
567 static void fill_memory(int *p, unsigned long start, unsigned long end)
568 {
569 	int i;
570 
571 	for (i = start / page_size; i < end / page_size; i++)
572 		p[i * page_size / sizeof(*p)] = i + 0xdead0000;
573 }
574 
575 /*
576  * MADV_COLLAPSE is a best-effort request and may fail if an internal
577  * resource is temporarily unavailable, in which case it will set errno to
578  * EAGAIN.  In such a case, immediately reattempt the operation one more
579  * time.
580  */
581 static int madvise_collapse_retry(void *p, unsigned long size)
582 {
583 	bool retry = true;
584 	int ret;
585 
586 retry:
587 	ret = madvise(p, size, MADV_COLLAPSE);
588 	if (ret && errno == EAGAIN && retry) {
589 		retry = false;
590 		goto retry;
591 	}
592 	return ret;
593 }
594 
/*
 * Returns pmd-mapped hugepage in VMA marked VM_HUGEPAGE, filled with
 * validate_memory()'able contents.
 */
static void *alloc_hpage(struct mem_ops *ops)
{
	void *p = ops->setup_area(1);

	ops->fault(p, 0, hpage_pmd_size);

	/*
	 * VMA should be neither VM_HUGEPAGE nor VM_NOHUGEPAGE.
	 * The latter is ineligible for collapse by MADV_COLLAPSE
	 * while the former might cause MADV_COLLAPSE to race with
	 * khugepaged on low-load system (like a test machine), which
	 * would cause MADV_COLLAPSE to fail with EAGAIN.
	 */
	printf("Allocate huge page...");
	if (madvise_collapse_retry(p, hpage_pmd_size)) {
		perror("madvise(MADV_COLLAPSE)");
		exit(EXIT_FAILURE);
	}
	/*
	 * NOTE(review): check_huge() failure is not errno-based, so this
	 * perror() text is misleading; kept as-is since it is runtime output.
	 */
	if (!ops->check_huge(p, 1)) {
		perror("madvise(MADV_COLLAPSE)");
		exit(EXIT_FAILURE);
	}
	/* Mark VM_HUGEPAGE only after the collapse has already happened */
	if (madvise(p, hpage_pmd_size, MADV_HUGEPAGE)) {
		perror("madvise(MADV_HUGEPAGE)");
		exit(EXIT_FAILURE);
	}
	success("OK");
	return p;
}
628 
629 static void validate_memory(int *p, unsigned long start, unsigned long end)
630 {
631 	int i;
632 
633 	for (i = start / page_size; i < end / page_size; i++) {
634 		if (p[i * page_size / sizeof(*p)] != i + 0xdead0000) {
635 			printf("Page %d is corrupted: %#x\n",
636 					i, p[i * page_size / sizeof(*p)]);
637 			exit(EXIT_FAILURE);
638 		}
639 	}
640 }
641 
/* Anonymous-memory backend: a plain private mapping at BASE_ADDR. */
static void *anon_setup_area(int nr_hpages)
{
	return alloc_mapping(nr_hpages);
}

static void anon_cleanup_area(void *p, unsigned long size)
{
	munmap(p, size);
}

/* Write-fault pages in [start, end) with the verifiable pattern. */
static void anon_fault(void *p, unsigned long start, unsigned long end)
{
	fill_memory(p, start, end);
}

static bool anon_check_huge(void *addr, int nr_hpages)
{
	return check_huge_anon(addr, nr_hpages, hpage_pmd_size);
}
661 
662 static void *file_setup_area(int nr_hpages)
663 {
664 	int fd;
665 	void *p;
666 	unsigned long size;
667 
668 	unlink(finfo.path);  /* Cleanup from previous failed tests */
669 	printf("Creating %s for collapse%s...", finfo.path,
670 	       finfo.type == VMA_SHMEM ? " (tmpfs)" : "");
671 	fd = open(finfo.path, O_DSYNC | O_CREAT | O_RDWR | O_TRUNC | O_EXCL,
672 		  777);
673 	if (fd < 0) {
674 		perror("open()");
675 		exit(EXIT_FAILURE);
676 	}
677 
678 	size = nr_hpages * hpage_pmd_size;
679 	p = alloc_mapping(nr_hpages);
680 	fill_memory(p, 0, size);
681 	write(fd, p, size);
682 	close(fd);
683 	munmap(p, size);
684 	success("OK");
685 
686 	printf("Opening %s read only for collapse...", finfo.path);
687 	finfo.fd = open(finfo.path, O_RDONLY, 777);
688 	if (finfo.fd < 0) {
689 		perror("open()");
690 		exit(EXIT_FAILURE);
691 	}
692 	p = mmap(BASE_ADDR, size, PROT_READ | PROT_EXEC,
693 		 MAP_PRIVATE, finfo.fd, 0);
694 	if (p == MAP_FAILED || p != BASE_ADDR) {
695 		perror("mmap()");
696 		exit(EXIT_FAILURE);
697 	}
698 
699 	/* Drop page cache */
700 	write_file("/proc/sys/vm/drop_caches", "3", 2);
701 	success("OK");
702 	return p;
703 }
704 
/* Unmap the file-backed area, close it and delete the test file. */
static void file_cleanup_area(void *p, unsigned long size)
{
	munmap(p, size);
	close(finfo.fd);
	unlink(finfo.path);
}
711 
712 static void file_fault(void *p, unsigned long start, unsigned long end)
713 {
714 	if (madvise(((char *)p) + start, end - start, MADV_POPULATE_READ)) {
715 		perror("madvise(MADV_POPULATE_READ");
716 		exit(EXIT_FAILURE);
717 	}
718 }
719 
720 static bool file_check_huge(void *addr, int nr_hpages)
721 {
722 	switch (finfo.type) {
723 	case VMA_FILE:
724 		return check_huge_file(addr, nr_hpages, hpage_pmd_size);
725 	case VMA_SHMEM:
726 		return check_huge_shmem(addr, nr_hpages, hpage_pmd_size);
727 	default:
728 		exit(EXIT_FAILURE);
729 		return false;
730 	}
731 }
732 
733 static void *shmem_setup_area(int nr_hpages)
734 {
735 	void *p;
736 	unsigned long size = nr_hpages * hpage_pmd_size;
737 
738 	finfo.fd = memfd_create("khugepaged-selftest-collapse-shmem", 0);
739 	if (finfo.fd < 0)  {
740 		perror("memfd_create()");
741 		exit(EXIT_FAILURE);
742 	}
743 	if (ftruncate(finfo.fd, size)) {
744 		perror("ftruncate()");
745 		exit(EXIT_FAILURE);
746 	}
747 	p = mmap(BASE_ADDR, size, PROT_READ | PROT_WRITE, MAP_SHARED, finfo.fd,
748 		 0);
749 	if (p != BASE_ADDR) {
750 		perror("mmap()");
751 		exit(EXIT_FAILURE);
752 	}
753 	return p;
754 }
755 
/* Unmap the shmem area and close the backing memfd. */
static void shmem_cleanup_area(void *p, unsigned long size)
{
	munmap(p, size);
	close(finfo.fd);
}

static bool shmem_check_huge(void *addr, int nr_hpages)
{
	return check_huge_shmem(addr, nr_hpages, hpage_pmd_size);
}
766 
/* Backend vtables wired to the functions above. */
static struct mem_ops __anon_ops = {
	.setup_area = &anon_setup_area,
	.cleanup_area = &anon_cleanup_area,
	.fault = &anon_fault,
	.check_huge = &anon_check_huge,
	.name = "anon",
};

static struct mem_ops __file_ops = {
	.setup_area = &file_setup_area,
	.cleanup_area = &file_cleanup_area,
	.fault = &file_fault,
	.check_huge = &file_check_huge,
	.name = "file",
};

static struct mem_ops __shmem_ops = {
	.setup_area = &shmem_setup_area,
	.cleanup_area = &shmem_cleanup_area,
	.fault = &anon_fault,	/* shmem is writable, so write-fault like anon */
	.check_huge = &shmem_check_huge,
	.name = "shmem",
};
790 
/*
 * Run MADV_COLLAPSE over nr_hpages at p under THP_NEVER/SHMEM_NEVER and
 * check both the return value and the resulting mapping against expect
 * (true = collapse should succeed).  Reports via success()/fail().
 */
static void __madvise_collapse(const char *msg, char *p, int nr_hpages,
			       struct mem_ops *ops, bool expect)
{
	int ret;
	struct settings settings = *current_settings();

	printf("%s...", msg);

	/*
	 * Prevent khugepaged interference and tests that MADV_COLLAPSE
	 * ignores /sys/kernel/mm/transparent_hugepage/enabled
	 */
	settings.thp_enabled = THP_NEVER;
	settings.shmem_enabled = SHMEM_NEVER;
	push_settings(&settings);

	/* Clear VM_NOHUGEPAGE */
	madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE);
	ret = madvise_collapse_retry(p, nr_hpages * hpage_pmd_size);
	/* nonzero ret means failure, so (bool)ret == expect is the bad case */
	if (((bool)ret) == expect)
		fail("Fail: Bad return value");
	else if (!ops->check_huge(p, expect ? nr_hpages : 0))
		fail("Fail: check_huge()");
	else
		success("OK");

	pop_settings();
}
819 
/* MADV_COLLAPSE context entry point: sanity-check, then collapse. */
static void madvise_collapse(const char *msg, char *p, int nr_hpages,
			     struct mem_ops *ops, bool expect)
{
	/* Sanity check: the region must not already be huge-mapped */
	if (!ops->check_huge(p, 0)) {
		printf("Unexpected huge page\n");
		exit(EXIT_FAILURE);
	}
	__madvise_collapse(msg, p, nr_hpages, ops, expect);
}
830 
/* Polling interval while waiting for khugepaged, in microseconds. */
#define TICK 500000
/*
 * Mark the region VM_HUGEPAGE and wait up to ~3s (or two completed
 * khugepaged full scans, whichever comes first) for it to be collapsed.
 * Clears VM_HUGEPAGE again before returning.  Returns true on timeout,
 * i.e. when no collapse was observed.
 */
static bool wait_for_scan(const char *msg, char *p, int nr_hpages,
			  struct mem_ops *ops)
{
	int full_scans;
	int timeout = 6; /* 3 seconds */

	/* Sanity check */
	if (!ops->check_huge(p, 0)) {
		printf("Unexpected huge page\n");
		exit(EXIT_FAILURE);
	}

	madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE);

	/* Wait until the second full_scan completed */
	full_scans = read_num("khugepaged/full_scans") + 2;

	printf("%s...", msg);
	while (timeout--) {
		if (ops->check_huge(p, nr_hpages))
			break;
		if (read_num("khugepaged/full_scans") >= full_scans)
			break;
		printf(".");
		usleep(TICK);
	}

	madvise(p, nr_hpages * hpage_pmd_size, MADV_NOHUGEPAGE);

	/* timeout == -1 only when the loop ran out without either break */
	return timeout == -1;
}
863 
/*
 * Khugepaged context entry point: wait for khugepaged to (maybe)
 * collapse the region and compare the outcome against expect.
 */
static void khugepaged_collapse(const char *msg, char *p, int nr_hpages,
				struct mem_ops *ops, bool expect)
{
	if (wait_for_scan(msg, p, nr_hpages, ops)) {
		/* Timeout: only a failure if a collapse was expected */
		if (expect)
			fail("Timeout");
		else
			success("OK");
		return;
	}

	/*
	 * For file and shmem memory, khugepaged only retracts pte entries after
	 * putting the new hugepage in the page cache. The hugepage must be
	 * subsequently refaulted to install the pmd mapping for the mm.
	 */
	if (ops != &__anon_ops)
		ops->fault(p, 0, nr_hpages * hpage_pmd_size);

	if (ops->check_huge(p, expect ? nr_hpages : 0))
		success("OK");
	else
		fail("Fail");
}
888 
/* Collapse-context vtables; khugepaged honors max_ptes_* limits, MADV_COLLAPSE does not. */
static struct collapse_context __khugepaged_context = {
	.collapse = &khugepaged_collapse,
	.enforce_pte_scan_limits = true,
	.name = "khugepaged",
};

static struct collapse_context __madvise_context = {
	.collapse = &madvise_collapse,
	.enforce_pte_scan_limits = false,
	.name = "madvise",
};
900 
901 static bool is_tmpfs(struct mem_ops *ops)
902 {
903 	return ops == &__file_ops && finfo.type == VMA_SHMEM;
904 }
905 
/*
 * Test that with THP "always" a write fault allocates a hugepage
 * directly, and that MADV_DONTNEED on one base page splits the PMD.
 */
static void alloc_at_fault(void)
{
	struct settings settings = *current_settings();
	char *p;

	settings.thp_enabled = THP_ALWAYS;
	push_settings(&settings);

	p = alloc_mapping(1);
	*p = 1;	/* first touch should fault in a whole hugepage */
	printf("Allocate huge page on fault...");
	if (check_huge_anon(p, 1, hpage_pmd_size))
		success("OK");
	else
		fail("Fail");

	pop_settings();

	madvise(p, page_size, MADV_DONTNEED);
	printf("Split huge PMD on MADV_DONTNEED...");
	if (check_huge_anon(p, 0, hpage_pmd_size))
		success("OK");
	else
		fail("Fail");
	munmap(p, hpage_pmd_size);
}
932 
933 static void collapse_full(struct collapse_context *c, struct mem_ops *ops)
934 {
935 	void *p;
936 	int nr_hpages = 4;
937 	unsigned long size = nr_hpages * hpage_pmd_size;
938 
939 	p = ops->setup_area(nr_hpages);
940 	ops->fault(p, 0, size);
941 	c->collapse("Collapse multiple fully populated PTE table", p, nr_hpages,
942 		    ops, true);
943 	validate_memory(p, 0, size);
944 	ops->cleanup_area(p, size);
945 }
946 
947 static void collapse_empty(struct collapse_context *c, struct mem_ops *ops)
948 {
949 	void *p;
950 
951 	p = ops->setup_area(1);
952 	c->collapse("Do not collapse empty PTE table", p, 1, ops, false);
953 	ops->cleanup_area(p, hpage_pmd_size);
954 }
955 
956 static void collapse_single_pte_entry(struct collapse_context *c, struct mem_ops *ops)
957 {
958 	void *p;
959 
960 	p = ops->setup_area(1);
961 	ops->fault(p, 0, page_size);
962 	c->collapse("Collapse PTE table with single PTE entry present", p,
963 		    1, ops, true);
964 	ops->cleanup_area(p, hpage_pmd_size);
965 }
966 
/*
 * Test the khugepaged/max_ptes_none limit: with one PTE too many empty,
 * collapse must fail only for contexts that enforce scan limits; with
 * exactly max_ptes_none empty it must succeed.  Skipped for tmpfs via
 * the file backend since shmem pages are always in the page cache.
 */
static void collapse_max_ptes_none(struct collapse_context *c, struct mem_ops *ops)
{
	int max_ptes_none = hpage_pmd_nr / 2;
	struct settings settings = *current_settings();
	void *p;

	settings.khugepaged.max_ptes_none = max_ptes_none;
	push_settings(&settings);

	p = ops->setup_area(1);

	if (is_tmpfs(ops)) {
		/* shmem pages always in the page cache */
		printf("tmpfs...");
		skip("Skip");
		goto skip;
	}

	/* One more empty PTE than allowed -> collapse only without limits */
	ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size);
	c->collapse("Maybe collapse with max_ptes_none exceeded", p, 1,
		    ops, !c->enforce_pte_scan_limits);
	validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size);

	if (c->enforce_pte_scan_limits) {
		/* Exactly max_ptes_none empty PTEs -> collapse must succeed */
		ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size);
		c->collapse("Collapse with max_ptes_none PTEs empty", p, 1, ops,
			    true);
		validate_memory(p, 0,
				(hpage_pmd_nr - max_ptes_none) * page_size);
	}
skip:
	ops->cleanup_area(p, hpage_pmd_size);
	pop_settings();
}
1001 
/*
 * Swap out a single page of an otherwise populated range and verify it
 * is swapped back in during collapse.
 */
static void collapse_swapin_single_pte(struct collapse_context *c, struct mem_ops *ops)
{
	void *p;

	p = ops->setup_area(1);
	ops->fault(p, 0, hpage_pmd_size);

	printf("Swapout one page...");
	if (madvise(p, page_size, MADV_PAGEOUT)) {
		perror("madvise(MADV_PAGEOUT)");
		exit(EXIT_FAILURE);
	}
	/* Confirm via smaps that the page actually went to swap */
	if (check_swap(p, page_size)) {
		success("OK");
	} else {
		fail("Fail");
		goto out;
	}

	c->collapse("Collapse with swapping in single PTE entry", p, 1, ops,
		    true);
	validate_memory(p, 0, hpage_pmd_size);
out:
	ops->cleanup_area(p, hpage_pmd_size);
}
1027 
/*
 * Test the khugepaged/max_ptes_swap limit: with one swapped-out PTE too
 * many, collapse must fail only for contexts that enforce scan limits;
 * with exactly max_ptes_swap swapped out it must succeed.
 */
static void collapse_max_ptes_swap(struct collapse_context *c, struct mem_ops *ops)
{
	int max_ptes_swap = read_num("khugepaged/max_ptes_swap");
	void *p;

	p = ops->setup_area(1);
	ops->fault(p, 0, hpage_pmd_size);

	printf("Swapout %d of %d pages...", max_ptes_swap + 1, hpage_pmd_nr);
	if (madvise(p, (max_ptes_swap + 1) * page_size, MADV_PAGEOUT)) {
		perror("madvise(MADV_PAGEOUT)");
		exit(EXIT_FAILURE);
	}
	if (check_swap(p, (max_ptes_swap + 1) * page_size)) {
		success("OK");
	} else {
		fail("Fail");
		goto out;
	}

	c->collapse("Maybe collapse with max_ptes_swap exceeded", p, 1, ops,
		    !c->enforce_pte_scan_limits);
	validate_memory(p, 0, hpage_pmd_size);

	if (c->enforce_pte_scan_limits) {
		/* Re-fault everything, then swap out exactly the limit */
		ops->fault(p, 0, hpage_pmd_size);
		printf("Swapout %d of %d pages...", max_ptes_swap,
		       hpage_pmd_nr);
		if (madvise(p, max_ptes_swap * page_size, MADV_PAGEOUT)) {
			perror("madvise(MADV_PAGEOUT)");
			exit(EXIT_FAILURE);
		}
		if (check_swap(p, max_ptes_swap * page_size)) {
			success("OK");
		} else {
			fail("Fail");
			goto out;
		}

		c->collapse("Collapse with max_ptes_swap pages swapped out", p,
			    1, ops, true);
		validate_memory(p, 0, hpage_pmd_size);
	}
out:
	ops->cleanup_area(p, hpage_pmd_size);
}
1074 
/*
 * Split a hugepage so that a single PTE still maps (part of) the old
 * compound page, then verify that PTE table can be collapsed again.
 * Skipped for tmpfs via the file backend: MADV_DONTNEED does not evict
 * tmpfs pages, so the split cannot be arranged this way.
 */
static void collapse_single_pte_entry_compound(struct collapse_context *c, struct mem_ops *ops)
{
	void *p;

	p = alloc_hpage(ops);

	if (is_tmpfs(ops)) {
		/* MADV_DONTNEED won't evict tmpfs pages */
		printf("tmpfs...");
		skip("Skip");
		goto skip;
	}

	/* Clear VM_HUGEPAGE so khugepaged leaves the split mapping alone */
	madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
	printf("Split huge page leaving single PTE mapping compound page...");
	madvise(p + page_size, hpage_pmd_size - page_size, MADV_DONTNEED);
	if (ops->check_huge(p, 0))
		success("OK");
	else
		fail("Fail");

	c->collapse("Collapse PTE table with single PTE mapping compound page",
		    p, 1, ops, true);
	validate_memory(p, 0, page_size);
skip:
	ops->cleanup_area(p, hpage_pmd_size);
}
1102 
/*
 * Split a hugepage's PMD while keeping all its base pages mapped, so
 * the PTE table is full of PTE-mapped compound pages, then collapse it.
 */
static void collapse_full_of_compound(struct collapse_context *c, struct mem_ops *ops)
{
	void *p;

	p = alloc_hpage(ops);
	printf("Split huge page leaving single PTE page table full of compound pages...");
	/*
	 * NOTE(review): the first madvise() on just one page splits the VMA
	 * (and with it the PMD) before the second clears VM_HUGEPAGE on the
	 * whole range — presumably intentional; confirm against upstream.
	 */
	madvise(p, page_size, MADV_NOHUGEPAGE);
	madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
	if (ops->check_huge(p, 0))
		success("OK");
	else
		fail("Fail");

	c->collapse("Collapse PTE table full of compound pages", p, 1, ops,
		    true);
	validate_memory(p, 0, hpage_pmd_size);
	ops->cleanup_area(p, hpage_pmd_size);
}
1121 
/*
 * Build a PTE table where every slot maps one base page of a *different*
 * compound page, then collapse it.  Each loop iteration allocates a
 * fresh hugepage at BASE_ADDR, splits it, and uses a pair of mremap()
 * calls to carve off exactly one more page and splice it onto the
 * growing area just below BASE_ADDR.
 */
static void collapse_compound_extreme(struct collapse_context *c, struct mem_ops *ops)
{
	void *p;
	int i;

	p = ops->setup_area(1);
	for (i = 0; i < hpage_pmd_nr; i++) {
		printf("\rConstruct PTE page table full of different PTE-mapped compound pages %3d/%d...",
				i + 1, hpage_pmd_nr);

		/* Allocate and split a fresh hugepage at BASE_ADDR */
		madvise(BASE_ADDR, hpage_pmd_size, MADV_HUGEPAGE);
		ops->fault(BASE_ADDR, 0, hpage_pmd_size);
		if (!ops->check_huge(BASE_ADDR, 1)) {
			printf("Failed to allocate huge page\n");
			exit(EXIT_FAILURE);
		}
		madvise(BASE_ADDR, hpage_pmd_size, MADV_NOHUGEPAGE);

		/* Move the accumulated pages + new hugepage out of the way */
		p = mremap(BASE_ADDR - i * page_size,
				i * page_size + hpage_pmd_size,
				(i + 1) * page_size,
				MREMAP_MAYMOVE | MREMAP_FIXED,
				BASE_ADDR + 2 * hpage_pmd_size);
		if (p == MAP_FAILED) {
			perror("mremap+unmap");
			exit(EXIT_FAILURE);
		}

		/* Move them back, now one page longer, below BASE_ADDR */
		p = mremap(BASE_ADDR + 2 * hpage_pmd_size,
				(i + 1) * page_size,
				(i + 1) * page_size + hpage_pmd_size,
				MREMAP_MAYMOVE | MREMAP_FIXED,
				BASE_ADDR - (i + 1) * page_size);
		if (p == MAP_FAILED) {
			perror("mremap+alloc");
			exit(EXIT_FAILURE);
		}
	}

	ops->cleanup_area(BASE_ADDR, hpage_pmd_size);
	ops->fault(p, 0, hpage_pmd_size);
	/* The area must be PTE-mapped (not one hugepage) before collapse */
	if (!ops->check_huge(p, 1))
		success("OK");
	else
		fail("Fail");

	c->collapse("Collapse PTE table full of different compound pages", p, 1,
		    ops, true);

	validate_memory(p, 0, hpage_pmd_size);
	ops->cleanup_area(p, hpage_pmd_size);
}
1174 
1175 static void collapse_fork(struct collapse_context *c, struct mem_ops *ops)
1176 {
1177 	int wstatus;
1178 	void *p;
1179 
1180 	p = ops->setup_area(1);
1181 
1182 	printf("Allocate small page...");
1183 	ops->fault(p, 0, page_size);
1184 	if (ops->check_huge(p, 0))
1185 		success("OK");
1186 	else
1187 		fail("Fail");
1188 
1189 	printf("Share small page over fork()...");
1190 	if (!fork()) {
1191 		/* Do not touch settings on child exit */
1192 		skip_settings_restore = true;
1193 		exit_status = 0;
1194 
1195 		if (ops->check_huge(p, 0))
1196 			success("OK");
1197 		else
1198 			fail("Fail");
1199 
1200 		ops->fault(p, page_size, 2 * page_size);
1201 		c->collapse("Collapse PTE table with single page shared with parent process",
1202 			    p, 1, ops, true);
1203 
1204 		validate_memory(p, 0, page_size);
1205 		ops->cleanup_area(p, hpage_pmd_size);
1206 		exit(exit_status);
1207 	}
1208 
1209 	wait(&wstatus);
1210 	exit_status += WEXITSTATUS(wstatus);
1211 
1212 	printf("Check if parent still has small page...");
1213 	if (ops->check_huge(p, 0))
1214 		success("OK");
1215 	else
1216 		fail("Fail");
1217 	validate_memory(p, 0, page_size);
1218 	ops->cleanup_area(p, hpage_pmd_size);
1219 }
1220 
1221 static void collapse_fork_compound(struct collapse_context *c, struct mem_ops *ops)
1222 {
1223 	int wstatus;
1224 	void *p;
1225 
1226 	p = alloc_hpage(ops);
1227 	printf("Share huge page over fork()...");
1228 	if (!fork()) {
1229 		/* Do not touch settings on child exit */
1230 		skip_settings_restore = true;
1231 		exit_status = 0;
1232 
1233 		if (ops->check_huge(p, 1))
1234 			success("OK");
1235 		else
1236 			fail("Fail");
1237 
1238 		printf("Split huge page PMD in child process...");
1239 		madvise(p, page_size, MADV_NOHUGEPAGE);
1240 		madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
1241 		if (ops->check_huge(p, 0))
1242 			success("OK");
1243 		else
1244 			fail("Fail");
1245 		ops->fault(p, 0, page_size);
1246 
1247 		write_num("khugepaged/max_ptes_shared", hpage_pmd_nr - 1);
1248 		c->collapse("Collapse PTE table full of compound pages in child",
1249 			    p, 1, ops, true);
1250 		write_num("khugepaged/max_ptes_shared",
1251 			  current_settings()->khugepaged.max_ptes_shared);
1252 
1253 		validate_memory(p, 0, hpage_pmd_size);
1254 		ops->cleanup_area(p, hpage_pmd_size);
1255 		exit(exit_status);
1256 	}
1257 
1258 	wait(&wstatus);
1259 	exit_status += WEXITSTATUS(wstatus);
1260 
1261 	printf("Check if parent still has huge page...");
1262 	if (ops->check_huge(p, 1))
1263 		success("OK");
1264 	else
1265 		fail("Fail");
1266 	validate_memory(p, 0, hpage_pmd_size);
1267 	ops->cleanup_area(p, hpage_pmd_size);
1268 }
1269 
1270 static void collapse_max_ptes_shared(struct collapse_context *c, struct mem_ops *ops)
1271 {
1272 	int max_ptes_shared = read_num("khugepaged/max_ptes_shared");
1273 	int wstatus;
1274 	void *p;
1275 
1276 	p = alloc_hpage(ops);
1277 	printf("Share huge page over fork()...");
1278 	if (!fork()) {
1279 		/* Do not touch settings on child exit */
1280 		skip_settings_restore = true;
1281 		exit_status = 0;
1282 
1283 		if (ops->check_huge(p, 1))
1284 			success("OK");
1285 		else
1286 			fail("Fail");
1287 
1288 		printf("Trigger CoW on page %d of %d...",
1289 				hpage_pmd_nr - max_ptes_shared - 1, hpage_pmd_nr);
1290 		ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size);
1291 		if (ops->check_huge(p, 0))
1292 			success("OK");
1293 		else
1294 			fail("Fail");
1295 
1296 		c->collapse("Maybe collapse with max_ptes_shared exceeded", p,
1297 			    1, ops, !c->enforce_pte_scan_limits);
1298 
1299 		if (c->enforce_pte_scan_limits) {
1300 			printf("Trigger CoW on page %d of %d...",
1301 			       hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr);
1302 			ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared) *
1303 				    page_size);
1304 			if (ops->check_huge(p, 0))
1305 				success("OK");
1306 			else
1307 				fail("Fail");
1308 
1309 			c->collapse("Collapse with max_ptes_shared PTEs shared",
1310 				    p, 1, ops, true);
1311 		}
1312 
1313 		validate_memory(p, 0, hpage_pmd_size);
1314 		ops->cleanup_area(p, hpage_pmd_size);
1315 		exit(exit_status);
1316 	}
1317 
1318 	wait(&wstatus);
1319 	exit_status += WEXITSTATUS(wstatus);
1320 
1321 	printf("Check if parent still has huge page...");
1322 	if (ops->check_huge(p, 1))
1323 		success("OK");
1324 	else
1325 		fail("Fail");
1326 	validate_memory(p, 0, hpage_pmd_size);
1327 	ops->cleanup_area(p, hpage_pmd_size);
1328 }
1329 
1330 static void madvise_collapse_existing_thps(struct collapse_context *c,
1331 					   struct mem_ops *ops)
1332 {
1333 	void *p;
1334 
1335 	p = ops->setup_area(1);
1336 	ops->fault(p, 0, hpage_pmd_size);
1337 	c->collapse("Collapse fully populated PTE table...", p, 1, ops, true);
1338 	validate_memory(p, 0, hpage_pmd_size);
1339 
1340 	/* c->collapse() will find a hugepage and complain - call directly. */
1341 	__madvise_collapse("Re-collapse PMD-mapped hugepage", p, 1, ops, true);
1342 	validate_memory(p, 0, hpage_pmd_size);
1343 	ops->cleanup_area(p, hpage_pmd_size);
1344 }
1345 
1346 /*
1347  * Test race with khugepaged where page tables have been retracted and
1348  * pmd cleared.
1349  */
1350 static void madvise_retracted_page_tables(struct collapse_context *c,
1351 					  struct mem_ops *ops)
1352 {
1353 	void *p;
1354 	int nr_hpages = 1;
1355 	unsigned long size = nr_hpages * hpage_pmd_size;
1356 
1357 	p = ops->setup_area(nr_hpages);
1358 	ops->fault(p, 0, size);
1359 
1360 	/* Let khugepaged collapse and leave pmd cleared */
1361 	if (wait_for_scan("Collapse and leave PMD cleared", p, nr_hpages,
1362 			  ops)) {
1363 		fail("Timeout");
1364 		return;
1365 	}
1366 	success("OK");
1367 	c->collapse("Install huge PMD from page cache", p, nr_hpages, ops,
1368 		    true);
1369 	validate_memory(p, 0, size);
1370 	ops->cleanup_area(p, size);
1371 }
1372 
/* Print command-line help to stderr and exit with status 1. */
static void usage(void)
{
	static const char * const help[] = {
		"\nUsage: ./khugepaged <test type> [dir]\n\n",
		"\t<test type>\t: <context>:<mem_type>\n",
		"\t<context>\t: [all|khugepaged|madvise]\n",
		"\t<mem_type>\t: [all|anon|file|shmem]\n",
		"\n\t\"file,all\" mem_type requires [dir] argument\n",
		"\n\t\"file,all\" mem_type requires kernel built with\n",
		"\tCONFIG_READ_ONLY_THP_FOR_FS=y\n",
		"\n\tif [dir] is a (sub)directory of a tmpfs mount, tmpfs must be\n",
		"\tmounted with huge=madvise option for khugepaged tests to work\n",
	};
	size_t i;

	for (i = 0; i < sizeof(help) / sizeof(help[0]); i++)
		fputs(help[i], stderr);
	exit(1);
}
1386 
1387 static void parse_test_type(int argc, const char **argv)
1388 {
1389 	char *buf;
1390 	const char *token;
1391 
1392 	if (argc == 1) {
1393 		/* Backwards compatibility */
1394 		khugepaged_context =  &__khugepaged_context;
1395 		madvise_context =  &__madvise_context;
1396 		anon_ops = &__anon_ops;
1397 		return;
1398 	}
1399 
1400 	buf = strdup(argv[1]);
1401 	token = strsep(&buf, ":");
1402 
1403 	if (!strcmp(token, "all")) {
1404 		khugepaged_context =  &__khugepaged_context;
1405 		madvise_context =  &__madvise_context;
1406 	} else if (!strcmp(token, "khugepaged")) {
1407 		khugepaged_context =  &__khugepaged_context;
1408 	} else if (!strcmp(token, "madvise")) {
1409 		madvise_context =  &__madvise_context;
1410 	} else {
1411 		usage();
1412 	}
1413 
1414 	if (!buf)
1415 		usage();
1416 
1417 	if (!strcmp(buf, "all")) {
1418 		file_ops =  &__file_ops;
1419 		anon_ops = &__anon_ops;
1420 		shmem_ops = &__shmem_ops;
1421 	} else if (!strcmp(buf, "anon")) {
1422 		anon_ops = &__anon_ops;
1423 	} else if (!strcmp(buf, "file")) {
1424 		file_ops =  &__file_ops;
1425 	} else if (!strcmp(buf, "shmem")) {
1426 		shmem_ops = &__shmem_ops;
1427 	} else {
1428 		usage();
1429 	}
1430 
1431 	if (!file_ops)
1432 		return;
1433 
1434 	if (argc != 3)
1435 		usage();
1436 }
1437 
/*
 * Entry point: parse the requested contexts/memory types, push THP
 * settings suitable for deterministic collapse testing, then run each
 * selected test against every (context, mem_ops) pair, finishing with
 * restore_settings() so the system's THP configuration is put back.
 */
int main(int argc, const char **argv)
{
	/* Baseline THP configuration applied for the duration of the run. */
	struct settings default_settings = {
		.thp_enabled = THP_MADVISE,
		.thp_defrag = THP_DEFRAG_ALWAYS,
		.shmem_enabled = SHMEM_ADVISE,
		.use_zero_page = 0,
		.khugepaged = {
			.defrag = 1,
			/* Short sleeps keep the khugepaged tests fast. */
			.alloc_sleep_millisecs = 10,
			.scan_sleep_millisecs = 10,
		},
		/*
		 * When testing file-backed memory, the collapse path
		 * looks at how many pages are found in the page cache, not
		 * what pages are mapped. Disable read ahead optimization so
		 * pages don't find their way into the page cache unless
		 * we mem_ops->fault() them in.
		 */
		.read_ahead_kb = 0,
	};

	parse_test_type(argc, argv);

	/* argv[2] is the [dir] argument required for file-backed tests. */
	if (file_ops)
		get_finfo(argv[2]);

	/* Unbuffered stdout so progress output appears immediately. */
	setbuf(stdout, NULL);

	page_size = getpagesize();
	hpage_pmd_size = read_pmd_pagesize();
	if (!hpage_pmd_size) {
		printf("Reading PMD pagesize failed");
		exit(EXIT_FAILURE);
	}
	/* Number of base pages per PMD-sized hugepage. */
	hpage_pmd_nr = hpage_pmd_size / page_size;

	/* Scale khugepaged scan limits to the actual PMD geometry. */
	default_settings.khugepaged.max_ptes_none = hpage_pmd_nr - 1;
	default_settings.khugepaged.max_ptes_swap = hpage_pmd_nr / 8;
	default_settings.khugepaged.max_ptes_shared = hpage_pmd_nr / 2;
	default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8;

	save_settings();
	push_settings(&default_settings);

	alloc_at_fault();

/* Run test t only if both its context c and mem_ops o were selected. */
#define TEST(t, c, o) do { \
	if (c && o) { \
		printf("\nRun test: " #t " (%s:%s)\n", c->name, o->name); \
		t(c, o); \
	} \
	} while (0)

	TEST(collapse_full, khugepaged_context, anon_ops);
	TEST(collapse_full, khugepaged_context, file_ops);
	TEST(collapse_full, khugepaged_context, shmem_ops);
	TEST(collapse_full, madvise_context, anon_ops);
	TEST(collapse_full, madvise_context, file_ops);
	TEST(collapse_full, madvise_context, shmem_ops);

	TEST(collapse_empty, khugepaged_context, anon_ops);
	TEST(collapse_empty, madvise_context, anon_ops);

	TEST(collapse_single_pte_entry, khugepaged_context, anon_ops);
	TEST(collapse_single_pte_entry, khugepaged_context, file_ops);
	TEST(collapse_single_pte_entry, khugepaged_context, shmem_ops);
	TEST(collapse_single_pte_entry, madvise_context, anon_ops);
	TEST(collapse_single_pte_entry, madvise_context, file_ops);
	TEST(collapse_single_pte_entry, madvise_context, shmem_ops);

	TEST(collapse_max_ptes_none, khugepaged_context, anon_ops);
	TEST(collapse_max_ptes_none, khugepaged_context, file_ops);
	TEST(collapse_max_ptes_none, madvise_context, anon_ops);
	TEST(collapse_max_ptes_none, madvise_context, file_ops);

	TEST(collapse_single_pte_entry_compound, khugepaged_context, anon_ops);
	TEST(collapse_single_pte_entry_compound, khugepaged_context, file_ops);
	TEST(collapse_single_pte_entry_compound, madvise_context, anon_ops);
	TEST(collapse_single_pte_entry_compound, madvise_context, file_ops);

	TEST(collapse_full_of_compound, khugepaged_context, anon_ops);
	TEST(collapse_full_of_compound, khugepaged_context, file_ops);
	TEST(collapse_full_of_compound, khugepaged_context, shmem_ops);
	TEST(collapse_full_of_compound, madvise_context, anon_ops);
	TEST(collapse_full_of_compound, madvise_context, file_ops);
	TEST(collapse_full_of_compound, madvise_context, shmem_ops);

	/* mremap-based construction only works on anonymous memory here. */
	TEST(collapse_compound_extreme, khugepaged_context, anon_ops);
	TEST(collapse_compound_extreme, madvise_context, anon_ops);

	TEST(collapse_swapin_single_pte, khugepaged_context, anon_ops);
	TEST(collapse_swapin_single_pte, madvise_context, anon_ops);

	TEST(collapse_max_ptes_swap, khugepaged_context, anon_ops);
	TEST(collapse_max_ptes_swap, madvise_context, anon_ops);

	/* fork()/CoW semantics only apply to anonymous memory. */
	TEST(collapse_fork, khugepaged_context, anon_ops);
	TEST(collapse_fork, madvise_context, anon_ops);

	TEST(collapse_fork_compound, khugepaged_context, anon_ops);
	TEST(collapse_fork_compound, madvise_context, anon_ops);

	TEST(collapse_max_ptes_shared, khugepaged_context, anon_ops);
	TEST(collapse_max_ptes_shared, madvise_context, anon_ops);

	/* MADV_COLLAPSE-specific behavior; madvise context only. */
	TEST(madvise_collapse_existing_thps, madvise_context, anon_ops);
	TEST(madvise_collapse_existing_thps, madvise_context, file_ops);
	TEST(madvise_collapse_existing_thps, madvise_context, shmem_ops);

	TEST(madvise_retracted_page_tables, madvise_context, file_ops);
	TEST(madvise_retracted_page_tables, madvise_context, shmem_ops);

	/* NOTE(review): presumably exits with the accumulated status — confirm. */
	restore_settings(0);
}
1553