1 /* 2 * NUMA emulation 3 */ 4 #include <linux/kernel.h> 5 #include <linux/errno.h> 6 #include <linux/topology.h> 7 #include <linux/memblock.h> 8 #include <linux/bootmem.h> 9 #include <asm/dma.h> 10 11 #include "numa_internal.h" 12 13 static int emu_nid_to_phys[MAX_NUMNODES] __cpuinitdata; 14 static char *emu_cmdline __initdata; 15 16 void __init numa_emu_cmdline(char *str) 17 { 18 emu_cmdline = str; 19 } 20 21 static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi) 22 { 23 int i; 24 25 for (i = 0; i < mi->nr_blks; i++) 26 if (mi->blk[i].nid == nid) 27 return i; 28 return -ENOENT; 29 } 30 31 static u64 __init mem_hole_size(u64 start, u64 end) 32 { 33 unsigned long start_pfn = PFN_UP(start); 34 unsigned long end_pfn = PFN_DOWN(end); 35 36 if (start_pfn < end_pfn) 37 return PFN_PHYS(absent_pages_in_range(start_pfn, end_pfn)); 38 return 0; 39 } 40 41 /* 42 * Sets up nid to range from @start to @end. The return value is -errno if 43 * something went wrong, 0 otherwise. 44 */ 45 static int __init emu_setup_memblk(struct numa_meminfo *ei, 46 struct numa_meminfo *pi, 47 int nid, int phys_blk, u64 size) 48 { 49 struct numa_memblk *eb = &ei->blk[ei->nr_blks]; 50 struct numa_memblk *pb = &pi->blk[phys_blk]; 51 52 if (ei->nr_blks >= NR_NODE_MEMBLKS) { 53 pr_err("NUMA: Too many emulated memblks, failing emulation\n"); 54 return -EINVAL; 55 } 56 57 ei->nr_blks++; 58 eb->start = pb->start; 59 eb->end = pb->start + size; 60 eb->nid = nid; 61 62 if (emu_nid_to_phys[nid] == NUMA_NO_NODE) 63 emu_nid_to_phys[nid] = nid; 64 65 pb->start += size; 66 if (pb->start >= pb->end) { 67 WARN_ON_ONCE(pb->start > pb->end); 68 numa_remove_memblk_from(phys_blk, pi); 69 } 70 71 printk(KERN_INFO "Faking node %d at [mem %#018Lx-%#018Lx] (%LuMB)\n", 72 nid, eb->start, eb->end - 1, (eb->end - eb->start) >> 20); 73 return 0; 74 } 75 76 /* 77 * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr 78 * to max_addr. The return value is the number of nodes allocated. 79 */ 80 static int __init split_nodes_interleave(struct numa_meminfo *ei, 81 struct numa_meminfo *pi, 82 u64 addr, u64 max_addr, int nr_nodes) 83 { 84 nodemask_t physnode_mask = NODE_MASK_NONE; 85 u64 size; 86 int big; 87 int nid = 0; 88 int i, ret; 89 90 if (nr_nodes <= 0) 91 return -1; 92 if (nr_nodes > MAX_NUMNODES) { 93 pr_info("numa=fake=%d too large, reducing to %d\n", 94 nr_nodes, MAX_NUMNODES); 95 nr_nodes = MAX_NUMNODES; 96 } 97 98 /* 99 * Calculate target node size. x86_32 freaks on __udivdi3() so do 100 * the division in ulong number of pages and convert back. 101 */ 102 size = max_addr - addr - mem_hole_size(addr, max_addr); 103 size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes); 104 105 /* 106 * Calculate the number of big nodes that can be allocated as a result 107 * of consolidating the remainder. 108 */ 109 big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) / 110 FAKE_NODE_MIN_SIZE; 111 112 size &= FAKE_NODE_MIN_HASH_MASK; 113 if (!size) { 114 pr_err("Not enough memory for each node. " 115 "NUMA emulation disabled.\n"); 116 return -1; 117 } 118 119 for (i = 0; i < pi->nr_blks; i++) 120 node_set(pi->blk[i].nid, physnode_mask); 121 122 /* 123 * Continue to fill physical nodes with fake nodes until there is no 124 * memory left on any of them. 125 */ 126 while (nodes_weight(physnode_mask)) { 127 for_each_node_mask(i, physnode_mask) { 128 u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); 129 u64 start, limit, end; 130 int phys_blk; 131 132 phys_blk = emu_find_memblk_by_nid(i, pi); 133 if (phys_blk < 0) { 134 node_clear(i, physnode_mask); 135 continue; 136 } 137 start = pi->blk[phys_blk].start; 138 limit = pi->blk[phys_blk].end; 139 end = start + size; 140 141 if (nid < big) 142 end += FAKE_NODE_MIN_SIZE; 143 144 /* 145 * Continue to add memory to this fake node if its 146 * non-reserved memory is less than the per-node size. 147 */ 148 while (end - start - mem_hole_size(start, end) < size) { 149 end += FAKE_NODE_MIN_SIZE; 150 if (end > limit) { 151 end = limit; 152 break; 153 } 154 } 155 156 /* 157 * If there won't be at least FAKE_NODE_MIN_SIZE of 158 * non-reserved memory in ZONE_DMA32 for the next node, 159 * this one must extend to the boundary. 160 */ 161 if (end < dma32_end && dma32_end - end - 162 mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) 163 end = dma32_end; 164 165 /* 166 * If there won't be enough non-reserved memory for the 167 * next node, this one must extend to the end of the 168 * physical node. 169 */ 170 if (limit - end - mem_hole_size(end, limit) < size) 171 end = limit; 172 173 ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes, 174 phys_blk, 175 min(end, limit) - start); 176 if (ret < 0) 177 return ret; 178 } 179 } 180 return 0; 181 } 182 183 /* 184 * Returns the end address of a node so that there is at least `size' amount of 185 * non-reserved memory or `max_addr' is reached. 186 */ 187 static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) 188 { 189 u64 end = start + size; 190 191 while (end - start - mem_hole_size(start, end) < size) { 192 end += FAKE_NODE_MIN_SIZE; 193 if (end > max_addr) { 194 end = max_addr; 195 break; 196 } 197 } 198 return end; 199 } 200 201 /* 202 * Sets up fake nodes of `size' interleaved over physical nodes ranging from 203 * `addr' to `max_addr'. The return value is the number of nodes allocated. 204 */ 205 static int __init split_nodes_size_interleave(struct numa_meminfo *ei, 206 struct numa_meminfo *pi, 207 u64 addr, u64 max_addr, u64 size) 208 { 209 nodemask_t physnode_mask = NODE_MASK_NONE; 210 u64 min_size; 211 int nid = 0; 212 int i, ret; 213 214 if (!size) 215 return -1; 216 /* 217 * The limit on emulated nodes is MAX_NUMNODES, so the size per node is 218 * increased accordingly if the requested size is too small. This 219 * creates a uniform distribution of node sizes across the entire 220 * machine (but not necessarily over physical nodes). 221 */ 222 min_size = (max_addr - addr - mem_hole_size(addr, max_addr)) / MAX_NUMNODES; 223 min_size = max(min_size, FAKE_NODE_MIN_SIZE); 224 if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size) 225 min_size = (min_size + FAKE_NODE_MIN_SIZE) & 226 FAKE_NODE_MIN_HASH_MASK; 227 if (size < min_size) { 228 pr_err("Fake node size %LuMB too small, increasing to %LuMB\n", 229 size >> 20, min_size >> 20); 230 size = min_size; 231 } 232 size &= FAKE_NODE_MIN_HASH_MASK; 233 234 for (i = 0; i < pi->nr_blks; i++) 235 node_set(pi->blk[i].nid, physnode_mask); 236 237 /* 238 * Fill physical nodes with fake nodes of size until there is no memory 239 * left on any of them. 240 */ 241 while (nodes_weight(physnode_mask)) { 242 for_each_node_mask(i, physnode_mask) { 243 u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); 244 u64 start, limit, end; 245 int phys_blk; 246 247 phys_blk = emu_find_memblk_by_nid(i, pi); 248 if (phys_blk < 0) { 249 node_clear(i, physnode_mask); 250 continue; 251 } 252 start = pi->blk[phys_blk].start; 253 limit = pi->blk[phys_blk].end; 254 255 end = find_end_of_node(start, limit, size); 256 /* 257 * If there won't be at least FAKE_NODE_MIN_SIZE of 258 * non-reserved memory in ZONE_DMA32 for the next node, 259 * this one must extend to the boundary. 260 */ 261 if (end < dma32_end && dma32_end - end - 262 mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) 263 end = dma32_end; 264 265 /* 266 * If there won't be enough non-reserved memory for the 267 * next node, this one must extend to the end of the 268 * physical node. 269 */ 270 if (limit - end - mem_hole_size(end, limit) < size) 271 end = limit; 272 273 ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES, 274 phys_blk, 275 min(end, limit) - start); 276 if (ret < 0) 277 return ret; 278 } 279 } 280 return 0; 281 } 282 283 /** 284 * numa_emulation - Emulate NUMA nodes 285 * @numa_meminfo: NUMA configuration to massage 286 * @numa_dist_cnt: The size of the physical NUMA distance table 287 * 288 * Emulate NUMA nodes according to the numa=fake kernel parameter. 289 * @numa_meminfo contains the physical memory configuration and is modified 290 * to reflect the emulated configuration on success. @numa_dist_cnt is 291 * used to determine the size of the physical distance table. 292 * 293 * On success, the following modifications are made. 294 * 295 * - @numa_meminfo is updated to reflect the emulated nodes. 296 * 297 * - __apicid_to_node[] is updated such that APIC IDs are mapped to the 298 * emulated nodes. 299 * 300 * - NUMA distance table is rebuilt to represent distances between emulated 301 * nodes. The distances are determined considering how emulated nodes 302 * are mapped to physical nodes and match the actual distances. 303 * 304 * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical 305 * nodes. This is used by numa_add_cpu() and numa_remove_cpu(). 306 * 307 * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with 308 * identity mapping and no other modification is made. 309 */ 310 void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) 311 { 312 static struct numa_meminfo ei __initdata; 313 static struct numa_meminfo pi __initdata; 314 const u64 max_addr = PFN_PHYS(max_pfn); 315 u8 *phys_dist = NULL; 316 size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]); 317 int max_emu_nid, dfl_phys_nid; 318 int i, j, ret; 319 320 if (!emu_cmdline) 321 goto no_emu; 322 323 memset(&ei, 0, sizeof(ei)); 324 pi = *numa_meminfo; 325 326 for (i = 0; i < MAX_NUMNODES; i++) 327 emu_nid_to_phys[i] = NUMA_NO_NODE; 328 329 /* 330 * If the numa=fake command-line contains a 'M' or 'G', it represents 331 * the fixed node size. Otherwise, if it is just a single number N, 332 * split the system RAM into N fake nodes. 333 */ 334 if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) { 335 u64 size; 336 337 size = memparse(emu_cmdline, &emu_cmdline); 338 ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size); 339 } else { 340 unsigned long n; 341 342 n = simple_strtoul(emu_cmdline, &emu_cmdline, 0); 343 ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n); 344 } 345 if (*emu_cmdline == ':') 346 emu_cmdline++; 347 348 if (ret < 0) 349 goto no_emu; 350 351 if (numa_cleanup_meminfo(&ei) < 0) { 352 pr_warning("NUMA: Warning: constructed meminfo invalid, disabling emulation\n"); 353 goto no_emu; 354 } 355 356 /* copy the physical distance table */ 357 if (numa_dist_cnt) { 358 u64 phys; 359 360 phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), 361 phys_size, PAGE_SIZE); 362 if (!phys) { 363 pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n"); 364 goto no_emu; 365 } 366 memblock_reserve(phys, phys_size); 367 phys_dist = __va(phys); 368 369 for (i = 0; i < numa_dist_cnt; i++) 370 for (j = 0; j < numa_dist_cnt; j++) 371 phys_dist[i * numa_dist_cnt + j] = 372 node_distance(i, j); 373 } 374 375 /* 376 * Determine the max emulated nid and the default phys nid to use 377 * for unmapped nodes. 378 */ 379 max_emu_nid = 0; 380 dfl_phys_nid = NUMA_NO_NODE; 381 for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) { 382 if (emu_nid_to_phys[i] != NUMA_NO_NODE) { 383 max_emu_nid = i; 384 if (dfl_phys_nid == NUMA_NO_NODE) 385 dfl_phys_nid = emu_nid_to_phys[i]; 386 } 387 } 388 if (dfl_phys_nid == NUMA_NO_NODE) { 389 pr_warning("NUMA: Warning: can't determine default physical node, disabling emulation\n"); 390 goto no_emu; 391 } 392 393 /* commit */ 394 *numa_meminfo = ei; 395 396 /* 397 * Transform __apicid_to_node table to use emulated nids by 398 * reverse-mapping phys_nid. The maps should always exist but fall 399 * back to zero just in case. 400 */ 401 for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) { 402 if (__apicid_to_node[i] == NUMA_NO_NODE) 403 continue; 404 for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++) 405 if (__apicid_to_node[i] == emu_nid_to_phys[j]) 406 break; 407 __apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0; 408 } 409 410 /* make sure all emulated nodes are mapped to a physical node */ 411 for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) 412 if (emu_nid_to_phys[i] == NUMA_NO_NODE) 413 emu_nid_to_phys[i] = dfl_phys_nid; 414 415 /* transform distance table */ 416 numa_reset_distance(); 417 for (i = 0; i < max_emu_nid + 1; i++) { 418 for (j = 0; j < max_emu_nid + 1; j++) { 419 int physi = emu_nid_to_phys[i]; 420 int physj = emu_nid_to_phys[j]; 421 int dist; 422 423 if (get_option(&emu_cmdline, &dist) == 2) 424 ; 425 else if (physi >= numa_dist_cnt || physj >= numa_dist_cnt) 426 dist = physi == physj ? 427 LOCAL_DISTANCE : REMOTE_DISTANCE; 428 else 429 dist = phys_dist[physi * numa_dist_cnt + physj]; 430 431 numa_set_distance(i, j, dist); 432 } 433 } 434 435 /* free the copied physical distance table */ 436 if (phys_dist) 437 memblock_free(__pa(phys_dist), phys_size); 438 return; 439 440 no_emu: 441 /* No emulation. Build identity emu_nid_to_phys[] for numa_add_cpu() */ 442 for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) 443 emu_nid_to_phys[i] = i; 444 } 445 446 #ifndef CONFIG_DEBUG_PER_CPU_MAPS 447 void __cpuinit numa_add_cpu(int cpu) 448 { 449 int physnid, nid; 450 451 nid = early_cpu_to_node(cpu); 452 BUG_ON(nid == NUMA_NO_NODE || !node_online(nid)); 453 454 physnid = emu_nid_to_phys[nid]; 455 456 /* 457 * Map the cpu to each emulated node that is allocated on the physical 458 * node of the cpu's apic id. 459 */ 460 for_each_online_node(nid) 461 if (emu_nid_to_phys[nid] == physnid) 462 cpumask_set_cpu(cpu, node_to_cpumask_map[nid]); 463 } 464 465 void __cpuinit numa_remove_cpu(int cpu) 466 { 467 int i; 468 469 for_each_online_node(i) 470 cpumask_clear_cpu(cpu, node_to_cpumask_map[i]); 471 } 472 #else /* !CONFIG_DEBUG_PER_CPU_MAPS */ 473 static void __cpuinit numa_set_cpumask(int cpu, bool enable) 474 { 475 int nid, physnid; 476 477 nid = early_cpu_to_node(cpu); 478 if (nid == NUMA_NO_NODE) { 479 /* early_cpu_to_node() already emits a warning and trace */ 480 return; 481 } 482 483 physnid = emu_nid_to_phys[nid]; 484 485 for_each_online_node(nid) { 486 if (emu_nid_to_phys[nid] != physnid) 487 continue; 488 489 debug_cpumask_set_cpu(cpu, nid, enable); 490 } 491 } 492 493 void __cpuinit numa_add_cpu(int cpu) 494 { 495 numa_set_cpumask(cpu, true); 496 } 497 498 void __cpuinit numa_remove_cpu(int cpu) 499 { 500 numa_set_cpumask(cpu, false); 501 } 502 #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ 503