1 /* 2 * NUMA emulation 3 */ 4 #include <linux/kernel.h> 5 #include <linux/errno.h> 6 #include <linux/topology.h> 7 #include <linux/memblock.h> 8 #include <linux/bootmem.h> 9 #include <asm/dma.h> 10 11 #include "numa_internal.h" 12 13 static int emu_nid_to_phys[MAX_NUMNODES] __cpuinitdata; 14 static char *emu_cmdline __initdata; 15 16 void __init numa_emu_cmdline(char *str) 17 { 18 emu_cmdline = str; 19 } 20 21 static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi) 22 { 23 int i; 24 25 for (i = 0; i < mi->nr_blks; i++) 26 if (mi->blk[i].nid == nid) 27 return i; 28 return -ENOENT; 29 } 30 31 static u64 __init mem_hole_size(u64 start, u64 end) 32 { 33 unsigned long start_pfn = PFN_UP(start); 34 unsigned long end_pfn = PFN_DOWN(end); 35 36 if (start_pfn < end_pfn) 37 return PFN_PHYS(absent_pages_in_range(start_pfn, end_pfn)); 38 return 0; 39 } 40 41 /* 42 * Sets up nid to range from @start to @end. The return value is -errno if 43 * something went wrong, 0 otherwise. 44 */ 45 static int __init emu_setup_memblk(struct numa_meminfo *ei, 46 struct numa_meminfo *pi, 47 int nid, int phys_blk, u64 size) 48 { 49 struct numa_memblk *eb = &ei->blk[ei->nr_blks]; 50 struct numa_memblk *pb = &pi->blk[phys_blk]; 51 52 if (ei->nr_blks >= NR_NODE_MEMBLKS) { 53 pr_err("NUMA: Too many emulated memblks, failing emulation\n"); 54 return -EINVAL; 55 } 56 57 ei->nr_blks++; 58 eb->start = pb->start; 59 eb->end = pb->start + size; 60 eb->nid = nid; 61 62 if (emu_nid_to_phys[nid] == NUMA_NO_NODE) 63 emu_nid_to_phys[nid] = nid; 64 65 pb->start += size; 66 if (pb->start >= pb->end) { 67 WARN_ON_ONCE(pb->start > pb->end); 68 numa_remove_memblk_from(phys_blk, pi); 69 } 70 71 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid, 72 eb->start, eb->end, (eb->end - eb->start) >> 20); 73 return 0; 74 } 75 76 /* 77 * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr 78 * to max_addr. The return value is the number of nodes allocated. 79 */ 80 static int __init split_nodes_interleave(struct numa_meminfo *ei, 81 struct numa_meminfo *pi, 82 u64 addr, u64 max_addr, int nr_nodes) 83 { 84 nodemask_t physnode_mask = NODE_MASK_NONE; 85 u64 size; 86 int big; 87 int nid = 0; 88 int i, ret; 89 90 if (nr_nodes <= 0) 91 return -1; 92 if (nr_nodes > MAX_NUMNODES) { 93 pr_info("numa=fake=%d too large, reducing to %d\n", 94 nr_nodes, MAX_NUMNODES); 95 nr_nodes = MAX_NUMNODES; 96 } 97 98 /* 99 * Calculate target node size. x86_32 freaks on __udivdi3() so do 100 * the division in ulong number of pages and convert back. 101 */ 102 size = max_addr - addr - mem_hole_size(addr, max_addr); 103 size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes); 104 105 /* 106 * Calculate the number of big nodes that can be allocated as a result 107 * of consolidating the remainder. 108 */ 109 big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) / 110 FAKE_NODE_MIN_SIZE; 111 112 size &= FAKE_NODE_MIN_HASH_MASK; 113 if (!size) { 114 pr_err("Not enough memory for each node. " 115 "NUMA emulation disabled.\n"); 116 return -1; 117 } 118 119 for (i = 0; i < pi->nr_blks; i++) 120 node_set(pi->blk[i].nid, physnode_mask); 121 122 /* 123 * Continue to fill physical nodes with fake nodes until there is no 124 * memory left on any of them. 125 */ 126 while (nodes_weight(physnode_mask)) { 127 for_each_node_mask(i, physnode_mask) { 128 u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); 129 u64 start, limit, end; 130 int phys_blk; 131 132 phys_blk = emu_find_memblk_by_nid(i, pi); 133 if (phys_blk < 0) { 134 node_clear(i, physnode_mask); 135 continue; 136 } 137 start = pi->blk[phys_blk].start; 138 limit = pi->blk[phys_blk].end; 139 end = start + size; 140 141 if (nid < big) 142 end += FAKE_NODE_MIN_SIZE; 143 144 /* 145 * Continue to add memory to this fake node if its 146 * non-reserved memory is less than the per-node size. 147 */ 148 while (end - start - mem_hole_size(start, end) < size) { 149 end += FAKE_NODE_MIN_SIZE; 150 if (end > limit) { 151 end = limit; 152 break; 153 } 154 } 155 156 /* 157 * If there won't be at least FAKE_NODE_MIN_SIZE of 158 * non-reserved memory in ZONE_DMA32 for the next node, 159 * this one must extend to the boundary. 160 */ 161 if (end < dma32_end && dma32_end - end - 162 mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) 163 end = dma32_end; 164 165 /* 166 * If there won't be enough non-reserved memory for the 167 * next node, this one must extend to the end of the 168 * physical node. 169 */ 170 if (limit - end - mem_hole_size(end, limit) < size) 171 end = limit; 172 173 ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes, 174 phys_blk, 175 min(end, limit) - start); 176 if (ret < 0) 177 return ret; 178 } 179 } 180 return 0; 181 } 182 183 /* 184 * Returns the end address of a node so that there is at least `size' amount of 185 * non-reserved memory or `max_addr' is reached. 186 */ 187 static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) 188 { 189 u64 end = start + size; 190 191 while (end - start - mem_hole_size(start, end) < size) { 192 end += FAKE_NODE_MIN_SIZE; 193 if (end > max_addr) { 194 end = max_addr; 195 break; 196 } 197 } 198 return end; 199 } 200 201 /* 202 * Sets up fake nodes of `size' interleaved over physical nodes ranging from 203 * `addr' to `max_addr'. The return value is the number of nodes allocated. 204 */ 205 static int __init split_nodes_size_interleave(struct numa_meminfo *ei, 206 struct numa_meminfo *pi, 207 u64 addr, u64 max_addr, u64 size) 208 { 209 nodemask_t physnode_mask = NODE_MASK_NONE; 210 u64 min_size; 211 int nid = 0; 212 int i, ret; 213 214 if (!size) 215 return -1; 216 /* 217 * The limit on emulated nodes is MAX_NUMNODES, so the size per node is 218 * increased accordingly if the requested size is too small. This 219 * creates a uniform distribution of node sizes across the entire 220 * machine (but not necessarily over physical nodes). 221 */ 222 min_size = (max_addr - addr - mem_hole_size(addr, max_addr)) / MAX_NUMNODES; 223 min_size = max(min_size, FAKE_NODE_MIN_SIZE); 224 if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size) 225 min_size = (min_size + FAKE_NODE_MIN_SIZE) & 226 FAKE_NODE_MIN_HASH_MASK; 227 if (size < min_size) { 228 pr_err("Fake node size %LuMB too small, increasing to %LuMB\n", 229 size >> 20, min_size >> 20); 230 size = min_size; 231 } 232 size &= FAKE_NODE_MIN_HASH_MASK; 233 234 for (i = 0; i < pi->nr_blks; i++) 235 node_set(pi->blk[i].nid, physnode_mask); 236 237 /* 238 * Fill physical nodes with fake nodes of size until there is no memory 239 * left on any of them. 240 */ 241 while (nodes_weight(physnode_mask)) { 242 for_each_node_mask(i, physnode_mask) { 243 u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); 244 u64 start, limit, end; 245 int phys_blk; 246 247 phys_blk = emu_find_memblk_by_nid(i, pi); 248 if (phys_blk < 0) { 249 node_clear(i, physnode_mask); 250 continue; 251 } 252 start = pi->blk[phys_blk].start; 253 limit = pi->blk[phys_blk].end; 254 255 end = find_end_of_node(start, limit, size); 256 /* 257 * If there won't be at least FAKE_NODE_MIN_SIZE of 258 * non-reserved memory in ZONE_DMA32 for the next node, 259 * this one must extend to the boundary. 260 */ 261 if (end < dma32_end && dma32_end - end - 262 mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) 263 end = dma32_end; 264 265 /* 266 * If there won't be enough non-reserved memory for the 267 * next node, this one must extend to the end of the 268 * physical node. 269 */ 270 if (limit - end - mem_hole_size(end, limit) < size) 271 end = limit; 272 273 ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES, 274 phys_blk, 275 min(end, limit) - start); 276 if (ret < 0) 277 return ret; 278 } 279 } 280 return 0; 281 } 282 283 /** 284 * numa_emulation - Emulate NUMA nodes 285 * @numa_meminfo: NUMA configuration to massage 286 * @numa_dist_cnt: The size of the physical NUMA distance table 287 * 288 * Emulate NUMA nodes according to the numa=fake kernel parameter. 289 * @numa_meminfo contains the physical memory configuration and is modified 290 * to reflect the emulated configuration on success. @numa_dist_cnt is 291 * used to determine the size of the physical distance table. 292 * 293 * On success, the following modifications are made. 294 * 295 * - @numa_meminfo is updated to reflect the emulated nodes. 296 * 297 * - __apicid_to_node[] is updated such that APIC IDs are mapped to the 298 * emulated nodes. 299 * 300 * - NUMA distance table is rebuilt to represent distances between emulated 301 * nodes. The distances are determined considering how emulated nodes 302 * are mapped to physical nodes and match the actual distances. 303 * 304 * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical 305 * nodes. This is used by numa_add_cpu() and numa_remove_cpu(). 306 * 307 * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with 308 * identity mapping and no other modification is made. 309 */ 310 void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) 311 { 312 static struct numa_meminfo ei __initdata; 313 static struct numa_meminfo pi __initdata; 314 const u64 max_addr = PFN_PHYS(max_pfn); 315 u8 *phys_dist = NULL; 316 size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]); 317 int max_emu_nid, dfl_phys_nid; 318 int i, j, ret; 319 320 if (!emu_cmdline) 321 goto no_emu; 322 323 memset(&ei, 0, sizeof(ei)); 324 pi = *numa_meminfo; 325 326 for (i = 0; i < MAX_NUMNODES; i++) 327 emu_nid_to_phys[i] = NUMA_NO_NODE; 328 329 /* 330 * If the numa=fake command-line contains a 'M' or 'G', it represents 331 * the fixed node size. Otherwise, if it is just a single number N, 332 * split the system RAM into N fake nodes. 333 */ 334 if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) { 335 u64 size; 336 337 size = memparse(emu_cmdline, &emu_cmdline); 338 ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size); 339 } else { 340 unsigned long n; 341 342 n = simple_strtoul(emu_cmdline, NULL, 0); 343 ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n); 344 } 345 346 if (ret < 0) 347 goto no_emu; 348 349 if (numa_cleanup_meminfo(&ei) < 0) { 350 pr_warning("NUMA: Warning: constructed meminfo invalid, disabling emulation\n"); 351 goto no_emu; 352 } 353 354 /* copy the physical distance table */ 355 if (numa_dist_cnt) { 356 u64 phys; 357 358 phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), 359 phys_size, PAGE_SIZE); 360 if (!phys) { 361 pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n"); 362 goto no_emu; 363 } 364 memblock_reserve(phys, phys_size); 365 phys_dist = __va(phys); 366 367 for (i = 0; i < numa_dist_cnt; i++) 368 for (j = 0; j < numa_dist_cnt; j++) 369 phys_dist[i * numa_dist_cnt + j] = 370 node_distance(i, j); 371 } 372 373 /* 374 * Determine the max emulated nid and the default phys nid to use 375 * for unmapped nodes. 376 */ 377 max_emu_nid = 0; 378 dfl_phys_nid = NUMA_NO_NODE; 379 for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) { 380 if (emu_nid_to_phys[i] != NUMA_NO_NODE) { 381 max_emu_nid = i; 382 if (dfl_phys_nid == NUMA_NO_NODE) 383 dfl_phys_nid = emu_nid_to_phys[i]; 384 } 385 } 386 if (dfl_phys_nid == NUMA_NO_NODE) { 387 pr_warning("NUMA: Warning: can't determine default physical node, disabling emulation\n"); 388 goto no_emu; 389 } 390 391 /* commit */ 392 *numa_meminfo = ei; 393 394 /* 395 * Transform __apicid_to_node table to use emulated nids by 396 * reverse-mapping phys_nid. The maps should always exist but fall 397 * back to zero just in case. 398 */ 399 for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) { 400 if (__apicid_to_node[i] == NUMA_NO_NODE) 401 continue; 402 for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++) 403 if (__apicid_to_node[i] == emu_nid_to_phys[j]) 404 break; 405 __apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0; 406 } 407 408 /* make sure all emulated nodes are mapped to a physical node */ 409 for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) 410 if (emu_nid_to_phys[i] == NUMA_NO_NODE) 411 emu_nid_to_phys[i] = dfl_phys_nid; 412 413 /* transform distance table */ 414 numa_reset_distance(); 415 for (i = 0; i < max_emu_nid + 1; i++) { 416 for (j = 0; j < max_emu_nid + 1; j++) { 417 int physi = emu_nid_to_phys[i]; 418 int physj = emu_nid_to_phys[j]; 419 int dist; 420 421 if (physi >= numa_dist_cnt || physj >= numa_dist_cnt) 422 dist = physi == physj ? 423 LOCAL_DISTANCE : REMOTE_DISTANCE; 424 else 425 dist = phys_dist[physi * numa_dist_cnt + physj]; 426 427 numa_set_distance(i, j, dist); 428 } 429 } 430 431 /* free the copied physical distance table */ 432 if (phys_dist) 433 memblock_free(__pa(phys_dist), phys_size); 434 return; 435 436 no_emu: 437 /* No emulation. Build identity emu_nid_to_phys[] for numa_add_cpu() */ 438 for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) 439 emu_nid_to_phys[i] = i; 440 } 441 442 #ifndef CONFIG_DEBUG_PER_CPU_MAPS 443 void __cpuinit numa_add_cpu(int cpu) 444 { 445 int physnid, nid; 446 447 nid = early_cpu_to_node(cpu); 448 BUG_ON(nid == NUMA_NO_NODE || !node_online(nid)); 449 450 physnid = emu_nid_to_phys[nid]; 451 452 /* 453 * Map the cpu to each emulated node that is allocated on the physical 454 * node of the cpu's apic id. 455 */ 456 for_each_online_node(nid) 457 if (emu_nid_to_phys[nid] == physnid) 458 cpumask_set_cpu(cpu, node_to_cpumask_map[nid]); 459 } 460 461 void __cpuinit numa_remove_cpu(int cpu) 462 { 463 int i; 464 465 for_each_online_node(i) 466 cpumask_clear_cpu(cpu, node_to_cpumask_map[i]); 467 } 468 #else /* !CONFIG_DEBUG_PER_CPU_MAPS */ 469 static void __cpuinit numa_set_cpumask(int cpu, bool enable) 470 { 471 int nid, physnid; 472 473 nid = early_cpu_to_node(cpu); 474 if (nid == NUMA_NO_NODE) { 475 /* early_cpu_to_node() already emits a warning and trace */ 476 return; 477 } 478 479 physnid = emu_nid_to_phys[nid]; 480 481 for_each_online_node(nid) { 482 if (emu_nid_to_phys[nid] != physnid) 483 continue; 484 485 debug_cpumask_set_cpu(cpu, nid, enable); 486 } 487 } 488 489 void __cpuinit numa_add_cpu(int cpu) 490 { 491 numa_set_cpumask(cpu, true); 492 } 493 494 void __cpuinit numa_remove_cpu(int cpu) 495 { 496 numa_set_cpumask(cpu, false); 497 } 498 #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ 499