linux/arch/ia64/mm/numa.c

/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * This file contains NUMA specific variables and functions which can
 * be split away from DISCONTIGMEM and are used on NUMA machines with
 * contiguous memory.
 * 
 *                         2002/08/07 Erich Focht <efocht@ess.nec.de>
 */

#include <linux/cpu.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/node.h>
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/module.h>
#include <asm/mmzone.h>
#include <asm/numa.h>


/*
 * The following structures are usually initialized by ACPI or
 * similar mechanisms and describe the NUMA characteristics of the machine.
 */
int num_node_memblks;
struct node_memblk_s node_memblk[NR_NODE_MEMBLKS];
struct node_cpuid_s node_cpuid[NR_CPUS] =
	{ [0 ... NR_CPUS-1] = { .phys_id = 0, .nid = NUMA_NO_NODE } };

/*
 * This is a matrix with "distances" between nodes, they should be
 * proportional to the memory access latency ratios.
 */
u8 numa_slit[MAX_NUMNODES * MAX_NUMNODES];

/* Identify which cnode a physical address resides on */
int
paddr_to_nid(unsigned long paddr)
{
	int	i;

	for (i = 0; i < num_node_memblks; i++)
		if (paddr >= node_memblk[i].start_paddr &&
		    paddr < node_memblk[i].start_paddr + node_memblk[i].size)
			break;

	return (i < num_node_memblks) ? node_memblk[i].nid : (num_node_memblks ? -1 : 0);
}

#if defined(CONFIG_SPARSEMEM) && defined(CONFIG_NUMA)
/*
 * Because of holes evaluate on section limits.
 * If the section of memory exists, then return the node where the section
 * resides.  Otherwise return node 0 as the default.  This is used by
 * SPARSEMEM to allocate the SPARSEMEM sectionmap on the NUMA node where
 * the section resides.
 */
int __meminit __early_pfn_to_nid(unsigned long pfn)
{
	int i, section = pfn >> PFN_SECTION_SHIFT, ssec, esec;

	for (i = 0; i < num_node_memblks; i++) {
		ssec = node_memblk[i].start_paddr >> PA_SECTION_SHIFT;
		esec = (node_memblk[i].start_paddr + node_memblk[i].size +
			((1L << PA_SECTION_SHIFT) - 1)) >> PA_SECTION_SHIFT;
		if (section >= ssec && section < esec)
			return node_memblk[i].nid;
	}

	return -1;
}

#ifdef CONFIG_MEMORY_HOTPLUG
/*
 *  SRAT information is stored in node_memblk[], then we can use SRAT
 *  information at memory-hot-add if necessary.
 */

int memory_add_physaddr_to_nid(u64 addr)
{
	int nid = paddr_to_nid(addr);
	if (nid < 0)
		return 0;
	return nid;
}

EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif
#endif
Linux-2.6.12-rc2 Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip! 2005-04-16 22:20:36 +00:00			`/*`
			`* This file is subject to the terms and conditions of the GNU General Public`
			`* License. See the file "COPYING" in the main directory of this archive`
			`* for more details.`
			`*`
			`* This file contains NUMA specific variables and functions which can`
			`* be split away from DISCONTIGMEM and are used on NUMA machines with`
			`* contiguous memory.`
			`*`
			`* 2002/08/07 Erich Focht <efocht@ess.nec.de>`
			`*/`

			`#include <linux/cpu.h>`
			`#include <linux/kernel.h>`
			`#include <linux/mm.h>`
			`#include <linux/node.h>`
			`#include <linux/init.h>`
			`#include <linux/bootmem.h>`
[PATCH] hot-add-mem x86_64: memory_add_physaddr_to_nid node fixup In cases where the acpi memory-add event does not containe the pxm (node) infomation allow the driver to look up node info based on the address. The acpi_get_node call returns -1 if it can't decode the pxm info, this causes add_memory to panic. acpi_get_node would have to decode the resource from the handle (a lenghty proposition). This seems to be the cleanist point to interject the hook. [kamezawa.hiroyu@jp.fujitsu.com: build fixes] [y-goto@jp.fujitsu.com: build fixes] Signed-off-by: Keith Mannthey <kmannth@us.ibm.com> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Andi Kleen <ak@muc.de> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org> 2006-10-01 06:27:07 +00:00			`#include <linux/module.h>`
Linux-2.6.12-rc2 Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip! 2005-04-16 22:20:36 +00:00			`#include <asm/mmzone.h>`
			`#include <asm/numa.h>`


			`/*`
			`* The following structures are usually initialized by ACPI or`
			`* similar mechanisms and describe the NUMA characteristics of the machine.`
			`*/`
			`int num_node_memblks;`
			`struct node_memblk_s node_memblk[NR_NODE_MEMBLKS];`
[IA64] Minimize per_cpu reservations. This attached patch significantly shrinks boot memory allocation on ia64. It does this by not allocating per_cpu areas for cpus that can never exist. In the case where acpi does not have any numa node description of the cpus, I defaulted to assigning the first 32 round-robin on the known nodes.. For the !CONFIG_ACPI I used for_each_possible_cpu(). Signed-off-by: Robin Holt <holt@sgi.com> Signed-off-by: Tony Luck <tony.luck@intel.com> 2008-04-03 20:17:13 +00:00			`struct node_cpuid_s node_cpuid[NR_CPUS] =`
			`{ [0 ... NR_CPUS-1] = { .phys_id = 0, .nid = NUMA_NO_NODE } };`

Linux-2.6.12-rc2 Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip! 2005-04-16 22:20:36 +00:00			`/*`
			`* This is a matrix with "distances" between nodes, they should be`
			`* proportional to the memory access latency ratios.`
			`*/`
			`u8 numa_slit[MAX_NUMNODES * MAX_NUMNODES];`

			`/* Identify which cnode a physical address resides on */`
			`int`
			`paddr_to_nid(unsigned long paddr)`
			`{`
			`int i;`

			`for (i = 0; i < num_node_memblks; i++)`
			`if (paddr >= node_memblk[i].start_paddr &&`
			`paddr < node_memblk[i].start_paddr + node_memblk[i].size)`
			`break;`

			`return (i < num_node_memblks) ? node_memblk[i].nid : (num_node_memblks ? -1 : 0);`
			`}`
[PATCH] V5 ia64 SPARSEMEM - SPARSEMEM code changes This patch is the minimal set of changes required by ia64 to use SPARSEMEM. Signed-off-by: Bob Picco <bob.picco@hp.com> Signed-off-by: Tony Luck <tony.luck@intel.com> 2005-10-04 19:13:57 +00:00
			`#if defined(CONFIG_SPARSEMEM) && defined(CONFIG_NUMA)`
			`/*`
			`* Because of holes evaluate on section limits.`
			`* If the section of memory exists, then return the node where the section`
			`* resides. Otherwise return node 0 as the default. This is used by`
			`* SPARSEMEM to allocate the SPARSEMEM sectionmap on the NUMA node where`
			`* the section resides.`
			`*/`
mm: clean up for early_pfn_to_nid() What's happening is that the assertion in mm/page_alloc.c:move_freepages() is triggering: BUG_ON(page_zone(start_page) != page_zone(end_page)); Once I knew this is what was happening, I added some annotations: if (unlikely(page_zone(start_page) != page_zone(end_page))) { printk(KERN_ERR "move_freepages: Bogus zones: " "start_page[%p] end_page[%p] zone[%p]\n", start_page, end_page, zone); printk(KERN_ERR "move_freepages: " "start_zone[%p] end_zone[%p]\n", page_zone(start_page), page_zone(end_page)); printk(KERN_ERR "move_freepages: " "start_pfn[0x%lx] end_pfn[0x%lx]\n", page_to_pfn(start_page), page_to_pfn(end_page)); printk(KERN_ERR "move_freepages: " "start_nid[%d] end_nid[%d]\n", page_to_nid(start_page), page_to_nid(end_page)); ... And here's what I got: move_freepages: Bogus zones: start_page[2207d0000] end_page[2207dffc0] zone[fffff8103effcb00] move_freepages: start_zone[fffff8103effcb00] end_zone[fffff8003fffeb00] move_freepages: start_pfn[0x81f600] end_pfn[0x81f7ff] move_freepages: start_nid[1] end_nid[0] My memory layout on this box is: [ 0.000000] Zone PFN ranges: [ 0.000000] Normal 0x00000000 -> 0x0081ff5d [ 0.000000] Movable zone start PFN for each node [ 0.000000] early_node_map[8] active PFN ranges [ 0.000000] 0: 0x00000000 -> 0x00020000 [ 0.000000] 1: 0x00800000 -> 0x0081f7ff [ 0.000000] 1: 0x0081f800 -> 0x0081fe50 [ 0.000000] 1: 0x0081fed1 -> 0x0081fed8 [ 0.000000] 1: 0x0081feda -> 0x0081fedb [ 0.000000] 1: 0x0081fedd -> 0x0081fee5 [ 0.000000] 1: 0x0081fee7 -> 0x0081ff51 [ 0.000000] 1: 0x0081ff59 -> 0x0081ff5d So it's a block move in that 0x81f600-->0x81f7ff region which triggers the problem. This patch: Declaration of early_pfn_to_nid() is scattered over per-arch include files, and it seems it's complicated to know when the declaration is used. I think it makes fix-for-memmap-init not easy. This patch moves all declaration to include/linux/mm.h After this, if !CONFIG_NODES_POPULATES_NODE_MAP && !CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID -> Use static definition in include/linux/mm.h else if !CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID -> Use generic definition in mm/page_alloc.c else -> per-arch back end function will be called. Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Tested-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Reported-by: David Miller <davem@davemlloft.net> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: <stable@kernel.org> [2.6.25.x, 2.6.26.x, 2.6.27.x, 2.6.28.x] Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2009-02-18 22:48:32 +00:00			`int __meminit __early_pfn_to_nid(unsigned long pfn)`
[PATCH] V5 ia64 SPARSEMEM - SPARSEMEM code changes This patch is the minimal set of changes required by ia64 to use SPARSEMEM. Signed-off-by: Bob Picco <bob.picco@hp.com> Signed-off-by: Tony Luck <tony.luck@intel.com> 2005-10-04 19:13:57 +00:00			`{`
			`int i, section = pfn >> PFN_SECTION_SHIFT, ssec, esec;`

			`for (i = 0; i < num_node_memblks; i++) {`
			`ssec = node_memblk[i].start_paddr >> PA_SECTION_SHIFT;`
			`esec = (node_memblk[i].start_paddr + node_memblk[i].size +`
			`((1L << PA_SECTION_SHIFT) - 1)) >> PA_SECTION_SHIFT;`
			`if (section >= ssec && section < esec)`
			`return node_memblk[i].nid;`
			`}`

mm: fix memmap init for handling memory hole Now, early_pfn_in_nid(PFN, NID) may returns false if PFN is a hole. and memmap initialization was not done. This was a trouble for sparc boot. To fix this, the PFN should be initialized and marked as PG_reserved. This patch changes early_pfn_in_nid() return true if PFN is a hole. Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Reported-by: David Miller <davem@davemlloft.net> Tested-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: <stable@kernel.org> [2.6.25.x, 2.6.26.x, 2.6.27.x, 2.6.28.x] Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2009-02-18 22:48:33 +00:00			`return -1;`
[PATCH] V5 ia64 SPARSEMEM - SPARSEMEM code changes This patch is the minimal set of changes required by ia64 to use SPARSEMEM. Signed-off-by: Bob Picco <bob.picco@hp.com> Signed-off-by: Tony Luck <tony.luck@intel.com> 2005-10-04 19:13:57 +00:00			`}`
[PATCH] hot-add-mem x86_64: memory_add_physaddr_to_nid node fixup In cases where the acpi memory-add event does not containe the pxm (node) infomation allow the driver to look up node info based on the address. The acpi_get_node call returns -1 if it can't decode the pxm info, this causes add_memory to panic. acpi_get_node would have to decode the resource from the handle (a lenghty proposition). This seems to be the cleanist point to interject the hook. [kamezawa.hiroyu@jp.fujitsu.com: build fixes] [y-goto@jp.fujitsu.com: build fixes] Signed-off-by: Keith Mannthey <kmannth@us.ibm.com> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Andi Kleen <ak@muc.de> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org> 2006-10-01 06:27:07 +00:00
			`#ifdef CONFIG_MEMORY_HOTPLUG`
			`/*`
			`* SRAT information is stored in node_memblk[], then we can use SRAT`
			`* information at memory-hot-add if necessary.`
			`*/`

			`int memory_add_physaddr_to_nid(u64 addr)`
			`{`
			`int nid = paddr_to_nid(addr);`
			`if (nid < 0)`
			`return 0;`
			`return nid;`
			`}`

			`EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);`
			`#endif`
[PATCH] V5 ia64 SPARSEMEM - SPARSEMEM code changes This patch is the minimal set of changes required by ia64 to use SPARSEMEM. Signed-off-by: Bob Picco <bob.picco@hp.com> Signed-off-by: Tony Luck <tony.luck@intel.com> 2005-10-04 19:13:57 +00:00			`#endif`