mm/bootmem.c    
   
   上次对启动过程有个简单总结,下面再次关注一下其中的bootmem部分。
setup.S->asmlinkage void __init start_kernel(void) (init/main.c)
  |
  +-->setup_arch --->  处理e820内存报告
                -->   关于内存的提示信息
                --->  初始化bootmem (init_bootmem)
                --->  paging_init--+
                         +-------+
  |                      +--> pagetable_init(含fix map,vmalloc init)
 \ /                     +--> load cr3
  .                      +--> kmap_init                        
  .                      +--> free_area_init(zone-buddy初始化)
  .             --->smp,apic,roms等处理
  +--> idt gate modules,kmem_cache_init
  |
  +--> mem_init -->free_all_bootmem buddy 得到页面控制权
  +
  +--> proc_root_init,fork_init, ipc,inode
  +--> smp_init
  +
  +--> 创建kernel thread, init (init/main.c->函数init)
                +--->do_basic_setup
                       ---->init pci,mtrr,sysctl,mca....
                       ---->filesystem_setup
                       ---->mount_root (关注...)
                       ---->......
                +--->  free_initmem
                +--->  打开console
                +--->execve("/sbin/init",argv_init,envp_init);
	        +--->execve("/etc/init",argv_init,envp_init);
	        +--->execve("/bin/init",argv_init,envp_init);
	        +--->execve("/bin/sh",argv_init,envp_init);

    首先是setup_arch(arch/i386/kernel/setup.c), 
void __init setup_arch(char **cmdline_p)
{
  .....
  	/*
	 * partially used pages are not usable - thus
	 * we are rounding upwards:
	 */
	start_pfn = PFN_UP(__pa(&_end));  /*
	                                   * first page frame at or above _end,
	                                   * rounded up; handed to init_bootmem()
	                                   * below as its first argument
	                                   */
        // (elided) next, the e820 report is scanned for the highest RAM page
        // and its pfn is stored in max_pfn
        
        // (elided) then max_low_pfn, highstart_pfn and highend_pfn are
        // determined; the logic is simple and not listed here
        
	/*
	 * Initialize the boot-time allocator (with low memory only):
	 */
	/*
	 * Only the range up to max_low_pfn is given to the allocator.
	 */
	bootmap_size = init_bootmem(start_pfn, max_low_pfn);
	
	/*
	 * Register fully available low RAM pages with the bootmem allocator.
	 */
	for (i = 0; i < e820.nr_map; i++) { // walk the e820 map looking for RAM entries
		unsigned long curr_pfn, last_pfn, size;
 		/*
		 * Reserve usable low memory
		 */
		if (e820.map[i].type != E820_RAM)
			continue;
		/*
		 * We are rounding up the start address of usable memory:
		 */
		curr_pfn = PFN_UP(e820.map[i].addr);
		if (curr_pfn >= max_low_pfn)
			continue;
		/*
		 * ... and at the end of the usable range downwards:
		 */
		last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);

		if (last_pfn > max_low_pfn)
			last_pfn = max_low_pfn;

		/*
		 * .. finally, did all the rounding and playing
		 * around just make the area go away?
		 */
		if (last_pfn <= curr_pfn)
			continue;

		size = last_pfn - curr_pfn;
		/*
		 * Register this RAM range with bootmem
		 * (presumably marks the pages free - free_bootmem() is not shown here).
		 */
		free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
	}

        /*
	 * Reserve the bootmem bitmap itself as well. We do this in two
	 * steps (first step was init_bootmem()) because this catches
	 * the (very unlikely) case of us accidentally initializing the
	 * bootmem allocator with an invalid RAM area.
	 */
	 /* Reserve everything from HIGH_MEMORY up to the end of the bootmem
	  * bitmap that sits at start_pfn, i.e. kernel image + bitmap.
	  * NOTE(review): HIGH_MEMORY is presumably the 1MB mark where the
	  * kernel is loaded - confirm against its definition.
	  */
	reserve_bootmem(HIGH_MEMORY, (PFN_PHYS(start_pfn) +
			 bootmap_size + PAGE_SIZE-1) - (HIGH_MEMORY));

	/*
	 * reserve physical page 0 - it's a special BIOS page on many boxes,
	 * enabling clean reboots, SMP operation, laptop functions.
	 */
	 reserve_bootmem(0, PAGE_SIZE);  /* reserve the page with pfn 0 */
	 
	 ..........
    

}

    然后在mem_init调用free_all_bootmem前就可以使用bootmem分配内存了.
当mem_init调用此函数后,buddy系统可以工作了.bootmem的使命即告终结.

    bootmem的管理结构如下:
    /* Per-node state of the boot-time allocator. */
    typedef struct bootmem_data {
	unsigned long node_boot_start; /* byte address of the first page covered by the bitmap (start pfn << PAGE_SHIFT) */
	unsigned long node_low_pfn;  /* upper pfn bound of the pages this node's bootmem manages */
	void *node_bootmem_map;   /* the bootmem bitmap, one bit per page; placed at the mapstart pfn given to init_bootmem_core() */
	unsigned long last_offset; /* offset inside page last_pos at which the previous allocation ended (0 if it ended on a page boundary) */
	unsigned long last_pos; /* bitmap index of the last page used by the previous allocation */
} bootmem_data_t;
    bootmem_data_t隶属于node,即pg_data_t(由其bdata成员指向).
    在NUMA系统中,每个节点对应一个node,即一个pgdat.

/* Per-node memory descriptor; nodes are chained through node_next. */
typedef struct pglist_data {
	zone_t node_zones[MAX_NR_ZONES];
	zonelist_t node_zonelists[NR_GFPINDEX];
	struct page *node_mem_map;  /* this node's array of struct page; on NUMA
	                             * it is used instead of the single global
	                             * mem_map
	                             */
	unsigned long *valid_addr_bitmap;
	struct bootmem_data *bdata; /* boot-time allocator state of this node */
	unsigned long node_start_paddr;
	unsigned long node_start_mapnr;
	unsigned long node_size;
	int node_id;
	struct pglist_data *node_next; /* next node on pgdat_list */
} pg_data_t;
     
    NUMA系统中的page结构数组,不再是全局变量mem_map了,而是每个node的
pgdat.node_mem_map.
    linux2.4.0中对NUMA的处理比较晦涩,不如直接看2.6的代码.2.6中看宏
/* Map a kernel virtual address to its struct page: physical address -> pfn -> page. */
#define virt_to_page(kaddr)	pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
     
    在NUMA系统pfn_to_page定义如下:
 /*
  * Translate a page frame number into its struct page on a multi-node
  * system: look up the owning node with pfn_to_nid(), then index that
  * node's node_mem_map with node_localnr(pfn, node).
  * The argument is copied into __pfn so it is evaluated only once.
  * NOTE(review): node_localnr() is not shown here; presumably it yields
  * the pfn's index relative to the start of the node - confirm.
  */
 #define pfn_to_page(pfn)					\
({								\
	unsigned long __pfn = pfn;				\
	int __node  = pfn_to_nid(__pfn);			\
	&node_mem_map(__node)[node_localnr(__pfn,__node)];	\
})
    
/* The struct page array of node 'nid', taken from that node's descriptor. */
#define node_mem_map(nid)	(NODE_DATA(nid)->node_mem_map)

    而pfn_to_nid在mmzone.h定义为
/*
 * generic node memory support, the following assumptions apply:
 *
 * 1) memory comes in 256Mb contiguous chunks which are either present or not
 * 2) we will not have more than 64Gb in total
 *
 * for now assume that 64Gb is max amount of RAM for whole system
 *    64Gb / 4096bytes/page = 16777216 pages
 */
#define MAX_NR_PAGES 16777216	/* 64Gb expressed in 4096-byte pages */
#define MAX_ELEMENTS 256	/* number of 256Mb chunks in 64Gb */
#define PAGES_PER_ELEMENT (MAX_NR_PAGES/MAX_ELEMENTS)	/* 65536 pages, i.e. one 256Mb chunk */

/* Indexed by pfn / PAGES_PER_ELEMENT in pfn_to_nid(); the entry is returned as the node id. */
extern s8 physnode_map[];

/*
 * Return the id of the node that owns page frame 'pfn'.
 *
 * Without CONFIG_NUMA the answer is always node 0. With CONFIG_NUMA the
 * pfn is reduced to its chunk index (pfn / PAGES_PER_ELEMENT) and the
 * node id is read from physnode_map[].
 */
static inline int pfn_to_nid(unsigned long pfn)
{
#ifndef CONFIG_NUMA
	return 0;
#else
	unsigned long element = pfn / PAGES_PER_ELEMENT;

	return (int) physnode_map[element];
#endif
}

     从其中注释,容易看懂pgdat之作用,以及zone的划分方式.
  
     
     bootmem.c涉及到的函数应该不难,有几个相关的注释,只是罗列于此,以
保分析完整.




/*
 * Return the number of _pages_ that will be allocated for the boot bitmap
 * describing 'pages' page frames.
 *
 * One bit is needed per page; the byte count is rounded up, then rounded
 * up again to a whole number of pages before being converted to pages.
 */
unsigned long __init bootmem_bootmap_pages (unsigned long pages)
{
	/* bits -> bytes, rounding up */
	unsigned long bitmap_bytes = (pages + 7) / 8;
	/* bytes -> page-aligned byte count, rounding up */
	unsigned long padded_bytes = (bitmap_bytes + ~PAGE_MASK) & PAGE_MASK;

	return padded_bytes >> PAGE_SHIFT;
}

/*
 * Called once to set up the allocator itself.
 *
 * pgdat:    node descriptor; it is pushed onto the head of pgdat_list and
 *           its bdata is initialised here.
 * mapstart: pfn at which the bootmem bitmap is placed.
 * start:    first pfn described by the bitmap (recorded, as a byte
 *           address, in node_boot_start).
 * end:      upper pfn bound managed by bootmem (recorded in node_low_pfn).
 *
 * Returns the size of the bitmap in bytes, rounded up to a multiple of
 * sizeof(long).
 *
 * Only the location of the bitmap is recorded; the caller still has to
 * reserve the memory that the bitmap occupies.
 */
static unsigned long __init init_bootmem_core (pg_data_t *pgdat,
	unsigned long mapstart, unsigned long start, unsigned long end)
{
	const unsigned long long_mask = sizeof(long) - 1UL;
	bootmem_data_t *bdata = pgdat->bdata;
	unsigned long nr_pages = end - start;
	unsigned long map_bytes;

	/* one bit per page: round up to bytes, then to whole longs */
	map_bytes = (nr_pages + 7) / 8;
	map_bytes = (map_bytes + long_mask) & ~long_mask;

	/* link this node in at the head of the global node list */
	pgdat->node_next = pgdat_list;
	pgdat_list = pgdat;

	bdata->node_boot_start = (start << PAGE_SHIFT);
	bdata->node_low_pfn = end;
	bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT);

	/*
	 * Initially all pages are reserved - setup_arch() has to
	 * register free RAM areas explicitly.
	 */
	memset(bdata->node_bootmem_map, 0xff, map_bytes);

	return map_bytes;
}


/*
 * We 'merge' subsequent allocations to save space. We might 'lose'
 * some fraction of a page if allocations cannot be satisfied due to
 * size constraints on boxes where there is physical RAM space
 * fragmentation - in these cases (mostly large memory boxes) this
 * is not a problem.
 *
 * On low memory boxes we get it right in 100% of the cases.
 */

/*
 * Allocate 'size' bytes of zeroed boot memory from one node.
 *
 * bdata: bootmem state of the node to allocate from.
 * size:  number of bytes wanted; must be non-zero (BUG() otherwise).
 * align: required alignment; has to be a power of 2 value.
 * goal:  preferred lowest address; the search starts there when goal
 *        lies inside this node, and falls back to the start of the node
 *        if nothing is found above it.
 *
 * Returns the virtual address of the zeroed area, or NULL when the node
 * has no run of free pages large enough. Callers must check for NULL.
 */
static void * __init __alloc_bootmem_core (bootmem_data_t *bdata, 
	unsigned long size, unsigned long align, unsigned long goal)
{
	unsigned long i, start = 0; /* start: bitmap index of the first page found */
	void *ret;
	unsigned long offset, remaining_size;
	unsigned long areasize, preferred, incr; /* areasize: number of pages */
	/* number of pages described by the bitmap */
	unsigned long eidx = bdata->node_low_pfn - (bdata->node_boot_start >>
							PAGE_SHIFT);

	if (!size) BUG();

	/*
	 * We try to allocate bootmem pages above 'goal'
	 * first, then we try to allocate lower pages.
	 */
	if (goal && (goal >= bdata->node_boot_start) && 
			((goal >> PAGE_SHIFT) < bdata->node_low_pfn)) {
		preferred = goal - bdata->node_boot_start;
	} else
		preferred = 0;

	/* align the starting offset, then convert it to a bitmap index */
	preferred = ((preferred + align - 1) & ~(align - 1)) >> PAGE_SHIFT; 
	areasize = (size+PAGE_SIZE-1)/PAGE_SIZE; /* size in pages, rounded up */
	incr = align >> PAGE_SHIFT ? : 1; /* scan step in pages, at least 1 */

restart_scan:
	for (i = preferred; i < eidx; i += incr) {
		unsigned long j;
		if (test_bit(i, bdata->node_bootmem_map))
			continue;
		for (j = i + 1; j < i + areasize; ++j) {
			if (j >= eidx)
				goto fail_block;
			if (test_bit (j, bdata->node_bootmem_map))
				goto fail_block;
		}
		start = i;
		goto found; /* found the requested run of consecutive free pages */
	fail_block:;
	}
	if (preferred) {
		/* nothing free above 'goal': retry from the start of the node */
		preferred = 0;
		goto restart_scan;
	}
	/*
	 * Both passes failed. Do not fall through to 'found' with start == 0:
	 * that would treat the failure as a hit and could mark bits beyond
	 * the bitmap and clear memory beyond the node.
	 */
	return NULL;
found:
	if (start >= eidx)
		BUG();

	/*
	 * Is the next page of the previous allocation-end the start
	 * of this allocation's buffer? If yes then we can 'merge'
	 * the previous partial page with this allocation.
	 */
	if (align <= PAGE_SIZE
	    && bdata->last_offset && bdata->last_pos+1 == start) {
		offset = (bdata->last_offset+align-1) & ~(align-1);
		if (offset > PAGE_SIZE)
			BUG();
		remaining_size = PAGE_SIZE-offset;
		if (size < remaining_size) {
			/* fits entirely in the tail of the previous page */
			areasize = 0;
			// last_pos unchanged
			bdata->last_offset = offset+size;
			ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset +
						bdata->node_boot_start);
		} else {
			/* use the tail of the previous page plus new pages */
			remaining_size = size - remaining_size;
			areasize = (remaining_size+PAGE_SIZE-1)/PAGE_SIZE;
			ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset +
						bdata->node_boot_start);
			bdata->last_pos = start+areasize-1;
			bdata->last_offset = remaining_size;
		}
		bdata->last_offset &= ~PAGE_MASK;
	} else {
		/* 'start' itself is part of the area, hence the -1 */
		bdata->last_pos = start + areasize - 1;
		/* offset inside page last_pos at which this allocation ends */
		bdata->last_offset = size & ~PAGE_MASK;
		ret = phys_to_virt(start * PAGE_SIZE + bdata->node_boot_start);
	}
	/*
	 * Reserve the area now:
	 */
	for (i = start; i < start+areasize; i++)
		if (test_and_set_bit(i, bdata->node_bootmem_map))
			BUG();
	memset(ret, 0, size);
	return ret;
}


/*
 * 释放未使用的页面和自己使用的页面到buddy系统
 */
static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)


    以上罗列的代码,其中有注释.其余函数.....算了吧,没有注释,用不着罗列了.