mm/bootmem.c
上次对启动过程有个简单总结,下面再次关注一下其中的bootmem部分。
setup.S->asmlinkage void __init start_kernel(void) (init/main.c)
|
+-->setup_arch ---> 处理e820内存报告
--> 关于内存的提示信息
---> 初始化bootmem (init_bootmem)
---> paging_init--+
+-------+
| +--> pagetable_init(含fix map,vmalloc init)
\ / +--> load cr3
. +--> kmap_init
. +--> free_area_init(zone-buddy初始化)
. --->smp,apic,roms等处理
+--> idt gate modules,kmem_cache_init
|
+--> mem_init -->free_all_bootmem buddy 得到页面控制权
+
+--> proc_root_init,fork_init, ipc,inode
+--> smp_init
+
+--> 创建kernel thread, init (init/main.c->函数init)
+--->do_basic_setup
---->init pci,mtrr,sysctl,mca....
---->filesystem_setup
---->mount_root (关注...)
---->......
+---> free_initmem
+---> 打开console
+--->execve("/sbin/init",argv_init,envp_init);
+--->execve("/etc/init",argv_init,envp_init);
+--->execve("/bin/init",argv_init,envp_init);
+--->execve("/bin/sh",argv_init,envp_init);
首先是setup_arch(arch/i386/kernel/setup.c),
/*
 * setup_arch() excerpt (arch/i386/kernel/setup.c): only the part that
 * brings up the boot-time allocator is shown; the "....." lines stand
 * for code elided by the author of these notes.
 *
 * Sequence shown here:
 *   1. init_bootmem() is called for the range [start_pfn, max_low_pfn).
 *   2. Every E820_RAM region below max_low_pfn is handed to free_bootmem().
 *   3. The range starting at HIGH_MEMORY and the first page at address 0
 *      are passed to reserve_bootmem().
 */
void __init setup_arch(char **cmdline_p)
{
.....
/* Page frame number of _end, rounded up by PFN_UP
 * (presumably _end marks the end of the kernel image -- confirm). */
start_pfn = PFN_UP(__pa(&_end));
/* NOTE(review): init_bootmem() is not shown; its return value is used
 * below as a byte count added to a physical address. */
bootmap_size = init_bootmem(start_pfn, max_low_pfn);
/* Walk the e820 map and release every RAM range that lies below max_low_pfn. */
for (i = 0; i < e820.nr_map; i++) { unsigned long curr_pfn, last_pfn, size;
/* Only entries typed E820_RAM are considered. */
if (e820.map[i].type != E820_RAM)
continue;
/* Start is converted with PFN_UP, end with PFN_DOWN, so (going by the
 * macro names) only whole pages inside the region are counted. */
curr_pfn = PFN_UP(e820.map[i].addr);
if (curr_pfn >= max_low_pfn)
continue;
last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
/* Clamp the end of the region to max_low_pfn. */
if (last_pfn > max_low_pfn)
last_pfn = max_low_pfn;
/* Nothing left after rounding/clamping: skip this entry. */
if (last_pfn <= curr_pfn)
continue;
size = last_pfn - curr_pfn;
/* free_bootmem() takes a physical start address and a length in bytes here. */
free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
}
/* Reserve from HIGH_MEMORY up to PFN_PHYS(start_pfn) + bootmap_size
 * (plus PAGE_SIZE-1 of slack); presumably this covers the kernel image
 * and the bootmem bitmap placed at start_pfn -- confirm. */
reserve_bootmem(HIGH_MEMORY, (PFN_PHYS(start_pfn) +
bootmap_size + PAGE_SIZE-1) - (HIGH_MEMORY));
/* Reserve the first PAGE_SIZE bytes starting at physical address 0. */
reserve_bootmem(0, PAGE_SIZE);
..........
}
然后在mem_init调用free_all_bootmem前就可以使用bootmem分配内存了.
当mem_init调用此函数后,buddy系统可以工作了.bootmem的使命即告终结.
bootmem的管理结构如下:
/*
 * Per-node state of the boot-time allocator; reached through the
 * bdata pointer of pg_data_t.
 */
typedef struct bootmem_data {
/* Physical start address of the range covered by the bitmap
 * (init_bootmem_core() stores start << PAGE_SHIFT here). */
unsigned long node_boot_start;
/* End page frame number of the range; the allocator uses
 * node_low_pfn - (node_boot_start >> PAGE_SHIFT) as the bitmap length. */
unsigned long node_low_pfn;
/* Bitmap with one bit per page; a set bit means the page is not
 * available. Initialised to all ones by init_bootmem_core(). */
void *node_bootmem_map;
/* Byte offset inside page last_pos at which the most recent allocation
 * ended; 0 disables merging with the next allocation. */
unsigned long last_offset;
/* Bitmap index of the last page used by the most recent allocation. */
unsigned long last_pos;
} bootmem_data_t;
bootmem_data_t隶属于node:每个node由一个pg_data_t(pgdat)描述,其bdata成员指向该node的bootmem_data_t.
在NUMA系统中,每个内存节点对应一个node,即一个pgdat.
/*
 * Per-node descriptor (pg_data_t / "pgdat"). Only the members whose use
 * is visible in the surrounding excerpts are described; the remaining
 * ones are left undocumented rather than guessed at.
 */
typedef struct pglist_data {
zone_t node_zones[MAX_NR_ZONES];
zonelist_t node_zonelists[NR_GFPINDEX];
/* Array of struct page for this node; indexed by pfn_to_page() on NUMA. */
struct page *node_mem_map;
unsigned long *valid_addr_bitmap;
/* Boot-time allocator state for this node (see bootmem_data_t). */
struct bootmem_data *bdata;
unsigned long node_start_paddr;
unsigned long node_start_mapnr;
unsigned long node_size;
int node_id;
/* Next node in the singly linked list headed by pgdat_list;
 * init_bootmem_core() pushes each node onto the front of that list. */
struct pglist_data *node_next;
} pg_data_t;
NUMA系统中的page结构数组,不再是全局变量mem_map了,而是每个node的
pgdat.node_mem_map.
linux2.4.0中对NUMA的处理比较晦涩,不如直接看2.6的代码. 在2.6中看宏
/*
 * Map a kernel virtual address to its struct page: __pa(kaddr) is shifted
 * right by PAGE_SHIFT to obtain a page frame number, which is then passed
 * to pfn_to_page(). (__pa() is presumably the virtual-to-physical
 * conversion -- confirm against the arch headers.)
 */
#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
在NUMA系统pfn_to_page定义如下:
/*
 * NUMA variant of pfn_to_page(): yields the address of the struct page
 * entry for a page frame number.
 *
 * The argument is evaluated once into __pfn, pfn_to_nid() selects the
 * node, and the entry is taken from that node's node_mem_map at index
 * node_localnr(__pfn, __node). node_localnr() is not shown here;
 * presumably it converts the global pfn into a node-relative index.
 * Uses a GNU statement expression ({ ... }).
 */
#define pfn_to_page(pfn) \
({ \
unsigned long __pfn = pfn; \
int __node = pfn_to_nid(__pfn); \
&node_mem_map(__node)[node_localnr(__pfn,__node)]; \
})
/* struct page array of node nid, fetched through NODE_DATA(nid). */
#define node_mem_map(nid) (NODE_DATA(nid)->node_mem_map)
而pfn_to_nid在mmzone.h定义为
/* Upper bound on page frame numbers covered by the lookup table: 2^24 pages. */
#define MAX_NR_PAGES 16777216
/* Number of entries the pfn range is divided into. */
#define MAX_ELEMENTS 256
/* Pages covered by one table entry: 16777216 / 256 = 65536. */
#define PAGES_PER_ELEMENT (MAX_NR_PAGES/MAX_ELEMENTS)
/* Table indexed by pfn / PAGES_PER_ELEMENT; pfn_to_nid() returns the
 * selected entry as the node id. Defined elsewhere. */
extern s8 physnode_map[];
/*
 * pfn_to_nid - return the node id that owns page frame number @pfn.
 *
 * Without CONFIG_NUMA there is a single node and the result is always 0.
 * With CONFIG_NUMA the id is read from physnode_map[], one entry per
 * PAGES_PER_ELEMENT page frames.
 */
static inline int pfn_to_nid(unsigned long pfn)
{
#ifndef CONFIG_NUMA
	return 0;
#else
	return (int)physnode_map[pfn / PAGES_PER_ELEMENT];
#endif
}
从其中注释,容易看懂pgdat之作用,以及zone的划分方式.
bootmem.c涉及到的函数应该不难,有几个相关的注释,只是罗列于此,以
保分析完整.
/*
 * bootmem_bootmap_pages - number of pages needed to hold a bitmap that
 * tracks @pages page frames with one bit each.
 *
 * Returns the bitmap size in pages, rounded up to a whole page.
 */
unsigned long __init bootmem_bootmap_pages (unsigned long pages)
{
	/* One bit per page, rounded up to whole bytes. */
	unsigned long bitmap_bytes = (pages + 7) / 8;

	/* Round the byte count up to a page boundary, then convert to pages. */
	return ((bitmap_bytes + ~PAGE_MASK) & PAGE_MASK) >> PAGE_SHIFT;
}
/*
 * init_bootmem_core - set up the boot-time allocator bitmap of one node.
 * @pgdat:    node descriptor; its bdata member receives the state
 * @mapstart: page frame number at which the bitmap itself is stored
 * @start:    first page frame number managed by this node
 * @end:      end page frame number, stored as node_low_pfn
 *
 * The node is pushed onto the front of pgdat_list and the whole bitmap
 * is filled with ones, i.e. every page starts out as not available.
 *
 * Returns the bitmap size in bytes, rounded up to a multiple of
 * sizeof(long).
 */
static unsigned long __init init_bootmem_core (pg_data_t *pgdat,
	unsigned long mapstart, unsigned long start, unsigned long end)
{
	const unsigned long word_mask = sizeof(long) - 1UL;
	bootmem_data_t *bd = pgdat->bdata;
	unsigned long bitmap_bytes;

	/* Link this node in at the head of the global node list. */
	pgdat->node_next = pgdat_list;
	pgdat_list = pgdat;

	/* One bit per page: round up to bytes, then to whole longs. */
	bitmap_bytes = ((end - start) + 7) / 8;
	bitmap_bytes = (bitmap_bytes + word_mask) & ~word_mask;

	bd->node_low_pfn = end;
	bd->node_boot_start = (start << PAGE_SHIFT);
	bd->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT);

	/* All bits set: nothing is allocatable until pages are freed into the map. */
	memset(bd->node_bootmem_map, 0xff, bitmap_bytes);

	return bitmap_bytes;
}
/*
 * __alloc_bootmem_core - allocate @size bytes from one node's boot bitmap.
 * @bdata: boot-time allocator state of the node
 * @size:  number of bytes requested; must be non-zero (BUG() otherwise)
 * @align: required alignment in bytes; the mask arithmetic below assumes
 *         a non-zero power of two
 * @goal:  preferred physical address to start searching from, or 0
 *
 * The bitmap is scanned for enough consecutive clear bits, first from the
 * page derived from @goal and, if that fails, again from index 0. When the
 * new block directly follows the page in which the previous allocation
 * ended and @align does not exceed PAGE_SIZE, the unused tail of that page
 * is reused so that small allocations are packed together.
 *
 * The pages taken are marked in the bitmap and the returned memory is
 * zeroed.
 *
 * Returns the virtual address of the allocation, or NULL when the node
 * has no suitable free range. Callers must check for NULL.
 */
static void * __init __alloc_bootmem_core (bootmem_data_t *bdata,
	unsigned long size, unsigned long align, unsigned long goal)
{
	unsigned long i, start = 0;
	void *ret;
	unsigned long offset, remaining_size;
	unsigned long areasize, preferred, incr;
	/* Number of bitmap slots: pages between node_boot_start and node_low_pfn. */
	unsigned long eidx = bdata->node_low_pfn - (bdata->node_boot_start >>
							PAGE_SHIFT);

	if (!size) BUG();

	/* Honour the goal only when it lies inside this node's range. */
	if (goal && (goal >= bdata->node_boot_start) &&
			((goal >> PAGE_SHIFT) < bdata->node_low_pfn)) {
		preferred = goal - bdata->node_boot_start;
	} else
		preferred = 0;

	/* Align the starting byte offset, then convert it to a bitmap index. */
	preferred = ((preferred + align - 1) & ~(align - 1)) >> PAGE_SHIFT;
	/* Pages needed for the request, rounded up. */
	areasize = (size+PAGE_SIZE-1)/PAGE_SIZE;
	/* Scan step in pages; at least 1 when align is below PAGE_SIZE. */
	incr = align >> PAGE_SHIFT ? : 1;

restart_scan:
	for (i = preferred; i < eidx; i += incr) {
		unsigned long j;

		if (test_bit(i, bdata->node_bootmem_map))
			continue;
		/* Page i is free: check that the following areasize-1 pages are too. */
		for (j = i + 1; j < i + areasize; ++j) {
			if (j >= eidx)
				goto fail_block;
			if (test_bit (j, bdata->node_bootmem_map))
				goto fail_block;
		}
		start = i;
		goto found;
	fail_block:;
	}
	if (preferred) {
		/* Nothing at or above the goal: retry from the beginning. */
		preferred = 0;
		goto restart_scan;
	}
	/*
	 * Both scans failed. Without this return control would fall into
	 * "found" with start == 0 and treat page 0 as the allocation.
	 */
	return NULL;
found:
	if (start >= eidx)
		BUG();

	/*
	 * Merge with the previous allocation when it ended part-way through
	 * the page immediately before 'start'.
	 */
	if (align <= PAGE_SIZE
	    && bdata->last_offset && bdata->last_pos+1 == start) {
		/* First suitably aligned byte after the previous allocation. */
		offset = (bdata->last_offset+align-1) & ~(align-1);
		if (offset > PAGE_SIZE)
			BUG();
		/* Bytes still unused in page last_pos. */
		remaining_size = PAGE_SIZE-offset;
		if (size < remaining_size) {
			/* Fits entirely in the old page: no new page is consumed. */
			areasize = 0;
			bdata->last_offset = offset+size;
			ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset +
						bdata->node_boot_start);
		} else {
			/* Spills over: count only the bytes that land in new pages. */
			remaining_size = size - remaining_size;
			areasize = (remaining_size+PAGE_SIZE-1)/PAGE_SIZE;
			ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset +
						bdata->node_boot_start);
			bdata->last_pos = start+areasize-1;
			bdata->last_offset = remaining_size;
		}
		/* Keep only the offset within the page. */
		bdata->last_offset &= ~PAGE_MASK;
	} else {
		bdata->last_pos = start + areasize - 1;
		bdata->last_offset = size & ~PAGE_MASK;
		ret = phys_to_virt(start * PAGE_SIZE + bdata->node_boot_start);
	}

	/* Mark the newly consumed pages; a bit that is already set is a bug. */
	for (i = start; i < start+areasize; i++)
		if (test_and_set_bit(i, bdata->node_bootmem_map))
			BUG();
	memset(ret, 0, size);
	return ret;
}
static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
以上罗列的是其中带有注释的代码.其余函数.....算了吧,没有注释,也就用不着列出了.