mm/slab.c
内存管理中,大家讨论最多的一块.我们将分几部分来阐述.
第一部分 概述
slab从zone-buddy系统分配页面,然后拆分成小块内存共内核使用,相当于程序使用
的malloc/fee.
内核使用的每一个对象(inode,dentry, sock........)都有一个专门的cache,就是
这里的slab机制,并且如果对内存的种类有要求,需要另外创建单独的cache,比如DMA,
HIGH mem.
slab最主要的作用就是小块内存分配,另外对SMP,cache进行了优化.内核通过buddy
控制大粒度的内存碎片,通过slab控制小块的内存碎片----slab 一次分配一个或者几个
物理地址连续的内存页.
关于slab对cache的优化这里采用一示意图副图:
1)假设只有两个cache line,0,1.
2)每个page 两个cache line大小
3)第一个slab color为0,从page的起始位置开始存放slab_t
4)第二个slab color 为1,从page 偏移cache size开始存放slab_t
+------------------+ +------------------+
| cache line 0 | | cache line 1 |
/------------------+ +------------------+
| /
| |
/---<-+ +-->-----------------------------------/
| | | |
| \ | |
| +--------------------+--------------------+-----------------+--------------+
| | cache line 0 | cache line 1 | cache line 0 | cache line 1 |
| +--------------------+--------------------+-----------------+--------------+
| page0 |page1
| slab_t color=0 | slab_t clolor=1
| |
| |
| |
| |
| |
\-----------------------------------------------\
如果遍历cache的所有slab,访问slab1,然后访问slab2,然后又访问slab1.这个期间
cache line0,cache line 1不用失效即可完成访问.
如果两个slab的color相同,每次访问都会使cache line 失效从而降低效率.
在这里不讨论关于cache本身的详细问题,此图仅为参考.(fix me)
第二部分 数据结构和示意图
还是先给出一张图,然后详解相关的数据结构.这是一个非off slab(slab_t不在slab
之中)的示意图.图中将关键的数据清楚的表明了其用图.
1)kmem_cache_s
slabs是所有属于此cache的slab(连续的几个页面)的一个链表.
firtnotfull:这些slab按照 full, part used, empty的顺序排列,这个指针指向第
一个部分使用的slab.
objsize: cache 管理的对象之字节大小.
flags: like off_sab bit
color: color的最大值,把slab的剩余空间按照coloroff的粒度分成'clolor'
coloroff: 颜色粒度,一般和cache line大小相关(n倍)
2)slab_t : slab管理所使用的结构
list: 挂入kmem_cache_s的slabs链表
colouroff:从slab起始页面到第一个obj(s_mem)的偏移.cache align(color,slab_t,bufctl)
inuse: 已分配objs数量
free: bufctrl是一个数组构成的链表,从索引free开始,是其空闲objs组成的链表.
kmem_cache_s
+-------------+
/-|slabs |
| |*firstnotfull->>-----------------------/
| |objsize | |
| |flags; | |
| |num; | |
| | | |
| |color(max) |=left/cache.coloroff |
| |coloroff |L1 cache size |
| |
| |
| |
| |
| |
| slab_t struct slab_s |
| |
/--|-----+----------+ page --/--------+----------+page
| \---->|list --------------/ | | |
| |colouroff;| | | | color |=cache.colornext*cache.coloroff
| |*s_mem;---------/ | | | |
| |inuse; | | | | +----------+ slab_t
| |free -------/ | \-----+------->|list |
| +----------+ | | | |colouroff;|
\ | bufctl | | | | |*s_mem;---------/
colouroff |----------| | | | |inuse; | |
/ /-->... | | | |free -------/ |
| || | | | | +----------+ | |
| \---next <--/ | \ | bufctl | | |
| | | | colouroff |----------| | |
| +----------+ | / /-->... | |
| ... | | || | | |
| cache align | | \---next <--/ |
-\--------+----------+<----/ | | | |
| obj | | +----------+ |
+----------| | ... /
| | | cache align |
| | ---/--------+----------+<----/
| | | obj |
....
| | +----------+
| | | |
| | | |
| | ....
| | | |
| | | |
| | | |
| | | |
+----------+ | |
| |
| |
| |
+----------+
需要说明的是,一旦page从buddy分配到slab, 其page 描述符中的prev指向所属slab, next
指向所属的cache.(见kmem_cache_grow).这样,对于一个obj, rounddowntopage(objp)->next
就是其cache,rounddowntopage(objp)->prev就是其slab.
然后列出其定义:
struct kmem_cache_s {
struct list_head slabs; struct list_head *firstnotfull;
unsigned int objsize;
unsigned int flags;
unsigned int num;
spinlock_t spinlock;
#ifdef CONFIG_SMP
unsigned int batchcount;
#endif
unsigned int gfporder;
unsigned int gfpflags;
size_t colour;
unsigned int colour_off;
unsigned int colour_next;
kmem_cache_t *slabp_cache;
unsigned int growing;
unsigned int dflags;
char name[CACHE_NAMELEN];
struct list_head next;
#ifdef CONFIG_SMP
cpucache_t *cpudata[NR_CPUS];
#endif
#if STATS //忽略
#endif
};
typedef struct slab_s {
struct list_head list; unsigned long colouroff; void *s_mem;
unsigned int inuse;
kmem_bufctl_t free;
} slab_t;
另外一个对slab工作起到重要作用的是cache_cache:
static kmem_cache_t cache_cache = {
slabs: LIST_HEAD_INIT(cache_cache.slabs),
firstnotfull: &cache_cache.slabs,
objsize: sizeof(kmem_cache_t),
flags: SLAB_NO_REAP,
spinlock: SPIN_LOCK_UNLOCKED,
colour_off: L1_CACHE_BYTES,
name: "kmem_cache",
};
这是一个手工简立的cache, 其管理的对象是kmem_cache_t.用于分配kmem_cache_t,所以
是cache 的cache.
再内核初始化slab的时候,首先初始化cache_cache:kmem_cache_init然后初始化通用
cache: kmem_cache_sizes_init.这里有个问题要提一下:
kmem_cache_sizes_init->kmem_cache_create(name, sizes->cs_size,0,..):
kmem_cache_t * kmem_cache_create (...)
{
const char *func_nm = KERN_ERR "kmem_create: ";
size_t left_over, align, slab_size;
kmem_cache_t *cachep = NULL;
....
cachep = (kmem_cache_t *) kmem_cache_alloc(&cache_cache, SLAB_KERNEL);
....
if (size >= (PAGE_SIZE>>3)) flags |= CFLGS_OFF_SLAB;
..........
if (flags & CFLGS_OFF_SLAB)
cachep->slabp_cache = kmem_find_general_cachep(slab_size,0); ........
}
这里初始化通用cache的时候,如果是offslab,需要在通用cache中分配slab_t,问题是选中的通
用cache初始化好了吗?
是这样的:先初始化32-512字节的通用cache,使用inslab的slab_t没有问题,等初始化>512字节
的cachesize时,只要slab_size=cachealign(sizeof(slab_t)+num*sizeof(bufctrl))小于512即可.
static unsigned long offslab_limit;
这个限制也是在这里计算的,每初始化一个非offslab的通用cache,就修改此值:
kmem_cache_sizes_init:
if (!(OFF_SLAB(sizes->cs_cachep))) {
offslab_limit = sizes->cs_size-sizeof(slab_t);
offslab_limit /= 2;
}
这里最大的inslab通用cache是512:offslab_limit=(512-24)/2=244.有了这个限制,这个问题就
解决了.
static cache_sizes_t cache_sizes[] = {
#if PAGE_SIZE == 4096
{ 32, NULL, NULL},
#endif
{ 64, NULL, NULL},
{ 128, NULL, NULL},
{ 256, NULL, NULL},
{ 512, NULL, NULL},
{ 1024, NULL, NULL}, { 2048, NULL, NULL},
{ 4096, NULL, NULL},
{ 8192, NULL, NULL},
{ 16384, NULL, NULL},
{ 32768, NULL, NULL},
{ 65536, NULL, NULL},
{131072, NULL, NULL},
{ 0, NULL, NULL}
};
第三部分 核心算法
内存分配相关算法
这里讨论slab密切相关的几个函数,对于理解slab的关键操作很有好处.第一个我们讨论
kmem_cache_estimate,通过这个函数可以看到slab_t,bufctrl,obj的相对位置,关系.
此函数根据gfporder计算此slab可以包含的obj个数,并返回剩余的可以做color的大小.
static void kmem_cache_estimate (unsigned long gfporder, size_t size,
int flags, size_t *left_over, unsigned int *num)
{
int i;
size_t wastage = PAGE_SIZE<<gfporder;
size_t extra = 0;
size_t base = 0;
if (!(flags & CFLGS_OFF_SLAB)) {
base = sizeof(slab_t);
extra = sizeof(kmem_bufctl_t);
}
i = 0;
while (i*size + L1_CACHE_ALIGN(base+i*extra) <= wastage)
i++;
if (i > 0)
i--;
if (i > SLAB_LIMIT)
i = SLAB_LIMIT;
*num = i;
wastage -= i*size;
wastage -= L1_CACHE_ALIGN(base+i*extra);
*left_over = wastage;
}
然后是为slab分配slab_t的算法,根据off/in slab 的slab_t,分成两种情况.
static inline slab_t * kmem_cache_slabmgmt (kmem_cache_t *cachep,
void *objp, int colour_off, int local_flags)
{
slab_t *slabp;
if (OFF_SLAB(cachep)) {
slabp = kmem_cache_alloc(cachep->slabp_cache, local_flags);
if (!slabp)
return NULL;
} else {
slabp = objp+colour_off;
colour_off += L1_CACHE_ALIGN(cachep->num *
sizeof(kmem_bufctl_t) + sizeof(slab_t));
}
slabp->inuse = 0;
slabp->colouroff = colour_off;
slabp->s_mem = objp+colour_off;
return slabp;
}
下一个是static inline void kmem_cache_init_objs (kmem_cache_t * cachep,
slab_t * slabp, unsigned long ctor_flags)详细代码略.(罗列代码于事无补^_^).
其作用是:初始化一个cache 中指定slab中的所有对象,并构建基于bufctl数组的free链表.
然后是从slab中分配obj的核心部分:
static inline void * kmem_cache_alloc_one_tail (kmem_cache_t *cachep,
slab_t *slabp)
{
void *objp;
STATS_INC_ALLOCED(cachep);
STATS_INC_ACTIVE(cachep);
STATS_SET_HIGH(cachep);
slabp->inuse++;
objp = slabp->s_mem + slabp->free*cachep->objsize;
slabp->free=slab_bufctl(slabp)[slabp->free];
if (slabp->free == BUFCTL_END)
cachep->firstnotfull = slabp->list.next;
#if DEBUG
...
#endif
return objp;
}
有关分配的关键函数,最后一个是kmem_cache_grow,分配新的slab给slab cache.(已经做了
很重的注释,不需要更多了吧 ?)(在分析很多函数的时候,都没有把其同步/互斥操作作为重点
以后再做此项工作吧.)
static int kmem_cache_grow (kmem_cache_t * cachep, int flags)
{
slab_t *slabp;
struct page *page;
void *objp;
size_t offset;
unsigned int i, local_flags;
unsigned long ctor_flags;
unsigned long save_flags;
if (flags & ~(SLAB_DMA|SLAB_LEVEL_MASK|SLAB_NO_GROW))
BUG();
if (flags & SLAB_NO_GROW)
return 0;
if (in_interrupt() && (flags & SLAB_LEVEL_MASK) != SLAB_ATOMIC)
BUG();
ctor_flags = SLAB_CTOR_CONSTRUCTOR;
local_flags = (flags & SLAB_LEVEL_MASK);
if (local_flags == SLAB_ATOMIC)
ctor_flags |= SLAB_CTOR_ATOMIC;
spin_lock_irqsave(&cachep->spinlock, save_flags);
offset = cachep->colour_next;
cachep->colour_next++;
if (cachep->colour_next >= cachep->colour)
cachep->colour_next = 0;
offset *= cachep->colour_off;
cachep->dflags |= DFLGS_GROWN;
cachep->growing++;
spin_unlock_irqrestore(&cachep->spinlock, save_flags);
if (!(objp = kmem_getpages(cachep, flags)))
goto failed;
if (!(slabp = kmem_cache_slabmgmt(cachep, objp, offset, local_flags)))
goto opps1;
i = 1 << cachep->gfporder;
page = virt_to_page(objp);
do {
SET_PAGE_CACHE(page, cachep);
SET_PAGE_SLAB(page, slabp);
PageSetSlab(page);
page++;
} while (--i);
kmem_cache_init_objs(cachep, slabp, ctor_flags);
spin_lock_irqsave(&cachep->spinlock, save_flags);
cachep->growing--;
list_add_tail(&slabp->list,&cachep->slabs);
if (cachep->firstnotfull == &cachep->slabs)
cachep->firstnotfull = &slabp->list;
STATS_INC_GROWN(cachep);
cachep->failures = 0;
spin_unlock_irqrestore(&cachep->spinlock, save_flags);
return 1;
opps1:
kmem_freepages(cachep, objp);
failed:
spin_lock_irqsave(&cachep->spinlock, save_flags);
cachep->growing--;
spin_unlock_irqrestore(&cachep->spinlock, save_flags);
return 0;
}
内存释放的相关算法
kmem_cache_free_one: 根据objp 找到其所属的slab( 所属的cache已经找到,即cachep)把
这个obj挂入空闲对象链表根据slab 的状态调整他在cache->slab列表的位置.
static inline void kmem_cache_free_one(kmem_cache_t *cachep, void *objp)
{
slab_t* slabp;
CHECK_PAGE(virt_to_page(objp));
slabp = GET_PAGE_SLAB(virt_to_page(objp));
#if DEBUG
....
#endif
{
unsigned int objnr = (objp-slabp->s_mem)/cachep->objsize;
slab_bufctl(slabp)[objnr] = slabp->free;
slabp->free = objnr;
}
STATS_DEC_ACTIVE(cachep);
if (slabp->inuse-- == cachep->num)
goto moveslab_partial;
if (!slabp->inuse)
goto moveslab_free;
return;
moveslab_partial:
{
....
}
moveslab_free:
{
....
}
}
kmem_slab_destroy: 使用cachep->dtor释放此slab中所有obj.释放此slab使用的页面,还
有slab_t. 代码略.
第四部分 提供的主要接口
对外提供的接口分成几个部分:初始化,cache的创建删除,obj的分配释放,proc系统支持.
I) slab初始化部分
最早调用的函数: void __init kmem_cache_init(void)设置cache_cache(cache 的全局
管理机构).
void __init kmem_cache_init(void)
{
size_t left_over;
init_MUTEX(&cache_chain_sem);
INIT_LIST_HEAD(&cache_chain);
kmem_cache_estimate(0, cache_cache.objsize, 0,
&left_over, &cache_cache.num);
if (!cache_cache.num)
BUG();
cache_cache.colour = left_over/cache_cache.colour_off;
cache_cache.colour_next = 0;
}
然后初始化通用cache(cache_size): void __init kmem_cache_sizes_init(void)不再
罗列其代码,有关cache size初始化的特殊问题已经在第二部分说明.这里不再做其他分析
了.
最后一个处世话是__initcall(kmem_cpucache_init),再init进程中调用.主要工作是由
static void enable_all_cpucaches (void){
struct list_head* p;
down(&cache_chain_sem);
p = &cache_cache.next;
do { kmem_cache_t* cachep = list_entry(p, kmem_cache_t, next);
enable_cpucache(cachep);
p = cachep->next.next;
} while (p != &cache_cache.next);
up(&cache_chain_sem);
}
II)cache的创建和销毁
kmem_cache_t *kmem_cache_create (const char *name, size_t size, size_t offset,
unsigned long flags, void (*ctor)(void*, kmem_cache_t *, unsigned long),
void (*dtor)(void*, kmem_cache_t *, unsigned long))
int kmem_cache_destroy (kmem_cache_t * cachep)
int kmem_cache_shrink(kmem_cache_t *cachep)
void kmem_cache_reap (int gfp_mask)
kmem_cache_t *
kmem_cache_create (const char *name, size_t size, size_t offset,
unsigned long flags, void (*ctor)(void*, kmem_cache_t *, unsigned long),
void (*dtor)(void*, kmem_cache_t *, unsigned long))
{
const char *func_nm = KERN_ERR "kmem_create: ";
size_t left_over, align, slab_size;
kmem_cache_t *cachep = NULL;
...#if DEBUG
..........#endif
if (flags & ~CREATE_MASK) BUG();
cachep = (kmem_cache_t *) kmem_cache_alloc(&cache_cache, SLAB_KERNEL);
if (!cachep)
goto opps;
memset(cachep, 0, sizeof(kmem_cache_t));
if (size & (BYTES_PER_WORD-1)) { size += (BYTES_PER_WORD-1);
size &= ~(BYTES_PER_WORD-1);
printk("%sForcing size word alignment - %s\n", func_nm, name);
}
#if DEBUG
....#endif
align = BYTES_PER_WORD;
if (flags & SLAB_HWCACHE_ALIGN)
align = L1_CACHE_BYTES;
if (size >= (PAGE_SIZE>>3))
flags |= CFLGS_OFF_SLAB;
if (flags & SLAB_HWCACHE_ALIGN) {
while (size < align/2)
align /= 2;
size = (size+align-1)&(~(align-1));
}
do {
unsigned int break_flag = 0;
cal_wastage:
kmem_cache_estimate(cachep->gfporder, size, flags,
&left_over, &cachep->num);
if (break_flag)
break;
if (cachep->gfporder >= MAX_GFP_ORDER) break;
if (!cachep->num) goto next;
if (flags & CFLGS_OFF_SLAB && cachep->num > offslab_limit) {
cachep->gfporder--;
break_flag++;
goto cal_wastage;
}
if (cachep->gfporder >= slab_break_gfp_order) break;
if ((left_over*8) <= (PAGE_SIZE<<cachep->gfporder)) break;
next