mm/slab.c

  内存管理中,大家讨论最多的一块.我们将分几部分来阐述.
  
  
                            第一部分    概述 
                            
   slab从zone-buddy系统分配页面,然后拆分成小块内存共内核使用,相当于程序使用
的malloc/fee.                
   内核使用的每一个对象(inode,dentry, sock........)都有一个专门的cache,就是
这里的slab机制,并且如果对内存的种类有要求,需要另外创建单独的cache,比如DMA,
HIGH mem.
   slab最主要的作用就是小块内存分配,另外对SMP,cache进行了优化.内核通过buddy
控制大粒度的内存碎片,通过slab控制小块的内存碎片----slab 一次分配一个或者几个
物理地址连续的内存页.
   
   关于slab对cache的优化这里采用一示意图副图:
   
      1)假设只有两个cache line,0,1.
      2)每个page 两个cache line大小                                                                                    
      3)第一个slab color为0,从page的起始位置开始存放slab_t
      4)第二个slab color 为1,从page 偏移cache size开始存放slab_t                                                                              
      +------------------+ +------------------+                                     
      |  cache line 0    | | cache line 1     |                                     
      /------------------+ +------------------+                                     
      |                    /                                                        
      |                    |                                                        
/---<-+                    +-->-----------------------------------/                 
|     |                    |                                      |                 
|     \                    |                                      |                 
|     +--------------------+--------------------+-----------------+--------------+  
|     |   cache line 0     | cache line 1       | cache line 0    | cache line 1 |  
|     +--------------------+--------------------+-----------------+--------------+  
|     page0                                     |page1                              
|     slab_t color=0                            |                 slab_t clolor=1   
|                                               |                                   
|                                               |                                   
|                                               |                                   
|                                               |                                   
|                                               |                                   
\-----------------------------------------------\                                   
                                                                                    
  如果遍历cache的所有slab,访问slab1,然后访问slab2,然后又访问slab1.这个期间
cache line0,cache line 1不用失效即可完成访问.
  如果两个slab的color相同,每次访问都会使cache line 失效从而降低效率.
   
   在这里不讨论关于cache本身的详细问题,此图仅为参考.(fix me)
   

  
                             第二部分  数据结构和示意图
   
   还是先给出一张图,然后详解相关的数据结构.这是一个非off slab(slab_t不在slab
之中)的示意图.图中将关键的数据清楚的表明了其用图.
   1)kmem_cache_s 
     slabs是所有属于此cache的slab(连续的几个页面)的一个链表.
     firtnotfull:这些slab按照 full, part used, empty的顺序排列,这个指针指向第
          一个部分使用的slab.
     objsize:   cache 管理的对象之字节大小.
     flags:     like off_sab bit
     color:     color的最大值,把slab的剩余空间按照coloroff的粒度分成'clolor'
     coloroff:  颜色粒度,一般和cache line大小相关(n倍)                                                                                               
     
   2)slab_t : slab管理所使用的结构
     list: 挂入kmem_cache_s的slabs链表
     colouroff:从slab起始页面到第一个obj(s_mem)的偏移.cache align(color,slab_t,bufctl)
     inuse: 已分配objs数量
     free:  bufctrl是一个数组构成的链表,从索引free开始,是其空闲objs组成的链表.
     
     kmem_cache_s                                                                            
         +-------------+                                                                      
       /-|slabs        |                                                                      
       | |*firstnotfull->>-----------------------/                                            
       | |objsize      |                         |                                            
       | |flags;       |                         |                                            
       | |num;         |                         |                                            
       | |             |                         |                                            
       | |color(max)   |=left/cache.coloroff     |                                            
       | |coloroff     |L1 cache size            |                                            
       |                                         |                                            
       |                                         |                                            
       |                                         |                                            
       |                                         |                                            
       |                                         |                                            
       |     slab_t  struct slab_s               |                                            
       |                                         |                                            
    /--|-----+----------+ page        --/--------+----------+page                             
    |  \---->|list  --------------/     |        |          |                                 
    |        |colouroff;|         |     |        | color    |=cache.colornext*cache.coloroff  
    |        |*s_mem;---------/   |     |        |          |                                 
    |        |inuse;    |     |   |     |        +----------+ slab_t                          
    |        |free -------/   |   \-----+------->|list      |                                 
    |        +----------+ |   |         |        |colouroff;|                                 
    \        | bufctl   | |   |         |        |*s_mem;---------/                           
colouroff    |----------| |   |         |        |inuse;    |     |                           
    /       /-->...       |   |         |        |free -------/   |                           
    |       ||          | |   |         |        +----------+ |   |                           
    |       \---next   <--/   |         \        | bufctl   | |   |                           
    |        |          |     |     colouroff    |----------| |   |                           
    |        +----------+     |         /       /-->...       |   |                           
    |            ...          |         |       ||          | |   |                           
    |         cache align     |         |       \---next   <--/   |                           
   -\--------+----------+<----/         |        |          |     |                           
             |   obj    |               |        +----------+     |                           
             +----------|               |            ...          /                           
             |          |               |         cache align     |                           
             |          |            ---/--------+----------+<----/                           
             |          |                        |  obj     |                                 
                 ....                                                                         
             |          |                        +----------+                                 
             |          |                        |          |                                 
             |          |                        |          |                                 
             |          |                            ....                                     
             |          |                        |          |                                 
             |          |                        |          |                                 
             |          |                        |          |                                 
             |          |                        |          |                                 
             +----------+                        |          |                                 
                                                 |          |                                 
                                                 |          |                                 
                                                 |          |                                 
                                                 +----------+                                 
 
   需要说明的是,一旦page从buddy分配到slab, 其page 描述符中的prev指向所属slab, next
指向所属的cache.(见kmem_cache_grow).这样,对于一个obj, rounddowntopage(objp)->next
就是其cache,rounddowntopage(objp)->prev就是其slab.

                      
   然后列出其定义:
struct kmem_cache_s {
/* 1) each alloc & free */
/* full, partial first, then free */
	struct list_head	slabs; //cache 管理的slabs 之表头
	struct list_head	*firstnotfull; 
	unsigned int		objsize;
	unsigned int	 	flags;	/* constant flags */
	unsigned int		num;	/* # of objs per slab */
	spinlock_t		spinlock;
#ifdef CONFIG_SMP
	unsigned int		batchcount; /*SMP的时候每个cpu一个快速分配链表,一次从slab分配的个数*/
#endif

/* 2) slab additions /removals */
	/* order of pgs per slab (2^n) */
	unsigned int		gfporder;

	/* force GFP flags, e.g. GFP_DMA */
	unsigned int		gfpflags;

	size_t			colour;		/* cache colouring range */
	unsigned int		colour_off;	/* colour 粒度, colour*color_off 是slab 的color_off */
	unsigned int		colour_next;	/* cache colouring */
	kmem_cache_t		*slabp_cache; /*当这个cache中的slab,其管理部分
	                                    (slab描述符和kmem_bufctl_t数组)放在
	                                    slab外面时,这个指针指向放置
	                                    的通用cache*/
	unsigned int		growing;
	unsigned int		dflags;		/* dynamic flags */

	/* constructor func de-constructor func 忽略*/
	
/* 3) cache creation/removal */
	char			name[CACHE_NAMELEN];
	struct list_head	next; /* 用它和其它的cache串成一个链,这个链上按照时钟算法
	                           定期地回收某个cache的部分slab*/
#ifdef CONFIG_SMP
/* 4) per-cpu data */
	cpucache_t		*cpudata[NR_CPUS];
#endif
#if STATS  //忽略
#endif
};

/*
 * slab_t
 *
 * 管理一个slab 中的对象. 放在一个slab 开始的地方
 * , 或从general cache 中分配.
 * Slabs 链成一个有序的list: 满员, 部分使用, 空slab.
 */
typedef struct slab_s {
	struct list_head	list; //链成list, 表头kmem_cache_s::slabs
	unsigned long		colouroff; //s_mem = SlabBase(buddypage)+colouroff
	void			*s_mem;		/*  第一个对象所处位置*/
	unsigned int		inuse;	/*  slab 中正在使用中的对象个数*/
	kmem_bufctl_t		free;   /*空闲对象表头*/
} slab_t;


 另外一个对slab工作起到重要作用的是cache_cache:
/* internal cache of cache description objs */
static kmem_cache_t cache_cache = {
	slabs:		LIST_HEAD_INIT(cache_cache.slabs),
	firstnotfull:	&cache_cache.slabs,
	objsize:	sizeof(kmem_cache_t),
	flags:		SLAB_NO_REAP,
	spinlock:	SPIN_LOCK_UNLOCKED,
	colour_off:	L1_CACHE_BYTES,
	name:		"kmem_cache",
};

  这是一个手工简立的cache, 其管理的对象是kmem_cache_t.用于分配kmem_cache_t,所以
是cache 的cache.

  再内核初始化slab的时候,首先初始化cache_cache:kmem_cache_init然后初始化通用
cache: kmem_cache_sizes_init.这里有个问题要提一下:

kmem_cache_sizes_init->kmem_cache_create(name, sizes->cs_size,0,..):

kmem_cache_t * kmem_cache_create (...)
{
	const char *func_nm = KERN_ERR "kmem_create: ";
	size_t left_over, align, slab_size;
	kmem_cache_t *cachep = NULL;

	/* Sanity checks... debug */
	 ....
	
	/* 分配cache 的描述对象. 没有问题,cache_cache已经初始化 */
	cachep = (kmem_cache_t *) kmem_cache_alloc(&cache_cache, SLAB_KERNEL);
	....
	/* 确定以那种方式管理对象 ( 'on' or 'off' slab.) */
	if (size >= (PAGE_SIZE>>3))//大于512字节就要off了(一般,4k页面)
			flags |= CFLGS_OFF_SLAB;
	
  ..........
	if (flags & CFLGS_OFF_SLAB)
		cachep->slabp_cache = kmem_find_general_cachep(slab_size,0);//再通用cache中分配
	........
}

  这里初始化通用cache的时候,如果是offslab,需要在通用cache中分配slab_t,问题是选中的通
用cache初始化好了吗?
	是这样的:先初始化32-512字节的通用cache,使用inslab的slab_t没有问题,等初始化>512字节
的cachesize时,只要slab_size=cachealign(sizeof(slab_t)+num*sizeof(bufctrl))小于512即可.
static unsigned long offslab_limit;/*  采用off-slab 管理方式的slab所含对象的上限.*/
这个限制也是在这里计算的,每初始化一个非offslab的通用cache,就修改此值:
kmem_cache_sizes_init:
if (!(OFF_SLAB(sizes->cs_cachep))) {
			offslab_limit = sizes->cs_size-sizeof(slab_t);
			offslab_limit /= 2;
		}
这里最大的inslab通用cache是512:offslab_limit=(512-24)/2=244.有了这个限制,这个问题就
解决了.

static cache_sizes_t cache_sizes[] = {
#if PAGE_SIZE == 4096
	{    32,	NULL, NULL},
#endif
	{    64,	NULL, NULL},
	{   128,	NULL, NULL},
	{   256,	NULL, NULL},
	{   512,	NULL, NULL},
	{  1024,	NULL, NULL},//start offs lab
	{  2048,	NULL, NULL},
	{  4096,	NULL, NULL},
	{  8192,	NULL, NULL},
	{ 16384,	NULL, NULL},
	{ 32768,	NULL, NULL},
	{ 65536,	NULL, NULL},
	{131072,	NULL, NULL},
	{     0,	NULL, NULL}
};




                            第三部分  核心算法

                            内存分配相关算法
                                                        
  这里讨论slab密切相关的几个函数,对于理解slab的关键操作很有好处.第一个我们讨论
kmem_cache_estimate,通过这个函数可以看到slab_t,bufctrl,obj的相对位置,关系.
  此函数根据gfporder计算此slab可以包含的obj个数,并返回剩余的可以做color的大小.                   
                      
/* 对一个给定的slab 计算可以包含的对象个数num, 
  * bytes left over(除了对象,管理机构后剩余的字节).
  * gfporder: slab size 2^gfporder*PAGE_SIZE
  * size: 对象大小, flags: may be CFLGS_OFF_SLAB
  */
static void kmem_cache_estimate (unsigned long gfporder, size_t size,
		 int flags, size_t *left_over, unsigned int *num)
{
	int i;
	size_t wastage = PAGE_SIZE<<gfporder; /*全部空间*/
	size_t extra = 0; /*bufctl占用的空间*/
	size_t base = 0; /*slab_t大小*/

  /*如果是off slab_t,只考虑有多少个obj即可*/
	
	if (!(flags & CFLGS_OFF_SLAB)) {
		base = sizeof(slab_t);
		extra = sizeof(kmem_bufctl_t);
	}
	i = 0;

  /*逐步加大对象个数,只要能盛的下即可*/
	while (i*size + L1_CACHE_ALIGN(base+i*extra) <= wastage)
		i++;
	         /*
	          * 计算说明: base+i*extra 应该是第一个对象的开始位置。
	          * L1_CACHE_ALIGN 作用其上,得到对齐后的地址。
	          */
	
	if (i > 0)
		i--;/*while已经算到盛不下了,要减1*/

	if (i > SLAB_LIMIT)/*不要有太多对象*/
		i = SLAB_LIMIT; 

	*num = i;
	wastage -= i*size; /*总空间减去对象大小*/
	wastage -= L1_CACHE_ALIGN(base+i*extra);/*减去管理结构大小*/
	*left_over = wastage; /*剩下的用作color*/
}

  然后是为slab分配slab_t的算法,根据off/in slab 的slab_t,分成两种情况.

/*
  *为一个slab 的slab_t 分配内存
  *非offlab的情况直接使用slab 的顶端
  */
static inline slab_t * kmem_cache_slabmgmt (kmem_cache_t *cachep,
			void *objp, int colour_off, int local_flags)
{
	slab_t *slabp;
	
	if (OFF_SLAB(cachep)) {/*offslab的话从指定的通用cache分配slab_t*/
		/* Slab management obj is off-slab. */
		slabp = kmem_cache_alloc(cachep->slabp_cache, local_flags);
		if (!slabp)
			return NULL;
	} else {/*in slab 的slab_t*/
		/* FIXME: change to
			slabp = objp
		 * if you enable OPTIMIZE
		 */
		slabp = objp+colour_off; /*看图,color在slab的顶端*/
		colour_off += L1_CACHE_ALIGN(cachep->num *
				sizeof(kmem_bufctl_t) + sizeof(slab_t));/*看图,colour_off和s_mem的关系*/
	}
	slabp->inuse = 0;
	slabp->colouroff = colour_off;
	slabp->s_mem = objp+colour_off;

	return slabp;
}


下一个是static inline void kmem_cache_init_objs (kmem_cache_t * cachep,
			slab_t * slabp, unsigned long ctor_flags)详细代码略.(罗列代码于事无补^_^).
其作用是:初始化一个cache 中指定slab中的所有对象,并构建基于bufctl数组的free链表.


   然后是从slab中分配obj的核心部分:

/*
 * 从slab 分配一个object
 */
static inline void * kmem_cache_alloc_one_tail (kmem_cache_t *cachep,
							 slab_t *slabp)
{
	void *objp;

	STATS_INC_ALLOCED(cachep);
	STATS_INC_ACTIVE(cachep);
	STATS_SET_HIGH(cachep);

	/* get obj pointer */
	slabp->inuse++;
	objp = slabp->s_mem + slabp->free*cachep->objsize; /*对象指针是这样算的!*/
	slabp->free=slab_bufctl(slabp)[slabp->free];/*next=obj->next*/

	if (slabp->free == BUFCTL_END) /*这个slab用完了,firstnotfull=next slab*/
		/* slab now full: move to next slab for next alloc */
		cachep->firstnotfull = slabp->list.next;
#if DEBUG
	  ...
#endif
	return objp;
}

  有关分配的关键函数,最后一个是kmem_cache_grow,分配新的slab给slab cache.(已经做了
很重的注释,不需要更多了吧 ?)(在分析很多函数的时候,都没有把其同步/互斥操作作为重点
以后再做此项工作吧.)
/*
 * 为指定的cache 分配一个slab.  
 */
static int kmem_cache_grow (kmem_cache_t * cachep, int flags)
{
	slab_t	*slabp;
	struct page	*page;
	void		*objp;
	size_t		 offset;
	unsigned int	 i, local_flags;
	unsigned long	 ctor_flags;
	unsigned long	 save_flags;

	/* Be lazy and only check for valid flags here,
 	 * keeping it out of the critical path in kmem_cache_alloc().
	 */
	if (flags & ~(SLAB_DMA|SLAB_LEVEL_MASK|SLAB_NO_GROW))
		BUG();/*其他flags都是无效的*/
	if (flags & SLAB_NO_GROW)
		return 0;

	/*
	 * The test for missing atomic flag is performed here, rather than
	 * the more obvious place, simply to reduce the critical path length
	 * in kmem_cache_alloc(). If a caller is seriously mis-behaving they
	 * will eventually be caught here (where it matters).
	 */
	if (in_interrupt() && (flags & SLAB_LEVEL_MASK) != SLAB_ATOMIC)
		BUG();/*在中断环境中,只容许进行SLAB_ATOMIC类型(not sleep...)的分配*/

	ctor_flags = SLAB_CTOR_CONSTRUCTOR;
	local_flags = (flags & SLAB_LEVEL_MASK);
	if (local_flags == SLAB_ATOMIC)/*禁止sleep的slab分配,告诉constructor*/
		/*
		 * Not allowed to sleep.  Need to tell a constructor about
		 * this - it might need to know...
		 */
		ctor_flags |= SLAB_CTOR_ATOMIC;

	/* About to mess with non-constant members - lock. */
	spin_lock_irqsave(&cachep->spinlock, save_flags);

	/* Get colour for the slab, and cal the next value. */
	offset = cachep->colour_next; /*这个slab应有的color*/
	cachep->colour_next++;
	if (cachep->colour_next >= cachep->colour)
		cachep->colour_next = 0;
	offset *= cachep->colour_off;/*slab头部预留这多color bytes*/
	cachep->dflags |= DFLGS_GROWN;

	cachep->growing++;
	spin_unlock_irqrestore(&cachep->spinlock, save_flags);

	/* 一个新slab的一系列内存分配动作.
	 * Neither the cache-chain semaphore, or cache-lock, are
	 * held, but the incrementing c_growing prevents this
	 * cache from being reaped or shrunk.
	 * Note: The cache could be selected in for reaping in
	 * kmem_cache_reap(), but when the final test is made the
	 * growing value will be seen.
	 */

	/* 分配slab,一组连续的buddy页面 */
	if (!(objp = kmem_getpages(cachep, flags)))
		goto failed;

	/* 分配slab 的管理结构,in/off slab的slab_t */
	if (!(slabp = kmem_cache_slabmgmt(cachep, objp, offset, local_flags)))
		goto opps1;

	/* !!!!!! I hope this is OK. */
	i = 1 << cachep->gfporder;
	page = virt_to_page(objp);
	do {
              /* 分给slab 的页面,其page 描述符中的prev 
                * 指向所属slab, next指向所属的cache
                */
		SET_PAGE_CACHE(page, cachep);
		SET_PAGE_SLAB(page, slabp); 

		
		PageSetSlab(page);
		page++;
	} while (--i);/*slab中每个页面都有此设置*/

	kmem_cache_init_objs(cachep, slabp, ctor_flags); /*初始化*/

	spin_lock_irqsave(&cachep->spinlock, save_flags);
	cachep->growing--;

	/* Make slab active. */
	list_add_tail(&slabp->list,&cachep->slabs); /*加入cache的slab列表尾部*/
	if (cachep->firstnotfull == &cachep->slabs)/*调整firstnotfull指针*/
		cachep->firstnotfull = &slabp->list;
	STATS_INC_GROWN(cachep);
	cachep->failures = 0;

	spin_unlock_irqrestore(&cachep->spinlock, save_flags);
	return 1;
opps1:
	kmem_freepages(cachep, objp);
failed:
	spin_lock_irqsave(&cachep->spinlock, save_flags);
	cachep->growing--;
	spin_unlock_irqrestore(&cachep->spinlock, save_flags);
	return 0;
}




                            内存释放的相关算法
                            
kmem_cache_free_one: 根据objp 找到其所属的slab( 所属的cache已经找到,即cachep)把
这个obj挂入空闲对象链表根据slab 的状态调整他在cache->slab列表的位置.

static inline void kmem_cache_free_one(kmem_cache_t *cachep, void *objp)
{
	slab_t* slabp;

	CHECK_PAGE(virt_to_page(objp));
	/* reduces memory footprint
	 *
	if (OPTIMIZE(cachep))
		slabp = (void*)((unsigned long)objp&(~(PAGE_SIZE-1)));
	 else
	 */
	slabp = GET_PAGE_SLAB(virt_to_page(objp));
	      /*这里的page->prev 记录了obj 所在的slab,见GET_PAGE_SLAB*/
   
#if DEBUG
	....
#endif
	{
    /*把这个obj挂入空闲对象链接表*/
		unsigned int objnr = (objp-slabp->s_mem)/cachep->objsize;
		slab_bufctl(slabp)[objnr] = slabp->free;
		slabp->free = objnr;
	}
	STATS_DEC_ACTIVE(cachep);/*统计*/	
	
	/* fixup slab chain */
	if (slabp->inuse-- == cachep->num) /*从full到->part full*/
		goto moveslab_partial;
	if (!slabp->inuse)/*从part->empty*/
		goto moveslab_free;
	return;

  /*移动slab的位置,simple..*/
moveslab_partial:
  /* was full.
	 * Even if the page is now empty, we can set c_firstnotfull to
	 * slabp: there are no partial slabs in this case
	 */
	{
	  ....
	}
moveslab_free:
	/*
	 * was partial, now empty.
	 * c_firstnotfull might point to slabp
	 * FIXME: optimize
	 */
	{
	  ....
	}
}
                          
kmem_slab_destroy: 使用cachep->dtor释放此slab中所有obj.释放此slab使用的页面,还
有slab_t. 代码略.



                         第四部分  提供的主要接口
   对外提供的接口分成几个部分:初始化,cache的创建删除,obj的分配释放,proc系统支持.
                       
                          I) slab初始化部分
  最早调用的函数: void __init kmem_cache_init(void)设置cache_cache(cache 的全局
管理机构).
void __init kmem_cache_init(void)
{/*已经静态初始化了许多,这里比较简单*/
	size_t left_over;

	init_MUTEX(&cache_chain_sem);
	INIT_LIST_HEAD(&cache_chain);

	kmem_cache_estimate(0, cache_cache.objsize, 0,
			&left_over, &cache_cache.num);
	if (!cache_cache.num)
		BUG();

	cache_cache.colour = left_over/cache_cache.colour_off;
	cache_cache.colour_next = 0;
}
  然后初始化通用cache(cache_size): void __init kmem_cache_sizes_init(void)不再
罗列其代码,有关cache size初始化的特殊问题已经在第二部分说明.这里不再做其他分析
了.
  最后一个处世话是__initcall(kmem_cpucache_init),再init进程中调用.主要工作是由
static void enable_all_cpucaches (void) //完成的.
{
	struct list_head* p;

	down(&cache_chain_sem);

	p = &cache_cache.next;
	do { //遍历所有cache,初始化SMP,
		kmem_cache_t* cachep = list_entry(p, kmem_cache_t, next);

		enable_cpucache(cachep);/*设置bache alloc limit,然后通过IPI中断让各个cpu
		                          初始化私有数据*/
		p = cachep->next.next;
	} while (p != &cache_cache.next);

	up(&cache_chain_sem);
}
      
                         II)cache的创建和销毁
      
      kmem_cache_t *kmem_cache_create (const char *name, size_t size, size_t offset,
 	             unsigned long flags, void (*ctor)(void*, kmem_cache_t *, unsigned long),
 	             void (*dtor)(void*, kmem_cache_t *, unsigned long))
      int kmem_cache_destroy (kmem_cache_t * cachep)
      int kmem_cache_shrink(kmem_cache_t *cachep)
      
      void kmem_cache_reap (int gfp_mask)

/**
 * kmem_cache_create - 创建一个cache
 * @name: 用于/proc/slabinfo:本cache的标识符串
 * @size: 这cache中对象的大小.
 * @offset: 从这个偏移使用slab.
 * @flags: SLAB flags
 * @ctor: cache 中page 的构造函数.
 * @dtor: cahce 中page 的解构函数.
 *
 * Returns a ptr to the cache on success, NULL on failure.
 * Cannot be called within a int, but can be interrupted.
 * The @ctor is run when new pages are allocated by the cache
 * and the @dtor is run before the pages are handed back.
 * The flags are
 
 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
 * to catch references to uninitialised memory.
 *
 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
 * for buffer overruns.
 *
 * %SLAB_NO_REAP - Don't automatically reap this cache when we're under
 * memory pressure.
 *
 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
 * cacheline.  This can be beneficial if you're counting cycles as closely
 * as davem.
 */
kmem_cache_t *
kmem_cache_create (const char *name, size_t size, size_t offset,
	unsigned long flags, void (*ctor)(void*, kmem_cache_t *, unsigned long),
	void (*dtor)(void*, kmem_cache_t *, unsigned long))
{
	const char *func_nm = KERN_ERR "kmem_create: ";
	size_t left_over, align, slab_size;
	kmem_cache_t *cachep = NULL;

	/*
	 * Sanity checks... these are all serious usage bugs.
	 */
	 ... //略
#if DEBUG
	.......... //略
#endif

	/*
	 * Always checks flags, a caller might be expecting debug
	 * support which isn't available.
	 */
	if (flags & ~CREATE_MASK) //不得指定除此之外的其他flags
		BUG();

	/* 分配 kmem_cache_s*/
	cachep = (kmem_cache_t *) kmem_cache_alloc(&cache_cache, SLAB_KERNEL);
	if (!cachep)
		goto opps;
	memset(cachep, 0, sizeof(kmem_cache_t));

	/* Check that size is in terms of words.  This is needed to avoid
	 * unaligned accesses for some archs when redzoning is used, and makes
	 * sure any on-slab bufctl's are also correctly aligned.
	 */
	if (size & (BYTES_PER_WORD-1)) { //强制obj size WORD 对齐
		size += (BYTES_PER_WORD-1);
		size &= ~(BYTES_PER_WORD-1);
		printk("%sForcing size word alignment - %s\n", func_nm, name);
	}
	
#if DEBUG
.... //略
#endif
	align = BYTES_PER_WORD;
	if (flags & SLAB_HWCACHE_ALIGN)
		align = L1_CACHE_BYTES;

	/* 确定以那种方式管理对象 ( 'on' or 'off' slab.) */
	if (size >= (PAGE_SIZE>>3)) //obj过大,则采用off slab 的slab_t
		/*
		 * Size is large, assume best to place the slab management obj
		 * off-slab (should allow better packing of objs).
		 */
		flags |= CFLGS_OFF_SLAB;

  
	if (flags & SLAB_HWCACHE_ALIGN) {/*调整obj size,以做到cache line 对齐*/
		/* Need to adjust size so that objs are cache aligned. */
		/* Small obj size, can get at least two per cache line. */
		/* FIXME: only power of 2 supported, was better */
		while (size < align/2)
			align /= 2;
		size = (size+align-1)&(~(align-1));
	}

	/* 计算slab的大小(单位page_size), 和每个slab中的对象个数.
	 * This could be made much more intelligent.  For now, try to avoid
	 * using high page-orders for slabs.  When the gfp() funcs are more
	 * friendly towards high-order requests, this should be changed.
	 */
	do {
		unsigned int break_flag = 0;
cal_wastage:
		kmem_cache_estimate(cachep->gfporder, size, flags,
						&left_over, &cachep->num);
		if (break_flag)
			break;
		if (cachep->gfporder >= MAX_GFP_ORDER) //slab绝对不超过这个size
			break;
		if (!cachep->num) //放不下任何obj?
			goto next; //只有加大slab
			
		if (flags & CFLGS_OFF_SLAB && cachep->num > offslab_limit) {//这个检查
		 //第二部分有详述
			/* Oops, this num of objs will cause problems. */
			cachep->gfporder--;
			break_flag++;
			goto cal_wastage;
		}

		/*
		 * Large num of objs is good, but v. large slabs are currently
		 * bad for the gfp()s.
		 */
		if (cachep->gfporder >= slab_break_gfp_order)//slab中能够放的下obj时
					break;//最多2个页面分配给slab.

		if ((left_over*8) <= (PAGE_SIZE<<cachep->gfporder))//浪费少于1/8,可以接受
			break;	/* Acceptable internal fragmentation. */ 
next