2007-12-14
binfmt 之前跳过了, 但是exec这次就不跳过了, 仔细看一下吧.
这里的exec是各种binfmt的管理单元...
1.binfmt管理 /* * This structure defines the functions that are used to load the binary formats that * linux accepts. */ struct linux_binfmt { //别和linux_binprm混淆了 .... struct linux_binfmt * next; struct module *module; int (*load_binary)(struct linux_binprm *, struct pt_regs * regs); int (*load_shlib)(struct file *); int (*core_dump)(long signr, struct pt_regs * regs, struct file * file); unsigned long min_coredump; /* minimal dump size */ };
static struct linux_binfmt *formats; static rwlock_t binfmt_lock = RW_LOCK_UNLOCKED;
就是一个链表,注册和注销非常直接简单. int register_binfmt(struct linux_binfmt * fmt) int unregister_binfmt(struct linux_binfmt * fmt)
2.Core Dump asmlinkage long sys_uselib(const char * library); /*就是打开文件名为library的文件,再调用fmt->load_shlib(file)*/ int do_coredump(long signr, struct pt_regs * regs) /*core dump 是通过信号来触发的*/ { struct linux_binfmt * binfmt; char corename[6+sizeof(current->comm)]; /*current->comm是进程对应的bin文件名*/ struct file * file; struct inode * inode;
lock_kernel(); ........//sanity check,略过 memcpy(corename,"core.", 5); //core.binfilename就是coredump的文件名
file = filp_open(corename, O_CREAT | 2 | O_TRUNC | O_NOFOLLOW, 0600); /*创建coredump文件,准备写...*/ ..........//略 if (inode->i_nlink > 1) goto close_fail; /* multiple links - don't dump */ /*如果文件已存在,并且有多个硬链接(i_nlink > 1),就不产生core dump;符号链接则由上面的O_NOFOLLOW拒绝 */ ...//more check if (!binfmt->core_dump(signr, regs, file)) /*调用binfmt的core dump*/ goto close_fail; ........//略.... }
3.execve
do_execve 是载入bin运行的核心函数了,推荐阅读材料: dynamic linker and loader linker and loader /* * sys_execve() executes a new program. */ asmlinkage int sys_execve(abi64_no_regargs, struct pt_regs regs) { int error; char * filename;
filename = getname((char *) (long)regs.regs[4]); error = PTR_ERR(filename); if (IS_ERR(filename)) goto out; error = do_execve(filename, (char **) (long)regs.regs[5], (char **) (long)regs.regs[6], &regs); putname(filename);
out: return error; } 以前读这个函数最大的问题是 bprm.p的理解, 总是和建立的main函数调用栈对不上号.原因在于,一开始的时候bprm.p 是一个size,典型大小是 PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *) :128k. 最后到调用具体的binfmt的load_binary 函数的时候,bprm.p是剩余的空间.(copy_strings会把已经占用的空间从bprm.p中刨去). copy_strings 和 setup_arg_pages 是理解其涵义的关键函数. 先看个图示:注意page的映射方式
从setup_arg_pages可以看出来页面的映射方式,同时参考上图 int setup_arg_pages(struct linux_binprm *bprm) { unsigned long stack_base; struct vm_area_struct *mpnt; int i;
stack_base = STACK_TOP - MAX_ARG_PAGES*PAGE_SIZE;
bprm->p += stack_base; if (bprm->loader) bprm->loader += stack_base; bprm->exec += stack_base;
mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); if (!mpnt) return -ENOMEM; down(&current->mm->mmap_sem); { [vma->start,vma->end) 左闭右开区间 ....... mpnt->vm_start = PAGE_MASK & (unsigned long) bprm->p; [] mpnt->vm_end = STACK_TOP; /*STACK_TOP :0XC0000000 不属于这个vma的,见find_vma*/ ........ }
for (i = 0 ; i < MAX_ARG_PAGES ; i++) { struct page *page = bprm->page[i]; if (page) { bprm->page[i] = NULL; current->mm->rss++; put_dirty_page(current,page,stack_base); } stack_base += PAGE_SIZE; } up(&current->mm->mmap_sem); return 0; } 可以想象,相应的copy_strings必须从page[31]的高地址向低地址逐个写入参数: /* * 'copy_strings()' copies argument/envelope strings from user * memory to free pages in kernel mem. These are in a format ready * to be put directly into the top of new user memory. */ int copy_strings(int argc,char ** argv, struct linux_binprm *bprm) { while (argc-- > 0) { ... if (get_user(str, argv+argc) || !str || !(len = strnlen_user(str, bprm->p))) return -EFAULT; ......... bprm->p -= len; /*预留出这个字符串的空间*/ /* XXX: add architecture specific overflow check here. */
pos = bprm->p; while (len > 0) { .............
offset = pos % PAGE_SIZE; i = pos/PAGE_SIZE; page = bprm->page[i]; /*这样计算偏移和page,肯定是最后的page[31]先分配*/ new = 0; ...........//alloc page kaddr = kmap(page); ........//copy
pos += bytes_to_copy; str += bytes_to_copy; len -= bytes_to_copy; } } return 0; } 最后看execve的过程...................... /* * sys_execve() executes a new program. */ int do_execve(char * filename, char ** argv, char ** envp, struct pt_regs * regs) { file = open_exec(filename); //打开可执行文件....,内核内怎么读写文件的又一个例子,open.. ....//sanity check 略
bprm.p = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *); //bprm.p 现在是一个size:当前可用于参数传递的空间大小 (注意是剩余的空间大小阿....) //预留了一个指针 (怪异) memset(bprm.page, 0, MAX_ARG_PAGES*sizeof(bprm.page[0])); /*copy arg的时候会分配page*/ ........//略 if ((bprm.argc = count(argv, bprm.p / sizeof(void *))) < 0) { //看看有多少个参数 { 第一个是bin的文件名.. ..... }
if ((bprm.envc = count(envp, bprm.p / sizeof(void *))) < 0) { /*多少个环境变量*/ { //env也是必须有的... ... } retval = prepare_binprm(&bprm); /*e_uid/uid e_gid/gid capability 的处理*/ /*并读入一小段文件内容,以便判定是哪种bin....*/ if (retval < 0) goto out;
retval = copy_strings_kernel(1, &bprm.filename, &bprm); /*copy 1个参数,从bprm.filename->bprm->page, bprm->p代表当前可用于参数传递的空间大小.*/ if (retval < 0) goto out;
bprm.exec = bprm.p; retval = copy_strings(bprm.envc, envp, &bprm); /*copy all env*/ if (retval < 0) goto out;
retval = copy_strings(bprm.argc, argv, &bprm); /*copy all argv*/ if (retval < 0) goto out;
retval = search_binary_handler(&bprm,regs); /*找到对应的linux_binfmt 调用fmt->load_binary; */ if (retval >= 0) /* execve success */ return retval; ............ //错误处理,略...
return retval; }
这里的工作只是copy参数到指定页面,映射页面(setup_arg_pages)和建立调用堆栈需要具体的binfmt来做,以elf为例吧.
static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs) { ........
/* Get the exec-header */ elf_ex = *((struct elfhdr *) bprm->buf); /*prepare_binprm已经读入一小段了*/
......//判定是否是elf
/* Now read in all of the header information */ ........ retval = kernel_read(bprm->file, elf_ex.e_phoff, (char *) elf_phdata, size); .... fd_install(elf_exec_fileno = retval, bprm->file); //读入完整头,并设置到已打开文件表
elf_ppnt = elf_phdata; elf_bss = 0; elf_brk = 0;
start_code = ~0UL; end_code = 0; start_data = 0; end_data = 0; /*下面加载shared libraries 的interpreter,具体分析elf再说吧....*/ for (i = 0; i < elf_ex.e_phnum; i++) { .......... }
/* Some simple consistency checks for the interpreter */ if (elf_interpreter) { .............. }
/* OK, we are done with that, now set up the arg stuff, and then start this sucker up */ //我们关注下 argc argv envc env的设置.......... if (!bprm->sh_bang) { ........ }
/* Flush all traces of the currently running executable */ retval = flush_old_exec(bprm); /*释放老的sig ctx copy一份新的;释放老的mm(用户页面和pagetable),建立并切换到新的mm (新的4g virtual address space); 释放老的文件;清空 fpu,thread,从thread group摘除 */ if (retval) goto out_free_dentry;
/* OK, This is the point of no return */ current->mm->start_data = 0; current->mm->end_data = 0; current->mm->end_code = 0; current->mm->mmap = NULL; current->flags &= ~PF_FORKNOEXEC; elf_entry = (unsigned long) elf_ex.e_entry;
/* Do this immediately, since STACK_TOP as used in setup_arg_pages may depend on the personality. */ SET_PERSONALITY(elf_ex, ibcs2_interpreter);
/* Do this so that we can load the interpreter, if need be. We will change some of these later */ current->mm->rss = 0; setup_arg_pages(bprm); /* XXX: check error */ /*上面看过了映射页面+setup vma,不过这里是ia32_setup_arg_pages,差别不大*/ current->mm->start_stack = bprm->p;
/* Try and get dynamic programs out of the way of the default mmap base, as well as whatever program they might try to exec. This is because the brk will follow the loader, and is not movable. */
load_bias = ELF_PAGESTART(elf_ex.e_type==ET_DYN ? ELF_ET_DYN_BASE : 0);
/* Now we do a little grungy work by mmaping the ELF image into the correct location in memory. At this point, we assume that the image should be loaded at fixed address, not at a variable address. */
old_fs = get_fs(); set_fs(get_ds()); for(i = 0, elf_ppnt = elf_phdata; i < elf_ex.e_phnum; i++, elf_ppnt++) { int elf_prot = 0, elf_flags; ......... } set_fs(old_fs);
elf_entry += load_bias; elf_bss += load_bias; elf_brk += load_bias; start_code += load_bias; end_code += load_bias; start_data += load_bias; end_data += load_bias;
if (elf_interpreter) {/*加载elf interpreter*/ if (interpreter_type == INTERPRETER_AOUT) elf_entry = load_aout_interp(&interp_ex, interpreter); else elf_entry = load_elf_interp(&interp_elf_ex, interpreter, &interp_load_addr); .............. } .......下面的函数是真正建立argc argv...的代码 bprm->p = (unsigned long) create_elf_tables((char *)bprm->p, bprm->argc, bprm->envc, (interpreter_type == INTERPRETER_ELF ? &elf_ex : NULL), load_addr, load_bias, interp_load_addr, (interpreter_type == INTERPRETER_AOUT ? 0 : 1)); /* N.B. passed_fileno might not be initialized? */ if (interpreter_type == INTERPRETER_AOUT) current->mm->arg_start += strlen(passed_fileno) + 1; current->mm->start_brk = current->mm->brk = elf_brk; current->mm->end_code = end_code; current->mm->start_code = start_code; current->mm->start_data = start_data; current->mm->end_data = end_data; current->mm->start_stack = bprm->p;
/* Calling set_brk effectively mmaps the pages that we need * for the bss and break sections */ set_brk(elf_bss, elf_brk);
padzero(elf_bss); ............. } static elf_addr_t * create_elf_tables(char *p, int argc, int envc, struct elfhdr * exec, unsigned long load_addr, unsigned long load_bias, unsigned long interp_load_addr, int ibcs) { elf_caddr_t *argv; elf_caddr_t *envp; elf_addr_t *sp, *csp; char *k_platform, *u_platform; long hwcap; size_t platform_len = 0;
/* * Get hold of platform and hardware capabilities masks for * the machine we are running on. In some cases (Sparc), * this info is impossible to get, in others (i386) it is * merely difficult. */ hwcap = ELF_HWCAP; k_platform = ELF_PLATFORM; if (k_platform) { //多传递一个平台特定的elf优化参数 platform_len = strlen(k_platform) + 1; u_platform = p - platform_len; __copy_to_user(u_platform, k_platform, platform_len); } else u_platform = p;
/* * Force 16 byte _final_ alignment here for generality. * Leave an extra 16 bytes free so that on the PowerPC we * can move the aux table up to start on a 16-byte boundary. */ /*sp是current stack top*/ /*为各个参数预留空间:*/ sp = (elf_addr_t *)((~15UL & (unsigned long)(u_platform)) - 16UL); csp = sp; csp -= ((exec ? DLINFO_ITEMS*2 : 4) + (k_platform ? 2 : 0)); csp -= envc+1; csp -= argc+1; csp -= (!ibcs ? 3 : 1); /* argc itself */ if ((unsigned long)csp & 15UL) sp -= ((unsigned long)csp & 15UL) / sizeof(*sp);
/* * Put the ELF interpreter info on the stack */ #define NEW_AUX_ENT(nr, id, val) \ __put_user ((id), sp+(nr*2)); \ __put_user ((val), sp+(nr*2+1)); \
sp -= 2; NEW_AUX_ENT(0, AT_NULL, 0); if (k_platform) { sp -= 2; NEW_AUX_ENT(0, AT_PLATFORM, (elf_addr_t)(unsigned long) u_platform); } sp -= 3*2; NEW_AUX_ENT(0, AT_HWCAP, hwcap); NEW_AUX_ENT(1, AT_PAGESZ, ELF_EXEC_PAGESIZE); NEW_AUX_ENT(2, AT_CLKTCK, CLOCKS_PER_SEC);
if (exec) { sp -= 10*2;
NEW_AUX_ENT(0, AT_PHDR, load_addr + exec->e_phoff); NEW_AUX_ENT(1, AT_PHENT, sizeof (struct elf_phdr)); NEW_AUX_ENT(2, AT_PHNUM, exec->e_phnum); NEW_AUX_ENT(3, AT_BASE, interp_load_addr); NEW_AUX_ENT(4, AT_FLAGS, 0); NEW_AUX_ENT(5, AT_ENTRY, load_bias + exec->e_entry); NEW_AUX_ENT(6, AT_UID, (elf_addr_t) current->uid); NEW_AUX_ENT(7, AT_EUID, (elf_addr_t) current->euid); NEW_AUX_ENT(8, AT_GID, (elf_addr_t) current->gid); NEW_AUX_ENT(9, AT_EGID, (elf_addr_t) current->egid); } #undef NEW_AUX_ENT /*这里才到了建立argc,argv,...的代码, 倒是也不难了....*/ sp -= envc+1; envp = (elf_caddr_t *) sp; sp -= argc+1; argv = (elf_caddr_t *) sp; if (!ibcs) { __put_user((elf_addr_t)(unsigned long) envp,--sp); __put_user((elf_addr_t)(unsigned long) argv,--sp); }
__put_user((elf_addr_t)argc,--sp); current->mm->arg_start = (unsigned long) p; while (argc-->0) { __put_user((elf_caddr_t)(unsigned long)p,argv++); p += strlen_user(p); } __put_user(NULL, argv); current->mm->arg_end = current->mm->env_start = (unsigned long) p; while (envc-->0) { __put_user((elf_caddr_t)(unsigned long)p,envp++); p += strlen_user(p); } __put_user(NULL, envp); current->mm->env_end = (unsigned long) p; return sp; } 传递的参数真是不少啊.......... (怎么不检查空间够不够啊.... 预留的4个byte是啥意思...)
4. execve辅助函数
int flush_old_exec(struct linux_binprm * bprm) { char * name; int i, ch, retval; struct signal_struct * oldsig;
/* * Make sure we have a private signal table */ oldsig = current->sig; retval = make_private_signals(); /*分配并拷贝新的sigact*/ if (retval) goto flush_failed;
/* * Release all of the old mmap stuff */ retval = exec_mmap(); /*有了对mm的分析,理解这个函数自然是不难的...*/ /*建立新的虚拟空间,释放所有对原有mm的引用:用户页面.. CLEAR page table*/ /*fork的进程共享所有页面,但是并没有共享pgd哦...见copy_mm->mm_init*/ if (retval) goto mmap_failed;
/* This is the point of no return */ release_old_signals(oldsig); /*释放老的sigact*/
current->sas_ss_sp = current->sas_ss_size = 0; /*设置好current->comm*/ .........
flush_thread(); /*clear fpu 和 debug regs*/
de_thread(current); /*摘链表的操作*/
........ flush_signal_handlers(current); /*不是IGN就换成DFL(不忽略就改为default)*/ flush_old_files(current->files); /*如果有close on exec 就close 文件*/
return 0;
mmap_failed: ....... } map页面到指定的虚拟地址 void put_dirty_page(struct task_struct * tsk, struct page *page, unsigned long address)
capabilities 的继承和设置策略. /* * This function is used to produce the new IDs and capabilities * from the old ones and the file's capabilities. * * The formula used for evolving capabilities is: * * pI' = pI * (***) pP' = (fP & X) | (fI & pI) * pE' = pP' & fE [NB. fE is 0 or ~0] * * I=Inheritable, P=Permitted, E=Effective // p=process, f=file * ' indicates post-exec(), and X is the global 'cap_bset'. * */ void compute_creds(struct linux_binprm *bprm)
done. 2007.12.15
|