- 论坛徽章:
- 0
|
分析mmap()的大致函数调用过程
mmap() -> sys_mmap() -> do_mmap() -> do_mmap_pgoff()
/*
 * do_mmap - byte-offset front end for do_mmap_pgoff().
 *
 * @file:   file to map (forwarded unchanged)
 * @addr:   requested start address (forwarded unchanged)
 * @len:    length of the mapping in bytes
 * @prot:   protection bits (forwarded unchanged)
 * @flag:   mapping flags (forwarded unchanged)
 * @offset: offset into the file, in bytes
 *
 * Returns whatever do_mmap_pgoff() returns, or -EINVAL (as unsigned long)
 * when @offset plus the page-rounded @len wraps around, or when @offset has
 * any bits set outside PAGE_MASK (i.e. is not page aligned).
 */
static inline unsigned long do_mmap(struct file *file, unsigned long addr,
		unsigned long len, unsigned long prot,
		unsigned long flag, unsigned long offset)
{
	/* The end offset of the mapping must not wrap around. */
	if (offset + PAGE_ALIGN(len) < offset)
		return -EINVAL;

	/* Only page-aligned offsets can be expressed as a page offset. */
	if (offset & ~PAGE_MASK)
		return -EINVAL;

	/* Convert the byte offset into a page offset and do the real work. */
	return do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
}
file: 映射的文件
addr: 映射的起始地址
len: 映射的大小
prot: 映射区域权限
flags: 映射区域标志位
pgoff: 映射文件偏移(以页为单位,即 do_mmap 中的 offset >> PAGE_SHIFT)
/*
 * do_mmap_pgoff - set up a new mapping in the address space of current->mm.
 *
 * @file:  file to map, or NULL for an anonymous mapping
 * @addr:  requested start address; get_unmapped_area() returns the address
 *         that is actually used
 * @len:   length of the mapping in bytes; rounded with PAGE_ALIGN()
 * @prot:  PROT_* protection bits requested by the caller
 * @flags: MAP_* flags; (flags & MAP_TYPE) must be MAP_SHARED or MAP_PRIVATE
 * @pgoff: offset into the file, counted in pages
 *
 * Returns the start address of the mapping on success.  On failure a
 * negative errno value (-ENODEV, -EINVAL, -ENOMEM, -EAGAIN, -EACCES, or the
 * error reported by a helper) is returned converted to unsigned long.
 * A zero @len is not an error: @addr is returned unchanged.
 *
 * NOTE(review): no lock is taken in this function; presumably the caller
 * serialises access to the mm - confirm against the callers.
 */
unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, unsigned long len,
unsigned long prot, unsigned long flags, unsigned long pgoff)
{
struct mm_struct * mm = current->mm;
struct vm_area_struct * vma, * prev;
unsigned int vm_flags;
/* Set to 1 once deny_write_access() has succeeded, so that it is undone later. */
int correct_wcount = 0;
int error;
rb_node_t ** rb_link, * rb_parent;
/* A file can only be mapped if it supplies an mmap file operation. */
if (file && (!file->f_op || !file->f_op->mmap))
return -ENODEV;
/* Zero-length request: nothing to do, hand the address back unchanged. */
if (!len)
return addr;
len = PAGE_ALIGN(len);
/*
 * len was non-zero above, so a zero value after rounding would mean the
 * rounding wrapped around.
 */
if (len > TASK_SIZE || len == 0)
return -EINVAL;
/* offset overflow? */
if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
return -EINVAL;
/* Too many mappings?  The VMA count may not exceed max_map_count. */
if (mm->map_count > max_map_count)
return -ENOMEM;
/* Obtain the address to map to. we verify (or select) it and ensure
 * that it represents a valid section of the address space.
 *
 * A result that is not page aligned is returned to the caller as is;
 * presumably it is an error code from get_unmapped_area() - TODO confirm.
 */
addr = get_unmapped_area(file, addr, len, pgoff, flags);
if (addr & ~PAGE_MASK)
return addr;
/* Do simple checking here so the lower-level routines won't have
 * to. we assume access permissions have been handled by the open
 * of the memory object, so we don't do any here.
 *
 * calc_vm_flags() turns the user-space prot/flags into kernel VM_* flags;
 * the mm's default flags and all VM_MAY* bits are added on top.
 * NOTE(review): the original annotation states that mm->def_flags can only
 * ever carry VM_LOCKED - not verifiable from this excerpt.
 */
vm_flags = calc_vm_flags(prot,flags) | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
/* mlock MCL_FUTURE? */
/* Locked memory, including this request, must stay within RLIMIT_MEMLOCK. */
if (vm_flags & VM_LOCKED) {
unsigned long locked = mm->locked_vm << PAGE_SHIFT;
locked += len;
if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur)
return -EAGAIN;
}
if (file) {
/* File-backed mapping. */
/*
 * NOTE(review): the original annotation gives MAP_TYPE as 0x0f, the mask
 * covering MAP_SHARED and MAP_PRIVATE - confirm against the headers.
 */
switch (flags & MAP_TYPE) {
case MAP_SHARED:
/* Shared mapping: a writable mapping needs a file opened for writing. */
if ((prot & PROT_WRITE) && !(file->f_mode & FMODE_WRITE))
return -EACCES;
/* Make sure we don't allow writing to an append-only file.. */
if (IS_APPEND(file->f_dentry->d_inode) && (file->f_mode & FMODE_WRITE))
return -EACCES;
/* make sure there are no mandatory locks on the file. */
if (locks_verify_locked(file->f_dentry->d_inode))
return -EAGAIN;
/*
 * Mark the mapping shared.  If the file is not open for writing, drop
 * VM_MAYWRITE and VM_SHARED again (VM_MAYSHARE stays set).
 */
vm_flags |= VM_SHARED | VM_MAYSHARE;
if (!(file->f_mode & FMODE_WRITE))
vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
/* fall through */
case MAP_PRIVATE:
/* Private mapping (and shared, via fall through): the file must be readable. */
if (!(file->f_mode & FMODE_READ))
return -EACCES;
break;
default:
return -EINVAL;
}
} else {
/* Anonymous mapping: start out shared, a private request clears the bits below. */
vm_flags |= VM_SHARED | VM_MAYSHARE;
switch (flags & MAP_TYPE) {
default:
return -EINVAL;
case MAP_PRIVATE:
/* Private mapping: remove the sharing bits. */
vm_flags &= ~(VM_SHARED | VM_MAYSHARE);
/* fall through */
case MAP_SHARED:
break;
}
}
/* Clear old maps */
/*
 * If the VMA found starts below addr + len, unmap [addr, addr + len) and
 * look the address up again until nothing is in the way.
 * NOTE(review): per the original annotation find_vma_prepare() behaves like
 * find_vma(), returning the first VMA whose end lies above addr, and also
 * fills in prev/rb_link/rb_parent for the later insertion - confirm.
 */
munmap_back:
vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
if (vma && vma->vm_start < addr + len) {
if (do_munmap(mm, addr, len))
return -ENOMEM;
goto munmap_back;
}
/* Check against address space limit. */
if ((mm->total_vm << PAGE_SHIFT) + len
> current->rlim[RLIMIT_AS].rlim_cur)
return -ENOMEM;
/* Private writable mapping? Check memory availability.. */
/*
 * Applies only when VM_WRITE is set without VM_SHARED and MAP_NORESERVE was
 * not requested; the request fails if vm_enough_memory() returns zero.
 */
if ((vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE &&
!(flags & MAP_NORESERVE) &&
!vm_enough_memory(len >> PAGE_SHIFT))
return -ENOMEM;
/* Can we just expand an old anonymous mapping? */
/*
 * Tried only for anonymous, non-shared mappings when rb_parent (filled in
 * by find_vma_prepare() above) is non-NULL.  On success no new VMA is
 * allocated and control jumps straight to the accounting at "out".
 */
if (!file && !(vm_flags & VM_SHARED) && rb_parent)
if (vma_merge(mm, prev, rb_parent, addr, addr + len, vm_flags))
goto out;
/* Determine the object being mapped and call the appropriate
 * specific mapper. the address has already been validated, but
 * not unmapped, but the maps are removed from the list.
 */
/* Allocate a new VMA and initialise it for [addr, addr + len). */
vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
if (!vma)
return -ENOMEM;
vma->vm_mm = mm;
vma->vm_start = addr;
vma->vm_end = addr + len;
vma->vm_flags = vm_flags;
/* The low four bits of vm_flags index the protection_map table. */
vma->vm_page_prot = protection_map[vm_flags & 0x0f];
vma->vm_ops = NULL;
vma->vm_pgoff = pgoff;
vma->vm_file = NULL;
vma->vm_private_data = NULL;
vma->vm_raend = 0;
if (file) {
/* VM_GROWSDOWN / VM_GROWSUP are rejected for file mappings. */
error = -EINVAL;
if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
goto free_vma;
if (vm_flags & VM_DENYWRITE) {
/*
 * NOTE(review): per the original annotation, MAP_DENYWRITE means that
 * writes may only go through the mapping and direct writes to the file are
 * refused, and deny_write_access() bumps the file's "write denied" count
 * provided nobody currently writes to the file - confirm in the fs code.
 */
error = deny_write_access(file);
if (error)
goto free_vma;
/* Remember that the write denial has to be undone later. */
correct_wcount = 1;
}
/* Associate the VMA with the file and take a reference on the file. */
vma->vm_file = file;
get_file(file);
/* Let the file's own mmap operation set up the mapping. */
error = file->f_op->mmap(file, vma);
if (error)
goto unmap_and_free_vma;
} else if (flags & MAP_SHARED) {
/*
 * Anonymous shared mapping: set up by shmem_zero_setup().
 * NOTE(review): the original annotation says this involves tmpfs and
 * leaves it unanalysed.
 */
error = shmem_zero_setup(vma);
if (error)
goto free_vma;
}
/* Can addr have changed??
 *
 * Answer: Yes, several device drivers can do it in their
 * f_op->mmap method. -DaveM
 */
if (addr != vma->vm_start) {
/*
 * It is a bit too late to pretend changing the virtual
 * area of the mapping, we just corrupted userspace
 * in the do_munmap, so FIXME (not in 2.4 to avoid breaking
 * the driver API).
 */
struct vm_area_struct * stale_vma;
/* Since addr changed, we rely on the mmap op to prevent
 * collisions with existing vmas and just use find_vma_prepare
 * to update the tree pointers.
 */
/* Adopt the start address chosen by the mmap operation. */
addr = vma->vm_start;
/* Redo the lookup so prev/rb_link/rb_parent match the new address. */
stale_vma = find_vma_prepare(mm, addr, &prev,
&rb_link, &rb_parent);
/*
 * Make sure the lowlevel driver did its job right.
 * The VMA found must not start below the end of the new VMA,
 * otherwise the two overlap and BUG() is raised.
 */
if (unlikely(stale_vma && stale_vma->vm_start < vma->vm_end)) {
printk(KERN_ERR "buggy mmap operation: [<%p>]\n",
file ? file->f_op->mmap : NULL);
BUG();
}
}
/* Insert the new VMA using the position found by find_vma_prepare(). */
vma_link(mm, vma, prev, rb_link, rb_parent);
/*
 * Increment i_writecount again if deny_write_access() succeeded above;
 * presumably this reverts the write denial - TODO confirm.
 */
if (correct_wcount)
atomic_inc(&file->f_dentry->d_inode->i_writecount);
/* Success path, also reached directly after a successful vma_merge(). */
out:
mm->total_vm += len >> PAGE_SHIFT;
if (vm_flags & VM_LOCKED) {
mm->locked_vm += len >> PAGE_SHIFT;
/* Original annotation: creates the page table entries for [addr, addr + len). */
make_pages_present(addr, addr + len);
}
return addr;
/* Error unwinding after the file's mmap operation failed. */
unmap_and_free_vma:
if (correct_wcount)
atomic_inc(&file->f_dentry->d_inode->i_writecount);
/* Break the VMA/file association and drop the reference taken by get_file(). */
vma->vm_file = NULL;
fput(file);
/* Undo any partial mapping done by a device driver. */
zap_page_range(mm, vma->vm_start, vma->vm_end - vma->vm_start);
/* Release the VMA allocated above and report the error. */
free_vma:
kmem_cache_free(vm_area_cachep, vma);
return error;
}
/*
 * locks_verify_locked - check an inode for conflicting mandatory locks.
 *
 * @inode: inode to examine
 *
 * Returns 0 when MANDATORY_LOCK(inode) is false; otherwise returns the
 * result of locks_mandatory_locked(inode).
 */
static inline int locks_verify_locked(struct inode *inode)
{
	/* Only inodes flagged for mandatory locking need the lock-list scan. */
	return MANDATORY_LOCK(inode) ? locks_mandatory_locked(inode) : 0;
}
/*
 * MANDATORY_LOCK - non-zero when mandatory locking applies to @inode.
 *
 * Two conditions must hold:
 *  - IS_MANDLOCK(inode) is true (per the original annotation: the inode's
 *    filesystem permits mandatory locks - confirm against its definition);
 *  - in i_mode the set-group-ID bit is set while the group-execute bit is
 *    clear.  That combination has no meaning as a permission and is used
 *    only to mark the file for mandatory locking.
 *
 * Note that @inode appears twice in the expansion, so the argument must be
 * free of side effects.
 */
#define MANDATORY_LOCK(inode) \
(IS_MANDLOCK(inode) && ((inode)->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID)
/*
 * locks_mandatory_locked - look for POSIX locks held by another owner.
 *
 * @inode: inode whose lock list (i_flock) is scanned
 *
 * Walks the inode's lock list under the big kernel lock.  Entries without
 * FL_POSIX are ignored.  Returns -EAGAIN as soon as a POSIX lock is found
 * whose owner differs from current->files, and 0 when there is none.
 *
 * NOTE(review): the original annotation says FL_POSIX locks are the ones
 * created through fcntl() - confirm against the locking code.
 */
int locks_mandatory_locked(struct inode *inode)
{
	fl_owner_t self = current->files;
	struct file_lock *entry;
	int held_by_other = 0;

	lock_kernel();
	entry = inode->i_flock;
	while (entry != NULL) {
		/* Only POSIX locks that belong to somebody else count. */
		if ((entry->fl_flags & FL_POSIX) && entry->fl_owner != self) {
			held_by_other = 1;
			break;
		}
		entry = entry->fl_next;
	}
	unlock_kernel();

	if (held_by_other)
		return -EAGAIN;
	return 0;
}
/*
 * vm_enough_memory - heuristic check whether @pages more pages can be granted.
 *
 * @pages: number of pages the caller wants to account for
 *
 * Returns 1 unconditionally when sysctl_overcommit_memory is non-zero.
 * Otherwise adds up every source of potentially available pages (page
 * cache, free pages, free swap, swap cache, unused dentry and inode slab
 * memory) and returns non-zero only if that total is greater than @pages.
 *
 * NOTE(review): the original annotation lists three sysctl modes
 * (0 = check available memory and fail the request if it is short,
 * 1 = always allow regardless of memory state, 2 = allow allocations
 * beyond physical memory plus swap).  This function only distinguishes
 * zero from non-zero, so 1 and 2 behave identically here.
 */
int vm_enough_memory(long pages)
{
	/* Stupid algorithm to decide if we have enough memory: while
	 * simple, it hopefully works in most obvious cases.. Easy to
	 * fool it, but this should catch most mistakes.
	 */
	/* 23/11/98 NJC: Somewhat less stupid version of algorithm,
	 * which tries to do "TheRightThing". Instead of using half of
	 * (buffers+cache), use the minimum values. Allow an extra 2%
	 * of num_physpages for safety margin.
	 */
	unsigned long free;

	/* Sometimes we want to use more memory than we have. */
	if (sysctl_overcommit_memory)
		return 1;

	/* The page cache contains buffer pages these days.. */
	free = atomic_read(&page_cache_size);
	free += nr_free_pages();
	free += nr_swap_pages;

	/*
	 * This double-counts: the nrpages are both in the page-cache
	 * and in the swapper space. At the same time, this compensates
	 * for the swap-space over-allocation (ie "nr_swap_pages" being
	 * too small.
	 */
	free += swapper_space.nrpages;

	/*
	 * The code below doesn't account for free space in the inode
	 * and dentry slab cache, slab cache fragmentation, inodes and
	 * dentries which will become freeable under VM load, etc.
	 * Lets just hope all these (complex) factors balance out...
	 */
	free += (dentry_stat.nr_unused * sizeof(struct dentry)) >> PAGE_SHIFT;
	free += (inodes_stat.nr_unused * sizeof(struct inode)) >> PAGE_SHIFT;

	/* Enough memory only if the estimated total exceeds the request. */
	return free > pages;
}
|
|