Linux kernel analysis: the process address space
We start from do_mmap_pgoff, the function that the mmap() system call eventually reaches:

unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
                        unsigned long len, unsigned long prot,
                        unsigned long flags, unsigned long pgoff)
{
        struct mm_struct * mm = current->mm;
        struct inode *inode;
        unsigned int vm_flags;
        int error;
        unsigned long reqprot = prot;

        /* What follows is mostly basic sanity checking of the parameters:
         * can the request be satisfied at all? */
        /*
         * Does the application expect PROT_READ to imply PROT_EXEC?
         *
         * (the exception is when the underlying filesystem is noexec
         *  mounted, in which case we don't add PROT_EXEC.)
         */
        if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
                if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC)))
                        prot |= PROT_EXEC;

        if (!len)
                return -EINVAL;

        if (!(flags & MAP_FIXED))
                addr = round_hint_to_min(addr);

        error = arch_mmap_check(addr, len, flags);
        if (error)
                return error;

        /* Careful about overflows.. */
        len = PAGE_ALIGN(len);
        if (!len || len > TASK_SIZE)
                return -ENOMEM;

        /* offset overflow? */
        if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
                return -EOVERFLOW;

        /* Too many mappings? */
        if (mm->map_count > sysctl_max_map_count)
                return -ENOMEM;

        if (flags & MAP_HUGETLB) {
                struct user_struct *user = NULL;
                if (file)
                        return -EINVAL;

                /*
                 * VM_NORESERVE is used because the reservations will be
                 * taken when vm_ops->mmap() is called
                 * A dummy user value is used because we are not locking
                 * memory so no accounting is necessary
                 */
                len = ALIGN(len, huge_page_size(&default_hstate));
                file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE,
                                                &user, HUGETLB_ANONHUGE_INODE);
                if (IS_ERR(file))
                        return PTR_ERR(file);
        }

        /* Obtain the address to map to. we verify (or select) it and ensure
         * that it represents a valid section of the address space.
         */
        /* Obtain the linear address interval for the new region. */
        addr = get_unmapped_area(file, addr, len, pgoff, flags);
        if (addr & ~PAGE_MASK)
                return addr;

        /* Do simple checking here so the lower-level routines won't have
         * to. we assume access permissions have been handled by the open
         * of the memory object, so we don't do any here.
         */
        /* Compute the flags of the new region descriptor by combining the
         * values stored in the prot and flags parameters. */
        vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
                        mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;

        if (flags & MAP_LOCKED)
                if (!can_do_mlock())
                        return -EPERM;

        /* mlock MCL_FUTURE? */
        if (vm_flags & VM_LOCKED) {
                unsigned long locked, lock_limit;
                locked = len >> PAGE_SHIFT;
                locked += mm->locked_vm;
                lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
                lock_limit >>= PAGE_SHIFT;
                if (locked > lock_limit && !capable(CAP_IPC_LOCK))
                        return -EAGAIN;
        }

        inode = file ? file->f_path.dentry->d_inode : NULL;

        if (file) {
                switch (flags & MAP_TYPE) {
                case MAP_SHARED:
                        if ((prot & PROT_WRITE) && !(file->f_mode & FMODE_WRITE))
                                return -EACCES;

                        /*
                         * Make sure we don't allow writing to an append-only
                         * file..
                         */
                        if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
                                return -EACCES;

                        /*
                         * Make sure there are no mandatory locks on the file.
                         */
                        if (locks_verify_locked(inode))
                                return -EAGAIN;

                        vm_flags |= VM_SHARED | VM_MAYSHARE;
                        if (!(file->f_mode & FMODE_WRITE))
                                vm_flags &= ~(VM_MAYWRITE | VM_SHARED);

                        /* fall through */
                case MAP_PRIVATE:
                        if (!(file->f_mode & FMODE_READ))
                                return -EACCES;
                        if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
                                if (vm_flags & VM_EXEC)
                                        return -EPERM;
                                vm_flags &= ~VM_MAYEXEC;
                        }

                        if (!file->f_op || !file->f_op->mmap)
                                return -ENODEV;
                        break;

                default:
                        return -EINVAL;
                }
        } else {
                switch (flags & MAP_TYPE) {
                case MAP_SHARED:
                        /*
                         * Ignore pgoff.
                         */
                        pgoff = 0;
                        vm_flags |= VM_SHARED | VM_MAYSHARE;
                        break;
                case MAP_PRIVATE:
                        /*
                         * Set pgoff according to addr for anon_vma.
                         */
                        pgoff = addr >> PAGE_SHIFT;
                        break;
                default:
                        return -EINVAL;
                }
        }

        error = security_file_mmap(file, reqprot, prot, flags, addr, 0);
        if (error)
                return error;
        error = ima_file_mmap(file, prot);
        if (error)
                return error;

        /* The real work is done here. */
        return mmap_region(file, addr, len, flags, vm_flags, pgoff);
}
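Before following the kernel path further, it helps to see what a typical request looks like from user space. The sketch below is not kernel code: it is a minimal, hypothetical program whose mmap() call reaches do_mmap_pgoff with file == NULL, prot == PROT_READ|PROT_WRITE and flags == MAP_PRIVATE|MAP_ANONYMOUS.

/* Minimal user-space sketch (not kernel code): an anonymous private
 * mapping whose arguments arrive at do_mmap_pgoff via the mmap syscall. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
        size_t len = 4096;                      /* PAGE_ALIGNed by the kernel anyway */
        void *p = mmap(NULL,                    /* addr hint: let the kernel choose */
                       len,
                       PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS,     /* the file == NULL path */
                       -1, 0);                  /* fd and offset unused for anonymous maps */
        if (p == MAP_FAILED) {
                perror("mmap");
                return EXIT_FAILURE;
        }
        memset(p, 0, len);                      /* touching the pages faults them in */
        printf("mapped %zu bytes at %p\n", len, p);
        munmap(p, len);
        return 0;
}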
The get_unmapped_area function obtains the linear address interval for the new region:

/*
 * The parameters passed are the following:
 *
 * file   the file or device being mapped
 * addr   the requested address to map to
 * len    the length of the mapping
 * pgoff  the offset within the file being mapped
 * flags  protection flags
 */
// When a new area is to be memory mapped, a free region has to be found that is large enough to contain the new mapping.
/* Search the process address space for a usable linear address interval.
 * Depending on whether the interval is to be used for a file memory
 * mapping or for an anonymous memory mapping, the function invokes one
 * of two methods: the get_unmapped_area file operation, or the
 * get_unmapped_area method of the memory descriptor. */
unsigned long
get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
                unsigned long pgoff, unsigned long flags)
{
        unsigned long (*get_area)(struct file *, unsigned long,
                                  unsigned long, unsigned long, unsigned long);

        get_area = current->mm->get_unmapped_area;
        if (file && file->f_op && file->f_op->get_unmapped_area)
                get_area = file->f_op->get_unmapped_area;
        addr = get_area(file, addr, len, pgoff, flags); /* call the chosen method */
        if (IS_ERR_VALUE(addr))
                return addr;

        if (addr > TASK_SIZE - len)
                return -ENOMEM;
        if (addr & ~PAGE_MASK)
                return -EINVAL;

        /* On x86 (IA-32) this simply returns addr. */
        return arch_rebalance_pgtables(addr, len);
}
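The addr argument is only a hint unless MAP_FIXED is set: the chosen get_unmapped_area implementation may ignore it whenever the requested range cannot be used. A quick user-space illustration (a sketch, not kernel code; the hint value is arbitrary and may or may not be free):

/* User-space sketch: pass an address hint without MAP_FIXED and check
 * whether get_unmapped_area honoured it. */
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
        void *hint = (void *)0x40000000UL;      /* page aligned, purely illustrative */
        void *p = mmap(hint, 4096, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED) {
                perror("mmap");
                return 1;
        }
        printf("hint %p -> mapped at %p (%s)\n", hint, p,
               p == hint ? "hint honoured" : "kernel chose another address");
        munmap(p, 4096);
        return 0;
}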
Here we only follow the variant that is not backed by a file; the file-backed get_unmapped_area will be analyzed together with the filesystem code. The mm-level get_unmapped_area is installed by the following function:
/*
 * This function, called very early during the creation of a new
 * process VM image, sets up which VM layout function to use:
 */
void arch_pick_mmap_layout(struct mm_struct *mm)
{
        if (mmap_is_legacy()) {
                mm->mmap_base = mmap_legacy_base();
                mm->get_unmapped_area = arch_get_unmapped_area;
                mm->unmap_area = arch_unmap_area;
        } else {
                mm->mmap_base = mmap_base();
                mm->get_unmapped_area = arch_get_unmapped_area_topdown;
                mm->unmap_area = arch_unmap_area_topdown;
        }
}
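Which of the two strategies was picked can be guessed from user space: with the default top-down layout, new anonymous mappings are handed out at decreasing addresses below mm->mmap_base, while the legacy layout (selected, for instance, when /proc/sys/vm/legacy_va_layout is set to 1) allocates upward from TASK_UNMAPPED_BASE. The following rough check is only a heuristic and assumes nothing else maps memory between the two calls:

/* User-space sketch: compare the addresses of two fresh anonymous
 * mappings to guess whether the top-down or the legacy layout is in use. */
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
        void *a = mmap(NULL, 4096, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        void *b = mmap(NULL, 4096, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (a == MAP_FAILED || b == MAP_FAILED) {
                perror("mmap");
                return 1;
        }
        printf("first: %p, second: %p -> addresses %s\n", a, b,
               b < a ? "decreasing (top-down layout likely)"
                     : "increasing (legacy layout likely)");
        munmap(a, 4096);
        munmap(b, 4096);
        return 0;
}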
We look directly at arch_get_unmapped_area; the top-down variant works in a similar way.
unsigned long
arch_get_unmapped_area(struct file *filp, unsigned long addr,
                unsigned long len, unsigned long pgoff, unsigned long flags)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        unsigned long start_addr;

        if (len > TASK_SIZE)
                return -ENOMEM;

        if (flags & MAP_FIXED)
                return addr;

        if (addr) {
                addr = PAGE_ALIGN(addr);
                /* Look the hint address up in the existing address space. */
                vma = find_vma(mm, addr);
                /* The hint is usable if it lies below TASK_SIZE - len and
                 * either no VMA covers it, or [addr, addr+len) ends before
                 * the start of the VMA that find_vma returned. */
                if (TASK_SIZE - len >= addr &&
                    (!vma || addr + len <= vma->vm_start))
                        return addr;    /* use the hint as-is */
        }
        /* Getting here means addr was 0 or the hint could not be used. */
        /* cached_hole_size is the largest hole below free_area_cache, so
         * starting the search at free_area_cache skips holes that are
         * already known to be too small. */
        if (len > mm->cached_hole_size) {
                start_addr = addr = mm->free_area_cache;
        } else {
                /* Otherwise restart at TASK_UNMAPPED_BASE, which on x86 sits
                 * at one third of the user address space. */
                start_addr = addr = TASK_UNMAPPED_BASE;
                mm->cached_hole_size = 0;
        }

full_search:
        /* Walk the VMAs one by one, starting from addr. */
        for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
                /* At this point:  (!vma || addr < vma->vm_end). */
                if (TASK_SIZE - len < addr) {
                        /*
                         * Start a new search - just in case we missed
                         * some holes.
                         */
                        if (start_addr != TASK_UNMAPPED_BASE) {
                                addr = TASK_UNMAPPED_BASE;
                                start_addr = addr;
                                mm->cached_hole_size = 0;
                                goto full_search;
                        }
                        return -ENOMEM;
                }
                /* Found an unmapped hole that is large enough. */
                if (!vma || addr + len <= vma->vm_start) {
                        /*
                         * Remember the place where we stopped the search:
                         */
                        mm->free_area_cache = addr + len;
                        return addr;
                }
                /* Update cached_hole_size: the search runs from low to high
                 * addresses, so every hole skipped here was too small for
                 * this request, and cached_hole_size records the largest
                 * such hole; this pairs with the free_area_cache update
                 * above. */
                if (addr + mm->cached_hole_size < vma->vm_start)
                        mm->cached_hole_size = vma->vm_start - addr;
                addr = vma->vm_end;     /* continue right after this VMA */
        }
}
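The loop above is a first-fit scan over the sorted VMA list that remembers where the last search stopped (free_area_cache) and the largest hole skipped so far (cached_hole_size). The stand-alone sketch below simulates the same first-fit idea over a toy array of [start, end) intervals; struct interval, find_gap and the addresses are invented purely for illustration.

/* Stand-alone simulation (not kernel code) of the first-fit hole search
 * that arch_get_unmapped_area performs over the sorted VMA list. */
#include <stdio.h>

struct interval { unsigned long start, end; };          /* a toy "VMA" */

static unsigned long find_gap(const struct interval *v, int n,
                              unsigned long base, unsigned long limit,
                              unsigned long len)
{
        unsigned long addr = base;

        for (int i = 0; i <= n; i++) {
                /* the hole ends at the next interval, or at the search limit */
                unsigned long hole_end = (i == n) ? limit : v[i].start;

                if (i < n && v[i].end <= addr)  /* interval lies entirely below addr */
                        continue;
                if (addr + len <= hole_end)     /* first hole that is large enough */
                        return addr;
                if (i < n)
                        addr = v[i].end;        /* skip past this occupied range */
        }
        return 0;                               /* no suitable hole found */
}

int main(void)
{
        /* toy layout: two mappings with a 0x3000-byte hole between them */
        struct interval vmas[] = { { 0x10000, 0x15000 }, { 0x18000, 0x20000 } };
        unsigned long addr = find_gap(vmas, 2, 0x10000, 0x40000, 0x2000);

        printf("first-fit hole for 0x2000 bytes: 0x%lx\n", addr);      /* 0x15000 */
        return 0;
}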
Back in do_mmap_pgoff, the real work is then done by mmap_region:

unsigned long mmap_region(struct file *file, unsigned long addr,
                        unsigned long len, unsigned long flags,
                        unsigned int vm_flags, unsigned long pgoff)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma, *prev;
        int correct_wcount = 0;
        int error;
        struct rb_node **rb_link, *rb_parent;
        unsigned long charged = 0;
        struct inode *inode = file ? file->f_path.dentry->d_inode : NULL;

        /* Clear old maps */
        error = -ENOMEM;
munmap_back:
        /* Locate the region that precedes the new interval, and the place
         * the new region will take in the red-black tree. */
        vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
        /* Check whether an existing region overlaps the new interval. */
        if (vma && vma->vm_start < addr + len) {
                if (do_munmap(mm, addr, len))   /* unmap the overlapping range */
                        return -ENOMEM;
                goto munmap_back;
        }

        /* Check against address space limit. */
        /* Make sure inserting the new region does not push the size of the
         * process address space over its limit. */
        if (!may_expand_vm(mm, len >> PAGE_SHIFT))
                return -ENOMEM;

        /*
         * Set 'VM_NORESERVE' if we should not account for the
         * memory use of this mapping.
         */
        if ((flags & MAP_NORESERVE)) {
                /* We honor MAP_NORESERVE if allowed to overcommit */
                if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
                        vm_flags |= VM_NORESERVE;

                /* hugetlb applies strict overcommit unless MAP_NORESERVE */
                if (file && is_file_hugepages(file))
                        vm_flags |= VM_NORESERVE;
        }

        /*
         * Private writable mapping: check memory availability
         */
        if (accountable_mapping(file, vm_flags)) {
                charged = len >> PAGE_SHIFT;
                if (security_vm_enough_memory(charged))
                        return -ENOMEM;
                vm_flags |= VM_ACCOUNT;
        }

        /*
         * Can we just expand an old mapping?
         */
        /* Check whether the new interval can be merged with a neighbouring region. */
        vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL);
        if (vma)        /* merge succeeded */
                goto out;
        /*
         * Determine the object being mapped and call the appropriate
         * specific mapper. the address has already been validated, but
         * not unmapped, but the maps are removed from the list.
         */
        /* Getting here means a new region must be created: allocate a
         * vm_area_struct for it. */
        vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
        if (!vma) {
                error = -ENOMEM;
                goto unacct_error;
        }

        /* Initialize the new region object. */
        vma->vm_mm = mm;
        vma->vm_start = addr;
        vma->vm_end = addr + len;
        vma->vm_flags = vm_flags;
        vma->vm_page_prot = vm_get_page_prot(vm_flags);
        vma->vm_pgoff = pgoff;

        if (file) {
                error = -EINVAL;
                if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
                        goto free_vma;
                if (vm_flags & VM_DENYWRITE) {
                        error = deny_write_access(file);
                        if (error)
                                goto free_vma;
                        correct_wcount = 1;
                }
                vma->vm_file = file;
                get_file(file);
                error = file->f_op->mmap(file, vma);
                if (error)
                        goto unmap_and_free_vma;
                if (vm_flags & VM_EXECUTABLE)
                        added_exe_file_vma(mm);

                /* Can addr have changed??
                 *
                 * Answer: Yes, several device drivers can do it in their
                 * f_op->mmap method. -DaveM
                 */
                addr = vma->vm_start;
                pgoff = vma->vm_pgoff;
                vm_flags = vma->vm_flags;

        } else if (vm_flags & VM_SHARED) {
                /* The interval is a shared anonymous region, used mainly for
                 * inter-process communication; set it up here. */
                error = shmem_zero_setup(vma);
                if (error)
                        goto free_vma;
        }

        if (vma_wants_writenotify(vma))
                vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED);

        /* Insert the new region into the process address space. */
        vma_link(mm, vma, prev, rb_link, rb_parent);
        file = vma->vm_file;

        /* Once vma denies write, undo our temporary denial count */
        if (correct_wcount)
                atomic_inc(&inode->i_writecount);
out:
        perf_event_mmap(vma);
        /* Update the total_vm counter. */
        mm->total_vm += len >> PAGE_SHIFT;
        vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
        if (vm_flags & VM_LOCKED) {
                /*
                 * makes pages present; downgrades, drops, reacquires mmap_sem
                 */
                /* Allocate all pages of the region right away and lock them
                 * into RAM. */
                long nr_pages = mlock_vma_pages_range(vma, addr, addr + len);
                if (nr_pages < 0)
                        return nr_pages;        /* vma gone! */
                mm->locked_vm += (len >> PAGE_SHIFT) - nr_pages;
        } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))
                /* Allocate all pages of the region right away. */
                make_pages_present(addr, addr + len);
        return addr;    /* return the address of the new region */

unmap_and_free_vma:
        if (correct_wcount)
                atomic_inc(&inode->i_writecount);
        vma->vm_file = NULL;
        fput(file);

        /* Undo any partial mapping done by a device driver. */
        unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
        charged = 0;
free_vma:
        kmem_cache_free(vm_area_cachep, vma);
unacct_error:
        if (charged)
                vm_unacct_memory(charged);
        return error;
}

This completes the allocation of a linear address interval. The work done, in order, is: find an unmapped linear interval in the process address space that matches the requested address and length; if that interval can be merged with an existing region of the process, merge it; otherwise allocate a new vm_area_struct and link it into the existing address space as a new region. Finally, if the mapping is locked or MAP_POPULATE was requested, physical pages are allocated for the interval right away; in every case the base address of the new region is returned.
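A visible side effect of the vma_merge step is that two anonymous mappings created back to back with identical protection and flags usually end up adjacent and are folded into a single VMA. The user-space sketch below merely demonstrates that behaviour; whether the merge really happens depends on where get_unmapped_area placed the two mappings.

/* User-space sketch: two adjacent anonymous mappings with identical flags
 * are normally merged into one VMA, visible as a single line in
 * /proc/self/maps. */
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>

int main(void)
{
        size_t len = 4096;
        char *a = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        char *b = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (a == MAP_FAILED || b == MAP_FAILED) {
                perror("mmap");
                return 1;
        }
        printf("a = %p, b = %p, %s\n", (void *)a, (void *)b,
               (b == a + len || a == b + len)
                       ? "adjacent: likely merged into one VMA"
                       : "not adjacent: two separate VMAs");
        /* When the merge happened, both pages sit inside a single anonymous
         * rw-p region in the output below. */
        system("cat /proc/self/maps");
        return 0;
}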