- 论坛徽章:
- 0
|
关于 big kernel lock,大内核锁
关于内核各人自有不同的解读方法,本人常用readelf, objdump等binutils中的工具,因对有些函数如 lock_kernel 的操作有很多编译参数的控制导致看不清其代码到底是什么,故将生成的vmlinux反汇编一下,于是一目了然: c02a57a0 <lock_kernel>:
c02a57a0: ba 00 e0 ff ff mov $0xffffe000,%edx
c02a57a5: 21 e2 and %esp,%edx
c02a57a7: 8b 02 mov (%edx),%eax
c02a57a9: 8b 48 14 mov 0x14(%eax),%ecx
c02a57ac: 41 inc %ecx
c02a57ad: 75 05 jne c02a57b4
c02a57af: ff 42 14 incl 0x14(%edx)
c02a57b2: 8b 02 mov (%edx),%eax
c02a57b4: 89 48 14 mov %ecx,0x14(%eax)
c02a57b7: c3 ret
c02a57b8: 90 nop
c02a57b9: 8d b4 26 00 00 00 00 lea 0x0(%esi,1),%esi
c02a57c0 <unlock_kernel>:
c02a57c0: ba 00 e0 ff ff mov $0xffffe000,%edx
c02a57c5: 21 e2 and %esp,%edx
c02a57c7: 8b 0a mov (%edx),%ecx
c02a57c9: 8b 41 14 mov 0x14(%ecx),%eax
c02a57cc: 85 c0 test %eax,%eax
c02a57ce: 78 19 js c02a57e9
c02a57d0: 48 dec %eax
c02a57d1: 85 c0 test %eax,%eax
c02a57d3: 89 41 14 mov %eax,0x14(%ecx)
c02a57d6: 79 0b jns c02a57e3
c02a57d8: ff 4a 14 decl 0x14(%edx)
c02a57db: 8b 42 08 mov 0x8(%edx),%eax
c02a57de: 83 e0 08 and $0x8,%eax
c02a57e1: 75 01 jne c02a57e4
c02a57e3: c3 ret
c02a57e4: e9 a7 ed ff ff jmp c02a4590
c02a57e9: 0f 0b ud2a
c02a57eb: c7 00 00 fb 2b c0 movl $0xc02bfb00,(%eax)
c02a57f1: eb dd jmp c02a57d0
栈指针 esp 中包含的是内核栈上的位置,而常数 $0xffffe000 实际上就是值为 ~(THREAD_SIZE - 1),因在内核代码运行时所用的栈区就是每个进程的内核栈,大小为THREAD_SIZE,并且是THREAD_SIZE对齐的,并正好放在一个struct thread_info上,因而将 esp 与 ~(THREAD_SIZE - 1) 的结果就是当前进程的 struct thread_info结构体指针,在结构体中包含struct task_struct指针,于是struct task_struct 结构体指针轻易取得。
从这两个结构定义中可以看出,
- struct thread_info 的偏移 0x14 处的是int preempt_count,代表当前进程在调度时是否可抢占:等于0为可抢占,大于0是不可抢占,小于0则是出BUG了
- struct task_struct 的偏移 0x14 处的是int lock_depth, 正是大内核锁的实现所定义的,从 kernel_locked() 的宏定义可以看出如果 lock_depth 大于或等于0 则是大内核锁锁上了,小于0则是出BUG了,在 include/linux/init_task.h 中对 init_task 初始化的值为 -1, 说明其正常的值为-1,大于或等于零则是内核上锁了。
根据 lock_kernel 反汇编码可以看出所谓大内核锁的上锁其实就是对当前进程的 lock_depth 加1,并判断加1后如果等于0则为第一次上锁,需对当前 struct thread_info 的 preempt_count 也加1阻止抢占,大内核锁的特点之一就是可以递归调用,也即可以重复上锁,只要解锁次数与上锁次数相同即可。实现方式就是 lock_depth 值累计上锁次数,但 preempt_count 值并不需累计,所以同一进程的第二次上锁时只对 lock_depth 累计而不对 preempt_count 累计。
解锁时在 unlock_kernel 中也是可看出:首先检查 lock_depth 小于0,小于0则出BUG了(此处以 ud2a 实现,关于内核中 BUG 函数以 ud2a 所实现代码的巧妙性可以另一文写出),说明有人在未上锁时先解锁了,肯定是代码有BUG了。然后对 lock_depth 减1操作并写回,如果减1后不为负数则不是递归上锁时的最后一次解锁,直接返回。而如果是负数则是最后一次解锁,这时递减preempt_count值,将恢复为可抢占状态,
大内核锁的特点就是可递归使用,在 schedule 调度过程中如果发现当前进程对 BKL 上锁则释放它,下一次调度时再重新获得它,使用这两个函数: c02a5780 <__reacquire_kernel_lock>:
c02a5780: b8 00 e0 ff ff mov $0xffffe000,%eax
c02a5785: 21 e0 and %esp,%eax
c02a5787: ff 40 14 incl 0x14(%eax)
c02a578a: 31 c0 xor %eax,%eax
c02a578c: c3 ret
c02a578d: 8d 76 00 lea 0x0(%esi),%esi
本次试验编译平台为 CONFIG_PREEMPT=y 而 CONFIG_PREEMPT_BKL not set,CONFIG_SMP not set 这是为桌面编译 2.6 内核的很典型配置(是啊,编译 2.6 内核如果不打开PREEMPT那还有何趣味?)。在 CONFIG_SMP 关闭情况下,spinlock 优化为无。
__reacquire_kernel_lock 其实就是很简单的 preempt_disable(),对 preempt_count 直接自增,并由 eax 返回0 c02a5790 <__release_kernel_lock>:
c02a5790: b8 00 e0 ff ff mov $0xffffe000,%eax
c02a5795: 21 e0 and %esp,%eax
c02a5797: ff 48 14 decl 0x14(%eax)
c02a579a: c3 ret
c02a579b: 90 nop
c02a579c: 8d 74 26 00 lea 0x0(%esi,1),%esi
相应 __release_kernel_lock 也就是对 preempt_count 直接自减。返回类型为 void
在 unlock_kernel 中 ud2a 指令后的 c02a57eb 处其实是两个数:word 类型的 0x007c 和一个指针: 0xc02bfb00: 在这个指针处查到代码如下,看起来汇编码很乱:仔细对映一下发现它就是字符串:"/usr/src/linux-2.6.14/lib/kernel_lock.c" c02bfaff: 00 2f add %ch,(%edi)
c02bfb01: 75 73 jne c02bfb76
c02bfb03: 72 2f jb c02bfb34
c02bfb05: 73 72 jae c02bfb79
c02bfb07: 63 2f arpl %bp,(%edi)
c02bfb09: 6c insb (%dx),%es:(%edi)
c02bfb0a: 69 6e 75 78 2d 32 2e imul $0x2e322d78,0x75(%esi),%ebp
c02bfb11: 36 2e 31 34 2f xor %esi,%cs:%ss:(%edi,%ebp,1)
c02bfb16: 6c insb (%dx),%es:(%edi)
c02bfb17: 69 62 2f 6b 65 72 6e imul $0x6e72656b,0x2f(%edx),%esp
c02bfb1e: 65 gs
c02bfb1f: 6c insb (%dx),%es:(%edi)
c02bfb20: 5f pop %edi
c02bfb21: 6c insb (%dx),%es:(%edi)
c02bfb22: 6f outsl %ds:(%esi),(%dx)
c02bfb23: 63 6b 2e arpl %bp,0x2e(%ebx)
c02bfb26: 63 00 arpl %ax,(%eax)
在 unlock_kernel 中同时还调用了一个 preempt_schedule ,即在判断 need_schedule 时,如果需要重调度则跳转至此,给出汇编码和 C 代码: c02a4590 <preempt_schedule>:
c02a4590: 55 push %ebp
c02a4591: b8 00 e0 ff ff mov $0xffffe000,%eax
c02a4596: 89 e5 mov %esp,%ebp
c02a4598: 21 e0 and %esp,%eax
c02a459a: 53 push %ebx
c02a459b: 8b 48 14 mov 0x14(%eax),%ecx
c02a459e: 31 d2 xor %edx,%edx
c02a45a0: 85 c9 test %ecx,%ecx
c02a45a2: 75 0d jne c02a45b1
c02a45a4: 9c pushf
c02a45a5: 58 pop %eax
c02a45a6: c1 e8 09 shr $0x9,%eax
c02a45a9: 83 f0 01 xor $0x1,%eax
c02a45ac: 83 e0 01 and $0x1,%eax
c02a45af: 74 05 je c02a45b6
c02a45b1: ba 01 00 00 00 mov $0x1,%edx
c02a45b6: 85 d2 test %edx,%edx
c02a45b8: 75 22 jne c02a45dc
c02a45ba: bb 00 e0 ff ff mov $0xffffe000,%ebx
c02a45bf: 21 e3 and %esp,%ebx
c02a45c1: 81 43 14 00 00 00 10 addl $0x10000000,0x14(%ebx)
c02a45c8: e8 b3 f9 ff ff call c02a3f80
c02a45cd: 81 6b 14 00 00 00 10 subl $0x10000000,0x14(%ebx)
c02a45d4: 8b 43 08 mov 0x8(%ebx),%eax
c02a45d7: 83 e0 08 and $0x8,%eax
c02a45da: 75 de jne c02a45ba
c02a45dc: 5b pop %ebx
c02a45dd: 5d pop %ebp
c02a45de: c3 ret
c02a45df: 90 nop
/*
 * this is the entry point to schedule() from in-kernel preemption
 * off of preempt_enable. Kernel preemptions off return from interrupt
 * occur there and call schedule directly.
 */
asmlinkage void __sched preempt_schedule(void)
{
struct thread_info *ti = current_thread_info();
/*
 * If there is a non-zero preempt_count or interrupts are disabled,
 * we do not want to preempt the current task. Just return..
 */
if (unlikely(ti->preempt_count || irqs_disabled()))
return;
need_resched:
/* PREEMPT_ACTIVE (0x10000000) marks this as an in-kernel preemption,
 * changing how schedule() treats the current (still-runnable) task */
add_preempt_count(PREEMPT_ACTIVE);
schedule();
sub_preempt_count(PREEMPT_ACTIVE);
/* we could miss a preemption opportunity between schedule and now */
barrier();
/* re-check: a new reschedule request may have arrived meanwhile */
if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
goto need_resched;
}
在 schedule() 之前给 preempt_count 加上了 PREEMPT_ACTIVE(值为0x10000000),以改变调度时对当前进程的某些策略。
有关代码: /* Attach to any functions which should be ignored in wchan output. */
#define __sched __attribute__((__section__(".sched.text")))
#define THREAD_SIZE (8192)
#define STACK_WARN (THREAD_SIZE/8)
#define TIF_NEED_RESCHED 3 /* rescheduling necessary */
#define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED)
#define preempt_count() (current_thread_info()->preempt_count)
#define inc_preempt_count() add_preempt_count(1)
#define dec_preempt_count() sub_preempt_count(1)
/* Disable kernel preemption by bumping preempt_count; barrier() keeps the
 * compiler from moving accesses across the critical-section boundary. */
#define preempt_disable() \
do { \
inc_preempt_count(); \
barrier(); \
} while (0)
/* Re-enable preemption WITHOUT checking for a pending reschedule
 * (the plain preempt_enable() variant would also test need_resched). */
#define preempt_enable_no_resched() \
do { \
barrier(); \
dec_preempt_count(); \
} while (0)
/* how to get the thread information struct from C */
static inline struct thread_info *current_thread_info(void)
{
struct thread_info *ti;
/* The thread_info lives at the base of the THREAD_SIZE-aligned kernel
 * stack, so masking %esp with ~(THREAD_SIZE - 1) (0xffffe000 for 8KB
 * stacks) yields its address — same trick visible in the disassembly. */
__asm__("andl %%esp,%0; ":"=r" (ti) : "0" (~(THREAD_SIZE - 1)));
return ti;
}
#define __lockfunc fastcall __attribute__((section(".spinlock.text")))
#define kernel_locked() (current->lock_depth >= 0)
#define irqs_disabled() \
({ \
unsigned long flags; \
local_save_flags(flags); \
!(flags & (1<<9)); \
})
/* struct thread_info 中偏移 0x14 处的字段:
 *   int preempt_count;   0 => preemptable, <0 => BUG */
struct task_struct {
volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
struct thread_info *thread_info;
atomic_t usage;
unsigned long flags; /* per process flags, defined below */
unsigned long ptrace;
int lock_depth; /* BKL lock depth */
/*
* INIT_TASK is used to set up the first task table, touch at
* your own risk!. Base=0, limit=0x1fffff (=2MB)
*/
#define INIT_TASK(tsk) \
{ \
.state = 0, \
.thread_info = &init_thread_info, \
.usage = ATOMIC_INIT(2), \
.flags = 0, \
.lock_depth = -1, \
#define INIT_THREAD_INFO(tsk) \
{ \
.task = &tsk, \
.exec_domain = &default_exec_domain, \
.flags = 0, \
.cpu = 0, \
.preempt_count = 1, \
.addr_limit = KERNEL_DS, \
.restart_block = { \
.fn = do_no_restart_syscall, \
}, \
}
include/linux/spinlock.h
/*
* Must define these before including other files, inline functions need them
*/
#define LOCK_SECTION_NAME \
".text.lock." __stringify(KBUILD_BASENAME)
#define LOCK_SECTION_START(extra) \
".subsection 1\n\t" \
extra \
".ifndef " LOCK_SECTION_NAME "\n\t" \
LOCK_SECTION_NAME ":\n\t" \
".endif\n"
#define LOCK_SECTION_END \
".previous\n\t"
#define __lockfunc fastcall __attribute__((section(".spinlock.text")))
include/linux/compiler.h
# define __acquires(x) __attribute__((context(0,1)))
# define __releases(x) __attribute__((context(1,0)))
extern void __lockfunc lock_kernel(void) __acquires(kernel_lock);
extern void __lockfunc unlock_kernel(void) __releases(kernel_lock);
/*
* Generic compiler-dependent macros required for kernel
* build go below this comment. Actual compiler/compiler version
* specific implementations come from the above header files
*/
#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)
lib/kernel_lock.c
/*
 * Getting the big kernel lock.
 *
 * This cannot happen asynchronously, so we only need to
 * worry about other CPU's.
 */
void __lockfunc lock_kernel(void)
{
int depth = current->lock_depth+1;
/* lock_depth starts at -1, so depth == 0 means this is the outermost
 * (non-recursive) acquisition — only then take the underlying lock */
if (likely(!depth))
__lock_kernel();
/* recursive acquisitions just bump the per-task depth counter */
current->lock_depth = depth;
}
void __lockfunc unlock_kernel(void)
{
BUG_ON(current->lock_depth lock_depth 1) {
_raw_spin_lock(&kernel_flag);
return;
}
/*
* Otherwise, let's wait for the kernel lock
* with preemption enabled..
*/
do {
preempt_enable();
while (spin_is_locked(&kernel_flag))
cpu_relax();
preempt_disable();
} while (!_raw_spin_trylock(&kernel_flag));
}
}
/* Release the BKL spinlock (per the article's config, with CONFIG_SMP
 * unset the spinlock ops optimize away to nothing). */
static inline void __unlock_kernel(void)
{
spin_unlock(&kernel_flag);
}
本文来自ChinaUnix博客,如果查看原文请点:http://blog.chinaunix.net/u3/93615/showart_1907865.html |
|