论坛徽章:: 0

电梯直达

1楼 [收藏(0)] [报告]

发表于 2008-02-25 15:29 |只看该作者 |倒序浏览

1 下面是我注释过的代码，希望对大家有用，另外大家在使用时注意遵守GPL许可。
2 由于本人水平有限，注释难免出错，有问题欢迎讨论sirouni@yahoo.com.cn或者直接向Xen的邮件列表提问。
3 如果有时间我会整理一篇短文。
4 原本想把整个shadow.c的注释发上来，但是由于CU博客的限制，只能发成连载。谢谢大家的关注
/**************************************************************************/
/* Entry points into the shadow code */
/* 整个shadow code 的入口:
* sh_page_fault():
* sh_invlpg():
*/
/* Called from pagefault handler in Xen, and from the HVM trap handlers
 * for pagefaults.  Returns 1 if this fault was an artefact of the
 * shadow code (and the guest should retry) or 0 if it is not (and the
 * fault should be handled elsewhere or passed to the guest).
 *
 * Parameters:
 *   v    - the vcpu that took the page fault.
 *   va   - the faulting virtual address (logged as cr2 below).
 *   regs - register state at the time of the fault.  regs->error_code is
 *          the #PF error code; on the fast "guest not present" path it is
 *          rewritten in place so the guest sees a plain not-present fault.
 *
 * Return value as written in the code: EXCRET_fault_fixed when the fault
 * was handled here (including the MMIO paths), 0 otherwise.
 *
 * Outline of the work done here:
 *   1) Walk every level of the guest page tables for va.
 *   2) Use the error code and the accumulated guest flags to decide
 *      whether this #PF should be handled by the shadow code at all.
 *   3) Emulate the faulting instruction when it touches a guest page table.
 *
 * TODO (annotator): take a closer look at the MMIO handling.
 */
static int sh_page_fault(struct vcpu *v,
                         unsigned long va,
                         struct cpu_user_regs *regs)
{
    struct domain *d = v->domain;
    walk_t gw;
    u32 accumulated_gflags;
    gfn_t gfn;
    mfn_t gmfn, sl1mfn=_mfn(0);
    shadow_l1e_t sl1e, *ptr_sl1e;
    paddr_t gpa;
    struct sh_emulate_ctxt emul_ctxt;
    struct x86_emulate_ops *emul_ops;
    int r, mmio;
    fetch_type_t ft = 0;

    SHADOW_PRINTK("d:v=%u:%u va=%#lx err=%u\n",
                  v->domain->domain_id, v->vcpu_id, va, regs->error_code);

    perfc_incr(shadow_fault);
    //
    // XXX: Need to think about eventually mapping superpages directly in the
    //    shadow (when possible), as opposed to splintering them into a
    //    bunch of 4K maps.
    //
    // In other words: superpages are currently splintered into 4K shadow
    // mappings; eventually they should be mapped directly in the shadow.

    /* Fast path, only built for shadows with more than 2 paging levels:
     * the shadow code tags its special l1 entries with reserved PTE bits,
     * so faults on them arrive with PFEC_reserved_bit set in the error
     * code. */
#if (SHADOW_OPTIMIZATIONS & SHOPT_FAST_FAULT_PATH) && SHADOW_PAGING_LEVELS > 2
    if ( (regs->error_code & PFEC_reserved_bit) )
    {
        /* The only reasons for reserved bits to be set in shadow entries
         * are the two "magic" shadow_l1e entries. */
        if ( likely((__copy_from_user(&sl1e,
                                      (sh_linear_l1_table(v)
                                       + shadow_l1_linear_offset(va)),
                                      sizeof(sl1e)) == 0)
                    && sh_l1e_is_magic(sl1e)) )
        {
            if ( sh_l1e_is_gnp(sl1e) )
            {
                if ( likely(!is_hvm_domain(d) ||
                            paging_vcpu_mode_translate(v)) )
                {
                    /* Non-HVM guest, or HVM guest for which
                     * paging_vcpu_mode_translate() is true. */
                    /* Not-present in a guest PT: pass to the guest as
                     * a not-present fault (by flipping two bits). */
                    ASSERT(regs->error_code & PFEC_page_present);
                    regs->error_code ^= (PFEC_reserved_bit|PFEC_page_present);
                    reset_early_unshadow(v);
                    perfc_incr(shadow_fault_fast_gnp);
                    SHADOW_PRINTK("fast path not-present\n");
                    return 0; /* not present in the guest: not our fault */
                }
                else
                {
                    /* Not-present in the P2M: MMIO */
                    /* NOTE(review): the annotator left an open question
                     * here ("why?") -- the reasoning is not visible in
                     * this function. */
                    gpa = va;
                }
            }
            else
            {
                /* Magic MMIO marker: extract gfn for MMIO address */
                ASSERT(sh_l1e_is_mmio(sl1e));
                /* Physical address = (gfn << PAGE_SHIFT) | offset of va
                 * within its page. */
                gpa = (((paddr_t)(gfn_x(sh_l1e_mmio_get_gfn(sl1e))))
                       << PAGE_SHIFT)
                    | (va & ~PAGE_MASK);
            }
            perfc_incr(shadow_fault_fast_mmio);
            SHADOW_PRINTK("fast path mmio %#"PRIpaddr"\n", gpa);
            reset_early_unshadow(v);
            handle_mmio(gpa);
            return EXCRET_fault_fixed;
        }
        else
        {
            /* This should be exceptionally rare: another vcpu has fixed
             * the tables between the fault and our reading the l1e.
             * Retry and let the hardware give us the right fault next time. */
            perfc_incr(shadow_fault_fast_fail);
            SHADOW_PRINTK("fast path false alarm!\n");
            return EXCRET_fault_fixed;
        }
    }
#endif /* SHOPT_FAST_FAULT_PATH */

    /* Detect if this page fault happened while we were already in Xen
     * doing a shadow operation.  If that happens, the only thing we can
     * do is let Xen's normal fault handlers try to fix it.  In any case,
     * a diagnostic trace of the fault will be more useful than
     * a BUG() when we try to take the lock again. */
    if ( unlikely(shadow_locked_by_me(d)) )
    {
        SHADOW_ERROR("Recursive shadow fault: lock was taken by %s\n",
                     d->arch.paging.shadow.locker_function);
        return 0;
    }

    shadow_lock(d);

    shadow_audit_tables(v);

    /* Walk the guest page-table hierarchy for va.
     * NOTE(review): per the annotator, the walk also makes the guest page
     * tables it visits read-only -- TODO confirm against
     * guest_walk_tables(). */
    if ( guest_walk_tables(v, va, &gw, 1) != 0 )
    {
        SHADOW_PRINTK("malformed guest pagetable!");
        print_gw(&gw);
    }

    /* Annotator's note: this does not duplicate shadow_audit_tables()
     * above; the two audits are said to be controlled by different
     * compile-time switches -- TODO confirm. */
    sh_audit_gw(v, &gw);

    // First check whether the #PF was caused by the guest itself.
    //
    // We do not look at the gw->l1e, as that will not exist for superpages.
    // Instead, we use the gw->eff_l1e...
    //
    // We need not check all the levels of the guest page table entries for
    // present vs not-present, as the eff_l1e will always be not present if
    // one of the higher level entries is not present.
    // (Annotator's note: gw is zero-filled with memset() when the walk is
    // initialised, which is what makes eff_l1e read as not-present.)
    if ( unlikely(!(guest_l1e_get_flags(gw.eff_l1e) & _PAGE_PRESENT)) )
    {
        /* The guest l2e/l1e is marked not-present.  (Annotator's note: if
         * the l2e is absent, guest_walk_tables() leaves gw.eff_l1e == 0,
         * so the _PAGE_PRESENT test above is 0 as well.) */
        if ( is_hvm_domain(d) && !paging_vcpu_mode_translate(v) )
        {
            /* HVM domain for which paging_vcpu_mode_translate() is false
             * (annotator: guest paging not enabled, so the P2M is used
             * and the entry is not present there). */
            /* Not present in p2m map, means this is mmio */
            gpa = va;
            goto mmio;
        }

        perfc_incr(shadow_fault_bail_not_present);
        goto not_a_shadow_fault; /* guest's own PTE is not present */
    }

    // All levels of the guest page table are now known to be present.
    accumulated_gflags = accumulate_guest_flags(v, &gw);

    // Check for attempts to access supervisor-only pages from user mode,
    // i.e. ring 3.  Such errors are not caused or dealt with by the shadow
    // code.
    //
    if ( (regs->error_code & PFEC_user_mode) &&
         !(accumulated_gflags & _PAGE_USER) )
    {
        /* illegal user-mode access to supervisor-only page */
        perfc_incr(shadow_fault_bail_user_supervisor);
        goto not_a_shadow_fault;
    }

    // Was it a write fault?
    ft = ((regs->error_code & PFEC_write_access)
          ? ft_demand_write : ft_demand_read);
    if ( ft == ft_demand_write )
    {
        if ( unlikely(!(accumulated_gflags & _PAGE_RW)) )
        {
            /* The guest tried to write through a read-only mapping. */
            perfc_incr(shadow_fault_bail_ro_mapping);
            goto not_a_shadow_fault;
        }
    }
    else // must have been either an insn fetch or read fault
    {
        // Check for NX bit violations: attempts to execute code that is
        // marked "do not execute".  Such errors are not caused or dealt with
        // by the shadow code.
        //
        if ( regs->error_code & PFEC_insn_fetch )
        {
            if ( accumulated_gflags & _PAGE_NX_BIT )
            {
                /* NX prevented this code fetch */
                perfc_incr(shadow_fault_bail_nx);
                goto not_a_shadow_fault;
            }
        }
    }

    // None of the guest-side checks above fired, so from here on the #PF
    // is treated as one caused by the shadow.

    /* A gfn is just an abstraction introduced by address-space
     * virtualisation. */
    /* What mfn is the guest trying to access? */
    gfn = guest_l1e_get_gfn(gw.eff_l1e);
    gmfn = vcpu_gfn_to_mfn(v, gfn); /* annotator: looked up via the P2M */
    mmio = (is_hvm_domain(d)
            && paging_vcpu_mode_translate(v)
            && mmio_space(gfn_to_paddr(gfn)));

    if ( !mmio && !mfn_valid(gmfn) )
    {
        /* Not MMIO and the gfn has no valid mfn behind it. */
        perfc_incr(shadow_fault_bail_bad_gfn);
        SHADOW_PRINTK("BAD gfn=%"SH_PRI_gfn" gmfn=%"PRI_mfn"\n",
                      gfn_x(gfn), mfn_x(gmfn));
        goto not_a_shadow_fault;
    }

    /* Make sure there is enough free shadow memory to build a chain of
     * shadow tables: one SHADOW_MAX_ORDER chunk will always be enough
     * to allocate all we need.  (We never allocate a top-level shadow
     * on this path, only a 32b l1, pae l2+1 or 64b l3+2+1) */
    /* Annotator's note: top-level shadows are instead allocated when the
     * guest updates cr3 or changes paging mode. */
    shadow_prealloc(d, SHADOW_MAX_ORDER);

    /* Acquire the shadow.  This must happen before we figure out the rights
     * for the shadow entry, since we might promote a page here. */
    /* Get (or create) the shadow entry for this address.  Address
     * translation always ends by taking the physical base from an l1e and
     * adding the page offset, so what we need in the end is a pointer to
     * the shadow l1e. */
    ptr_sl1e = shadow_get_and_create_l1e(v, &gw, &sl1mfn, ft);
    if ( unlikely(ptr_sl1e == NULL) )
    {
        /* Couldn't get the sl1e!  Since we know the guest entries
         * are OK, this can only have been caused by a failed
         * shadow_set_l*e(), which will have crashed the guest.
         * Get out of the fault handler immediately. */
        ASSERT(d->is_shutting_down);
        unmap_walk(v, &gw);
        shadow_unlock(d);
        return 0;
    }

    /* Calculate the shadow entry and write it */
    l1e_propagate_from_guest(v, (gw.l1e) ? gw.l1e : &gw.eff_l1e, gw.l1mfn,
                             gmfn, &sl1e, ft, mmio);
    r = shadow_set_l1e(v, ptr_sl1e, sl1e, sl1mfn);

#if SHADOW_OPTIMIZATIONS & SHOPT_PREFETCH
    /* Prefetch some more shadow entries */
    sh_prefetch(v, &gw, ptr_sl1e, sl1mfn);
#endif

    /* Need to emulate accesses to page tables */
    if ( sh_mfn_is_a_page_table(gmfn) )
    {
        if ( ft == ft_demand_write )
        {
            perfc_incr(shadow_fault_emulate_write);
            goto emulate;
        }
        else if ( shadow_mode_trap_reads(d) && ft == ft_demand_read )
        {
            /* NOTE(review): the annotator states that Xen does not
             * currently support emulating page-table reads -- TODO
             * confirm; this branch is only taken when
             * shadow_mode_trap_reads(d) is true. */
            perfc_incr(shadow_fault_emulate_read);
            goto emulate;
        }
    }

    if ( mmio )
    {
        gpa = guest_walk_to_gpa(&gw);
        goto mmio;
    }

    perfc_incr(shadow_fault_fixed);
    d->arch.paging.shadow.fault_count++;
    reset_early_unshadow(v);

 done:
    sh_audit_gw(v, &gw);
    unmap_walk(v, &gw);
    SHADOW_PRINTK("fixed\n");
    shadow_audit_tables(v);
    shadow_unlock(d);
    return EXCRET_fault_fixed; /* fault fixed: the guest retries the access */

 emulate:
    /* Emulate the access to the guest page table. */
    if ( !shadow_mode_refcounts(d) || !guest_mode(regs) )
        goto not_a_shadow_fault;

    if ( is_hvm_domain(d) )
    {
        /*
         * If we are in the middle of injecting an exception or interrupt then
         * we should not emulate: it is not the instruction at %eip that caused
         * the fault. Furthermore it is almost certainly the case the handler
         * stack is currently considered to be a page table, so we should
         * unshadow the faulting page before exiting.
         */
        if ( unlikely(hvm_event_injection_faulted(v)) )
        {
            gdprintk(XENLOG_DEBUG, "write to pagetable during event "
                     "injection: cr2=%#lx, mfn=%#lx\n",
                     va, mfn_x(gmfn));
            sh_remove_shadows(v, gmfn, 0 /* thorough */, 1 /* must succeed */);
            goto done;
        }

        /* Save the guest registers for use by the emulation below. */
        hvm_store_cpu_guest_regs(v, regs, NULL);
    }

    SHADOW_PRINTK("emulate: eip=%#lx esp=%#lx\n",
                  (unsigned long)regs->eip, (unsigned long)regs->esp);

    emul_ops = shadow_init_emulation(&emul_ctxt, regs);

    /*
     * We do not emulate user writes. Instead we use them as a hint that the
     * page is no longer a page table. This behaviour differs from native, but
     * it seems very unlikely that any OS grants user access to page tables.
     */
    r = X86EMUL_UNHANDLEABLE;
    if ( !(regs->error_code & PFEC_user_mode) )
        /* Emulate the faulting instruction.  (Annotator's note:
         * x86_emulate() itself has not been reviewed yet.) */
        r = x86_emulate(&emul_ctxt.ctxt, emul_ops);

    /*
     * NB. We do not unshadow on X86EMUL_EXCEPTION. It's not clear that it
     * would be a good unshadow hint. If we *do* decide to unshadow-on-fault
     * then it must be 'failable': we cannot require the unshadow to succeed.
     */
    if ( r == X86EMUL_UNHANDLEABLE )
    {
        SHADOW_PRINTK("emulator failure, unshadowing mfn %#lx\n",
                      mfn_x(gmfn));
        perfc_incr(shadow_fault_emulate_failed);
        /* If this is actually a page table, then we have a bug, and need
         * to support more operations in the emulator.  More likely,
         * though, this is a hint that this page should not be shadowed. */
        sh_remove_shadows(v, gmfn, 0 /* thorough */, 1 /* must succeed */);
    }

    /* Emulator has changed the user registers: write back */
    if ( is_hvm_domain(d) )
        hvm_load_cpu_guest_regs(v, regs);
    goto done;

 mmio:
    if ( !guest_mode(regs) )
        goto not_a_shadow_fault;
    perfc_incr(shadow_fault_mmio);
    sh_audit_gw(v, &gw);
    unmap_walk(v, &gw);
    SHADOW_PRINTK("mmio %#"PRIpaddr"\n", gpa);
    shadow_audit_tables(v);
    reset_early_unshadow(v);
    /* The shadow lock is released before the MMIO access is handled. */
    shadow_unlock(d);
    handle_mmio(gpa);
    return EXCRET_fault_fixed;

    /* The #PF was not caused by the shadow code. */
 not_a_shadow_fault:
    sh_audit_gw(v, &gw);
    unmap_walk(v, &gw);
    SHADOW_PRINTK("not a shadow fault\n");
    shadow_audit_tables(v);
    reset_early_unshadow(v);
    shadow_unlock(d);
    return 0;
}

本文来自ChinaUnix博客，如果查看原文请点：http://blog.chinaunix.net/u/7949/showart_483353.html

文库|博客

返回列表

Chinaunix › 论坛 › 操作系统 › Linux新手园地 › Linux文档专区 › Xen shadow.c 中 sh_page_fault() 注释

Xen shadow.c 中 sh_page_fault() 注释 [复制链接]

浏览过的版块