123 / 3 页下一页

linux内核分析（转自某位大哥网上的笔记） [复制链接]

nnfirst

丰衣足食

论坛徽章:: 0

11楼 [报告]

发表于 2003-04-21 23:23 |只看该作者

linux内核分析（转自某位大哥网上的笔记）

ding

实战分享：从技术角度谈机器学习入门| 【大话IT】RadonDB低门槛向MySQL集群下战书 | ChinaUnix打赏功能已上线！ | 新一代分布式关系型数据库RadonDB知多少？

lwj2003429

稍有积蓄

论坛徽章:: 0

12楼 [报告]

发表于 2003-04-23 10:07 |只看该作者

linux内核分析（转自某位大哥网上的笔记）

顶，建议进精华！！

实战分享：从技术角度谈机器学习入门| 【大话IT】RadonDB低门槛向MySQL集群下战书 | ChinaUnix打赏功能已上线！ | 新一代分布式关系型数据库RadonDB知多少？

aero

版主

论坛徽章:: 1

13楼 [报告]

发表于 2003-04-23 10:30 |只看该作者

linux内核分析（转自某位大哥网上的笔记）

举。

实战分享：从技术角度谈机器学习入门| 【大话IT】RadonDB低门槛向MySQL集群下战书 | ChinaUnix打赏功能已上线！ | 新一代分布式关系型数据库RadonDB知多少？

nnfirst

丰衣足食

论坛徽章:: 0

14楼 [报告]

发表于 2003-04-23 11:41 |只看该作者

linux内核分析（转自某位大哥网上的笔记）

gool

实战分享：从技术角度谈机器学习入门| 【大话IT】RadonDB低门槛向MySQL集群下战书 | ChinaUnix打赏功能已上线！ | 新一代分布式关系型数据库RadonDB知多少？

自由狼-台风0

丰衣足食

论坛徽章:: 0

15楼 [报告]

发表于 2003-04-23 18:55 |只看该作者

linux内核分析（转自某位大哥网上的笔记）

高！

实战分享：从技术角度谈机器学习入门| 【大话IT】RadonDB低门槛向MySQL集群下战书 | ChinaUnix打赏功能已上线！ | 新一代分布式关系型数据库RadonDB知多少？

轩辕砍刀

荣誉版主

论坛徽章:: 0

16楼 [报告]

发表于 2003-04-23 19:59 |只看该作者

linux内核分析（转自某位大哥网上的笔记）

}
            if (--cx == 0)
                     return TIMEOUT;
      }
      *pByte |= (c0 << 1) & 0xf0;
      data_write(0x00); /* send ACK */
      return OK;
}

为了能够在setup.S下收字符，特将字符接收子程序改为AT&T汇编语法（也没有什么好办法，在DOS下用TURBO C 2.0将上述代码编译成汇编代码，然后手工转换成AT&T格式，据说有程序可以自动进行这样的转换，有谁用过请指教）：

# rcvbyte: read one byte from the parallel port as two 4-bit nibbles.
# AT&T-syntax translation of the C receive routine shown earlier in the article.
#   4(%bp)  : near pointer to the byte that receives the result
#   returns : %ax = 0 once both nibbles have arrived, %ax = 1 when a wait
#             loop uses up its retry counter (presumably the C TIMEOUT value)
# Frame:  -2(%bp):-4(%bp)  32-bit retry counter (high word : low word)
#         -6(%bp)          status byte as first read
#         -5(%bp)          second read, used to confirm the first
# Ports:  889 = 0x379 is read, 888 = 0x378 is written (presumably the LPT1
#         status and data registers - confirm against the hardware setup).
rcvbyte:
      pushw       %bp
      movw       %sp, %bp
      subw       $6, %sp
      # Retry counter = 0x01FF:0xFC00 (511 in the high word, -1024 == 0xFC00 low).
      movw       $511, -2(%bp)
      movw       $-1024, -4(%bp)
      jmp       .L13
.L15:
      # First nibble: poll port 889 until bit 7 (mask 128) reads as 0 ...
      movw       $889, %dx
      inb       %dx, %al
      movb       %al, -6(%bp)
      testb       $128, -6(%bp)
      jne       .L16
      # ... and accept the value only when an immediate re-read matches it.
      inb       %dx, %al
      movb       %al, -5(%bp)
      movb       -6(%bp), %al
      cmpb       -5(%bp), %al
      jne       .L17
      jmp       .L14
.L17:
.L16:
      # 32-bit decrement of the retry counter (sub / sbb carries into the high word).
      subw       $1, -4(%bp)
      sbbw       $0, -2(%bp)
      movw       -2(%bp), %dx
      movw       -4(%bp), %ax
      orw       %ax, %dx
      jne       .L18
      # Counter reached zero: give up and return 1.
      movw       $1, %ax
      jmp       .L12
.L18:
.L13:
      jmp       .L15
.L14:
      # Low nibble = (status >> 3) & 0x0F, stored through the caller's pointer.
      movb       -6(%bp), %al
      shrb       $1, %al
      shrb       $1, %al
      shrb       $1, %al
      andb       $15, %al
      movw       4(%bp), %bx
      movb       %al, (%bx)
      # Write 16 (0x10) to port 888; presumably this tells the sender that the
      # first nibble was taken - confirm against the sending routine.
      movb       $16, %al
      movw       $888, %dx
      outb       %al, %dx
      # Re-arm the retry counter for the second nibble.
      movw       $511, -2(%bp)
      movw       $-1024, -4(%bp)
      jmp       .L19
.L21:
      # Second nibble: poll port 889 until bit 7 reads as 1 (opposite polarity
      # to the first wait), again confirmed by a matching re-read.
      movw       $889, %dx
      inb       %dx, %al
      movb       %al, -6(%bp)
      testb       $128, %al
      je       .L22
      inb       %dx, %al
      movb       %al, -5(%bp)
      movb       -6(%bp), %al
      cmpb       -5(%bp), %al
      jne       .L23
      jmp       .L20
.L23:
.L22:
      subw       $1, -4(%bp)
      sbbw       $0, -2(%bp)
      movw       -2(%bp), %dx
      movw       -4(%bp), %ax
      orw       %ax, %dx
      jne       .L24
      movw       $1, %ax
      jmp       .L12
.L24:
.L19:
      jmp       .L21
.L20:
      # High nibble = (status << 1) & 0xF0, OR-ed into the stored low nibble.
      movb       -6(%bp), %al
      shlb       $1, %al
      andb       $240, %al
      movw       4(%bp), %bx
      orb       %al, (%bx)
      # Write 0 to port 888 (the C version labels this "send ACK"); %ax = 0
      # doubles as the success return value.
      xorw       %ax, %ax
      movw       $888, %dx
      outb       %al, %dx
      jmp       .L12
.L12:
      # Common exit: drop the locals and restore the caller's frame pointer.
      movw       %bp, %sp
      popw       %bp
      ret

能够收发字符还不行，作为协议，总得知道数据的起始和结束，也应该进行简单的检错。这里采用字符填充方式进行数据包编码：用'\'表示转义字符，数据包头用\H表示，数据包结束用\T表示；如果数据中有'\'，则用\\表示（从printf的格式串中学来的）。数据包后面跟一个字节的校验和，这样就可以收发数据包了，具体程序如下：

/* Parser states used by rcvpack(). */
enum
{
      RCVPACK_WAIT_ESC  = 0,      /* hunting for the '\' that opens "\H"      */
      RCVPACK_WAIT_HEAD = 1,      /* saw '\', expecting 'H'                   */
      RCVPACK_DATA      = 2,      /* inside the payload                       */
      RCVPACK_ESCAPED   = 3       /* saw '\' in the payload: "\\" or "\T"     */
};

/*
 * Append one payload byte to the caller's buffer and add it to the checksum.
 * Returns OK, or FAIL (after printing a diagnostic) when the buffer is full.
 */
static int rcvpack_store(unsigned char ** ppData, int * pCount, int capacity,
                         unsigned char * pSum, unsigned char ch)
{
      if (*pCount >= capacity)
      {
            printf("Buffer overflow(%d>%d)\n", *pCount + 1, capacity);
            return FAIL;
      }
      *(*ppData)++ = ch;
      *pSum += ch;
      (*pCount)++;
      return OK;
}

/*
 * rcvpack - receive one framed packet through rcvbyte().
 *
 * Wire format: '\' 'H' <payload> '\' 'T' <checksum>.  A '\' inside the
 * payload is sent doubled.  The checksum is the 8-bit sum of the decoded
 * payload bytes.
 *
 * pData   - destination buffer.
 * pLength - in: capacity of pData in bytes; out: number of payload bytes
 *           received (written only once the "\T" trailer has been seen).
 *
 * Returns OK on success; FAIL on bad arguments, buffer overflow, a malformed
 * escape or a checksum mismatch; or the error code of rcvbyte() when a byte
 * could not be read after 10 attempts.
 *
 * Changes from the version posted in the article:
 *  - restores the ">" operators and ");" terminators mangled by the forum;
 *  - never stores more than *pLength bytes (the original accepted
 *    *pLength + 1 bytes, i.e. one byte past the caller's buffer);
 *  - rejects a NULL pLength instead of dereferencing it.
 */
int rcvpack(unsigned char * pData, int * pLength)
{
      int ret;
      int length;
      unsigned char checksum;
      int maxlength;
      int status;
      unsigned char ch;
      unsigned char chk;
      int count;

      if (pData == NULL || pLength == NULL)
            return FAIL;
      maxlength = *pLength;
      if (maxlength < 0)
            return FAIL;

      checksum = 0;
      length = 0;
      status = RCVPACK_WAIT_ESC;

      for (;;)
      {
            /* Fetch the next byte, allowing up to 10 failed attempts. */
            count = 10;
            while ((ret = rcvbyte(&ch)) != OK)
            {
                  if (--count == 0)
                  {
                        printf("\nReceive byte timeout\n");
                        return ret;
                  }
            }

            switch (status)
            {
            case RCVPACK_WAIT_ESC:
                  if (ch == '\\')
                        status = RCVPACK_WAIT_HEAD;
                  break;

            case RCVPACK_WAIT_HEAD:
                  /* Anything other than 'H' restarts the hunt for a header. */
                  status = (ch == 'H') ? RCVPACK_DATA : RCVPACK_WAIT_ESC;
                  break;

            case RCVPACK_DATA:
                  if (ch == '\\')
                  {
                        status = RCVPACK_ESCAPED;
                        break;
                  }
                  if (rcvpack_store(&pData, &length, maxlength, &checksum, ch) != OK)
                        return FAIL;
                  break;

            case RCVPACK_ESCAPED:
                  if (ch == '\\')
                  {
                        /* "\\" decodes to a single literal backslash. */
                        if (rcvpack_store(&pData, &length, maxlength, &checksum, ch) != OK)
                              return FAIL;
                        status = RCVPACK_DATA;
                        break;
                  }
                  if (ch == 'T')
                  {
                        /* End of packet: the next byte is the sender's checksum. */
                        *pLength = length;
                        if (rcvbyte(&chk) != OK)
                              return FAIL;
                        if (checksum == chk)
                              return OK;
                        printf("ERROR: Checksum is nozero(%d-%d)\n", checksum, chk);
                        return FAIL;
                  }
                  printf("ERROR: a '\\' or 'T' expected('%c')!\n ", ch);
                  return FAIL;

            default:
                  /* Not reachable: status only ever holds the four states above. */
                  return FAIL;
            }
      }
}

/*
 * sendpack - send one framed packet through sendbyte().
 *
 * Wire format: '\' 'H' <payload> '\' 'T' <checksum>.  Every '\' in the
 * payload is sent twice.  The checksum is the 8-bit sum of the original
 * (unescaped) payload bytes.
 *
 * pData  - payload bytes.
 * length - number of payload bytes; nothing is sent when it is <= 0.
 *
 * Returns OK on success.  On failure the return value identifies the step
 * that failed: 1/2 = header, 3 = payload byte, 4 = doubled '\',
 * 5/6 = trailer, 7 = checksum byte.
 *
 * Changes from the version posted in the article: restores the ">" operator
 * that the forum mangled into ">;", and drops the local that only ever held
 * a discarded sendbyte() result.
 */
int sendpack(unsigned char * pData, int length)
{
      unsigned char checksum;
      unsigned char ch;

      checksum = 0;
      if (length <= 0)
            return OK;

      if (sendbyte('\\') != OK)
            return 1;
      if (sendbyte('H') != OK)
            return 2;

      while (length > 0)
      {
            ch = *pData++;
            checksum += ch;
            if (sendbyte(ch) != OK)
                  return 3;
            /* Escape a literal backslash by sending it a second time. */
            if (ch == '\\' && sendbyte(ch) != OK)
                  return 4;
            length--;
      }

      if (sendbyte('\\') != OK)
            return 5;
      if (sendbyte('T') != OK)
            return 6;
      if (sendbyte(checksum) != OK)
            return 7;
      return OK;
}

同样，也将rcvpack改成AT&T汇编(减少了几个printf语句)：

# rcvpack (assembly version): receive one "\H ... \T <checksum>" packet.
# Hand translation of the C rcvpack with most printf calls removed.
#   4(%bp)  : pData   - destination buffer (kept in %si)
#   6(%bp)  : pLength - pointer to the capacity on entry / byte count on exit
#   returns : %ax = 0 on success, 2 on failure (presumably the C FAIL value),
#             or the rcvbyte status after 10 failed attempts
# Frame:  -2(%bp)  retry count for one byte
#         -4(%bp)  parser state (0..3, same meaning as in the C version)
#         -6(%bp)  maxlength = *pLength + 1
#         -7(%bp)  current byte
#         -8(%bp)  running 8-bit checksum
#         -10(%bp) number of payload bytes stored
#         -12(%bp) last rcvbyte return value
# chbuffer is the one-byte landing area handed to rcvbyte.
chbuffer:
      .byte 0
overflow:
      .string "Buffer overflow..."
rcvpack:
      pushw       %bp
      movw       %sp, %bp
      subw       $12, %sp
      pushw       %si
      movw       4(%bp), %si
      movw       6(%bp), %bx
      movw       (%bx), %ax
      incw       %ax
      movw       %ax, -6(%bp)
      # maxlength <= 0: print the overflow message and fail.
      cmpw       $0, -6(%bp)
      jg       .L26
      leaw       overflow, %si
      call       prtstr
      movw       $2, %ax
      jmp       .L25
.L26:
      # pData == NULL: fail.
      orw       %si, %si
      jne       .L27
      movw       $2, %ax
      jmp       .L25
.L27:
      # checksum = 0, length = 0, state = 0
      movb       $0,-8(%bp)
      movw       $0, -10(%bp)
      movw       $0, -4(%bp)
      jmp       .L28
.L30:
      # Top of the main loop: allow 10 attempts to read the next byte.
      movw       $10, -2(%bp)
      jmp       .L31
.L33:
#       movw       -4(%bp), %ax
#       addb       $'0', %al
#       call       prtchr
      # rcvbyte(&chbuffer); the argument is popped into %cx afterwards.
      leaw       chbuffer, %ax
      pushw       %ax
      call       rcvbyte
      popw       %cx
      movw       %ax, -12(%bp)
      orw       %ax, %ax
      je       .L34
      # rcvbyte failed: retry until the attempt counter reaches zero, then
      # return rcvbyte's status.
      decw       -2(%bp)
      cmpw       $0, -2(%bp)
      jne       .L35
      movw       -12(%bp), %ax
      jmp       .L25
.L35:
      jmp       .L36
.L34:
      jmp       .L32
.L36:
.L31:
      jmp       .L33
.L32:
      # ch = chbuffer (%si is saved around the load because it holds pData).
      pushw       %si
      leaw       chbuffer, %si
      movb       (%si), %al
      movb       %al, -7(%bp)
      popw       %si
#       call       prtchr
      # Dispatch on the parser state; anything above 3 is ignored.
      movw       -4(%bp), %ax
      cmpw       $3, %ax
      jbe       .L58
      jmp       .L56
.L58:
      cmpw       $0, %ax
      je       .L38
      cmpw       $1, %ax
      je       .L40
      cmpw       $2, %ax
      je       .L43
      cmpw       $3, %ax
      je       .L47
      jmp       .L56
.L38:
      # State 0: wait for '\' (92).
      cmpb       $92, -7(%bp)
      jne       .L39
      movw       $1, -4(%bp)
.L39:
      jmp       .L37
.L40:
      # State 1: 'H' (72) starts the payload, anything else restarts the hunt.
      cmpb       $72, -7(%bp)
      jne       .L41
      movw       $2, -4(%bp)
      jmp       .L42
.L41:
      movw       $0, -4(%bp)
.L42:
      jmp       .L37
.L43:
      # State 2: payload. '\' switches to the escape state ...
      cmpb       $92, -7(%bp)
      jne       .L44
      movw       $3, -4(%bp)
      jmp       .L45
.L44:
      # ... any other byte is counted, bounds-checked against maxlength,
      # stored at (%si) and added to the checksum.
      incw       -10(%bp)
      movw       -10(%bp), %ax
      cmpw       -6(%bp), %ax
      jle       .L46
      movw       $2, %ax
      jmp       .L25
.L46:
      movb       -7(%bp), %al
      movb       %al, (%si)
      incw       %si
      movb       -7(%bp), %al
      addb       %al, -8(%bp)
.L45:
      jmp       .L37
.L47:
      # State 3: after '\'. A second '\' is a literal backslash: store it
      # like a payload byte and return to state 2.
      cmpb       $92, -7(%bp)
      jne       .L48
      incw       -10(%bp)
      movw       -10(%bp), %ax
      cmpw       -6(%bp), %ax
      jle       .L49
      movw       $2, %ax
      jmp       .L25
.L49:
      movb       -7(%bp), %al
      addb       %al, -8(%bp)
      movb       -7(%bp), %al
      movb       %al, (%si)
      incw       %si
      movw       $2, -4(%bp)
      jmp       .L50
.L48:
      # 'T' (84) ends the packet: *pLength = length, then read the checksum
      # byte into chbuffer.
      cmpb       $84, -7(%bp)
      jne       .L51
      movw       -10(%bp), %ax
      movw       6(%bp), %bx
      movw       %ax, (%bx)
      leaw       chbuffer, %ax
      pushw       %ax
      call       rcvbyte
      popw       %cx
      orw       %ax, %ax
      je       .L52
      movw       $2, %ax
      jmp       .L25
.L52:
      # Compare the running checksum with the received one.
      movb       -8(%bp), %al
      cmpb       chbuffer, %al
      jne       .L53
      xorw       %ax, %ax
      jmp       .L25
      jmp       .L54
# String data placed inline; the unconditional jumps above keep execution
# from running into it.
sChecksumFailed:
      .string "Checksum error!"
.L53:
      leaw       sChecksumFailed, %si
      call       prtstr
      movw       $2, %ax
      jmp       .L25
.L54:
      jmp       .L55
.L51:
      # Neither '\' nor 'T' after an escape: fail.
      movw       $2, %ax
      jmp       .L25
.L55:
.L50:
.L56:
.L37:
.L28:
      jmp       .L30
.L29:
.L25:
      # Common exit: restore %si and the caller's frame.
      popw       %si
      movw       %bp, %sp
      popw       %bp
      ret

好了，万事俱备了，先用上面的c代码写另外一台计算机上的“服务”程序（也用来测试）,这台计算机运行DOS，用TURBO C 2.0编译运行：
运行时将initrd.img和内核编译后的/usr/src/linux/arch/i386/boot/compressed/bvmlinux.out拷贝到该计算机的c:\下，然后带参数 s c:\bvmlinux.out c:\initrd.img运行即可。

至于启动程序，还得进行少许修改，才能烧到boot rom 中，见后面的说明。

int main(int argc, char* argv[])
{
      FILE* pFile;
      int count = 2;
      if (argc<3)
      {
            printf("Usage testspp [s | r] \n"

;
            return 1;
      }
      while(count       {
      if (argv[1][0] == 's')
            pFile = fopen(argv[count], "rb"

;
else
pFile = fopen(argv[count], "wb"

;
      if (pFile==NULL)
      {
            printf("Can't open/create file %s\n", argv[2]);
            return 2;
      }
      if (argv[1][0]=='r')/*receive*/
      {
            unsigned long filesize;
            char buffer[10244];
            int length;
            /*get filelength */
            length = 10244;

            printf("Receiving filesize package\n"

;
            while( (rcvpack(buffer, &length)!=OK) && (length!=4))
                     length = 10244;
            filesize = *(long*)buffer;
            printf("file size is:%ld\n", filesize);

            while (filesize>;0)
            {
                     length = 10244;
                     if (rcvpack(buffer, &length) != OK)
                     {
                              printf("Receive data package failed\n"

;
                              return 0;
                     }
                     if (length>;0)
                              fwrite(buffer, 1, length, pFile);
                     filesize-=length;
                     printf("\r%ld Bytes Left       ", filesize);
            }
      }
      else/*send*/
      {
            unsigned long filesize;
            /*send file length*/
            unsigned long stemp;
            int ret;
            fseek(pFile, 0, 2);
            filesize = ftell(pFile);
            fseek(pFile, 0, 0);
            printf("\nfile size is:%ld\n", filesize);
            /*
            while ((ret = sendpack((char *)&filesize, 4)) != OK)
            {
                     printf("send file size failed(%d)\n", ret);
            }
            */
            while (filesize>;0)
            {
                     char buffer[10240];
                     long size;
                     int ret;
                     size = fread(buffer, 1, 10240, pFile);
                     if ((ret = sendpack(buffer, size)) != OK)
                     {
                              printf("Send data package failed(%d)\n", ret);
                              return 0;
                     }
                     filesize -= size;
                     printf("\r\t%ld Bytes Left", filesize);
            }
      }
      fclose(pFile);
      count++;
      }/*while*/
      return 0;
}

5、对bootsect.S的修改
目前的bootsect.S ，主要的问题是，它是从软盘上读数据，将这些代码换成对rcvpack的调用即可，另外，它不支持调入initrd，应该增加相应的代码。问题在于，bootsect.S中没有那么多空间来放rcvpack相关的代码（毕竟只有512字节，当然，如果烧在boot rom中，就不存在这个问题了，但是用软盘调试时就不行了），因此干脆编制load_kernel和load_initrd放在setup.S中，然后在bootsect.S中进行回调即可。

bootsect.S 修改如下(只给出修改部分)：
.....
.....
# Modified tail of bootsect.S: once setup has been loaded, print the loading
# message and call two routines that live in setup.S through far pointers
# stored at fixed offsets, then jump to setup.
ok_load_setup:
      call       kill_motor
      call       print_nl
# Now we will load kernel and initrd
loading:
# Print the "Loading" message first
      movw       $INITSEG, %ax
      movw       %ax, %es             # set up es
      movb       $0x03, %ah             # read cursor pos
      xorb       %bh, %bh
      int       $0x10
      movw       $22, %cx
      movw       $0x0007, %bx             # page 0, attribute 7 (normal)
      movw $msg1, %bp
      movw $0x1301, %ax             # write string, move cursor
      int       $0x10                      # tell the user we're loading..
load_kernel_img:
# The pointer to load_kernel is stored at 0x22C, so calling through it is all
# that is needed here. (During a floppy boot, setup.S has already been read
# from disk to directly after bootsect.S, i.e. from 0x0200 on. Note that
# setup.S starts with a table, which is being consumed "early" here.)
# 0x22C is the load kernel routine
      bootsect_readimage = 0x22C
      lcall       bootsect_readimage
load_initrd_img:
# The pointer to load_initrd is stored at 0x220
# 0x220 is the load initrd routine
      bootsect_readinitrd = 0x220
      lcall       bootsect_readinitrd

# After that (everything loaded), we jump to the setup-routine
# loaded directly after the bootblock:
      ljmp       $SETUPSEG, $0
......
......
6、对setup.S的修改
对setup.S进行修改，主要是：修改setup.S头部，增加load_kernel和load_initrd函数等，具体如下。
修改setup.S头部如下（为好看，这里删除了原来的部分注释）：

# setup.S header, extended with two far pointers that bootsect.S calls
# through: load_initrd at offset 0x220 and load_kernel at offset 0x22C
# (offsets are relative to the boot sector, which setup follows at 0x200).
# The offsets hold as long as "jmp trampoline" assembles to the 2-byte short
# form, which it does with the layout below (trampoline is at 0x230).
#
# Changes from the listing posted in the article: heap_end_ptr and pad1 are
# on separate lines again (they had been fused into one invalid line), and
# the ">=" in the version comment is restored.
start:
                     jmp       trampoline                 # 0x200
                     .ascii       "HdrS"                  # 0x202 header signature
                     .word       0x0202                   # 0x206 header version number (>= 0x0105)
realmode_swtch:      .word       0, 0                     # 0x208 default_switch, SETUPSEG
start_sys_seg:       .word       SYSSEG                   # 0x20C
                     .word       kernel_version           # 0x20E pointing to kernel version string
type_of_loader:      .byte       0                        # 0x210
loadflags:                                                # 0x211
LOADED_HIGH          = 1
                     .byte       LOADED_HIGH              # only bzImage is supported
setup_move_size:     .word       0x8000                   # 0x212
code32_start:                                             # 0x214 here loaders can put a different
                     .long       0x100000                 # 0x100000 = default for big kernel
ramdisk_image:       .long       0xB00000                 # 0x218 ramdisk is loaded at 0xB00000
                                                          # (11 MiB; the article calls it "12M")
ramdisk_size:        .long       0                        # 0x21C length is filled in by load_initrd
bootsect_kludge:
                     .word       load_initrd, SETUPSEG    # 0x220 far pointer to load_initrd
heap_end_ptr:        .word       modelist+1024            # 0x224
pad1:                .word       0                        # 0x226
cmd_line_ptr:        .long       0                        # 0x228
load_kernel_call:
                     .word       load_kernel, SETUPSEG    # 0x22C far pointer to load_kernel
trampoline:          call       start_of_setup            # 0x230

                     .space       1024

load_kernel和load_initrd：

# Messages and working storage for load_kernel / load_initrd.
#
# Changes from the listing posted in the article:
#  - the two banner strings are terminated again; the forum had eaten the
#    end of each one after "PARPort(37".  "378)" is reconstructed from the
#    port number the receive routine uses (888 = 0x378) - confirm against
#    the original source if available;
#  - read_buffer is reserved with .space, which is the GNU as equivalent of
#    the Intel-syntax "db 1280 dup(0)" the author asked about.  It replaces
#    twenty 61-byte filler strings (1220 bytes); the buffer only has to hold
#    the 1124 + 1 bytes rcvpack is allowed to store.
load_imsg:
      .byte 13, 10
      .string "Load INITRD from PARPort(378)"
load_kmsg:
      .byte 13, 10
      .string "Load Kernel From PARPort(378)"
reading_suc:
      .string       "."
reading_failed:
      .string " failed"
read_len:                     # length argument for / result from rcvpack
      .word 0, 0
read_total:                   # 32-bit running total of bytes received
      .word 0, 0
read_buffer:                  # landing area for one received packet
      .space 1280

# load_initrd: far-callable routine (returns with lret) that reads one file
# from the parallel port and records it as the initial ramdisk.
# Saves and restores %ds / %es; runs with %ds = %cs.  %es:%bx = 0x1000:0000
# is the staging buffer and %fs = 0x00B0 is handed to movetohigh (defined
# outside this excerpt; presumably the destination in 64 KiB units, i.e.
# 0xB00000 - confirm there).
load_initrd:
      pushw       %ds
      pushw       %es
      pushw       %cs
      popw       %ds
      pushw       %cs
      popw       %es
      cld
      leaw       load_imsg, %si
      call       prtstr                               # print the banner
      movw       $0x1000, %ax
      movw       %ax, %es
      xorw       %bx, %bx
      movw       $0x00B0, %ax                      # initrd data is first read to 0x1000:0000;
                                             # each full 64K is then moved up to 0xB00000
                                             # (the article calls this address "12M")
      movw       %ax, %fs
      movw       $0, %cs:move_es
      movl       $0, %cs:read_total
      call       movetohigh                      # initialise the data-move state
      call       .ld_img                      # read one file from the parallel port and move it into place
      movl       %cs:read_total, %eax
      movl       %eax, %cs:ramdisk_size       # fill in ramdisk_size and ramdisk_image
      movl       $0x00B00000, %eax
      movl       %eax, %cs:ramdisk_image
      popw       %es
      popw       %ds
      lret

# load_kernel: far-callable routine (returns with lret) that reads the kernel
# image from the parallel port.  Same sequence as load_initrd, but with
# %fs = 0x0010 (presumably selecting 0x100000 as the destination for
# movetohigh - confirm there) and without recording a size.
# Saves and restores %ds / %es; runs with %ds = %cs.
load_kernel:
      pushw       %ds
      pushw       %es
      pushw       %cs
      popw       %ds
      pushw %cs
      popw       %es
      cld
      # Print the banner.
      leaw       load_kmsg, %si
      call       prtstr
      # Staging buffer at %es:%bx = 0x1000:0000.
      movw       $0x1000, %ax
      movw       %ax, %es
      xorw       %bx, %bx
      movw       $0x0010, %ax
      movw       %ax, %fs
      # Reset the move state and the byte counter, then receive the file.
      movw       $0, %cs:move_es
      movl       $0, %cs:read_total
      call       movetohigh
      call       .ld_img
      popw       %es
      popw       %ds
      lret

# .ld_img: receive packets with rcvpack until a short one arrives, copying
# each into the staging buffer at %es:%bx and calling movetohigh after every
# packet.  A receive failure prints " failed" and hangs.
# On entry %es:%bx is the current staging position and %ds = %cs (set up by
# load_kernel / load_initrd).  movetohigh and prtstr are defined outside this
# excerpt; the comments about them restate the author's notes.
# NOTE(review): %dx carries movetohigh's result across "call prtstr" down to
# .ld_lastestpack - confirm that prtstr preserves %dx.
.ld_img:
.ld_nextpack:
      pushw       %bx
      pushw       %es
      # read_len = 1124 is the capacity handed to rcvpack.
      leaw       read_len, %si
      movw       $1124, %ax
      movw       %ax, (%si)
      pushw       %si
      leaw       read_buffer, %ax
      pushw       %ax
      movw       %bx, %ax
      call       rcvpack             # rcvpack(read_buffer, &read_len): receive one packet into read_buffer
      popw       %cx
      popw       %cx
      popw       %es
      popw       %bx
      cmpw       $0, %ax             # success?
      je       .ld_suc
      leaw       reading_failed, %si
      call       prtstr
.ld_panic:
      jmp       .ld_panic             # on failure, loop forever
.ld_suc:
      leaw       read_buffer, %si
      movw       %bx, %di
      movw       $256, %cx             # move 1024 bytes
      rep
      movsl                               # copy from read_buffer to es:bx; every packet is assumed to be
                                    # exactly 1024 bytes long, except for the last one.
      addw       $1024, %bx             # advance bx; when it wraps to zero, 64K is full and the call
      call       movetohigh             # below performs the actual move
      movw       %ax, %dx             #
      cmpw       $0, %ax             # print a '.' whenever a 64K block was moved
      je       .ld_1
      leaw       reading_suc, %si
      call       prtstr
.ld_1:
      leaw       read_len, %si
      xorl       %eax, %eax
      movw       (%si), %ax
      addl       %eax, %cs:read_total
      cmpw       $1024, %ax                # update the running byte total; fewer than 1024 bytes means
                                    # this was the last packet. That is a gamble: a file whose last
                                    # packet is exactly 1024 bytes long is not detected as finished
                                    # (the author knowingly accepts this).
      jb       .ld_lastestpack
      jmp       .ld_nextpack             # go on to receive the next packet
.ld_lastestpack:
      # After the last packet the 64K block is usually not full, so force a move
      cmpw       $0, %dx
      jne       .ld_exit
      xorw       %bx, %bx
      call       movetohigh
.ld_exit:
      ret

7、用软盘进行调试，将启动程序烧到bootrom中
      好了，大功告成，对内核进行配置，然后make bzImage，将bvmlinux.out拷贝到“服务器”上，建立initrd也放在“服务器”上，然后放张软盘在软驱中，dd if=/usr/src/linux/arch/i386/boot/bzImage of=/dev/fd0 count=32将bootsect.S+setup.S部分拷贝到软盘上，重新启动（先连接好并口线）。启动后再在“服务器”上启动文件“服务”程序，终于可以将Linux从并口上启动了！
      做少量调整（主要是去掉读setup.S部分的代码），即可以将此bzImage的前8(16?)K写在一个文件中，处理成boot rom映象，烧到boot rom中，插到网络卡上，启动机器即可。这就是用网络卡从并口上启动Linux。

标题 Re: 用网络卡从并口上启动Linux（I386） [re: raoxianhong]
作者 raoxianhong (journeyman)
时间 10/09/01 11:30 AM

网络上说可以将Bootrom写到BIOS中去，但是没有实验成功，不知道有什么讲究，哪位可曾试过？
寻找文件 cbrom.pdf

标题推荐两篇讲述启动过程的文章 [re: feiyunw]
作者 raoxianhong (journeyman)
时间 10/11/01 09:08 AM

http://www.pcguide.com/ref/mbsys/bios/boot.htm
http://www2.csa.iisc.ernet.in/~kvs/LinuxBoot.html

标题 Re: 386 boot代码分析 [re: feiyunw]
作者 raoxianhong (member)
时间 10/25/01 05:09 PM
附加文件 181431-bootrom.zip

有几位老兄Mail问网卡启动的启动代码问题，这里总结如下：

1.系统自检完毕后在ROM空间中找（好象是2Kbytes为单位），如果某一段的前两个字节是0x55AA，那么第三个字节作为ROM程序的大小（512字节为单位）。然后将该段空间中的所有字节相加（计算校验和），结果为零时表示ROM程序有效。此时BIOS用一个长调用(lcall)，调用该块的第四个字节起始处（自然该用lret返回）。

2.有个问题原来一直不明白，如果此时某个启动网卡启动系统，但是后面还有带ROM的卡（比如PCI），那么该段ROM程序岂不是没有机会运行了吗，当然，如果不运行任何设备的扩展ROM，不知道Linux内会不会有问题！后来查资料得知，实际上制作网卡启动程序时还没有这么简单。

3.事实上，系统在自检及运行所有的扩展硬件检测之后，是用int 19h启动操作系统的！因此在扩展ROM中不直接启动操作系统，而是将操作系统启动代码作为int 19h的中断调用（其实也不用返回，操作系统没有必要返回）代码就行了。
明白这一点后，制作一个网卡启动程序就容易多了，具体请看某个网卡的启动源代码即可，附件中有一个，记不住是从哪里抄来的了！

标题通用的网络卡bootrom处理程序 [re: feiyunw]
作者 raoxianhong (member)
时间 12/06/01 08:05 PM

Bootrom写好后要进行一些处理才能烧到EPROM中去。这里提供一段代码可以完成这个功能，上面讲的用并口启动Linux的程序就是这么处理的。
基本的想法是，写一个通用的启动代码载入程序（stub），将bootsect.S+setup.S(也就是bzImage的前面一段)设置成0x19号中断的中断向量。在外面写一段代码将该段代码和启动代码进行合并，生成合法的bootrom映象，就可以烧到bootrom中去，在网络卡上启动。

下面是通用的启动代码载入程序：

# Generic option-ROM loader stub.
# The BIOS finds the 0x55 0xAA signature, far-calls offset 3 (RomCode) and
# expects an lret.  RomCode does not boot anything itself: it saves the
# current INT 19h vector in vector 0xC0 and points INT 19h at RomBootVect,
# which later copies the boot code that follows the stub to 0x07C0:0000 and
# jumps to it.
#
# Change from the listing posted in the article: the four "%es:(%bx)"
# memory operands (in RomCode, GetIntVect_1 and SetIntVector) are restored;
# the forum had eaten the ":(" and split those instructions across lines.
.code16
RomHeader:
      .byte 0x55, 0xaa              # option-ROM signature
RomPageCount:
      .byte 0x20                    # size in 512-byte units: 0x20 * 512 = 16K bytes

# Initialisation entry point, called by the BIOS with a far call.
RomCode:
      pushw %es
      pushw %bx
      pushw %ax

      movb  $0xc1, %al
      call  IntVectAddr             # ES:BX -> slot of vector 0xC1
      movw  $0x6a6e, %ax
      cmpw  %es:(%bx), %ax          # 0x6a6e marker present: already installed
      jz    RomBootInit_x
      movw  %ax, %es:(%bx)          # leave the marker
      movw  $0xc019, %ax            # AL = 0x19 (vector to read), AH = 0xC0 (vector to write)
      call  MoveIntVector           # keep the old INT 19h handler in vector 0xC0
      movw  $RomBootVect, %bx
      pushw %cs
      popw  %es
      call  SetIntVector            # INT 19h (AL is 0x19 again) -> CS:RomBootVect
RomBootInit_x:
      popw  %ax
      popw  %bx
      popw  %es
      lret

# entry: AL = vector number;  exit: ES:BX = 0000:(AL * 4), the vector's slot
IntVectAddr:
      xorw  %bx, %bx
      movw  %bx, %es
      movb  %al, %bl
      addw  %bx, %bx
      addw  %bx, %bx
      ret

# entry: AL = vector number;  exit: ES:BX = current value of that vector
GetIntVector:
      call  IntVectAddr
GetIntVect_1:
      les   %es:(%bx), %bx
      ret

SetIntVector:
      pushf                         #; entry AL=vector to set, ES:BX=value
      pushw %es                     #; exit: vector modified
      pushw %bx                     #; all registers preserved
      call  IntVectAddr
      cli
      popw  %es:(%bx)               # offset word  <- the pushed BX
      addw  $2, %bx
      popw  %es:(%bx)               # segment word <- the pushed ES
      subw  $2, %bx
      popf
      jmp   GetIntVect_1            # reload ES:BX from the vector just written

MoveIntVector:
      call  GetIntVector            #; entry AL=vect to get, AH=vect to set
      xchgb %al, %ah                #; exit: vector set, ES:BX=vector value
      call  SetIntVector            #; other registers preserved
      xchgb %al, %ah
      ret

# New INT 19h handler: copy the boot code to 0x07C0:0000 and start it.
# NOTE(review): 8192 words (16K bytes) are copied starting at BootCode, which
# itself sits at offset 0x200, so the copy reads 512 bytes beyond a 16K image.
RomBootVect:
      pushw %cs
      popw  %ds
      movw  $0x07c0, %ax
      movw  %ax, %es
      movw  $BootCode, %si
      subw  %di, %di
      movw  $8192, %cx
      cld
      rep
      movsw
      ljmp  $0x07c0, $0
      lret                          # never reached; kept from the original listing

.org  0x0200
BootCode:

在Linux下的编译方法与bootsect.S的编译方法一样，编译成可执行文件后，比如放在bootx文件中。
内核编译后（make bzImage，支持上面所说的启动方式)，得到bzImage文件。

下面是将这两个文件复合在一起得到bootrom映象的程序：

/* mkbtrom.c */

/*
 * mkbtrom - build a 16 KiB boot-ROM image.
 *
 *   mkbtrom <stub> <bzImage> <romimage>
 *
 * Layout of <romimage>: bytes 0..511 come from <stub>, bytes 512..16382 from
 * the start of <bzImage>, and byte 16383 is chosen so that all 16384 bytes
 * sum to zero modulo 256, which is the check the BIOS applies before it
 * calls the ROM.
 *
 * Returns 1 on a usage error, 2 when a file cannot be opened or the image
 * cannot be written, 0 on success.
 *
 * Changes from the version posted in the article:
 *  - the checksum loop covered 18383 bytes of a 16384-byte buffer; it now
 *    covers bytes 0..16382 as intended;
 *  - the buffer starts zeroed, so short input files no longer leave stack
 *    garbage in the image;
 *  - the output file is checked after fopen() and fwrite();
 *  - restores the ");" terminators and the usage text mangled by the forum.
 */
int main(int argc, char* argv[])
{
      enum { ROM_SIZE = 16384, STUB_SIZE = 512 };
      static unsigned char buf[ROM_SIZE];      /* static: zero-initialised */
      unsigned char sum;
      int i;
      FILE * pFile;

      if (argc < 4)
      {
            printf("Usage: mkbtrom <stub> <bzImage> <romimage>\n");
            return 1;
      }

      /* Loader stub: first 512 bytes. */
      pFile = fopen(argv[1], "rb");
      if (pFile == NULL)
      {
            printf("File %s open failed\n", argv[1]);
            return 2;
      }
      fread(buf, 1, STUB_SIZE, pFile);
      fclose(pFile);

      /* Boot code: everything up to, but not including, the checksum byte. */
      pFile = fopen(argv[2], "rb");
      if (pFile == NULL)
      {
            printf("File %s open failed\n", argv[2]);
            return 2;
      }
      fread(&buf[STUB_SIZE], 1, ROM_SIZE - STUB_SIZE - 1, pFile);
      fclose(pFile);

      /* Last byte makes the 8-bit sum of the whole image zero. */
      sum = 0;
      for (i = 0; i < ROM_SIZE - 1; i++)
            sum = (unsigned char)(sum + buf[i]);
      buf[ROM_SIZE - 1] = (unsigned char)(0 - sum);

      pFile = fopen(argv[3], "wb");
      if (pFile == NULL)
      {
            printf("File %s open failed\n", argv[3]);
            return 2;
      }
      if (fwrite(buf, 1, ROM_SIZE, pFile) != ROM_SIZE)
      {
            printf("File %s write failed\n", argv[3]);
            fclose(pFile);
            return 2;
      }
      fclose(pFile);
      return 0;
}

编译成执行文件后，运行mkbtrom bootx bzImage boot16k.bin后，boot16k.bin就可以烧到eprom中，从网络卡中启动了。

实战分享：从技术角度谈机器学习入门| 【大话IT】RadonDB低门槛向MySQL集群下战书 | ChinaUnix打赏功能已上线！ | 新一代分布式关系型数据库RadonDB知多少？

轩辕砍刀

荣誉版主

论坛徽章:: 0

17楼 [报告]

发表于 2003-04-23 20:03 |只看该作者

linux内核分析（转自某位大哥网上的笔记）

-----------------------------------------------------

asmlinkage unsigned int do_IRQ(struct pt_regs regs)
{
... ...
if (softirq_active(cpu) & softirq_mask(cpu))
do_softirq();
}

-----------------------------------------------------

还有，不是每个被标注的软中断都能在这次陷入内核的部分中完成，可能会延迟到下次中断。

其它地方的调用:

在entry.S中有一个调用点:

/* Run the pending softirqs, then continue with the normal return-from-interrupt path. */
handle_softirq:
call SYMBOL_NAME(do_softirq)
jmp ret_from_intr

有两处调用它，一处是当系统调用处理完后:

/*
 * Start of the system-call return path (excerpt): test whether any softirq
 * is both active and unmasked, and branch to handle_softirq if so.
 * irq_stat and irq_stat+4 are the active and mask fields.  On SMP the entry
 * is selected by the CPU number shifted left by CONFIG_X86_L1_CACHE_SHIFT
 * (presumably one cache line per CPU; %ebx presumably holds the current
 * task - neither is shown in this excerpt).
 */
ENTRY(ret_from_sys_call)
#ifdef CONFIG_SMP
movl processor(%ebx),%eax
shll $CONFIG_X86_L1_CACHE_SHIFT,%eax
movl SYMBOL_NAME(irq_stat)(,%eax),%ecx # softirq_active
testl SYMBOL_NAME(irq_stat)+4(,%eax),%ecx # softirq_mask
#else
movl SYMBOL_NAME(irq_stat),%ecx # softirq_active
testl SYMBOL_NAME(irq_stat)+4,%ecx # softirq_mask
#endif
jne handle_softirq

一处是当异常处理完后：

/*
 * Start of the exception return path (excerpt): the same active-and-unmasked
 * softirq test as in ret_from_sys_call.  On SMP it first runs
 * GET_CURRENT(%ebx) (presumably loading the current task pointer) before
 * reading its processor field.
 */
ret_from_exception:
#ifdef CONFIG_SMP
GET_CURRENT(%ebx)
movl processor(%ebx),%eax
shll $CONFIG_X86_L1_CACHE_SHIFT,%eax
movl SYMBOL_NAME(irq_stat)(,%eax),%ecx # softirq_active
testl SYMBOL_NAME(irq_stat)+4(,%eax),%ecx # softirq_mask
#else
movl SYMBOL_NAME(irq_stat),%ecx # softirq_active
testl SYMBOL_NAME(irq_stat)+4,%ecx # softirq_mask
#endif
jne handle_softirq

注意其中的irq_stat, irq_stat +4 对应的就是字段 active和mask

既然我们每次调用完硬中断后都马上调用软中断，为什么还要在这里调用呢?
原因可能是多方面的:

(1)在系统调用或者异常处理中同样可以标注软中断，这样它们在返回前就能得以迅速执行

(2)前面提到，有些软中断要延迟到下次陷入内核才能执行，系统调用和异常都陷入内核，所以可以尽早的把软中断处理掉

(3)如果在异常或者系统调用中发生中断，那么前面提到，可能还会有一些软中断没有处理，在这两个地方做一个补救工作，尽量避免到下次陷入内核才处理这些软中断。

另外，在切换前也调用。

bottom half

2.2.x中的bottom half :

2.2.x版本中的bottom half就相当于2.4.1中的softirq.它的问题在于只有32个,如果要扩充的话，需要task 队列(这里task不是进程，而是函数)，还有一个比较大的问题，就是虽然bottom half在一个CPU上是串行的(由local_bh_count[cpu]记数保证),但是在多CPU上是不安全的，例如，一个CPU上在运行关于定时器的bottom half,另一个CPU也可以运行同一个bottom half,出现了重入。

2.4.1中的bottom half

2.4.1中，用tasklet表示bottom half, mark_bh就是将相应的tasklet挂到运行队列里tasklet_hi_vec[cpu].list,这个队列由HI_SOFTIRQ对应的softirq来执行。

另外，用一个全局锁来保证，当一个CPU上运行一个bottom half时，其它CPU上不能运行任何一个bottom half。这和以前的bottom half有所不同，不知道是否我看错了。

用32个tasklet来表示bottom half:

struct tasklet_struct bh_task_vec[32];

首先，初始化所有的bottom half:

void __init softirq_init()
{
... ...
for (i=0; i<32; i++)
tasklet_init(bh_task_vec+i, bh_action, i);
... ...
}

这里bh_action是下面的函数，它使得bottom half运行对应的bh_base。

/*
 * Tasklet function shared by all 32 bottom halves: run the handler
 * registered in bh_base[nr], provided global_bh_lock and the per-CPU
 * hardirq lock can both be taken without waiting.  If either cannot be
 * taken, the bottom half is marked again so that it is retried later.
 */
static void bh_action(unsigned long nr)
{
int cpu = smp_processor_id();

/* Only one bottom half runs at a time system-wide: try the global lock. */
/*1*/ if (!spin_trylock(&global_bh_lock))
goto resched;

if (!hardirq_trylock(cpu))
goto resched_unlock;
/* A slot with no registered handler is simply skipped. */
if (bh_base[nr])
bh_base[nr]();

/* Release in the reverse order of acquisition. */
hardirq_endlock(cpu);
spin_unlock(&global_bh_lock);
return;

resched_unlock:
spin_unlock(&global_bh_lock);
resched:
/* Could not run now: queue this bottom half again. */
mark_bh(nr);
}

/*1*/试图上锁,如果得不到锁，则重新将bottom half挂上，下次再运行。

当要定义一个bottom half时用下面的函数:

/*
 * Register routine as the handler of bottom half number nr.
 * The mb() after the store is a memory barrier; presumably it makes the new
 * handler visible before anything that follows - confirm against the kernel
 * source.
 */
void init_bh(int nr, void (*routine)(void))
{
bh_base[nr] = routine;
mb();
}

取消定义时，用:

/*
 * Unregister bottom half number nr: tasklet_kill() is called on its tasklet
 * first, and only then is the handler slot cleared.
 */
void remove_bh(int nr)
{
tasklet_kill(bh_task_vec+nr);
bh_base[nr] = NULL;
}

tasklet_kill确保这个tasklet被运行了，因而它的指针也没有用了。

激活一个bottom half,就是将它挂到队列中 :

/* Activate bottom half number nr by scheduling its tasklet with tasklet_hi_schedule(). */
static inline void mark_bh(int nr)
{
struct tasklet_struct *bh = &bh_task_vec[nr];

tasklet_hi_schedule(bh);
}

[目录]

--------------------------------------------------------------------------------

from lisolog

[目录]

--------------------------------------------------------------------------------

index

中断流程
中断可以用下面的流程来表示:

中断产生源 --------> 中断向量表 (idt) -----------> 中断入口 ( 一般简单处理后调用相应的函数) ---------> 后续处理

根据中断产生源，我们可以把中断分成两个部分 :

内部中断( CPU 产生)
外部中断( 外部硬件产生 )

这些中断经过一些处理后，会有一些后续处理。

后面分别讨论:

内部中断
外部中断
后续处理

[目录]

--------------------------------------------------------------------------------

内部中断

内部中断
内部中断有两种产生方式:

CPU 自发产生的: 如除数为0 的中断， page_fault 等
程序调用 int : int 80h

CPU自发产生的中断对应 idt 向量表中确定的位置,例如除数为0的中断在对应idt中第0个向量,
因此，内核只需要在第0个向量中设定相应的处理函数即可。

程序调用 int 可以产生的任何中断, 因此，前者是后者的子集。特别的有:

int 80h

这是系统调用的中断.( system call )是用户代码调用内核代码的入口。

这里面可以考察的专题至少有:

*系统调用
*其它内部中断

[目录]

--------------------------------------------------------------------------------

外部中断

外部中断
1.
外部中断是: 外部硬件(如时钟) -----> 中断芯片 ----> 中断向量表 -----> 中断入口
完成一个完整的映射，有4件事情要做:

(1) 将外部设备和中断芯片相应的管脚接上
(2) 对中断芯片设置，使得特定管脚的中断能映射到CPU idt特定的位置
(3) 程序中包含了这些中断入口
(4) 在中断向量表里设置向量相应的入口地址

这些工作需要在外部中断流程里描述

2.
由于硬件设备可能共用一个中断，在统一的函数中会有相应的结构来处理，也就是有16个结构分别处理相应的16个中断
特定的硬件驱动需要将自己的处理函数挂接到特定的结构上.

3.
但是，有一个问题:驱动怎么知道自己的硬件产生哪个中断?
有一些是确定的，比如时钟是第 0 个(IRQ0), 软盘是第 6 个(IRQ6), 还有一些 PCI 设备是可以通过访问得到它们的中断号的，但是ISA设备需要通过探测(probe)来得到(详细情况可以参考《Linux Device Drivers》)，这涉及探测的工作。

4.
因此，这里面要考察的工作至少包括:

1. i8259芯片的设置(包括上面的 (2) ), 以及一些其它属性的设置
2. 外部中断的流程
3. 处理外部中断的结构与相应的数据结构

下面是《LINUX系统分析...》中的一段，可供参考。

但有时一个设备驱动程序不知道设备将使用哪一个中断。在PCI结构中这不会成为一个问题，因为PCI的设备驱动程序总是知道它们的中断号。但对于ISA结构而言，一个设备驱动程序找到自己使用的中断号却并不容易。Linux系统通过允许设备驱动程序探测自己的中断来解决这个问题。

首先，设备驱动程序使得设备产生一个中断。然后，允许系统中所有没有指定的中断，这意味着设备挂起的中断将会通过中断控制器传送。Linux 系统读取中断状态寄存器然后将它的值返回到设备驱动程序。一个非0 的结果意味着在探测期间发生了一个或者多个的中断。设备驱动程序现在可以关闭探测，这时所有还未被指定的中断将继续被禁止。

一个ISA 设备驱动程序知道了它的中断号以后，就可以请求对中断的控制了。PCI 结构的系统中断比I S A 结构的系统中断要灵活得多。ISA设备使用中断插脚经常使用跳线设置，所以在设备驱动程序中是固定的。但PCI 设备是在系统启动过程中PCI初始化时由PCI BIOS或PCI子系统分配的。每一个PCI 设备都有可能使用A、B、C或者D这4 个中断插脚中的一个。缺省情况下设备使用插脚A。
每个PCI插槽的PCI中断A、B、C和D是通过路由选择连接到中断控制器上的。所以PCI插槽4的插脚A可能连接到中断控制器的插脚6 ，PCI 插槽4 的插脚B 可能连接到中断控制器的插脚7 ,以此类推。

PCI中断具体如何进行路由一般依照系统的不同而不同，但系统中一定存在PCI中断路由拓扑结构的设置代码。在Intel PC机中，系统的BIOS代码负责中断的路由设置。对于没有BIOS的系统，Linux系统内核负责设置。
PCI的设置代码将中断控制器的插脚号写入到每个设备的PCI设置头中。PCI的设置代码根据所知道的PCI中断路由拓扑结构、PCI设备使用的插槽，以及正在使用的PCI中断的插脚号来决定中断号，也就是IRQ号。

系统中可以有很多的PCI中断源，例如当系统使用了PCI-PCI桥时。这时，中断源的数目可能超过了系统可编程中断控制器上插脚的数目。在这种情况下，某些PCI设备之间就不得不共享一个中断，也就是说，中断控制器上的某一个插脚可以接收来自几个设备的中断。Linux系统通过让第一个中断源请求者宣布它使用的中断是否可以被共享来实现中断在几个设备之间共享的。中断共享使得irq_action数组中的同一个入口指向几个设备的irqaction结构。当一个共享的中断有中断发生时，Linux系统将会调用和此中断有关的所有中断处理程序。所有可以共享中断的设备驱动程序的中断处理程序都可能在任何时候被调用，即使在自身没有中断需要处理时。

[目录]

--------------------------------------------------------------------------------

后续处理

后续处理
后续部分主要完成下面的工作

1. bottom_half
2. 是否能进程切换?
3.是否需要进程切换?是则切换
4.信号处理

特别的，有一个重要的下半部分就是时钟中断的下半部分。

bottom_half
正如许多书所说的，它们继续完成中断处理(在开中断的状态下), 因此中断中的处理函数需要在一个32位变量中设置特定的bit来告诉do_softirq要执行哪个bottom_half(我们不妨把这个32位数想象成一个新的中断向量表，设置bit相当于产生中断，下半部分相当于handler,也许这是被称为软中断的原因吧)bottom_half有的时候需要借助一个特殊的结构: task_queue 来完成更多的工作，

task_queue
task_queue 是一个链表，每个节点是一个函数指针，这样，一个 bottom_half 就可以执行一个链表上的函数列了
当然 task_queue 不一定只在 bottom_half 中应用, 我查了一下，在一些驱动中也直接调用 run_task_queue 来执行一个特定的队列.
如果内核需要在某个中断产生后执行它的函数，只需要在它下半部分调用的 task_queue 上挂上它的函数( Linux Device Driver 中有步进马达的例子)现在的内核有些变化，增加了softirq_action tasklet, 不十分清楚是什么原因

是否需要进行切换
因为 linux是非抢占的，所以如果返回的代码段是内核级的话，就不允许进行切换。如果能切换判断一下是否需要切换, 如果是就切换

信号处理
看是否有信号要处理，如果有则调用 do_signal

时钟中断的下半部分
在上面许多的外部中断中，有一个特殊的中断的处理 timer_interrupt, 它的下半部分主要处理:时间计算和校准定时器工作

因此，我们有了下面的工作
*下半部分(包括softirq, tasklet, bottom_half )
*后续处理的流程
*时钟中断的下半部分
*定时器

[目录]

--------------------------------------------------------------------------------

软中断代码线索

[声明]
/* Declare a task queue named q; it expands to a plain list head. */
#define DECLARE_TASK_QUEUE(q) LIST_HEAD(q)
/* Define a struct list_head called name, initialized with LIST_HEAD_INIT. */
#define LIST_HEAD(name) \
struct list_head name = LIST_HEAD_INIT(name)
/* Doubly linked list node: pointers to the next and previous nodes. */
struct list_head {
struct list_head *next, *prev;
};
/* Initializer that points both next and prev at the head itself. */
#define LIST_HEAD_INIT(name) { &(name), &(name) }

   ''DECLARE_TASK_QUEUE'' [include/linux/tqueue.h, include/linux/list.h]

DECLARE_TASK_QUEUE(q) 宏用来申明一个名叫"q"的结构管理任务队列

[标明(MARK)]

这里是mark_bh过程的结构 [include/linux/interrupt.h]

|mark_bh(NUMBER)
|tasklet_hi_schedule(bh_task_vec + NUMBER)
   |insert into tasklet_hi_vec
      |__cpu_raise_softirq(HI_SOFTIRQ)
         |soft_active |= (1 << HI_SOFTIRQ)

               ''mark_bh''[include/linux/interrupt.h]

例如，当IRQ handler需要推迟一些工作, 它会mark_bh(NUMBER), 这里的NUMBER是BH的标志

[执行]
通过do_IRQ[arch/i386/kernel/irq.c]

|do_softirq
|h->action(h) -> softirq_vec[TASKLET_SOFTIRQ]->action -> tasklet_action
   |tasklet_vec[0].list->func

"h->action(h);" 是队列向前.
.

[目录]

--------------------------------------------------------------------------------

2.4软中断机制

一. 软中断概况
软中断是利用硬件中断的概念，用软件方式进行模拟，实现宏观上的异步执行效果。很多情况下，软中断和"信号"有些类似，同时，软中断又是和硬中断相对应的，"硬中断是外部设备对CPU的中断"，"软中断通常是硬中断服务程序对内核的中断"，"信号则是由内核（或其他进程）对某个进程的中断"（《Linux内核源代码情景分析》第三章）。软中断的一种典型应用就是所谓的"下半部"（bottom half），它的得名来自于将硬件中断处理分离成"上半部"和"下半部"两个阶段的机制：上半部在屏蔽中断的上下文中运行，用于完成关键性的处理动作；而下半部则相对来说并不是非常紧急的，通常还是比较耗时的，因此由系统自行安排运行时机，不在中断服务上下文中执行。bottom half的应用也是激励内核发展出目前的软中断机制的原因，因此，我们先从bottom half的实现开始。

二. bottom half

在Linux内核中，bottom half通常用"bh"表示，最初用于在特权级较低的上下文中完成中断服务的非关键耗时动作，现在也用于一切可在低优先级的上下文中执行的异步动作。最早的bottom half实现是借用中断向量表的方式，在目前的2.4.x内核中仍然可以看到：

static void (*bh_base[32])(void);       /* kernel/softirq.c */
系统如此定义了一个函数指针数组，共有32个函数指针，采用数组索引来访问，与此相对应的是一套函数：

void init_bh(int nr,void (*routine)(void));
为第nr个函数指针赋值为routine。

void remove_bh(int nr);
动作与init_bh()相反，卸下nr函数指针。

void mark_bh(int nr);
标志第nr个bottom half可执行了。

由于历史的原因，bh_base各个函数指针位置大多有了预定义的意义，在v2.4.2内核里有这样一个枚举：

/*
 * Predefined bottom-half numbers, i.e. the conventional meaning of each
 * bh_base[] slot; the enumerators count up from TIMER_BH = 0.
 */
enum {
      TIMER_BH = 0,
      TQUEUE_BH,
      DIGI_BH,
      SERIAL_BH,
      RISCOM8_BH,
      SPECIALIX_BH,
      AURORA_BH,
      ESP_BH,
      SCSI_BH,
      IMMEDIATE_BH,
      CYCLADES_BH,
      CM206_BH,
      JS_BH,
      MACSERIAL_BH,
      ISICOM_BH
};

并约定某个驱动使用某个bottom half位置，比如串口中断就约定使用SERIAL_BH，现在我们用得多的主要是TIMER_BH、TQUEUE_BH和IMMEDIATE_BH，但语义已经很不一样了，因为整个bottom half的使用方式已经很不一样了，这三个函数仅仅是在接口上保持了向下兼容，在实现上一直都在随着内核的软中断机制在变。现在，在2.4.x内核里，它用的是tasklet机制。

三. task queue

在介绍tasklet之前，有必要先看看出现得更早一些的task queue机制。显而易见，原始的bottom half机制有几个很大的局限，最重要的一个就是个数限制在32个以内，随着系统硬件越来越多，软中断的应用范围越来越大，这个数目显然是不够用的，而且，每个bottom half上只能挂接一个函数，也是不够用的。因此，在2.0.x内核里，已经在用task queue（任务队列）的办法对其进行了扩充，这里使用的是2.4.2中的实现。

task queue是在系统队列数据结构的基础上建成的，以下即为task queue的数据结构，定义在include/linux/tqueue.h中：

/* One entry of a task queue: a deferred call routine(data). */
struct tq_struct {
      struct list_head list;       /* list linkage */
      unsigned long sync;          /* initially 0; atomically set to 1 on enqueue to avoid queuing twice */
      void (*routine)(void *);       /* function called when the task is run */
      void *data;                   /* argument passed to it: routine(data) */
};

/* A task queue itself is just a list head. */
typedef struct list_head task_queue;

在使用时，按照下列步骤进行：

DECLARE_TASK_QUEUE(my_tqueue); /* 定义一个my_tqueue，实际上就是一个以tq_struct为元素的list_head队列 */
说明并定义一个tq_struct变量my_task;
queue_task(&my_task,&my_tqueue); /* 将my_task注册到my_tqueue中 */
run_task_queue(&my_tqueue); /* 在适当的时候手工启动my_tqueue */
大多数情况下，都没有必要调用DECLARE_TASK_QUEUE()定义自己的task queue，因为系统已经预定义了三个task queue：

tq_timer，由时钟中断服务程序启动；
tq_immediate，在中断返回前以及schedule()函数中启动；
tq_disk，内存管理模块内部使用。
一般使用tq_immediate就可以完成大多数异步任务了。

run_task_queue(task_queue *list)函数可用于启动list中挂接的所有task，可以手动调用，也可以挂接在上面提到的bottom half向量表中启动。以run_task_queue()作为bh_base[nr]的函数指针，实际上就是扩充了每个bottom half的函数句柄数，而对于系统预定义的tq_timer和tq_immediate的确是分别挂接在TQUEUE_BH和IMMEDIATE_BH上（注意，TIMER_BH没有如此使用，但TQUEUE_BH也是在do_timer()中启动的），从而可以用于扩充bottom half的个数。此时，不需要手工调用run_task_queue()（这原本就不合适），而只需调用mark_bh(IMMEDIATE_BH)，让bottom half机制在合适的时候调度它。

四. tasklet

由上看出，task queue以bottom half为基础；而bottom half在v2.4.x中则以新引入的tasklet为实现基础。

之所以引入tasklet，最主要的考虑是为了更好的支持SMP，提高SMP多个CPU的利用率：不同的tasklet可以同时运行于不同的CPU上。在它的源码注释中还说明了几点特性，归结为一点，就是：同一个tasklet只会在一个CPU上运行。

/* Descriptor of one tasklet: a deferred call func(data) plus its scheduling state. */
struct tasklet_struct
{
      struct tasklet_struct *next;       /* queue link */
      unsigned long state;             /* tasklet state, manipulated bitwise; two bits are currently defined:
            TASKLET_STATE_SCHED (bit 0) and TASKLET_STATE_RUN (bit 1) */
      atomic_t count;                      /* reference count; according to these notes, 1 usually means disabled */
      void (*func)(unsigned long);       /* function pointer */
      unsigned long data;             /* argument passed to it: func(data) */
};

把上面的结构与tq_struct比较，可以看出，tasklet扩充了一点功能，主要是state属性，用于CPU间的同步。

tasklet的使用相当简单：

定义一个处理函数void my_tasklet_func(unsigned long);
DECLARE_TASKLET(my_tasklet,my_tasklet_func,data); /*
定义一个tasklet结构my_tasklet，与my_tasklet_func(data)函数相关联，相当于DECLARE_TASK_QUEUE() */
tasklet_schedule(&my_tasklet); /*
登记my_tasklet，允许系统在适当的时候进行调度运行，相当于queue_task(&my_task,&tq_immediate)和mark_bh(IMMEDIATE_BH) */
可见tasklet的使用比task queue更简单，而且，tasklet还能更好的支持SMP结构，因此，在新的2.4.x内核中，tasklet是建议的异步任务执行机制。除了以上提到的使用步骤外，tasklet机制还提供了另外一些调用接口：

DECLARE_TASKLET_DISABLED(name,function,data); /*
和DECLARE_TASKLET()类似，不过即使被调度到也不会马上运行，必须等到enable */
tasklet_enable(struct tasklet_struct *); /* tasklet使能 */
tasklet_disable(struct tasklet_struct *); /* 禁用tasklet，只要tasklet还没运行，则会推迟到它被enable */
tasklet_init(struct tasklet_struct *,void (*func)(unsigned long),unsigned long); /* 类似DECLARE_TASKLET() */
tasklet_kill(struct tasklet_struct *); /* 清除指定tasklet的可调度位，即不允许调度该tasklet，但不做tasklet本身的清除 */

前面提到过，在2.4.x内核中，bottom half是利用tasklet机制实现的，它表现在所有的bottom half动作都以一类tasklet的形式运行，这类tasklet与我们一般使用的tasklet不同。

在2.4.x中，系统定义了两个tasklet队列的向量表，每个向量对应一个CPU（向量表大小为系统能支持的CPU最大个数，SMP方式下目前2.4.2为32）组织成一个tasklet链表：

struct tasklet_head tasklet_vec[NR_CPUS] __cacheline_aligned;
struct tasklet_head tasklet_hi_vec[NR_CPUS] __cacheline_aligned;

另外，对于32个bottom half，系统也定义了对应的32个tasklet结构：

struct tasklet_struct bh_task_vec[32];
在软中断子系统初始化时，这组tasklet的动作被初始化为bh_action(nr)，而bh_action(nr)就会去调用bh_base[nr]的函数指针，从而与bottom half的语义挂钩。mark_bh(nr)被实现为调用tasklet_hi_schedule(bh_task_vec+nr)，在这个函数中，bh_task_vec[nr]将被挂接在tasklet_hi_vec[cpu]链上（其中cpu为当前cpu编号，也就是说哪个cpu提出了bottom half的请求，则在哪个cpu上执行该请求），然后激发HI_SOFTIRQ软中断信号，从而在HI_SOFTIRQ的中断响应中启动运行。

tasklet_schedule(&my_tasklet)将把my_tasklet挂接到tasklet_vec[cpu]上，激发TASKLET_SOFTIRQ，在TASKLET_SOFTIRQ的中断响应中执行。HI_SOFTIRQ和TASKLET_SOFTIRQ是softirq子系统中的术语，下一节将对它做介绍。

五. softirq

从前面的讨论可以看出，task queue基于bottom half，bottom half基于tasklet，而tasklet则基于softirq。

可以这么说，softirq沿用的是最早的bottom half思想，但在这个"bottom half"机制之上，已经实现了一个更加庞大和复杂的软中断子系统。

/* One softirq slot: the handler and the data pointer filled in by open_softirq(). */
struct softirq_action
{
      void (*action)(struct softirq_action *);
      void *data;
};
/* Table of the 32 softirq slots, indexed by softirq number. */
static struct softirq_action softirq_vec[32] __cacheline_aligned;
这个softirq_vec[]仅比bh_base[]增加了action()函数的参数，在执行上，softirq比bottom half的限制更少。

和bottom half类似，系统也预定义了几个softirq_vec[]结构的用途，通过以下枚举表示：

/* Predefined softirq numbers (indices into softirq_vec[]). */
enum
{
      /* used to implement the bottom halves */
      HI_SOFTIRQ=0,
      /* network subsystem: packet transmit */
      NET_TX_SOFTIRQ,
      /* network subsystem: packet receive */
      NET_RX_SOFTIRQ,
      /* ordinary (general-purpose) tasklets */
      TASKLET_SOFTIRQ
};
HI_SOFTIRQ被用于实现bottom half，TASKLET_SOFTIRQ用于公共的tasklet使用，NET_TX_SOFTIRQ和NET_RX_SOFTIRQ用于网络子系统的报文收发。在软中断子系统初始化（softirq_init()）时，调用了open_softirq()对HI_SOFTIRQ和TASKLET_SOFTIRQ做了初始化：

void open_softirq(int nr, void (*action)(struct softirq_action*), void *data)

open_softirq()会填充softirq_vec[nr]，将action和data设为传入的参数。TASKLET_SOFTIRQ填充为tasklet_action(NULL)，HI_SOFTIRQ填充为tasklet_hi_action(NULL)，在do_softirq()函数中，这两个函数会被调用，分别启动tasklet_vec[cpu]和tasklet_hi_vec[cpu]链上的tasklet运行。

static inline void __cpu_raise_softirq(int cpu, int nr)

这个函数用来激活软中断，实际上就是第cpu号CPU的第nr号软中断的active位置1。在do_softirq()中将判断这个active位。tasklet_schedule()和tasklet_hi_schedule()都会调用这个函数。

do_softirq()有4个执行时机，分别是：从系统调用中返回（arch/i386/kernel/entry.S::ENTRY(ret_from_sys_call)）、从异常中返回（arch/i386/kernel/entry.S::ret_from_exception标号）、调度程序中（kernel/sched.c::schedule()），以及处理完硬件中断之后（arch/i386/kernel/irq.c::do_IRQ()）。它将遍历所有的softirq_vec，依次启动其中的action()。需要注意的是，软中断服务程序，不允许在硬中断服务程序中执行，也不允许在软中断服务程序中嵌套执行，但允许多个软中断服务程序同时在多个CPU上并发。

六. 使用示例

softirq作为一种底层机制，很少由内核程序员直接使用，因此，这里的使用范例仅对其余几种软中断机制。

1.bottom half

原有的bottom half用法在drivers/char/serial.c中还能看到，包括三个步骤：

init_bh(SERIAL_BH,do_serial_bh);       //在串口设备的初始化函数rs_init()中，do_serial_bh()是处理函数
mark_bh(SERIAL_BH);             //在rs_sched_event()中，这个函数由中断处理例程调用
remove_bh(SERIAL_BH);          //在串口设备的结束函数rs_fini()中调用

尽管逻辑上还是这么三步，但在do_serial_bh()函数中的动作却是启动一个task queue：run_task_queue(&tq_serial)，而在rs_sched_event()中，mark_bh()之前调用的则是queue_task(...,&tq_serial)，也就是说串口bottom half已经结合task queue使用了。而那些更通用一些的bottom half，比如IMMEDIATE_BH，更是必须要与task queue结合使用，而且一般情况下，task queue也很少独立使用，而是与bottom half结合，这在下一节task queue使用示例中可以清楚地看到。

2.task queue

一般来说，程序员很少自己定义task queue，而是结合bottom half，直接使用系统预定义的tq_immediate等，尤以tq_immediate使用最频繁。看以下代码段，节选自drivers/block/floppy.c：

static struct tq_struct floppy_tq;       //定义一个tq_struct结构变量floppy_tq，不需要作其他初始化动作

/*
 * Defer handler to the immediate bottom half: install it as the routine of
 * floppy_tq, queue floppy_tq on tq_immediate, then mark IMMEDIATE_BH so the
 * queue gets run.
 *
 * handler: function to run later from the tq_immediate task queue.
 */
static void schedule_bh( void (*handler)(void*) )
{
      /* Only the routine field is set here; the other fields of floppy_tq are left untouched. */
      floppy_tq.routine = (void *)(void *) handler;
      /* Add floppy_tq to the tq_immediate task queue. */
      queue_task(&floppy_tq, &tq_immediate);
      /*
       * Activate IMMEDIATE_BH; as described in the surrounding notes, this
       * raises a softirq that runs every function queued on tq_immediate.
       */
      mark_bh(IMMEDIATE_BH);
}

当然，我们还是可以定义并使用自己的task queue，而不用tq_immediate，在drivers/char/serial.c中提到的tq_serial就是串口驱动自己定义的：

static DECLARE_TASK_QUEUE(tq_serial);

此时就需要自行调用run_task_queue(&tq_serial)来启动其中的函数了，因此并不常用。

3.tasklet

这是比task queue和bottom half更加强大的一套软中断机制，使用上也相对简单，见下面代码段：

1:       void foo_tasklet_action(unsigned long t);
2:       unsigned long stop_tasklet;
3:       DECLARE_TASKLET(foo_tasklet, foo_tasklet_action, 0);
4:       void foo_tasklet_action(unsigned long t)
5:       {
6:             //do something
7:
8:             //reschedule
9:             if(!stop_tasklet)
10:                      tasklet_schedule(&foo_tasklet);
11:       }
12:       void foo_init(void)
13:       {
14:             stop_tasklet=0;
15:             tasklet_schedule(&foo_tasklet);
16:       }
17:       void foo_clean(void)
18:       {
19:             stop_tasklet=1;
20:             tasklet_kill(&foo_tasklet);
21:       }

这个比较完整的代码段利用一个反复执行的tasklet来完成一定的工作，首先在第3行定义foo_tasklet，与相应的动作函数foo_tasklet_action相关联，并指定foo_tasklet_action()的参数为0。虽然此处以0为参数，但也同样可以指定有意义的其他参数值，但需要注意的是，这个参数值在定义的时候必须是有固定值的变量或常数（如上例），也就是说可以定义一个全局变量，将其地址作为参数传给foo_tasklet_action()，例如：

int flags;
DECLARE_TASKLET(foo_tasklet,foo_tasklet_action,&flags);
void foo_tasklet_action(unsigned long t)
{
int flags=*(int *)t;
...
}

这样就可以通过改变flags的值将信息带入tasklet中。直接在DECLARE_TASKLET处填写flags，gcc会报"initializer element is not constant"错。

第9、10行是一种RESCHEDULE的技术。我们知道，一个tasklet执行结束后，它就从执行队列里删除了，要想重新让它转入运行，必须重新调用tasklet_schedule()，调用的时机可以是某个事件发生的时候，也可以是像这样在tasklet动作中。而这种reschedule技术将导致tasklet永远运行，因此在子系统退出时，应该有办法停止tasklet。stop_tasklet变量和tasklet_kill()就是干这个的。

实战分享：从技术角度谈机器学习入门| 【大话IT】RadonDB低门槛向MySQL集群下战书 | ChinaUnix打赏功能已上线！ | 新一代分布式关系型数据库RadonDB知多少？

轩辕砍刀

荣誉版主

论坛徽章:: 0

18楼 [报告]

发表于 2003-04-23 20:07 |只看该作者

linux内核分析（转自某位大哥网上的笔记）

11、static inline void zap_pte_range(pmd_t * pmd, unsigned long address,
unsigned long size)
zap为zero all pages的缩写。该函数的作用是将在pmd中从虚拟地址address开始，长度为size的内存块通过循环调用pte_clear将其页表项清零，调用free_pte将所含空间中的物理内存或交换空间中的虚存页释放掉。在释放之前，必须检查从address开始长度为size的内存块有无越过PMD_SIZE.(溢出则可使指针逃出0～1023的区间)。

[目录]

--------------------------------------------------------------------------------

zap_pmd_range

12、static inline void zap_pmd_range(pgd_t * dir, unsigned long address, unsigned long size)
函数结构与zap_pte_range类似，通过调用zap_pte_range完成对所有落在address到address+size区间中的所有pte的清零工作。zap_pmd_range至多清除4M空间的物理内存。

[目录]

--------------------------------------------------------------------------------

zap_page_range

13、int zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size)
函数结构与前两个函数类似。将任务从address开始到address+size长度内的所有对应的pmd都清零。zap_page_range的主要功能是在进行内存收缩、释放内存、退出虚存映射或移动页表的过程中，将不再使用的物理内存从进程的三级页表中清除。（在讨论clear_page_tables时，就提到过当进程退出时，释放页表之前，先保证将页表对应项清零，保证在处于退出状态时，进程不占用0～3G的空间。）

[目录]

--------------------------------------------------------------------------------

zeromap_pte_range等

14、static inline void zeromap_pte_range(pte_t * pte, unsigned long address,
unsigned long size, pte_t zero_pte)
15、static inline int zeromap_pmd_range(pmd_t * pmd, unsigned long address,
unsigned long size, pte_t zero_pte)
16、int zeromap_page_range(unsigned long address, unsigned long size, pgprot_t prot)
这三个函数与前面的三个函数从结构上看很相似，他们的功能是将虚拟空间中从地址address开始，长度为size的内存块所对应的物理内存都释放掉，同时将指向这些区域的pte都指向系统中专门开出的长度为4K，全为0的物理页。zeromap_page_range在kernel代码中没有被引用，这个函数是旧版本的Linux遗留下来的，在新版本中已经被zap_page_range所替代。

[目录]

--------------------------------------------------------------------------------

remap_pte_range等

17、static inline void remap_pte_range(pte_t * pte, unsigned long address,
unsigned long size,       unsigned long offset, pgprot_t prot)
18、static inline int remap_pmd_range(pmd_t * pmd, unsigned long address,
unsigned long size,       unsigned long offset, pgprot_t prot)
19、int remap_page_range(unsigned long from, unsigned long offset, unsigned long size,
pgprot_t prot)
这三个函数也同前面的函数一样，层层调用，现仅介绍一下最后一个函数的作用。remap_page_range的功能是将原先被映射到虚拟内存地址from处的，大小为size的虚拟内存块映射到以偏移量offset为起始地址的虚拟内存中，同时将原来的pte、pmd项都清零。该函数也是逐级调用，在remap_pte_range中，通过set_pte将的物理页映射到新的虚拟内存页表项pte上。remap_page_range函数的功能与下文中的remap.c中介绍的功能相近，因此在kernel中也没有用到。

[目录]

--------------------------------------------------------------------------------

put_dirty_page

20、unsigned long put_dirty_page(struct task_struct * tsk, unsigned long page,
unsigned long address)
将虚拟内存页page链接到任务tsk中虚拟地址为address的虚拟内存中，其主要调用的流程如下：put_dirty_page->;setup_arg_page->;do_load_xxx_binary(xxx为aout或elf，这些函数都在fs\exec.c中)，它的功能是将在载入可执行文件的时候，将其相关的堆栈信息、环境变量等复制到当前进程的空间上。

[目录]

--------------------------------------------------------------------------------

handle_mm_fault

21、void handle_mm_fault(struct vm_area_struct * vma, unsigned long address,
int write_access)
      用于处理ALPHA机中的缺页中断

[目录]

--------------------------------------------------------------------------------

mmap.c

在mmap.c中，主要提供了对进程内存管理进行支持的函数，主要包括了do_mmap、do_munmap等对进程的虚拟块的avl树进行管理的函数。
有关avl树的一些操作：
1、static inline void avl_neighbours (struct vm_area_struct * node, struct vm_area_struct * tree, struct vm_area_struct ** to_the_left, struct vm_area_struct ** to_the_right)
寻找avl树tree中的节点node的前序节点和后序节点，将结果放在指针to_the_left和to_the_right中，即使得*to_the_left->;next=node，node->;next=*to_the_right。在实际搜索中，过程是找到node节点中的左节点的最右节点和右节点的最左节点，采用avl树搜索可以提高效率。

2、static inline void avl_rebalance (struct vm_area_struct *** nodeplaces_ptr, int count)
将由于插入操作或删除操作而造成不平衡的avl树恢复成平衡状态。nodeplaces_ptr是指向的是需要调整的子树的根节点，count是该子树的高度。

3、static inline void avl_insert (struct vm_area_struct * new_node,
struct vm_area_struct ** ptree)
将新节点new_node插入avl树ptree中，并将该树重新生成平衡avl树。在创建avl树时，将vma模块不断的插入avl树中，构建一个大的avl树。当进程创建时，复制父进程后需要将以双向链表拷贝过来的vma链生成avl树。

4、static inline void avl_insert_neighbours (struct vm_area_struct * new_node, struct vm_area_struct ** ptree,       struct vm_area_struct ** to_the_left, struct vm_area_struct ** to_the_right)
将新节点new_node插入avl树ptree中，并将该树重新生成平衡avl树，同时返回该新节点的前序节点和后序节点。

5、static inline void avl_remove (struct vm_area_struct * node_to_delete, struct vm_area_struct ** ptree)
将指定要删除的节点node_to_delete从avl树ptree中删除。并将该树重新生成平衡avl树。该函数在释放虚存空间和归并vma链表时使用。

7、static void printk_list (struct vm_area_struct * vma)
8、static void printk_avl (struct vm_area_struct * tree)
9、static void avl_checkheights (struct vm_area_struct * tree)
10、static void avl_checkleft (struct vm_area_struct * tree, vm_avl_key_t key)
11、static void avl_checkright (struct vm_area_struct * tree, vm_avl_key_t key)
12、static void avl_checkorder (struct vm_area_struct * tree)
13、static void avl_check (struct task_struct * task, char *caller)
这些函数都是系统调试时用以检测avl树结构的正确性

14、static inline int vm_enough_memory(long pages)
通过计算当前系统中所剩的空间判断是否足够调用。可使用的内存包括缓冲存储器、页缓存、主存中的空闲页、swap缓存等。

15、static inline unsigned long vm_flags(unsigned long prot, unsigned long flags)
提供宏功能将页的保护位和标志位合并起来。

16、unsigned long get_unmapped_area(unsigned long addr, unsigned long len)
从虚拟地址addr开始找到未分配的、连续空间大于len的虚拟空间块，并将该块的首地址返回。

17、unsigned long do_mmap(struct file * file, unsigned long addr, unsigned long len,
      unsigned long prot, unsigned long flags, unsigned long off)
do_mmap在Linux虚拟内存管理中是一个很重要的函数，它的主要功能是将可执行文件的映象映射到虚拟内存中，或将别的进程中的内存信息映射到该进程的虚拟空间中。并将映射了的虚拟块的vma加入到该进程的vma avl树中。其运行的流程如下，更详细的分析请参阅林涛同学和徐玫峰同学的报告。
1. 检验给定的映射长度len是否大于1页、小于一个任务的最大长度3G，且加上偏移量off后不会溢出。如不满足则退出。
2. 如果当前任务的内存是上锁的，检验加上len后是否会超过当前进程上锁长度的界限。如是则退出。
3. 如果从文件映射，检验文件是否有读的权限。如无则退出。
4. 调用get_unmapped_area取得从地址addr开始、未映射的连续虚拟空间大于len的虚存块。
5. 如从文件映射，保证该文件控制块有相应的映射操作。
6. 为该映射区域申请vma结构。
7. 调用vm_enough_memory检查是否有足够的内存。如无则释放6中申请的vma，退出。
8. 如果是文件映射，调用file->f_op->mmap将该文件映射入vma中。
9. 调用insert_vm_struct将vma插入该进程的avl树中。
10. 归并该avl树。

18、void merge_segments (struct mm_struct * mm, unsigned long start_addr,
unsigned long end_addr)
经过对进程虚拟空间不断的映射，在进程中的vma块有许多是可以合并的，为了提高avl树查找的效率，减少avl树中不必要的vma块，通常需要将这些块合并，merge_segments的功能为合并虚拟空间中从start_addr到end_addr中类型相同，首尾相连的vma块。由于只有经过增加操作才有可能合并，所以merge_segments只在do_mmap和unmap_fixup中被调用。该函数的流程如下：
1. 根据起始地址start_addr找到第一块满足vm_end>start_addr的vma块mpnt。
2. 调用avl_neighbours找到在vma双向链表上与mpnt前后相连的vma块prev和next。
3. 如果prev和mpnt首尾相连，且有同样在swap file中的节点，同样的标志，同样的操作等则将其合并，反之转向6。
4. 调用avl_remove将mpnt从avl树中删除，调整prev的结束地址和后序指针。
5. 将结构mpnt所占的物理空间删除。
6. prev、mpnt、next依次下移，如未超过end_addr则返回3。

19、static void unmap_fixup(struct vm_area_struct *area, unsigned long addr, size_t len)
释放虚拟空间中的某些区域的时候，将会出现四种情况：
将整个vma释放掉
将vma的前半部分释放掉
将vma的后半部分释放掉
将vma的中间部分释放掉
为了正常维护vma树，当第一种情况时，将整个vma释放掉，同时释放vma结构所占的空间。第二种，释放前半部分，修改vma的相关信息。第三种，释放后半部分，修改vma的相关信息。第四种，由于在vma中出现了一个洞，则需增加一个vma结构描述新出现的vma块。unmap_fixup所执行的工作就是当释放空间时，修正对vma树的影响。

20、int do_munmap(unsigned long addr, size_t len)
do_munmap将释放落在从地址addr开始，长度为len空间内的vma所对应的虚拟空间。do_munmap被系统调用sys_munmap所调用（对sys_munmap如何工作的不甚了解）。下面是该函数的流程：
1. 通过find_vma根据addr找到第一块满足vma->end>addr的vma块mpnt。
2. 调用avl_neighbours找到mpnt在链表中的相邻指针prev和next。
3. 检查并将所有与虚拟空间addr~addr+len相交的vma块放入free链表中。同时如果该vma链接在共享内存中，则将其从该环形链表中释放出来。
4. 按序搜索free链表，调用unmap_fixup释放空间。
5. 调用zap_page_range将指向释放掉的虚拟空间中的pte页表项清零。
6. 调用kfree释放mpnt结构占用的空间。

remap.c
该文件提供了对虚拟内存重映射的若干函数。在下文中将介绍这些函数的功能，分析这些函数在虚拟内存管理中所起的作用。同时详细介绍其中主要函数的流程。

static inline pte_t *get_one_pte(struct mm_struct *mm, unsigned long addr)
根据输入的虚存地址返回其在虚拟内存中的对应的页表项pte。

static inline pte_t *alloc_one_pte(struct mm_struct *mm, unsigned long addr)
根据输入的虚存地址addr在pgd表中根据三级页表映射机制找pte，如果在pgd表中无对应的项，则分配给一个pgd（pmd）表项，在这个表项内分配根据addr分配pte，将pte返回。

static inline int copy_one_pte(pte_t * src, pte_t * dst)
将目的pte（dst）表项中的值赋成源pte（src）中的值，然后将源pte中的值清零，根据这函数的功能取move_one_pte更合适。

static int move_one_page(struct mm_struct *mm,
      unsigned long old_addr, unsigned long new_addr)
根据输入的虚拟地址old_addr调用get_one_pte获取该地址在三级页表中的pte项，调用copy_one_pte将该pte对应的物理页指针移到根据new_addr对应的pte项上，即在虚拟空间内移动一虚拟内存页。

static int move_page_tables(struct mm_struct * mm,
      unsigned long new_addr, unsigned long old_addr, unsigned long len)
将虚拟地址空间中从old_addr开始的长度为len的虚拟内存移动到以new_addr为起始地点的的虚拟空间中，以下为该函数的流程：
1. 将所需移动的内存长度len赋值给偏移量offset，如果offset＝0，结束。反之转向2。
2. 将偏移量offset减去一个页的长度，调用move_one_page将从old_addr+offset开始的一页移到new_addr+offset。若发生错误则转到4。
3. 如果offset不为0，则转向1，反之结束。
4. 调用move_one_page将所有已移动到新地址的页移回源地址，调用zap_page_range将从new_addr开始的移动过的页pte清零，并返回出错信息－1。

static inline unsigned long move_vma(struct vm_area_struct * vma,
      unsigned long addr, unsigned long old_len, unsigned long new_len)
将虚存中的vma块vma的起始地址为addr，长度为old_len的内存块扩展为长度为new_len的内存块，并在虚存中找到可容纳长度为new_len块的连续区域，返回首地址。其工作流程如下：
1. 给新的vma结构块new_vma分配空间，如果不成功返回出错信息。
2. 调用get_unmapped_area从addr开始找到第一个未被利用的虚存空洞，空洞长度大于给定的新的虚拟内存块的长度new_len，将其首地址赋给new_addr。如果未找到，则转向9。
3. 调用move_page_tables将从addr开始的长度为old_len的内存区域移动到以new_addr为起始地址的虚拟空间中。
4. 修改new_vma块中关于起始地址，结束地址的值。
5. 将新的new_vma块插入到当前进程的虚拟内存所链成的双向链表和avl树中。
6. 调用merge_segments将虚拟空间中地址可能连结在一起的不同的vma段连结成一个vma块，同时删除冗余的vma块。
7. 将原有空间中的从addr开始，长度为old_len的虚拟空间释放掉。
8. 修改mm结构中的所有虚存的长度，返回新的起始虚拟地址new_addr。
9. 将vma块new_vma释放掉并返回出错信息。

asmlinkage unsigned long sys_mremap(unsigned long addr,       unsigned long old_len,
                  unsigned long new_len,       unsigned long flags)
sys_mremap是一个系统调用，其主要功能是扩展或收缩现有的虚拟空间。它的主要工作流程如下：
1. 检查addr地址是否小于4096，如小于，则非法，返回。
2. 将原始长度old_len和需要扩展或收缩的长度new_len页对齐。
3. 如果有old_len>new_len，则说明是收缩空间，调用do_munmap将虚存空间中从new_len到old_len的空间释放掉。返回收缩后的首地址addr。
4. 根据addr找到第一块vma块满足vma->end > addr，检查addr是否落在虚存的空洞中，如是，则返回出错信息。
5. 检查需要扩展的内存块是否落在该vma块中，越界则返回出错信息。
6. 如果该vma是上锁的，则检测上锁的内存扩展后是否越界，如是，则返回出错信息。
7. 检测当前进程的虚存空间经扩展后是否超过系统给该进程的最大空间。如是，则返回出错信息。
8. 如果找到vma块从addr开始到块末尾的长度为old_len且(old_len的长度不等于new_len或该虚存是不可移动的)，则转向9，反之转向10。
9. 检测从跟随找到的vma块的未分配的空间是否大于需要扩展空间。如果大于，则直接将扩展的空间挂在找到的vma块后，修改vma块中相关的信息，并返回扩展后虚拟块的首地址。如小于转向10。
10. 如果当前虚拟块是不可移动的，则返回错误信息。反之，调用move_vma将需要扩展的虚拟块移动到可以容纳其长度new_len的虚拟空间中。

[目录]

--------------------------------------------------------------------------------

伙伴(buddy)算法

2.4版内核的页分配器引入了"页区"(zone)结构, 一个页区就是一大块连续的物理页面. Linux 2.4将整个物理内存划分为3个页区, DMA页区(ZONE_DMA), 普通页区(ZONE_NORMAL)和高端页区(ZONE_HIGHMEM).
页区可以使页面分配更有目的性, 有利于减少内存碎片. 每个页区的页分配仍使用伙伴(buddy)算法.
伙伴算法将整个页区划分为以2为幂次的各级页块的集合, 相邻的同次页块称为伙伴, 一对伙伴可以合并到更高次页面集合中去.

下面分析一下伙伴算法的页面释放过程.

; mm/page_alloc.c:

#define BAD_RANGE(zone,x) (((zone) != (x)->zone) || (((x)-mem_map) < (zone)->offset) || (((x)-mem_map) >= (zone)->offset+(zone)->size))

#define virt_to_page(kaddr)       (mem_map + (__pa(kaddr) >> PAGE_SHIFT))
#define put_page_testzero(p)       atomic_dec_and_test(&(p)->count)

void free_pages(unsigned long addr, unsigned long order)
{       order是页块尺寸指数, 即页块的尺寸有(2^order)页.
      if (addr != 0)
            __free_pages(virt_to_page(addr), order);
}
void __free_pages(struct page *page, unsigned long order)
{
      if (!PageReserved(page) && put_page_testzero(page))
            __free_pages_ok(page, order);
}
static void FASTCALL(__free_pages_ok (struct page *page, unsigned long order));
static void __free_pages_ok (struct page *page, unsigned long order)
{
      unsigned long index, page_idx, mask, flags;
      free_area_t *area;
      struct page *base;
      zone_t *zone;

      if (page->;buffers)
            BUG();
      if (page->;mapping)
            BUG();
      if (!VALID_PAGE(page))
            BUG();
      if (PageSwapCache(page))
            BUG();
      if (PageLocked(page))
            BUG();
      if (PageDecrAfter(page))
            BUG();
      if (PageActive(page))
            BUG();
      if (PageInactiveDirty(page))
            BUG();
      if (PageInactiveClean(page))
            BUG();

      page->flags &= ~((1<<PG_referenced) | (1<<PG_dirty));
      page->age = PAGE_AGE_START;

      zone = page->zone; 取page所在的页区

      mask = (~0UL) << order;
      base = mem_map + zone->offset; 求页区的起始页
      page_idx = page - base; 求page在页区内的起始页号
      if (page_idx & ~mask) 页号必须在页块尺寸边界上对齐
            BUG();
      index = page_idx >> (1 + order);
            ; 求页块在块位图中的索引, 每一索引位置代表相邻两个"伙伴"
      area = zone->free_area + order; 取该指数页块的位图平面

      spin_lock_irqsave(&zone->lock, flags);

      zone->free_pages -= mask; 页区的自由页数加上将释放的页数(掩码值为负)

      while (mask + (1 << (MAX_ORDER-1))) {
            struct page *buddy1, *buddy2;

            if (area >;= zone->;free_area + MAX_ORDER) 如果超过了最高次平面
                     BUG();
            if (!test_and_change_bit(index, area->;map)) 测试并取反页块的索引位
                     /*
                     * the buddy page is still allocated.
                     */
                     break; 如果原始位为0, 则说明该页块原来没有伙伴, 操作完成
            /*
            * Move the buddy up one level. 如果原始位为1, 则说明该页块存在一个伙伴
            */
            buddy1 = base + (page_idx ^ -mask); 对页块号边界位取反,得到伙伴的起点
            buddy2 = base + page_idx;

            if (BAD_RANGE(zone,buddy1)) 伙伴有没有越过页区范围
                     BUG();
            if (BAD_RANGE(zone,buddy2))
                     BUG();

            memlist_del(&buddy1->list); 删除伙伴的自由链
            mask <<= 1;
            area++; 求更高次位图平面
            index >>= 1; 求更高次索引号
            page_idx &= mask; 求更高次页块的起始页号
      }
      memlist_add_head(&(base + page_idx)->list, &area->free_list); 将求得的高次页块加入该指数的自由链

      spin_unlock_irqrestore(&zone->lock, flags);

      /*
      * We don't want to protect this variable from race conditions
      * since it's nothing important, but we do want to make sure
      * it never gets negative.
      */
      if (memory_pressure >; NR_CPUS)
            memory_pressure--;
}

[目录]

--------------------------------------------------------------------------------

页目录处理的宏

对于i386的2级分页机构,每个页目录字高20位是页号,低12位是页属性.
如果将页目录字的低12位屏蔽成0,整个页目录字就是相应页面的物理地址,下面是常用的一些页目录处理的宏.
typedef struct { unsigned long pgd; } pgd_t;                一级页目录字结构
typedef struct { unsigned long pmd; } pmd_t;                中级页目录字结构
typedef struct { unsigned long pte_low; } pte_t;       末级页目录字结构
typedef struct { unsigned long pgprot; } pgprot_t;       页属性字结构

pgd_t *pgd = pgd_offset(mm_struct,addr);
      取进程虚拟地址addr的一级页目录字指针,扩展为
      ((mm_struct)->pgd + ((address >> 22) & 0x3FF))

pgd_t *pgd = pgd_offset_k(addr)
      取内核地址addr的一级页目录字指针,扩展为
      (init_mm.pgd + ((address >> 22) & 0x3FF));

pmd_t *pmd = pmd_offset(pgd, addr) ;
      从一级页目录字指针取addr的中级页目录字指针,在2级分页系统中,它们的值是相同的,扩展为
      (pmd_t *)(pgd);

pte_t *pte = pte_offset(pmd, addr)
      从中级页目录字指针取addr的末级页目录字指针,扩展为
      (pte_t *)((pmd->pmd & 0xFFFFF000) + 0xC0000000) + ((addr >> 12) & 0x3FF);

struct page *page = pte_page(pte_val);
      取末级页目录字pte_val的页面映射指针,扩展为
      (mem_map+(pte_val.pte_low >> 12))

pte_t pte_val = ptep_get_and_clear(pte);
      取末级页目录字指针pte的值,并将该目录字清零,扩展为
      (pte_t){ xchg(&pte->pte_low, 0) }

pte_t pte_val = mk_pte(page,pgprot);
      将页面映射指针page与页属性字pgprot组合成页目录字,扩展为
      (pte_t) { ((page - mem_map) << 12) | pgprot.pgprot }
pte_t pte_val =  mk_pte_phys(physpage, pgprot);
      将物理地址physpage所在页面与页属性字组合成页目录字,扩展为
      (pte_t) { ((physpage >> 12) << 12) | pgprot.pgprot }
unsigned long addr = pmd_page(pmd_val);
      取中级页目录字所表示的页目录虚拟地址,扩展为
      ((unsigned long) ((pmd_val.pmd & 0xFFFFF000) + 0xC0000000));

set_pte(pte,pte_val);
      设置末级页目录字,扩展为
      *pte = pte_val;
set_pmd(pmd,pmd_val)
      设置中级页目录字,扩展为
      *pmd = pmd_val;
set_pgd(pgd,pgd_val)
      设置一级页目录字,扩展为
      *pgd = pgd_val;

[目录]

--------------------------------------------------------------------------------

MM作者的文章

Linux MM: design for a zone based memory allocator
Rik van Riel, July 1998
One of the biggest problems currently facing the Linux memory management subsystem is memory fragmentation. This is the result of several developments in other parts of the Linux kernel, most importantly the growth of each process'es kernel stack to 8 kB and the dynamic allocation of DMA and networking buffers. These factors, together with a general speedup of both peripheral hardware and the device drivers has lead to a situation where the currently used buddy allocator just can't cut it anymore. This white-paper is divided in 3 pieces, the problem, the solution and some actual code. I need a lot of comments and hints for possible improvement, so feel free to email them to me...

The problem
The problem is caused by the fact that memory is allocated in chunks of different sizes. For most types of usage we just allocate memory one page (4 kB on most machines) at a time, but sometimes we give out larger pieces of memory (2, 4, 8, 16 or 32 pages at once). Because of the fact that most UNIX (and Linux) machines have a completely full memory (free memory is wasted memory), it is next to impossible to free larger area's and the best we can do is be very careful not to hand out those large areas when we only need a small one.
There have been (and there are) several workarounds for this fragmentation issue; one of them (PTE chaining) even involves a physical to logical translating, almost reverse page table-like solution. With that project, we can swap out pages based on their physical address, thus force freeing that one page that blocked an entire 128 kB area. This would solve most of our problems, except when that last page is unswappable, for example a page table or a program's kernel stack. In that case, we're screwed regardlessly of what deallocation scheme we're using.

Because our inability to hand out larger chunks of memory has impact on system functionality and could even have impact on system stability it seems warranted to sacrifice a little bit of speed (the buddy system is fast!) in order to solve most of the above problems. The main problem with the current system is that it doesn't differentiate between swappable and unswappable memory, leading to a system where page tables and other cruft are scattered all over the system, making it impossible to free up one large contiguous area.

This problem is made even worse by the fact that on some architectures we can only do DMA to addresses under 16 MB and it will undoubtedly show up again in some years when we all have 16 GB of memory and try to do DMA to those oldie 32 bit PCI cards that don't support dual cycle address mode.

The solution
The solution is to hand out free zones of 128 kB large, and to use each zone for one type of usage only. Then we can be sure that no page tables interfere with the freeing of a zone of user memory, and we can always just free an area of memory.
In the current Linux kernel, we have the following uses for memory:

reserved memory, kernel code and statically allocated kernel structures: after system boot we never much with the layout of this memory so it's a non issue wrt. the allocator
user memory: this memory can be swapped out and/or relocated at will, it is allocated one page at a time and gives us no trouble, apart from the fact that we always need more than we have physically available; no special requirements
kernel stack: we allocate 8 kB (2 pages) of unswappable kernel stack for each process; each of those stacks needs to be physically contiguous and it needs to be in fast memory (not in uncached memory)
page tables: page directories are unswappable, page tables and (on some machines) page middle directories can be moved/swapped with great caution; the memory for these is given out one page at a time; we only look up the page tables every once in a while so speed is not very critical; when we have uncached memory, we'd rather use it for page tables than for user pages
small SLAB: SLAB memory is used for dynamic kernel data; it is allocated and freed at will, unfortunately this will is not ours but that of the (device) driver that requested the memory; speed is critical
large SLAB: the same as small SLAB, but sometimes the kernel wants large chunks (> 2 pages); we make the distinction between the two because we don't want to face hopeless fragmentation inside the SLAB zones...
DMA buffers: this memory needs to be physically below a certain boundary (16 MB for ISA DMA) and is often allocated in chunks of 32, 64 or 128 kB
For small (< 16 MB) machines, the above scheme is overkill and we treat several types of usage as one. We can, for instance, treat large SLAB and DMA the same, and small SLAB, kernel stack and page table can be allocated in the same zones too. Small slab and kernel stack will be treated the same on every machine; the distinction is only made because I want the documentation to be complete.
In addition to this, we can differentiate between 3 different kinds of memory:

DMA memory: this memory is located under the 16 MB limit and is cached by the L1 and L2 caches
'normal' memory: this memory is located above the DMA limit and is cached by the L1 and L2 caches, it can not be used for DMA buffers
slow memory: this memory is not cached or present on an add-on board, it can not be used for DMA buffers and using it for time critical kernel stack and SLAB would be disastrous for performance; we also don't want to use it for CPU intensive user applications
Since we don't want to waste the slow memory we might have, we can use that for page tables and user memory that isn't used very often. If we have user memory in slow memory and it turns out that it is used very often we can always use the swap code to relocate it to fast memory. DMA memory is scarce, so we want to allocate that only when we specifically need it or when we don't have any other memory left.
This leads to the following zone allocation orders:

SLAB and kernel stack  | user memory |  page tables |  DMA buffers
-----------------------+---------------+----------------+-------------
normal memory       | normal memory |  slow memory |  DMA memory
DMA memory          | slow memory |  normal memory |
slow memory          | DMA memory |  DMA memory |

This means that when, for instance, we ran out of user memory and there is enough free memory available, we first try to grab a zone of 'normal memory', if that fails we look for a free area of slow memory and DMA memory is tried last.

Page allocation
For SLAB, page table and DMA memory we always try to allocate from the fullest zone available and we grab a free zone when we're out of our own memory. In order to grab the fullest zone, we keep these zones in a (partially?) sorted order. For large SLAB/DMA areas we will also want to keep in mind the sizes of the memory chunks previously allocated in this zone.
User pages are kept on a number of linked lists: active, inactive, clean and free. We allocate new pages in the inactive queue and perform allocations from the free queue first, moving to the clean queue when we're out of free pages. Inactive pages get either promoted to the active queue (when they're in heavy use) or demoted to the clean queue (when they're dirty, we have to clean them first). Pages in the clean queue are also unmapped from the page table and thus already 'halfway swapped out'. Pages only enter the free list when a program free()s pages or when we add a new zone to the user area.

In order to be able to free new zones (for when SLAB gets overly active), we need to be able to mark a relatively free zone force-freeable. Upon scanning such a page kswapd will free the page and make sure it isn't allocated again. When the PTE chaining system gets integrated into the kernel, we can just force-free a user zone with relatively few active pages when the system runs out of free zones. Until then we'll need to keep two free zones and walk the page tables to find and free the pages.

Actual code
There's not much of actual code yet but all the administrative details are ready. ALPHA status reached and the .h file is ready

/*
* The struct mem_zone is used to describe a 32 page memory area.
*/

/*
 * Node of a doubly linked zone list.
 *
 * Both link members are pointers to struct mem_zone: a struct cannot contain
 * itself by value, and in C the tag must be spelled "struct mem_zone".
 */
struct mem_zone {
      struct mem_zone *prev, *next; /* The previous and next zone on this list */
      unsigned long used;  /* Used pages bitmap for SLAB, etc.; a plain count for user zones */
      unsigned long flags; /* ZONE_* flag bits */
};

/*
* Flags for struct mem_zone->flags
*/

/* Memory type of the zone. */
#define ZONE_DMA 0x00000001 /* DMA memory */
#define ZONE_SLOW 0x00000002 /* uncached/slow memory */
/* What the zone is used for. */
#define ZONE_USER 0x00000004 /* usermode pages, these defines are for paranoia only */
#define ZONE_SLAB 0x00000008 /* large SLAB */
#define ZONE_STK 0x00000010 /* kernel stack and order-1 SLAB (and order-0 SLAB if there is slow memory) */
#define ZONE_PTBL 0x00000020 /* page tables and one-page SLAB (except when there is slow memory) */
/*
 * DMA buffers.  This bit was originally also named ZONE_DMA, which redefined
 * the memory-type flag above with a different value; it gets its own name so
 * that both bits stay usable.
 */
#define ZONE_DMABUF 0x00000040
#define ZONE_RECL 0x00000080 /* We are reclaiming this zone */
/* Allocation orders present in the zone. */
#define ZONE_0 0x00000100 /* loose pages allocated */
#define ZONE_1 0x00000200 /* order-1 (2^1 = 2 page) chunks allocated */
#define ZONE_2 0x00000400 /* etc... In order to help in buddy-like allocation for */
#define ZONE_3 0x00000800 /* large SLAB zones on small memory machines. */
#define ZONE_4 0x00001000
#define ZONE_5 0x00002000

/*
* Memory statistics
*/

/* A used/free counter pair, kept once per usage class in struct memstats. */
typedef struct {
      unsigned long used;
      unsigned long free;
} zone_stats_t;

/*
 * Memory accounting, grouped by usage class.
 *
 * zone_stats_t is a typedef name, not a struct tag, so the per-class members
 * are declared with the bare typedef name.  The user/free/misc groups are
 * unnamed struct types with a member name, so they are real members of
 * struct memstats and do not add file-scope tags that could clash with other
 * definitions.
 */
struct memstats {
      zone_stats_t ptbl;
      zone_stats_t stk;
      zone_stats_t slab;
      zone_stats_t dma;
      /* Slightly different structs for these */
      struct {
            unsigned long active;
            unsigned long inactive;
            unsigned long clean;       /* we do lazy reclamation */
            unsigned long free;
      } user;
      struct {
            unsigned long dma;       /* different memory types */
            unsigned long normal;
            unsigned long slow;
      } free;
      struct {
            unsigned long num_physpages;
            unsigned long reserved;       /* reserved pages */
            unsigned long kernel;       /* taken by static kernel stuff */
      } misc;
};

/* This is where we find the different zones */

/*
 * One struct mem_zone per free-memory type and per usage class.
 *
 * The free-zone group is an unnamed struct type with the member name "free",
 * so it is a real member of struct memzones and does not add a file-scope
 * "struct free" tag.
 */
struct memzones {
      struct {
            struct mem_zone dma;
            struct mem_zone normal;
            struct mem_zone slow;
      } free;
      struct mem_zone dma;
      struct mem_zone user;
      struct mem_zone slab;
      struct mem_zone stk;
      struct mem_zone ptbl;
};

实战分享：从技术角度谈机器学习入门| 【大话IT】RadonDB低门槛向MySQL集群下战书 | ChinaUnix打赏功能已上线！ | 新一代分布式关系型数据库RadonDB知多少？

轩辕砍刀

荣誉版主

论坛徽章:: 0

19楼 [报告]

发表于 2003-04-23 20:11 |只看该作者

linux内核分析（转自某位大哥网上的笔记）

试图减小信号灯的数值，如果成功，信号灯的count取值成为0。这个进程现在可以继续运行并使用数据文件。但是，如果另一个进程需要使用这个文件，现在它试图减少信号灯的count数值，它会失败因为结果会是-1。这个进程会被挂起直到第一个进程处理完数据文件。当第一个进程处理完数据文件，它会增加信号灯的waking数值成为1。现在等待进程会被唤醒，这次它减小信号灯的尝试会成功。

每一个独立的信号灯操作可能都需要维护一个调整动作。Linux至少为每一个进程的每一个信号灯数组都维护一个sem_undo的数据结构。如果请求的进程没有，就在需要的时候为它创建一个。这个新的sem_undo数据结构同时在进程的task_struct数据结构和信号灯队列的semid_ds数据结构的队列中排队。对信号灯队列中的信号灯执行操作的时候，和这个操作值相抵消的值加到这个进程的sem_undo数据结构的调整队列这个信号灯的条目上。所以，如果操作值为2，那么这个就在这个信号灯的调整条目上增加-2。

当进程被删除，比如退出的时候，Linux遍历它的sem_undo数据结构组，并实施对于信号灯数组的调整。如果删除信号灯，它的sem_undo数据结构仍旧停留在进程的task_struct队列中，但是相应的信号灯数组标识符标记为无效。这种情况下，清除信号灯的代码只是简单地废弃这个sem_undo数据结构。

3.锁机制
lock_…（）；
unlock_…（）；
wait_on_…（）：TASK_RUNNING -> TASK_UNINTERRUPTIBLE；
进程在RUNNING,WAITING状态间转换时，锁机制也是Linux中解决进程之间共享资源的一个方法。锁就是在资源的结构定义中加入一个锁成员，或为一个标志位,它的取值可以由多个进程检验和设置。锁可以用于实现对资源的共享竞争。具体来说当一个进程占用一个资源时，先对其上锁，然后再进行相关的操作，如果这时别的进程也要用这个资源，则必须等待这个锁被解开后，才可以进行下去。
但是，锁仅在某些数据结构和资源申请中才会用到，进程在申请某种特定资源时，会调用相应的__wait_on_… 函数来判断是否该资源已经被上锁，如果未上锁或已被解锁，则分配资源给进程，否则进程加入到等待队列中去。这种类型的申请有：__wait_on_dquot、__wait_on_buffer、__wait_on_inode、__wait_on_page、__wait_on_super等。
值得注意的是，如果申请不到这种资源，进程的状态都是转变成TASK_UNINTERRUPTIBLE。
定义锁的方式有两种：
专门的某项数据结构：
如：Superblock的数据结构中专门定义了锁数据项：s_lock；
置数据结构中某一项的某个标志位为锁标志：
如：
struct inode中定义了i_state的数据项，通过判断i_state 是否置了 I_LOCK，来判断该inode节点是否上锁。（2.2.8版本中定义）//注意：在2.2.0.34版本中是采用了专门的数据项i_lock来进行锁操作的。
struct buffer_head 中定义了b_state的数据项，通过判断b_state是否置了 BH_Lock位,来判断该缓冲区头是否上锁。
struct dquot中定义了dq_flags的数据项，通过判断dq_flags是否置了DQ_LOCKED位,来判断该dquot是否上锁。
struct page中定义了flags的数据项，通过判断flags是否置了PG_locked 位,来判断该页是否上锁。//注：程序中一般采用PageLocked(page)函数来判断是否上锁。

我们以buffer_head的加锁和解锁操作为例来解释一下通过锁机制进行的状态转换，在这里要申请buffer_head 资源，先要申请到锁，buffer_head的加锁和解锁就是通过置位和复位bh->b_state来实现的：
//申请资源时将该缓冲区上锁，置锁位，如果申请不到，睡眠在等待队列上，等待该锁的释放。
extern inline void lock_buffer(struct buffer_head * bh)
{
      while (set_bit(BH_Lock, &bh->;b_state))
            __wait_on_buffer(bh);
}

//资源释放时，清该缓冲区锁位，并唤醒等待队列上的进程，参与竞争资源。
void unlock_buffer(struct buffer_head * bh)
{
      ......

      clear_bit(BH_Lock, &bh->;b_state);
      wake_up(&bh->;b_wait);
      ......
}

//检验该锁位是否已经置位
static inline int buffer_locked(struct buffer_head * bh)
{
      return test_bit(BH_Lock, &bh->;b_state);
}

//在 \USR\SRC\LINUX\FS\BUFFER.C中定义了__wait_on_buffer(struct buffer_head * bh);该函数判断该buffer_head是否已经被上了锁，如果是，则不能得到资源，将进程置成TASK_UNINTERRUPTIBLE，加入bh->b_wait队列中，调用schedule()转去调用其他的进程，否则，分配给资源，进程进入TASK_RUNNING状态。
void __wait_on_buffer(struct buffer_head * bh)
{
      struct wait_queue wait = { current, NULL };

      bh->;b_count++;
      add_wait_queue(&bh->;b_wait, &wait);/*进程加入到等待锁的队列*/
repeat:
      run_task_queue(&tq_disk);
      current->;state = TASK_UNINTERRUPTIBLE;/*进程状态置为TASK_UNINTERRUPTIBLE*/
      if (buffer_locked(bh)) {
            schedule(); /*如果申请不到锁，重新调度CPU*/
            goto repeat;
      }
      remove_wait_queue(&bh->;b_wait, &wait);/*进程从等待队列中删除*/
      bh->;b_count--;
      current->;state = TASK_RUNNING; /*进程状态置为TASK_ RUNNING*/
}

4. 管道（流）
管道做为系统的特殊设备文件,可以是内存方式的,也可以是外存方式的。管道的传输一般是单向的,即一个管道一向,若两个进程要做双向传输则需要2个管道.管道生成时即有两端,一端为读,一端为写,两个进程要协调好,一个进程从读方读,另一个进程向写方写。管道的读写使用流设备的读写函数,即:read(),write.管道的传输方式为FIFO,流方式的.不象消息队列可以按类型读取.管道分为有名管道和无名管道：
1．有名管道
一般为系统特殊文件方式,使用的进程之间不一定要有父子关系或兄弟关系.
2．无名管道
一般为内存方式,使用的进程之间一定要有父子关系或兄弟关系.

  Linux shell允许重定向。例如：

  $ ls | pr | lpr

  把列出目录文件的命令ls的输出通过管道接到pr命令的标准输入上进行分页。最后，pr命令的标准输出通过管道连接到lpr命令的标准输入上，在缺省打印机上打印出结果。管道是单向的字节流，把一个进程的标准输出和另一个进程的标准输入连接在一起。没有一个进程意识到这种重定向，和它平常一样工作。是shell建立了进程之间的临时管道。在Linux中，使用指向同一个临时VFS INODE节点（本身指向内存中的一个物理页）的两个file数据结构来实现管道。当写进程向管道中写的时候，字节拷贝到了共享的数据页，当从管道中读的时候，字节从共享页中拷贝出来。Linux必须同步对于管道的访问。必须保证管道的写和读步调一致，它使用锁、等待队列和信号。
      运用管道方式进行通讯的进程，由于都是调用sleep_on_interruptible,因此都是睡眠在TASK_INTERRUPTIBLE状态的。

管道结构的定义在include\linux\pipe_fs_i.h中，
/*
 * Pipe bookkeeping record: a wait-queue pointer, a buffer pointer, and
 * unsigned counters.  Member order and types are exactly those of the
 * original declaration; same-typed neighbours are merely declared together.
 */
struct pipe_inode_info {
      struct wait_queue *wait;
      char *base;
      unsigned int start, len;
      unsigned int lock;                    /* this member is used as a lock */
      unsigned int rd_openers, wr_openers;
      unsigned int readers, writers;
};

对管道的操作主要有读和写两种：
1．向一个管道写pipe_write()：
在\fs\pipe.c中定义了static int pipe_write(struct inode * inode, struct file * filp, const char * buf, int count)；
实现机制：当写进程向管道写的时候，它使用标准的write库函数。这些库函数传递的文件描述符是进程的file数据结构组中的索引，每一个都表示一个打开的文件，在这种情况下，是打开的管道。Linux系统调用使用描述这个管道的file数据结构指向的write例程。这个write例程使用表示管道的VFS INODE节点存放的信息，来管理写的请求。如果有足够的空间把所有的字节都写到管道中，只要管道没有被读进程锁定，Linux为写进程上锁，并把字节从进程的地址空间拷贝到共享的数据页。如果管道被读进程锁定或者空间不够，当前进程睡眠，并放在管道INODE节点的等待队列中，并调用调度程序，运行另外一个进程。它是可以中断的，所以它可以接收信号。当管道中有了足够的空间写数据或者锁定解除，写进程就会被读进程唤醒。当数据写完之后，管道的VFS INODE 节点锁定解除，管道INODE节点的等待队列中的所有读进程都会被唤醒。

2．从一个管道读Pipe_read()：
在\fs\pipe.c中定义了static int pipe_read(struct inode * inode, struct file * filp, char * buf, int count)；
实现机制：从管道中读取数据和写数据非常相似。进程允许进行非阻塞的读（依赖于它们打开文件或者管道的模式），这时，如果没有数据可读或者管道被锁定，会返回一个错误。这意味着进程会继续运行。另一种方式是在管道的INODE节点的等待队列中等待，直到写进程完成。如果管道的进程都完成了操作，管道的INODE节点和相应的共享数据页被废弃。

进程在TASK_RUNNING和TASK_STOPPED间的转换：
1．进程从TASK_RUNNING->TASK_STOPPED的转换：
TASK_STOPPED状态是一种暂停状态，和TASK_STOPPED状态配合工作的标志为PF_PTRACED和PF_TRACESYS，分别表示被跟踪和正在跟踪系统调用，一个是被动的，一个是主动的。
进程可通过两种途径进入TASK_STOPPED状态：
1)．受其它进程的syscall_trace（）系统调用的控制而暂时将CPU交给控制进程。
在调用syscall_trace（）之前必须先调用sys_ptrace()（简称ptrace（）），进行跟踪系统调用之前的准备工作。只有调用sys_ptrace()后，current的PF_PTRACED和PF_TRACESYS标志都已置位，跟踪和被跟踪的关系都已明确，再调用syscall_trace（）才能真正使进程转入STOPPED状态。
      syscall_trace（）实现步骤：
（1）检验是否调用过ptrace（）做过准备，没有则返回；
      （2）置状态STOPPED ，通知父进程，子进程状态已变；
      （3）进行CPU重新调度，将current进程从runqueue删除。
（4）如果exit_code非空，将它的值作为接收到的信号放到signal中。若是SIGTRAP
   则current进程将一直处于stopped，直到收到其它信号。

sys_ptrace()实现步骤：
（1）如果request == PTRACE_TRACEME，则有进程要求跟踪current进程:
            若current进程已被其它进程跟踪返回；
否则置current进程已被进程跟踪的标记；
（2）如果current进程想跟踪其它进程:
         a.不能跟踪init进程；
            b.找pid对应的进程child，找不到返回出错；
            c.如果request为PTRACE_ATTACH
如果current进程试图ATTACH自己，出错；
            如果试图attach的进程child已被attach，出错；
            否则       child->flags |= PF_PTRACED;做上标记，child已被attach；如果child
不是current的子进程，将它变成current的子进程；并且发SIGSTOP信号，暂
停它。
（3）进行其他合法性检查；
（4）判断request，执行相应操作：
            case PTRACE_SYSCALL:继续执行，在下一个系统调用返回处停下。
case PTRACE_CONT:发信号后重新开始运行。
                     如果request == PTRACE_SYSCALL，置child标志位PF_TRACESYS;
                     否则       清child标志位PF_TRACESYS，以防止重新运行后因历史原因在下一个
      系统调用返回处停下；
                     唤醒child进程。
      case PTRACE_KILL: 想要结束child进程，唤醒child进程，并在退出信息
                  exit_code中存放SIGKILL信号。
            case PTRACE_SINGLESTEP:  进行单步运行环境设置。
            case PTRACE_DETACH: 恢复child进程的自由。清跟踪标志，并唤醒child进程                                     恢复child进程的原始亲属关系。

2)．收到要求它暂停的信号。
另一种进入STOPPED状态的方法是信号，SIGSTOP信号使自由的current进程，打上PF_PTRACED标记，并将它转入STOPPED状态。do_signal在检查进程收到的信号时，若发现current进程已打上PF_PTRACED标记，则除收到的SIGKILL信号的情况外，current进程都将马上进入STOPPED状态。
do_signal（）实现步骤：
（1）current进程已打上PF_PTRACED标记，则除收到的SIGKILL信号的情况外，进程都将进入TASK_STOPPED状态，通知父进程，并重新调度；
      （2）如果收到信号SIGSTOP:如果当前进程标志位不是PF_PTRACED，则置当前进程状态为TASK_STOPPED; 通知父进程，并重新调度；

2.进程从TASK_STOPPED->TASK_RUNNING的转换:
从TASK_STOPPED状态转到TASK_RUNNING状态通过“信号唤醒”。当有SIGKILL或SIGCONT信号发给TASK_STOPPED状态下的进程时，进程将被wake_up_process()唤醒。
int send_sig(unsigned long sig,struct task_struct * p,int priv)
{
      ………;
      save_flags(flags); cli(); /*保存标志，关中断*/
      if ((sig == SIGKILL) || (sig == SIGCONT)) {
            if (p->;state == TASK_STOPPED)
                     wake_up_process(p);    /*若进程p的状态是STOPPED，并且所发送的信号是SIGKILL和SIGCONT，将p状态赋成RUNNING，并挂到run-queue*/
p->;exit_code = 0;    /*退出信息没有*/
            p->;signal &= ~( (1<<(SIGSTOP-1)) | (1<<(SIGTSTP-1)) |
                              (1<<(SIGTTIN-1)) | (1<<(SIGTTOU-1)) );       /*处理过信号后，将p的可能曾接受到的SIGSTOP、SIGTSTP、SIGTTIN、SIGTTOU信号清掉*/
      }
      if (sig == SIGSTOP || sig == SIGTSTP || sig == SIGTTIN || sig == SIGTTOU)
            p->;signal &= ~(1<<(SIGCONT-1));    /*若信号为SIGSTOP、SIGTSTP、SIGTTIN、SIGTTOU中的任一种，将p可能曾接受到的SIGCONT信号清掉*/
      restore_flags(flags);       /*恢复CPU标志同时打开中断*/
      generate(sig,p);    /*登记不能立即被处理的信号。*/
      return 0;
}

进程的终止:从TASK_RUNNING->TASK_ZOMBIE的转换
进程终止由可终止进程的系统调用通过调用do_exit（）实现，do_exit（）终止current进程，首先为current进程做上PF_EXITING的标记，释放current进程的存储管理信息、文件系统、文件信息、信号响应函数指针数组，将状态置成TASK_ZOMBIE，通知current的父进程，最后进行重新调度。do_exit()带一个参数code，用于传递终止进程的原因。

do_exit(long code)流程：
（1）如果进程在中断服务程序中调用do_exit（），则打印提示信息
（2）记录进程的记帐信息
（3）进程标志置为PF_EXITING
（4）释放定时器链表
      （5）释放临界区数据
      （6）将消息队列中和current进程有关项删除
      （7）释放进程的存储管理信息
      （8）释放进程已打开文件的信息
      （9）释放进程的文件系统
      （10）释放进程的信号响应函数指针数组等管理信息
      （11）释放进程的LDT
      （12）进程状态置为TASK_ZOMBIE
      （13）置上退出信息，通知所有进程亲戚，它要退出了#
      （14）exec_domain结构共享计数减1， binfmt结构共享计数减1
（15）重新调度，将current进程从run-queue中删除，交出CPU

exit_notify （）函数向所有和current进程有关的进程发相应的消息，以便它们开展工作，exit_notify （）还判断current进程所在组是否会因current进程的退出而悬空，如果悬空并且组内有stopped状态的进程则发信号；同时进行一系列的指针调整，调整因current进程的死亡引起的进程关系转变。
exit_notify （）流程：
将所有原始进程为current的进程变成init进程的孙子。
如果父进程和current进程不在同一组，但在同一session内并且current进程组内所有进程的父进程和它在同一组，也就是说，current进程所在组会因current的退出而悬挂，同时current进程所在组内有stopped进程，就向整个组发SIGHUP和SIGCONT信号。
通知父进程进程死了。
调整所有current进程的子进程的父进程指针，将它们挂到它们的原始进程下，
   将以往的跟踪被跟踪历史清除，调整它和新的兄弟的关系；检查每一个current
   进程的子进程所在的组是否会悬挂，如果子进程和current进程不在同一组，并
   且这个组已悬挂，组内有stopped的进程，就向组员发SIGHUP 和 SIGCONT信号。       （5）如果current进程是session的主管，就和它所控制的tty脱离，向current
   进程显示终端所在的组发SIGHUP 和 SIGCONT信号。

进程直接或间接地调用do_exit() 后，进程进入ZOMBIE状态，还有一块PCB未释放。PCB的释放必须由它的父进程执行，当父进程调用sys_wait4（）时释放进入ZOMBIE状态的子进程的PCB。

具体调用do_exit()的函数有以下情况：
具体对应的系统调用出错，不得不终止进程，如：
do_page_fault（）：这个系统调用处理页面错误，它找到页面地址，出错原因，并将它转入相应的处理函数。当发生越界（out of memory）等bad page的致命错误。

sys_sigreturn（）：一般情况下sys_sigreturn（）将sigcontext的内容保存到堆栈中，保存过程中当发现段寄存器越界了，这个致命错误就将导致进程结束。

setup_frame（）：setup_frame（）建立信号响应函数的运行的环境，保存当前寄存器，将相应寄存器赋上信号响应函数的信息。在具体设定之前首先进行存储条件检验，不满足就不得不结束进程。

save_v86_state（）：save_v86_state（）保存虚拟86模式下（virtual 86 mode）的信息，如果进程PCB中vm86的信息为空的，无法继续进行操作，只能结束进程。

（2）其他终止进程的情况，通过调用以下函数实现终止：
sys_exit（）：是一个系统调用，实现终止调用它的当前进程。

sys_reboot()：sys_reboot()只能被特权级用户调用，用于重新启动系统。

do_signal（）：do_signal（）是处理信号的一个函数。检查current进程每一个接收到的signal，如果是结束进程的信号，结束进程进行相应处理。

die_if_kernel（）。

[目录]

--------------------------------------------------------------------------------

线程

1       概述
1.1       线程的定义(Introduction)
Threads can best be described as “lightweight processes”. The traditional UNIX-style notion of a process has been found to be inconvenient, if not inadequate for several applications in distributed systems development. The needs of these applications are best served by threads, which generalize the notion of a process so that it can be associated with multiple activities. The popularity of threads has resulted in their implementation on UNIX systems and thread libraries are now widely available to programmers for the development of concurrent applications.
1.2       Threads Implementation
Threads can be implemented in one of two ways:
1. User-level threads:
There is no kernel support for multi-threaded processes. Hence, the kernel only has a single-threaded process abstraction, but multi-threaded processes are implemented in a library of procedures linked with application programs. The kernel has no knowledge of lightweight processes (threads), and therefore cannot schedule them independently. A threads run-time library organizes the scheduling of threads. A thread would block the process and therefore all threads within it if it made a blocking system call, so the asynchronous I/O facilities of UNIX are used. The major disadvantage of this scheme is that threads within a process cannot take advantage of a multi-processor.
（上段译文）User-level没有核心支持的多线程的进程。因此，核心只有单线程进程概念，而多线程进程由与应用程序连接的过程库实现。核心不知道线程的存在，也就不能独立的调度这些线程了。一个线程运行库组织线程的调度。如果一个线程调用了一个阻塞的系统调用，进程可能被阻塞，当然其中的所有线程也同时被阻塞，所以UNIX使用了异步I/O工具。这种机制的的最大缺点是不能发挥多处理器的优势。
The advantages include:
（系统消耗小）Certain thread operations are significantly less costly. For example, switching between threads belonging to the same process do not necessarily involve a system call, and hence save this over-head.
（可以修改以适应特殊的应用）User-level thread implementations can be customized or changed to suit the particular application requirements. This is particularly useful for real-time multimedia processing etc. Also, it is possible to support many more user-level threads than can be supported by default by a kernel.
2. Kernel-level threads:
This implementation allows threads within different processes to be scheduled according to a single scheme of relative prioritizing. This is suited for exploiting the concurrence of multiprocessors.
核心级线程允许不同进程里的线程按照同一相对优先方法调度，这适合于发挥多处理器的并发优点。
Most of the current thread library implementations available today implement user-level threads. There have been several research projects that have implemented some form of Kernel-level threads. Notable among these are the Mach distributed OS, which combines the advantages of user-level and kernel-level threads by allowing user-level code to provide scheduling hints to the kernel thread scheduler. By providing such a two-level scheduling scheme, the kernel retains control over the allocation of processor time, but also allows a process to take advantage of multiple processors.

1.3       Thread Libraries
The two most widely used thread libraries are POSIX and Solaris thread libraries. Both implementations are inter-operable, their functionality is similar, and can be used within the same application. However, only POSIX threads are guaranteed to be fully portable to other POSIX-compliant environments.
Similarities:
Most of the functions in both libraries, libpthread and libthread, have a counterpart in the other library. POSIX functions and Solaris functions, whose names have similar endings, usually have similar functionality, number of arguments, and use of arguments. All POSIX threads function names begin with the prefix pthread_, whereas the Solaris threads function names begin with the prefix thr_.
Differences:
POSIX
is more portable
establishes characteristics for each thread according to configurable attribute objects
implements thread cancellation
enforces scheduling algorithms
allows for clean-up handlers for fork(2) calls
Solaris
threads can be suspended and continued
implements an optimized mutex, readers/writer locking
may increase the concurrency
implements daemon threads, for whose demise the process does not wait

1.4       Threads Standards
There are three different definitions for thread libraries competing for attention today: Win32, OS/2, and POSIX. The first two are proprietary and limited to their individual platforms (Win32 threads run only under NT and Win95, OS/2 threads on OS/2). The POSIX specification (IEEE 1003.1c, aka Pthreads) is intended for all computing platforms, and implementations are available or in the works for almost all major UNIX systems (including Linux), along with VMS.

POSIX Threads
The POSIX standard defines the API and behavior that all the Pthreads libraries must meet. It is part of the extended portion of POSIX, so it is not a requirement for meeting XPG4, but it is required for X/Open UNIX 98, and all major UNIX vendors have committed to meeting this standard. As of this writing, (7/97) almost all UNIX vendors have released a library.

Win32 and OS/2 Threads
Both the NT and OS/2 implementations contain some fairly radical differences
from the POSIX standard--to the degree that even porting from one or the other
to POSIX will prove moderately challenging. Microsoft has not announced any
plans to adopt POSIX. There are freeware POSIX libraries for Win32 (see
Commercial Products on page 249), and OS/2 also has an optional POSIX library.

DCE Threads
Before POSIX completed work on the standard, it produced a number of drafts which it published for comment. Draft 4 was used as the basis for the threads library in DCE. It is similar to the final spec, but it does contain a number of significant differences. Presumably, no one is writing any new DCE code.

Solaris Threads
Also known as UI threads, this is the library that SunSoft used in developing Solaris 2 before the POSIX committee completed their work. It will be available on Solaris 2 for the foreseeable future, although we expect most applications writers will opt for Pthreads. The vast majority of the two libraries are virtually identical.
1.5       Linux线程的思想及特点
1.5.1       LinuxThreads
http://pauillac.inria.fr/~xleroy/linuxthreads
Xavier Leroy at INRIA (Paris, France), with input from Pavel Krauz, Richard Henderson and others, has developed a Pthreads library that implements the One-to-One model, allowing it to take advantage of multiple processors. It is based on the new Linux system call, clone(). It runs on Linux 2.0 and up, on Intel, Alpha, SPARC, m68k, and MIPS machines. One limitation is its non-standard implementation of signal handling.
1.5.2       Implementation model for LinuxThreads
LinuxThreads follows the so-called "one-to-one" model: each thread is actually a separate process in the kernel. The kernel scheduler takes care of scheduling the threads, just like it schedules regular processes. The threads are created with the Linux clone() system call, which is a generalization of fork() allowing the new process to share the memory space, file descriptors, and signal handlers of the parent.
LinuxThreads采用称为1-1模型：每个线程实际上在核心是一个个单独的进程，核心的调度程序负责线程的调度，就象调度普通进程。线程是用系统调用clone()创建的，clone()系统调用是fork()的普遍形式，它允许新进程共享父进程的存储空间、文件描述符和软中断处理程序。
Advantages of the "one-to-one" model include:

Minimal overhead on CPU-intensive multiprocessing (with about one thread per processor); 最小限度消耗的CPU级多处理技术（每个CPU一个线程）；
Minimal overhead on I/O operations; 最小限度消耗的I/O操作；
A simple and robust implementation (the kernel scheduler does most of the hard work for us)；一种简单和强壮的实现（核心调度程序为我们做了大部分艰难的工作）。

The main disadvantage is more expensive context switches on mutex and condition operations, which must go through the kernel. This is mitigated by the fact that context switches in the Linux kernel are pretty efficient.

1.5.3       Consider other implementation models

There are basically two other models. The "many-to-one" model relies on a user-level scheduler that context-switches between the threads entirely in user code; viewed from the kernel, there is only one process running. This model is completely out of the question for me, since it does not take advantage of multiprocessors, and require unholy magic to handle blocking I/O operations properly. There are several user-level thread libraries available for Linux, but I found all of them deficient in functionality, performance, and/or robustness.
还有另外两种基本模型。多对一模型依赖于用户级的调度程序，线程切换完全由用户程序完成；从核心角度看，只有一个进程正在运行。这种模型不是我们所关心的，因为它无法利用多处理器的优点，而且要用不合理的方法处理I/O操作阻塞。
The "many-to-many" model combines both kernel-level and user-level scheduling: several kernel-level threads run concurrently, each executing a user-level scheduler that selects between user threads. Most commercial Unix systems (Solaris, Digital Unix and IRIX) implement POSIX threads this way. This model combines the advantages of both the "many-to-one" and the "one-to-one" model, and is attractive because it avoids the worst-case behaviors of both models -- especially on kernels where context switches are expensive, such as Digital Unix. Unfortunately, it is pretty complex to implement, and requires kernel supporting which Linux does not provide. Linus Torvalds and other Linux kernel developers have always been pushing the "one-to-one" model in the name of overall simplicity, and are doing a pretty good job of making kernel-level context switches between threads efficient. LinuxThreads is just following the general direction they set.
2       Linux核心对线程的支持
Linux核心对线程的支持主要是通过其系统调用，下文将进行系统的介绍。
2.1       系统调用clone()
以下是系统调用clone的代码：
/*
 * clone() system-call entry.
 *
 * The clone flags are read from regs.ebx and the new stack pointer value
 * from regs.ecx; when the latter is zero, regs.esp is used instead.  The
 * result of do_fork() is returned unchanged.
 */
asmlinkage int sys_clone(struct pt_regs regs)
{
      unsigned long stack_ptr = regs.ecx;
      unsigned long flags = regs.ebx;

      /* No stack supplied: fall back to the stack pointer saved in regs. */
      if (stack_ptr == 0) {
            stack_ptr = regs.esp;
      }

      return do_fork(flags, stack_ptr, &regs);
}

与系统调用clone功能相似的系统调用有fork,但fork事实上只是clone的功能的一部分，clone与fork的主要区别在于传递了几个参数，而当中最重要的参数就是clone_flags,下表是系统定义的几个clone_flags标志：
标志       Value       含义
CLONE_VM       0x00000100       置起此标志在进程间共享VM
CLONE_FS       0x00000200       置起此标志在进程间共享文件系统信息
CLONE_FILES       0x00000400       置起此标志在进程间共享打开的文件
CLONE_SIGHAND       0x00000800       置起此标志在进程间共享信号处理程序
如果置起以上标志所做的处理分别是：
置起CLONE_VM标志：
            mmget(current->mm);
            /*
            * Set up the LDT descriptor for the clone task.
            */
            copy_segments(nr, tsk, NULL);
            SET_PAGE_DIR(tsk, current->mm->pgd);
置起CLONE_FS标志：
            atomic_inc(&current->fs->count);
置起CLONE_FILES标志：
            atomic_inc(&oldf->count);
置起CLONE_SIGHAND标志：
            atomic_inc(&current->sig->count);
2.2       与线程调度相关的系统调用
以下是glibc-linuxthread用来进行调度的系统调度：
      .long SYMBOL_NAME(sys_sched_setparam) /* system call 154 */
/* sets the scheduling parameters of a process (or thread) */
      .long SYMBOL_NAME(sys_sched_getparam)
/* gets the scheduling parameters of a process (or thread) */
      .long SYMBOL_NAME(sys_sched_setscheduler)
/* sets the scheduling policy and parameters of a process (or thread) */
      .long SYMBOL_NAME(sys_sched_getscheduler)
/* gets the scheduling policy of a process (or thread) */
      .long SYMBOL_NAME(sys_sched_yield)
/* forces the kernel to reschedule the process (or thread) */
      .long SYMBOL_NAME(sys_sched_get_priority_max)
/* gets the maximum priority value for a scheduling policy */
      .long SYMBOL_NAME(sys_sched_get_priority_min)
/* gets the minimum priority value for a scheduling policy */
      .long SYMBOL_NAME(sys_sched_rr_get_interval) /* system call 161 */
/* gets the scheduling time interval of a process (or thread) */

3       Linux线程的实现
3.1       LinuxThreads概述
现在的0.8版LinuxThreads，是迄今为止在Linux下支持threads的最好的Runtime-library，而包含0.8版LinuxThreads的最好的Runtime-library是glibc- 2.1，下文所要分析的正是glibc-linuxthreads-2.1。
首先介绍一下0.8版LinuxThreads，它实现了一种BiCapitalized面向Linux的Posix 1003.1c"pthread"标准接口。LinuxThreads提供核心级线程即每个线程是一个独立的UNIX进程，通过调用新的系统调用与其它线程共享地址空间。线程由核心调度，就象UNIX进程调度一样。使用它的要求是：LINUX 版本2.0 或以上（要求有新的clone() 系统调用和新的实时调度程序）。对于Intel平台：要求有libc 5.2.18或后续版本，推荐使用5.2.18 或 5.4.12 及其后续版本；5.3.12和5.4.7有问题，也支持glibc 2,实际上是支持它的一个特别合适的版本。到目前支持Intel, Alpha, Sparc, Motorola 68k, ARM and MIPS平台，还支持多处理器
3.2       主要的数据结构及初始化
3.2.1       数据结构和部分数据初始化
/* Arguments passed to thread creation routine */
/* Bundles the start function and its argument with the initial signal mask
   and the initial scheduling policy/parameters of the thread. */
struct pthread_start_args {
  void * (*start_routine)(void *); /* function to run */
  void * arg;                /* its argument */
  sigset_t mask;             /* initial signal mask for thread */
  int schedpolicy;             /* initial scheduling policy (if any) */
  struct sched_param schedparam; /* initial scheduling parameters (if any) */
};

/* The type of thread descriptors */
/* pthread_descr is a POINTER to the descriptor structure, so a declaration
   such as "pthread_descr a, b;" declares two pointers. */
typedef struct _pthread_descr_struct * pthread_descr;

/* Thread descriptor: list links, identity, signal/cancellation state, errno
   storage, thread-specific data and stack bookkeeping, as described by the
   per-member comments below. */
struct _pthread_descr_struct {
  pthread_descr p_nextlive, p_prevlive;
                              /* Double chaining of active threads */
  pthread_descr p_nextwaiting;  /* Next element in the queue holding the thr */
  pthread_t p_tid;             /* Thread identifier */
  int p_pid;                   /* PID of Unix process */
  int p_priority;             /* Thread priority (== 0 if not realtime) */
  struct _pthread_fastlock * p_lock; /* Spinlock for synchronized accesses */
  int p_signal;                /* last signal received */
  sigjmp_buf * p_signal_jmp; /* where to siglongjmp on a signal or NULL */
  sigjmp_buf * p_cancel_jmp; /* where to siglongjmp on a cancel or NULL */
  char p_terminated;          /* true if terminated e.g. by pthread_exit */
  char p_detached;             /* true if detached */
  char p_exited;             /* true if the assoc. process terminated */
  void * p_retval;             /* placeholder for return value */
  int p_retcode;             /* placeholder for return code */
  pthread_descr p_joining;    /* thread joining on that thread or NULL */
  struct _pthread_cleanup_buffer * p_cleanup; /* cleanup functions */
  char p_cancelstate;          /* cancellation state */
  char p_canceltype;          /* cancellation type (deferred/async) */
  char p_canceled;             /* cancellation request pending */
  int * p_errnop;             /* pointer to used errno variable */
  int p_errno;                /* error returned by last system call */
  int * p_h_errnop;          /* pointer to used h_errno variable */
  int p_h_errno;             /* error returned by last netdb function */
  char * p_in_sighandler;    /* stack address of sighandler, or NULL */
  char p_sigwaiting;          /* true if a sigwait() is in progress */
  struct pthread_start_args p_start_args; /* arguments for thread creation */
  void ** p_specific[PTHREAD_KEY_1STLEVEL_SIZE]; /* thread-specific data */
  void * p_libc_specific[_LIBC_TSD_KEY_N]; /* thread-specific data for libc */
  int p_userstack;             /* nonzero if the user provided the stack */
  void *p_guardaddr;             /* address of guard area or NULL */
  size_t p_guardsize;             /* size of guard area */
  pthread_descr p_self;             /* Pointer to this structure */
  int p_nr;                   /* Index of descriptor in __pthread_handles */
};

/* The type of thread handles. */
//线程句柄
/* pthread_handle is a pointer to a handle structure. */
typedef struct pthread_handle_struct * pthread_handle;

/* A thread handle: a lock, the descriptor the handle refers to, and a stack
   address. */
struct pthread_handle_struct {
  struct _pthread_fastlock h_lock; /* Fast lock for synchronized access */
  pthread_descr h_descr;       /* Thread descriptor or NULL if invalid */
  char * h_bottom;             /* Lowest address in the stack thread */
};

/* The type of messages sent to the thread manager thread */
/* A request names the requesting thread, a request kind, and a union of
   per-kind arguments.  The union has members for REQ_CREATE, REQ_FREE,
   REQ_PROCESS_EXIT and REQ_POST only; REQ_MAIN_THREAD_EXIT and REQ_DEBUG
   have no argument member. */
struct pthread_request {
  pthread_descr req_thread;    /* Thread doing the request */
  enum {                      /* Request kind */
REQ_CREATE, REQ_FREE, REQ_PROCESS_EXIT, REQ_MAIN_THREAD_EXIT,
REQ_POST, REQ_DEBUG
  } req_kind;
  union {                      /* Arguments for request */
struct {                   /* For REQ_CREATE: */
   const pthread_attr_t * attr; /* thread attributes */
   void * (*fn)(void *);    /* start function */
   void * arg;             /* argument to start function */
   sigset_t mask;          /* signal mask */
} create;
struct {                   /* For REQ_FREE: */
   pthread_t thread_id;    /* identifier of thread to free */
} free;
struct {                   /* For REQ_PROCESS_EXIT: */
   int code;                /* exit status */
} exit;
void * post;             /* For REQ_POST: the semaphore */
  } req_args;
};

/* One end of the pipe for sending requests to the thread manager. */
/* This is the end requests are written to.  It starts out as -1, which
   means the manager thread has not been created yet. */
int __pthread_manager_request = -1;

/* Other end of the pipe for sending requests to the thread manager. */

int __pthread_manager_reader;

/* Size of a thread's stack. */
#define STACK_SIZE  (2 * 1024 * 1024)
/* Initial size of a thread's stack. */
#define INITIAL_STACK_SIZE  (4 * PAGE_SIZE)

/* Attributes for threads.  */
typedef struct
{
  int __detachstate;                  /* copied into the new descriptor's p_detached */
  int __schedpolicy;                  /* policy used when __inheritsched is PTHREAD_EXPLICIT_SCHED */
  struct __sched_param __schedparam;  /* parameters used together with __schedpolicy */
  int __inheritsched;                 /* PTHREAD_EXPLICIT_SCHED or PTHREAD_INHERIT_SCHED */
  int __scope;
  size_t __guardsize;                 /* requested guard area size; 0 means no guard */
  int __stackaddr_set;                /* nonzero if the user provided the stack */
  void *__stackaddr;                  /* user stack address; stack bottom is __stackaddr - __stacksize */
  size_t __stacksize;                 /* stack size */
} pthread_attr_t;

/* Maximum number of threads per process. */
#define PTHREAD_THREADS_MAX       1024

3.2.2       Main thread and manager thread initializing

/* Thread creation */

/* Create a thread by delegating to the manager thread: start the manager
   if it is not running yet, write a REQ_CREATE request to the manager
   pipe, suspend, and then read the outcome from the caller's own
   descriptor.  Returns EAGAIN if the manager cannot be started, otherwise
   the caller's p_retcode; when that is 0 the new thread id (taken from
   p_retval) is stored through `thread`.
   NOTE(review): p_retcode/p_retval are presumably filled in by the manager
   before it wakes the caller - confirm against the manager code. */
int __pthread_create_2_1(pthread_t *thread, const pthread_attr_t *attr,
                     void * (*start_routine)(void *), void *arg)
{
  pthread_descr self = thread_self();
  struct pthread_request request;
  if (__pthread_manager_request < 0) { // has the threading machinery been started yet?
      // no: initialize the manager thread, which brings the machinery up
if (__pthread_initialize_manager() < 0) return EAGAIN;
  }
  request.req_thread = self;
  request.req_kind = REQ_CREATE;
  request.req_args.create.attr = attr;
  request.req_args.create.fn = start_routine;
  request.req_args.create.arg = arg;
  /* A NULL new set leaves the mask unchanged; this only fetches the
     caller's current signal mask into the request. */
  sigprocmask(SIG_SETMASK, (const sigset_t *) NULL,
            &request.req_args.create.mask);
// send the request to the manager thread
  __libc_write(__pthread_manager_request, (char *) &request, sizeof(request));
  suspend(self);
  if (THREAD_GETMEM(self, p_retcode) == 0)
*thread = (pthread_t) THREAD_GETMEM(self, p_retval);
  return THREAD_GETMEM(self, p_retcode);
}

int __pthread_initialize_manager(void)
{
  int manager_pipe[2];
  int pid;
  struct pthread_request request;

  /* If basic initialization not done yet (e.g. we're called from a constructor run before our constructor), do it now */
//初始化初始线程
  if (__pthread_initial_thread_bos == NULL) pthread_initialize();
  /* Setup stack for thread manager */建立管理线程堆栈
  __pthread_manager_thread_bos = malloc(THREAD_MANAGER_STACK_SIZE);
  if (__pthread_manager_thread_bos == NULL) return -1;
  __pthread_manager_thread_tos =
__pthread_manager_thread_bos + THREAD_MANAGER_STACK_SIZE;
  /* Setup pipe to communicate with thread manager */
//建立与管理线程通信的管道
  if (pipe(manager_pipe) == -1) {
free(__pthread_manager_thread_bos);
return -1;
  }
  /* Start the thread manager */启动管理线程
  pid = __clone(__pthread_manager, (void **) __pthread_manager_thread_tos,
            CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND
            , (void *)(long)manager_pipe[0]);
  if (pid == -1) {
free(__pthread_manager_thread_bos);
__libc_close(manager_pipe[0]);
__libc_close(manager_pipe[1]);
return -1;
  }
  __pthread_manager_request = manager_pipe[1]; /* writing end */
  __pthread_manager_reader = manager_pipe[0]; /* reading end */
  __pthread_manager_thread.p_pid = pid;
  /* Make gdb aware of new thread manager */
  if (__pthread_threads_debug && __pthread_sig_debug >; 0)
{
   raise(__pthread_sig_debug);
   /* We suspend ourself and gdb will wake us up when it is
      ready to handle us. */
   suspend(thread_self());
}
  /* Synchronize debugging of the thread manager */
  request.req_kind = REQ_DEBUG;
  __libc_write(__pthread_manager_request, (char *) &request, sizeof(request));
  return 0;
}
//初始化初始线程
static void pthread_initialize(void)
{
  struct sigaction sa;
  sigset_t mask;
  struct rlimit limit;
  int max_stack;

  /* If already done (e.g. by a constructor called earlier!), bail out */
  if (__pthread_initial_thread_bos != NULL) return;
#ifdef TEST_FOR_COMPARE_AND_SWAP
  /* Test if compare-and-swap is available */
  __pthread_has_cas = compare_and_swap_is_available();
#endif
  /* For the initial stack, reserve at least STACK_SIZE bytes of stack below the current stack address, and align that on a STACK_SIZE boundary. */
//当前堆栈下为初始堆栈留出至少STACK_SIZE，并按STACK_SIZE对齐
  __pthread_initial_thread_bos =
(char *)(((long)CURRENT_STACK_FRAME - 2 * STACK_SIZE) & ~(STACK_SIZE - 1));
  /* Play with the stack size limit to make sure that no stack ever grows
   beyond STACK_SIZE minus two pages (one page for the thread descriptor
   immediately beyond, and one page to act as a guard page). */
//调整堆栈大小限制使其不能增长超过STACK_SIZE减两页（一页给线程
//描述符，一页作为保护页）
  getrlimit(RLIMIT_STACK, &limit);
  max_stack = STACK_SIZE - 2 * __getpagesize();
  if (limit.rlim_cur >; max_stack) {
limit.rlim_cur = max_stack;
setrlimit(RLIMIT_STACK, &limit);
  }
  /* Update the descriptor for the initial thread. */
  __pthread_initial_thread.p_pid = __getpid();
  /* If we have special thread_self processing, initialize that for the
   main thread now.  */
#ifdef INIT_THREAD_SELF
  INIT_THREAD_SELF(&__pthread_initial_thread, 0);
#endif
  /* The errno/h_errno variable of the main thread are the global ones.  */
  __pthread_initial_thread.p_errnop = &_errno;
  __pthread_initial_thread.p_h_errnop = &_h_errno;
#ifdef SIGRTMIN
  /* Allocate the signals used.  */分配使用的软中断号
  __pthread_sig_restart = __libc_allocate_rtsig (1);
  __pthread_sig_cancel = __libc_allocate_rtsig (1);
  __pthread_sig_debug = __libc_allocate_rtsig (1);
  if (__pthread_sig_restart < 0 ||
   __pthread_sig_cancel < 0 ||
   __pthread_sig_debug < 0)
{
   /* The kernel does not support real-time signals.  Use as before
      the available signals in the fixed set.
      Debugging is not supported in this case. */
   __pthread_sig_restart = DEFAULT_SIG_RESTART;
   __pthread_sig_cancel = DEFAULT_SIG_CANCEL;
   __pthread_sig_debug = 0;
}
#endif
  /* Setup signal handlers for the initial thread.
   Since signal handlers are shared between threads, these settings
   will be inherited by all other threads. */
//设置初始进程的信号处理程序
#ifndef __i386__
  sa.sa_handler = pthread_handle_sigrestart;
#else
  sa.sa_handler = (__sighandler_t) pthread_handle_sigrestart;
#endif
  sigemptyset(&sa.sa_mask);
  sa.sa_flags = 0;
  __sigaction(__pthread_sig_restart, &sa, NULL);
#ifndef __i386__
  sa.sa_handler = pthread_handle_sigcancel;
#else
  sa.sa_handler = (__sighandler_t) pthread_handle_sigcancel;
#endif
  sa.sa_flags = 0;
  __sigaction(__pthread_sig_cancel, &sa, NULL);
  if (__pthread_sig_debug >; 0) {
sa.sa_handler = pthread_handle_sigdebug;
sigemptyset(&sa.sa_mask);
sa.sa_flags = 0;
__sigaction(__pthread_sig_debug, &sa, NULL);
  }
  /* Initially, block __pthread_sig_restart. Will be unblocked on demand. */
  sigemptyset(&mask);
  sigaddset(&mask, __pthread_sig_restart);
  sigprocmask(SIG_BLOCK, &mask, NULL);
  /* Register an exit function to kill all other threads. */
  /* Do it early so that user-registered atexit functions are called
   before pthread_exit_process. */
  __on_exit(pthread_exit_process, NULL);
}

3.3       线程的创建

Manager thread 接到创建线程请求后调用下函数。
/* Create a new thread; called by the manager thread when it receives a
   creation request.
     thread         receives the identifier of the new thread on success
     attr           creation attributes, or NULL for the defaults
     start_routine  function the new thread runs
     arg            argument passed to start_routine
     mask           signal mask copied into the new thread's start arguments
     father_pid     pid whose scheduling parameters are read when attr asks
                    for PTHREAD_INHERIT_SCHED
   Returns 0 on success, EPERM if a non-default scheduling policy is
   requested without an effective uid of 0, EAGAIN if no thread slot is
   free, or errno if __clone() fails. */
static int pthread_handle_create(pthread_t *thread, const pthread_attr_t *attr,
                                 void * (*start_routine)(void *), void *arg,
                                 sigset_t * mask, int father_pid)
{
  size_t sseg;
  int pid;
  pthread_descr new_thread;
  char * new_thread_bottom;
  pthread_t new_thread_id;
  char *guardaddr = NULL;
  size_t guardsize = 0;
  int pagesize = __getpagesize();

  /* First check whether we have to change the policy and if yes, whether
     we can  do this.  Normally this should be done by examining the
     return value of the __sched_setscheduler call in pthread_start_thread
     but this is hard to implement.  FIXME  */
  if (attr != NULL && attr->__schedpolicy != SCHED_OTHER && geteuid () != 0)
    return EPERM;
  /* Find a free segment for the thread, and allocate a stack if needed.
     NOTE(review): the scan starts at slot 2; slots 0 and 1 are presumably
     reserved for the initial and manager threads - confirm. */
  for (sseg = 2; ; sseg++)
    {
      if (sseg >= PTHREAD_THREADS_MAX)
        return EAGAIN;
      if (__pthread_handles[sseg].h_descr != NULL)
        continue;
      if (pthread_allocate_stack(attr, thread_segment(sseg), pagesize,
                                 &new_thread, &new_thread_bottom,
                                 &guardaddr, &guardsize) == 0)
        break;
    }
  __pthread_handles_num++;
  /* Allocate new thread identifier */
  pthread_threads_counter += PTHREAD_THREADS_MAX;
  new_thread_id = sseg + pthread_threads_counter;
  /* Initialize the thread descriptor */
  new_thread->p_nextwaiting = NULL;
  new_thread->p_tid = new_thread_id;
  new_thread->p_priority = 0;
  new_thread->p_lock = &(__pthread_handles[sseg].h_lock);
  new_thread->p_signal = 0;
  new_thread->p_signal_jmp = NULL;
  new_thread->p_cancel_jmp = NULL;
  new_thread->p_terminated = 0;
  new_thread->p_detached = attr == NULL ? 0 : attr->__detachstate;
  new_thread->p_exited = 0;
  new_thread->p_retval = NULL;
  new_thread->p_joining = NULL;
  new_thread->p_cleanup = NULL;
  new_thread->p_cancelstate = PTHREAD_CANCEL_ENABLE;
  new_thread->p_canceltype = PTHREAD_CANCEL_DEFERRED;
  new_thread->p_canceled = 0;
  /* Unlike the initial thread, a created thread uses the errno and
     h_errno slots inside its own descriptor. */
  new_thread->p_errnop = &new_thread->p_errno;
  new_thread->p_errno = 0;
  new_thread->p_h_errnop = &new_thread->p_h_errno;
  new_thread->p_h_errno = 0;
  new_thread->p_in_sighandler = NULL;
  new_thread->p_sigwaiting = 0;
  new_thread->p_guardaddr = guardaddr;
  new_thread->p_guardsize = guardsize;
  new_thread->p_userstack = attr != NULL && attr->__stackaddr_set;
  memset (new_thread->p_specific, '\0',
          PTHREAD_KEY_1STLEVEL_SIZE * sizeof (new_thread->p_specific[0]));
  new_thread->p_self = new_thread;
  new_thread->p_nr = sseg;
  /* Initialize the thread handle */
  __pthread_init_lock(&__pthread_handles[sseg].h_lock);
  __pthread_handles[sseg].h_descr = new_thread;
  __pthread_handles[sseg].h_bottom = new_thread_bottom;
  /* Determine scheduling parameters for the thread */
  new_thread->p_start_args.schedpolicy = -1;
  if (attr != NULL) {
    switch(attr->__inheritsched) {
    case PTHREAD_EXPLICIT_SCHED:
      new_thread->p_start_args.schedpolicy = attr->__schedpolicy;
      memcpy (&new_thread->p_start_args.schedparam, &attr->__schedparam,
              sizeof (struct sched_param));
      break;
    case PTHREAD_INHERIT_SCHED:
      /* schedpolicy doesn't need to be set, only get priority */
      __sched_getparam(father_pid, &new_thread->p_start_args.schedparam);
      break;
    }
    new_thread->p_priority =
      new_thread->p_start_args.schedparam.sched_priority;
  }
  /* Finish setting up arguments to pthread_start_thread */
  new_thread->p_start_args.start_routine = start_routine;
  new_thread->p_start_args.arg = arg;
  new_thread->p_start_args.mask = *mask;
  /* Raise priority of thread manager if needed */
  __pthread_manager_adjust_prio(new_thread->p_priority);
  /* Do the cloning.  The descriptor address doubles as the stack pointer
     handed to __clone(), and is also the argument passed to
     pthread_start_thread. */
  pid = __clone(pthread_start_thread, (void **) new_thread,
                CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND |
                __pthread_sig_cancel, new_thread);
  /* Check if cloning succeeded */
  if (pid == -1) {
    /* Free the stack if we allocated it */
    if (attr == NULL || !attr->__stackaddr_set)
      {
        munmap((caddr_t)((char *)(new_thread+1) - INITIAL_STACK_SIZE),
               INITIAL_STACK_SIZE);
        if (new_thread->p_guardsize != 0)
          munmap(new_thread->p_guardaddr, new_thread->p_guardsize);
      }
    __pthread_handles[sseg].h_descr = NULL;
    __pthread_handles[sseg].h_bottom = NULL;
    __pthread_handles_num--;
    return errno;
  }
  /* Insert new thread in doubly linked list of active threads */
  new_thread->p_prevlive = __pthread_main_thread;
  new_thread->p_nextlive = __pthread_main_thread->p_nextlive;
  __pthread_main_thread->p_nextlive->p_prevlive = new_thread;
  __pthread_main_thread->p_nextlive = new_thread;
  /* Set pid field of the new thread, in case we get there before the
     child starts. */
  new_thread->p_pid = pid;
  /* We're all set */
  *thread = new_thread_id;
  return 0;
}

3.4       线程的堆栈分配和管理
STACK_SIZE 2*1024*1024
INITIAL_STACK_SIZE 4*PAGE_SIZE
THREAD_MANAGER_STACK_SIZE    2*PAGE_SIZE-32

/* Obtain a stack and a descriptor location for a new thread.
     attr                creation attributes, or NULL for the defaults
     default_new_thread  descriptor address to use when the user did not
                         provide a stack
     pagesize            system page size
     out_*               receive the descriptor address, the stack bottom
                         and the guard area address/size (NULL/0 if none)
   Returns 0 on success, or -1 when the default segment cannot be mapped;
   in that case the out_* values are not written. */
static int pthread_allocate_stack(const pthread_attr_t *attr,
                                  pthread_descr default_new_thread,
                                  int pagesize,
                                  pthread_descr * out_new_thread,
                                  char ** out_new_thread_bottom,
                                  char ** out_guardaddr,
                                  size_t * out_guardsize)
{
  pthread_descr new_thread;
  char * new_thread_bottom;
  char * guardaddr;
  size_t stacksize, guardsize;

  if (attr != NULL && attr->__stackaddr_set)
    {
      /* The user provided a stack.  The descriptor goes just below the
         pointer-aligned stack address. */
      new_thread =
        (pthread_descr) ((long)(attr->__stackaddr) & -sizeof(void *)) - 1;
      new_thread_bottom = (char *) attr->__stackaddr - attr->__stacksize;
      guardaddr = NULL;
      guardsize = 0;
      __pthread_nonstandard_stacks = 1;
    }
  else
    {
      /* Allocate space for stack and thread descriptor at default address */
      new_thread = default_new_thread;
      new_thread_bottom = (char *) new_thread - STACK_SIZE;
      /* Only INITIAL_STACK_SIZE bytes ending right after the descriptor
         are mapped up front; the mapping is MAP_GROWSDOWN. */
      if (mmap((caddr_t)((char *)(new_thread + 1) - INITIAL_STACK_SIZE),
               INITIAL_STACK_SIZE,
               PROT_READ | PROT_WRITE | PROT_EXEC,
               MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED | MAP_GROWSDOWN,
               -1, 0) == MAP_FAILED)
        /* Bad luck, this segment is already mapped. */
        return -1;
      /* We manage to get a stack.  Now see whether we need a guard
         and allocate it if necessary.  Notice that the default
         attributes (stack_size = STACK_SIZE - pagesize and
         guardsize = pagesize) do not need a guard page, since
         the RLIMIT_STACK soft limit prevents stacks from
         running into one another. */
      if (attr == NULL ||
          attr->__guardsize == 0 ||
          (attr->__guardsize == pagesize &&
           attr->__stacksize == STACK_SIZE - pagesize))
        {
          /* We don't need a guard page. */
          guardaddr = NULL;
          guardsize = 0;
        }
      else
        {
          /* Put a bad page at the bottom of the stack */
          stacksize = roundup(attr->__stacksize, pagesize);
          if (stacksize >= STACK_SIZE - pagesize)
            stacksize = STACK_SIZE - pagesize;
          guardaddr = (void *)new_thread - stacksize;
          guardsize = attr->__guardsize;
          if (mmap ((caddr_t) guardaddr, guardsize, 0, MAP_FIXED, -1, 0)
              == MAP_FAILED)
            {
              /* We don't make this an error.  */
              guardaddr = NULL;
              guardsize = 0;
            }
        }
    }
  *out_new_thread = new_thread;
  *out_new_thread_bottom = new_thread_bottom;
  *out_guardaddr = guardaddr;
  *out_guardsize = guardsize;
  return 0;
}

3.5       线程的调度
Common threads 的调度和普通进程并无大的区别，创建者可以自己设定线程的优先级。但是Manager thread则需要实时响应各进程提出的请求，所以Manager thread被设置成高于其它线程的优先级，方法是在创建每个新线程时调整Manager thread的优先级。

/* Keep the thread manager scheduled above every thread it manages.
   Nothing happens unless thread_prio exceeds the priority recorded for
   the manager.  Otherwise the manager is switched to SCHED_FIFO at
   thread_prio + 1 (or at thread_prio itself when that is already the
   SCHED_FIFO maximum) and thread_prio is recorded as its priority. */

void __pthread_manager_adjust_prio(int thread_prio)
{
  struct sched_param manager_param;

  if (thread_prio > __pthread_manager_thread.p_priority) {
    if (thread_prio < __sched_get_priority_max(SCHED_FIFO))
      manager_param.sched_priority = thread_prio + 1;
    else
      manager_param.sched_priority = thread_prio;
    __sched_setscheduler(__pthread_manager_thread.p_pid, SCHED_FIFO,
                         &manager_param);
    __pthread_manager_thread.p_priority = thread_prio;
  }
}

[目录]

--------------------------------------------------------------------------------

进程描述符

标题新兵笔记--ULK(C3) Process Descriptor
作者 Big John (stranger )
时间 05/19/01 06:01 PM
Process Descriptor

description：
进程描述符：也就是结构体task_struct，它有很多域，包含了一个进程的所有信息，主要有它的属性、当前的状态、它所占有的资料，还有一些指针用于把按不同的需求把它链进不同的链表中。
进程描述符与进程的kernel栈：每个进程都有个自己的kernel栈，当它进入kernel态时（比如进行系统调用），kernel会把栈指针指向当前进程的kernel栈。在2.2中，进程的描述符和kernel栈是捆在一起的，
/* Overlays a process descriptor with its kernel stack so both live in
   one allocation; the surrounding text gives the total size as 8K. */
union task_union {
struct task_struct task;
unsigned long stack[2048];
};
kernel每次分配一个进程描述符总会按task_union的大小（即8k）"顺手"分配一个kernel栈，这样做一个最重要的目的就是为了能方便的得到当前运行进程的描述符，当系统需要时，总是使用current来得到当前运行的进程，在2.0中current可能是个全局变量或全局数组（当多CPU时），这样一方面是不方便使用，另一方面，在多CPU中还得去找当前CPU才能确定当前的current（我当初看过，但忘了它是怎么找的了）。现在使用了这个结构，kernel随时可以通过栈底指针减8K就可得到描述符的地址。大家可以看到现在的current实际是一个函数get_current，它所做的是就是把esp减去8K，然后返回这个地址。
进程描述符的分配与释放：由这两个函数完成，alloc_task_struct和free_task_struct。这两个函数没什么花头，还是由于分配物理页帧的代码过大，这里也有一个缓存static struct task_struct * task_struct_stack[EXTRA_TASK_STRUCT]，它能缓存16项，分配和释放都尽量在这进行，除非它已经满了或空了，才去与分页系统打交道。
进程数组：struct task_struct *task[NR_TASKS];它是一个指针数组，其中NR_TASKS在i386下应该是4090，实际可同时运行的进程应该是4092个，因为还得加上Process 0和Process 1，这两个进程是不在进程数组中的。
当创建一个进程时，kernel会申请一片内存并把它加入task数组中。如何加入呢？出于效率考虑，task数组并不象我们平时处理那样把没有用的项置空，当要加入时顺序的去找数组中为空的项。它使用了类似于第二章中页表缓存链表的管理方法，tarray_freelist是这个链表的表头，具体操作如下：
初始化：
/* Head of the free-slot list threaded through the task[] array. */
struct task_struct **tarray_freelist = NULL;
/* Excerpt: only the part that seeds the free-slot list is shown. */
void __init sched_init(void)
{
	/* ... */
	int nr = NR_TASKS;
	/* Push slots task[NR_TASKS-1] down to task[1] onto the free list;
	   the pre-decrement test stops before slot 0. */
	while (--nr > 0)
		add_free_taskslot(&task[nr]);
	/* ... */
}
函数add_free_taskslot：
*t = (struct task_struct *) tarray_freelist; // 把当前指针当next用，指向空闲链表的第一项（可能是NULL）
tarray_freelist = t; // tarray_freelist指向当前项
函数get_free_taskslot：
tslot = tarray_freelist; // *tslot是第一个空闲项
tarray_freelist = (struct task_struct **) *tslot; // *tslot的内容是下一个空闲项
return tslot;

各种各样的进程指针：在进程描述符中有很多task_struct的指针，用于把它链进不同的链表或树中。最简单的是next_task和prev_task，它们把当前系统中的所有进程链成一条双向链表；其次是next_run和prev_run，它们把当前可运行的进程（state为TASK_RUNNING，这和current不同，并不表示它正在运行，只表示它可以被CPU调度运行而已）链成一条双向链表，不过我的源码里并没有作者所说的runqueue指针头，好象是init_task取代了runqueue的位置；pidhash_next和pidhash_pprev是用来链进以进程号索引的hash表的，因为大多调用都使用进程号标识进程，所以需要建立一个hash表来加快以进程号来查找进程的速度；最后是p_opptr,p_pptr,p_cptr,p_ysptr,p_osptr，这些是用来标识进程的父子，兄弟等树状关系的，作者书中的图已经很清楚了，不再需要我多说了。

等待队列：一般来说，当某个任务不能马上完成时，kernel不会陪着进程在那死等，它只是简单把进程挂进某个队列，然后继续运行，等这个任务完成，kernel会从队列中取出等待的进程，并把它链进可运行队列中。等待队列的结构体如下：
/* One entry of a wait queue: a task pointer plus a link to the next
   entry. */
struct wait_queue {
struct task_struct * task;
struct wait_queue * next;
};
很简单的结构，一个进程指针，一个next指针，应用时它将会是一个环形队列，add_wait_queue加入一项，remove_wait_queue移去新旧的一项，都不是很难理解。麻烦的是它的init_waitqueue，内容为
/* View the location one pointer before the queue-head pointer `x` as a
   struct wait_queue.  Assuming `task` and `next` are each pointer-sized
   with no padding, that pseudo-entry's `next` field overlays *x. */
#define WAIT_QUEUE_HEAD(x) ((struct wait_queue *)((x)-1))
/* Initialize a queue head: *q is set to the pseudo-entry built around q
   itself, so (under the layout assumption above) the pseudo-entry's
   `next` points back at the pseudo-entry. */
static inline void init_waitqueue(struct wait_queue **q)
{
*q = WAIT_QUEUE_HEAD(q);
}
结合作者的解释，它实际上是把当前队列指针加上前面的四个字节假设为一项了，然后"这一项"的next指针指向它自己。这个方法代码倒很简单，但是我却不是很喜欢，可读性实在有点。。。如果是我，宁愿加一空项做表头。
sleep和wakeup：刚才所说的kernel把进程挂进队一般是通过sleep_on来做的，而取出队列是通过wake_up来做的。现在来看看它们是怎么运行的吧。比如你要读取软盘内容，指令发出去了，但要等很久才有回应，这时会调用sleep_on把你的进程标识为TASK_UNINTERRUPTIBLE或TASK_INTERRUPTIBLE状态，然后把进程挂在某一个队列上，然后调用schedule，这样就会调度其它状态为TASK_RUNNING的进程继续运行。当指令完成时，比如软盘的内容已经读到内存中了，这时可能会产生一个中断，这个中断会从等待队列中把你的进程取出来，标识为TASK_RUNNING，然后放入可运行队列，又过了若干时间，你的进程真的开始运行了，这时会执行sleep_on中schedule后的语句，即把进程从等待队列中移出去，然后就可以执行下面的操作了，这时你要读的内容已经读出来了。

进程限制：谁都不希望某个用户的进程会占用所有的内存，所有的CPU，所有的任何资源，这样就要对进程有所限制，kernel用了这样一个结构：
/* One resource limit: a current value and a maximum value.
   NOTE(review): by the usual getrlimit()/setrlimit() convention these are
   the soft and hard limits - confirm against the kernel headers. */
struct rlimit {
long rlim_cur;
long rlim_max;
};
其中rlim_cur为该资源当前生效的限制值（软限制），rlim_max为该限制值可以调高到的上限（硬限制）。
结构task_struct中有一项为struct rlimit rlim[RLIM_NLIMITS]，其中RLIM_NLIMITS为10，即你对进程可以有10种限制，这10种限制作者有讲解的，我也不说了。

question:
1、我的印象中，在get_current中，esp应该是栈顶指针，而且随时会变的，用它去减去8K，能得到正确的地址吗？

标题 Re: 新兵笔记--ULK(C3) Process Descriptor [re: Big John]
作者 lucian_yao (addict)
时间 05/20/01 09:16 AM

1应该是栈顶向下8K对齐得到task_struct指针。
2在2.4中最大进程数没有了，由于基本不用TSS结构，所以不受GDT大小限制。

标题 Re: 新兵笔记--ULK(C3) Process Descriptor [re: lucian_yao]
作者 Big John (stranger )
时间 05/22/01 04:24 PM

1、是我的错，把
__asm__("andl %%esp,%0; ":"=r" (current) : "0" (~8191UL));
中的andl看成addl了，所以百思而不得，呵。其实很简单，系统分配进程描述符时，都是偶数页对齐的，把sp后面的13位清0，当然也就是描述符的位置了。：）
2、2.4对进程没有限制，那当然就不会再用task_struct的数组了，那它是怎么组织的呢？不会是链表吧。

标题 Re: 新兵笔记--ULK(C3) Process Descriptor [re: Big John]
作者 iobject (stranger)
时间 05/29/01 04:08 PM

/* Return the descriptor of the currently running task, found by clearing
   the low 13 bits of the stack pointer (descriptor and kernel stack share
   one 8K-aligned area). */
static inline struct task_struct * get_current(void)
{
	struct task_struct *current;
	/* The "0" matching constraint loads ~8191UL into the same register
	   as output operand %0, so the andl leaves esp & ~8191 in it. */
	__asm__("andl %%esp,%0; ":"=r" (current) : "0" (~8191UL));
	return current;
}
对于,%0,从语法上似乎是指current,但是这样的话这句话就说不通了,难道我对%0的理解有错吗
哪位指点一下,谢谢!

标题 Re: 新兵笔记--ULK(C3) Process Descriptor [re: iobject]
作者 Big John (stranger )
时间 05/29/01 05:33 PM

asm ("combine %2,%0" : "=r" (foo) : "0" (foo), "g" (bar));

The constraint `"0"' for operand 1 says that it must occupy the same
location as operand 0. A digit in constraint is allowed only in an
input operand and it must refer to an output operand.

这段来自gcc的info，大概意思是说，%1和%0将占用同一个寄存器，所以引用%0也就是引用%1了。
这样看来
__asm__("andl %%esp,%0; ":"=r" (current) : "0" (~8191UL));
展开应该是这样的：
movl $(~8191UL),%eax
#APP
andl %esp, %eax
#NO_APP
movl %eax,current
我也是现学现用，不知道对不对。

[目录]

--------------------------------------------------------------------------------

init进程从内核态切换到用户态

标题 init进程如何从内核态切换到用户态。
作者 chstar (stranger )
时间 03/08/01 01:24 PM
init进程从内核态切换到用户态。

//谢谢lucian_yao 邀请，在此灌水一篇

大家都知道如何产生一个新的进程。
通过sys_fork,之后再调用sys_execve
系统初启后（核心态）的第一个用户态进程是init。
这要涉及到内层（特权级高）向外层（特权级低）转移的问题。
通常情况下，内核是不会调用用户层的代码，要想实现这逆向的转移，一般做法是在用户进程的核心栈(tss->esp0)压入用户态的SS,ESP,EFLAGS,CS,EIP,伪装成用户进程是通过陷阱门进入核心态，之后通过iret返回用户态。
那么linux 2.2.14中的用户态进程init是如何实现的？

首先在kernel_thread(init...)函数中，
利用系统调用sys_clone fork出一个内核级进程(此时要给该进程分配核心栈<-esp0),之后call init函数，init函数还会再起几个kernel_thread,然后会加载/sbin/init(通过execve调用)
在sys_execve中，要完成内核态到用户态的转移。
大体流程是sys_execve-->do_execve-->load_elf_binary()
-->do_load_elf_binary()-->do_mmap()
start_thread(reg,newip,newsp) （processor.h）
请大家关注do_mmap()及start_thread()很重要哦
do_mmap完成从文件虚拟空间到内存虚拟空间的映射。
而start_thread就是要在进程核心栈中的相应位置填入进程用户态的xss,esp and xcs,eip.
最后进程从ret_from_sys_call返回，iret指令从核心栈pop出xcs, eip完成特权及指令的转移, pop出 xss,esp，完成堆栈的切换。

以上我也是刚看代码悟出的，如有不对之处，还望高手指出。

[目录]

--------------------------------------------------------------------------------

SET_LINKS

宏：SET_LINKS(p)将进程p插入到进程系中
struct task_struct {
  struct task_struct *next_task, *prev_task;
...
  struct task_struct *p_opptr, *p_pptr, *p_cptr, *p_ysptr,*p_osptr;       ...};
next_task和prev_task 为描述进程先后关系的环形队列
p_opptr       指向原始的父进程
p_pptr       指向当前的父进程
p_cptr       指向最年轻的子进程
p_ysptr       指向弟进程
p_osptr       指向兄进程

include/linux/sched.h

/* Insert task p into the process lists: append it to the tail of the
   circular next_task/prev_task ring anchored at init_task, and make it
   the youngest child of its current parent (p->p_pptr). */
#define SET_LINKS(p) do { \
	/* p's successor is init_task */ \
	(p)->next_task = &init_task; \
	/* p's predecessor is init_task's old predecessor */ \
	(p)->prev_task = init_task.prev_task; \
	/* that old predecessor now points forward to p */ \
	init_task.prev_task->next_task = (p); \
	/* init_task's predecessor is p: p is now the tail of the ring */ \
	init_task.prev_task = (p); \
	/* p is the youngest child, so it has no younger sibling */ \
	(p)->p_ysptr = NULL; \
	/* the parent's previous youngest child becomes p's older sibling */ \
	if (((p)->p_osptr = (p)->p_pptr->p_cptr) != NULL) \
		(p)->p_osptr->p_ysptr = (p); \
	/* the parent's youngest-child pointer now refers to p */ \
	(p)->p_pptr->p_cptr = (p); \
} while (0)

[目录]

--------------------------------------------------------------------------------

REMOVE_LINKS

宏：REMOVE_LINKS(p)将进程p从进程系中删除
struct task_struct {
  struct task_struct *next_task, *prev_task;
...
  struct task_struct *p_opptr, *p_pptr, *p_cptr, *p_ysptr,*p_osptr;       ...};
next_task和prev_task 为描述进程先后关系的环形队列
p_opptr       指向原始的父进程
p_pptr       指向当前的父进程
p_cptr       指向最年轻的子进程
p_ysptr       指向弟进程
p_osptr       指向兄进程

include/linux/sched.h
/* Remove task p from the process lists: unlink it from the circular
   next_task/prev_task ring and from its parent's sibling chain. */
#define REMOVE_LINKS(p) do { \
	/* p's successor now points back to p's predecessor */ \
	(p)->next_task->prev_task = (p)->prev_task; \
	/* p's predecessor now points forward to p's successor */ \
	(p)->prev_task->next_task = (p)->next_task; \
	/* if p has an older sibling, link it to p's younger sibling */ \
	if ((p)->p_osptr) \
		(p)->p_osptr->p_ysptr = (p)->p_ysptr; \
	/* if p has a younger sibling, link it to p's older sibling */ \
	if ((p)->p_ysptr) \
		(p)->p_ysptr->p_osptr = (p)->p_osptr; \
	/* otherwise p was the youngest: the parent's youngest child \
	   becomes p's older sibling */ \
	else \
		(p)->p_pptr->p_cptr = (p)->p_osptr; \
} while (0)

[目录]

--------------------------------------------------------------------------------

get_wchan()

get_wchan()给出了某个睡眠进程schedule()的调用点.
; arch/i386/kernel/process.c
unsigned long get_wchan(struct task_struct *p)
{
      unsigned long ebp, esp, eip;
      unsigned long stack_page;
      int count = 0;
      if (!p || p == current || p->;state == TASK_RUNNING)
            return 0;
      stack_page = (unsigned long)p;
      esp = p->;thread.esp; 取switch_to之前内核堆栈指针
      if (!stack_page || esp  8188+stack_page)
            return 0;
      /* include/asm-i386/system.h:switch_to() pushes ebp last. */
      ebp = *(unsigned long *) esp; 取保存在切换现场的schedule的ebp
      do {
            if (ebp  8184+stack_page)
                     return 0;
            eip = *(unsigned long *) (ebp+4);
            ; (ebp+0)为上一级函数的ebp,(ebp+4)为schedule()的返回地址
            ; kernel/sched.c编绎加了-fno-omit-frame-pointer编绎标志,就是在这儿起作用.
            ; first_sched和last_sched是schedule()函数所在的地址范围
            if (eip = last_sched)
                     return eip;
            ebp = *(unsigned long *) ebp;
      } while (count++       return 0;
}

现在的问题是,在什么情况下需要用count循环几次? 现有的代码好象不需要循环.

[目录]

--------------------------------------------------------------------------------

sigframe的结构

/* Register set passed to setup_frame() as `regs`; one field per i386
   register name.
   NOTE(review): presumably the layout saved on kernel entry - confirm
   against the arch headers. */
struct pt_regs {
      long ebx;
      long ecx;
      long edx;
      long esi;
      long edi;
      long ebp;
      long eax;
      int  xds;
      int  xes;
      long orig_eax;
      long eip;
      int  xcs;
      long eflags;
      long esp;
      int  xss;
};
typedef void (*__sighandler_t)(int);
struct sigaction {
      __sighandler_t sa_handler; 用户的信号处理函数指针
      unsigned long sa_flags;
      void (*sa_restorer)(void); 用户自定义的信号恢复函数指针
      sigset_t sa_mask;
};
/* Wrapper around struct sigaction; this is the type setup_frame()
   receives as `ka`. */
struct k_sigaction {
      struct sigaction sa;
};
/* Execution domain of a process, reached through current->exec_domain.
   setup_frame() indexes signal_invmap by signal number to translate the
   signal it delivers.  `next` suggests the domains form a linked list -
   NOTE(review): confirm. */
struct exec_domain {
      const char *name;
      lcall7_func handler;
      unsigned char pers_low, pers_high;
      unsigned long * signal_map;
      unsigned long * signal_invmap;
      struct module * module;
      struct exec_domain *next;
};
/* CPU state stored in a signal frame (the `sc` member of
   struct sigframe).  `oldmask` receives set->sig[0], per the comments in
   setup_frame(). */
struct sigcontext {
      unsigned short gs, __gsh;
      unsigned short fs, __fsh;
      unsigned short es, __esh;
      unsigned short ds, __dsh;
      unsigned long edi;
      unsigned long esi;
      unsigned long ebp;
      unsigned long esp;
      unsigned long ebx;
      unsigned long edx;
      unsigned long ecx;
      unsigned long eax;
      unsigned long trapno;
      unsigned long err;
      unsigned long eip;
      unsigned short cs, __csh;
      unsigned long eflags;
      unsigned long esp_at_signal;
      unsigned short ss, __ssh;
      struct _fpstate * fpstate;
      unsigned long oldmask;
      unsigned long cr2;
};
/* FPU state image stored in a signal frame (the `fpstate` member of
   struct sigframe).
   NOTE(review): the two groups of fields are presumably the classic
   FPU layout followed by the FXSR/SSE extension - confirm. */
struct _fpstate {

      unsigned long cw;
      unsigned long       sw;
      unsigned long       tag;
      unsigned long       ipoff;
      unsigned long       cssel;
      unsigned long       dataoff;
      unsigned long       datasel;
      struct _fpreg       _st[8];
      unsigned short       status;
      unsigned short       magic;

      unsigned long       _fxsr_env[6];
      unsigned long       mxcsr;
      unsigned long       reserved;
      struct _fpxreg       _fxsr_st[8];
      struct _xmmreg       _xmm[8];
      unsigned long       padding[56];
};
struct sigframe
{
      char *pretcode; 指向retcode
      int sig; sa_handler的sig参数
      struct sigcontext sc; CPU状态
      struct _fpstate fpstate;如果进程使用过FPU的话保存FPU状态
      unsigned long extramask[(64  / 32 ) -1];
      char retcode[8]; "popl % eax; movl $__NR_sigreturn,% eax; int $0x80"
};
static void setup_frame(int sig, struct k_sigaction *ka,
                     sigset_t *set, struct pt_regs * regs)
{
      struct sigframe *frame;
      int err = 0;

      ;取信号帧的起始地址
      frame = get_sigframe(ka, regs, sizeof(*frame));
      ;检测frame指针是否越界
      if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
            goto give_sigsegv;
      ;每个进程可以对应于不同的运行域,如果需要的话就进行相应的信号转换
      err |= __put_user((current->;exec_domain
                        current->;exec_domain->;signal_invmap
                        sig                         ? current->;exec_domain->;signal_invmap[sig]
                        : sig),

      if (err)
            goto give_sigsegv;
      ;继续在用户堆栈上填充sigframe结构
      err |= setup_sigcontext(  regs, set->;sig[0]);
      if (err)
            goto give_sigsegv;
      ;如果系统信号集的描述字多于1个的话,在extramask在保存多出来的部分,
      ;set->;sig[0]已在sigframe->;sc.oldmask保存
      if (_NSIG_WORDS >; 1) {
            err |= __copy_to_user(frame->;ex

实战分享：从技术角度谈机器学习入门| 【大话IT】RadonDB低门槛向MySQL集群下战书 | ChinaUnix打赏功能已上线！ | 新一代分布式关系型数据库RadonDB知多少？

轩辕砍刀

荣誉版主

论坛徽章:: 0

20楼 [报告]

发表于 2003-04-23 20:18 |只看该作者

linux内核分析（转自某位大哥网上的笔记）

ip_fw_check
      {
            从传入的skb参数中提取源地址src,目的地址dst，源端口src_port,目的端口dst_port，
            TCP发起连接标志tcpsyn，分片包位移offset，IP包TOS消息oldtos；

            ......

            f = chain->chain;       //取出规则链的第一条规则,规则链由chain参数传入
            count = 0;
            do {
                     for (; f; f = f->next) {       //遍历规则链中的规则，直到匹配（ip_rule_match返回1）
                              count++;
                              if (ip_rule_match(f,rif,ip,
                                                tcpsyn,src_port,dst_port,offset)) {
                                    if (!testing
                                          && !ip_fw_domatch(f, ip, rif, chain->label,//作些标记，一般返回1
                                                            skb, slot,
                                                            src_port, dst_port,
                                                            count, tcpsyn)) {
                                             ret = FW_BLOCK;
                                             goto out;
                                    }
                                    break;
                              }
                     }
                     if(f) { //找到匹配规则

                              ......

                     }else { //这次遍历根本没找到
                              是从别的地方跳转过来的，则转回去，然后继续遍历；
                              否则应用这条链的缺省规则;
                     }

            } while (ret == FW_SKIP+2);
      out:

            ......

            return ret;
      }

碎片：
      根据第一个片的消息进行过滤，其他分片则允许通过。如果规则是丢弃的话，虽然后面的分片都可到达主机，
      但由于第一片被滤掉了，无法重组成功，因此从效果上也相当于整个IP包被丢弃。

      存在的漏洞等.

2.3 规则：

      from 192.168.7.0/24 to 192.168.6.32/32 tcp 80 BLOCK

规则的数据结构表示:

      规则链
      /* A rule chain: a labelled list of rules with a default policy. */
      struct ip_chain
      {
            ip_chainlabel label;          /* Defines the label for each block */
            struct ip_chain *next;          /* Pointer to next block */
            struct ip_fwkernel *chain;  /* Pointer to first rule in block */
            __u32 refcount;          /* Number of references to block */
            int policy;                   /* Default rule for chain.  Only *
                                          * used in built in chains */
            struct ip_reent reent[0]; /* Actually several of these */
      };

      规则
      /* One rule inside a chain: the match data (ipfw), the link to the
         next rule to try, and where to go when the rule matches. */
      struct ip_fwkernel
      {
            struct ip_fw ipfw;
            struct ip_fwkernel *next;       /* where to go next if current
                                             * rule doesn't match */
            struct ip_chain *branch;       /* which branch to jump to if
                                             * current rule matches */
            int simplebranch;             /* Use this if branch == NULL */
            struct ip_counters counters[0]; /* Actually several of these */
      };

      待匹配的数据包消息
      /* Per-rule data embedded in struct ip_fwkernel as `ipfw`:
         addresses and masks, protocol, flags, port ranges, interface
         name and the TOS rewrite values. */
      struct ip_fw
      {
            struct in_addr fw_src, fw_dst;             /* Source and destination IP addr */
            struct in_addr fw_smsk, fw_dmsk;       /* Mask for src and dest IP addr */
            __u32 fw_mark;                         /* ID to stamp on packet */
            __u16 fw_proto;                      /* Protocol, 0 = ANY */
            __u16 fw_flg;                               /* Flags word */
            __u16 fw_invflg;                      /* Inverse flags */
            __u16 fw_spts[2];                      /* Source port range. */
            __u16 fw_dpts[2];                      /* Destination port range. */
            __u16 fw_redirpt;                      /* Port to redirect to. */
            __u16 fw_outputsize;                   /* Max amount to output to
                                                         NETLINK */
            char          fw_vianame[IFNAMSIZ];       /* name of interface "via" */
            __u8          fw_tosand, fw_tosxor;       /* Revised packet priority */
      };

2.4 地址转换
      ip_fw_demasquerade
      ip_fw_masquerade

三 Linux下防火墙的实现之二（2.4内核）：
3.1
      A Packet Traversing the Netfilter System:

      --->PRE------>[ROUTE]--->FWD---------->POST------>
          Conntrack    |       Filter   ^    NAT (Src)
          Mangle       |                |    Conntrack
          NAT (Dst)    |             [ROUTE]
          (QDisc)      v                |
                       IN Filter       OUT Conntrack
                       |  Conntrack     ^  Mangle
                       |                |  NAT (Dst)
                       v                |  Filter

3.2 例子

## Insert connection-tracking modules (not needed if built into kernel).
# insmod ip_conntrack
# insmod ip_conntrack_ftp

## Create chain which blocks new connections, except if coming from inside.
# iptables -N block
# iptables -A block -m state --state ESTABLISHED,RELATED -j ACCEPT
# iptables -A block -m state --state NEW -i ! ppp0 -j ACCEPT
# iptables -A block -j DROP

## Jump to that chain from INPUT and FORWARD chains.
# iptables -A INPUT -j block
# iptables -A FORWARD -j block

3.3 规则的描述
      一条规则分为三部分：
      struct ipt_entry       //主要用来匹配IP头
      struct ip_match       //额外的匹配（tcp头，mac地址等）
      struct ip_target       //除缺省的动作外（如ACCEPT，DROP），可以增加新的（如REJECT）。

3.4 代码提炼

ip_input.c:
      /*
      *       Main IP Receive routine.
      */
      int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
      {
      ...
      return NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, dev, NULL,
                           ip_rcv_finish);
      ...
      }

netfilter.h:
      /* NF_HOOK: with CONFIG_NETFILTER, call okfn(skb) directly when no
         hook is registered for (pf, hook), otherwise go through
         nf_hook_slow(); without CONFIG_NETFILTER it is always okfn(skb). */
      #ifdef CONFIG_NETFILTER
      #define NF_HOOK(pf, hook, skb, indev, outdev, okfn)                      \
      (list_empty(&nf_hooks[(pf)][(hook)])                                     \
      ? (okfn)(skb)                                                             \
      : nf_hook_slow((pf), (hook), (skb), (indev), (outdev), (okfn)))
      #else /* !CONFIG_NETFILTER */
      #define NF_HOOK(pf, hook, skb, indev, outdev, okfn) (okfn)(skb)
      #endif /*CONFIG_NETFILTER*/

大的框架："HOOK表"：
      struct list_head nf_hooks[NPROTO][NF_MAX_HOOKS];       //netfilter.c
      通过nf_register_hook和nf_unregister_hook完成添加删除工作，nf_iterate负责执行hook上的函数。

      增加用户自定义的HOOK，参见【8】，【10】：

重要流程（建议结合netfilter hacking howto 4.1.3来看）：
/* Returns one of the generic firewall policies, like NF_ACCEPT. */
unsigned int
ipt_do_table(struct sk_buff **pskb,
         unsigned int hook,
         const struct net_device *in,
         const struct net_device *out,
         struct ipt_table *table,
         void *userdata)
{
      struct ipt_entry *e;
      struct ipt_entry_target *t;
      unsigned int verdict = NF_DROP;

      table_base = (void *)table->;private->;entries
            + TABLE_OFFSET(table->;private,
                           cpu_number_map(smp_processor_id()));
      e = get_entry(table_base, table->;private->;hook_entry[hook]);

      ...
      ip_packet_match(ip, indev, outdev, &e->;ip, offset);

      ...
      IPT_MATCH_ITERATE(e, do_match, *pskb, in, out, offset, protohdr, datalen, &hotdrop)

      ...
      t = ipt_get_target(e);

      ...
      verdict = t->;u.kernel.target->;target(pskb, hook, in, out, t->;data, userdata);//非标准的target走这一步

      ...
      return verdict;
}

要加强对这段话的理解(netfilter hacking howto 4.1节) ：
> iptables does not register with any netfilter hooks: it relies on
> other modules to do that and feed it the packets as appropriate; a
> module must register the netfilter hooks and ip_tables separately, and
> provide the mechanism to call ip_tables when the hook is reached.

四 Linux下防火墙的实现之三（checkpoint FW1）

让我们看看checkpoint的在linux上的防火墙是如何实现的，最终我们会发现，竟然和lkm使用的手段差不多：）

fw1通过dev_add_pack的办法加载输入过滤函数，但是在net_bh()中，传往网络层的skbuff是克隆的,即
/* Each handler except the last receives a clone of the skb, not the original. */
skb2=skb_clone(skb, GFP_ATOMIC);
if(skb2)
      pt_prev->func(skb2, skb->dev, pt_prev);
而fw1是怎么解决这个问题的呢？见下面的代码：

输入一：

      ; 哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪?
                     align 4

      ; 圹圹圹圹圹圹圹?S U B       R O U T       I N E 圹圹圹圹圹圹圹圹圹圹圹圹圹圹圹圹圹圹圹?

      ; Attributes: bp-based frame

                     public fwinstallin
      fwinstallin       proc near             ; CODE XREF: fwinstall+E9p
                                             ; fwinstall+149p

      var_18             = byte ptr -18h
      arg_0             = dword       ptr  8

                     push       ebp
                     mov       ebp, esp
                     sub       esp, 10h
                     push       esi
                     push       ebx
                     mov       esi, ds:dev_base
                     cmp       [ebp+arg_0], 0
                     jz       short loc_0_802CBD0
                     add       esp, 0FFFFFFF4h
                     push       offset fw_ip_packet_type
                     call       dev_add_pack
                     mov       ebx, fw_ip_packet_type+10h       ;如果考虑字节对齐问题的话fw_ip_packet_type+10h这时应该是ip_packet_type
                     mov       dword ptr ds:fw_type_list, ebx
                     jmp       short loc_0_802CB9C
      ; 哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪?
                     align 4

      loc_0_802CB90:                               ; CODE XREF: fwinstallin+41j
                     add       esp, 0FFFFFFF4h
                     push       ebx
                     call       dev_remove_pack             ;fw1把ip_packet_type歇载掉了，然后自己在自己的处理函数(fw_filterin)中调ip_recv
                     mov       ebx, [ebx+10h]

      loc_0_802CB9C:                               ; CODE XREF: fwinstallin+2Dj
                     add       esp, 10h
                     test       ebx, ebx
                     jnz       short loc_0_802CB90
                     test       esi, esi
                     jz       short loc_0_802CC14

      loc_0_802CBA7:                               ; CODE XREF: fwinstallin+68j
                     test       byte ptr fwdebug, 81h
                     jz       short loc_0_802CBC3
                     add       esp, 0FFFFFFF8h
                     mov       eax, [esi]
                     push       eax
                     push       offset aFwinstallinS ; "fwinstallin: %s\n"
                     call       fwkdebug_printf
                     add       esp, 10h

      loc_0_802CBC3:                               ; CODE XREF: fwinstallin+4Ej
                     mov       esi, [esi+28h]
                     test       esi, esi
                     jnz       short loc_0_802CBA7
                     jmp       short loc_0_802CC14
      ; 哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪?
                     align 8

      loc_0_802CBD0:                               ; CODE XREF: fwinstallin+12j
                     cmp       dword ptr ds:fw_type_list, 0
                     jz       short loc_0_802CC14
                     add       esp, 0FFFFFFF4h
                     push       offset fw_ip_packet_type
                     call       dev_remove_pack
                     add       esp, 10h
                     cmp       dword ptr ds:fw_type_list, 0
                     jz       short loc_0_802CC14

      loc_0_802CBF2:                               ; CODE XREF: fwinstallin+B2j
                     add       esp, 0FFFFFFF4h
                     mov       eax, dword ptr ds:fw_type_list
                     push       eax
                     call       dev_add_pack
                     mov       eax, dword ptr ds:fw_type_list
                     add       esp, 10h
                     mov       eax, [eax+10h]
                     mov       dword ptr ds:fw_type_list, eax
                     test       eax, eax
                     jnz       short loc_0_802CBF2

      loc_0_802CC14:                               ; CODE XREF: fwinstallin+45j
                                             ; fwinstallin+6Aj ...
                     lea       esp, [ebp+var_18]
                     xor       eax, eax
                     pop       ebx
                     pop       esi
                     mov       esp, ebp
                     pop       ebp
                     retn
      fwinstallin       endp

输入二：
      public fw_ip_packet_type
      fw_ip_packet_type dd 8,       0, offset fw_filterin, 2 dup(0)       ; DATA XREF: fwinstallin+17o

输出的挂载和lkm的手法一样，更改dev->hard_start_xmit。dev结构在2.2版本的发展过程中变了一次，
为了兼容，fw1对这点也做了处理。

输出一：
      ; 哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪?
                     align 4

      ; 圹圹圹圹圹圹圹?S U B       R O U T       I N E 圹圹圹圹圹圹圹圹圹圹圹圹圹圹圹圹圹圹圹?

      ; Attributes: bp-based frame

                     public fwinstallout
      fwinstallout       proc near             ; CODE XREF: fwinstall+FBp
                                             ; fwinstall+153p

      var_18             = byte ptr -18h
      arg_0             = dword       ptr  8

                     push       ebp
                     mov       ebp, esp
                     sub       esp, 0Ch
                     push       edi
                     push       esi
                     push       ebx
                     mov       edi, [ebp+arg_0]
                     xor       esi, esi
                     mov       ebx, ds:dev_base
                     jmp       short loc_0_802D0A8
      ; 哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪?

      loc_0_802D096:                               ; CODE XREF: fwinstallout+50j
                     add       esp, 0FFFFFFFCh
                     push       edi
                     push       esi
                     push       ebx
                     call       installout_on_device
                     add       esp, 10h
                     mov       ebx, [ebx+28h]
                     inc       esi

      loc_0_802D0A8:                               ; CODE XREF: fwinstallout+14j
                     test       ebx, ebx
                     jz       short loc_0_802D0F8
                     test       byte ptr fwdebug, 81h
                     jz       short loc_0_802D0CD
                     xor       eax, eax
                     mov       ax, [ebx+50h]
                     push       eax
                     mov       eax, [ebx]
                     push       eax
                     push       esi
                     push       offset aFwinstalloutIn ; "fwinstallout:       interface %d: name=%s, fl"...
                     call       fwkdebug_printf
                     add       esp, 10h

      loc_0_802D0CD:                               ; CODE XREF: fwinstallout+33j
                     cmp       esi, 3Fh
                     jle       short loc_0_802D096
                     add       esp, 0FFFFFFF8h
                     push       40h
                     push       offset aFw1CanOnlyHand ; "FW-1:       Can only handle       %d interfaces\n"
                     call       fwkdebug_printf
                     add       esp, 10h
                     test       edi, edi
                     jz       short loc_0_802D0F8
                     add       esp, 0FFFFFFF4h
                     push       offset aFw1NotAllInter ; "FW-1:       Not all       interfaces installed\n"
                     call       fwkdebug_printf
                     add       esp, 10h

      loc_0_802D0F8:                               ; CODE XREF: fwinstallout+2Aj
                                             ; fwinstallout+66j
                     mov       fw_nif,       esi
                     test       byte ptr fwdebug, 81h
                     jz       short loc_0_802D124
                     add       esp, 0FFFFFFFCh
                     mov       eax, offset aUn       ; "un"
                     test       edi, edi
                     jz       short loc_0_802D118
                     mov       eax, offset unk_0_80687E4

      loc_0_802D118:                               ; CODE XREF: fwinstallout+91j
                     push       eax
                     push       esi
                     push       offset aFw1DInterfaces ; "FW-1:       %d interfaces %sinstalled\n"
                     call       fwkdebug_printf

      loc_0_802D124:                               ; CODE XREF: fwinstallout+85j
                     lea       esp, [ebp+var_18]
                     xor       eax, eax
                     pop       ebx
                     pop       esi
                     pop       edi
                     mov       esp, ebp
                     pop       ebp
                     retn
      fwinstallout       endp

输出二：

      ; 哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪?
                     align 10h

      ; 圹圹圹圹圹圹圹?S U B       R O U T       I N E 圹圹圹圹圹圹圹圹圹圹圹圹圹圹圹圹圹圹圹?

      ; Attributes: bp-based frame

                     public installout_on_device
      installout_on_device proc near             ; CODE XREF: fwinstallout+1Cp

      var_18             = byte ptr -18h
      var_4             = dword       ptr -4
      arg_0             = dword       ptr  8
      arg_4             = dword       ptr  0Ch
      arg_8             = dword       ptr  10h

                     push       ebp
                     mov       ebp, esp
                     sub       esp, 0Ch
                     push       edi
                     push       esi
                     push       ebx
                     mov       edi, [ebp+arg_0]
                     mov       esi, [ebp+arg_4]
                     mov       ebx, [ebp+arg_8]
                     add       esp, 0FFFFFFF4h
                     push       edi
                     call       xmit_func_addr
                     mov       [ebp+var_4], eax
                     add       esp, 10h
                     test       ebx, ebx
                     jz       short loc_0_802CFD4
                     mov       ebx, esi
                     shl       ebx, 4
                     cmp       (oftab+4)[ebx],       0
                     jz       short loc_0_802CF90
                     add       esp, 0FFFFFFF4h
                     push       offset aFw1OutputFilte ; "FW-1:       Output filter already installed\n"
                     call       fwkdebug_printf
                     mov       eax, 6Ah
                     jmp       loc_0_802D074

输出三：

      ; 哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪哪?
                     align 8

      ; 圹圹圹圹圹圹圹?S U B       R O U T       I N E 圹圹圹圹圹圹圹圹圹圹圹圹圹圹圹圹圹圹圹?

      ; Attributes: bp-based frame

                     ; xmit_func_addr(arg_0): returns in eax an address inside the
                     ; structure pointed to by arg_0: arg_0+0ACh when kver <= 0Dh,
                     ; arg_0+0B0h otherwise.
                     public xmit_func_addr
      xmit_func_addr       proc near             ; CODE XREF: installout_on_device+16p

      arg_0             = dword       ptr  8

                     push       ebp
                     mov       ebp, esp
                     mov       edx, [ebp+arg_0]
                     lea       eax, [edx+0ACh]             ; default result: arg_0 + 0ACh
                     cmp       kver, 0Dh
                     jle       short loc_0_802CB5B             ; kver <= 0Dh (signed): keep the default
                     lea       eax, [edx+0B0h]             ; kver > 0Dh: result is arg_0 + 0B0h

      loc_0_802CB5B:                               ; CODE XREF: xmit_func_addr+13j
                     mov       esp, ebp
                     pop       ebp
                     retn
      xmit_func_addr       endp

FW1与linux的一些比较，可以参看参考文献【11】

五参考文献
      【1】了解Check Point FW-1状态表
            http://magazine.nsfocus.com/detail.asp?id=538
      【2】A Stateful Inspection of FireWall-1
            http://www.dataprotect.com/bh2000/
      【3】Linux IPCHAINS-HOWTO
            http://www.linuxdoc.org
      【4】防火墙新生代：Stateful－inspection
            http://www.liuxuan.com/safe/anquan/html/firewall/04.htm
      【5】netfilter站点上的文档
            http://netfilter.kernelnotes.org
      【6】Application Gateways and Stateful Inspection:A Brief Note Comparing and Contrasting
            http://www.avolio.com/apgw+spf.html
      【7】Internet Firewalls:Frequently Asked Questions
            http://www.interhack.net/pubs/fwfaq
      【8】Writing a Module for netfilter
            http://www.linux-mag.com/2000-06/gear_01.html
      【9】ipchains的源代码分析
            http://www.lisoleg.net/lisoleg/network/ipchains.zip
      【10】内核防火墙netfilter入门
            http://magazine.nsfocus.com/detail.asp?id=637
      【11】Check Point Firewall-1 on Linux, Part Two
            http://www.securityfocus.com/frames/?focus=linux&content=/focus/linux/articles/checkpoint2.html

[目录]

--------------------------------------------------------------------------------

TCP/IP协议栈阅读笔记

下面是我看RH6.2(Kernel 2-2-14)的TCP/IP代码的笔记
[目录]

--------------------------------------------------------------------------------

启动以后

先从init/main.c的start_kernel函数说起。
在这个函数里面调用kernel_thread启动了init进程，这个进程对应的函数是同一个文件里面的init函数，在init函数里面调用了一个
叫do_basic_setup的在同一个文件里面的函数，这个函数调用了net/socket.c里面的sock_init函数，这个函数就是TCP/IP协议栈，也包括ipx等的入口。
首先sock_init函数里面有很多ifdef这样的东东，我觉得对于一个普通的主机来说，这些都不会配置的，它们包括：
SLAB_SKB,CONFIG_WAN_ROUTER,CONFIG_FIREWALL,CONFIG_RTNETLINK,CONFIG_NETLINK_DEV

去掉了这些编译选项以后就剩下这样的代码：

/* sock_init() with the optional #ifdef sections removed. */
for (i = 0; i < NPROTO; i++)
    net_families[i] = NULL;   /* clear every address-family slot */
sk_init();
proto_init();

其中net_families在include/linux/net.h里面定义，是这样的：

/*
 * Registration record for one address family; sock_register() stores a
 * pointer to it in net_families[], indexed by the family field.
 */
struct net_proto_family
{
    int family;                                         /* address family number */
    int (*create)(struct socket *sock, int protocol);   /* socket-creation callback */
    /* These are counters for the number of different methods of
       each we support */
    short authentication;
    short encryption;
    short encrypt_net;
};
其中有用的只有前两项，那个create的callback函数是每个协议，例如AF_INET等初始化上层协议如TCP/ICMP协议需要的，以后还会遇到的，这里先放着把

sk_init函数在net/core/sock.c里面，没什么说的..

struct sock *sk_alloc(int family, int priority, int zero_it)
{
struct sock *sk = kmem_cache_alloc(sk_cachep, priority);
if(sk) {
if (zero_it)
memset(sk, 0, sizeof(struct sock));
sk->;family = family;
}
return sk;
}

proto_init函数在同一个文件里面：

/*
 * Walk the protocols[] table and call each entry's init_func.
 * The table ends at the first entry whose name is NULL.
 */
void __init proto_init(void)
{
    extern struct net_proto protocols[];
    struct net_proto *pro;

    pro = protocols;
    while (pro->name != NULL)
    {
        (*pro->init_func)(pro);
        pro++;
    }
}

struct net_proto在include/linux/net.h里面是这样的：

/*
 * One entry of the protocols[] table walked by proto_init(): a protocol
 * name plus the function that bootstraps it.
 */
struct net_proto
{
    const char *name;                       /* Protocol name */
    void (*init_func)(struct net_proto *);  /* Bootstrap */
};

这个protocols的数组是在net/protocols.c里面定义的，包含了一堆的协议初始化结构体，其中我只注意两个：AF_INET和AF_PACKET
它们的初始化函数分别是inet_proto_init和packet_proto_init

[目录]

--------------------------------------------------------------------------------

协议初始化-1

下面来看看IPv4协议和PACKET协议的初始化过程。
首先看PACKET协议，首先我们假定PACKET协议是编译在核心里面的，而不是一个MODULE，这样得到packet_proto_init函数在net/packet/af_packet.c里面是这样的：

/*
 * Bootstrap of the PACKET protocol: register its address family and hook
 * its notifier block into the network-device notifier chain.
 */
void __init packet_proto_init(struct net_proto *pro)
{
    sock_register(&packet_family_ops);
    register_netdevice_notifier(&packet_netdev_notifier);
}

其中sock_register函数在net/socket.c里面，就是简单的设置前面说的net_families数组中间对应的值：

int sock_register(struct net_proto_family *ops)
{
if (ops->;family >;= NPROTO) {
printk(KERN_CRIT "protocol %d >;= NPROTO(%d)\n",
ops->;family, NPROTO);
return -ENOBUFS;
}
net_families[ops->;family]=ops;
return 0;
}

这里要说明的是packet_netdev_notifier是一个struct notifier_block类型，这个struct是在include/linux/notifier.h里面的：

/*
 * One node of a notifier chain: the callback to invoke, the link to the
 * next node, and the priority that notifier_chain_register() orders by.
 */
struct notifier_block
{
    int (*notifier_call)(struct notifier_block *self, unsigned long, void *);
    struct notifier_block *next;
    int priority;
};

而register_netdevice_notifier函数在net/core/dev.c里面，是这样的：

/*
 * Add nb to the netdev_chain notifier chain; the return value is that of
 * notifier_chain_register().
 */
int register_netdevice_notifier(struct notifier_block *nb)
{
    return notifier_chain_register(&netdev_chain, nb);
}

而notifier_chain_register函数在include/linux/notifier.h里面，是这样的:

extern __inline__ int notifier_chain_register(
struct notifier_block **list, struct notifier_block *n)
{
while(*list)
{
if(n->;priority >; (*list)->;priority)
break;
list= &((*list)->;next);
}
n->;next = *list;
*list=n;
return 0;
}

显然就是根据每个block的优先级把这个block排列在一个block的链表里面，在register_netdevice_notifier函数里面我们可以发现这个链表是netdev_chain。实际上这个链表的作用就是在每个interface打开，关闭状态改变或者外界调用相应的ioctl的时候通知这个链表上面的所有相关的设备，而每一个协议都调用register_netdevice_notifier注册了一个netdev_notifier的结构体，这样就可以在interface改变的时候得到通知了(通过调用每个notifier_call函数)。

下面来看inet_proto_init函数，这个函数在net/ipv4/af_inet.c中间，里面也有很多ifdef的编译选项，假定下面几个是没有定义的：CONFIG_NET_IPIP，CONFIG_NET_IPGRE，CONFIG_IP_FIREWALL，CONFIG_IP_MASQUERADE，CONFIG_IP_MROUTE

假定下面几个是定义了的：CONFIG_INET_RARP,CONFIG_PROC_FS
下面是整理后的代码：

(void) sock_register(&inet_family_ops);
for(p = inet_protocol_base; p != NULL {
struct inet_protocol *tmp=(struct inet_protocol *)p->;next;
inet_add_protocol(p);
printk("%s%s",p->;name,tmp?", ":"\n";
p = tmp;
}

arp_init();
ip_init();
tcp_v4_init(&inet_family_ops);
tcp_init();
icmp_init(&inet_family_ops);

rarp_ioctl_hook = rarp_ioctl;
proc_net_register(&proc_net_rarp);
proc_net_register(&proc_net_raw);
proc_net_register(&proc_net_snmp);
proc_net_register(&proc_net_netstat);
proc_net_register(&proc_net_sockstat);
proc_net_register(&proc_net_tcp);
proc_net_register(&proc_net_udp);

[目录]

--------------------------------------------------------------------------------

协议初始化-2

其中的sock_register函数的作用已经在前面说了，现在来看看struct inet_protocol和inet_add_protocol函数。前面的结构体
是在include/net/protocol.h里面：
/*
 * Descriptor of a protocol layered on IP, kept in the inet_protos[] hash
 * table by inet_add_protocol().
 */
struct inet_protocol
{
    int (*handler)(struct sk_buff *skb, unsigned short len);                /* receive callback */
    void (*err_handler)(struct sk_buff *skb, unsigned char *dp, int len);   /* error callback */
    struct inet_protocol *next;     /* next entry in the same hash bucket */
    unsigned char protocol;         /* protocol number; hashed by inet_add_protocol() */
    unsigned char copy:1;           /* set when a later bucket entry has the same protocol number */
    void *data;
    const char *name;
};

第一个函数是用来接收数据的callback函数，第二个是错误处理函数，其它的copy是用来协议共享的，这个以后再说，data当然就是这个结构体的私有数据了。

inet_add_protocol函数是在net/ipv4/protocol.c里面的：

void inet_add_protocol(struct inet_protocol *prot)
{
unsigned char hash;
struct inet_protocol *p2;

hash = prot->;protocol & (MAX_INET_PROTOS - 1);
prot ->;next = inet_protos[hash];
inet_protos[hash] = prot;
prot->;copy = 0;

p2 = (struct inet_protocol *) prot->;next;
while(p2 != NULL)
{
if (p2->;protocol == prot->;protocol)
{
prot->;copy = 1;
break;
}
p2 = (struct inet_protocol *) p2->;next;
}
}
显然这个函数就是建立一个hash表，然后每个hash表项都是一个链表头，然后通过这个hash表加链表的方式访问每个协议结构体。在这里你也见到了copy成员的用法了把。

arp_init函数是在net/ipv4/arp.c里面的(假定没有定义CONFIG_SYSCTL)：

neigh_table_init(&arp_tbl);
dev_add_pack(&arp_packet_type);
proc_net_register(&proc_net_arp);

不知道是不是有人眼睛一亮啊，呵呵，看到了dev_add_pack函数。
还是一步步来把。

neigh_table_init函数在net/core/neighbour.c中间：

void neigh_table_init(struct neigh_table *tbl)
{
unsigned long now = jiffies;

tbl->;parms.reachable_time = neigh_rand_reach_time(
tbl->;parms.base_reachable_time);

init_timer(&tbl->;gc_timer);
tbl->;gc_timer.data = (unsigned long)tbl;
tbl->;gc_timer.function = neigh_periodic_timer;
tbl->;gc_timer.expires = now + tbl->;gc_interval +
tbl->;parms.reachable_time;
add_timer(&tbl->;gc_timer);

init_timer(&tbl->;proxy_timer);
tbl->;proxy_timer.data = (unsigned long)tbl;
tbl->;proxy_timer.function = neigh_proxy_process;
skb_queue_head_init(&tbl->;proxy_queue);

tbl->;last_flush = now;
tbl->;last_rand = now + tbl->;parms.reachable_time*20;
tbl->;next = neigh_tables;
neigh_tables = tbl;
}

jiffies是系统启动以来的时钟滴答计数，在i386系统上2.2内核的HZ为100，所以一个jiffies代表10ms，显然这个函数就是生成两个timer将一个放在系统的timerlist里面。那个gc_timer的意思是garbage collect timer，因为每过一段时间arp的cache就应该更新，所以要有一个expires时间，这段时间过了以后就要更新arp地址了，那个proxy_timer还没有看是什么，不过我假定我的机器不使用proxy也不做成proxy，所以proxy相关的都没有管
那个timer的function显然是时钟到期的回调函数，data是这个回调函数要使用的私有数据了。

下面是dev_add_pack函数，它在net/core/dev.c里面：

void dev_add_pack(struct packet_type *pt)
{
int hash;
#ifdef CONFIG_NET_FASTROUTE
/* Hack to detect packet socket */
if (pt->;data) {
netdev_fastroute_obstacles++;
dev_clear_fastroute(pt->;dev);
}
#endif
if(pt->;type==htons(ETH_P_ALL))
{
netdev_nit++;
pt->;next=ptype_all;
ptype_all=pt;
}
else
{
hash=ntohs(pt->;type)&15;
pt->;next = ptype_base[hash];
ptype_base[hash] = pt;
}
}
显然系统保留了两个表，一个是ptype_all，用来接收所有类型的包的链表，一个是一个hash数组+链表的结构，用来接收特定类型的包。

struct packet_type的定义在include/linux/netdevice.h里面，我保留原来的注释，这样就不用我多说了

/*
 * Handler record registered with dev_add_pack().  The quotation had lost
 * the "struct packet_type" header line; it is restored here.
 */
struct packet_type
{
    unsigned short type;
    /* This is really htons(ether_type). */
    struct device *dev;
    /* NULL is wildcarded here */
    int (*func) (struct sk_buff *,
                 struct device *, struct packet_type *);
    void *data;
    /* Private to the packet type */
    struct packet_type *next;
};

其中的func当然是回调函数了,举个例子来说，arp_packet_type是这样的：

/* Handler record for ARP frames, in struct packet_type field order. */
static struct packet_type arp_packet_type =
{
    __constant_htons(ETH_P_ARP),    /* type */
    NULL,                           /* All devices */
    arp_rcv,                        /* func */
    NULL,                           /* data */
    NULL                            /* next */
};

arp_init函数还有最后一个proc_net_register函数，这个函数在include/linux/proc_fs.h里面：

/* Register entry x under proc_net; returns whatever proc_register() returns. */
static inline int proc_net_register(struct proc_dir_entry * x)
{
    return proc_register(proc_net, x);
}

而proc_register在fs/proc/root.c里面，主要作用是在proc_net对应的目录下面生成每个协议的子目录，例如TCP等在/proc目录下面生成相应的目录，用户可以通过访问/proc/net目录下面的相应目录得到每个协议的统计参数。

[目录]

--------------------------------------------------------------------------------

协议初始化-3

下面是ip_init函数，它在net/ipv4/ip_output.c里面：(下面假定定义了CONFIG_PROC_FS，CONFIG_IP_MULTICAST和CONFIG_NET_CLS_ROUTE)
/*
 * IP layer bootstrap: register the IP packet handler, initialise routing,
 * and register the igmp proc entry.
 */
__initfunc(void ip_init(void))
{
    dev_add_pack(&ip_packet_type);
    ip_rt_init();
    proc_net_register(&proc_net_igmp);
}

前面的dev_add_pack是说过的，这里就不再说了，而且proc_net_register也是前面提过的，这里都不说了，先来看看ip_rt_init函数把，它在net/ipv4/route.c里面，函数是这样的：

/*
 * Routing bootstrap: initialise devinet and the FIB, start the periodic
 * route-cache timer, and create the rt_cache and net/rt_acct proc entries.
 */
__initfunc(void ip_rt_init(void))
{
    struct proc_dir_entry *ent;

    devinet_init();
    ip_fib_init();
    rt_periodic_timer.function = rt_check_expire;
    /* All the timers, started at system startup tend
       to synchronize. Perturb it a bit.
     */
    rt_periodic_timer.expires = jiffies + net_random()%
        ip_rt_gc_interval + ip_rt_gc_interval;
    add_timer(&rt_periodic_timer);

    proc_net_register(&(struct proc_dir_entry) {
        PROC_NET_RTCACHE, 8, "rt_cache",
        S_IFREG | S_IRUGO, 1, 0, 0,
        0, &proc_net_inode_operations,
        rt_cache_get_info
    });
    /* NOTE(review): ent is dereferenced without a NULL check in this excerpt. */
    ent = create_proc_entry("net/rt_acct", 0, 0);
    ent->read_proc = ip_rt_acct_read;
}

这个函数总的看来就是注册几个notifier(后面还要看的)和初始化路由表的timer，最后就在/proc目录下面创建一个目录项。其中proc_net_register函数就不说了，而create_proc_entry函数就是在/proc/net目录下面创建一个rt_acct，就是路由参数统计(account)目录，读函数就是ip_rt_acct_read，这个函数就是从全局变量ip_rt_acct中间拷贝数据到用户缓冲中而已。

devinet_init函数是net/ipv4/devinet.c里面的函数，整理后如下:

register_gifconf(PF_INET, inet_gifconf);
register_netdevice_notifier(&ip_netdev_notifier);

register_netdevice_notifier函数在说PACKET协议的时候提过，这里不说了，register_gifconf函数是用来注册对应SIOCGIFCONF这个系统调用的协议无关的一个回调函数，这个函数对于PF_INET来说就是inet_gifconf函数。
其中inet_gifconf函数是net/ipv4/devinet.c里面的，我大概的看了一点，主要好象是在所有的interface里面做一个循环，得到相应的name和address然后返回的。不过不是非常确定。大家参谋呀

而register_gifconf函数本身是在net/core/dev.c里面的，如下：

/* Per-family gifconf callbacks, indexed by address family. */
static gifconf_func_t * gifconf_list [NPROTO];

/*
 * Install gifconf as the callback for the given family.
 * Returns 0 on success, or -EINVAL when family is not below NPROTO.
 */
int register_gifconf(unsigned int family, gifconf_func_t * gifconf)
{
    if (family >= NPROTO)
        return -EINVAL;
    gifconf_list[family] = gifconf;
    return 0;
}

这个函数的意义一目了然，就不说了。
gifconf_list里的函数会在dev_ifconf函数中间被调用，而dev_ifconf函数被dev_ioctl函数调用，dev_ioctl函数负责所有的针对interface的I/O控制。所以我们调用的interface的ioctl函数有一部分就会分到每个协议的gifconf函数里面来。gifconf里的g应当对应SIOCGIFCONF里的G，即get interface configuration（获取接口配置）的意思。

下面再看ip_fib_init函数，它在net/ipv4/fib_frontend.c中间，如下：
(假定没有define CONFIG_IP_MULTIPLE_TABLES，这个参数好象是要创建两个路由表，一个是local的，一个叫main)

/*
 * FIB bootstrap: register the "route" proc entry, initialise the FIB
 * rules, and register the netdevice and inetaddr notifiers.
 */
__initfunc(void ip_fib_init(void))
{
    proc_net_register(&(struct proc_dir_entry) {
        PROC_NET_ROUTE, 5, "route",
        S_IFREG | S_IRUGO, 1, 0, 0,
        0, &proc_net_inode_operations,
        fib_get_procinfo
    });

    fib_rules_init();
    register_netdevice_notifier(&fib_netdev_notifier);
    register_inetaddr_notifier(&fib_inetaddr_notifier);
}

其中proc_net_register和register_netdevice_notifier函数上面已经提过了，register_inetaddr_notifier函数的作用和register_netdevice_notifier差不多，这个函数也是调用的notifier_chain_register函数注册一个回调函数，这个回调函数在interface加上和删除的时候被调用，fib_rules_init函数其实也差不多，这个函数在net/ipv4/fib_rules.c里面，它其实就是调用一个
register_netdevice_notifier函数注册fib_rules_notifier回调结构体。
fib代表IPv4 Forwarding Information Base，就是IPv4转发信息的意思

[目录]

--------------------------------------------------------------------------------

协议初始化-4

下面是分析tcp_v4_init的时候了,这个函数在net/ipv4/tcp_ipv4.c里面:
__initfunc(void tcp_v4_init(struct net_proto_family *ops))
{
int err;

tcp_inode.i_mode = S_IFSOCK;
tcp_inode.i_sock = 1;
tcp_inode.i_uid = 0;
tcp_inode.i_gid = 0;

tcp_socket->;inode = &tcp_inode;
tcp_socket->;state = SS_UNCONNECTED;
tcp_socket->;type=SOCK_RAW;

if ((err=ops->;create(tcp_socket, IPPROTO_TCP))<0)
panic("Failed to create the TCP control socket.\n";
tcp_socket->;sk->;allocation=GFP_ATOMIC;
tcp_socket->;sk->;num = 256;
tcp_socket->;sk->;ip_ttl = MAXTTL;
}
tcp_inode当然就是一个inode节点了,而tcp_socket等于tcp_inode.u.socket_i,通过一个指针他们指向同一个内存.
tcp_socket是用来通信使用的,可以叫TCP的control socket或者是communicationsocket,当TCP通信没有相应的socket的时候这个socket就充当了socket的角色.比如在一个关闭端口上收到SYN时发送RST,或者是在三次握手的时候发送SYN(还没有accept产生新的socket)

值得注意的是ops->create函数的调用,我们前面见过对于AF_INET来说这个回调函数是net/ipv4/af_inet.c的inet_create函数,这个函数是用来创建一个socket的时候用的,由于函数比较长,这里先略过分析,这第一次的分析只是一个大致流程的熟悉而已.

由于有socket创建和通信,所以这段代码是协议相关的,所以把这段代码从原来的tcp.c里面提取了出来

下面是tcp_init函数,它在net/ipv4/tcp.c里面,大体上来说就是创建了几个hash表和bucket.这段代码创建了下面几个全局对象:

tcp_openreq_cachep
tcp_bucket_cachep
tcp_timewait_cachep
tcp_ehash
tcp_bhash
其中ehash代表established hash, bhash代表bind hash,它们当然分别是所有的满足TCP_ESTABLISHED <= sk->state < TCP_CLOSE状态的SOCK.但是我不清楚bucket在这里是什么意思.anyone knows?那几个cachep的作用也不是很清楚.由于整个函数主要是内存分配和错误处理,这里不贴了.

再下来就是icmp_init函数了,在net/ipv4/icmp.c里面,事实上,如果把tcp_v4_init里面的IPPROTO_TCP替换成IPPROTO_ICMP,基本都是一样的.

剩下的proc_net_register函数前面已经讲过了,这里就不说了.

到这里为止,Linux下面IP栈的开始的工作我们基本应该有了个了解,其中有几个关键的函数:

dev_add_pack:

注册一个链路层以上的处理函数,一般是用来使用新的网络层协议的,不过如果注册时重复也是可以的,新的处理函数只是被加到链表头部(copy位是inet_add_protocol设置的,dev_add_pack并不设置).如果是ETH_P_ALL则会接收所有的数据包.加入的元素保存在ptype_all链表或者ptype_base hash链表中间.

inet_add_protocol:

注册一个建立在IP层以上的协议,例如TCP和UDP等

proc_net_register(还有类似的proc_register):
在/proc/net目录下面创建一个子目录项来使管理者能通过文件系统得到统计信息

现在迷惑的地方还有很多,一个是结构体sk_buff的每个成员的意义,一个是结构体sock的意义,不过这两个问题应该在以后看多了就知道了.下面我就打算一个个分析每个协议的处理了,包括状态转化/数据发送/接收.

[目录]

--------------------------------------------------------------------------------

bottom up

let's start from bottom up 有的时候用英语说话比汉语要简洁和有意思一点
一个lance得到数据以后总会这样处理:

/* Typical driver receive path ("...." marks elided code). */
skb = dev_alloc_skb (....);
skb->protocol = eth_type_trans(skb, dev);
....
netif_rx (skb);

eth_type_trans函数在net/ethernet/eth.c里面,作用当然很简单了,大家可以自己看.
而netif_rx函数是在net/core/dev.c里面的,假定没有定义CONFIG_CPU_IS_SLOW(我觉得自己的CPU不慢)和CONFIG_NET_HW_FLOWCONTROL(很少有人会意识到很多网卡有流量控制把,不过没有交换设备的支持,想凭这个东西达到Qos也没什么
用)以后的代码是这样的:

void netif_rx(struct sk_buff *skb)
{
skb->;stamp = xtime;

if (backlog.qlen <= netdev_max_backlog) {
if (backlog.qlen) {
if (netdev_dropping == 0) {
skb_queue_tail(&backlog,skb);
mark_bh(NET_BH);
return;
}
atomic_inc(&netdev_rx_dropped);
kfree_skb(skb);
return;
}
netdev_dropping = 0;
skb_queue_tail(&backlog,skb);
mark_bh(NET_BH);
return;
}
netdev_dropping = 1;
atomic_inc(&netdev_rx_dropped);
kfree_skb(skb);
}

xtime是当前的时间,一个struct timeval,利用gettimeofday函数得到的就是这个东西的内容.backlog是一个sk_buff的双向链表, netdev_dropping初始化为0,如果没有定义CONFIG_NET_HW_FLOWCONTROL,这个变量一直都将是0.skb_queue_tail就是把一个sk_buff加入到backlog双向队列中去.然后mark_bh是设置了一个全局变量相对位移NET_BH处的bit就返回了.这个bit的设置将使得内核下次schedule的时候从TIMER_BH向下处理时检查到NET_BH处发现有设置就会调用对应NET_BH优先级的函数net_bh来处理,这个回调函数是在net_dev_init函数里面调用init_bh设置的,呵呵,兄弟们,如果感兴趣可以自己再init_bh看看设置一个自己的处理backlog的函数啊.

Linux在这里采取了一个古怪的策略进行控制权的转移和处理机优先级的处理.另一个函数net_bh来处理从backlog中间得到包,它是这样的(假定没定义CONFIG_BRIDGE这个选项):

void net_bh(void)
{
struct packet_type *ptype;
struct packet_type *pt_prev;
unsigned short type;
unsigned long start_time = jiffies;

NET_PROFILE_ENTER(net_bh);

if (qdisc_head.forw != &qdisc_head)
qdisc_run_queues();

while (!skb_queue_empty(&backlog))
{
struct sk_buff * skb;

if (jiffies - start_time >; 1)
goto net_bh_break;

skb = skb_dequeue(&backlog);

#ifdef CONFIG_NET_FASTROUTE
if (skb->;pkt_type == PACKET_FASTROUTE) {
dev_queue_xmit(skb);
continue;
}
#endif

/* XXX until we figure out every place to modify.. */
skb->;h.raw = skb->;nh.raw = skb->;data;

if(skb->;mac.raw < skb->;head || skb->;mac.raw >; skb->;data){
printk(KERN_CRIT "%s: wrong mac.raw ptr, proto=%04x\n",
skb->;dev->;name, skb->;protocol);
kfree_skb(skb);
continue;
}

type = skb->;protocol;

pt_prev = NULL;
for (ptype = ptype_all; ptype!=NULL; ptype=ptype->;next)
{
if (!ptype->;dev || ptype->;dev == skb->;dev) {
if(pt_prev)
{
struct sk_buff *skb2=skb_clone(skb, GFP_ATOMIC);
if(skb2)
pt_prev->;func(skb2,skb->;dev, pt_prev);
}
pt_prev=ptype;
}
}

for (ptype = ptype_base[ntohs(type)&15]; ptype != NULL;
ptype = ptype->;next)
{
if (ptype->;type == type && (!ptype->;dev ||
ptype->;dev==skb->;dev))
{
if(pt_prev)
{
struct sk_buff *skb2;
skb2=skb_clone(skb, GFP_ATOMIC);
if(skb2)
pt_prev->;func(skb2, skb->;dev, pt_prev);
}
pt_prev=ptype;
}
} /* End of protocol list loop */

if(pt_prev)
pt_prev->;func(skb, skb->;dev, pt_prev);
else {
kfree_skb(skb);
}
} /* End of queue loop */

if (qdisc_head.forw != &qdisc_head)
qdisc_run_queues();

netdev_dropping = 0;
NET_PROFILE_LEAVE(net_bh);
return;

net_bh_break:
mark_bh(NET_BH);
NET_PROFILE_LEAVE(net_bh);
return;
}

这个函数其实很简单,NET_PROFILE_ENTER当然是一个宏展开了,它其实就是include/net/profile.h里面的net_profile_enter函数,而NET_PROFILE_LEAVE是profile.h文件里面的net_profile_leave函数,有兴趣的看看把.帮我解疑.
qdisc_head是一个Qdisc_head类型,是一个全局变量,qdisc是queueing discipline(排队规则)的缩写,属于流量控制(traffic control)的发送队列调度机制,如果这个链表不为空的话我们就要运行qdisc_run_queues函数来处理这些队列了,不过我并不清楚这个queue的具体细节,这个变量和函数都在net/sched/sch_generic.c里面获得的.大家看了给我答疑吧,xixi

下面的东西挺简单的,我就不说了,值得注意的是:
1.大家还记得ptype_all和ptype_base吗?就是调用dev_add_pack加入的数组啊,最终也调用了pt_prev->func(....)
2.系统先处理ptype_all然后才处理的ptype_base
3.每处理一个sk_buff之前都会检查,如果net_bh已经运行超过1个jiffies(x86上HZ为100,即10ms)就标记NET_BH,等待下次调用
4.skb_clone是一个快速拷贝,没有拷贝数据,只是复制头部而已

[目录]

--------------------------------------------------------------------------------

packet 函数

看看在net/packet/af_packet.c里面的packet_create函数,这个就是通过packet_proto_init加入的回调函数,假设定义了CONFIG_SOCK_PACKET,代码整理如下,这个函数是在用户创建链路层socket的时候被调用的:
static int packet_create(struct socket *sock, int protocol)
{
struct sock *sk;
int err;

if (!capable(CAP_NET_RAW))
return -EPERM;

if (sock->;type != SOCK_DGRAM && sock->;type != SOCK_RAW
&& sock->;type != SOCK_PACKET
)
return -ESOCKTNOSUPPORT;
//只有socket(AF_PACKET, [SOCK_DGRAM, SOCK_RAW],
//或者socket(AF_INET, SOCK_PACKET ,才能调用成功

sock->;state = SS_UNCONNECTED;
MOD_INC_USE_COUNT;

err = -ENOBUFS;

sk = sk_alloc(PF_PACKET, GFP_KERNEL, 1);
if (sk == NULL)
goto out;

sk->;reuse = 1;
sock->;ops = &packet_ops;
if (sock->;type == SOCK_PACKET)
sock->;ops = &packet_ops_spkt;
//如果是old_style的SOCK_PACKET,就使用packet_ops_spkt
//如果是AF_PACKET,就使用packet_ops作为对应的socket的
//回调函数

sock_init_data(sock,sk);

sk->;protinfo.af_packet = kmalloc(sizeof(struct packet_opt),
GFP_KERNEL);
//protinfo是一个union

if (sk->;protinfo.af_packet == NULL)
goto out_free;

memset(sk->;protinfo.af_packet, 0, sizeof(struct packet_opt));

sk->;zapped=0;
//这个zapped属性表示一个TCP的socket收到了RST

sk->;family = PF_PACKET;
sk->;num = protocol;

sk->;protinfo.af_packet->;prot_hook.func = packet_rcv;
if (sock->;type == SOCK_PACKET)
sk->;protinfo.af_packet->;prot_hook.func = packet_rcv_spkt;
sk->;protinfo.af_packet->;prot_hook.data = (void *)sk;

if (protocol) {
sk->;protinfo.af_packet->;prot_hook.type = protocol;

dev_add_pack(&sk->;protinfo.af_packet->;prot_hook);
//注意到了没有,如果protocol非零的话也可以dev_add_pack
//的,不过当然不能达到phrack55-12的目的,因为这时候你的
//数据已经在用户地址空间了,内核的数据也是改不了的

sk->;protinfo.af_packet->;running = 1;
}

sklist_insert_socket(&packet_sklist, sk);
//这个函数显然应该实现非常简单,在net/core/sock.c里面.
//packet_sklist是用来给每个socket通知interface状态变化
//的消息的,包括UP/DOWN/MULTICAST_LIST_CHANGE
//这个回调函数的实现是我们说过的register_netdev_notifier

return(0);

out_free:
sk_free(sk);
out:
MOD_DEC_USE_COUNT;
return err;
}
只有在创建了packet socket以后应用程序才能接收链路层的数据包.而只有你设置了一个非零的protocol以后才能dev_add_pack,你的socket才能接收数据的.现在看来,dev_add_pack确实是实现底层数据改写的一个重要的函数.所以下面我们
将注意dev_add_pack设置的回调函数func的使用.

[目录]

--------------------------------------------------------------------------------

packet_rcv

我们已经知道了,如果使用socket(AF_PACKET, ..)产生一个PACKET SOCKET的话,dev_add_pack加入的函数是packet_rcv,下面是这个在net/packet/af_packet.c里面的函数:
static int packet_rcv(struct sk_buff *skb, struct device *dev,
struct packet_type *pt)
{
struct sock *sk;
struct sockaddr_ll *sll = (struct sockaddr_ll*)skb->;cb;

sk = (struct sock *) pt->;data;
//我们在packet_create中令data = sk了,remember?

if (skb->;pkt_type == PACKET_LOOPBACK) {
kfree_skb(skb);
return 0;
}

skb->;dev = dev;

sll->;sll_family = AF_PACKET;
sll->;sll_hatype = dev->;type;
sll->;sll_protocol = skb->;protocol;
sll->;sll_pkttype = skb->;pkt_type;
sll->;sll_ifindex = dev->;ifindex;
sll->;sll_halen = 0;

if (dev->;hard_header_parse)
sll->;sll_halen = dev->;hard_header_parse(skb, sll->;sll_addr);

if (dev->;hard_header)
if (sk->;type != SOCK_DGRAM)
skb_push(skb, skb->;data - skb->;mac.raw);
else if (skb->;pkt_type == PACKET_OUTGOING)
skb_pull(skb, skb->;nh.raw - skb->;data);

if (sock_queue_rcv_skb(sk,skb)<0)
{
kfree_skb(skb);
return 0;
}
return(0);
}
pkt_type属性是什么地方确定的?

这里还有几个函数要说明:

skb_pull在include/linux/skbuff.h中间:
extern __inline__ char *__skb_pull(struct sk_buff *skb,
unsigned int len)
{
skb->;len-=len;
return skb->;data+=len;
}

extern __inline__ unsigned char * skb_pull(struct sk_buff *skb,
unsigned int len)
{
if (len >; skb->;len)
return NULL;
return __skb_pull(skb,len);
}

不过是把头部的数据空出来,相应调整数据头部data的地址和长度.

同样skb_push在include/linux/skbuff.h中间:

extern __inline__ unsigned char *__skb_push(struct sk_buff *skb,
unsigned int len)
{
skb->;data-=len;
skb->;len+=len;
return skb->;data;
}

extern __inline__ unsigned char *skb_push(struct sk_buff *skb,
unsigned int len)
{
skb->;data-=len;
skb->;len+=len;
if(skb->;data head)
{
__label__ here;
skb_under_panic(skb, len, &&here);
here: ;
}
return skb->;data;
}
这个调整使数据长度加长,和skb_pull相反,不过skb_push显然更加安全一点.

在上面的程序中间,如果设备有一个明确的link_level_header,就考虑要不要调整数据长度和地址,如果sk->type不是SOCK_DGRAM的话,说明程序对整个数据包包括ll地址都感兴趣.这样需要加长数据段使得数据包含ll头部.不然如果数据是向外走的,则需要把数据裁减到只包含从网络层数据包头开始的地方.所以是从nh.raw剪掉data,这就是差值.(nh=network header)

经过了这些处理以后,现在的skb已经是可以提交的了,这样就调用sock_queue_rcv_skb函数将这个skb加入到相应socket的接收缓冲区中去.