Linux内核中PF_KEY协议族的实现(1)
本文档的Copyleft归yfydz所有,使用GPL发布,可以自由拷贝,转载,转载时请保持文档的完整性,严禁用于任何商业用途。<br>msn: <a href="mailto:yfydz_no1@hotmail.com" target="_blank">yfydz_no1@hotmail.com</a><br>来源:<a href="http://yfydz.cublog.cn/" target="_blank">http://yfydz.cublog.cn</a><div><br>1. 前言</div>
<div><br>在Linux2.6内核中自带了PF_KEY协议族的实现,这样就不用像2.4那样打补丁来实现了。内核中PF_KEY实现要完成的功能是维护内核的安全联盟(SA)和安全策略(SP)数据库, 并提供与用户空间的接口。</div>
<div><br>以下内核代码版本为2.6.19.2,
PF_KEY相关代码在net/key/目录下,定义了内核中PF_KEY与用户空间的接口,这个接口是RFC定义的,因此各种实现都基本类似;但具体关
于SA和SP的内部的实现和管理则是与实现相关的,各种实现各自不同,在linux内核是使用xfrm库来实现的,代码在net/xfrm/目录下定义。</div>
<div> </div>
<div>2. 数据结构</div>
<div>关于SA和SP的数据结构已经在RFC2367中定义, 头文件为include/linux/pfkeyv2.h,
这些是用户空间和内核空间共享的,只是作为接口的数据结构;而内核中具体使用的数据结构为xfrm定义的结构,在include/net/xfrm.h中
定义。</div>
<div><br>2.1 PF_KEY类型的sock</div>
<div><br>struct pfkey_sock {<br> /* struct sock must be the first member of struct pfkey_sock */<br> struct sock sk;<br>// 比普通sock添加两个参数<br>// 是否进行登记<br> int registered;<br>// 是否是混杂模式<br> int promisc;<br>};</div>
<div><br>2.2 状态(SA)</div>
<div><br>xfrm状态用来描述SA在内核中的具体实现:</div>
<div>struct xfrm_state<br>{<br> /* Note: bydst is re-used during gc */<br>// 每个状态结构挂接到三个HASH链表中<br> struct hlist_node bydst; // 按目的地址HASH<br> struct hlist_node bysrc; // 按源地址HASH<br> struct hlist_node byspi; // 按SPI值HASH</div>
<div> atomic_t refcnt; // 所有使用计数<br> spinlock_t lock; // 状态锁</div>
<div> struct xfrm_id id; // ID<br> struct xfrm_selector sel; // 状态选择子</div>
<div> u32 genid;</div>
<div> /* Key manger bits */<br> struct {<br> u8 state;<br> u8 dying;<br> u32 seq;<br> } km; </div>
<div> /* Parameters of this state. */<br> struct {<br> u32 reqid;<br> u8 mode;<br> u8 replay_window;<br> u8 aalgo, ealgo, calgo;<br> u8 flags;<br> u16 family;<br> xfrm_address_t saddr;<br> int header_len;<br> int trailer_len;<br> } props;</div>
<div> struct xfrm_lifetime_cfg lft; // 生存时间</div>
<div> /* Data for transformer */<br> struct xfrm_algo *aalg; // hash算法<br> struct xfrm_algo *ealg; // 加密算法<br> struct xfrm_algo *calg; // 压缩算法</div>
<div> /* Data for encapsulator */<br> struct xfrm_encap_tmpl *encap; // NAT-T封装信息</div>
<div> /* Data for care-of address */<br> xfrm_address_t *coaddr;</div>
<div> /* IPComp needs an IPIP tunnel for handling uncompressed packets */<br> struct xfrm_state *tunnel; </div>
<div> /* If a tunnel, number of users + 1 */<br> atomic_t tunnel_users;</div>
<div> /* State for replay detection */<br> struct xfrm_replay_state replay;</div>
<div> /* Replay detection state at the time we sent the last notification */<br> struct xfrm_replay_state preplay;</div>
<div> /* internal flag that only holds state for delayed aevent at the<br> * moment<br> */<br> u32 xflags;</div>
<div> /* Replay detection notification settings */<br> u32 replay_maxage;<br> u32 replay_maxdiff;</div>
<div> /* Replay detection notification timer */<br> struct timer_list rtimer;</div>
<div> /* Statistics */<br> struct xfrm_stats stats;</div>
<div> struct xfrm_lifetime_cur curlft;<br> struct timer_list timer;</div>
<div> /* Last used time */<br> u64 lastused;</div>
<div> /* Reference to data common to all the instances of this<br> * transformer. */<br> struct xfrm_type *type;<br> struct xfrm_mode *mode;</div>
<div> /* Security context */<br> struct xfrm_sec_ctx *security;</div>
<div> /* Private data of this transformer, format is opaque,<br> * interpreted by xfrm_type methods. */<br> void *data;<br>};</div>
<div> </div>
<div>2.3 策略(SP)</div>
<div><br>struct xfrm_policy<br>{<br> struct xfrm_policy *next; // 下一个策略<br> struct hlist_node bydst; // 按目的地址HASH的链表<br> struct hlist_node byidx; // 按索引号HASH的链表</div>
<div> /* This lock only affects elements except for entry. */<br> rwlock_t lock;<br> atomic_t refcnt;<br> struct timer_list timer;</div>
<div> u8 type;<br> u32 priority;<br> u32 index;<br> struct xfrm_selector selector;<br> struct xfrm_lifetime_cfg lft;<br> struct xfrm_lifetime_cur curlft;<br> struct dst_entry *bundles;<br> __u16 family;<br> __u8 action;<br> __u8 flags;<br> __u8 dead;<br> __u8 xfrm_nr;<br> struct xfrm_sec_ctx *security;<br> struct xfrm_tmpl xfrm_vec[XFRM_MAX_DEPTH];<br>};</div>
<div><br>2.4 事件</div>
<div>struct km_event<br>{<br> union {<br> u32 hard;<br> u32 proto;<br> u32 byid;<br> u32 aevent;<br> u32 type;<br> } data;</div>
<div> u32 seq;<br> u32 pid;<br> u32 event;<br>};</div>
<div><br>3. 初始化</div>
<div>/* net/key/af_key.c */</div>
<div>static int __init ipsec_pfkey_init(void)<br>{<br>// 登记key_proto结构, 该结构定义如下:<br>// static struct proto key_proto = {<br>// .name = "KEY",<br>// .owner = THIS_MODULE,<br>// .obj_size = sizeof(struct pfkey_sock),<br>//};<br>// 最后一个参数为0, 表示不进行slab的分配, 只是简单的将key_proto结构<br>// 挂接到系统的网络协议链表中,这个结构最主要是告知了pfkey sock结构的大小<br> int err = proto_register(&key_proto, 0);</div>
<div> if (err != 0)<br> goto out;</div>
<div>// 登记pfkey协议族的操作结构<br> err = sock_register(&pfkey_family_ops);<br> if (err != 0)<br> goto out_unregister_key_proto;<br>#ifdef CONFIG_PROC_FS<br> err = -ENOMEM;<br>// 建立只读的pfkey的PROC文件: /proc/net/pfkey<br> if (create_proc_read_entry("net/pfkey", 0, NULL, pfkey_read_proc, NULL) == NULL)<br> goto out_sock_unregister;<br>#endif<br>// 登记通知(notify)处理pfkeyv2_mgr<br> err = xfrm_register_km(&pfkeyv2_mgr);<br> if (err != 0)<br> goto out_remove_proc_entry;<br>out:<br> return err;<br>out_remove_proc_entry:<br>#ifdef CONFIG_PROC_FS<br> remove_proc_entry("net/pfkey", NULL);<br>out_sock_unregister:<br>#endif<br> sock_unregister(PF_KEY);<br>out_unregister_key_proto:<br> proto_unregister(&key_proto);<br> goto out;<br>}</div>
<div><br>4. pfkey套接口操作</div>
<div><br>4.1 建立套接口</div>
<div><br>/* net/key/af_key.c */</div>
<div>// pfkey协议族操作, 在用户程序使用socket打开pfkey类型的socket时调用,<br>// 相应的create函数在__sock_create(net/socket.c)函数中调用:<br>static struct net_proto_family pfkey_family_ops = {<br> .family = PF_KEY,<br> .create = pfkey_create,<br> .owner = THIS_MODULE,<br>};</div>
<div>// 在用户空间每次打开pfkey socket时都会调用此函数: </div>
<div>static int pfkey_create(struct socket *sock, int protocol)<br>{<br> struct sock *sk;<br> int err;</div>
<div>// 建立PFKEY的socket必须具有CAP_NET_ADMIN权能(通常即ROOT权限)<br> if (!capable(CAP_NET_ADMIN))<br> return -EPERM;<br>// socket类型必须是RAW, 协议为PF_KEY_V2<br> if (sock->type != SOCK_RAW)<br> return -ESOCKTNOSUPPORT;<br> if (protocol != PF_KEY_V2)<br> return -EPROTONOSUPPORT;</div>
<div> err = -ENOMEM;<br>// 分配sock结构, 并清零<br> sk = sk_alloc(PF_KEY, GFP_KERNEL, &key_proto, 1);<br> if (sk == NULL)<br> goto out;</div>
<div>// PFKEY类型socket的操作<br> sock->ops = &pfkey_ops;<br>// 初始化socket参数<br> sock_init_data(sock, sk);</div>
<div>// 初始化sock的族类型和释放函数<br> sk->sk_family = PF_KEY;<br> sk->sk_destruct = pfkey_sock_destruct;<br>// 增加使用数<br> atomic_inc(&pfkey_socks_nr);</div>
<div>// 将sock挂接到系统的sock链表<br> pfkey_insert(sk);</div>
<div> return 0;<br>out:<br> return err;<br>}</div>
<div> </div>
<div><br>4.2 PF_KEY套接口操作</div>
<div><br>static const struct proto_ops pfkey_ops = {<br> .family = PF_KEY,<br> .owner = THIS_MODULE,<br> /* Operations that make no sense on pfkey sockets. */<br> .bind = sock_no_bind,<br> .connect = sock_no_connect,<br> .socketpair = sock_no_socketpair,<br> .accept = sock_no_accept,<br> .getname = sock_no_getname,<br> .ioctl = sock_no_ioctl,<br> .listen = sock_no_listen,<br> .shutdown = sock_no_shutdown,<br> .setsockopt = sock_no_setsockopt,<br> .getsockopt = sock_no_getsockopt,<br> .mmap = sock_no_mmap,<br> .sendpage = sock_no_sendpage,</div>
<div> /* Now the operations that really occur. */<br> .release = pfkey_release,<br> .poll = datagram_poll,<br> .sendmsg = pfkey_sendmsg,<br> .recvmsg = pfkey_recvmsg,<br>};</div>
<div><br>PF_KEY类型的sock中大多数操作都没有定义, 这是因为PF_KEY的数据都是本机内的内核空间与用户空间的交换, 因此实际和网络相关的操作都不用定义, 所谓发送和接收数据也只是内核与用户空间之间的通信。</div>
<div><br>4.2.1 释放套接口</div>
<div><br>static int pfkey_release(struct socket *sock)<br>{<br>// 从socket到sock结构转换<br> struct sock *sk = sock->sk;</div>
<div> if (!sk)<br> return 0;<br>// 将sock从系统的sock链表断开<br> pfkey_remove(sk);</div>
<div>// 设置sock状态为DEAD, 清空sock中的socket和sleep指针<br> sock_orphan(sk);</div>
<div> sock->sk = NULL;<br>// 清除当前数据队列<br> skb_queue_purge(&sk->sk_write_queue);<br>// 释放sock<br> sock_put(sk);</div>
<div> return 0;<br>}</div>
<div><br>4.2.2 描述符选择</div>
<div><br>使用的是标准的数据报选择函数: datagram_poll</div>
<div><br>4.2.3 发送数据</div>
<div><br>实际是将数据从用户空间的程序发送给内核空间:</div>
<div>static int pfkey_sendmsg(struct kiocb *kiocb,<br> struct socket *sock, struct msghdr *msg, size_t len)<br>{<br> struct sock *sk = sock->sk;<br> struct sk_buff *skb = NULL;<br> struct sadb_msg *hdr = NULL;<br> int err;</div>
<div> err = -EOPNOTSUPP;<br>// PF_KEY不支持MSG_OOB标志<br> if (msg->msg_flags & MSG_OOB)<br> goto out;</div>
<div> err = -EMSGSIZE;<br>// 一次发送的数据长度不能太大<br> if ((unsigned)len > sk->sk_sndbuf - 32)<br> goto out;</div>
<div> err = -ENOBUFS;<br>// 获取一个空闲的skbuff<br> skb = alloc_skb(len, GFP_KERNEL);<br> if (skb == NULL)<br> goto out;</div>
<div> err = -EFAULT;<br>// 从缓冲区中拷贝数据到skbuff中<br> if (memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len))<br> goto out;<br>// 获取SADB数据头的指针<br> hdr = pfkey_get_base_msg(skb, &err);<br> if (!hdr)<br> goto out;</div>
<div> mutex_lock(&xfrm_cfg_mutex);<br>// 处理PFKEY数据的发送<br> err = pfkey_process(sk, skb, hdr);<br> mutex_unlock(&xfrm_cfg_mutex);</div>
<div>out:<br> if (err && hdr && pfkey_error(hdr, err, sk) == 0)<br> err = 0;<br> if (skb)<br> kfree_skb(skb);</div>
<div> return err ? : len;<br>}</div>
<div><br>static int pfkey_process(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr)<br>{<br> void *ext_hdrs[SADB_EXT_MAX];<br> int err;<br>// 向混杂模式的sock发送SA消息<br> pfkey_broadcast(skb_clone(skb, GFP_KERNEL), GFP_KERNEL,<br> BROADCAST_PROMISC_ONLY, NULL);</div>
<div> memset(ext_hdrs, 0, sizeof(ext_hdrs));<br>// 解析SADB消息中的各扩展头, 结果存入ext_hdrs<br> err = parse_exthdrs(skb, hdr, ext_hdrs);<br> if (!err) {<br> err = -EOPNOTSUPP;<br>// 根据消息类型调用相关的处理函数进行处理<br> if (pfkey_funcs[hdr->sadb_msg_type])<br> err = pfkey_funcs[hdr->sadb_msg_type](sk, skb, hdr, ext_hdrs);<br> }<br> return err;<br>}</div>
<div>4.2.4 接收数据</div>
<div><br>实际是将数据从内核空间传递给用户空间的程序:</div>
<div>static int pfkey_recvmsg(struct kiocb *kiocb,<br> struct socket *sock, struct msghdr *msg, size_t len,<br> int flags)<br>{<br> struct sock *sk = sock->sk;<br> struct sk_buff *skb;<br> int copied, err;</div>
<div> err = -EINVAL;<br>// 只支持4类标志<br> if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))<br> goto out;</div>
<div> msg->msg_namelen = 0;<br>// 接收数据包<br> skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);<br> if (skb == NULL)<br> goto out;</div>
<div> copied = skb->len;<br>// 接收到的数据超过了接收缓冲区长度, 设置截断标志<br> if (copied > len) {<br> msg->msg_flags |= MSG_TRUNC;<br> copied = len;<br> }</div>
<div> skb->h.raw = skb->data;<br>// 将数据包中信息拷贝到接收缓冲区<br> err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);<br> if (err)<br> goto out_free;<br>// 设置时间戳<br> sock_recv_timestamp(msg, sk, skb);</div>
<div> err = (flags & MSG_TRUNC) ? skb->len : copied;</div>
<div>out_free:<br> skb_free_datagram(sk, skb);<br>out:<br> return err;<br>}</div>
<div>4.2.5 pfkey广播</div>
<div><br>pfkey广播是将内核的回应信息发送到用户空间, 所有打开了PF_KEY类型socket的用户空间程序都可以收到, 所以用户空间程序在收到消息的时候要判断该消息是否是给自己的, 不是就忽略掉,这和netlink的广播比较类似。</div>
<div>/* Send SKB to all pfkey sockets matching selected criteria. */<br>#define BROADCAST_ALL 0<br>#define BROADCAST_ONE 1<br>#define BROADCAST_REGISTERED 2<br>#define BROADCAST_PROMISC_ONLY 4<br>static int pfkey_broadcast(struct sk_buff *skb, gfp_t allocation,<br> int broadcast_flags, struct sock *one_sk)<br>{<br> struct sock *sk;<br> struct hlist_node *node;<br> struct sk_buff *skb2 = NULL;<br> int err = -ESRCH;</div>
<div> /* XXX Do we need something like netlink_overrun? I think<br> * XXX PF_KEY socket apps will not mind current behavior.<br> */<br> if (!skb)<br> return -ENOMEM;</div>
<div> pfkey_lock_table();<br>// 遍历所有的pfkey sock表, <br> sk_for_each(sk, node, &pfkey_table) {<br>// 获取pfkey sock用于发送消息<br> struct pfkey_sock *pfk = pfkey_sk(sk);<br> int err2;</div>
<div> /* Yes, it means that if you are meant to receive this<br> * pfkey message you receive it twice as promiscuous<br> * socket.<br> */<br>// 该pfkey sock是混杂模式, 先发送一次, 由于后面还会广播发送, 所以设置了混杂模式的pfkey<br>// sock一般情况下会收到两次<br> if (pfk->promisc)<br> pfkey_broadcast_one(skb, &skb2, allocation, sk);</div>
<div> /* the exact target will be processed later */<br>// 指定了one_sk的话这个one_sk对应的用户程序将最后才收到包, 现在在循环中不发<br>// 以后才发<br> if (sk == one_sk)<br> continue;<br>// 如果不是广播给所有的进程, #define BROADCAST_ALL 0<br> if (broadcast_flags != BROADCAST_ALL) {<br>// 如果只广播给pfkey混杂模式的进程, 跳过, 继续循环<br> if (broadcast_flags & BROADCAST_PROMISC_ONLY)<br> continue;<br>// 如果只广播给登记的进程而该sock没登记, 跳过, 继续循环<br> if ((broadcast_flags & BROADCAST_REGISTERED) &&<br> !pfk->registered)<br> continue;<br>// 只广播给一个, 和one_sk配合使用, 这样消息就只会发送给one_sk和所有混杂模式的pfkey sock<br> if (broadcast_flags & BROADCAST_ONE)<br> continue;<br> }<br>// 发送给该pfkey sock<br> err2 = pfkey_broadcast_one(skb, &skb2, allocation, sk);</div>
<div> /* Error is cleare after succecful sending to at least one<br> * registered KM */<br> if ((broadcast_flags & BROADCAST_REGISTERED) && err)<br> err = err2;<br> }<br> pfkey_unlock_table();</div>
<div>// 如果指定one_sk, 再向该pfkey sock发送, 该sock是最后一个收到消息的<br> if (one_sk != NULL)<br> err = pfkey_broadcast_one(skb, &skb2, allocation, one_sk);</div>
<div>// 释放skb<br> if (skb2)<br> kfree_skb(skb2);<br> kfree_skb(skb);<br> return err;<br>}</div>
<div><br>// 发送一个包<br>static int pfkey_broadcast_one(struct sk_buff *skb, struct sk_buff **skb2,<br> gfp_t allocation, struct sock *sk)<br>{<br> int err = -ENOBUFS;</div>
<div> sock_hold(sk);<br> if (*skb2 == NULL) {<br>// skb2是skb的一个克隆包<br> if (atomic_read(&skb->users) != 1) {<br> *skb2 = skb_clone(skb, allocation);<br> } else {<br> *skb2 = skb;<br>// 因为发送会减少skb的使用计数<br> atomic_inc(&skb->users);<br> }<br> }<br> if (*skb2 != NULL) {<br>// 实际发送的是skb2<br> if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) {<br> skb_orphan(*skb2);<br> skb_set_owner_r(*skb2, sk);<br> skb_queue_tail(&sk->sk_receive_queue, *skb2);<br> sk->sk_data_ready(sk, (*skb2)->len);<br> *skb2 = NULL;<br> err = 0;<br> }<br> }<br> sock_put(sk);<br> return err;<br>}</div>
<div>...... 待续 ......</div>
页:
[1]