<Linux Network 2.6.38> "where_to_go"

先来回顾一个潜在的结构, 它在 skb 的 pskb_copy、skb_copy 等操作中起到了关键作用

 

 写道
struct skb_shared_info {

unsignedshortnr_frags;

unsignedshortgso_size;

/*Warning:thisfieldisnotalwaysfilledin(UFO)!*/

unsignedshortgso_segs;

unsignedshortgso_type;

__be32ip6_frag_id;

__u8tx_flags;

structsk_buff*frag_list;

structskb_shared_hwtstampshwtstamps;

atomic_tdataref;

void*destructor_arg;

skb_frag_tfrags[MAX_SKB_FRAGS];

};

 想要顺利的访问这个结构就要了解一个宏

/* Yield the struct skb_shared_info of SKB by casting its ->end member. */
#define skb_shinfo(SKB)             ((struct skb_shared_info *)((SKB)->end))

 

 

再回忆一下头部结构

 

 写道
struct iphdr {

#ifdefined(__LITTLE_ENDIAN_BITFIELD)

__u8ihl:4,

version:4;

#elifdefined(__BIG_ENDIAN_BITFIELD)

__u8version:4,

ihl:4;

#else

#error"Pleasefix<asm/byteorder.h>"

#endif

__u8tos;

__be16tot_len;

__be16id;

__be16frag_off;

__u8ttl;

__u8protocol;

__sum16check;

__be32saddr;

__be32daddr;

/*Theoptionsstarthere.*/

};
 

好了, 现在就看看上前天说到的 ip_rcv 是如何处理 IP 包的

 

 

/*
 * ip_rcv - first IPv4 processing step for a received skb.
 *
 * Validates the packet (packet type, header length, version, header
 * checksum, total length), trims it to the length announced in the IP
 * header, zeroes the IP control block, orphans the skb and passes it to the
 * NF_INET_PRE_ROUTING hook with ip_rcv_finish as the continuation.
 *
 * @skb:      received buffer
 * @dev:      receiving device; used here only to find the net namespace for
 *            the statistics counters and as the hook's input device
 * @pt:       not referenced in this function body
 * @orig_dev: not referenced in this function body
 *
 * Returns the NF_HOOK() result on the normal path, NET_RX_DROP otherwise.
 */
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
	/* Author's note: __netif_receive_skb / eth_type_trans have already
	 * prepared the skb for L3 before we get here (not shown in this file). */
	struct iphdr *iph;
	u32 len;

	/* Frames addressed to another host (e.g. only seen because the device
	 * is in promiscuous mode) are not ours: drop them. */
	if (skb->pkt_type == PACKET_OTHERHOST)
		goto drop;

	/* SNMP/MIB accounting point: count the incoming packet and its length. */
	IP_UPD_PO_STATS_BH(dev_net(dev), IPSTATS_MIB_IN, skb->len);
	/* If the skb is also in use by another subsystem, obtain a private copy
	 * (skb_share_check is not shown here; behaviour per the author's note). */
	if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) {
		IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
		goto out;/* copy failed: skb is NULL, so bypass kfree_skb */
	}
	/* Make sure a basic IP header's worth of bytes can be read.  Author's
	 * note: for fragmented buffers this pulls data back in from
	 * skb_shared_info via __pskb_pull_tail (helper not shown; confirm). */
	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
		goto inhdr_error;
	/* Fetch the L3 header pointer again after the pull. */
	iph = ip_hdr(skb);

	/*
	 *	RFC1122: 3.2.1.2 MUST silently discard any IP frame that fails the checksum.
	 *
	 *	Is the datagram acceptable?
	 *
	 *	1.	Length at least the size of an ip header
	 *	2.	Version of 4
	 *	3.	Checksums correctly. [Speed optimisation for later, skip loopback checksums]
	 *	4.	Doesn't have a bogus length
	 */
	/* Header length and version check. */
	if (iph->ihl < 5 || iph->version != 4)
		goto inhdr_error;
	/* Same pull as above, but this time for the whole header (ihl*4 bytes). */
	if (!pskb_may_pull(skb, iph->ihl*4))
		goto inhdr_error;

	iph = ip_hdr(skb);
	/* Header checksum verification. */
	if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
		goto inhdr_error;
	/* Length consistency check. */
	len = ntohs(iph->tot_len);
	/* The real skb length may exceed the length the header reports (L2 may
	 * have padded the frame) or match it exactly; being shorter means the
	 * packet was truncated. */
	if (skb->len < len) {
		IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INTRUNCATEDPKTS);
		goto drop;
	} else if (len < (iph->ihl*4))
	/* Total length smaller than the header length: bogus header. */
		goto inhdr_error;

	/* Our transport medium may have padded the buffer out. Now we know it
	 * is IP we can trim to the true length of the frame.
	 * Note this now means skb->len holds ntohs(iph->tot_len).
	 */
	/* Strip any L2 padding.  NOTE(review): the author remarks that the
	 * earlier checksum "does not count" after trimming; presumably that is
	 * what the _rcsum variant deals with -- confirm. */
	if (pskb_trim_rcsum(skb, len)) {
		IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	/* Remove any debris in the socket control block */
	/* Author's note: it looks like L3 has no use for leftover ip_options
	 * here, so the whole inet_skb_parm area is zeroed. */
	memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));

	/* Must drop socket now because of tproxy. */
	/* From here on the skb is handled by this path: detach it. */
	skb_orphan(skb);
	/* Run the Netfilter PRE_ROUTING hook (the author notes LVS builds on
	 * this); on the normal path ip_rcv_finish is executed afterwards. */
	return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, dev, NULL,
		       ip_rcv_finish);

inhdr_error:
	/* Record the header error in the MIB counters. */
	IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS);
drop:
	kfree_skb(skb);
out:	/* common exit for every discarded packet */
	return NET_RX_DROP;
}

 接下来就是看看 ip_rcv_finish

 

 

 

/*
 * ip_rcv_finish - continuation passed to the PRE_ROUTING hook by ip_rcv.
 *
 * Looks up an input route when the skb carries no dst entry yet, updates
 * the optional per-class routing accounting, processes IP options when the
 * header is longer than 5 words, bumps the multicast/broadcast counters and
 * finally hands the skb to dst_input().
 *
 * Returns the dst_input() result, or NET_RX_DROP after freeing the skb.
 */
static int ip_rcv_finish(struct sk_buff *skb)
{
	const struct iphdr *iph = ip_hdr(skb);
	struct rtable *rt;

	/*
	 *	If the packet has no dst_entry yet (we do not know where it
	 *	should go), ask the routing subsystem whether anyone wants it.
	 *	A lookup failure ends processing here.
	 */
	if (skb_dst(skb) == NULL) {
		int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
					       iph->tos, skb->dev);
		if (unlikely(err)) {
			/* Map the routing error onto the matching counter. */
			if (err == -EHOSTUNREACH)
				IP_INC_STATS_BH(dev_net(skb->dev),
						IPSTATS_MIB_INADDRERRORS);
			else if (err == -ENETUNREACH)
				IP_INC_STATS_BH(dev_net(skb->dev),
						IPSTATS_MIB_INNOROUTES);
			else if (err == -EXDEV)
				NET_INC_STATS_BH(dev_net(skb->dev),
						 LINUX_MIB_IPRPFILTER);
			goto drop;
		}
	}
/* Update this CPU's ip_rt_acct statistics, cf. /proc/net/rt_acct. */
#ifdef CONFIG_NET_CLS_ROUTE
	if (unlikely(skb_dst(skb)->tclassid)) {
		struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct);
		u32 idx = skb_dst(skb)->tclassid;
		/* Low byte of tclassid selects the "o_" slot, bits 16..23 the
		 * "i_" slot. */
		st[idx&0xFF].o_packets++;
		st[idx&0xFF].o_bytes += skb->len;
		st[(idx>>16)&0xFF].i_packets++;
		st[(idx>>16)&0xFF].i_bytes += skb->len;
	}
#endif
	/* Handle IP options when the header carries any. */
	if (iph->ihl > 5 && ip_rcv_options(skb))
		goto drop;
	/* Per-destination-type statistics. */
	rt = skb_rtable(skb);
	if (rt->rt_type == RTN_MULTICAST) {
		IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INMCAST,
				skb->len);
	} else if (rt->rt_type == RTN_BROADCAST)
		IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INBCAST,
				skb->len);
	/* "where to go": the article explains after this function where the
	 * handler that dst_input() ends up calling is set. */

	return dst_input(skb);

drop:
	kfree_skb(skb);
	return NET_RX_DROP;
}

这个决定 L3 包走向的函数指针是在哪里设置的呢? 跟着我看一下吧

接着上一篇讲过的inet_init()->ip_init()->ip_rt_init()

-->

rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);

<route.c __rtnl_register登记 >

-->

 

 

/*
 * Abbreviated excerpt of ip_route_input_common().  Elided code is marked
 * with "//...", so the braces shown here are not balanced and the snippet
 * is illustrative only.  Two calls are kept visible: ip_route_input_mc()
 * when `our` is set, and ip_route_input_slow() further down.
 */
int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			   u8 tos, struct net_device *dev, bool noref)
{
	//...
			/* Branch B in the article: taken when `our` is set. */
			if (our ) {
				int res = ip_route_input_mc(skb, daddr, saddr,
							    tos, dev, our);
	//...
		
			}
	//...
	
	}
//...

	/* Branch A in the article: the slow-path route lookup. */
	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
}
 

inet_rtm_getroute()->ip_route_input()->ip_route_input_common()->

 

A:ip_route_input_slow()

->ip_mkroute_input()->__mkroute_input()

"ip_forward;"

B<our>: ip_route_input_mc()

B1 "ip_local_deliver"

B2 "init_net.loopback_dev";

这里只列出几个 likely 的分支, 不考虑异常分支, 别的就先不介绍了

/*
 * ip_forward - forward an IPv4 packet.
 *
 * Runs the checks below (LRO, xfrm policy, Router Alert, packet type, TTL,
 * xfrm route, strict source route, MTU/DF), makes the header writable,
 * decrements the TTL, optionally sends a redirect, sets skb->priority from
 * the TOS and passes the skb to the NF_INET_FORWARD hook with
 * ip_forward_finish as the continuation.
 *
 * Returns NET_RX_SUCCESS when ip_call_ra_chain() returns non-zero, the
 * NF_HOOK() result on the normal path, and NET_RX_DROP after freeing the
 * skb on every drop path.
 */
int ip_forward(struct sk_buff *skb)
{
	struct iphdr *iph;	/* Our header */
	struct rtable *rt;	/* Route we use */
	struct ip_options * opt	= &(IPCB(skb)->opt);

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm4_policy_check(NULL, XFRM_POLICY_FWD, skb))
		goto drop;
	/* Router Alert option handling; ip_call_ra_chain() is covered later
	 * in the article. */
	if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb))
		return NET_RX_SUCCESS;
	/* Only forward packets that L2 processing marked as PACKET_HOST. */
	if (skb->pkt_type != PACKET_HOST)
		goto drop;
	/* NOTE(review): the author describes this call as doing nothing and
	 * simply letting the packet pass; skb_forward_csum() is not shown
	 * here -- confirm against its definition. */
	skb_forward_csum(skb);

	/* Required by the RFC, e.g. to guard against routing loops. */
	if (ip_hdr(skb)->ttl <= 1)
		goto too_many_hops;

	/* VPN/IPsec: let the xfrm framework handle forwarding. */
	if (!xfrm4_route_forward(skb))
		goto drop;
	
	rt = skb_rtable(skb);
	/* If the IP options demand Strict Source Routing but the route's
	 * destination differs from its gateway, fail. */
	if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
		goto sr_failed;
	/* Packet longer than the destination MTU while fragmentation is
	 * forbidden (DF set): send an ICMP "fragmentation needed" message
	 * (the author points to TCP/IP Illustrated vol. 1, ch. 9). */
	if (unlikely(skb->len > dst_mtu(&rt->dst) && !skb_is_gso(skb) &&
		     (ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) {
		IP_INC_STATS(dev_net(rt->dst.dev), IPSTATS_MIB_FRAGFAILS);
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
			  htonl(dst_mtu(&rt->dst)));
		goto drop;
	}

	/* We are about to mangle packet. Copy it! (explained earlier in the
	 * article series) */
	if (skb_cow(skb, LL_RESERVED_SPACE(rt->dst.dev)+rt->dst.header_len))
		goto drop;
	iph = ip_hdr(skb);

	/* Decrease ttl after skb cow done, as the protocol requires. */
	ip_decrease_ttl(iph);

	/*
	 * If the route is flagged for redirect, no source-route option is set
	 * and the skb has no security path, send a redirect (which results in
	 * an ICMP message).  ip_rt_send_redirect() is analysed later in the
	 * article.
	 */
	if (rt->rt_flags&RTCF_DOREDIRECT && !opt->srr && !skb_sec_path(skb))
		ip_rt_send_redirect(skb);
	
	/* Derive the packet priority from the IP header's TOS field for later
	 * use by Traffic Control (author's aside: router designs care a lot
	 * about this field). */
	skb->priority = rt_tos2priority(iph->tos);
	/* As analysed before: after the Netfilter hook, ip_forward_finish is
	 * called. */
	return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, skb, skb->dev,
		       rt->dst.dev, ip_forward_finish);

sr_failed:
	/*
	 *	Strict routing permits no gatewaying (protocol requirement; the
	 *	author refers to TCP/IP Illustrated vol. 1).
	 */
	 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_SR_FAILED, 0);
	 goto drop;

too_many_hops:
	/* Tell the sender its packet died... */
	IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_INHDRERRORS);
	icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
drop:
	kfree_skb(skb);
	return NET_RX_DROP;
}

 好吧 ,下面就是

 

 

/*
 * ip_forward_finish - continuation passed to the FORWARD hook by ip_forward.
 *
 * Counts the forwarded datagram, runs ip_forward_options() when the control
 * block records a non-zero option length, and returns whatever dst_output()
 * returns.
 */
static int ip_forward_finish(struct sk_buff *skb)
{
	struct ip_options *fwd_opts = &(IPCB(skb)->opt);

	IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);

	/*
	 * ip_forward has already dealt with Router Alert and Strict Source
	 * Routing; any remaining options are handled here.  The unlikely()
	 * hint reflects how seldom IP options are present.
	 */
	if (unlikely(fwd_opts->optlen)) {
		ip_forward_options(skb);
	}

	/* Final step: hand the skb to the output handler attached to its dst
	 * (the author names ip_mc_output / ip_finish_output as examples). */
	return dst_output(skb);
}

 

 

=========================

补充上面说的 ip_call_ra_chain

 

 写道
struct ip_ra_chain {

structip_ra_chain__rcu*next;

structsock*sk;

union{

void(*destructor)(structsock*);

structsock*saved_sk;

};

structrcu_headrcu;

};

 int ip_call_ra_chain(struct sk_buff *skb)

{
	struct ip_ra_chain *ra;
	u8 protocol = ip_hdr(skb)->protocol;
	struct sock *last = NULL;
	struct net_device *dev = skb->dev;
	/*这里遍历了整个raw sock 链表 */
	for (ra = rcu_dereference(ip_ra_chain); ra; ra = rcu_dereference(ra->next)) {
		struct sock *sk = ra->sk;

		/* If socket is bound to an interface, only report
		 * the packet if it came  from that interface.
		 */
	/*包头端口号和该raw sock 的端口匹配 设备接口序号也匹配*/
		if (sk && inet_sk(sk)->inet_num == protocol &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == dev->ifindex) &&
		    net_eq(sock_net(sk), dev_net(dev))) {
	/*如果分段过 就去重组整个IP包 ip_fragment.c*/
			if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
				if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN))
					return 1;
			}
			if (last) {
	/*关键就是这里,把包复制一遍然后传给上层
	*放入该sock的sk_receive_queue*/
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					raw_rcv(last, skb2);
			}
	/*下一个raw sock */
			last = sk;
		}
	}

	if (last) {
		raw_rcv(last, skb);
		return 1;
	}

相关推荐