Environment:
- Version: kernel-5.4.54, amd64, dual core, Ubuntu 18.04
- k8s cluster network component: flannel, Kube proxy: IPVS
- Code tool: vs code
1. General
- SNAT (source network address translation) is the core function of the iptables NAT table. It is widely used in intranet environments such as routers, ECS cloud servers and K8S clusters, and it is an indispensable part of the kernel network subsystem
- The NAT of IPTABLES completely depends on the conntrack of netfilter. NAT cannot be performed on packets without conntrack
In a K8S cluster, DNAT is used for load balancing, and SNAT ensures that replies to packets forwarded by the node come back to that node, so that the DNAT can be reversed (de-DNAT) there instead of the reply being sent directly to the client.
- The client accesses the load-balancing IP. If the back end replied directly to the client, the client would not recognize the reply, because its source is the back-end IP rather than the load-balancing IP;
- Therefore the reply from the back end goes to the load balancer first, where the back-end IP is restored to the load-balancing IP before the packet is sent to the client
- Both IPTABLES and IPVS can realize DNAT load balancing, but SNAT can only be realized by IPTABLES
- View the SNAT rules for IPTABLES in the cluster
root@cluster1-worker1:~# iptables -t nat -nL
Chain PREROUTING (policy ACCEPT)
target prot opt source destination
KUBE-SERVICES all -- 0.0.0.0/0 0.0.0.0/0 /* kubernetes service portals */
DOCKER all -- 0.0.0.0/0 0.0.0.0/0 ADDRTYPE match dst-type LOCAL

Chain INPUT (policy ACCEPT)
target prot opt source destination

Chain OUTPUT (policy ACCEPT)
target prot opt source destination
KUBE-SERVICES all -- 0.0.0.0/0 0.0.0.0/0 /* kubernetes service portals */
DOCKER all -- 0.0.0.0/0 !127.0.0.0/8 ADDRTYPE match dst-type LOCAL

Chain POSTROUTING (policy ACCEPT)
target prot opt source destination
KUBE-POSTROUTING all -- 0.0.0.0/0 0.0.0.0/0 /* kubernetes postrouting rules */
MASQUERADE all -- 172.17.0.0/16 0.0.0.0/0
RETURN all -- 10.244.0.0/16 10.244.0.0/16
MASQUERADE all -- 10.244.0.0/16 !224.0.0.0/4
RETURN all -- !10.244.0.0/16 10.244.2.0/24
MASQUERADE all -- !10.244.0.0/16 10.244.0.0/16
...

Chain KUBE-POSTROUTING (1 references)
target prot opt source destination
/* Kubernetes endpoints dst ip:port, source ip for solving hairpin purpose */
MASQUERADE all -- 0.0.0.0/0 0.0.0.0/0 match-set KUBE-LOOP-BACK dst,dst,src
RETURN all -- 0.0.0.0/0 0.0.0.0/0 mark match ! 0x4000/0x4000
MARK all -- 0.0.0.0/0 0.0.0.0/0 MARK xor 0x4000
/* kubernetes service traffic requiring SNAT */
MASQUERADE all -- 0.0.0.0/0 0.0.0.0/0
...
Analyzing how MASQUERADE performs SNAT is very helpful for understanding cluster network communication
2. Concept
2.1 de-SNAT
Why is de-SNAT needed?
Suppose the local host applies SNAT to a packet sent by POD1, changing its source IP from POD1-IP to HOST-IP. The server's reply is then addressed to HOST-IP, but it is POD1 that needs to receive it. Unless de-SNAT rewrites the destination of the reply back to POD1-IP, POD1 cannot receive the reply
2.2 SNAT related hook points in Netfilter
In a K8S cluster, the SNAT rules take effect at the POST_ROUTING hook, and de-SNAT is performed at the PRE_ROUTING hook
3. Code analysis
3.1 hook function registered in NAT table by masquerade
static struct xt_target masquerade_tg_reg[] __read_mostly = { { #if IS_ENABLED(CONFIG_IPV6) .name = "MASQUERADE", .family = NFPROTO_IPV6, .target = masquerade_tg6, .targetsize = sizeof(struct nf_nat_range), .table = "nat", .hooks = 1 << NF_INET_POST_ROUTING, .checkentry = masquerade_tg6_checkentry, .destroy = masquerade_tg_destroy, .me = THIS_MODULE, }, { #endif .name = "MASQUERADE", .family = NFPROTO_IPV4, .target = masquerade_tg, .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat), .table = "nat", .hooks = 1 << NF_INET_POST_ROUTING, .checkentry = masquerade_tg_check, .destroy = masquerade_tg_destroy, .me = THIS_MODULE, } };
3.2 masquerade_tg analysis
static unsigned int masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par) { struct nf_nat_range2 range; const struct nf_nat_ipv4_multi_range_compat *mr; /* Get the configuration of the rule and the available port range of the SNAT */ mr = par->targinfo; range.flags = mr->range[0].flags; range.min_proto = mr->range[0].min; range.max_proto = mr->range[0].max; /* Core function */ return nf_nat_masquerade_ipv4(skb, xt_hooknum(par), &range, xt_out(par)); }
3.2.1 nf_nat_masquerade_ipv4 analysis
unsigned int nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, const struct nf_nat_range2 *range, const struct net_device *out) { struct nf_conn *ct; struct nf_conn_nat *nat; enum ip_conntrack_info ctinfo; struct nf_nat_range2 newrange; const struct rtable *rt; __be32 newsrc, nh; WARN_ON(hooknum != NF_INET_POST_ROUTING); /* Get conntrack connection information */ ct = nf_ct_get(skb, &ctinfo); WARN_ON(!(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY))); /* Source address is 0.0.0.0 - locally generated packet that is * probably not supposed to be masqueraded. */ if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip == 0) return NF_ACCEPT; /* Get routing table */ rt = skb_rtable(skb); /* Next hop address */ nh = rt_nexthop(rt, ip_hdr(skb)->daddr); /* Select the most appropriate SNAT source address */ newsrc = inet_select_addr(out, nh, RT_SCOPE_UNIVERSE); if (!newsrc) { pr_info("%s ate my IP address\n", out->name); return NF_DROP; } nat = nf_ct_nat_ext_add(ct); if (nat) nat->masq_index = out->ifindex; /* Transfer from original range. */ /* Set available source addresses and source port ranges */ memset(&newrange.min_addr, 0, sizeof(newrange.min_addr)); memset(&newrange.max_addr, 0, sizeof(newrange.max_addr)); newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS; newrange.min_addr.ip = newsrc; newrange.max_addr.ip = newsrc; newrange.min_proto = range->min_proto; newrange.max_proto = range->max_proto; /* Hand modified range to generic setup. */ /* Determine the SNAT source address according to the available range and modify the connection record */ return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC); }
3.2.2 nf_nat_setup_info analysis
/*
 * nf_nat_setup_info - set up NAT on a not-yet-confirmed conntrack entry
 * @ct:        connection to configure
 * @range:     allowed range, passed on to get_unique_tuple()
 * @maniptype: NF_NAT_MANIP_SRC or NF_NAT_MANIP_DST (anything else warns)
 *
 * Only the conntrack entry is changed here: its reply tuple, its status bits
 * and, for source NAT, its membership in the bysource hash list. No packet is
 * passed to this function.
 *
 * Returns NF_ACCEPT when @ct is already confirmed or when the setup completed,
 * NF_DROP when NAT of this type was already initialized for @ct or when
 * nfct_seqadj_ext_add() returns a false value.
 */
unsigned int nf_nat_setup_info(struct nf_conn *ct,
			       const struct nf_nat_range2 *range,
			       enum nf_nat_manip_type maniptype)
{
	struct net *net = nf_ct_net(ct);
	struct nf_conntrack_tuple curr_tuple, new_tuple;

	/* Can't setup nat info for confirmed ct. */
	if (nf_ct_is_confirmed(ct))
		return NF_ACCEPT;

	WARN_ON(maniptype != NF_NAT_MANIP_SRC &&
		maniptype != NF_NAT_MANIP_DST);

	/* Setting up the same manip type twice on one connection is refused. */
	if (WARN_ON(nf_nat_initialized(ct, maniptype)))
		return NF_DROP;

	/* What we've got will look like inverse of reply. Normally
	 * this is what is in the conntrack, except for prior
	 * manipulations (future optimization: if num_manips == 0,
	 * orig_tp = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)
	 */
	nf_ct_invert_tuple(&curr_tuple,
			   &ct->tuplehash[IP_CT_DIR_REPLY].tuple);

	/* Derive new_tuple from curr_tuple, constrained by the given range. */
	get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype);

	/* The connection record only needs changing if the tuple actually changed. */
	if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) {
		struct nf_conntrack_tuple reply;

		/* Alter conntrack table so will recognize replies:
		 * the reply tuple becomes the inverse of the new tuple.
		 */
		nf_ct_invert_tuple(&reply, &new_tuple);
		nf_conntrack_alter_reply(ct, &reply);

		/* Non-atomic: we own this at the moment. */
		/* Flag which kind of NAT this connection needs. */
		if (maniptype == NF_NAT_MANIP_SRC)
			ct->status |= IPS_SRC_NAT;
		else
			ct->status |= IPS_DST_NAT;

		/* A connection with a helper but no seqadj extension gets one added;
		 * failure to add it drops the packet.
		 */
		if (nfct_help(ct) && !nfct_seqadj(ct))
			if (!nfct_seqadj_ext_add(ct))
				return NF_DROP;
	}

	/* For source NAT, add the connection to the bysource hash list, keyed by
	 * the hash of its original-direction tuple and guarded by the matching
	 * entry of nf_nat_locks.
	 */
	if (maniptype == NF_NAT_MANIP_SRC) {
		unsigned int srchash;
		spinlock_t *lock;

		srchash = hash_by_src(net,
				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
		lock = &nf_nat_locks[srchash % CONNTRACK_LOCKS];
		spin_lock_bh(lock);
		hlist_add_head_rcu(&ct->nat_bysource,
				   &nf_nat_bysource[srchash]);
		spin_unlock_bh(lock);
	}

	/* It's done: mark this manip type as set up on the connection. */
	if (maniptype == NF_NAT_MANIP_DST)
		ct->status |= IPS_DST_NAT_DONE;
	else
		ct->status |= IPS_SRC_NAT_DONE;

	return NF_ACCEPT;
}
3.2.3 get_unique_tuple analysis
/* Manipulate the tuple into the range given. For NF_INET_POST_ROUTING,
 * we change the source to map into the range. For NF_INET_PRE_ROUTING
 * and NF_INET_LOCAL_OUT, we change the destination to map into the
 * range. It might not be possible to get a unique tuple, but we try.
 * At worst (or if we race), we will end up with a final duplicate in
 * __nf_conntrack_confirm and drop the packet.
 *
 * @tuple:      output; receives the selected tuple
 * @orig_tuple: tuple to start from
 * @range:      allowed range and selection flags
 * @ct:         connection the tuple is selected for
 * @maniptype:  which side (source or destination) is being manipulated
 */
static void get_unique_tuple(struct nf_conntrack_tuple *tuple,
			     const struct nf_conntrack_tuple *orig_tuple,
			     const struct nf_nat_range2 *range,
			     struct nf_conn *ct,
			     enum nf_nat_manip_type maniptype)
{
	const struct nf_conntrack_zone *zone;
	struct net *net = nf_ct_net(ct);

	zone = nf_ct_zone(ct);

	/* 1) If this srcip/proto/src-proto-part is currently mapped,
	 * and that same mapping gives a unique tuple within the given
	 * range, use that.
	 *
	 * This is only required for source (ie. NAT/masq) mappings.
	 * So far, we don't do local source mappings, so multiple
	 * manips not an issue.
	 */
	/* Taken only for source NAT when none of the PROTO_RANDOM_ALL flags is
	 * set: first try to keep the original tuple, otherwise try to reuse an
	 * existing source mapping.
	 */
	if (maniptype == NF_NAT_MANIP_SRC &&
	    !(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
		/* try the original tuple first */
		/* Does the unmodified tuple already lie inside the range? */
		if (in_range(orig_tuple, range)) {
			/* It does; use it as-is if no other connection uses it. */
			if (!nf_nat_used_tuple(orig_tuple, ct)) {
				*tuple = *orig_tuple;
				return;
			}
		/* Not in range: ask find_appropriate_src() for an existing mapping
		 * of this source; on success it has filled *tuple.
		 * NOTE(review): presumably looked up via the bysource hash - confirm
		 * against find_appropriate_src().
		 */
		} else if (find_appropriate_src(net, zone,
						orig_tuple, tuple, range)) {
			pr_debug("get_unique_tuple: Found current src map\n");
			/* Use the found mapping if the resulting tuple is unused. */
			if (!nf_nat_used_tuple(tuple, ct))
				return;
		}
	}

	/* Reached for random port selection, for destination NAT, or when
	 * step 1 produced no unused tuple.
	 */
	/* 2) Select the least-used IP/proto combination in the given range */
	*tuple = *orig_tuple;
	/* Starting again from the original tuple, choose the address part. */
	find_best_ips_proto(zone, tuple, range, ct, maniptype);

	/* 3) The per-protocol part of the manip is made to map into
	 * the range to make a unique tuple.
	 */

	/* Only bother mapping if it's not already in range and unique */
	/* Without random port selection, check whether the tuple can be used
	 * with its per-protocol part left unchanged.
	 */
	if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
		if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
			if (!(range->flags & NF_NAT_RANGE_PROTO_OFFSET) &&
			    l4proto_in_range(tuple, maniptype,
					     &range->min_proto,
					     &range->max_proto) &&
			    (range->min_proto.all == range->max_proto.all ||
			     !nf_nat_used_tuple(tuple, ct)))
				/* A protocol range is specified without the offset
				 * flag, the tuple lies inside it, and either the
				 * range is a single value or the tuple is unused:
				 * keep the tuple as it is.
				 */
				return;
		} else if (!nf_nat_used_tuple(tuple, ct)) {
			/* No protocol range specified and the tuple is unused:
			 * keep the tuple as it is.
			 */
			return;
		}
	}

	/* Last chance: get protocol to try to obtain unique tuple. */
	/* Let the per-protocol code pick the protocol part within the range. */
	nf_nat_l4proto_unique_tuple(tuple, range, maniptype, ct);
}
Note that the packet itself is not modified at this point. Only the conntrack connection record is modified here; the packet is modified later according to that connection record.
Packet modification and de-SNAT are covered in the NAT analysis document "Connection tracking and NAT analysis of IPTABLES"
3.3 difference between SNAT and MASQ
3.3.1 SNAT hook function
static struct xt_target xt_nat_target_reg[] __read_mostly = { { .name = "SNAT", .revision = 0, .checkentry = xt_nat_checkentry_v0, .destroy = xt_nat_destroy, .target = xt_snat_target_v0, .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat), .family = NFPROTO_IPV4, .table = "nat", .hooks = (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_IN), .me = THIS_MODULE, }, ...
3.3.2 xt_snat_target_v0 analysis
static unsigned int xt_snat_target_v0(struct sk_buff *skb, const struct xt_action_param *par) { const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; struct nf_nat_range2 range; enum ip_conntrack_info ctinfo; struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); WARN_ON(!(ct != NULL && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY))); /* Get range */ xt_nat_convert_range(&range, &mr->range[0]); /* Determine the SNAT source address according to the available range and modify the connection record */ return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC); }
You can see that both SNAT and MASQ finally call nf_nat_setup_info; the difference is that MASQ first has an extra step that selects the most appropriate source IP.