MASQUERADE address selection and port selection of SNAT

Posted by Nightslyr on Mon, 10 Jan 2022 13:16:22 +0100

Environment:

  1. Version: kernel-5.4.54, amd64, dual core, Ubuntu 18.04
  2. k8s cluster network component: flannel, Kube proxy: IPVS
  3. Code tool: vs code

1. General

  • SNAT (source address translation) is a core function of the NAT table of IPTABLES. It is widely used in intranet environments such as routers, ECS instances and K8S clusters, and is an indispensable part of the kernel network subsystem
  • IPTABLES NAT depends entirely on netfilter's conntrack; NAT cannot be performed on packets that have no conntrack entry
  • In a K8S cluster, DNAT is used for load balancing, and SNAT ensures that reply packets for traffic forwarded by a node come back to that node, so that it can undo the DNAT (de-DNAT) before the reply reaches the client, instead of the back end replying to the client directly:

    • The client accesses the load-balancing IP. If the back end replied directly to the client from its own IP, the client would not recognize the reply;
    • So the back end's reply is sent to the load balancer first, which restores the source address from the back-end IP to the load-balancing IP before passing the reply on to the client
  • Both IPTABLES and IPVS can realize DNAT load balancing, but SNAT can only be realized by IPTABLES
  • View the SNAT rules for IPTABLES in the cluster
root@cluster1-worker1:~# iptables -t nat -nL
target     prot opt source               destination         
KUBE-SERVICES  all  --              /* kubernetes service portals */
DOCKER     all  --              ADDRTYPE match dst-type LOCAL

Chain INPUT (policy ACCEPT)
target     prot opt source               destination         

Chain OUTPUT (policy ACCEPT)
target     prot opt source               destination         
KUBE-SERVICES  all  --              /* kubernetes service portals */
DOCKER     all  --           !          ADDRTYPE match dst-type LOCAL

target     prot opt source               destination         
KUBE-POSTROUTING  all  --              /* kubernetes postrouting rules */
MASQUERADE  all  --           
RETURN     all  --       
MASQUERADE  all  --       !         
RETURN     all  -- !       
MASQUERADE  all  -- ! 
Chain KUBE-POSTROUTING (1 references)
target     prot opt source        destination   
/* Kubernetes endpoints dst ip:port, source ip for solving hairpin purpose */      
MASQUERADE  all  --     match-set KUBE-LOOP-BACK dst,dst,src

RETURN     all  --     mark match ! 0x4000/0x4000
MARK       all  --     MARK xor 0x4000

/* kubernetes service traffic requiring SNAT */
MASQUERADE  all  --     

Analyzing how MASQUERADE performs SNAT is very helpful for understanding network communication in the cluster.

2. Concept

2.1 de-SNAT

Why is de-SNAT needed?
Suppose the local machine applies SNAT to a packet sent by POD1, so the source IP changes from POD1-IP to HOST-IP. The server's reply is then addressed to HOST-IP, but it is POD1 that needs to receive it. Unless de-SNAT rewrites the destination of the reply back to POD1-IP, POD1 cannot receive the packet.

2.2 SNAT related hook points in Netfilter

In a K8S cluster, the SNAT rules run at the POST_ROUTING hook, and de-SNAT is done at the PRE_ROUTING hook.

3. Code analysis

3.1 hook function registered in NAT table by masquerade

/*
 * xt_target registrations for the MASQUERADE target: one entry per address
 * family. Both entries are limited to the "nat" table and to the
 * POST_ROUTING hook, and share the same destroy callback.
 */
static struct xt_target masquerade_tg_reg[] __read_mostly = {
    {
        /* IPv6 variant */
        .name       = "MASQUERADE",
        .family     = NFPROTO_IPV6,
        .target     = masquerade_tg6,
        .targetsize = sizeof(struct nf_nat_range),
        .table      = "nat",
        .hooks      = 1 << NF_INET_POST_ROUTING,
        .checkentry = masquerade_tg6_checkentry,
        .destroy    = masquerade_tg_destroy,
        .me     = THIS_MODULE,
    }, {
        /* IPv4 variant; its .target handler is masquerade_tg */
        .name       = "MASQUERADE",
        .family     = NFPROTO_IPV4,
        .target     = masquerade_tg,
        .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat),
        .table      = "nat",
        .hooks      = 1 << NF_INET_POST_ROUTING,
        .checkentry = masquerade_tg_check,
        .destroy    = masquerade_tg_destroy,
        .me     = THIS_MODULE,
    }
};

3.2 masquerade_tg analysis

static unsigned int
masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
    struct nf_nat_range2 range;
    const struct nf_nat_ipv4_multi_range_compat *mr;

    /* Get the configuration of the rule and the available port range of the SNAT */
    mr = par->targinfo;
    range.flags = mr->range[0].flags;
    range.min_proto = mr->range[0].min;
    range.max_proto = mr->range[0].max;

    /* Core function */
    return nf_nat_masquerade_ipv4(skb, xt_hooknum(par), &range,

3.2.1 nf_nat_masquerade_ipv4 analysis

unsigned int
nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum,
               const struct nf_nat_range2 *range,
               const struct net_device *out)
    struct nf_conn *ct;
    struct nf_conn_nat *nat;
    enum ip_conntrack_info ctinfo;
    struct nf_nat_range2 newrange;
    const struct rtable *rt;
    __be32 newsrc, nh;


    /* Get conntrack connection information */
    ct = nf_ct_get(skb, &ctinfo);

    WARN_ON(!(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
             ctinfo == IP_CT_RELATED_REPLY)));

    /* Source address is - locally generated packet that is
     * probably not supposed to be masqueraded.
    if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip == 0)
        return NF_ACCEPT;

    /* Get routing table */
    rt = skb_rtable(skb);
    /* Next hop address */
    nh = rt_nexthop(rt, ip_hdr(skb)->daddr);
    /* Select the most appropriate SNAT source address */
    newsrc = inet_select_addr(out, nh, RT_SCOPE_UNIVERSE);
    if (!newsrc) {
        pr_info("%s ate my IP address\n", out->name);
        return NF_DROP;

    nat = nf_ct_nat_ext_add(ct);
    if (nat)
        nat->masq_index = out->ifindex;

    /* Transfer from original range. */
    /* Set available source addresses and source port ranges */
    memset(&newrange.min_addr, 0, sizeof(newrange.min_addr));
    memset(&newrange.max_addr, 0, sizeof(newrange.max_addr));
    newrange.flags       = range->flags | NF_NAT_RANGE_MAP_IPS;
    newrange.min_addr.ip = newsrc;
    newrange.max_addr.ip = newsrc;
    newrange.min_proto   = range->min_proto;
    newrange.max_proto   = range->max_proto;

    /* Hand modified range to generic setup. */
    /* Determine the SNAT source address according to the available range and modify the connection record */
    return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC);

3.2.2 nf_nat_setup_info analysis

unsigned int
nf_nat_setup_info(struct nf_conn *ct,
          const struct nf_nat_range2 *range,
          enum nf_nat_manip_type maniptype)
    struct net *net = nf_ct_net(ct);
    struct nf_conntrack_tuple curr_tuple, new_tuple;

    /* Can't setup nat info for confirmed ct. */
    if (nf_ct_is_confirmed(ct))
        return NF_ACCEPT;

    WARN_ON(maniptype != NF_NAT_MANIP_SRC &&
        maniptype != NF_NAT_MANIP_DST);

    if (WARN_ON(nf_nat_initialized(ct, maniptype)))
        return NF_DROP;

    /* What we've got will look like inverse of reply. Normally
     * this is what is in the conntrack, except for prior
     * manipulations (future optimization: if num_manips == 0,
     * orig_tp = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)

    /* Gets a unique quintuple from the available range */
    get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype);

    if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) {
        struct nf_conntrack_tuple reply;

        /* Alter conntrack table so will recognize replies. */
        /* Modify the quintuple of the backhaul in conntrack */
        nf_ct_invert_tuple(&reply, &new_tuple);
        nf_conntrack_alter_reply(ct, &reply);

        /* Non-atomic: we own this at the moment. */
        /* Identify the nat type that needs to be done */
        if (maniptype == NF_NAT_MANIP_SRC)
            ct->status |= IPS_SRC_NAT;
            ct->status |= IPS_DST_NAT;

        if (nfct_help(ct) && !nfct_seqadj(ct))
            if (!nfct_seqadj_ext_add(ct))
                return NF_DROP;

    /* Add connection records to the bysource table */
    if (maniptype == NF_NAT_MANIP_SRC) {
        unsigned int srchash;
        spinlock_t *lock;

        srchash = hash_by_src(net,
        lock = &nf_nat_locks[srchash % CONNTRACK_LOCKS];

    /* It's done. */
    if (maniptype == NF_NAT_MANIP_DST)
        ct->status |= IPS_DST_NAT_DONE;
        ct->status |= IPS_SRC_NAT_DONE;

    return NF_ACCEPT;

3.2.3 get_unique_tuple analysis

/* Manipulate the tuple into the range given. For NF_INET_POST_ROUTING,
 * we change the source to map into the range. For NF_INET_PRE_ROUTING
 * and NF_INET_LOCAL_OUT, we change the destination to map into the
 * range. It might not be possible to get a unique tuple, but we try.
 * At worst (or if we race), we will end up with a final duplicate in
 * __nf_conntrack_confirm and drop the packet. */
static void
get_unique_tuple(struct nf_conntrack_tuple *tuple,
         const struct nf_conntrack_tuple *orig_tuple,
         const struct nf_nat_range2 *range,
         struct nf_conn *ct,
         enum nf_nat_manip_type maniptype)
    const struct nf_conntrack_zone *zone;
    struct net *net = nf_ct_net(ct);

    zone = nf_ct_zone(ct);

    /* 1) If this srcip/proto/src-proto-part is currently mapped,
     * and that same mapping gives a unique tuple within the given
     * range, use that.
     * This is only required for source (ie. NAT/masq) mappings.
     * So far, we don't do local source mappings, so multiple
     * manips not an issue.
    /* First try to determine whether the SNAT does not meet the availability range, or obtain the SNAT source address in the recent SNAT connection record */
    if (maniptype == NF_NAT_MANIP_SRC &&
        !(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
        /* SNAT And non random ports will come here */
        /* try the original tuple first */
        /* SNAT is not used to judge whether the availability range is met */
        if (in_range(orig_tuple, range)) {
            /* Determine whether the quintuple is unique */
            if (!nf_nat_used_tuple(orig_tuple, ct)) {
                *tuple = *orig_tuple;
        /* Obtain the SNAT source address in the connection record of the latest SNAT according to the source address hash */
        } else if (find_appropriate_src(net, zone,
                        orig_tuple, tuple, range)) {
            pr_debug("get_unique_tuple: Found current src map\n");
            /* Determine whether the quintuple is unique */
            if (!nf_nat_used_tuple(tuple, ct))

    /* The random port or the five tuples that meet the above judgment are not found */
    /* 2) Select the least-used IP/proto combination in the given range */
    *tuple = *orig_tuple;
    /* Get the most appropriate source address from the source address range */
    find_best_ips_proto(zone, tuple, range, ct, maniptype);

    /* 3) The per-protocol part of the manip is made to map into
     * the range to make a unique tuple.

    /* Only bother mapping if it's not already in range and unique */
    /* Judge whether the quintuple meets the range without modifying the port */
    if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
        if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
            if (!(range->flags & NF_NAT_RANGE_PROTO_OFFSET) &&
                l4proto_in_range(tuple, maniptype,
                      &range->max_proto) &&
                (range->min_proto.all == range->max_proto.all ||
                 !nf_nat_used_tuple(tuple, ct)))
                /* Non random port & & set the port range & & the port meets the range & & quintuple unique
                 * It will go here and directly return the five tuples of confirmation*/
        } else if (!nf_nat_used_tuple(tuple, ct)) {
            /* Non random port & & port range not set & & quintuple unique
             * It will go here and directly return the five tuples of confirmation*/

    /* Last chance: get protocol to try to obtain unique tuple. */
    /* Select an appropriate port in the available range (five tuples are unique, and the port is in the range) */
    nf_nat_l4proto_unique_tuple(tuple, range, maniptype, ct);

Note that none of the code above modifies the packet itself: only the conntrack connection record is changed here. The packet is rewritten later, according to that connection record.
Packet modification and de-SNAT are covered in the NAT analysis document: "Connection tracking and NAT analysis of IPTABLES".

3.3 difference between SNAT and MASQ

3.3.1 SNAT hook function

/*
 * xt_target registrations for the generic NAT targets. Only the revision 0
 * "SNAT" entry for IPv4 is shown here; unlike MASQUERADE it may be used at
 * LOCAL_IN as well as at POST_ROUTING.
 */
static struct xt_target xt_nat_target_reg[] __read_mostly = {
    {
        .name       = "SNAT",
        .revision   = 0,
        .checkentry = xt_nat_checkentry_v0,
        .destroy    = xt_nat_destroy,
        .target     = xt_snat_target_v0,
        .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat),
        .family     = NFPROTO_IPV4,
        .table      = "nat",
        .hooks      = (1 << NF_INET_POST_ROUTING) |
                  (1 << NF_INET_LOCAL_IN),
        .me     = THIS_MODULE,
    },
    /* NOTE(review): the upstream array presumably continues with further
     * SNAT/DNAT entries that this excerpt leaves out - confirm against the
     * kernel source. */
};

3.3.2 xt_snat_target_v0 analysis

static unsigned int
xt_snat_target_v0(struct sk_buff *skb, const struct xt_action_param *par)
    const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
    struct nf_nat_range2 range;
    enum ip_conntrack_info ctinfo;
    struct nf_conn *ct;

    ct = nf_ct_get(skb, &ctinfo);
    WARN_ON(!(ct != NULL &&
         (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
          ctinfo == IP_CT_RELATED_REPLY)));

    /* Get range */
    xt_nat_convert_range(&range, &mr->range[0]);
    /* Determine the SNAT source address according to the available range and modify the connection record */
    return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);

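You can see that both SNAT and MASQUERADE end up calling nf_nat_setup_info. The difference is that MASQUERADE has an extra step that selects the most appropriate source IP for the output device, whereas SNAT takes the address range directly from the rule.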

Topics: C