patch-2.3.99-pre2 linux/net/ipv4/netfilter/ip_conntrack_core.c
Next file: linux/net/ipv4/netfilter/ip_conntrack_ftp.c
Previous file: linux/net/ipv4/netfilter/Makefile
Back to the patch index
Back to the overall index
- Lines: 892
- Date:
Fri Mar 17 10:56:20 2000
- Orig file:
v2.3.99-pre1/linux/net/ipv4/netfilter/ip_conntrack_core.c
- Orig date:
Wed Dec 31 16:00:00 1969
diff -u --recursive --new-file v2.3.99-pre1/linux/net/ipv4/netfilter/ip_conntrack_core.c linux/net/ipv4/netfilter/ip_conntrack_core.c
@@ -0,0 +1,891 @@
+/* Connection state tracking for netfilter. This is separated from,
+ but required by, the NAT layer; it can also be used by an iptables
+ extension. */
+
+/* (c) 1999 Paul `Rusty' Russell. Licenced under the GNU General
+ Public Licence. */
+
+#ifdef MODULE
+#define __NO_VERSION__
+#endif
+#include <linux/version.h>
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <linux/vmalloc.h>
+#include <linux/brlock.h>
+#include <net/checksum.h>
+#include <linux/stddef.h>
+#include <linux/sysctl.h>
+
+/* This rwlock protects the main hash table, protocol/helper/expected
+ registrations, and conntrack timers. */
+#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock)
+#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock)
+
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/ip_conntrack_core.h>
+#include <linux/netfilter_ipv4/listhelp.h>
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+DECLARE_RWLOCK(ip_conntrack_lock);
+
+void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
+static LIST_HEAD(expect_list);
+static LIST_HEAD(protocol_list);
+static LIST_HEAD(helpers);
+unsigned int ip_conntrack_htable_size = 0;
+static int ip_conntrack_max = 0;
+static atomic_t ip_conntrack_count = ATOMIC_INIT(0);
+struct list_head *ip_conntrack_hash;
+
+extern struct ip_conntrack_protocol ip_conntrack_generic_protocol;
+
+/* LIST_FIND callback: non-zero when this registered tracker entry
+ handles the given IP protocol number. */
+static inline int proto_cmpfn(const struct ip_conntrack_protocol *curr,
+ u_int8_t protocol)
+{
+ return curr->proto == protocol;
+}
+
+/* Look up the tracking module for an IP protocol number, falling
+ back to the generic tracker when nothing specific is registered.
+ Caller must hold ip_conntrack_lock for reading. */
+struct ip_conntrack_protocol *__find_proto(u_int8_t protocol)
+{
+ struct ip_conntrack_protocol *found;
+
+ MUST_BE_READ_LOCKED(&ip_conntrack_lock);
+ found = LIST_FIND(&protocol_list, proto_cmpfn,
+ struct ip_conntrack_protocol *, protocol);
+
+ return found ? found : &ip_conntrack_generic_protocol;
+}
+
+/* Locking wrapper around __find_proto(): takes the read lock for the
+ duration of the list walk. Never returns NULL (generic fallback). */
+struct ip_conntrack_protocol *find_proto(u_int8_t protocol)
+{
+ struct ip_conntrack_protocol *proto;
+
+ READ_LOCK(&ip_conntrack_lock);
+ proto = __find_proto(protocol);
+ READ_UNLOCK(&ip_conntrack_lock);
+
+ return proto;
+}
+
+/* Drop one reference on @ct. Reference counting goes through the
+ embedded nf_conntrack info slots, which all point back at the same
+ master (set up in init_conntrack()), so any slot will do. */
+static inline void ip_conntrack_put(struct ip_conntrack *ct)
+{
+ IP_NF_ASSERT(ct);
+ IP_NF_ASSERT(ct->infos[0].master);
+ /* nf_conntrack_put wants to go via an info struct, so feed it
+ one at random. */
+ nf_conntrack_put(&ct->infos[0]);
+}
+
+/* Hash a tuple into [0, ip_conntrack_htable_size). Both directions
+ of one connection must land in different buckets (each direction
+ has its own tuplehash entry), hence the extra src per-proto term.
+ Relies on tuple->src.pad being zeroed by the tuple builders. */
+static inline u_int32_t
+hash_conntrack(const struct ip_conntrack_tuple *tuple)
+{
+#if 0
+ dump_tuple(tuple);
+#endif
+#ifdef CONFIG_NETFILTER_DEBUG
+ /* Non-zero padding would silently perturb the hash; warn. */
+ if (tuple->src.pad)
+ DEBUGP("Tuple %p has non-zero padding.\n", tuple);
+#endif
+ /* ntohl because more differences in low bits. */
+ /* To ensure that halves of the same connection don't hash
+ clash, we add the source per-proto again. */
+ return (ntohl(tuple->src.ip + tuple->dst.ip
+ + tuple->src.u.all + tuple->dst.u.all
+ + tuple->dst.protonum)
+ + ntohs(tuple->src.u.all))
+ % ip_conntrack_htable_size;
+}
+
+/* Build a conntrack tuple from an IP header. @len is the number of
+ bytes available starting at @iph; the protocol module reads the
+ first 8 bytes past the IP header for ports/ids. Returns 0 (and
+ leaves *tuple partially filled) for non-head fragments or packets
+ too short to carry those 8 bytes; otherwise the protocol module's
+ verdict. */
+inline int
+get_tuple(const struct iphdr *iph, size_t len,
+ struct ip_conntrack_tuple *tuple,
+ struct ip_conntrack_protocol *protocol)
+{
+ int ret;
+
+ /* Can only happen when extracting tuples from inside ICMP
+ packets */
+ if (iph->frag_off & htons(IP_OFFSET)) {
+ if (net_ratelimit())
+ printk("ip_conntrack_core: Frag of proto %u.\n",
+ iph->protocol);
+ return 0;
+ }
+ /* Guarantee 8 protocol bytes: if more wanted, use len param */
+ else if (iph->ihl * 4 + 8 > len)
+ return 0;
+
+ tuple->src.ip = iph->saddr;
+ tuple->src.pad = 0; /* must be zero: tuples are hashed and memcmp'd */
+ tuple->dst.ip = iph->daddr;
+ tuple->dst.protonum = iph->protocol;
+
+ ret = protocol->pkt_to_tuple((u_int32_t *)iph + iph->ihl,
+ len - 4*iph->ihl,
+ tuple);
+ return ret;
+}
+
+/* Produce the tuple seen from the opposite direction: addresses are
+ swapped, the protocol number carried over, and the protocol module
+ inverts its per-proto (port/id) parts. Returns 0 if the protocol
+ module refuses. */
+static int
+invert_tuple(struct ip_conntrack_tuple *inverse,
+ const struct ip_conntrack_tuple *orig,
+ const struct ip_conntrack_protocol *protocol)
+{
+ inverse->dst.ip = orig->src.ip;
+ inverse->dst.protonum = orig->dst.protonum;
+ inverse->src.ip = orig->dst.ip;
+ inverse->src.pad = 0; /* padding must stay zero for hash/memcmp */
+
+ return protocol->invert_tuple(inverse, orig);
+}
+
+/* Final destructor, invoked through the nf_conntrack destroy hook
+ once the last reference is gone. The entry must already be
+ unhashed and its timer stopped (see death_by_timeout()). */
+static void
+destroy_conntrack(struct nf_conntrack *nfct)
+{
+ struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
+
+ IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
+ IP_NF_ASSERT(!timer_pending(&ct->timeout));
+
+ /* If we were an expected connection, release our reference on
+ the master conntrack. */
+ if (ct->master.master)
+ nf_conntrack_put(&ct->master);
+
+ /* Let NAT (or another layer) tear down per-conntrack state. */
+ if (ip_conntrack_destroyed)
+ ip_conntrack_destroyed(ct);
+ kfree(ct);
+ atomic_dec(&ip_conntrack_count);
+}
+
+/* Timer callback: the connection timed out. Unhash both directions,
+ remove any outstanding expectation, and drop the timer's reference
+ (taken in ip_ct_refresh()), which normally frees the entry. */
+static void death_by_timeout(unsigned long ul_conntrack)
+{
+ struct ip_conntrack *ct = (void *)ul_conntrack;
+
+ WRITE_LOCK(&ip_conntrack_lock);
+ /* Remove from both hash lists */
+ LIST_DELETE(&ip_conntrack_hash
+ [hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)],
+ &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
+ LIST_DELETE(&ip_conntrack_hash
+ [hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple)],
+ &ct->tuplehash[IP_CT_DIR_REPLY]);
+ /* If our expected is in the list, take it out. */
+ if (ct->expected.expectant) {
+ IP_NF_ASSERT(list_inlist(&expect_list, &ct->expected));
+ IP_NF_ASSERT(ct->expected.expectant == ct);
+ LIST_DELETE(&expect_list, &ct->expected);
+ }
+ WRITE_UNLOCK(&ip_conntrack_lock);
+ /* Drop the reference the running timer held. */
+ ip_conntrack_put(ct);
+}
+
+/* LIST_FIND callback: entry matches @tuple exactly (whole-struct
+ memcmp, hence the zeroed padding requirement) and does not belong
+ to @ignored_conntrack. */
+static inline int
+conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
+ const struct ip_conntrack_tuple *tuple,
+ const struct ip_conntrack *ignored_conntrack)
+{
+ MUST_BE_READ_LOCKED(&ip_conntrack_lock);
+ return i->ctrack != ignored_conntrack
+ && memcmp(tuple, &i->tuple, sizeof(*tuple)) == 0;
+}
+
+/* Core lookup: walk the bucket for @tuple, skipping entries owned by
+ @ignored_conntrack (may be NULL). Caller holds the read lock; no
+ reference is taken. */
+static struct ip_conntrack_tuple_hash *
+__ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
+ const struct ip_conntrack *ignored_conntrack)
+{
+ struct ip_conntrack_tuple_hash *h;
+
+ MUST_BE_READ_LOCKED(&ip_conntrack_lock);
+ h = LIST_FIND(&ip_conntrack_hash[hash_conntrack(tuple)],
+ conntrack_tuple_cmp,
+ struct ip_conntrack_tuple_hash *,
+ tuple, ignored_conntrack);
+ return h;
+}
+
+/* Find a connection corresponding to a tuple, taking a reference on
+ it while still under the read lock so it cannot vanish before the
+ caller uses it. Caller must ip_conntrack_put() the result. */
+struct ip_conntrack_tuple_hash *
+ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
+ const struct ip_conntrack *ignored_conntrack)
+{
+ struct ip_conntrack_tuple_hash *h;
+
+ READ_LOCK(&ip_conntrack_lock);
+ h = __ip_conntrack_find(tuple, ignored_conntrack);
+ if (h)
+ atomic_inc(&h->ctrack->ct_general.use);
+ READ_UNLOCK(&ip_conntrack_lock);
+
+ return h;
+}
+
+/* Returns true if some connection already answers to @tuple
+ (required by NAT to avoid clashing mappings). Entries belonging
+ to @ignored_conntrack are skipped. No reference is taken. */
+int
+ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
+ const struct ip_conntrack *ignored_conntrack)
+{
+ int taken;
+
+ READ_LOCK(&ip_conntrack_lock);
+ taken = (__ip_conntrack_find(tuple, ignored_conntrack) != NULL);
+ READ_UNLOCK(&ip_conntrack_lock);
+
+ return taken;
+}
+
+/* Returns TRUE if it dealt with ICMP, and filled in skb fields.
+ Handles ICMP error messages (dest-unreach etc.): extracts the
+ embedded original header, inverts it back to the reply direction
+ preserved inside the ICMP payload, and if it belongs to a tracked
+ connection tags the skb IP_CT_RELATED (+IS_REPLY as appropriate).
+ Returns 1 even for bogus/unmatched ICMP so the caller stops
+ processing; 0 means "not an ICMP error, keep going". */
+int icmp_error_track(struct sk_buff *skb)
+{
+ const struct iphdr *iph = skb->nh.iph;
+ struct icmphdr *hdr = (struct icmphdr *)((u_int32_t *)iph + iph->ihl);
+ struct ip_conntrack_tuple innertuple, origtuple;
+ struct iphdr *inner = (struct iphdr *)(hdr + 1);
+ size_t datalen = skb->len - iph->ihl*4 - sizeof(*hdr);
+ struct ip_conntrack_protocol *innerproto;
+ struct ip_conntrack_tuple_hash *h;
+ enum ip_conntrack_info ctinfo;
+
+ if (iph->protocol != IPPROTO_ICMP)
+ return 0;
+
+ if (skb->len < iph->ihl * 4 + sizeof(struct icmphdr)) {
+ DEBUGP("icmp_error_track: too short\n");
+ return 1;
+ }
+
+ /* Only ICMP error types carry an embedded packet we can track. */
+ if (hdr->type != ICMP_DEST_UNREACH
+ && hdr->type != ICMP_SOURCE_QUENCH
+ && hdr->type != ICMP_TIME_EXCEEDED
+ && hdr->type != ICMP_PARAMETERPROB
+ && hdr->type != ICMP_REDIRECT)
+ return 0;
+
+ /* Ignore it if the checksum's bogus. */
+ if (ip_compute_csum((unsigned char *)hdr, sizeof(*hdr) + datalen)) {
+ DEBUGP("icmp_error_track: bad csum\n");
+ return 1;
+ }
+
+ innerproto = find_proto(inner->protocol);
+ /* Are they talking about one of our connections? */
+ if (inner->ihl * 4 + 8 > datalen
+ || !get_tuple(inner, datalen, &origtuple, innerproto)) {
+ DEBUGP("icmp_error: ! get_tuple p=%u (%u*4+%u dlen=%u)\n",
+ inner->protocol, inner->ihl, 8,
+ datalen);
+ return 1;
+ }
+
+ /* Ordinarily, we'd expect the inverted tupleproto, but it's
+ been preserved inside the ICMP. */
+ if (!invert_tuple(&innertuple, &origtuple, innerproto)) {
+ DEBUGP("icmp_error_track: Can't invert tuple\n");
+ return 1;
+ }
+ h = ip_conntrack_find_get(&innertuple, NULL);
+ if (!h) {
+ DEBUGP("icmp_error_track: no match\n");
+ return 1;
+ }
+
+ ctinfo = IP_CT_RELATED;
+ if (DIRECTION(h) == IP_CT_DIR_REPLY)
+ ctinfo += IP_CT_IS_REPLY;
+
+ /* Update skb to refer to this connection; the info slot carries
+ the reference taken by ip_conntrack_find_get(). */
+ skb->nfct = &h->ctrack->infos[ctinfo];
+ return 1;
+}
+
+/* LIST_FIND callback: ask the helper whether it wants to manage a
+ connection whose reply tuple is @rtuple. */
+static inline int helper_cmp(const struct ip_conntrack_helper *i,
+ const struct ip_conntrack_tuple *rtuple)
+{
+ return i->will_help(rtuple);
+}
+
+/* Compare all but the src per-proto part: an expectation matches any
+ source port/id from the expected source address. */
+static int expect_cmp(const struct ip_conntrack_expect *i,
+ const struct ip_conntrack_tuple *tuple)
+{
+ if (tuple->src.ip != i->tuple.src.ip)
+ return 0;
+ if (tuple->dst.ip != i->tuple.dst.ip)
+ return 0;
+ if (tuple->dst.u.all != i->tuple.dst.u.all)
+ return 0;
+ return tuple->dst.protonum == i->tuple.dst.protonum;
+}
+
+/* Allocate a new conntrack; we set everything up, then grab write
+ lock and see if we lost a race. If we lost it we return 0,
+ indicating the controlling code should look again. Returns 1 in
+ every other case, including allocation failure or an uninvertible
+ tuple (the packet then simply stays untracked). */
+static int
+init_conntrack(const struct ip_conntrack_tuple *tuple,
+ struct ip_conntrack_protocol *protocol,
+ struct sk_buff *skb)
+{
+ struct ip_conntrack *conntrack;
+ struct ip_conntrack_tuple repl_tuple;
+ size_t hash, repl_hash;
+ struct ip_conntrack_expect *expected;
+ enum ip_conntrack_info ctinfo;
+ int i;
+
+ if (!invert_tuple(&repl_tuple, tuple, protocol)) {
+ DEBUGP("Can't invert tuple.\n");
+ return 1;
+ }
+
+ /* Enforce the sysctl-tunable table size limit (0 = unlimited). */
+ if(ip_conntrack_max &&
+ (atomic_read(&ip_conntrack_count) >= ip_conntrack_max)) {
+ if (net_ratelimit())
+ printk(KERN_WARNING "ip_conntrack: maximum limit of %d entries exceeded\n", ip_conntrack_max);
+ return 1;
+ }
+
+ conntrack = kmalloc(sizeof(struct ip_conntrack), GFP_ATOMIC);
+ if (!conntrack) {
+ DEBUGP("Can't allocate conntrack.\n");
+ return 1;
+ }
+ hash = hash_conntrack(tuple);
+ repl_hash = hash_conntrack(&repl_tuple);
+
+ /* One reference for the skb's info pointer set below. */
+ memset(conntrack, 0, sizeof(struct ip_conntrack));
+ atomic_set(&conntrack->ct_general.use, 1);
+ conntrack->ct_general.destroy = destroy_conntrack;
+ conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple;
+ conntrack->tuplehash[IP_CT_DIR_ORIGINAL].ctrack = conntrack;
+ conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple;
+ conntrack->tuplehash[IP_CT_DIR_REPLY].ctrack = conntrack;
+ /* All info slots share the same refcounted master (see
+ ip_conntrack_put()). */
+ for(i=0; i < IP_CT_NUMBER; i++)
+ conntrack->infos[i].master = &conntrack->ct_general;
+
+ /* Give the protocol module first look; it may reject. */
+ if (!protocol->new(conntrack, skb->nh.iph, skb->len)) {
+ kfree(conntrack);
+ return 1;
+ }
+
+ /* Sew in at head of hash list. */
+ WRITE_LOCK(&ip_conntrack_lock);
+ /* Check noone else beat us in the race... */
+ if (__ip_conntrack_find(tuple, NULL)) {
+ WRITE_UNLOCK(&ip_conntrack_lock);
+ printk("ip_conntrack: Wow someone raced us!\n");
+ kfree(conntrack);
+ return 0;
+ }
+ /* Helpers key off the reply direction. */
+ conntrack->helper = LIST_FIND(&helpers, helper_cmp,
+ struct ip_conntrack_helper *,
+ &repl_tuple);
+ /* Need finding and deleting of expected ONLY if we win race */
+ expected = LIST_FIND(&expect_list, expect_cmp,
+ struct ip_conntrack_expect *, tuple);
+ if (expected) {
+ /* Welcome, Mr. Bond. We've been expecting you... */
+ conntrack->status = IPS_EXPECTED;
+ conntrack->master.master = &expected->expectant->ct_general;
+ IP_NF_ASSERT(conntrack->master.master);
+ LIST_DELETE(&expect_list, expected);
+ expected->expectant = NULL;
+ nf_conntrack_get(&conntrack->master);
+ ctinfo = IP_CT_RELATED;
+ } else {
+ ctinfo = IP_CT_NEW;
+ }
+ list_prepend(&ip_conntrack_hash[hash],
+ &conntrack->tuplehash[IP_CT_DIR_ORIGINAL]);
+ list_prepend(&ip_conntrack_hash[repl_hash],
+ &conntrack->tuplehash[IP_CT_DIR_REPLY]);
+ WRITE_UNLOCK(&ip_conntrack_lock);
+
+ /* Update skb to refer to this connection */
+ skb->nfct = &conntrack->infos[ctinfo];
+
+ atomic_inc(&ip_conntrack_count);
+ return 1;
+}
+
+/* Look up (or create) the conntrack entry for a non-ICMP-error skb
+ and point skb->nfct at the appropriate info slot. Leaves the skb
+ untagged when the tuple cannot be extracted or creation fails. */
+static void
+resolve_normal_ct(struct sk_buff *skb)
+{
+ struct ip_conntrack_tuple tuple;
+ struct ip_conntrack_tuple_hash *h;
+ struct ip_conntrack_protocol *proto;
+ enum ip_conntrack_info ctinfo;
+
+ proto = find_proto(skb->nh.iph->protocol);
+ if (!get_tuple(skb->nh.iph, skb->len, &tuple, proto))
+ return;
+
+ /* Loop around search/insert race: init_conntrack() returns 0
+ when another CPU inserted first, so we look up again. */
+ do {
+ /* look for tuple match */
+ h = ip_conntrack_find_get(&tuple, NULL);
+ if (!h && init_conntrack(&tuple, proto, skb))
+ return;
+ } while (!h);
+
+ /* It exists; we have (non-exclusive) reference. */
+ if (DIRECTION(h) == IP_CT_DIR_REPLY) {
+ ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
+ h->ctrack->status |= IPS_SEEN_REPLY;
+ } else {
+ /* Once we've had two way comms, always ESTABLISHED. */
+ if (h->ctrack->status & IPS_SEEN_REPLY) {
+ DEBUGP("ip_conntrack_in: normal packet for %p\n",
+ h->ctrack);
+ ctinfo = IP_CT_ESTABLISHED;
+ } else if (h->ctrack->status & IPS_EXPECTED) {
+ DEBUGP("ip_conntrack_in: related packet for %p\n",
+ h->ctrack);
+ ctinfo = IP_CT_RELATED;
+ } else {
+ DEBUGP("ip_conntrack_in: new packet for %p\n",
+ h->ctrack);
+ ctinfo = IP_CT_NEW;
+ }
+ }
+ /* The info slot inherits the reference from find_get(). */
+ skb->nfct = &h->ctrack->infos[ctinfo];
+}
+
+/* Return conntrack and conntrack_info for a given skb, resolving the
+ connection first if the skb is not yet tagged. *ctinfo is derived
+ from which info slot skb->nfct points into. Returns NULL if the
+ skb is not part of any trackable connection. */
+struct ip_conntrack *
+ip_conntrack_get(struct sk_buff *skb, enum ip_conntrack_info *ctinfo)
+{
+ if (!skb->nfct) {
+ /* It may be an icmp error... */
+ if (!icmp_error_track(skb))
+ resolve_normal_ct(skb);
+ }
+
+ if (skb->nfct) {
+ struct ip_conntrack *ct
+ = (struct ip_conntrack *)skb->nfct->master;
+
+ /* ctinfo is the index of the nfct inside the conntrack */
+ *ctinfo = skb->nfct - ct->infos;
+ IP_NF_ASSERT(*ctinfo >= 0 && *ctinfo < IP_CT_NUMBER);
+ return ct;
+ }
+ return NULL;
+}
+
+/* Netfilter hook itself: defragment, associate the skb with a
+ connection, run the protocol state machine and any helper.
+ Verdict comes from the protocol/helper; packets the protocol
+ deems invalid are untagged and accepted. */
+unsigned int ip_conntrack_in(unsigned int hooknum,
+ struct sk_buff **pskb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ struct ip_conntrack *ct;
+ enum ip_conntrack_info ctinfo;
+ struct ip_conntrack_protocol *proto;
+ int ret;
+
+ /* FIXME: Do this right please. --RR */
+ (*pskb)->nfcache |= NFC_UNKNOWN;
+
+ /* Previously seen (loopback)? Ignore. Do this before
+ fragment check. */
+ if ((*pskb)->nfct)
+ return NF_ACCEPT;
+
+ /* Gather fragments. */
+ if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) {
+ *pskb = ip_ct_gather_frags(*pskb);
+ if (!*pskb)
+ return NF_STOLEN;
+ }
+
+ ct = ip_conntrack_get(*pskb, &ctinfo);
+ if (!ct)
+ /* Not valid part of a connection */
+ return NF_ACCEPT;
+
+ proto = find_proto((*pskb)->nh.iph->protocol);
+ /* If this is new, this is first time timer will be set */
+ ret = proto->packet(ct, (*pskb)->nh.iph, (*pskb)->len, ctinfo);
+
+ if (ret == -1) {
+ /* Invalid: drop our reference and let it pass untracked. */
+ nf_conntrack_put((*pskb)->nfct);
+ (*pskb)->nfct = NULL;
+ return NF_ACCEPT;
+ }
+
+ if (ret != NF_DROP && ct->helper) {
+ ret = ct->helper->help((*pskb)->nh.iph, (*pskb)->len,
+ ct, ctinfo);
+ if (ret == -1) {
+ /* Invalid */
+ nf_conntrack_put((*pskb)->nfct);
+ (*pskb)->nfct = NULL;
+ return NF_ACCEPT;
+ }
+ }
+
+ return ret;
+}
+
+/* Invert a tuple, resolving the protocol handler internally
+ (contrast invert_tuple(), where the caller supplies it). */
+int invert_tuplepr(struct ip_conntrack_tuple *inverse,
+ const struct ip_conntrack_tuple *orig)
+{
+ struct ip_conntrack_protocol *proto;
+
+ proto = find_proto(orig->dst.protonum);
+ return invert_tuple(inverse, orig, proto);
+}
+
+/* Add a related connection: record that @related_to expects a future
+ connection matching @tuple (cmp. expect_cmp() -- src per-proto part
+ is ignored). Each conntrack carries exactly one expectation slot;
+ calling again just overwrites the tuple. Always returns 0. */
+int ip_conntrack_expect_related(struct ip_conntrack *related_to,
+ const struct ip_conntrack_tuple *tuple)
+{
+ WRITE_LOCK(&ip_conntrack_lock);
+ related_to->expected.tuple = *tuple;
+
+ if (!related_to->expected.expectant) {
+ list_prepend(&expect_list, &related_to->expected);
+ related_to->expected.expectant = related_to;
+ } else {
+ /* Already queued: just sanity-check consistency. */
+ IP_NF_ASSERT(list_inlist(&expect_list, &related_to->expected));
+ IP_NF_ASSERT(related_to->expected.expectant
+ == related_to);
+ }
+ WRITE_UNLOCK(&ip_conntrack_lock);
+
+ return 0;
+}
+
+/* Alter reply tuple (maybe alter helper). If it's already taken,
+ return 0 and don't do alteration. Used by NAT: rehashes the reply
+ direction under the new tuple and re-selects the helper. */
+int ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
+ const struct ip_conntrack_tuple *newreply)
+{
+ unsigned int newindex = hash_conntrack(newreply);
+
+ WRITE_LOCK(&ip_conntrack_lock);
+ /* Refuse if another connection (not us) owns the new tuple. */
+ if (__ip_conntrack_find(newreply, conntrack)) {
+ WRITE_UNLOCK(&ip_conntrack_lock);
+ return 0;
+ }
+ DEBUGP("Altering reply tuple of %p to ", conntrack);
+ DUMP_TUPLE(newreply);
+
+ /* Unhash under the old reply tuple, rehash under the new one. */
+ LIST_DELETE(&ip_conntrack_hash
+ [hash_conntrack(&conntrack->tuplehash[IP_CT_DIR_REPLY]
+ .tuple)],
+ &conntrack->tuplehash[IP_CT_DIR_REPLY]);
+ conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
+ list_prepend(&ip_conntrack_hash[newindex],
+ &conntrack->tuplehash[IP_CT_DIR_REPLY]);
+ /* The helper choice keys off the reply tuple; re-evaluate. */
+ conntrack->helper = LIST_FIND(&helpers, helper_cmp,
+ struct ip_conntrack_helper *,
+ newreply);
+ WRITE_UNLOCK(&ip_conntrack_lock);
+ return 1;
+}
+
+/* Register a connection helper (e.g. FTP). Pins this module while
+ any helper is registered; undone in the unregister path. Always
+ returns 0. */
+int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
+{
+ MOD_INC_USE_COUNT;
+
+ WRITE_LOCK(&ip_conntrack_lock);
+ list_prepend(&helpers, me);
+ WRITE_UNLOCK(&ip_conntrack_lock);
+
+ return 0;
+}
+
+/* LIST_FIND_W callback: detach helper @me from a conntrack and drop
+ any expectation it had queued. Always returns 0 so the walk visits
+ every entry. Caller holds the write lock. */
+static inline int unhelp(struct ip_conntrack_tuple_hash *i,
+ const struct ip_conntrack_helper *me)
+{
+ if (i->ctrack->helper == me) {
+ i->ctrack->helper = NULL;
+ /* Get rid of any expected. */
+ if (i->ctrack->expected.expectant) {
+ IP_NF_ASSERT(i->ctrack->expected.expectant
+ == i->ctrack);
+ LIST_DELETE(&expect_list, &i->ctrack->expected);
+ i->ctrack->expected.expectant = NULL;
+ }
+ }
+ return 0;
+}
+
+/* Unregister a helper: remove it from the list and scrub every
+ conntrack that still references it, then wait out any softirq that
+ might be using it before letting the module count drop. */
+void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
+{
+ unsigned int i;
+
+ /* Need write lock here, to delete helper. */
+ WRITE_LOCK(&ip_conntrack_lock);
+ LIST_DELETE(&helpers, me);
+
+ /* Get rid of expecteds, set helpers to NULL. */
+ for (i = 0; i < ip_conntrack_htable_size; i++)
+ LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
+ struct ip_conntrack_tuple_hash *, me);
+ WRITE_UNLOCK(&ip_conntrack_lock);
+
+ /* Someone could be still looking at the helper in a bh. */
+ br_write_lock_bh(BR_NETPROTO_LOCK);
+ br_write_unlock_bh(BR_NETPROTO_LOCK);
+
+ MOD_DEC_USE_COUNT;
+}
+
+/* Refresh conntrack for this many jiffies: if noone calls this,
+ conntrack will vanish with current skb. The first call arms the
+ timer and takes the timer's reference; later calls just push the
+ expiry out. */
+void ip_ct_refresh(struct ip_conntrack *ct, unsigned long extra_jiffies)
+{
+ WRITE_LOCK(&ip_conntrack_lock);
+ /* If this hasn't had a timer before, it's still being set up */
+ if (ct->timeout.data == 0) {
+ ct->timeout.data = (unsigned long)ct;
+ ct->timeout.function = death_by_timeout;
+ ct->timeout.expires = jiffies + extra_jiffies;
+ /* Reference held by the pending timer; dropped in
+ death_by_timeout(). */
+ atomic_inc(&ct->ct_general.use);
+ add_timer(&ct->timeout);
+ } else {
+ /* Need del_timer for race avoidance (may already be dying). */
+ if (del_timer(&ct->timeout)) {
+ ct->timeout.expires = jiffies + extra_jiffies;
+ add_timer(&ct->timeout);
+ }
+ }
+ WRITE_UNLOCK(&ip_conntrack_lock);
+}
+
+/* Returns new sk_buff, or NULL. Defragments @skb, preserving socket
+ ownership and (under debug) the netfilter path trace across the
+ reassembly, and recomputes the IP checksum of the result. */
+struct sk_buff *
+ip_ct_gather_frags(struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+#ifdef CONFIG_NETFILTER_DEBUG
+ unsigned int olddebug = skb->nf_debug;
+#endif
+ /* Hold the socket: ip_defrag() may free this skb. */
+ if (sk) sock_hold(sk);
+ skb = ip_defrag(skb);
+ if (!skb) {
+ if (sk) sock_put(sk);
+ return skb;
+ }
+ if (sk) {
+ skb_set_owner_w(skb, sk);
+ sock_put(sk);
+ }
+
+ ip_send_check(skb->nh.iph);
+ skb->nfcache |= NFC_ALTERED;
+#ifdef CONFIG_NETFILTER_DEBUG
+ /* Packet path as if nothing had happened. */
+ skb->nf_debug = olddebug;
+#endif
+ return skb;
+}
+
+/* LIST_FIND callback: adapt a caller-supplied kill predicate to the
+ tuple-hash entry the list walk hands us. */
+static inline int
+do_kill(const struct ip_conntrack_tuple_hash *i,
+ int (*kill)(const struct ip_conntrack *i, void *data),
+ void *data)
+{
+ return kill(i->ctrack, data);
+}
+
+/* Bring out ya dead! Scan the whole table for the first entry the
+ @kill predicate selects; returns it with a reference taken, or
+ NULL when none match. */
+static struct ip_conntrack_tuple_hash *
+get_next_corpse(int (*kill)(const struct ip_conntrack *i, void *data),
+ void *data)
+{
+ struct ip_conntrack_tuple_hash *h = NULL;
+ unsigned int i;
+
+ READ_LOCK(&ip_conntrack_lock);
+ for (i = 0; !h && i < ip_conntrack_htable_size; i++) {
+ h = LIST_FIND(&ip_conntrack_hash[i], do_kill,
+ struct ip_conntrack_tuple_hash *, kill, data);
+ }
+ if (h)
+ atomic_inc(&h->ctrack->ct_general.use);
+ READ_UNLOCK(&ip_conntrack_lock);
+
+ return h;
+}
+
+/* Destroy every conntrack the @kill predicate selects, by forcing
+ each one's timeout to fire early. */
+void
+ip_ct_selective_cleanup(int (*kill)(const struct ip_conntrack *i, void *data),
+ void *data)
+{
+ struct ip_conntrack_tuple_hash *h;
+
+ /* This is order n^2, by the way. */
+ while ((h = get_next_corpse(kill, data)) != NULL) {
+ /* Time to push up daises... */
+ if (del_timer(&h->ctrack->timeout))
+ death_by_timeout((unsigned long)h->ctrack);
+ /* ... else the timer will get him soon. */
+
+ /* Drop the reference get_next_corpse() took. */
+ ip_conntrack_put(h->ctrack);
+ }
+}
+
+/* Fast function for those who don't want to parse /proc (and I don't
+ blame them). */
+/* Reversing the socket's dst/src point of view gives us the reply
+ mapping. */
+/* SO_ORIGINAL_DST getsockopt handler: look the socket's connection
+ up as a reply tuple and copy the pre-NAT destination to userspace.
+ NOTE(review): DEBUGP formats a sizeof (size_t) with %u -- harmless
+ while DEBUGP is compiled out, but fix before enabling debug. */
+static int
+getorigdst(struct sock *sk, int optval, void *user, int *len)
+{
+ struct ip_conntrack_tuple_hash *h;
+ struct ip_conntrack_tuple tuple = { { sk->rcv_saddr, { sk->sport },
+ 0 },
+ { sk->daddr, { sk->dport },
+ IPPROTO_TCP } };
+
+ /* We only do TCP at the moment: is there a better way? */
+ if (strcmp(sk->prot->name, "TCP") != 0) {
+ DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
+ return -ENOPROTOOPT;
+ }
+
+ if (*len != sizeof(struct sockaddr_in)) {
+ DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
+ *len, sizeof(struct sockaddr_in));
+ return -EINVAL;
+ }
+
+ h = ip_conntrack_find_get(&tuple, NULL);
+ if (h) {
+ struct sockaddr_in sin;
+
+ sin.sin_family = AF_INET;
+ /* Original direction's destination = pre-NAT target. */
+ sin.sin_port = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
+ .tuple.dst.u.tcp.port;
+ sin.sin_addr.s_addr = h->ctrack->tuplehash[IP_CT_DIR_ORIGINAL]
+ .tuple.dst.ip;
+
+ DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
+ IP_PARTS(sin.sin_addr.s_addr), ntohs(sin.sin_port));
+ ip_conntrack_put(h->ctrack);
+ if (copy_to_user(user, &sin, sizeof(sin)) != 0)
+ return -EFAULT;
+ else
+ return 0;
+ }
+ DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
+ IP_PARTS(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
+ IP_PARTS(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
+ return -ENOENT;
+}
+
+/* Sockopt registration: serves getsockopt(SO_ORIGINAL_DST) on
+ PF_INET via getorigdst(); no setsockopts. */
+static struct nf_sockopt_ops so_getorigdst
+= { { NULL, NULL }, PF_INET,
+ 0, 0, NULL, /* Setsockopts */
+ SO_ORIGINAL_DST, SO_ORIGINAL_DST+1, &getorigdst,
+ 0, NULL };
+
+/* Sysctl plumbing: exposes net.ipv4.ip_conntrack_max (table-size
+ cap enforced in init_conntrack()) as a writable integer. */
+#define NET_IP_CONNTRACK_MAX 2089
+#define NET_IP_CONNTRACK_MAX_NAME "ip_conntrack_max"
+
+static struct ctl_table_header *ip_conntrack_sysctl_header;
+
+static ctl_table ip_conntrack_table[] = {
+ { NET_IP_CONNTRACK_MAX, NET_IP_CONNTRACK_MAX_NAME, &ip_conntrack_max,
+ sizeof(ip_conntrack_max), 0644, NULL, proc_dointvec },
+ { 0 }
+};
+
+/* Directory nodes anchoring the table at net.ipv4. */
+static ctl_table ip_conntrack_dir_table[] = {
+ {NET_IPV4, "ipv4", NULL, 0, 0555, ip_conntrack_table, 0, 0, 0, 0, 0},
+ { 0 }
+};
+
+static ctl_table ip_conntrack_root_table[] = {
+ {CTL_NET, "net", NULL, 0, 0555, ip_conntrack_dir_table, 0, 0, 0, 0, 0},
+ { 0 }
+};
+
+/* ip_ct_selective_cleanup() predicate selecting every entry; used to
+ flush the whole table on shutdown. */
+static int kill_all(const struct ip_conntrack *i, void *data)
+{
+ return 1;
+}
+
+/* Mishearing the voices in his head, our hero wonders how he's
+ supposed to kill the mall. */
+void ip_conntrack_cleanup(void)
+{
+ unregister_sysctl_table(ip_conntrack_sysctl_header);
+ ip_ct_selective_cleanup(kill_all, NULL);
+ vfree(ip_conntrack_hash);
+ nf_unregister_sockopt(&so_getorigdst);
+}
+
+/* Subsystem init: size and allocate the hash table by available
+ memory, register the SO_ORIGINAL_DST sockopt, the builtin protocol
+ trackers, the sysctl table and TCP's own init. Fully unwinds on
+ any failure. */
+int __init ip_conntrack_init(void)
+{
+ unsigned int i;
+ int ret;
+
+ /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB
+ * machine has 256 buckets. 1GB machine has 8192 buckets. */
+ ip_conntrack_htable_size
+ = (((num_physpages << PAGE_SHIFT) / 16384)
+ / sizeof(struct list_head));
+ ip_conntrack_max = 8 * ip_conntrack_htable_size;
+
+ printk("ip_conntrack (%u buckets, %d max)\n",
+ ip_conntrack_htable_size, ip_conntrack_max);
+
+ ret = nf_register_sockopt(&so_getorigdst);
+ if (ret != 0)
+ return ret;
+
+ ip_conntrack_hash = vmalloc(sizeof(struct list_head)
+ * ip_conntrack_htable_size);
+ if (!ip_conntrack_hash) {
+ nf_unregister_sockopt(&so_getorigdst);
+ return -ENOMEM;
+ }
+
+ /* Don't NEED lock here, but good form anyway. */
+ WRITE_LOCK(&ip_conntrack_lock);
+ /* Sew in builtin protocols. */
+ list_append(&protocol_list, &ip_conntrack_protocol_tcp);
+ list_append(&protocol_list, &ip_conntrack_protocol_udp);
+ list_append(&protocol_list, &ip_conntrack_protocol_icmp);
+ WRITE_UNLOCK(&ip_conntrack_lock);
+
+ for (i = 0; i < ip_conntrack_htable_size; i++)
+ INIT_LIST_HEAD(&ip_conntrack_hash[i]);
+
+/* Sysctl registration failure can only be detected when
+ CONFIG_SYSCTL is compiled in. --RR */
+#ifdef CONFIG_SYSCTL
+ ip_conntrack_sysctl_header
+ = register_sysctl_table(ip_conntrack_root_table, 0);
+ if (ip_conntrack_sysctl_header == NULL) {
+ vfree(ip_conntrack_hash);
+ nf_unregister_sockopt(&so_getorigdst);
+ return -ENOMEM;
+ }
+#endif /*CONFIG_SYSCTL*/
+
+ ret = ip_conntrack_protocol_tcp_init();
+ if (ret != 0) {
+ /* Unwind must mirror the registration above: with
+ CONFIG_SYSCTL off there is nothing to unregister (and
+ the call would not even link). */
+#ifdef CONFIG_SYSCTL
+ unregister_sysctl_table(ip_conntrack_sysctl_header);
+#endif /*CONFIG_SYSCTL*/
+ vfree(ip_conntrack_hash);
+ nf_unregister_sockopt(&so_getorigdst);
+ }
+
+ return ret;
+}
+
FUNET's LINUX-ADM group, linux-adm@nic.funet.fi
TCL-scripts by Sam Shen (who was at: slshen@lbl.gov)