/* rrl.c - Response Rate Limiting for NSD.
 * By W.C.A. Wijngaards
 * Copyright 2012, NLnet Labs.
 * BSD, see LICENSE.
 */
#include "config.h"
#include <errno.h>
#include "rrl.h"
#include "util.h"
#include "lookup3.h"
#include "options.h"

#ifdef RATELIMIT

#ifdef HAVE_MMAP
#include <sys/mman.h>
#if defined(MAP_ANON) && !defined(MAP_ANONYMOUS)
#define MAP_ANONYMOUS   MAP_ANON
#endif
#endif /* HAVE_MMAP */


/**
 * The rate limiting data structure bucket, this represents one rate of
 * packets from a single source.
 * Smoothed average rates.
 */
struct rrl_bucket {
	/* the source netmask */
	uint64_t source;
	/* rate, in queries per second, which due to rate=r(t)+r(t-1)/2 is
	 * equal to double the queries per second */
	uint32_t rate;
	/* the full hash */
	uint32_t hash;
	/* counter for queries arrived in this second */
	uint32_t counter;
	/* timestamp, which time is the time of the counter, the rate is from
	 * one timestep before that. */
	int32_t stamp;
	/* flags for the source mask and type */
	uint16_t flags;
};

/* the (global) array of RRL buckets */
static struct rrl_bucket* rrl_array = NULL;
static size_t rrl_array_size = RRL_BUCKETS;
static uint32_t rrl_ratelimit = RRL_LIMIT; /* 2x qps */
static uint8_t rrl_slip_ratio = RRL_SLIP;
static uint8_t rrl_ipv4_prefixlen = RRL_IPV4_PREFIX_LENGTH;
static uint8_t rrl_ipv6_prefixlen = RRL_IPV6_PREFIX_LENGTH;
static uint64_t rrl_ipv6_mask; /* max prefixlen 64 */
static uint32_t rrl_whitelist_ratelimit = RRL_WLIST_LIMIT; /* 2x qps */

/* the array of mmaps for the children (saved between reloads) */
static void** rrl_maps = NULL;
static size_t rrl_maps_num = 0;

void rrl_mmap_init(int numch, size_t numbuck, size_t lm, size_t wlm, size_t sm,
	size_t plf, size_t pls)
{
#ifdef HAVE_MMAP
	size_t i;
#endif
	if(numbuck != 0)
		rrl_array_size = numbuck;
	rrl_ratelimit = lm*2;
	rrl_slip_ratio = sm;
	rrl_ipv4_prefixlen = plf;
	rrl_ipv6_prefixlen = pls;
	if (pls <= 32) {
		rrl_ipv6_mask = ((uint64_t) htonl(0xffffffff << (32-pls))) << 32;
	} else {
		rrl_ipv6_mask =  ((uint64_t) htonl(0xffffffff << (64-pls))) |
			(((uint64_t)0xffffffff)<<32);
	}
	rrl_whitelist_ratelimit = wlm*2;
#ifdef HAVE_MMAP
	/* allocate the ratelimit hashtable in a memory map so it is
	 * preserved across reforks (every child its own table) */
	rrl_maps_num = (size_t)numch;
	rrl_maps = (void**)xmallocarray(rrl_maps_num, sizeof(void*));
	for(i=0; i<rrl_maps_num; i++) {
		rrl_maps[i] = mmap(NULL,
			sizeof(struct rrl_bucket)*rrl_array_size,
			PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0);
		if(rrl_maps[i] == MAP_FAILED) {
			log_msg(LOG_ERR, "rrl: mmap failed: %s",
				strerror(errno));
			exit(1);
		}
		memset(rrl_maps[i], 0,
			sizeof(struct rrl_bucket)*rrl_array_size);
	}
#else
	(void)numch;
	rrl_maps_num = 0;
	rrl_maps = NULL;
#endif
}

void rrl_set_limit(size_t lm, size_t wlm, size_t sm)
{
	rrl_ratelimit = lm*2;
	rrl_whitelist_ratelimit = wlm*2;
	rrl_slip_ratio = sm;
}

void rrl_init(size_t ch)
{
	if(!rrl_maps || ch >= rrl_maps_num)
	    rrl_array = xalloc_array_zero(sizeof(struct rrl_bucket),
	    	rrl_array_size);
#ifdef HAVE_MMAP
	else rrl_array = (struct rrl_bucket*)rrl_maps[ch];
#endif
}

/** return the source netblock of the query, this is the genuine source
 * for genuine queries and the target for reflected packets */
static uint64_t rrl_get_source(query_type* query, uint16_t* c2)
{
	/* note there is an IPv6 subnet, that maps
	 * to the same buckets as IPv4 space, but there is a flag in c2
	 * that makes the hash different */
#ifdef INET6
	if( ((struct sockaddr_in*)&query->addr)->sin_family == AF_INET) {
		*c2 = 0;
		return ((struct sockaddr_in*)&query->addr)->
			sin_addr.s_addr & htonl(0xffffffff << (32-rrl_ipv4_prefixlen));
	} else {
		uint64_t s;
		*c2 = rrl_ip6;
		memmove(&s, &((struct sockaddr_in6*)&query->addr)->sin6_addr,
			sizeof(s));
		return s & rrl_ipv6_mask;
	}
#else
	*c2 = 0;
	return query->addr.sin_addr.s_addr & htonl(0xffffffff << (32-rrl_ipv4_prefixlen));
#endif
}

/** debug source to string */
static const char* rrlsource2str(uint64_t s, uint16_t c2)
{
	static char buf[64];
	struct in_addr a4;
#ifdef INET6
	if(c2) {
		/* IPv6 */
		struct in6_addr a6;
		memset(&a6, 0, sizeof(a6));
		memmove(&a6, &s, sizeof(s));
		if(!inet_ntop(AF_INET6, &a6, buf, sizeof(buf)))
			strlcpy(buf, "[ip6 ntop failed]", sizeof(buf));
		else {
			static char prefix[5];
			snprintf(prefix, sizeof(prefix), "/%d", rrl_ipv6_prefixlen);
			strlcat(buf, &prefix[0], sizeof(buf));
		}
		return buf;
	}
#else
	(void)c2;
#endif
	/* ipv4 */
	a4.s_addr = (uint32_t)s;
	if(!inet_ntop(AF_INET, &a4, buf, sizeof(buf)))
		strlcpy(buf, "[ip4 ntop failed]", sizeof(buf));
	else {
		static char prefix[5];
		snprintf(prefix, sizeof(prefix), "/%d", rrl_ipv4_prefixlen);
		strlcat(buf, &prefix[0], sizeof(buf));
	}
	return buf;
}

enum rrl_type rrlstr2type(const char* s)
{
	if(strcmp(s, "nxdomain")==0) return rrl_type_nxdomain;
	else if(strcmp(s, "error")==0) return rrl_type_error;
	else if(strcmp(s, "referral")==0) return rrl_type_referral;
	else if(strcmp(s, "any")==0) return rrl_type_any;
	else if(strcmp(s, "wildcard")==0) return rrl_type_wildcard;
	else if(strcmp(s, "nodata")==0) return rrl_type_nodata;
	else if(strcmp(s, "dnskey")==0) return rrl_type_dnskey;
	else if(strcmp(s, "positive")==0) return rrl_type_positive;
	else if(strcmp(s, "rrsig")==0) return rrl_type_rrsig;
	else if(strcmp(s, "all")==0) return rrl_type_all;
	return 0; /* unknown */
}

const char* rrltype2str(enum rrl_type c)
{
	switch(c & 0x0fff) {
		case rrl_type_nxdomain: return "nxdomain";
		case rrl_type_error: return "error";
		case rrl_type_referral: return "referral";
		case rrl_type_any: return "any";
		case rrl_type_wildcard: return "wildcard";
		case rrl_type_nodata: return "nodata";
		case rrl_type_dnskey: return "dnskey";
		case rrl_type_positive: return "positive";
		case rrl_type_rrsig: return "rrsig";
		case rrl_type_all: return "all";
	}
	return "unknown";
}

/** classify the query in a number of different types, each has separate
 * ratelimiting, so that positive queries are not impeded by others */
static uint16_t rrl_classify(query_type* query, const uint8_t** d,
	size_t* d_len)
{
	if(RCODE(query->packet) == RCODE_NXDOMAIN) {
		if(query->zone && query->zone->apex) {
			*d = dname_name(domain_dname(query->zone->apex));
			*d_len = domain_dname(query->zone->apex)->name_size;
		}
		return rrl_type_nxdomain;
	}
	if(RCODE(query->packet) != RCODE_OK) {
		if(query->zone && query->zone->apex) {
			*d = dname_name(domain_dname(query->zone->apex));
			*d_len = domain_dname(query->zone->apex)->name_size;
		}
		return rrl_type_error;
	}
	if(query->delegation_domain) {
		*d = dname_name(domain_dname(query->delegation_domain));
		*d_len = domain_dname(query->delegation_domain)->name_size;
		return rrl_type_referral;
	}
	if(query->qtype == TYPE_ANY) {
		if(query->qname) {
			*d = dname_name(query->qname);
			*d_len = query->qname->name_size;
		}
		return rrl_type_any;
	}
	if(query->qtype == TYPE_RRSIG) {
		if(query->qname) {
			*d = dname_name(query->qname);
			*d_len = query->qname->name_size;
		}
		return rrl_type_rrsig;
	}
	if(query->wildcard_domain) {
		*d = dname_name(domain_dname(query->wildcard_domain));
		*d_len = domain_dname(query->wildcard_domain)->name_size;
		return rrl_type_wildcard;
	}
	if(ANCOUNT(query->packet) == 0) {
		if(query->zone && query->zone->apex) {
			*d = dname_name(domain_dname(query->zone->apex));
			*d_len = domain_dname(query->zone->apex)->name_size;
		}
		return rrl_type_nodata;
	}
	if(query->qtype == TYPE_DNSKEY) {
		if(query->qname) {
			*d = dname_name(query->qname);
			*d_len = query->qname->name_size;
		}
		return rrl_type_dnskey;
	}
	/* positive */
	if(query->qname) {
		*d = dname_name(query->qname);
		*d_len = query->qname->name_size;
	}
	return rrl_type_positive;
}

/** Examine the query and return hash and source of netblock. */
static void examine_query(query_type* query, uint32_t* hash, uint64_t* source,
	uint16_t* flags, uint32_t* lm)
{
	/* compile a binary string representing the query */
	uint16_t c, c2;
	/* size with 16 bytes to spare */
	uint8_t buf[MAXDOMAINLEN + sizeof(*source) + sizeof(c) + 16];
	const uint8_t* dname = NULL; size_t dname_len = 0;
	uint32_t r = 0x267fcd16;

	*source = rrl_get_source(query, &c2);
	c = rrl_classify(query, &dname, &dname_len);
	if(query->zone && query->zone->opts &&
		(query->zone->opts->pattern->rrl_whitelist & c))
		*lm = rrl_whitelist_ratelimit;
	if(*lm == 0) return;
	c |= c2;
	*flags = c;
	memmove(buf, source, sizeof(*source));
	memmove(buf+sizeof(*source), &c, sizeof(c));

	DEBUG(DEBUG_QUERY, 1, (LOG_INFO, "rrl_examine type %s name %s", rrltype2str(c), dname?wiredname2str(dname):"NULL"));

	/* and hash it */
	if(dname && dname_len <= MAXDOMAINLEN) {
		memmove(buf+sizeof(*source)+sizeof(c), dname, dname_len);
		*hash = hashlittle(buf, sizeof(*source)+sizeof(c)+dname_len, r);
	} else
		*hash = hashlittle(buf, sizeof(*source)+sizeof(c), r);
}

/* age the bucket because elapsed time steps have gone by */
static void rrl_attenuate_bucket(struct rrl_bucket* b, int32_t elapsed)
{
	if(elapsed > 16) {
		b->rate = 0;
	} else {
		/* divide rate /2 for every elapsed time step, because
		 * the counters in the inbetween steps were 0 */
		/* r(t) = 0 + 0/2 + 0/4 + .. + oldrate/2^dt */
		b->rate >>= elapsed;
		/* we know that elapsed >= 2 */
		b->rate += (b->counter>>(elapsed-1));
	}
}

/** log a message about ratelimits */
static void
rrl_msg(query_type* query, const char* str)
{
	uint16_t c, c2, wl = 0;
	const uint8_t* d = NULL;
	size_t d_len;
	uint64_t s;
	char address[128];
	if(verbosity < 1) return;
	addr2str(&query->addr, address, sizeof(address));
	s = rrl_get_source(query, &c2);
	c = rrl_classify(query, &d, &d_len) | c2;
	if(query->zone && query->zone->opts &&
		(query->zone->opts->pattern->rrl_whitelist & c))
		wl = 1;
	log_msg(LOG_INFO, "ratelimit %s %s type %s%s target %s query %s %s",
		str, d?wiredname2str(d):"", rrltype2str(c),
		wl?"(whitelisted)":"", rrlsource2str(s, c2),
		address, rrtype_to_string(query->qtype));
}

/** true if the query used to be blocked by the ratelimit */
static int
used_to_block(uint32_t rate, uint32_t counter, uint32_t lm)
{
	return rate >= lm || counter+rate/2 >= lm;
}

/** update the rate in a ratelimit bucket, return actual rate */
uint32_t rrl_update(query_type* query, uint32_t hash, uint64_t source,
	uint16_t flags, int32_t now, uint32_t lm)
{
	struct rrl_bucket* b = &rrl_array[hash % rrl_array_size];

	DEBUG(DEBUG_QUERY, 1, (LOG_INFO, "source %llx hash %x oldrate %d oldcount %d stamp %d",
		(long long unsigned)source, hash, b->rate, b->counter, b->stamp));

	/* check if different source */
	if(b->source != source || b->flags != flags || b->hash != hash) {
		/* initialise */
		/* potentially the wrong limit here, used lower nonwhitelim */
		if(verbosity >= 1 &&
			used_to_block(b->rate, b->counter, rrl_ratelimit)) {
			char address[128];
			addr2str(&query->addr, address, sizeof(address));
			log_msg(LOG_INFO, "ratelimit unblock ~ type %s target %s query %s %s (%s collision)",
				rrltype2str(b->flags),
				rrlsource2str(b->source, b->flags),
				address, rrtype_to_string(query->qtype),
				(b->hash!=hash?"bucket":"hash"));
		}
		b->hash = hash;
		b->source = source;
		b->flags = flags;
		b->counter = 1;
		b->rate = 0;
		b->stamp = now;
		return 1;
	}
	/* this is the same source */

	/* check if old, zero or smooth it */
	/* circular arith for time */
	if(now - b->stamp == 1) {
		/* very busy bucket and time just stepped one step */
		int oldblock = used_to_block(b->rate, b->counter, lm);
		b->rate = b->rate/2 + b->counter;
		if(oldblock && b->rate < lm)
			rrl_msg(query, "unblock");
		b->counter = 1;
		b->stamp = now;
	} else if(now - b->stamp > 0) {
		/* older bucket */
		int olderblock = used_to_block(b->rate, b->counter, lm);
		rrl_attenuate_bucket(b, now - b->stamp);
		if(olderblock && b->rate < lm)
			rrl_msg(query, "unblock");
		b->counter = 1;
		b->stamp = now;
	} else if(now != b->stamp) {
		/* robust, timestamp from the future */
		if(used_to_block(b->rate, b->counter, lm))
			rrl_msg(query, "unblock");
		b->rate = 0;
		b->counter = 1;
		b->stamp = now;
	} else {
		/* bucket is from the current timestep, update counter */
		b->counter ++;

		/* log what is blocked for operational debugging */
		if(b->counter + b->rate/2 == lm && b->rate < lm)
			rrl_msg(query, "block");
	}

	/* return max from current rate and projected next-value for rate */
	/* so that if the rate increases suddenly very high, it is
	 * stopped halfway into the time step */
	if(b->counter > b->rate/2)
		return b->counter + b->rate/2;
	return b->rate;
}

int rrl_process_query(query_type* query)
{
	uint64_t source;
	uint32_t hash;
	/* we can use circular arithmetic here, so int32 works after 2038 */
	int32_t now = (int32_t)time(NULL);
	uint32_t lm = rrl_ratelimit;
	uint16_t flags;
	if(rrl_ratelimit == 0 && rrl_whitelist_ratelimit == 0)
		return 0;

	/* examine query */
	examine_query(query, &hash, &source, &flags, &lm);

	if(lm == 0)
		return 0; /* no limit for this */

	/* update rate */
	return (rrl_update(query, hash, source, flags, now, lm) >= lm);
}

query_state_type rrl_slip(query_type* query)
{
	/* discard number the packets, randomly */
#ifdef HAVE_ARC4RANDOM_UNIFORM
	if((rrl_slip_ratio > 0) && ((rrl_slip_ratio == 1) || ((arc4random_uniform(rrl_slip_ratio)) == 0))) {
#elif HAVE_ARC4RANDOM
	if((rrl_slip_ratio > 0) && ((rrl_slip_ratio == 1) || ((arc4random() % rrl_slip_ratio) == 0))) {
#else
	if((rrl_slip_ratio > 0) && ((rrl_slip_ratio == 1) || ((random() % rrl_slip_ratio) == 0))) {
#endif
		/* set TC on the rest */
		TC_SET(query->packet);
		ANCOUNT_SET(query->packet, 0);
		NSCOUNT_SET(query->packet, 0);
		ARCOUNT_SET(query->packet, 0);
		if(query->qname)
			/* header, type, class, qname */
			buffer_set_position(query->packet,
				QHEADERSZ+4+query->qname->name_size);
		else 	buffer_set_position(query->packet, QHEADERSZ);
		return QUERY_PROCESSED;
	}
	return QUERY_DISCARDED;
}

#endif /* RATELIMIT */