author     Stuart Henderson <sthen@cvs.openbsd.org>   2013-11-26 12:50:31 +0000
committer  Stuart Henderson <sthen@cvs.openbsd.org>   2013-11-26 12:50:31 +0000
commit     b665eb4cb1ea56ccad7fee700f05c85dec76e702 (patch)
tree       8453629bcc74596d1a3588c5a534658f6a7b3503 /usr.sbin
parent     9f9bd245ba092cf635e0212513052b389360c9ba (diff)
import NSD 4.0.0, tests from Dorian Büttner, Patrik Lundin, requested by brad@
Diffstat (limited to 'usr.sbin')
66 files changed, 15885 insertions, 3040 deletions
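Among the changes below, the configlexer.lex diff adds new nsd.conf keywords for NSD 4: do-ip4/do-ip6, zonelistfile, xfrdir, zonefiles-check, pattern/include-pattern, and the remote-control group (control-enable, control-interface, control-port, server-key-file, server-cert-file, control-key-file, control-cert-file). A minimal sketch of a configuration using these keywords is shown here for orientation; the paths, addresses, port and pattern name are illustrative placeholders, and the name: and zonefile: keywords are standard nsd.conf options that simply do not appear in this excerpt of the lexer:

    server:
        do-ip4: yes
        do-ip6: yes
        zonelistfile: "/var/nsd/db/zone.list"
        xfrdir: "/tmp"
        zonefiles-check: yes

    remote-control:
        control-enable: yes
        control-interface: 127.0.0.1
        control-port: 8952
        server-key-file: "/var/nsd/etc/nsd_server.key"
        server-cert-file: "/var/nsd/etc/nsd_server.pem"
        control-key-file: "/var/nsd/etc/nsd_control.key"
        control-cert-file: "/var/nsd/etc/nsd_control.pem"

    pattern:
        name: "common"
        zonefile: "%s.zone"

    zone:
        name: "example.com"
        include-pattern: "common"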
diff --git a/usr.sbin/nsd/LICENSE b/usr.sbin/nsd/LICENSE index 955c3665a36..55faacfc49b 100644 --- a/usr.sbin/nsd/LICENSE +++ b/usr.sbin/nsd/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2001-2011, NLnet Labs. All rights reserved. +Copyright (c) 2001-2006, NLnet Labs. All rights reserved. This software is open source. diff --git a/usr.sbin/nsd/acx_nlnetlabs.m4 b/usr.sbin/nsd/acx_nlnetlabs.m4 index 719112645aa..e1cf83a70bd 100644 --- a/usr.sbin/nsd/acx_nlnetlabs.m4 +++ b/usr.sbin/nsd/acx_nlnetlabs.m4 @@ -2,7 +2,9 @@ # Copyright 2009, Wouter Wijngaards, NLnet Labs. # BSD licensed. # -# Version 24 +# Version 26 +# 2013-09-19 FLTO help text improved. +# 2013-07-18 Enable ACX_CHECK_COMPILER_FLAG to test for -Wstrict-prototypes # 2013-06-25 FLTO has --disable-flto option. # 2013-05-03 Update W32_SLEEP for newer mingw that links but not defines it. # 2013-03-22 Fix ACX_RSRC_VERSION for long version numbers. @@ -119,7 +121,7 @@ AC_MSG_CHECKING(whether $CC supports -$1) cache=`echo $1 | sed 'y%.=/+-%___p_%'` AC_CACHE_VAL(cv_prog_cc_flag_$cache, [ -echo 'void f(){}' >conftest.c +echo 'void f(void){}' >conftest.c if test -z "`$CC $CPPFLAGS $CFLAGS -$1 -c conftest.c 2>&1`"; then eval "cv_prog_cc_flag_$cache=yes" else @@ -409,7 +411,7 @@ dnl Check if CC supports -flto. dnl in a way that supports clang and suncc (that flag does something else, dnl but fails to link). It sets it in CFLAGS if it works. AC_DEFUN([ACX_CHECK_FLTO], [ - AC_ARG_ENABLE([flto], AS_HELP_STRING([--disable-flto], [Disable link-time optimization])) + AC_ARG_ENABLE([flto], AS_HELP_STRING([--disable-flto], [Disable link-time optimization (gcc specific option)])) AS_IF([test "x$enable_flto" != "xno"], [ AC_MSG_CHECKING([if $CC supports -flto]) BAKCFLAGS="$CFLAGS" diff --git a/usr.sbin/nsd/answer.c b/usr.sbin/nsd/answer.c index 8fa4ab16821..0377f0b5859 100644 --- a/usr.sbin/nsd/answer.c +++ b/usr.sbin/nsd/answer.c @@ -1,7 +1,7 @@ /* * answer.c -- manipulating query answers and encoding them. * - * Copyright (c) 2001-2011, NLnet Labs. All rights reserved. + * Copyright (c) 2001-2006, NLnet Labs. All rights reserved. * * See LICENSE for the license. * diff --git a/usr.sbin/nsd/answer.h b/usr.sbin/nsd/answer.h index 85d349f438d..acb3665af11 100644 --- a/usr.sbin/nsd/answer.h +++ b/usr.sbin/nsd/answer.h @@ -1,7 +1,7 @@ /* * answer.h -- manipulating query answers and encoding them. * - * Copyright (c) 2001-2011, NLnet Labs. All rights reserved. + * Copyright (c) 2001-2006, NLnet Labs. All rights reserved. * * See LICENSE for the license. * diff --git a/usr.sbin/nsd/axfr.h b/usr.sbin/nsd/axfr.h index b5d7afc29fd..33a68629523 100644 --- a/usr.sbin/nsd/axfr.h +++ b/usr.sbin/nsd/axfr.h @@ -1,7 +1,7 @@ /* * axfr.h -- generating AXFR responses. * - * Copyright (c) 2001-2011, NLnet Labs. All rights reserved. + * Copyright (c) 2001-2006, NLnet Labs. All rights reserved. * * See LICENSE for the license. * diff --git a/usr.sbin/nsd/buffer.c b/usr.sbin/nsd/buffer.c index 49151018fa9..d71fa15e3f3 100644 --- a/usr.sbin/nsd/buffer.c +++ b/usr.sbin/nsd/buffer.c @@ -1,7 +1,7 @@ /* * buffer.c -- generic memory buffer . * - * Copyright (c) 2001-2011, NLnet Labs. All rights reserved. + * Copyright (c) 2001-2006, NLnet Labs. All rights reserved. * * See LICENSE for the license. * diff --git a/usr.sbin/nsd/buffer.h b/usr.sbin/nsd/buffer.h index 28610fe9310..bee7d8b29eb 100644 --- a/usr.sbin/nsd/buffer.h +++ b/usr.sbin/nsd/buffer.h @@ -1,7 +1,7 @@ /* * buffer.h -- generic memory buffer. * - * Copyright (c) 2001-2011, NLnet Labs. All rights reserved. 
+ * Copyright (c) 2001-2006, NLnet Labs. All rights reserved. * * See LICENSE for the license. * diff --git a/usr.sbin/nsd/compat/memcmp.c b/usr.sbin/nsd/compat/memcmp.c index 6d3d27ac9e7..371b3d11baf 100644 --- a/usr.sbin/nsd/compat/memcmp.c +++ b/usr.sbin/nsd/compat/memcmp.c @@ -1,7 +1,7 @@ /* - * memcmp.c: memcmp compat implementation. + * memcmp.c: memcmp compat implementation. * - * Copyright (c) 2010-2011, NLnet Labs. All rights reserved. + * Copyright (c) 2010, NLnet Labs. All rights reserved. * * See LICENSE for the license. */ diff --git a/usr.sbin/nsd/compat/memmove.c b/usr.sbin/nsd/compat/memmove.c index fd65a93f84e..f83996684fe 100644 --- a/usr.sbin/nsd/compat/memmove.c +++ b/usr.sbin/nsd/compat/memmove.c @@ -1,7 +1,7 @@ /* - * memmove.c: memmove compat implementation. + * memmove.c: memmove compat implementation. * - * Copyright (c) 2001-2011, NLnet Labs. All rights reserved. + * Copyright (c) 2001-2006, NLnet Labs. All rights reserved. * * See LICENSE for the license. */ diff --git a/usr.sbin/nsd/compat/strptime.c b/usr.sbin/nsd/compat/strptime.c index 6986d35ce73..4ec96c12cef 100644 --- a/usr.sbin/nsd/compat/strptime.c +++ b/usr.sbin/nsd/compat/strptime.c @@ -10,7 +10,7 @@ * - Does not properly processes year day * * LICENSE - * Copyright (c) 2008-2011, NLnet Labs, Matthijs Mekking. + * Copyright (c) 2008, NLnet Labs, Matthijs Mekking. * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/usr.sbin/nsd/configlexer.lex b/usr.sbin/nsd/configlexer.lex index 55bf4cfe62a..ee4ad1522d9 100644 --- a/usr.sbin/nsd/configlexer.lex +++ b/usr.sbin/nsd/configlexer.lex @@ -2,7 +2,7 @@ /* * configlexer.lex - lexical analyzer for NSD config file * - * Copyright (c) 2001-2011, NLnet Labs. All rights reserved + * Copyright (c) 2001-2006, NLnet Labs. All rights reserved * * See LICENSE for the license. 
* @@ -20,8 +20,6 @@ #include "configparser.h" void c_error(const char *message); -#define YY_NO_UNPUT - #if 0 #define LEXOUT(s) printf s /* used ONLY when debugging */ #else @@ -47,6 +45,15 @@ static void config_start_include(const char* filename) c_error_msg("includes nested too deeply, skipped (>%d)", MAXINCLUDES); return; } + if (cfg_parser->chroot) { + int l = strlen(cfg_parser->chroot); /* chroot has trailing slash */ + if (strncmp(cfg_parser->chroot, filename, l) != 0) { + c_error_msg("include file '%s' is not relative to chroot '%s'", + filename, cfg_parser->chroot); + return; + } + filename += l - 1; /* strip chroot without trailing slash */ + } input = fopen(filename, "r"); if(!input) { c_error_msg("cannot open include file '%s': %s", @@ -82,6 +89,16 @@ static void config_end_include(void) #endif %} +%option noinput +%option nounput +%{ +#ifndef YY_NO_UNPUT +#define YY_NO_UNPUT 1 +#endif +#ifndef YY_NO_INPUT +#define YY_NO_INPUT 1 +#endif +%} SPACE [ \t] LETTER [a-zA-Z] @@ -104,6 +121,8 @@ debug-mode{COLON} { LEXOUT(("v(%s) ", yytext)); return VAR_DEBUG_MODE;} hide-version{COLON} { LEXOUT(("v(%s) ", yytext)); return VAR_HIDE_VERSION;} ip4-only{COLON} { LEXOUT(("v(%s) ", yytext)); return VAR_IP4_ONLY;} ip6-only{COLON} { LEXOUT(("v(%s) ", yytext)); return VAR_IP6_ONLY;} +do-ip4{COLON} { LEXOUT(("v(%s) ", yytext)); return VAR_DO_IP4;} +do-ip6{COLON} { LEXOUT(("v(%s) ", yytext)); return VAR_DO_IP6;} database{COLON} { LEXOUT(("v(%s) ", yytext)); return VAR_DATABASE;} identity{COLON} { LEXOUT(("v(%s) ", yytext)); return VAR_IDENTITY;} nsid{COLON} { LEXOUT(("v(%s) ", yytext)); return VAR_NSID;} @@ -117,12 +136,13 @@ ipv6-edns-size{COLON} { LEXOUT(("v(%s) ", yytext)); return VAR_IPV6_EDNS_SIZE;} pidfile{COLON} { LEXOUT(("v(%s) ", yytext)); return VAR_PIDFILE;} port{COLON} { LEXOUT(("v(%s) ", yytext)); return VAR_PORT;} statistics{COLON} { LEXOUT(("v(%s) ", yytext)); return VAR_STATISTICS;} -zone-stats-file{COLON} { LEXOUT(("v(%s) ", yytext)); return VAR_ZONESTATSFILE;} chroot{COLON} { LEXOUT(("v(%s) ", yytext)); return VAR_CHROOT;} username{COLON} { LEXOUT(("v(%s) ", yytext)); return VAR_USERNAME;} zonesdir{COLON} { LEXOUT(("v(%s) ", yytext)); return VAR_ZONESDIR;} +zonelistfile{COLON} { LEXOUT(("v(%s) ", yytext)); return VAR_ZONELISTFILE;} difffile{COLON} { LEXOUT(("v(%s) ", yytext)); return VAR_DIFFFILE;} xfrdfile{COLON} { LEXOUT(("v(%s) ", yytext)); return VAR_XFRDFILE;} +xfrdir{COLON} { LEXOUT(("v(%s) ", yytext)); return VAR_XFRDIR;} xfrd-reload-timeout{COLON} { LEXOUT(("v(%s) ", yytext)); return VAR_XFRD_RELOAD_TIMEOUT;} verbosity{COLON} { LEXOUT(("v(%s) ", yytext)); return VAR_VERBOSITY;} zone{COLON} { LEXOUT(("v(%s) ", yytext)); return VAR_ZONE;} @@ -137,6 +157,16 @@ allow-axfr-fallback{COLON} { LEXOUT(("v(%s) ", yytext)); return VAR_ALLOW_AXFR_F key{COLON} { LEXOUT(("v(%s) ", yytext)); return VAR_KEY;} algorithm{COLON} { LEXOUT(("v(%s) ", yytext)); return VAR_ALGORITHM;} secret{COLON} { LEXOUT(("v(%s) ", yytext)); return VAR_SECRET;} +pattern{COLON} { LEXOUT(("v(%s) ", yytext)); return VAR_PATTERN;} +include-pattern{COLON} { LEXOUT(("v(%s) ", yytext)); return VAR_INCLUDEPATTERN;} +remote-control{COLON} { LEXOUT(("v(%s) ", yytext)); return VAR_REMOTE_CONTROL;} +control-enable{COLON} { LEXOUT(("v(%s) ", yytext)); return VAR_CONTROL_ENABLE;} +control-interface{COLON} { LEXOUT(("v(%s) ", yytext)); return VAR_CONTROL_INTERFACE;} +control-port{COLON} { LEXOUT(("v(%s) ", yytext)); return VAR_CONTROL_PORT;} +server-key-file{COLON} { LEXOUT(("v(%s) ", yytext)); return VAR_SERVER_KEY_FILE;} 
+server-cert-file{COLON} { LEXOUT(("v(%s) ", yytext)); return VAR_SERVER_CERT_FILE;} +control-key-file{COLON} { LEXOUT(("v(%s) ", yytext)); return VAR_CONTROL_KEY_FILE;} +control-cert-file{COLON} { LEXOUT(("v(%s) ", yytext)); return VAR_CONTROL_CERT_FILE;} AXFR { LEXOUT(("v(%s) ", yytext)); return VAR_AXFR;} UDP { LEXOUT(("v(%s) ", yytext)); return VAR_UDP;} rrl-size{COLON} { LEXOUT(("v(%s) ", yytext)); return VAR_RRL_SIZE;} @@ -146,6 +176,7 @@ rrl-ipv4-prefix-length{COLON} { LEXOUT(("v(%s) ", yytext)); return VAR_RRL_IPV4_ rrl-ipv6-prefix-length{COLON} { LEXOUT(("v(%s) ", yytext)); return VAR_RRL_IPV6_PREFIX_LENGTH;} rrl-whitelist-ratelimit{COLON} { LEXOUT(("v(%s) ", yytext)); return VAR_RRL_WHITELIST_RATELIMIT;} rrl-whitelist{COLON} { LEXOUT(("v(%s) ", yytext)); return VAR_RRL_WHITELIST;} +zonefiles-check{COLON} { LEXOUT(("v(%s) ", yytext)); return VAR_ZONEFILES_CHECK;} {NEWLINE} { LEXOUT(("NL\n")); cfg_parser->line++;} /* Quoted strings. Strip leading and ending quotes */ diff --git a/usr.sbin/nsd/configyyrename.h b/usr.sbin/nsd/configyyrename.h index 856dfe96d3e..6beb810aa8e 100644 --- a/usr.sbin/nsd/configyyrename.h +++ b/usr.sbin/nsd/configyyrename.h @@ -1,7 +1,7 @@ /* * configyyrename.h -- renames for config file yy values to avoid conflicts. * - * Copyright (c) 2001-2011, NLnet Labs. All rights reserved. + * Copyright (c) 2001-2006, NLnet Labs. All rights reserved. * * See LICENSE for the license. * @@ -32,6 +32,7 @@ #define yyps c_ps #define yypv c_pv #define yys c_s +#define yyss c_ss #define yy_yys c_yys #define yystate c_state #define yytmp c_tmp @@ -85,5 +86,34 @@ #define yyget_leng c_get_leng #define yylineno c_lineno #define yyget_text c_get_text +#define yyvsp c_vsp +#define yyvs c_vs +#define yytext c_text +#define yyleng c_leng +#define yy_meta c__meta +#define yy_start c__start +#define yy_nxt c__nxt +#define yy_n_chars c__n_chars +#define yy_more_flag c__more_flag +#define yy_more_len c__more_len +#define yy_try_NUL_trans c__try_NUL_trans +#define yy_last_accepting_cpos c__last_accepting_cpos +#define yy_last_accepting_state c__last_accepting_state +#define yy_init c__init +#define yy_base c__base +#define yy_accept c__accept +#define yy_c_buf_p c__c_buf_p +#define yy_chk c__chk +#define yy_current_buffer c__current_buffer +#define yy_def c__def +#define yy_did_buffer_switch_on_eof c__did_buffer_switch_on_eof +#define yy_ec c__ec +#define yy_fatal_error c__fatal_error +#define yy_flex_alloc c__flex_alloc +#define yy_flex_free c__flex_free +#define yy_flex_realloc c__flex_realloc +#define yy_get_next_buffer c__get_next_buffer +#define yy_get_previous_state c__get_previous_state +#define yy_hold_char c__hold_char #endif /* CONFIGYYRENAME_H */ diff --git a/usr.sbin/nsd/dbaccess.c b/usr.sbin/nsd/dbaccess.c index abecce7a1f5..866c762ea70 100644 --- a/usr.sbin/nsd/dbaccess.c +++ b/usr.sbin/nsd/dbaccess.c @@ -1,7 +1,7 @@ /* * dbaccess.c -- access methods for nsd(8) database * - * Copyright (c) 2001-2011, NLnet Labs. All rights reserved. + * Copyright (c) 2001-2006, NLnet Labs. All rights reserved. * * See LICENSE for the license. 
* @@ -17,467 +17,569 @@ #include <string.h> #include <unistd.h> #include <fcntl.h> -#include <stdio.h> /* DEBUG */ #include "dns.h" #include "namedb.h" #include "util.h" #include "options.h" +#include "rdata.h" +#include "udb.h" +#include "udbradtree.h" +#include "udbzone.h" +#include "zonec.h" +#include "nsec3.h" +#include "difffile.h" + +static time_t udb_time = 0; +static unsigned udb_rrsets = 0; +static unsigned udb_rrset_count = 0; -int -namedb_lookup(struct namedb *db, - const dname_type *dname, - domain_type **closest_match, - domain_type **closest_encloser) +void +namedb_close(struct namedb* db) { - return domain_table_search( - db->domains, dname, closest_match, closest_encloser); + if(db) { + if(db->udb) { + udb_base_close(db->udb); + udb_base_free(db->udb); + } + zonec_desetup_parser(); + region_destroy(db->region); + } } -static int -read_magic(namedb_type *db) +void +namedb_close_udb(struct namedb* db) { - char buf[NAMEDB_MAGIC_SIZE]; - - if (fread(buf, sizeof(char), sizeof(buf), db->fd) != sizeof(buf)) - return 0; - - return memcmp(buf, NAMEDB_MAGIC, NAMEDB_MAGIC_SIZE) == 0; + if(db) { + /* we cannot actually munmap the data, because other + * processes still need to access the udb, so cleanup the + * udb */ + udb_base_free_keep_mmap(db->udb); + db->udb = NULL; + } } -static const dname_type * -read_dname(FILE *fd, region_type *region) +void +apex_rrset_checks(namedb_type* db, rrset_type* rrset, domain_type* domain) { - uint8_t size; - uint8_t temp[MAXDOMAINLEN]; + uint32_t soa_minimum; + unsigned i; + zone_type* zone = rrset->zone; + assert(domain == zone->apex); + (void)domain; + if (rrset_rrtype(rrset) == TYPE_SOA) { + zone->soa_rrset = rrset; - if (fread(&size, sizeof(uint8_t), 1, fd) != 1) - return NULL; - if (fread(temp, sizeof(uint8_t), size, fd) != size) - return NULL; + /* BUG #103 add another soa with a tweaked ttl */ + if(zone->soa_nx_rrset == 0) { + zone->soa_nx_rrset = region_alloc(db->region, + sizeof(rrset_type)); + zone->soa_nx_rrset->rr_count = 1; + zone->soa_nx_rrset->next = 0; + zone->soa_nx_rrset->zone = zone; + zone->soa_nx_rrset->rrs = region_alloc(db->region, + sizeof(rr_type)); + } + memcpy(zone->soa_nx_rrset->rrs, rrset->rrs, sizeof(rr_type)); - return dname_make(region, temp, 1); + /* check the ttl and MINIMUM value and set accordinly */ + memcpy(&soa_minimum, rdata_atom_data(rrset->rrs->rdatas[6]), + rdata_atom_size(rrset->rrs->rdatas[6])); + if (rrset->rrs->ttl > ntohl(soa_minimum)) { + zone->soa_nx_rrset->rrs[0].ttl = ntohl(soa_minimum); + } + } else if (rrset_rrtype(rrset) == TYPE_NS) { + zone->ns_rrset = rrset; + } else if (rrset_rrtype(rrset) == TYPE_RRSIG) { + for (i = 0; i < rrset->rr_count; ++i) { + if(rr_rrsig_type_covered(&rrset->rrs[i])==TYPE_DNSKEY){ + zone->is_secure = 1; + break; + } + } + } } -static int -read_size(namedb_type *db, uint32_t *result) +/** read rr */ +static void +read_rr(namedb_type* db, rr_type* rr, udb_ptr* urr, domain_type* domain) { - if (fread(result, sizeof(*result), 1, db->fd) == 1) { - *result = ntohl(*result); - return 1; - } else { - return 0; + buffer_type buffer; + ssize_t c; + assert(udb_ptr_get_type(urr) == udb_chunk_type_rr); + rr->owner = domain; + rr->type = RR(urr)->type; + rr->klass = RR(urr)->klass; + rr->ttl = RR(urr)->ttl; + + buffer_create_from(&buffer, RR(urr)->wire, RR(urr)->len); + c = rdata_wireformat_to_rdata_atoms(db->region, db->domains, + rr->type, RR(urr)->len, &buffer, &rr->rdatas); + if(c == -1) { + /* safe on error */ + rr->rdata_count = 0; + rr->rdatas = NULL; + return; } + 
rr->rdata_count = c; } -static domain_type * -read_domain(namedb_type *db, uint32_t domain_count, domain_type **domains) +/** calculate rr count */ +static uint16_t +calculate_rr_count(udb_base* udb, udb_ptr* rrset) { - uint32_t domain_number; - - if (!read_size(db, &domain_number)) - return NULL; - - if (domain_number == 0 || domain_number > domain_count) - return NULL; - - return domains[domain_number - 1]; + udb_ptr rr; + uint16_t num = 0; + udb_ptr_new(&rr, udb, &RRSET(rrset)->rrs); + while(rr.data) { + num++; + udb_ptr_set_rptr(&rr, udb, &RR(&rr)->next); + } + udb_ptr_unlink(&rr, udb); + return num; } -static zone_type * -read_zone(namedb_type *db, uint32_t zone_count, zone_type **zones) +/** read rrset */ +static void +read_rrset(udb_base* udb, namedb_type* db, zone_type* zone, + domain_type* domain, udb_ptr* urrset) { - uint32_t zone_number; - - if (!read_size(db, &zone_number)) - return NULL; - - if (zone_number == 0 || zone_number > zone_count) - return NULL; - - return zones[zone_number - 1]; + rrset_type* rrset; + udb_ptr urr; + unsigned i; + assert(udb_ptr_get_type(urrset) == udb_chunk_type_rrset); + /* if no RRs, do not create anything (robust) */ + if(RRSET(urrset)->rrs.data == 0) + return; + rrset = (rrset_type *) region_alloc(db->region, sizeof(rrset_type)); + rrset->zone = zone; + rrset->rr_count = calculate_rr_count(udb, urrset); + rrset->rrs = (rr_type *) region_alloc( + db->region, rrset->rr_count * sizeof(rr_type)); + /* add the RRs */ + udb_ptr_new(&urr, udb, &RRSET(urrset)->rrs); + for(i=0; i<rrset->rr_count; i++) { + read_rr(db, &rrset->rrs[i], &urr, domain); + udb_ptr_set_rptr(&urr, udb, &RR(&urr)->next); + } + udb_ptr_unlink(&urr, udb); + domain_add_rrset(domain, rrset); + if(domain == zone->apex) + apex_rrset_checks(db, rrset, domain); } -static int -read_rdata_atom(namedb_type *db, uint16_t type, int index, uint32_t domain_count, domain_type **domains, rdata_atom_type *result) +/** read one elem from db, of type domain_d */ +static void read_node_elem(udb_base* udb, namedb_type* db, + region_type* dname_region, zone_type* zone, struct domain_d* d) { - uint8_t data[65536]; - - if (rdata_atom_is_domain(type, index)) { - result->domain = read_domain(db, domain_count, domains); - if (!result->domain) - return 0; - } else { - uint16_t size; - - if (fread(&size, sizeof(size), 1, db->fd) != 1) - return 0; - size = ntohs(size); - if (fread(data, sizeof(uint8_t), size, db->fd) != size) - return 0; - - result->data = (uint16_t *) region_alloc( - db->region, sizeof(uint16_t) + size); - memcpy(result->data, &size, sizeof(uint16_t)); - memcpy((uint8_t *) result->data + sizeof(uint16_t), data, size); + const dname_type* dname; + domain_type* domain; + udb_ptr urrset; + + dname = dname_make(dname_region, d->name, 0); + if(!dname) return; + domain = domain_table_insert(db->domains, dname); + assert(domain); /* domain_table_insert should always return non-NULL */ + + /* add rrsets */ + udb_ptr_init(&urrset, udb); + udb_ptr_set_rptr(&urrset, udb, &d->rrsets); + while(urrset.data) { + read_rrset(udb, db, zone, domain, &urrset); + udb_ptr_set_rptr(&urrset, udb, &RRSET(&urrset)->next); + + if(++udb_rrsets % ZONEC_PCT_COUNT == 0 && time(NULL) > udb_time + ZONEC_PCT_TIME) { + udb_time = time(NULL); + VERBOSITY(1, (LOG_INFO, "read %s %d %%", + zone->opts->name, udb_rrsets*100/udb_rrset_count)); + } } - - return 1; + region_free_all(dname_region); + udb_ptr_unlink(&urrset, udb); } -static rrset_type * -read_rrset(namedb_type *db, - uint32_t domain_count, domain_type **domains, - 
uint32_t zone_count, zone_type **zones) +/** recurse read radix from disk. This radix tree is by domain name, so max of + * 256 depth, and thus the stack usage is small. */ +static void read_zone_recurse(udb_base* udb, namedb_type* db, + region_type* dname_region, zone_type* zone, struct udb_radnode_d* node) { - rrset_type *rrset; - int i, j; - domain_type *owner; - uint16_t type; - uint16_t klass; - uint32_t soa_minimum; - - owner = read_domain(db, domain_count, domains); - if (!owner) - return NULL; - - rrset = (rrset_type *) region_alloc(db->region, sizeof(rrset_type)); - - rrset->zone = read_zone(db, zone_count, zones); - if (!rrset->zone) - return NULL; - - if (fread(&type, sizeof(type), 1, db->fd) != 1) - return NULL; - type = ntohs(type); - - if (fread(&klass, sizeof(klass), 1, db->fd) != 1) - return NULL; - klass = ntohs(klass); - - if (fread(&rrset->rr_count, sizeof(rrset->rr_count), 1, db->fd) != 1) - return NULL; - rrset->rr_count = ntohs(rrset->rr_count); - rrset->rrs = (rr_type *) region_alloc( - db->region, rrset->rr_count * sizeof(rr_type)); - - assert(rrset->rr_count > 0); - - for (i = 0; i < rrset->rr_count; ++i) { - rr_type *rr = &rrset->rrs[i]; - - rr->owner = owner; - rr->type = type; - rr->klass = klass; - - if (fread(&rr->rdata_count, sizeof(rr->rdata_count), 1, db->fd) != 1) - return NULL; - rr->rdata_count = ntohs(rr->rdata_count); - rr->rdatas = (rdata_atom_type *) region_alloc( - db->region, rr->rdata_count * sizeof(rdata_atom_type)); - - if (fread(&rr->ttl, sizeof(rr->ttl), 1, db->fd) != 1) - return NULL; - rr->ttl = ntohl(rr->ttl); - - for (j = 0; j < rr->rdata_count; ++j) { - if (!read_rdata_atom(db, rr->type, j, domain_count, domains, &rr->rdatas[j])) - return NULL; + if(node->elem.data) { + /* pre-order process of node->elem, for radix tree this is + * also in-order processing (identical to order tree_next()) */ + read_node_elem(udb, db, dname_region, zone, (struct domain_d*) + (udb->base + node->elem.data)); + } + if(node->lookup.data) { + uint16_t i; + struct udb_radarray_d* a = (struct udb_radarray_d*) + (udb->base + node->lookup.data); + /* we do not care for what the exact radix key is, we want + * to add all of them and the read routine does not need + * the radix-key, it has it stored */ + for(i=0; i<a->len; i++) { + if(a->array[i].node.data) { + read_zone_recurse(udb, db, dname_region, zone, + (struct udb_radnode_d*)(udb->base + + a->array[i].node.data)); + } } } +} - domain_add_rrset(owner, rrset); - - if (rrset_rrtype(rrset) == TYPE_SOA) { - assert(owner == rrset->zone->apex); - rrset->zone->soa_rrset = rrset; - - /* BUG #103 add another soa with a tweaked ttl */ - rrset->zone->soa_nx_rrset = region_alloc(db->region, sizeof(rrset_type)); - rrset->zone->soa_nx_rrset->rrs = - region_alloc(db->region, rrset->rr_count * sizeof(rr_type)); - - memcpy(rrset->zone->soa_nx_rrset->rrs, rrset->rrs, sizeof(rr_type)); - rrset->zone->soa_nx_rrset->rr_count = 1; - rrset->zone->soa_nx_rrset->next = 0; +/** read zone data */ +static void +read_zone_data(udb_base* udb, namedb_type* db, region_type* dname_region, + udb_ptr* z, zone_type* zone) +{ + udb_ptr dtree; + /* recursively read domains, we only read so ptrs stay valid */ + udb_ptr_new(&dtree, udb, &ZONE(z)->domains); + if(RADTREE(&dtree)->root.data) + read_zone_recurse(udb, db, dname_region, zone, + (struct udb_radnode_d*) + (udb->base + RADTREE(&dtree)->root.data)); + udb_ptr_unlink(&dtree, udb); +} - /* also add a link to the zone */ - rrset->zone->soa_nx_rrset->zone = rrset->zone; +/** create a zone */ 
+zone_type* +namedb_zone_create(namedb_type* db, const dname_type* dname, + zone_options_t* zo) +{ + zone_type* zone = (zone_type *) region_alloc(db->region, + sizeof(zone_type)); + zone->node = radname_insert(db->zonetree, dname_name(dname), + dname->name_size, zone); + assert(zone->node); + zone->apex = domain_table_insert(db->domains, dname); + zone->apex->usage++; /* the zone.apex reference */ + zone->apex->is_apex = 1; + zone->soa_rrset = NULL; + zone->soa_nx_rrset = NULL; + zone->ns_rrset = NULL; +#ifdef NSEC3 + zone->nsec3_param = NULL; + zone->nsec3_last = NULL; + zone->nsec3tree = NULL; + zone->hashtree = NULL; + zone->wchashtree = NULL; + zone->dshashtree = NULL; +#endif + zone->opts = zo; + zone->is_secure = 0; + zone->is_changed = 0; + zone->is_ok = 1; + return zone; +} - /* check the ttl and MINIMUM value and set accordinly */ - memcpy(&soa_minimum, rdata_atom_data(rrset->rrs->rdatas[6]), - rdata_atom_size(rrset->rrs->rdatas[6])); - if (rrset->rrs->ttl > ntohl(soa_minimum)) { - rrset->zone->soa_nx_rrset->rrs[0].ttl = ntohl(soa_minimum); +void +namedb_zone_delete(namedb_type* db, zone_type* zone) +{ + /* RRs and UDB and NSEC3 and so on must be already deleted */ + radix_delete(db->zonetree, zone->node); + + /* see if apex can be deleted */ + if(zone->apex) { + zone->apex->usage --; + if(zone->apex->usage == 0) { + /* delete the apex, possibly */ + domain_table_deldomain(db, zone->apex); } - owner->has_SOA = 1; + } - } else if (owner == rrset->zone->apex - && rrset_rrtype(rrset) == TYPE_NS) - { - rrset->zone->ns_rrset = rrset; + /* soa_rrset is freed when the SOA was deleted */ + if(zone->soa_nx_rrset) { + region_recycle(db->region, zone->soa_nx_rrset->rrs, + sizeof(rr_type)); + region_recycle(db->region, zone->soa_nx_rrset, + sizeof(rrset_type)); } #ifdef NSEC3 -#ifndef FULL_PREHASH - else if (type == TYPE_NSEC3) { - if (0 != namedb_add_nsec3_domain(db, owner, rrset->zone)) { - return NULL; - } - } -#endif /* !FULL_PREHASH */ -#endif /* NSEC3 */ - if (rrset_rrtype(rrset) == TYPE_RRSIG && owner == rrset->zone->apex) { - for (i = 0; i < rrset->rr_count; ++i) { - if (rr_rrsig_type_covered(&rrset->rrs[i]) == TYPE_DNSKEY) { - rrset->zone->is_secure = 1; - break; - } - } + hash_tree_delete(db->region, zone->nsec3tree); + hash_tree_delete(db->region, zone->hashtree); + hash_tree_delete(db->region, zone->wchashtree); + hash_tree_delete(db->region, zone->dshashtree); +#endif + region_recycle(db->region, zone, sizeof(zone_type)); +} + +/** read a zone */ +static void +read_zone(udb_base* udb, namedb_type* db, nsd_options_t* opt, + region_type* dname_region, udb_ptr* z) +{ + /* construct dname */ + const dname_type* dname = dname_make(dname_region, ZONE(z)->name, 0); + zone_options_t* zo = dname?zone_options_find(opt, dname):NULL; + zone_type* zone; + if(!dname) return; + if(!zo) { + /* deleted from the options, remove it from the nsd.db too */ + VERBOSITY(2, (LOG_WARNING, "zone %s is deleted", + dname_to_string(dname, NULL))); + udb_zone_delete(udb, z); + region_free_all(dname_region); + return; } - return rrset; + assert(udb_ptr_get_type(z) == udb_chunk_type_zone); + udb_rrsets = 0; + udb_rrset_count = ZONE(z)->rrset_count; + zone = namedb_zone_create(db, dname, zo); + region_free_all(dname_region); + read_zone_data(udb, db, dname_region, z, zone); + zone->is_changed = (ZONE(z)->is_changed != 0); +#ifdef NSEC3 + prehash_zone_complete(db, zone); +#endif } -struct namedb * -namedb_open (const char *filename, nsd_options_t* opt, size_t num_children) +/** read zones from nsd.db */ +static 
void +read_zones(udb_base* udb, namedb_type* db, nsd_options_t* opt, + region_type* dname_region) { - namedb_type *db; + udb_ptr ztree, n, z; + udb_ptr_init(&z, udb); + udb_ptr_new(&ztree, udb, udb_base_get_userdata(udb)); + udb_radix_first(udb,&ztree,&n); + udb_time = time(NULL); + while(n.data) { + udb_ptr_set_rptr(&z, udb, &RADNODE(&n)->elem); + udb_radix_next(udb, &n); /* store in case n is deleted */ + read_zone(udb, db, opt, dname_region, &z); + udb_ptr_zero(&z, udb); + } + udb_ptr_unlink(&ztree, udb); + udb_ptr_unlink(&n, udb); + udb_ptr_unlink(&z, udb); +} +/** try to read the udb file or fail */ +static int +try_read_udb(namedb_type* db, int fd, const char* filename, + nsd_options_t* opt) +{ /* * Temporary region used while loading domain names from the * database. The region is freed after each time a dname is * read from the database. */ - region_type *dname_region; + region_type* dname_region; + + assert(fd != -1); + if(!(db->udb=udb_base_create_fd(filename, fd, &namedb_walkfunc, + NULL))) { + /* fd is closed by failed udb create call */ + VERBOSITY(1, (LOG_WARNING, "can not use %s, " + "will create anew", filename)); + return 0; + } + /* sanity check if can be opened */ + if(udb_base_get_userflags(db->udb) != 0) { + log_msg(LOG_WARNING, "%s was not closed properly, it might " + "be corrupted, will create anew", filename); + udb_base_free(db->udb); + db->udb = NULL; + return 0; + } + /* read if it can be opened */ + dname_region = region_create(xalloc, free); + /* this operation does not fail, we end up with + * something, even if that is an empty namedb */ + read_zones(db->udb, db, opt, dname_region); + region_destroy(dname_region); + return 1; +} + +struct namedb * +namedb_open (const char* filename, nsd_options_t* opt) +{ + namedb_type* db; /* - * Temporary region used to store array of domains and zones - * while loading the database. The region is freed before - * returning. + * Region used to store the loaded database. The region is + * freed in namedb_close. */ - region_type *temp_region; - - uint32_t dname_count; - domain_type **domains; /* Indexed by domain number. */ - - uint32_t zone_count; - zone_type **zones; /* Indexed by zone number. 
*/ - - uint32_t i; - uint32_t rrset_count = 0; - uint32_t rr_count = 0; - - rrset_type *rrset; - - DEBUG(DEBUG_DBACCESS, 2, - (LOG_INFO, "sizeof(namedb_type) = %lu\n", (unsigned long) sizeof(namedb_type))); - DEBUG(DEBUG_DBACCESS, 2, - (LOG_INFO, "sizeof(zone_type) = %lu\n", (unsigned long) sizeof(zone_type))); - DEBUG(DEBUG_DBACCESS, 2, - (LOG_INFO, "sizeof(domain_type) = %lu\n", (unsigned long) sizeof(domain_type))); - DEBUG(DEBUG_DBACCESS, 2, - (LOG_INFO, "sizeof(rrset_type) = %lu\n", (unsigned long) sizeof(rrset_type))); - DEBUG(DEBUG_DBACCESS, 2, - (LOG_INFO, "sizeof(rr_type) = %lu\n", (unsigned long) sizeof(rr_type))); - DEBUG(DEBUG_DBACCESS, 2, - (LOG_INFO, "sizeof(rdata_atom_type) = %lu\n", (unsigned long) sizeof(rdata_atom_type))); - DEBUG(DEBUG_DBACCESS, 2, - (LOG_INFO, "sizeof(rbnode_t) = %lu\n", (unsigned long) sizeof(rbnode_t))); - - if ((db = namedb_create()) == NULL) { - log_msg(LOG_ERR, - "insufficient memory to create database"); - return NULL; + region_type* db_region; + int fd; + + /* attempt to open, if does not exist, create a new one */ + fd = open(filename, O_RDWR); + if(fd == -1) { + if(errno != ENOENT) { + log_msg(LOG_ERR, "%s: %s", filename, strerror(errno)); + return NULL; + } } - db->filename = region_strdup(db->region, filename); + +#ifdef USE_MMAP_ALLOC + db_region = region_create_custom(mmap_alloc, mmap_free, MMAP_ALLOC_CHUNK_SIZE, + MMAP_ALLOC_LARGE_OBJECT_SIZE, MMAP_ALLOC_INITIAL_CLEANUP_SIZE, 1); +#else /* !USE_MMAP_ALLOC */ + db_region = region_create_custom(xalloc, free, DEFAULT_CHUNK_SIZE, + DEFAULT_LARGE_OBJECT_SIZE, DEFAULT_INITIAL_CLEANUP_SIZE, 1); +#endif /* !USE_MMAP_ALLOC */ + db = (namedb_type *) region_alloc(db_region, sizeof(struct namedb)); + db->region = db_region; + db->domains = domain_table_create(db->region); + db->zonetree = radix_tree_create(db->region); + db->diff_skip = 0; + db->diff_pos = 0; if (gettimeofday(&(db->diff_timestamp), NULL) != 0) { log_msg(LOG_ERR, "unable to load %s: cannot initialize" - "timestamp", db->filename); - namedb_destroy(db); - return NULL; - } - - /* Open it... */ - db->fd = fopen(db->filename, "r"); - if (db->fd == NULL) { - log_msg(LOG_ERR, "unable to load %s: %s", - db->filename, strerror(errno)); - namedb_destroy(db); - return NULL; - } - - if (!read_magic(db)) { - log_msg(LOG_ERR, "corrupted database (read magic): %s", db->filename); - log_msg(LOG_ERR, "cannot load database, incompatible version " - "number. 
Please rebuild database and " - "start again."); - namedb_close(db); + "timestamp", filename); + region_destroy(db_region); + close(fd); return NULL; - } + } - if (!read_size(db, &zone_count)) { - log_msg(LOG_ERR, "corrupted database (read size): %s", db->filename); - namedb_close(db); - return NULL; + /* attempt to read the file (if it exists) */ + if(fd != -1) { + if(!try_read_udb(db, fd, filename, opt)) + fd = -1; } - - DEBUG(DEBUG_DBACCESS, 1, - (LOG_INFO, "Retrieving %lu zones\n", (unsigned long) zone_count)); - - temp_region = region_create(xalloc, free); - dname_region = region_create(xalloc, free); - - db->zone_count = zone_count; - zones = (zone_type **) region_alloc(temp_region, - zone_count * sizeof(zone_type *)); - for (i = 0; i < zone_count; ++i) { - const dname_type *dname = read_dname(db->fd, dname_region); - if (!dname) { - log_msg(LOG_ERR, "corrupted database (read dname): %s", db->filename); - region_destroy(dname_region); - region_destroy(temp_region); - namedb_close(db); - return NULL; - } - zones[i] = (zone_type *) region_alloc(db->region, - sizeof(zone_type)); - zones[i]->next = db->zones; - db->zones = zones[i]; - zones[i]->apex = domain_table_insert(db->domains, dname); - zones[i]->soa_rrset = NULL; - zones[i]->soa_nx_rrset = NULL; - zones[i]->ns_rrset = NULL; -#ifdef NSEC3 - zones[i]->nsec3_soa_rr = NULL; - zones[i]->nsec3_last = NULL; -#endif - zones[i]->opts = zone_options_find(opt, domain_dname(zones[i]->apex)); - zones[i]->number = i + 1; - zones[i]->is_secure = 0; - zones[i]->updated = 1; - zones[i]->is_ok = 0; - zones[i]->dirty = region_alloc(db->region, sizeof(uint8_t)*num_children); - memset(zones[i]->dirty, 0, sizeof(uint8_t)*num_children); - if(!zones[i]->opts) { - log_msg(LOG_ERR, "cannot load database. Zone %s in db " - "%s, but not in config file (might " - "happen if you edited the config " - "file). 
Please rebuild database and " - "start again.", - dname_to_string(dname, NULL), db->filename); - region_destroy(dname_region); - region_destroy(temp_region); - namedb_close(db); + /* attempt to create the file (if necessary or failed read) */ + if(fd == -1) { + if(!(db->udb=udb_base_create_new(filename, &namedb_walkfunc, + NULL))) { + region_destroy(db_region); return NULL; } -#ifdef NSEC3 -#ifndef FULL_PREHASH - zones[i]->nsec3_domains = NULL; - if (0 != zone_nsec3_domains_create(db, zones[i])) { - log_msg(LOG_ERR, - "insufficient memory for NSEC3 tree, " - "unable to read database"); - region_destroy(dname_region); - region_destroy(temp_region); - namedb_close(db); + if(!udb_dns_init_file(db->udb)) { + region_destroy(db->region); return NULL; } -#endif /* !FULL_PREHASH */ -#endif /* NSEC3 */ - region_free_all(dname_region); } + zonec_setup_parser(db); + return db; +} - if (!read_size(db, &dname_count)) { - log_msg(LOG_ERR, "corrupted database (read size): %s", db->filename); - region_destroy(dname_region); - region_destroy(temp_region); - namedb_close(db); - return NULL; +/** the the file mtime stat (or nonexist or error) */ +static int +file_get_mtime(const char* file, time_t* mtime, int* nonexist) +{ + struct stat s; + if(stat(file, &s) != 0) { + *mtime = 0; + *nonexist = (errno == ENOENT); + return 0; } + *nonexist = 0; + *mtime = s.st_mtime; + return 1; +} - DEBUG(DEBUG_DBACCESS, 1, - (LOG_INFO, "Retrieving %lu domain names\n", (unsigned long) dname_count)); - - domains = (domain_type **) region_alloc( - temp_region, dname_count * sizeof(domain_type *)); - for (i = 0; i < dname_count; ++i) { - const dname_type *dname = read_dname(db->fd, dname_region); - if (!dname) { - log_msg(LOG_ERR, "corrupted database (read dname): %s", db->filename); - region_destroy(dname_region); - region_destroy(temp_region); - namedb_close(db); - return NULL; +void +namedb_read_zonefile(struct namedb* db, struct zone* zone, udb_base* taskudb, + udb_ptr* last_task) +{ + time_t mtime = 0; + int nonexist = 0; + unsigned int errors; + const char* fname; + if(!db || !db->udb || !zone || !zone->opts || !zone->opts->pattern->zonefile) + return; + fname = config_make_zonefile(zone->opts); + if(!file_get_mtime(fname, &mtime, &nonexist)) { + if(nonexist) { + VERBOSITY(2, (LOG_INFO, "zonefile %s does not exist", + fname)); + } else + log_msg(LOG_ERR, "zonefile %s: %s", + fname, strerror(errno)); + if(taskudb) task_new_soainfo(taskudb, last_task, zone); + return; + } else { + /* check the mtime */ + if(udb_zone_get_mtime(db->udb, dname_name(domain_dname( + zone->apex)), domain_dname(zone->apex)->name_size) + >= (uint64_t)mtime) { + VERBOSITY(3, (LOG_INFO, "zonefile %s is not modified", + fname)); + return; } - domains[i] = domain_table_insert(db->domains, dname); - region_free_all(dname_region); } - region_destroy(dname_region); - -#ifndef NDEBUG - fprintf(stderr, "database region after loading domain names: "); - region_dump_stats(db->region, stderr); - fprintf(stderr, "\n"); + assert(parser); + /* wipe zone from memory */ +#ifdef NSEC3 + nsec3_hash_tree_clear(zone); #endif - - while ((rrset = read_rrset(db, dname_count, domains, zone_count, zones))) { - ++rrset_count; - rr_count += rrset->rr_count; - } - - DEBUG(DEBUG_DBACCESS, 1, - (LOG_INFO, "Retrieved %lu RRs in %lu RRsets\n", - (unsigned long) rr_count, (unsigned long) rrset_count)); - - region_destroy(temp_region); - - if ((db->crc_pos = ftello(db->fd)) == -1) { - log_msg(LOG_ERR, "ftello %s failed: %s", - db->filename, strerror(errno)); - namedb_close(db); - 
return NULL; - } - if (!read_size(db, &db->crc)) { - log_msg(LOG_ERR, "corrupted database (read size): %s", db->filename); - namedb_close(db); - return NULL; - } - if (!read_magic(db)) { - log_msg(LOG_ERR, "corrupted database (read magic): %s", db->filename); - log_msg(LOG_ERR, "cannot load database, incompatible version " - "number. Please rebuild database and " - "start again."); - namedb_close(db); - return NULL; + delete_zone_rrs(db, zone); +#ifdef NSEC3 + nsec3_clear_precompile(db, zone); + zone->nsec3_param = NULL; +#endif /* NSEC3 */ + errors = zonec_read(zone->opts->name, fname, zone); + if(errors > 0) { + region_type* dname_region; + udb_ptr z; + log_msg(LOG_ERR, "zone %s file %s read with %u errors", + zone->opts->name, fname, errors); + /* wipe (partial) zone from memory */ + zone->is_ok = 1; +#ifdef NSEC3 + nsec3_hash_tree_clear(zone); +#endif + delete_zone_rrs(db, zone); +#ifdef NSEC3 + nsec3_clear_precompile(db, zone); + zone->nsec3_param = NULL; +#endif /* NSEC3 */ + /* see if we can revert to the udb stored version */ + if(!udb_zone_search(db->udb, &z, dname_name(domain_dname( + zone->apex)), domain_dname(zone->apex)->name_size)) { + /* tell that zone contents has been lost */ + if(taskudb) task_new_soainfo(taskudb, last_task, zone); + return; + } + /* read from udb */ + dname_region = region_create(xalloc, free); + udb_rrsets = 0; + udb_rrset_count = ZONE(&z)->rrset_count; + udb_time = time(NULL); + read_zone_data(db->udb, db, dname_region, &z, zone); + region_destroy(dname_region); + udb_ptr_unlink(&z, db->udb); + } else { + VERBOSITY(1, (LOG_INFO, "zone %s read with no errors", + zone->opts->name)); + zone->is_ok = 1; + zone->is_changed = 0; + /* store zone into udb */ + if(!write_zone_to_udb(db->udb, zone, mtime)) { + log_msg(LOG_ERR, "failed to store zone in db"); + } else { + VERBOSITY(2, (LOG_INFO, "zone %s written to db", + zone->opts->name)); + } } - - fclose(db->fd); - db->fd = NULL; - -#ifndef NDEBUG - fprintf(stderr, "database region after loading database: "); - region_dump_stats(db->region, stderr); - fprintf(stderr, "\n"); + if(taskudb) task_new_soainfo(taskudb, last_task, zone); +#ifdef NSEC3 + prehash_zone_complete(db, zone); #endif - - return db; } -void -namedb_close (struct namedb *db) +void namedb_check_zonefile(struct namedb* db, udb_base* taskudb, + udb_ptr* last_task, zone_options_t* zopt) { - namedb_fd_close(db); - if (db) { - namedb_destroy(db); + zone_type* zone; + const dname_type* dname = (const dname_type*)zopt->node.key; + /* find zone to go with it, or create it */ + zone = namedb_find_zone(db, dname); + if(!zone) { + zone = namedb_zone_create(db, dname, zopt); } + namedb_read_zonefile(db, zone, taskudb, last_task); } -void -namedb_fd_close (struct namedb *db) +void namedb_check_zonefiles(struct namedb* db, nsd_options_t* opt, + udb_base* taskudb, udb_ptr* last_task) { - if (db && db->fd) { - fclose(db->fd); + zone_options_t* zo; + /* check all zones in opt, create if not exist in main db */ + RBTREE_FOR(zo, zone_options_t*, opt->zone_options) { + namedb_check_zonefile(db, taskudb, last_task, zo); } } - diff --git a/usr.sbin/nsd/dbcreate.c b/usr.sbin/nsd/dbcreate.c index f193792debb..f0fbb112784 100644 --- a/usr.sbin/nsd/dbcreate.c +++ b/usr.sbin/nsd/dbcreate.c @@ -1,7 +1,7 @@ /* * dbcreate.c -- routines to create an nsd(8) name database * - * Copyright (c) 2001-2011, NLnet Labs. All rights reserved. + * Copyright (c) 2001-2006, NLnet Labs. All rights reserved. * * See LICENSE for the license. 
* @@ -9,6 +9,7 @@ #include "config.h" +#include <sys/stat.h> #include <sys/types.h> #include <errno.h> #include <fcntl.h> @@ -17,266 +18,362 @@ #include <unistd.h> #include "namedb.h" +#include "udb.h" +#include "udbradtree.h" +#include "udbzone.h" +#include "options.h" -static int write_db (namedb_type *db); -static int write_number(struct namedb *db, uint32_t number); +/* pathname directory separator character */ +#define PATHSEP '/' -struct namedb * -namedb_new (const char *filename) +/** add an rdata (uncompressed) to the destination */ +static size_t +add_rdata(rr_type* rr, unsigned i, uint8_t* buf, size_t buflen) { - namedb_type *db; - /* Make a new structure... */ - if ((db = namedb_create()) == NULL) { - log_msg(LOG_ERR, - "insufficient memory to create database"); - return NULL; - } - db->filename = region_strdup(db->region, filename); - db->crc = 0xffffffff; - db->diff_skip = 0; - db->fd = NULL; - - if (gettimeofday(&(db->diff_timestamp), NULL) != 0) { - log_msg(LOG_ERR, "unable to load %s: cannot initialize " - "timestamp", db->filename); - namedb_destroy(db); - return NULL; - } - - /* - * Unlink the old database, if it exists. This is useful to - * ensure that NSD doesn't see the changes until a reload is done. - */ - if (unlink(db->filename) == -1 && errno != ENOENT) { - namedb_destroy(db); - return NULL; - } - - /* Create the database */ - if ((db->fd = fopen(db->filename, "w")) == NULL) { - namedb_destroy(db); - return NULL; - } - - if (!write_data_crc(db->fd, NAMEDB_MAGIC, NAMEDB_MAGIC_SIZE, &db->crc)) { - fclose(db->fd); - namedb_discard(db); - return NULL; + switch(rdata_atom_wireformat_type(rr->type, i)) { + case RDATA_WF_COMPRESSED_DNAME: + case RDATA_WF_UNCOMPRESSED_DNAME: + { + const dname_type* dname = domain_dname( + rdata_atom_domain(rr->rdatas[i])); + if(dname->name_size > buflen) + return 0; + memmove(buf, dname_name(dname), dname->name_size); + return dname->name_size; + } + default: + break; } - - return db; + memmove(buf, rdata_atom_data(rr->rdatas[i]), + rdata_atom_size(rr->rdatas[i])); + return rdata_atom_size(rr->rdatas[i]); } - -int -namedb_save (struct namedb *db) +/* marshal rdata into buffer, must be MAX_RDLENGTH in size */ +size_t +rr_marshal_rdata(rr_type* rr, uint8_t* rdata, size_t sz) { - if (write_db(db) != 0) { - return -1; - } - - /* Finish up and write the crc */ - if (!write_number(db, ~db->crc)) { - fclose(db->fd); - return -1; + size_t len = 0; + unsigned i; + assert(rr); + for(i=0; i<rr->rdata_count; i++) { + len += add_rdata(rr, i, rdata+len, sz-len); } - - /* Write the magic... 
*/ - if (!write_data_crc(db->fd, NAMEDB_MAGIC, NAMEDB_MAGIC_SIZE, &db->crc)) { - fclose(db->fd); - return -1; - } - - /* Close the database */ - fclose(db->fd); - namedb_destroy(db); - return 0; + return len; } - +/** delete an RR */ void -namedb_discard (struct namedb *db) +udb_del_rr(udb_base* udb, udb_ptr* z, rr_type* rr) { - unlink(db->filename); - namedb_destroy(db); + /* marshal the rdata (uncompressed) into a buffer */ + uint8_t rdata[MAX_RDLENGTH]; + size_t rdatalen = rr_marshal_rdata(rr, rdata, sizeof(rdata)); + assert(udb); + udb_zone_del_rr(udb, z, dname_name(domain_dname(rr->owner)), + domain_dname(rr->owner)->name_size, rr->type, rr->klass, + rdata, rdatalen); } -static int -write_dname(struct namedb *db, domain_type *domain) +/** write rr */ +int +udb_write_rr(udb_base* udb, udb_ptr* z, rr_type* rr) { - const dname_type *dname = domain_dname(domain); - - if (!write_data_crc(db->fd, &dname->name_size, sizeof(dname->name_size), &db->crc)) - return -1; - - if (!write_data_crc(db->fd, dname_name(dname), dname->name_size, &db->crc)) - return -1; - - return 0; + /* marshal the rdata (uncompressed) into a buffer */ + uint8_t rdata[MAX_RDLENGTH]; + size_t rdatalen = 0; + unsigned i; + assert(rr); + for(i=0; i<rr->rdata_count; i++) { + rdatalen += add_rdata(rr, i, rdata+rdatalen, + sizeof(rdata)-rdatalen); + } + assert(udb); + return udb_zone_add_rr(udb, z, dname_name(domain_dname(rr->owner)), + domain_dname(rr->owner)->name_size, rr->type, rr->klass, + rr->ttl, rdata, rdatalen); } +/** write rrset */ static int -write_number(struct namedb *db, uint32_t number) +write_rrset(udb_base* udb, udb_ptr* z, rrset_type* rrset) { - number = htonl(number); - return write_data_crc(db->fd, &number, sizeof(number), &db->crc); + unsigned i; + for(i=0; i<rrset->rr_count; i++) { + if(!udb_write_rr(udb, z, &rrset->rrs[i])) + return 0; + } + return 1; } +/** write a zone */ static int -write_rrset(struct namedb *db, domain_type *domain, rrset_type *rrset) +write_zone(udb_base* udb, udb_ptr* z, zone_type* zone) { - uint16_t rr_count; - int i, j; - uint16_t type; - uint16_t klass; - - assert(db); - assert(domain); - assert(rrset); - - rr_count = htons(rrset->rr_count); - - if (!write_number(db, domain->number)) - return 1; - - if (!write_number(db, rrset->zone->number)) - return 1; - - type = htons(rrset_rrtype(rrset)); - if (!write_data_crc(db->fd, &type, sizeof(type), &db->crc)) - return 1; - - klass = htons(rrset_rrclass(rrset)); - if (!write_data_crc(db->fd, &klass, sizeof(klass), &db->crc)) - return 1; - - if (!write_data_crc(db->fd, &rr_count, sizeof(rr_count), &db->crc)) - return 1; - - for (i = 0; i < rrset->rr_count; ++i) { - rr_type *rr = &rrset->rrs[i]; - uint32_t ttl; - uint16_t rdata_count; - - rdata_count = htons(rr->rdata_count); - if (!write_data_crc(db->fd, &rdata_count, sizeof(rdata_count), &db->crc)) - return 1; - - ttl = htonl(rr->ttl); - if (!write_data_crc(db->fd, &ttl, sizeof(ttl), &db->crc)) - return 1; - - for (j = 0; j < rr->rdata_count; ++j) { - rdata_atom_type atom = rr->rdatas[j]; - if (rdata_atom_is_domain(rr->type, j)) { - if (!write_number(db, rdata_atom_domain(atom)->number)) - return 1; - - } else { - uint16_t size = htons(rdata_atom_size(atom)); - if (!write_data_crc(db->fd, &size, sizeof(size), &db->crc)) - return 1; - - if (!write_data_crc(db->fd, - rdata_atom_data(atom), - rdata_atom_size(atom), &db->crc)) - return 1; - + /* write all domains in the zone */ + domain_type* walk; + rrset_type* rrset; + int n = 0, c = 0; + time_t t = time(NULL); + + /* count domains: 
for pct logging */ + for(walk=zone->apex; walk && domain_is_subdomain(walk, zone->apex); + walk=domain_next(walk)) { + n++; + } + /* write them */ + for(walk=zone->apex; walk && domain_is_subdomain(walk, zone->apex); + walk=domain_next(walk)) { + /* write all rrsets (in the zone) for this domain */ + for(rrset=walk->rrsets; rrset; rrset=rrset->next) { + if(rrset->zone == zone) { + if(!write_rrset(udb, z, rrset)) + return 0; } } + /* only check every ... domains, and print pct */ + if(++c % ZONEC_PCT_COUNT == 0 && time(NULL) > t + ZONEC_PCT_TIME) { + t = time(NULL); + VERBOSITY(1, (LOG_INFO, "write %s %d %%", + zone->opts->name, c*100/n)); + } } + return 1; +} - return 0; +/** create and write a zone */ +int +write_zone_to_udb(udb_base* udb, zone_type* zone, time_t mtime) +{ + udb_ptr z; + /* make udb dirty */ + udb_base_set_userflags(udb, 1); + /* find or create zone */ + if(udb_zone_search(udb, &z, dname_name(domain_dname(zone->apex)), + domain_dname(zone->apex)->name_size)) { + /* wipe existing contents */ + udb_zone_clear(udb, &z); + } else { + if(!udb_zone_create(udb, &z, dname_name(domain_dname( + zone->apex)), domain_dname(zone->apex)->name_size)) { + udb_base_set_userflags(udb, 0); + return 0; + } + } + /* set mtime */ + ZONE(&z)->mtime = (uint64_t)mtime; + ZONE(&z)->is_changed = 0; + udb_zone_set_log_str(udb, &z, NULL); + /* write zone */ + if(!write_zone(udb, &z, zone)) { + udb_base_set_userflags(udb, 0); + return 0; + } + udb_ptr_unlink(&z, udb); + udb_base_set_userflags(udb, 0); + return 1; } static int -number_dnames_iterator(domain_type *node, void *user_data) +print_rrs(FILE* out, struct zone* zone) { - uint32_t *current_number = (uint32_t *) user_data; - - node->number = *current_number; - ++*current_number; - - return 0; + rrset_type *rrset; + domain_type *domain = zone->apex; + region_type* region = region_create(xalloc, free); + struct state_pretty_rr* state = create_pretty_rr(region); + /* first print the SOA record for the zone */ + if(zone->soa_rrset) { + size_t i; + for(i=0; i < zone->soa_rrset->rr_count; i++) { + if(!print_rr(out, state, &zone->soa_rrset->rrs[i])){ + log_msg(LOG_ERR, "There was an error " + "printing SOARR to zone %s", + zone->opts->name); + region_destroy(region); + return 0; + } + } + } + /* go through entire tree below the zone apex (incl subzones) */ + while(domain && domain_is_subdomain(domain, zone->apex)) + { + for(rrset = domain->rrsets; rrset; rrset=rrset->next) + { + size_t i; + if(rrset->zone != zone || rrset == zone->soa_rrset) + continue; + for(i=0; i < rrset->rr_count; i++) { + if(!print_rr(out, state, &rrset->rrs[i])){ + log_msg(LOG_ERR, "There was an error " + "printing RR to zone %s", + zone->opts->name); + region_destroy(region); + return 0; + } + } + } + domain = domain_next(domain); + } + region_destroy(region); + return 1; } static int -write_dname_iterator(domain_type *node, void *user_data) +print_header(zone_type* zone, FILE* out, time_t* now, const char* logs) { - namedb_type *db = (namedb_type *) user_data; - - return write_dname(db, node); + char buf[4096]; + /* ctime prints newline at end of this line */ + snprintf(buf, sizeof(buf), "; zone %s written by NSD %s on %s", + zone->opts->name, PACKAGE_VERSION, ctime(now)); + if(!write_data(out, buf, strlen(buf))) + return 0; + if(!logs || logs[0] == 0) return 1; + snprintf(buf, sizeof(buf), "; %s\n", logs); + return write_data(out, buf, strlen(buf)); } static int -write_domain_iterator(domain_type *node, void *user_data) +write_to_zonefile(zone_type* zone, const char* filename, 
const char* logs) { - namedb_type *db = (namedb_type *) user_data; - rrset_type *rrset; - int error = 0; - - for (rrset = node->rrsets; rrset; rrset = rrset->next) { - error += write_rrset(db, node, rrset); + time_t now = time(0); + FILE *out; + VERBOSITY(1, (LOG_INFO, "writing zone %s to file %s", + zone->opts->name, filename)); + + out = fopen(filename, "w"); + if(!out) { + log_msg(LOG_ERR, "cannot write zone %s file %s: %s", + zone->opts->name, filename, strerror(errno)); + return 0; + } + if(!print_header(zone, out, &now, logs)) { + fclose(out); + log_msg(LOG_ERR, "There was an error printing " + "the header to zone %s", zone->opts->name); + return 0; + } + if(!print_rrs(out, zone)) { + fclose(out); + return 0; } + fclose(out); + return 1; +} - return error; +/** create directories above this file, .../dir/dir/dir/file */ +int +create_dirs(const char* path) +{ + char dir[4096]; + char* p; + strlcpy(dir, path, sizeof(dir)); + /* if we start with / then do not try to create '' */ + if(dir[0] == PATHSEP) + p = strchr(dir+1, PATHSEP); + else p = strchr(dir, PATHSEP); + /* create each directory component from the left */ + while(p) { + assert(*p == PATHSEP); + *p = 0; /* end the directory name here */ + if(mkdir(dir +#ifndef MKDIR_HAS_ONE_ARG + , 0750 +#endif + ) == -1) { + if(errno != EEXIST) { + log_msg(LOG_ERR, "create dir %s: %s", + dir, strerror(errno)); + return 0; + } + /* it already exists, OK, continue */ + } + *p = PATHSEP; + p = strchr(p+1, PATHSEP); + } + return 1; } -/* - * Writes databse data into open database *db - * - * Returns zero if success. - */ +/** create pathname components and check if file exists */ static int -write_db(namedb_type *db) +create_path_components(const char* path, int* notexist) { - zone_type *zone; - uint32_t terminator = 0; - uint32_t dname_count = 1; - uint32_t zone_count = 1; - int errors = 0; - - for (zone = db->zones; zone; zone = zone->next) { - zone->number = zone_count; - ++zone_count; - - if (!zone->soa_rrset) { - fprintf(stderr, "SOA record not present in %s\n", - dname_to_string(domain_dname(zone->apex), - NULL)); - ++errors; + /* stat the file, to see if it exists, and if its directories exist */ + struct stat s; + if(stat(path, &s) != 0) { + if(errno == ENOENT) { + *notexist = 1; + /* see if we need to create pathname components */ + return create_dirs(path); } + log_msg(LOG_ERR, "cannot stat %s: %s", path, strerror(errno)); + return 0; } + *notexist = 0; + return 1; +} - if (errors > 0) - return -1; - - --zone_count; - if (!write_number(db, zone_count)) - return -1; - for (zone = db->zones; zone; zone = zone->next) { - if (write_dname(db, zone->apex)) - return -1; +void +namedb_write_zonefile(namedb_type* db, zone_options_t* zopt) +{ + const char* zfile; + int notexist = 0; + zone_type* zone; + /* if no zone exists, it has no contents or it has no zonefile + * configured, then no need to write data to disk */ + if(!zopt->pattern->zonefile) + return; + zone = namedb_find_zone(db, (const dname_type*)zopt->node.key); + if(!zone || !zone->apex) + return; + /* write if file does not exist, or if changed */ + /* so, determine filename, create directory components, check exist*/ + zfile = config_make_zonefile(zopt); + if(!create_path_components(zfile, ¬exist)) { + log_msg(LOG_ERR, "could not write zone %s to file %s because " + "the path could not be created", zopt->name, zfile); + return; } - if (domain_table_iterate(db->domains, number_dnames_iterator, &dname_count)) - return -1; - - --dname_count; - if (!write_number(db, dname_count)) - 
return -1; - - DEBUG(DEBUG_ZONEC, 1, - (LOG_INFO, "Storing %lu domain names\n", (unsigned long) dname_count)); - - if (domain_table_iterate(db->domains, write_dname_iterator, db)) - return -1; - - if (domain_table_iterate(db->domains, write_domain_iterator, db)) - return -1; - - if (!write_data_crc(db->fd, &terminator, sizeof(terminator), &db->crc)) - return -1; + /* if not changed, do not write. */ + if(notexist || zone->is_changed) { + char logs[4096]; + char bakfile[4096]; + udb_ptr zudb; + if(!udb_zone_search(db->udb, &zudb, + dname_name(domain_dname(zone->apex)), + domain_dname(zone->apex)->name_size)) + return; /* zone does not exist in db */ + /* write to zfile~ first, then rename if that works */ + snprintf(bakfile, sizeof(bakfile), "%s~", zfile); + if(ZONE(&zudb)->log_str.data) { + udb_ptr s; + udb_ptr_new(&s, db->udb, &ZONE(&zudb)->log_str); + strlcpy(logs, (char*)udb_ptr_data(&s), sizeof(logs)); + udb_ptr_unlink(&s, db->udb); + } else logs[0] = 0; + if(!write_to_zonefile(zone, bakfile, logs)) { + udb_ptr_unlink(&zudb, db->udb); + return; /* error already printed */ + } + if(rename(bakfile, zfile) == -1) { + log_msg(LOG_ERR, "rename(%s to %s) failed: %s", + bakfile, zfile, strerror(errno)); + udb_ptr_unlink(&zudb, db->udb); + return; + } + zone->is_changed = 0; + ZONE(&zudb)->mtime = (uint64_t)time(0); + ZONE(&zudb)->is_changed = 0; + udb_zone_set_log_str(db->udb, &zudb, NULL); + udb_ptr_unlink(&zudb, db->udb); + } +} - return 0; +void +namedb_write_zonefiles(namedb_type* db, nsd_options_t* options) +{ + zone_options_t* zo; + RBTREE_FOR(zo, zone_options_t*, options->zone_options) { + namedb_write_zonefile(db, zo); + } } diff --git a/usr.sbin/nsd/difffile.c b/usr.sbin/nsd/difffile.c index 2b6d721d878..0719cf6f9dc 100644 --- a/usr.sbin/nsd/difffile.c +++ b/usr.sbin/nsd/difffile.c @@ -1,7 +1,7 @@ /* * difffile.c - DIFF file handling source code. Read and write diff files. * - * Copyright (c) 2001-2011, NLnet Labs. All rights reserved. + * Copyright (c) 2001-2006, NLnet Labs. All rights reserved. * * See LICENSE for the license. 
* @@ -14,22 +14,26 @@ #include <stdlib.h> #include <errno.h> #include "difffile.h" +#include "xfrd-disk.h" #include "util.h" #include "packet.h" #include "rdata.h" +#include "udb.h" +#include "udbzone.h" #include "nsec3.h" +#include "nsd.h" +#include "rrl.h" static int -write_32(FILE *out, uint32_t val) +write_64(FILE *out, uint64_t val) { - val = htonl(val); return write_data(out, &val, sizeof(val)); } static int -write_16(FILE *out, uint16_t val) +write_32(FILE *out, uint32_t val) { - val = htons(val); + val = htonl(val); return write_data(out, &val, sizeof(val)); } @@ -49,142 +53,117 @@ write_str(FILE *out, const char* str) } void -diff_write_packet(const char* zone, uint32_t new_serial, uint16_t id, - uint32_t seq_nr, uint8_t* data, size_t len, nsd_options_t* opt) +diff_write_packet(const char* zone, const char* pat, uint32_t old_serial, + uint32_t new_serial, uint32_t seq_nr, uint8_t* data, size_t len, + struct nsd* nsd, uint64_t filenumber) { - const char* filename = opt->difffile; - struct timeval tv; - FILE *df; - uint32_t file_len = sizeof(uint32_t) + strlen(zone) + - sizeof(new_serial) + sizeof(id) + sizeof(seq_nr) + len; - - if (gettimeofday(&tv, NULL) != 0) { - log_msg(LOG_ERR, "could not set timestamp for %s: %s", - filename, strerror(errno)); + FILE* df = xfrd_open_xfrfile(nsd, filenumber, seq_nr?"a":"w"); + if(!df) { + log_msg(LOG_ERR, "could not open transfer %s file %lld: %s", + zone, (long long)filenumber, strerror(errno)); return; } - df = fopen(filename, "a"); - if(!df) { - log_msg(LOG_ERR, "could not open file %s for append: %s", - filename, strerror(errno)); - return; + /* if first part, first write the header */ + if(seq_nr == 0) { + struct timeval tv; + if (gettimeofday(&tv, NULL) != 0) { + log_msg(LOG_ERR, "could not get timestamp for %s: %s", + zone, strerror(errno)); + } + if(!write_32(df, DIFF_PART_XFRF) || + !write_8(df, 0) /* notcommitted(yet) */ || + !write_32(df, 0) /* numberofparts when done */ || + !write_64(df, (uint64_t) tv.tv_sec) || + !write_32(df, (uint32_t) tv.tv_usec) || + !write_32(df, old_serial) || + !write_32(df, new_serial) || + !write_64(df, (uint64_t) tv.tv_sec) || + !write_32(df, (uint32_t) tv.tv_usec) || + !write_str(df, zone) || + !write_str(df, pat)) { + log_msg(LOG_ERR, "could not write transfer %s file %lld: %s", + zone, (long long)filenumber, strerror(errno)); + fclose(df); + return; + } } - if(!write_32(df, DIFF_PART_IXFR) || - !write_32(df, (uint32_t) tv.tv_sec) || - !write_32(df, (uint32_t) tv.tv_usec) || - !write_32(df, file_len) || - !write_str(df, zone) || - !write_32(df, new_serial) || - !write_16(df, id) || - !write_32(df, seq_nr) || + if(!write_32(df, DIFF_PART_XXFR) || + !write_32(df, len) || !write_data(df, data, len) || - !write_32(df, file_len)) + !write_32(df, len)) { - log_msg(LOG_ERR, "could not write to file %s: %s", - filename, strerror(errno)); + log_msg(LOG_ERR, "could not write transfer %s file %lld: %s", + zone, (long long)filenumber, strerror(errno)); } fclose(df); } void -diff_write_commit(const char* zone, uint32_t old_serial, - uint32_t new_serial, uint16_t id, uint32_t num_parts, - uint8_t commit, const char* log_str, nsd_options_t* opt) +diff_write_commit(const char* zone, uint32_t old_serial, uint32_t new_serial, + uint32_t num_parts, uint8_t commit, const char* log_str, + struct nsd* nsd, uint64_t filenumber) { - const char* filename = opt->difffile; struct timeval tv; - FILE *df; - uint32_t len; + FILE* df; if (gettimeofday(&tv, NULL) != 0) { log_msg(LOG_ERR, "could not set timestamp for %s: %s", - 
filename, strerror(errno)); - return; + zone, strerror(errno)); } - df = fopen(filename, "a"); + /* overwrite the first part of the file with 'committed = 1', + * as well as the end_time and number of parts. + * also write old_serial and new_serial, so that a bad file mixup + * will result in unusable serial numbers. */ + + df = xfrd_open_xfrfile(nsd, filenumber, "r+"); if(!df) { - log_msg(LOG_ERR, "could not open file %s for append: %s", - filename, strerror(errno)); + log_msg(LOG_ERR, "could not open transfer %s file %lld: %s", + zone, (long long)filenumber, strerror(errno)); return; } - - len = strlen(zone) + sizeof(len) + sizeof(old_serial) + - sizeof(new_serial) + sizeof(id) + sizeof(num_parts) + - sizeof(commit) + strlen(log_str) + sizeof(len); - - if(!write_32(df, DIFF_PART_SURE) || - !write_32(df, (uint32_t) tv.tv_sec) || + if(!write_32(df, DIFF_PART_XFRF) || + !write_8(df, commit) /* committed */ || + !write_32(df, num_parts) || + !write_64(df, (uint64_t) tv.tv_sec) || !write_32(df, (uint32_t) tv.tv_usec) || - !write_32(df, len) || - !write_str(df, zone) || !write_32(df, old_serial) || - !write_32(df, new_serial) || - !write_16(df, id) || - !write_32(df, num_parts) || - !write_8(df, commit) || - !write_str(df, log_str) || - !write_32(df, len)) + !write_32(df, new_serial)) { - log_msg(LOG_ERR, "could not write to file %s: %s", - filename, strerror(errno)); + log_msg(LOG_ERR, "could not write transfer %s file %lld: %s", + zone, (long long)filenumber, strerror(errno)); + fclose(df); + return; + } + + /* append the log_str to the end of the file */ + if(fseek(df, 0, SEEK_END) == -1) { + log_msg(LOG_ERR, "could not fseek transfer %s file %lld: %s", + zone, (long long)filenumber, strerror(errno)); + fclose(df); + return; + } + if(!write_str(df, log_str)) { + log_msg(LOG_ERR, "could not write transfer %s file %lld: %s", + zone, (long long)filenumber, strerror(errno)); + fclose(df); + return; + } fflush(df); fclose(df); } -/* - * Checksum to signal no data change occured (for example, by a - * zonec run. - */ int -db_crc_different(namedb_type* db) +diff_read_64(FILE *in, uint64_t* result) { - FILE *fd = fopen(db->filename, "r"); - uint32_t crc_file; - char buf[NAMEDB_MAGIC_SIZE]; - if(fd == NULL) { - log_msg(LOG_ERR, "unable to load %s: %s", - db->filename, strerror(errno)); - return -1; - } - - /* seek to position of CRC, check it and magic no */ - if(fseeko(fd, db->crc_pos, SEEK_SET)==-1) { - log_msg(LOG_ERR, "unable to fseeko %s: %s. db changed?", - db->filename, strerror(errno)); - fclose(fd); - return -1; - } - - if(fread(&crc_file, sizeof(crc_file), 1, fd) != 1) { - if(!feof(fd)) - log_msg(LOG_ERR, "could not read %s CRC: %s. " - "db changed?", db->filename, strerror(errno)); - fclose(fd); - return -1; - } - crc_file = ntohl(crc_file); - - if(fread(buf, sizeof(char), sizeof(buf), fd) != sizeof(buf)) { - if(!feof(fd)) - log_msg(LOG_ERR, "could not read %s magic: %s. 
" - "db changed?", db->filename, strerror(errno)); - fclose(fd); - return -1; - } - if(memcmp(buf, NAMEDB_MAGIC, NAMEDB_MAGIC_SIZE) != 0) { - fclose(fd); - return -1; - } - - fclose(fd); - - if(db->crc == crc_file) + if (fread(result, sizeof(*result), 1, in) == 1) { + return 1; + } else { return 0; - return 1; + } } int @@ -199,17 +178,6 @@ diff_read_32(FILE *in, uint32_t* result) } int -diff_read_16(FILE *in, uint16_t* result) -{ - if (fread(result, sizeof(*result), 1, in) == 1) { - *result = ntohs(*result); - return 1; - } else { - return 0; - } -} - -int diff_read_8(FILE *in, uint8_t* result) { if (fread(result, sizeof(*result), 1, in) == 1) { @@ -259,7 +227,7 @@ has_data_below(domain_type* top) assert(d != NULL); /* in the canonical ordering subdomains are after this name */ d = domain_next(d); - while(d != NULL && dname_is_subdomain(domain_dname(d), domain_dname(top))) { + while(d != NULL && domain_is_subdomain(d, top)) { if(d->is_existing) return 1; d = domain_next(d); @@ -267,35 +235,8 @@ has_data_below(domain_type* top) return 0; } - -/* this routine makes empty terminals non-existent. - * @domain the lowest empty terminal - * @ce the closest encloser - */ -static domain_type* -rrset_delete_empty_terminals(domain_type* domain, domain_type* ce) -{ - assert(domain); - if (domain->rrsets == 0) { - /* if there is no data below it, it becomes non existing. - also empty nonterminals above it become nonexisting */ - /* check for data below this node. */ - if(!has_data_below(domain)) { - /* nonexist this domain and all parent empty nonterminals */ - domain_type* p = domain; - while(p != NULL && p->rrsets == 0) { - if(p == ce || has_data_below(p)) - return p; - p->is_existing = 0; - p = p->parent; - } - } - } - return NULL; -} - - -static domain_type* +/** remove rrset. Adjusts zone params. Does not remove domain */ +static void rrset_delete(namedb_type* db, domain_type* domain, rrset_type* rrset) { int i; @@ -306,40 +247,29 @@ rrset_delete(namedb_type* db, domain_type* domain, rrset_type* rrset) } if(!*pp) { /* rrset does not exist for domain */ - return NULL; + return; } *pp = rrset->next; DEBUG(DEBUG_XFRD,2, (LOG_INFO, "delete rrset of %s type %s", - dname_to_string(domain_dname(domain),0), + domain_to_string(domain), rrtype_to_string(rrset_rrtype(rrset)))); /* is this a SOA rrset ? */ if(rrset->zone->soa_rrset == rrset) { rrset->zone->soa_rrset = 0; - rrset->zone->updated = 1; - domain->has_SOA = 0; } if(rrset->zone->ns_rrset == rrset) { rrset->zone->ns_rrset = 0; } if(domain == rrset->zone->apex && rrset_rrtype(rrset) == TYPE_RRSIG) { for (i = 0; i < rrset->rr_count; ++i) { - if (rr_rrsig_type_covered(&rrset->rrs[i]) == TYPE_DNSKEY) { + if(rr_rrsig_type_covered(&rrset->rrs[i])==TYPE_DNSKEY) { rrset->zone->is_secure = 0; break; } } } - -#ifdef NSEC3 -#ifndef FULL_PREHASH - if (rrset->rrs[0].type == TYPE_NSEC3) { - namedb_del_nsec3_domain(db, domain, rrset->zone); - } -#endif /* !FULL_PREHASH */ -#endif /* NSEC3 */ - /* recycle the memory space of the rrset */ for (i = 0; i < rrset->rr_count; ++i) add_rdata_to_recyclebin(db, &rrset->rrs[i]); @@ -349,60 +279,334 @@ rrset_delete(namedb_type* db, domain_type* domain, rrset_type* rrset) region_recycle(db->region, rrset, sizeof(rrset_type)); /* is the node now an empty node (completely deleted) */ - if (domain->rrsets == 0) { - return domain; + if(domain->rrsets == 0) { + /* if there is no data below it, it becomes non existing. + also empty nonterminals above it become nonexisting */ + /* check for data below this node. 
*/ + if(!has_data_below(domain)) { + /* nonexist this domain and all parent empty nonterminals */ + domain_type* p = domain; + while(p != NULL && p->rrsets == 0) { + if(has_data_below(p)) + break; + p->is_existing = 0; + p = p->parent; + } + } } - return NULL; } static int -rdatas_equal(rdata_atom_type *a, rdata_atom_type *b, int num, uint16_t type) +rdatas_equal(rdata_atom_type *a, rdata_atom_type *b, int num, uint16_t type, + int* rdnum, char** reason) { int k; for(k = 0; k < num; k++) { if(rdata_atom_is_domain(type, k)) { if(dname_compare(domain_dname(a[k].domain), - domain_dname(b[k].domain))!=0) + domain_dname(b[k].domain))!=0) { + *rdnum = k; + *reason = "dname data"; + return 0; + } + } else if(rdata_atom_is_literal_domain(type, k)) { + /* literal dname, but compare case insensitive */ + if(a[k].data[0] != b[k].data[0]) { + *rdnum = k; + *reason = "literal dname len"; + return 0; /* uncompressed len must be equal*/ + } + if(!dname_equal_nocase((uint8_t*)(a[k].data+1), + (uint8_t*)(b[k].data+1), a[k].data[0])) { + *rdnum = k; + *reason = "literal dname data"; return 0; + } } else { /* check length */ - if(a[k].data[0] != b[k].data[0]) + if(a[k].data[0] != b[k].data[0]) { + *rdnum = k; + *reason = "rdata len"; return 0; + } /* check data */ - if(memcmp(a[k].data+1, b[k].data+1, a[k].data[0])!=0) + if(memcmp(a[k].data+1, b[k].data+1, a[k].data[0])!=0) { + *rdnum = k; + *reason = "rdata data"; return 0; + } } } return 1; } -static int -find_rr_num(rrset_type* rrset, - uint16_t type, uint16_t klass, +static void +debug_find_rr_num(rrset_type* rrset, uint16_t type, uint16_t klass, rdata_atom_type *rdatas, ssize_t rdata_num) { - int i; + int i, rd; + char* reason = ""; + + for(i=0; i < rrset->rr_count; ++i) { + if (rrset->rrs[i].type != type) { + log_msg(LOG_WARNING, "diff: RR <%s, %s> does not match " + "RR num %d type %s", + dname_to_string(rrset->rrs[i].owner->dname,0), + rrtype_to_string(type), i, + rrtype_to_string(rrset->rrs[i].type)); + } + if (rrset->rrs[i].klass != klass) { + log_msg(LOG_WARNING, "diff: RR <%s, %s> class %d " + "does not match RR num %d class %d", + dname_to_string(rrset->rrs[i].owner->dname,0), + rrtype_to_string(type), + klass, i, + rrset->rrs[i].klass); + } + if (rrset->rrs[i].rdata_count != rdata_num) { + log_msg(LOG_WARNING, "diff: RR <%s, %s> rdlen %u " + "does not match RR num %d rdlen %d", + dname_to_string(rrset->rrs[i].owner->dname,0), + rrtype_to_string(type), + (unsigned) rdata_num, i, + (unsigned) rrset->rrs[i].rdata_count); + } + if (!rdatas_equal(rdatas, rrset->rrs[i].rdatas, rdata_num, type, + &rd, &reason)) { + log_msg(LOG_WARNING, "diff: RR <%s, %s> rdata element " + "%d differs from RR num %d rdata (%s)", + dname_to_string(rrset->rrs[i].owner->dname,0), + rrtype_to_string(type), + rd, i, reason); + } + } +} + +static int +find_rr_num(rrset_type* rrset, uint16_t type, uint16_t klass, + rdata_atom_type *rdatas, ssize_t rdata_num, int add) +{ + int i, rd; + char* reason; for(i=0; i < rrset->rr_count; ++i) { if(rrset->rrs[i].type == type && rrset->rrs[i].klass == klass && rrset->rrs[i].rdata_count == rdata_num && - rdatas_equal(rdatas, rrset->rrs[i].rdatas, rdata_num, type)) + rdatas_equal(rdatas, rrset->rrs[i].rdatas, rdata_num, type, + &rd, &reason)) { return i; } } - + /* this is odd. Log why rr cannot be found. 
*/ + if (!add) { + debug_find_rr_num(rrset, type, klass, rdatas, rdata_num); + } return -1; } -static int +#ifdef NSEC3 +/* see if nsec3 deletion triggers need action */ +static void +nsec3_delete_rr_trigger(namedb_type* db, rr_type* rr, zone_type* zone, + udb_ptr* udbz) +{ + /* the RR has not actually been deleted yet, so we can inspect it */ + if(!zone->nsec3_param) + return; + /* see if the domain was an NSEC3-domain in the chain, but no longer */ + if(rr->type == TYPE_NSEC3 && rr->owner->nsec3 && + rr->owner->nsec3->nsec3_node.key && + nsec3_rr_uses_params(rr, zone) && + nsec3_in_chain_count(rr->owner, zone) <= 1) { + domain_type* prev = nsec3_chain_find_prev(zone, rr->owner); + /* remove from prehash because no longer an NSEC3 domain */ + if(domain_is_prehash(db->domains, rr->owner)) + prehash_del(db->domains, rr->owner); + /* fixup the last in the zone */ + if(rr->owner == zone->nsec3_last) + zone->nsec3_last = prev; + /* unlink from the nsec3tree */ + zone_del_domain_in_hash_tree(zone->nsec3tree, + &rr->owner->nsec3->nsec3_node); + /* add previous NSEC3 to the prehash list */ + if(prev && prev != rr->owner) + prehash_add(db->domains, prev); + else nsec3_clear_precompile(db, zone); + /* this domain becomes ordinary data domain: done later */ + } + /* see if the rr was NSEC3PARAM that we were using */ + else if(rr->type == TYPE_NSEC3PARAM && rr == zone->nsec3_param) { + /* clear trees, wipe hashes, wipe precompile */ + nsec3_clear_precompile(db, zone); + /* pick up new nsec3param from udb */ + nsec3_find_zone_param(db, zone, udbz); + /* if no more NSEC3, done */ + if(!zone->nsec3_param) + return; + nsec3_precompile_newparam(db, zone); + } +} + +/* see if nsec3 prehash can be removed with new rrset content */ +static void +nsec3_rrsets_changed_remove_prehash(domain_type* domain, zone_type* zone) +{ + /* deletion of rrset already done, we can check if conditions apply */ + /* see if the domain is no longer precompiled */ + /* it has a hash_node, but no longer fulfills conditions */ + if(nsec3_domain_part_of_zone(domain, zone) && domain->nsec3 && + domain->nsec3->hash_node.key && + !nsec3_condition_hash(domain, zone)) { + /* remove precompile */ + domain->nsec3->nsec3_cover = NULL; + domain->nsec3->nsec3_wcard_child_cover = NULL; + domain->nsec3->nsec3_is_exact = 0; + /* remove it from the hash tree */ + zone_del_domain_in_hash_tree(zone->hashtree, + &domain->nsec3->hash_node); + zone_del_domain_in_hash_tree(zone->wchashtree, + &domain->nsec3->wchash_node); + } + if(domain != zone->apex && domain->nsec3 && + domain->nsec3->dshash_node.key && + !nsec3_condition_dshash(domain, zone)) { + /* remove precompile */ + domain->nsec3->nsec3_ds_parent_cover = NULL; + domain->nsec3->nsec3_ds_parent_is_exact = 0; + /* remove it from the hash tree */ + zone_del_domain_in_hash_tree(zone->dshashtree, + &domain->nsec3->dshash_node); + } +} + +/* see if domain needs to get precompiled info */ +static void +nsec3_rrsets_changed_add_prehash(namedb_type* db, domain_type* domain, + zone_type* zone) +{ + if(!zone->nsec3_param) + return; + if((!domain->nsec3 || !domain->nsec3->hash_node.key) + && nsec3_condition_hash(domain, zone)) { + region_type* tmpregion = region_create(xalloc, free); + nsec3_precompile_domain(db, domain, zone, tmpregion); + region_destroy(tmpregion); + } + if((!domain->nsec3 || !domain->nsec3->dshash_node.key) + && nsec3_condition_dshash(domain, zone)) { + nsec3_precompile_domain_ds(db, domain, zone); + } +} + +/* see if nsec3 rrset-deletion triggers need action */ +static void 
+nsec3_delete_rrset_trigger(namedb_type* db, domain_type* domain, + zone_type* zone, uint16_t type) +{ + if(!zone->nsec3_param) + return; + nsec3_rrsets_changed_remove_prehash(domain, zone); + /* for type nsec3, or a delegation, the domain may have become a + * 'normal' domain with its remaining data now */ + if(type == TYPE_NSEC3 || type == TYPE_NS || type == TYPE_DS) + nsec3_rrsets_changed_add_prehash(db, domain, zone); + /* for type DNAME or a delegation, obscured data may be revealed */ + if(type == TYPE_NS || type == TYPE_DS || type == TYPE_DNAME) { + /* walk over subdomains and check them each */ + domain_type *d; + for(d=domain_next(domain); d && domain_is_subdomain(d, domain); + d=domain_next(d)) { + nsec3_rrsets_changed_add_prehash(db, d, zone); + } + } +} + +/* see if nsec3 addition triggers need action */ +static void +nsec3_add_rr_trigger(namedb_type* db, rr_type* rr, zone_type* zone, + udb_ptr* udbz) +{ + /* the RR has been added in full, also to UDB (and thus NSEC3PARAM + * in the udb has been adjusted) */ + if(zone->nsec3_param && rr->type == TYPE_NSEC3 && + (!rr->owner->nsec3 || !rr->owner->nsec3->nsec3_node.key) + && nsec3_rr_uses_params(rr, zone)) { + /* added NSEC3 into the chain */ + nsec3_precompile_nsec3rr(db, rr->owner, zone); + /* the domain has become an NSEC3-domain, if it was precompiled + * previously, remove that, neatly done in routine above */ + nsec3_rrsets_changed_remove_prehash(rr->owner, zone); + /* set this NSEC3 to prehash */ + prehash_add(db->domains, rr->owner); + } else if(!zone->nsec3_param && rr->type == TYPE_NSEC3PARAM) { + /* see if this means NSEC3 chain can be used */ + nsec3_find_zone_param(db, zone, udbz); + if(!zone->nsec3_param) + return; + nsec3_zone_trees_create(db->region, zone); + nsec3_precompile_newparam(db, zone); + } +} + +/* see if nsec3 rrset-addition triggers need action */ +static void +nsec3_add_rrset_trigger(namedb_type* db, domain_type* domain, zone_type* zone, + uint16_t type) +{ + /* the rrset has been added so we can inspect it */ + if(!zone->nsec3_param) + return; + /* because the rrset is added we can check conditions easily. 
+ * check if domain needs to become precompiled now */ + nsec3_rrsets_changed_add_prehash(db, domain, zone); + /* if a delegation, it changes from normal name to unhashed referral */ + if(type == TYPE_NS || type == TYPE_DS) { + nsec3_rrsets_changed_remove_prehash(domain, zone); + } + /* if delegation or DNAME added, then some RRs may get obscured */ + if(type == TYPE_NS || type == TYPE_DS || type == TYPE_DNAME) { + /* walk over subdomains and check them each */ + domain_type *d; + for(d=domain_next(domain); d && domain_is_subdomain(d, domain); + d=domain_next(d)) { + nsec3_rrsets_changed_remove_prehash(d, zone); + } + } +} +#endif /* NSEC3 */ + +/* fixup usage lower for domain names in the rdata */ +static void +rr_lower_usage(namedb_type* db, rr_type* rr) +{ + unsigned i; + for(i=0; i<rr->rdata_count; i++) { + if(rdata_atom_is_domain(rr->type, i)) { + assert(rdata_atom_domain(rr->rdatas[i])->usage > 0); + rdata_atom_domain(rr->rdatas[i])->usage --; + if(rdata_atom_domain(rr->rdatas[i])->usage == 0) + domain_table_deldomain(db, + rdata_atom_domain(rr->rdatas[i])); + } + } +} + +static void +rrset_lower_usage(namedb_type* db, rrset_type* rrset) +{ + unsigned i; + for(i=0; i<rrset->rr_count; i++) + rr_lower_usage(db, &rrset->rrs[i]); +} + +int delete_RR(namedb_type* db, const dname_type* dname, uint16_t type, uint16_t klass, - domain_type* prevdomain, buffer_type* packet, size_t rdatalen, zone_type *zone, - region_type* temp_region, int is_axfr) + region_type* temp_region, udb_ptr* udbz) { domain_type *domain; rrset_type *rrset; @@ -436,36 +640,30 @@ delete_RR(namedb_type* db, const dname_type* dname, dname_to_string(dname,0)); return 0; } - rrnum = find_rr_num(rrset, type, klass, rdatas, rdata_num); + rrnum = find_rr_num(rrset, type, klass, rdatas, rdata_num, 0); if(rrnum == -1) { log_msg(LOG_WARNING, "diff: RR <%s, %s> does not exist", dname_to_string(dname,0), rrtype_to_string(type)); return 1; /* not fatal error */ } + /* delete the normalized RR from the udb */ + udb_del_rr(db->udb, udbz, &rrset->rrs[rrnum]); #ifdef NSEC3 -#ifndef FULL_PREHASH - if (is_axfr == 0) { - struct domain *parent = domain; - do { - if (0 != namedb_add_nsec3_mod_domain(db, - parent)) { - return 0; - } - parent = parent->parent; - } while (parent != zone->apex->parent); - } -#else - (void)is_axfr; -#endif /* !FULL_PREHASH */ -#endif /* NSEC3 */ - + /* process triggers for RR deletions */ + nsec3_delete_rr_trigger(db, &rrset->rrs[rrnum], zone, udbz); +#endif + /* lower usage (possibly deleting other domains, and thus + * invalidating the current RR's domain pointers) */ + rr_lower_usage(db, &rrset->rrs[rrnum]); if(rrset->rr_count == 1) { /* delete entire rrset */ - domain = rrset_delete(db, domain, rrset); - if (domain && domain != prevdomain && !domain->nextdiff) { - /* this domain is not yet in the diff chain */ - prevdomain->nextdiff = domain; - } + rrset_delete(db, domain, rrset); +#ifdef NSEC3 + /* cleanup nsec3 */ + nsec3_delete_rrset_trigger(db, domain, zone, type); +#endif + /* see if the domain can be deleted (and inspect parents) */ + domain_table_deldomain(db, domain); } else { /* swap out the bad RR and decrease the count */ rr_type* rrs_orig = rrset->rrs; @@ -482,17 +680,40 @@ delete_RR(namedb_type* db, const dname_type* dname, } region_recycle(db->region, rrs_orig, sizeof(rr_type) * rrset->rr_count); +#ifdef NSEC3 + if(type == TYPE_NSEC3PARAM && zone->nsec3_param) { + /* fixup nsec3_param pointer to same RR */ + assert(zone->nsec3_param >= rrs_orig && + zone->nsec3_param <= + 
rrs_orig+rrset->rr_count); + /* last moved to rrnum, others at same index*/ + if(zone->nsec3_param == &rrs_orig[ + rrset->rr_count-1]) + zone->nsec3_param = &rrset->rrs[rrnum]; + else + zone->nsec3_param = + (void*)zone->nsec3_param + -(void*)rrs_orig + + (void*)rrset->rrs; + } +#endif /* NSEC3 */ rrset->rr_count --; +#ifdef NSEC3 + /* for type nsec3, the domain may have become a + * 'normal' domain with its remaining data now */ + if(type == TYPE_NSEC3) + nsec3_rrsets_changed_add_prehash(db, domain, + zone); +#endif /* NSEC3 */ } } return 1; } -static int +int add_RR(namedb_type* db, const dname_type* dname, uint16_t type, uint16_t klass, uint32_t ttl, - buffer_type* packet, size_t rdatalen, zone_type *zone, - int is_axfr) + buffer_type* packet, size_t rdatalen, zone_type *zone, udb_ptr* udbz) { domain_type* domain; rrset_type* rrset; @@ -500,6 +721,7 @@ add_RR(namedb_type* db, const dname_type* dname, rr_type *rrs_old; ssize_t rdata_num; int rrnum; + int rrset_added = 0; domain = domain_table_find(db->domains, dname); if(!domain) { /* create the domain */ @@ -517,6 +739,7 @@ add_RR(namedb_type* db, const dname_type* dname, rrset->rrs = 0; rrset->rr_count = 0; domain_add_rrset(domain, rrset); + rrset_added = 1; } /* dnames in rdata are normalized, conform RFC 4035, @@ -529,21 +752,13 @@ add_RR(namedb_type* db, const dname_type* dname, dname_to_string(dname,0)); return 0; } - rrnum = find_rr_num(rrset, type, klass, rdatas, rdata_num); + rrnum = find_rr_num(rrset, type, klass, rdatas, rdata_num, 1); if(rrnum != -1) { DEBUG(DEBUG_XFRD, 2, (LOG_ERR, "diff: RR <%s, %s> already exists", dname_to_string(dname,0), rrtype_to_string(type))); /* ignore already existing RR: lenient accepting of messages */ return 1; } - if(domain == zone->apex) { - /* make sure we don't get multiple soa rrs */ - if (type == TYPE_SOA && rrset->rr_count > 0) { - log_msg(LOG_ERR, "diff: multiple soa records for %s", - dname_to_string(dname,0)); - return 0; - } - } /* re-alloc the rrs and add the new */ rrs_old = rrset->rrs; @@ -567,190 +782,104 @@ add_RR(namedb_type* db, const dname_type* dname, /* see if it is a SOA */ if(domain == zone->apex) { - if(type == TYPE_SOA) { - uint32_t soa_minimum; - zone->soa_rrset = rrset; - zone->updated = 1; - /* BUG #103 tweaked SOA ttl value */ - if(zone->soa_nx_rrset == 0) { - zone->soa_nx_rrset = region_alloc(db->region, - sizeof(rrset_type)); - if(!zone->soa_nx_rrset) { - log_msg(LOG_ERR, "out of memory, %s:%d", - __FILE__, __LINE__); - exit(1); - } - zone->soa_nx_rrset->rr_count = 1; - zone->soa_nx_rrset->next = 0; - zone->soa_nx_rrset->zone = zone; - zone->soa_nx_rrset->rrs = region_alloc(db->region, - sizeof(rr_type)); - if(!zone->soa_nx_rrset->rrs) { - log_msg(LOG_ERR, "out of memory, %s:%d", - __FILE__, __LINE__); - exit(1); - } - } - memcpy(zone->soa_nx_rrset->rrs, rrset->rrs, sizeof(rr_type)); - memcpy(&soa_minimum, rdata_atom_data(rrset->rrs->rdatas[6]), - rdata_atom_size(rrset->rrs->rdatas[6])); - if (rrset->rrs->ttl > ntohl(soa_minimum)) { - rrset->zone->soa_nx_rrset->rrs[0].ttl = ntohl(soa_minimum); - } - domain->has_SOA = 1; - } - if(type == TYPE_NS) { - zone->ns_rrset = rrset; - } - if(type == TYPE_RRSIG) { - int i; - for (i = 0; i < rrset->rr_count; ++i) { - if (rr_rrsig_type_covered(&rrset->rrs[i]) == TYPE_DNSKEY) { - zone->is_secure = 1; - break; - } - } + apex_rrset_checks(db, rrset, domain); +#ifdef NSEC3 + if(type == TYPE_NSEC3PARAM && zone->nsec3_param) { + /* the pointer just changed, fix it up to point + * to the same record */ + assert(zone->nsec3_param >= 
rrs_old && + zone->nsec3_param < rrs_old+rrset->rr_count); + /* in this order to make sure no overflow/underflow*/ + zone->nsec3_param = (void*)zone->nsec3_param - + (void*)rrs_old + (void*)rrset->rrs; } +#endif /* NSEC3 */ } -#ifdef NSEC3 -#ifndef FULL_PREHASH - if ((type == TYPE_NSEC3) && - (rrset->rr_count == 1)) { - /* NSEC3 RRset just added */ - if (0 != namedb_add_nsec3_domain(db, domain, zone)) - return 0; + /* write the just-normalized RR to the udb */ + if(!udb_write_rr(db->udb, udbz, &rrset->rrs[rrset->rr_count - 1])) { + log_msg(LOG_ERR, "could not add RR to nsd.db, disk-space?"); + return 0; } - if (is_axfr == 0) { - struct domain *parent = domain; - do { - if (0 != namedb_add_nsec3_mod_domain(db, parent)) - return 0; - parent = parent->parent; - } while (parent != zone->apex->parent); +#ifdef NSEC3 + if(rrset_added) { + domain_type* p = domain->parent; + nsec3_add_rrset_trigger(db, domain, zone, type); + /* go up and process (possibly created) empty nonterminals, + * until we hit the apex or root */ + while(p && p->rrsets == NULL && !p->is_apex) { + nsec3_rrsets_changed_add_prehash(db, p, zone); + p = p->parent; + } } -#else - (void)is_axfr; -#endif /* !FULL_PREHASH */ + nsec3_add_rr_trigger(db, &rrset->rrs[rrset->rr_count - 1], zone, udbz); #endif /* NSEC3 */ - return 1; } static zone_type* -find_zone(namedb_type* db, const dname_type* zone_name, nsd_options_t* opt, - size_t child_count) +find_or_create_zone(namedb_type* db, const dname_type* zone_name, + nsd_options_t* opt, const char* zstr, const char* patname) { - domain_type *domain; zone_type* zone; - zone_options_t* opts; - domain = domain_table_find(db->domains, zone_name); - if(!domain) { - DEBUG(DEBUG_XFRD,1, (LOG_INFO, "xfr: creating domain %s", - dname_to_string(zone_name,0))); - /* create the zone and domain of apex (zone has config options) */ - domain = domain_table_insert(db->domains, zone_name); - } else { - /* O(1) if SOA exists */ - zone = domain_find_zone(domain); - /* if domain was empty (no rrsets, empty zone) search in zonelist */ - /* check apex to make sure we don't find a parent zone */ - if(!zone || zone->apex != domain) - zone = namedb_find_zone(db, domain); - if(zone) { - assert(zone->apex == domain); - return zone; + zone_options_t* zopt; + zone = namedb_find_zone(db, zone_name); + if(zone) { + return zone; + } + zopt = zone_options_find(opt, zone_name); + if(!zopt) { + /* if _implicit_ then insert as _part_of_config */ + if(strncmp(patname, PATTERN_IMPLICIT_MARKER, + strlen(PATTERN_IMPLICIT_MARKER)) == 0) { + zopt = zone_options_create(opt->region); + if(!zopt) return 0; + zopt->part_of_config = 1; + zopt->name = region_strdup(opt->region, zstr); + zopt->pattern = pattern_options_find(opt, patname); + if(!zopt->name || !zopt->pattern) return 0; + if(!nsd_options_insert_zone(opt, zopt)) { + log_msg(LOG_ERR, "bad domain name or duplicate zone '%s' " + "pattern %s", zstr, patname); + } + } else { + /* create zone : presumably already added to zonelist + * by xfrd, who wrote the AXFR or IXFR to disk, so we only + * need to add it to our config. 
+ * This process does not need linesize and offset zonelist */ + zopt = zone_list_zone_insert(opt, zstr, patname, 0, 0); + if(!zopt) + return 0; } } - /* lookup in config */ - opts = zone_options_find(opt, domain_dname(domain)); - if(!opts) { - log_msg(LOG_ERR, "xfr: zone %s not in config.", - dname_to_string(zone_name,0)); - return 0; - } - /* create the zone */ - DEBUG(DEBUG_XFRD,1, (LOG_INFO, "xfr: creating zone_type %s", - dname_to_string(zone_name,0))); - zone = (zone_type *) region_alloc(db->region, sizeof(zone_type)); - if(!zone) { - log_msg(LOG_ERR, "out of memory, %s:%d", __FILE__, __LINE__); - exit(1); - } - zone->next = db->zones; - zone->opts = opts; - db->zones = zone; - db->zone_count++; - zone->apex = domain; - zone->soa_rrset = 0; - zone->soa_nx_rrset = 0; - zone->ns_rrset = 0; -#ifdef NSEC3 - zone->nsec3_soa_rr = NULL; - zone->nsec3_last = NULL; -#endif - zone->dirty = region_alloc(db->region, sizeof(uint8_t)*child_count); - if(!zone->dirty) { - log_msg(LOG_ERR, "out of memory, %s:%d", __FILE__, __LINE__); - exit(1); - } - memset(zone->dirty, 0, sizeof(uint8_t)*child_count); -#ifdef NSEC3 -#ifndef FULL_PREHASH - zone->nsec3_domains = NULL; - - if (0 != zone_nsec3_domains_create(db, zone)) { - log_msg(LOG_ERR, - "xfr: zone NSEC3 domains " - "memory allocation failure"); - return 0; - } -#endif /* !FULL_PREHASH */ -#endif /* NSEC3 */ - zone->number = db->zone_count; - zone->is_secure = 0; - zone->updated = 1; - zone->is_ok = 0; + zone = namedb_zone_create(db, zone_name, zopt); return zone; } -static void +void delete_zone_rrs(namedb_type* db, zone_type* zone) { rrset_type *rrset; - domain_type *domain = zone->apex; - domain_type *next = NULL; - zone->updated = 1; -#ifdef NSEC3 -#ifndef FULL_PREHASH - zone_nsec3_domains_destroy(db, zone); -#endif /* !FULL_PREHASH */ -#endif /* NSEC3 */ - + domain_type *domain = zone->apex, *next; /* go through entire tree below the zone apex (incl subzones) */ - while(domain && dname_is_subdomain( - domain_dname(domain), domain_dname(zone->apex))) + while(domain && domain_is_subdomain(domain, zone->apex)) { DEBUG(DEBUG_XFRD,2, (LOG_INFO, "delete zone visit %s", - dname_to_string(domain_dname(domain),0))); + domain_to_string(domain))); /* delete all rrsets of the zone */ while((rrset = domain_find_any_rrset(domain, zone))) { - (void)rrset_delete(db, domain, rrset); + /* lower usage can delete other domains */ + rrset_lower_usage(db, rrset); + /* rrset del does not delete our domain(yet) */ + rrset_delete(db, domain, rrset); } + /* the delete upcoming could delete parents, but nothing next + * or after the domain so store next ptr */ next = domain_next(domain); - domain->nextdiff = next; + /* see if the domain can be deleted (and inspect parents) */ + domain_table_deldomain(db, domain); domain = next; } -#ifdef NSEC3 -#ifndef FULL_PREHASH - if (0 != zone_nsec3_domains_create(db, zone)) { - log_msg(LOG_ERR, - "Zone %s: unable to create zone NSEC3 prehash table", - dname_to_string(domain_dname(zone->apex), - NULL)); - } -#endif /* !FULL_PREHASH */ -#endif /* NSEC3 */ DEBUG(DEBUG_XFRD, 1, (LOG_INFO, "axfrdel: recyclebin holds %lu bytes", (unsigned long) region_get_recycle_size(db->region))); @@ -760,34 +889,19 @@ delete_zone_rrs(namedb_type* db, zone_type* zone) #endif assert(zone->soa_rrset == 0); - /* keep zone->soa_nx_rrset alloced */ + /* keep zone->soa_nx_rrset alloced: it is reused */ assert(zone->ns_rrset == 0); assert(zone->is_secure == 0); - assert(zone->updated == 1); -} - -/* fix empty terminals */ -static void 
-fix_empty_terminals(zone_type* zone_db) -{ - domain_type* domain = zone_db->apex, *ce = NULL, *next = NULL; - while (domain) { - ce = rrset_delete_empty_terminals(domain, ce); - next = domain->nextdiff; - domain->nextdiff = NULL; - domain = next; - } } /* return value 0: syntaxerror,badIXFR, 1:OK, 2:done_and_skip_it */ static int -apply_ixfr(namedb_type* db, FILE *in, const off_t* startpos, - const char* zone, uint32_t serialno, nsd_options_t* opt, - uint16_t id, uint32_t seq_nr, uint32_t seq_total, +apply_ixfr(namedb_type* db, FILE *in, const char* zone, uint32_t serialno, + nsd_options_t* opt, uint32_t seq_nr, uint32_t seq_total, int* is_axfr, int* delete_mode, int* rr_count, - size_t child_count) + udb_ptr* udbz, struct zone** zone_res, const char* patname, int* bytes) { - uint32_t filelen, msglen, pkttype, timestamp[2]; + uint32_t msglen, checklen, pkttype; int qcount, ancount, counter; buffer_type* packet; region_type* region; @@ -795,36 +909,24 @@ apply_ixfr(namedb_type* db, FILE *in, const off_t* startpos, uint16_t rrlen; const dname_type *dname_zone, *dname; zone_type* zone_db; - domain_type* last_in_list; - char file_zone_name[3072]; - uint32_t file_serial, file_seq_nr; - uint16_t file_id; - off_t mempos; - - memmove(&mempos, startpos, sizeof(off_t)); - if(fseeko(in, mempos, SEEK_SET) == -1) { - log_msg(LOG_INFO, "could not fseeko: %s.", strerror(errno)); - return 0; - } - /* read ixfr packet RRs and apply to in memory db */ - if(!diff_read_32(in, &pkttype) || pkttype != DIFF_PART_IXFR) { + /* note that errors could not really happen due to format of the + * packet since xfrd has checked all dnames and RRs before commit, + * this is why the errors are fatal (exit process), it must be + * something internal or a bad disk or something. */ + + /* read ixfr packet RRs and apply to in memory db */ + if(!diff_read_32(in, &pkttype) || pkttype != DIFF_PART_XXFR) { log_msg(LOG_ERR, "could not read type or wrong type"); return 0; } - if(!diff_read_32(in, &timestamp[0]) || - !diff_read_32(in, &timestamp[1])) { - log_msg(LOG_ERR, "could not read timestamp"); - return 0; - } - if(!diff_read_32(in, &filelen)) { + if(!diff_read_32(in, &msglen)) { log_msg(LOG_ERR, "could not read len"); return 0; } - /* read header */ - if(filelen < QHEADERSZ + sizeof(uint32_t)*3 + sizeof(uint16_t)) { + if(msglen < QHEADERSZ) { log_msg(LOG_ERR, "msg too short"); return 0; } @@ -834,35 +936,7 @@ apply_ixfr(namedb_type* db, FILE *in, const off_t* startpos, log_msg(LOG_ERR, "out of memory"); return 0; } - - if(!diff_read_str(in, file_zone_name, sizeof(file_zone_name)) || - !diff_read_32(in, &file_serial) || - !diff_read_16(in, &file_id) || - !diff_read_32(in, &file_seq_nr)) - { - log_msg(LOG_ERR, "could not part data"); - region_destroy(region); - return 0; - } - - if(strcmp(file_zone_name, zone) != 0 || serialno != file_serial || - id != file_id || seq_nr != file_seq_nr) { - log_msg(LOG_ERR, "internal error: reading part with changed id"); - region_destroy(region); - return 0; - } - msglen = filelen - sizeof(uint32_t)*3 - sizeof(uint16_t) - - strlen(file_zone_name); packet = buffer_create(region, QIOBUFSZ); - dname_zone = dname_parse(region, zone); - zone_db = find_zone(db, dname_zone, opt, child_count); - if(!zone_db) { - log_msg(LOG_ERR, "no zone exists"); - region_destroy(region); - /* break out and stop the IXFR, ignore it */ - return 2; - } - if(msglen > QIOBUFSZ) { log_msg(LOG_ERR, "msg too long"); region_destroy(region); @@ -876,6 +950,23 @@ apply_ixfr(namedb_type* db, FILE *in, const off_t* startpos, }
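/* Illustrative summary, not part of the imported patch: the transfer file that apply_ixfr() consumes here is the one produced by diff_write_packet()/diff_write_commit() above. It starts with a DIFF_PART_XFRF header (a committed flag, the number of parts, start and end timestamps, the old and new SOA serials, and the zone and options-pattern names), followed by one record per packet of the form { uint32 DIFF_PART_XXFR; uint32 len; uint8 data[len]; uint32 len again as a check value; }, with the human-readable log string appended when the transfer is committed. */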
buffer_set_limit(packet, msglen); + /* see if check on data fails: checks that we are not reading + * random garbage */ + if(!diff_read_32(in, &checklen) || checklen != msglen) { + log_msg(LOG_ERR, "transfer part has incorrect checkvalue"); + return 0; + } + *bytes += msglen; + + dname_zone = dname_parse(region, zone); + zone_db = find_or_create_zone(db, dname_zone, opt, zone, patname); + if(!zone_db) { + log_msg(LOG_ERR, "could not create zone %s %s", zone, patname); + region_destroy(region); + return 0; + } + *zone_res = zone_db; + /* only answer section is really used, question, additional and authority section RRs are skipped */ qcount = QDCOUNT(packet); @@ -931,8 +1022,8 @@ apply_ixfr(namedb_type* db, FILE *in, const off_t* startpos, } if(buffer_read_u32(packet) != serialno) { buffer_skip(packet, -4); - log_msg(LOG_ERR, "SOA serial %d different from commit %d", - buffer_read_u32(packet), serialno); + log_msg(LOG_ERR, "SOA serial %u different from commit %u", + (unsigned)buffer_read_u32(packet), (unsigned)serialno); region_destroy(region); return 0; } @@ -941,13 +1032,11 @@ apply_ixfr(namedb_type* db, FILE *in, const off_t* startpos, *rr_count = 1; *is_axfr = 0; *delete_mode = 0; - DEBUG(DEBUG_XFRD,2, (LOG_INFO, "diff: %s start count %d, ax %d, delmode %d", dname_to_string(dname_zone, 0), *rr_count, *is_axfr, *delete_mode)); } else counter = 0; - last_in_list = zone_db->apex; for(; counter < ancount; ++counter,++(*rr_count)) { uint16_t type, klass; @@ -978,7 +1067,15 @@ apply_ixfr(namedb_type* db, FILE *in, const off_t* startpos, if(*rr_count == 1 && type != TYPE_SOA) { /* second RR: if not SOA: this is an AXFR; delete all zone contents */ +#ifdef NSEC3 + nsec3_hash_tree_clear(zone_db); +#endif delete_zone_rrs(db, zone_db); + udb_zone_clear(db->udb, udbz); +#ifdef NSEC3 + nsec3_clear_precompile(db, zone_db); + zone_db->nsec3_param = NULL; +#endif /* NSEC3 */ /* add everything else (incl end SOA) */ *delete_mode = 0; *is_axfr = 1; @@ -1000,7 +1097,15 @@ apply_ixfr(namedb_type* db, FILE *in, const off_t* startpos, thisserial = buffer_read_u32(packet); if(thisserial == serialno) { /* AXFR */ +#ifdef NSEC3 + nsec3_hash_tree_clear(zone_db); +#endif delete_zone_rrs(db, zone_db); + udb_zone_clear(db->udb, udbz); +#ifdef NSEC3 + nsec3_clear_precompile(db, zone_db); + zone_db->nsec3_param = NULL; +#endif /* NSEC3 */ *delete_mode = 0; *is_axfr = 1; } @@ -1040,26 +1145,22 @@ apply_ixfr(namedb_type* db, FILE *in, const off_t* startpos, && seq_nr == seq_total-1) { continue; /* do not delete final SOA RR for IXFR */ } - if(!delete_RR(db, dname, type, klass, last_in_list, packet, - rrlen, zone_db, region, *is_axfr)) { + if(!delete_RR(db, dname, type, klass, packet, + rrlen, zone_db, region, udbz)) { region_destroy(region); return 0; } - if (!*is_axfr && last_in_list->nextdiff) { - last_in_list = last_in_list->nextdiff; - } } else { /* add this rr */ if(!add_RR(db, dname, type, klass, ttl, packet, - rrlen, zone_db, *is_axfr)) { + rrlen, zone_db, udbz)) { region_destroy(region); return 0; } } } - fix_empty_terminals(zone_db); region_destroy(region); return 1; } @@ -1089,601 +1190,764 @@ check_for_bad_serial(namedb_type* db, const char* zone_str, uint32_t old_serial) return 0; } -/* for multiple tcp packets use a data structure that has - * a rbtree (zone_names) with for each zone: - * has a rbtree by sequence number - * with inside a serial_number and ID (for checking only) - * and contains a off_t to the IXFR packet in the file. 
- * so when you get a commit for a zone, get zone obj, find sequence, - * then check if you have all sequence numbers available. Apply all packets. - */ -struct diff_read_data { - /* rbtree of struct diff_zone*/ - rbtree_t* zones; - /* region for allocation */ - region_type* region; -}; -struct diff_zone { - /* key is dname of zone */ - rbnode_t node; - /* rbtree of struct diff_xfrpart */ - rbtree_t* parts; -}; -struct diff_xfrpart { - /* key is sequence number */ - rbnode_t node; - uint32_t seq_nr; - uint32_t new_serial; - uint16_t id; - off_t file_pos; -}; - -static struct diff_read_data* -diff_read_data_create() -{ - region_type* region = region_create(xalloc, free); - struct diff_read_data* data = (struct diff_read_data*) - region_alloc(region, sizeof(struct diff_read_data)); - if(!data) { - log_msg(LOG_ERR, "out of memory, %s:%d", __FILE__, __LINE__); - exit(1); - } - data->region = region; - data->zones = rbtree_create(region, - (int (*)(const void *, const void *)) dname_compare); - return data; -} - -static struct diff_zone* -diff_read_find_zone(struct diff_read_data* data, const char* name) -{ - const dname_type* dname = dname_parse(data->region, name); - struct diff_zone* zp = (struct diff_zone*) - rbtree_search(data->zones, dname); - return zp; -} - -static int intcompf(const void* a, const void* b) -{ - if(*(uint32_t*)a < *(uint32_t*)b) - return -1; - if(*(uint32_t*)a > *(uint32_t*)b) - return +1; - return 0; -} - -static struct diff_zone* -diff_read_insert_zone(struct diff_read_data* data, const char* name) -{ - const dname_type* dname = dname_parse(data->region, name); - struct diff_zone* zp = region_alloc(data->region, - sizeof(struct diff_zone)); - if(!zp) { - log_msg(LOG_ERR, "out of memory, %s:%d", __FILE__, __LINE__); - exit(1); - } - zp->node = *RBTREE_NULL; - zp->node.key = dname; - zp->parts = rbtree_create(data->region, intcompf); - rbtree_insert(data->zones, (rbnode_t*)zp); - return zp; -} - -static struct diff_xfrpart* -diff_read_find_part(struct diff_zone* zp, uint32_t seq_nr) -{ - struct diff_xfrpart* xp = (struct diff_xfrpart*) - rbtree_search(zp->parts, &seq_nr); - return xp; -} - -static struct diff_xfrpart* -diff_read_insert_part(struct diff_read_data* data, - struct diff_zone* zp, uint32_t seq_nr) -{ - struct diff_xfrpart* xp = region_alloc(data->region, - sizeof(struct diff_xfrpart)); - if(!xp) { - log_msg(LOG_ERR, "out of memory, %s:%d", __FILE__, __LINE__); - exit(1); - } - xp->node = *RBTREE_NULL; - xp->node.key = &xp->seq_nr; - xp->seq_nr = seq_nr; - rbtree_insert(zp->parts, (rbnode_t*)xp); - return xp; -} - -/* mark commit as rollback and close inputfile, fatal exits */ -static void -mark_and_exit(nsd_options_t* opt, FILE* f, off_t commitpos, const char* desc) -{ - const char* filename = opt->difffile; - fclose(f); - if(!(f = fopen(filename, "r+"))) { - log_msg(LOG_ERR, "mark xfr, failed to re-open difffile %s: %s", - filename, strerror(errno)); - } else if(fseeko(f, commitpos, SEEK_SET) == -1) { - log_msg(LOG_INFO, "could not fseeko: %s.", strerror(errno)); - fclose(f); - } else { - uint8_t c = 0; - (void)write_data(f, &c, sizeof(c)); - fclose(f); - log_msg(LOG_ERR, "marked xfr as failed: %s", desc); - log_msg(LOG_ERR, "marked xfr so that next reload can succeed"); - } - exit(1); -} - static int -read_sure_part(namedb_type* db, FILE *in, nsd_options_t* opt, - struct diff_read_data* data, struct diff_log** log, - size_t child_count) +apply_ixfr_for_zone(nsd_type* nsd, zone_type* zonedb, FILE* in, + nsd_options_t* opt, udb_base* taskudb, udb_ptr* 
last_task, + uint32_t xfrfilenr) { char zone_buf[3072]; char log_buf[5120]; - uint32_t old_serial, new_serial, num_parts; - uint16_t id; + char patname_buf[2048]; + + uint32_t old_serial, new_serial, num_parts, type; + uint64_t time_end_0, time_start_0; + uint32_t time_end_1, time_start_1; uint8_t committed; - struct diff_zone *zp; uint32_t i; - int have_all_parts = 1; - struct diff_log* thislog = 0; - off_t commitpos; + int num_bytes = 0; /* read zone name and serial */ - if(!diff_read_str(in, zone_buf, sizeof(zone_buf)) || - !diff_read_32(in, &old_serial) || - !diff_read_32(in, &new_serial) || - !diff_read_16(in, &id) || - !diff_read_32(in, &num_parts)) { - log_msg(LOG_ERR, "diff file bad commit part"); + if(!diff_read_32(in, &type)) { + log_msg(LOG_ERR, "diff file too short"); return 0; } - commitpos = ftello(in); /* position of commit byte */ - if(commitpos == -1) { - log_msg(LOG_INFO, "could not ftello: %s.", strerror(errno)); + if(type != DIFF_PART_XFRF) { + log_msg(LOG_ERR, "xfr file has wrong format"); return 0; + } + /* committed and num_parts are first because they need to be + * updated once the rest is written. The log buf is not certain + * until its done, so at end of file. The patname is in case a + * new zone is created, we know what the options-pattern is */ if(!diff_read_8(in, &committed) || - !diff_read_str(in, log_buf, sizeof(log_buf)) ) - { + !diff_read_32(in, &num_parts) || + !diff_read_64(in, &time_end_0) || + !diff_read_32(in, &time_end_1) || + !diff_read_32(in, &old_serial) || + !diff_read_32(in, &new_serial) || + !diff_read_64(in, &time_start_0) || + !diff_read_32(in, &time_start_1) || + !diff_read_str(in, zone_buf, sizeof(zone_buf)) || + !diff_read_str(in, patname_buf, sizeof(patname_buf))) { log_msg(LOG_ERR, "diff file bad commit part"); return 0; } - if(log) { - thislog = (struct diff_log*)region_alloc(db->region, sizeof(struct diff_log)); - if(!thislog) { - log_msg(LOG_ERR, "out of memory, %s:%d", __FILE__, __LINE__); - exit(1); - } - thislog->zone_name = region_strdup(db->region, zone_buf); - thislog->comment = region_strdup(db->region, log_buf); - thislog->error = 0; - thislog->next = *log; - *log = thislog; - } - /* has been read in completely */ - zp = diff_read_find_zone(data, zone_buf); - if(!zp) { - log_msg(LOG_ERR, "diff file commit without IXFR"); - if(thislog) - thislog->error = "error no IXFR parts"; - return 1; + if(strcmp(zone_buf, dname_to_string(zonedb->apex->dname,0)) != 0) { + log_msg(LOG_ERR, "file %s does not match task %s", + zone_buf, dname_to_string(zonedb->apex->dname,0)); + return 0; } - if(committed && check_for_bad_serial(db, zone_buf, old_serial)) { - DEBUG(DEBUG_XFRD,1, (LOG_ERR, - "skipping diff file commit with bad serial")); - zp->parts->root = RBTREE_NULL; - zp->parts->count = 0; - if(thislog) - thislog->error = "error bad serial"; - return 1; + if(!committed) { + log_msg(LOG_ERR, "diff file %s was not committed", zone_buf); + return 0; } - for(i=0; i<num_parts; i++) { - struct diff_xfrpart *xp = diff_read_find_part(zp, i); - if(!xp || xp->id != id || xp->new_serial != new_serial) { - have_all_parts = 0; - } + if(num_parts == 0) { + log_msg(LOG_ERR, "diff file %s was not completed", zone_buf); + return 0; } - if(!have_all_parts) { + if(check_for_bad_serial(nsd->db, zone_buf, old_serial)) { DEBUG(DEBUG_XFRD,1, (LOG_ERR, - "skipping diff file commit without all parts")); - if(thislog) - thislog->error = "error missing parts"; + "skipping diff file commit with bad serial")); + return 1; } - if(committed && have_all_parts) + 
if(committed) { int is_axfr=0, delete_mode=0, rr_count=0; - off_t resume_pos; + const dname_type* apex = zonedb->apex->dname; + udb_ptr z; -#ifdef NSEC3 -#ifndef FULL_PREHASH - struct region *region; - dname_type const *zone_dname; - struct zone *zone; - - region = region_create(xalloc, free); - if (region == NULL) { - log_msg(LOG_ERR, "out of memory"); - return 0; - } - zone_dname = dname_parse(region, zone_buf); - if (zone_dname == NULL) { - log_msg(LOG_ERR, "out of memory"); - region_destroy(region); - return 0; - } - zone = find_zone(db, zone_dname, opt, child_count); - region_destroy(region); - if (zone == NULL) { - log_msg(LOG_ERR, "no zone exists"); - /* just stop trying applying ixfr */ - return 1; - } - if (0 != namedb_nsec3_mod_domains_create(db)) { - log_msg(LOG_ERR, - "unable to allocate space " - "for modified NSEC3 domains"); - return 0; + DEBUG(DEBUG_XFRD,1, (LOG_INFO, "processing xfr: %s", zone_buf)); + if(udb_base_get_userflags(nsd->db->udb) != 0) { + log_msg(LOG_ERR, "database corrupted, cannot update"); + xfrd_unlink_xfrfile(nsd, xfrfilenr); + exit(1); } -#endif /* !FULL_PREHASH */ -#endif /* NSEC3 */ - - DEBUG(DEBUG_XFRD,1, (LOG_INFO, "processing xfr: %s", log_buf)); - - resume_pos = ftello(in); - if(resume_pos == -1) { - log_msg(LOG_INFO, "could not ftello: %s.", strerror(errno)); - return 0; + /* all parts were checked by xfrd before commit */ + if(!udb_zone_search(nsd->db->udb, &z, dname_name(apex), + apex->name_size)) { + /* create it */ + if(!udb_zone_create(nsd->db->udb, &z, dname_name(apex), + apex->name_size)) { + /* out of disk space perhaps */ + log_msg(LOG_ERR, "could not udb_create_zone " + "%s, disk space full?", log_buf); + return 0; + } } + /* set the udb dirty until we are finished applying changes */ + udb_base_set_userflags(nsd->db->udb, 1); + /* read and apply all of the parts */ for(i=0; i<num_parts; i++) { - struct diff_xfrpart *xp = diff_read_find_part(zp, i); int ret; DEBUG(DEBUG_XFRD,2, (LOG_INFO, "processing xfr: apply part %d", (int)i)); - ret = apply_ixfr(db, in, &xp->file_pos, zone_buf, new_serial, opt, - id, xp->seq_nr, num_parts, &is_axfr, &delete_mode, - &rr_count, child_count); + ret = apply_ixfr(nsd->db, in, zone_buf, new_serial, opt, + i, num_parts, &is_axfr, &delete_mode, + &rr_count, &z, &zonedb, patname_buf, &num_bytes); if(ret == 0) { - log_msg(LOG_ERR, "bad ixfr packet part %d in %s", (int)i, - opt->difffile); - mark_and_exit(opt, in, commitpos, log_buf); + log_msg(LOG_ERR, "bad ixfr packet part %d in diff file for %s", (int)i, zone_buf); + xfrd_unlink_xfrfile(nsd, xfrfilenr); + /* the udb is still dirty, it is bad */ + exit(1); } else if(ret == 2) { break; } } + udb_base_set_userflags(nsd->db->udb, 0); + /* read the final log_str: but do not fail on it */ + if(!diff_read_str(in, log_buf, sizeof(log_buf))) { + log_msg(LOG_ERR, "could not read log for transfer %s", + zone_buf); + snprintf(log_buf, sizeof(log_buf), "error reading log"); + } #ifdef NSEC3 -#ifndef FULL_PREHASH - if (is_axfr != 0) - prehash_zone(db, zone); - else - prehash_zone_incremental(db, zone); -#endif /* !FULL_PREHASH */ + if(zonedb) prehash_zone(nsd->db, zonedb); #endif /* NSEC3 */ - - if(fseeko(in, resume_pos, SEEK_SET) == -1) { - log_msg(LOG_INFO, "could not fseeko: %s.", strerror(errno)); - return 0; + zonedb->is_changed = 1; + ZONE(&z)->is_changed = 1; + ZONE(&z)->mtime = time_end_0; + udb_zone_set_log_str(nsd->db->udb, &z, log_buf); + udb_ptr_unlink(&z, nsd->db->udb); + if(taskudb) task_new_soainfo(taskudb, last_task, zonedb); + + if(1 <= verbosity) { + 
double elapsed = (double)(time_end_0 - time_start_0)+ + (double)((double)time_end_1 + -(double)time_start_1) / 1000000.0; + VERBOSITY(2, (LOG_INFO, "zone %s %s of %d bytes in %g seconds", + zone_buf, log_buf, num_bytes, elapsed)); } } else { DEBUG(DEBUG_XFRD,1, (LOG_INFO, "skipping xfr: %s", log_buf)); } - - /* clean out the parts for the zone after the commit/rollback */ - zp->parts->root = RBTREE_NULL; - zp->parts->count = 0; return 1; } +struct udb_base* task_file_create(const char* file) +{ + return udb_base_create_new(file, &namedb_walkfunc, NULL); +} + static int -store_ixfr_data(FILE *in, uint32_t len, struct diff_read_data* data, off_t* startpos) +task_create_new_elem(struct udb_base* udb, udb_ptr* last, udb_ptr* e, + size_t sz, const dname_type* zname) { - char zone_name[3072]; - struct diff_zone* zp; - struct diff_xfrpart* xp; - uint32_t new_serial, seq; - uint16_t id; - if(!diff_read_str(in, zone_name, sizeof(zone_name)) || - !diff_read_32(in, &new_serial) || - !diff_read_16(in, &id) || - !diff_read_32(in, &seq)) { - log_msg(LOG_INFO, "could not read ixfr store info: file format error"); + if(!udb_ptr_alloc_space(e, udb, udb_chunk_type_task, sz)) { return 0; } - len -= sizeof(uint32_t)*3 + sizeof(uint16_t) + strlen(zone_name); - if(fseeko(in, len, SEEK_CUR) == -1) - log_msg(LOG_INFO, "fseek failed: %s", strerror(errno)); - /* store the info */ - zp = diff_read_find_zone(data, zone_name); - if(!zp) - zp = diff_read_insert_zone(data, zone_name); - xp = diff_read_find_part(zp, seq); - if(xp) { - log_msg(LOG_INFO, "discarding partial xfr part: %s %d", zone_name, seq); - /* overwrite with newer value (which probably relates to next commit) */ + if(udb_ptr_is_null(last)) { + udb_base_set_userdata(udb, e->data); + } else { + udb_rptr_set_ptr(&TASKLIST(last)->next, udb, e); } - else { - xp = diff_read_insert_part(data, zp, seq); + udb_ptr_set_ptr(last, udb, e); + + /* fill in tasklist item */ + udb_rel_ptr_init(&TASKLIST(e)->next); + TASKLIST(e)->size = sz; + TASKLIST(e)->oldserial = 0; + TASKLIST(e)->newserial = 0; + TASKLIST(e)->yesno = 0; + + if(zname) { + memmove(TASKLIST(e)->zname, zname, dname_total_size(zname)); } - xp->new_serial = new_serial; - xp->id = id; - memmove(&xp->file_pos, startpos, sizeof(off_t)); return 1; } -static int -read_process_part(namedb_type* db, FILE *in, uint32_t type, - nsd_options_t* opt, struct diff_read_data* data, - struct diff_log** log, size_t child_count, off_t* startpos) +void task_new_soainfo(struct udb_base* udb, udb_ptr* last, struct zone* z) { - uint32_t len, len2; + /* calculate size */ + udb_ptr e; + size_t sz; + const dname_type* apex, *ns, *em; + if(!z || !z->apex || !domain_dname(z->apex)) + return; /* safety check */ + + DEBUG(DEBUG_IPC,1, (LOG_INFO, "nsd: add soa info for zone %s", + domain_to_string(z->apex))); + apex = domain_dname(z->apex); + sz = sizeof(struct task_list_d) + dname_total_size(apex); + if(z->soa_rrset) { + ns = domain_dname(rdata_atom_domain( + z->soa_rrset->rrs[0].rdatas[0])); + em = domain_dname(rdata_atom_domain( + z->soa_rrset->rrs[0].rdatas[1])); + sz += sizeof(uint32_t)*6 + sizeof(uint8_t)*2 + + ns->name_size + em->name_size; + } else { + ns = 0; + em = 0; + } - /* read length */ - if(!diff_read_32(in, &len)) - return 1; - /* read content */ - if(type == DIFF_PART_IXFR) { - DEBUG(DEBUG_XFRD,2, (LOG_INFO, "part IXFR len %d", len)); - if(!store_ixfr_data(in, len, data, startpos)) - return 0; + /* create new task_list item */ + if(!task_create_new_elem(udb, last, &e, sz, apex)) { + log_msg(LOG_ERR, "tasklist: out 
of space, cannot add SOAINFO"); + return; } - else if(type == DIFF_PART_SURE) { - DEBUG(DEBUG_XFRD,2, (LOG_INFO, "part SURE len %d", len)); - if(!read_sure_part(db, in, opt, data, log, child_count)) - return 0; - } else { - DEBUG(DEBUG_XFRD,1, (LOG_INFO, "unknown part %x len %d", type, len)); - return 0; + TASKLIST(&e)->task_type = task_soa_info; + + if(z->soa_rrset) { + uint32_t ttl = htonl(z->soa_rrset->rrs[0].ttl); + uint8_t* p = (uint8_t*)TASKLIST(&e)->zname; + p += dname_total_size(apex); + memmove(p, &ttl, sizeof(uint32_t)); + p += sizeof(uint32_t); + memmove(p, &ns->name_size, sizeof(uint8_t)); + p += sizeof(uint8_t); + memmove(p, dname_name(ns), ns->name_size); + p += ns->name_size; + memmove(p, &em->name_size, sizeof(uint8_t)); + p += sizeof(uint8_t); + memmove(p, dname_name(em), em->name_size); + p += em->name_size; + memmove(p, rdata_atom_data(z->soa_rrset->rrs[0].rdatas[2]), + sizeof(uint32_t)); + p += sizeof(uint32_t); + memmove(p, rdata_atom_data(z->soa_rrset->rrs[0].rdatas[3]), + sizeof(uint32_t)); + p += sizeof(uint32_t); + memmove(p, rdata_atom_data(z->soa_rrset->rrs[0].rdatas[4]), + sizeof(uint32_t)); + p += sizeof(uint32_t); + memmove(p, rdata_atom_data(z->soa_rrset->rrs[0].rdatas[5]), + sizeof(uint32_t)); + p += sizeof(uint32_t); + memmove(p, rdata_atom_data(z->soa_rrset->rrs[0].rdatas[6]), + sizeof(uint32_t)); } - /* read length */ - if(!diff_read_32(in, &len2)) - return 1; /* short read is OK */ - /* verify length */ - if(len != len2) - return 0; /* bad data is wrong */ - return 1; + udb_ptr_unlink(&e, udb); } -/* - * Finds smallest offset in data structs - * returns 0 if no offsets in the data structs. - */ -static int -find_smallest_offset(struct diff_read_data* data, off_t* offset) +void task_process_sync(struct udb_base* taskudb) { - int found_any = 0; - struct diff_zone* dz; - struct diff_xfrpart* dx; - off_t mem_offset, mem_fpos; + /* need to sync before other process uses the mmap? 
*/ + DEBUG(DEBUG_IPC,1, (LOG_INFO, "task procsync %s size %d", + taskudb->fname, (int)taskudb->base_size)); + (void)taskudb; +} - if(!data || !data->zones) - return 0; - RBTREE_FOR(dz, struct diff_zone*, data->zones) - { - if(!dz->parts) - continue; - RBTREE_FOR(dx, struct diff_xfrpart*, dz->parts) - { - memmove(&mem_fpos, &dx->file_pos, sizeof(off_t)); +void task_remap(struct udb_base* taskudb) +{ + DEBUG(DEBUG_IPC,1, (LOG_INFO, "task remap %s size %d", + taskudb->fname, (int)taskudb->glob_data->fsize)); + udb_base_remap_process(taskudb); +} - if(found_any) { - memmove(&mem_offset, offset, sizeof(off_t)); +void task_clear(struct udb_base* taskudb) +{ + udb_ptr t, n; + udb_ptr_new(&t, taskudb, udb_base_get_userdata(taskudb)); + udb_base_set_userdata(taskudb, 0); + udb_ptr_init(&n, taskudb); + while(!udb_ptr_is_null(&t)) { + udb_ptr_set_rptr(&n, taskudb, &TASKLIST(&t)->next); + udb_rptr_zero(&TASKLIST(&t)->next, taskudb); + udb_ptr_free_space(&t, taskudb, TASKLIST(&t)->size); + udb_ptr_set_ptr(&t, taskudb, &n); + } + udb_ptr_unlink(&t, taskudb); + udb_ptr_unlink(&n, taskudb); +} - if(mem_fpos < mem_offset) - memmove(offset, &mem_fpos, sizeof(off_t)); - } else { - found_any = 1; - memmove(offset, &mem_fpos, sizeof(off_t)); - } - } +void task_new_expire(struct udb_base* udb, udb_ptr* last, + const struct dname* z, int expired) +{ + udb_ptr e; + if(!z) return; + DEBUG(DEBUG_IPC,1, (LOG_INFO, "add expire info for zone %s", + dname_to_string(z,NULL))); + if(!task_create_new_elem(udb, last, &e, sizeof(struct task_list_d)+ + dname_total_size(z), z)) { + log_msg(LOG_ERR, "tasklist: out of space, cannot add expire"); + return; } + TASKLIST(&e)->task_type = task_expire; + TASKLIST(&e)->yesno = expired; + udb_ptr_unlink(&e, udb); +} - return found_any; +void task_new_check_zonefiles(udb_base* udb, udb_ptr* last, + const dname_type* zone) +{ + udb_ptr e; + DEBUG(DEBUG_IPC,1, (LOG_INFO, "add task checkzonefiles")); + if(!task_create_new_elem(udb, last, &e, sizeof(struct task_list_d) + + (zone?dname_total_size(zone):0), zone)) { + log_msg(LOG_ERR, "tasklist: out of space, cannot add check_zones"); + return; + } + TASKLIST(&e)->task_type = task_check_zonefiles; + TASKLIST(&e)->yesno = (zone!=NULL); + udb_ptr_unlink(&e, udb); } -int -diff_read_file(namedb_type* db, nsd_options_t* opt, struct diff_log** log, +void task_new_write_zonefiles(udb_base* udb, udb_ptr* last, + const dname_type* zone) +{ + udb_ptr e; + DEBUG(DEBUG_IPC,1, (LOG_INFO, "add task writezonefiles")); + if(!task_create_new_elem(udb, last, &e, sizeof(struct task_list_d) + + (zone?dname_total_size(zone):0), zone)) { + log_msg(LOG_ERR, "tasklist: out of space, cannot add writezones"); + return; + } + TASKLIST(&e)->task_type = task_write_zonefiles; + TASKLIST(&e)->yesno = (zone!=NULL); + udb_ptr_unlink(&e, udb); +} + +void task_new_set_verbosity(udb_base* udb, udb_ptr* last, int v) +{ + udb_ptr e; + DEBUG(DEBUG_IPC,1, (LOG_INFO, "add task set_verbosity")); + if(!task_create_new_elem(udb, last, &e, sizeof(struct task_list_d), + NULL)) { + log_msg(LOG_ERR, "tasklist: out of space, cannot add set_v"); + return; + } + TASKLIST(&e)->task_type = task_set_verbosity; + TASKLIST(&e)->yesno = v; + udb_ptr_unlink(&e, udb); +} + +#ifdef BIND8_STATS +void* task_new_stat_info(udb_base* udb, udb_ptr* last, struct nsdst* stat, size_t child_count) { - const char* filename = opt->difffile; - FILE *df; - uint32_t type, timestamp[2], curr_timestamp[2]; - struct diff_read_data* data = diff_read_data_create(); - off_t startpos; + void* p; + udb_ptr e; + 
DEBUG(DEBUG_IPC,1, (LOG_INFO, "add task stat_info")); + if(!task_create_new_elem(udb, last, &e, sizeof(struct task_list_d)+ + sizeof(*stat) + sizeof(stc_t)*child_count, NULL)) { + log_msg(LOG_ERR, "tasklist: out of space, cannot add stati"); + return NULL; + } + TASKLIST(&e)->task_type = task_stat_info; + p = TASKLIST(&e)->zname; + memcpy(p, stat, sizeof(*stat)); + udb_ptr_unlink(&e, udb); + return p + sizeof(*stat); +} +#endif /* BIND8_STATS */ - df = fopen(filename, "r"); - if(!df) { - DEBUG(DEBUG_XFRD,1, (LOG_INFO, "could not open file %s for reading: %s", - filename, strerror(errno))); - region_destroy(data->region); - return 1; +void +task_new_add_zone(udb_base* udb, udb_ptr* last, const char* zone, + const char* pattern) +{ + size_t zlen = strlen(zone); + size_t plen = strlen(pattern); + void *p; + udb_ptr e; + DEBUG(DEBUG_IPC,1, (LOG_INFO, "add task addzone %s %s", zone, pattern)); + if(!task_create_new_elem(udb, last, &e, sizeof(struct task_list_d)+ + zlen + 1 + plen + 1, NULL)) { + log_msg(LOG_ERR, "tasklist: out of space, cannot add addz"); + return; } + TASKLIST(&e)->task_type = task_add_zone; + p = TASKLIST(&e)->zname; + memcpy(p, zone, zlen+1); + memmove(p+zlen+1, pattern, plen+1); + udb_ptr_unlink(&e, udb); +} - /* check timestamp */ - curr_timestamp[0] = (uint32_t) db->diff_timestamp.tv_sec; - curr_timestamp[1] = (uint32_t) db->diff_timestamp.tv_usec; +void +task_new_del_zone(udb_base* udb, udb_ptr* last, const dname_type* dname) +{ + udb_ptr e; + DEBUG(DEBUG_IPC,1, (LOG_INFO, "add task delzone %s", dname_to_string(dname, 0))); + if(!task_create_new_elem(udb, last, &e, sizeof(struct task_list_d) + +dname_total_size(dname), dname)) { + log_msg(LOG_ERR, "tasklist: out of space, cannot add delz"); + return; + } + TASKLIST(&e)->task_type = task_del_zone; + udb_ptr_unlink(&e, udb); +} - if(!diff_read_32(df, &type)) { - DEBUG(DEBUG_XFRD,1, (LOG_INFO, "difffile %s is empty", - filename)); - db->diff_skip = 0; - db->diff_pos = 0; +void task_new_add_key(udb_base* udb, udb_ptr* last, key_options_t* key) +{ + char* p; + udb_ptr e; + assert(key->name && key->algorithm && key->secret); + DEBUG(DEBUG_IPC,1, (LOG_INFO, "add task addkey")); + if(!task_create_new_elem(udb, last, &e, sizeof(struct task_list_d) + +strlen(key->name)+1+strlen(key->algorithm)+1+ + strlen(key->secret)+1, NULL)) { + log_msg(LOG_ERR, "tasklist: out of space, cannot add addk"); + return; } - else if (!diff_read_32(df, &timestamp[0]) || - !diff_read_32(df, &timestamp[1])) { - log_msg(LOG_ERR, "difffile %s bad first part: no timestamp", - filename); - region_destroy(data->region); - fclose(df); - return 0; + TASKLIST(&e)->task_type = task_add_key; + p = (char*)TASKLIST(&e)->zname; + memmove(p, key->name, strlen(key->name)+1); + p+=strlen(key->name)+1; + memmove(p, key->algorithm, strlen(key->algorithm)+1); + p+=strlen(key->algorithm)+1; + memmove(p, key->secret, strlen(key->secret)+1); + udb_ptr_unlink(&e, udb); +} + +void task_new_del_key(udb_base* udb, udb_ptr* last, const char* name) +{ + char* p; + udb_ptr e; + DEBUG(DEBUG_IPC,1, (LOG_INFO, "add task delkey")); + if(!task_create_new_elem(udb, last, &e, sizeof(struct task_list_d) + +strlen(name)+1, NULL)) { + log_msg(LOG_ERR, "tasklist: out of space, cannot add delk"); + return; } - else if (curr_timestamp[0] != timestamp[0] || - curr_timestamp[1] != timestamp[1]) { - /* new timestamp, no skipping */ - db->diff_timestamp.tv_sec = (time_t) timestamp[0]; - db->diff_timestamp.tv_usec = (suseconds_t) timestamp[1]; - - if (db->diff_skip) { - DEBUG(DEBUG_XFRD,1, (LOG_INFO, "new
timestamp on " - "difffile %s, restoring diff_skip and diff_pos " - "[old timestamp: %u.%u; new timestamp: %u.%u]", - filename, curr_timestamp[0], curr_timestamp[1], - timestamp[0], timestamp[1])); - db->diff_skip = 0; - db->diff_pos = 0; - } + TASKLIST(&e)->task_type = task_del_key; + p = (char*)TASKLIST(&e)->zname; + memmove(p, name, strlen(name)+1); + udb_ptr_unlink(&e, udb); +} + +void task_new_add_pattern(udb_base* udb, udb_ptr* last, pattern_options_t* p) +{ + region_type* temp; + buffer_type* buffer; + udb_ptr e; + DEBUG(DEBUG_IPC,1, (LOG_INFO, "add task addpattern %s", p->pname)); + temp = region_create(xalloc, free); + buffer = buffer_create(temp, 4096); + pattern_options_marshal(buffer, p); + buffer_flip(buffer); + if(!task_create_new_elem(udb, last, &e, sizeof(struct task_list_d) + + buffer_limit(buffer), NULL)) { + log_msg(LOG_ERR, "tasklist: out of space, cannot add addp"); + region_destroy(temp); + return; } + TASKLIST(&e)->task_type = task_add_pattern; + TASKLIST(&e)->yesno = buffer_limit(buffer); + memmove(TASKLIST(&e)->zname, buffer_begin(buffer), + buffer_limit(buffer)); + udb_ptr_unlink(&e, udb); + region_destroy(temp); +} - /* Always seek, to diff_pos or to beginning of the file. */ - if (fseeko(df, 0, SEEK_SET)==-1) { - log_msg(LOG_INFO, "could not fseeko file %s: %s.", filename, - strerror(errno)); - region_destroy(data->region); - fclose(df); - return 0; +void task_new_del_pattern(udb_base* udb, udb_ptr* last, const char* name) +{ + char* p; + udb_ptr e; + DEBUG(DEBUG_IPC,1, (LOG_INFO, "add task delpattern %s", name)); + if(!task_create_new_elem(udb, last, &e, sizeof(struct task_list_d) + +strlen(name)+1, NULL)) { + log_msg(LOG_ERR, "tasklist: out of space, cannot add delp"); + return; } - if(db->diff_skip) { - DEBUG(DEBUG_XFRD,1, (LOG_INFO, "skip diff file")); - if(fseeko(df, db->diff_pos, SEEK_SET)==-1) { - log_msg(LOG_INFO, "could not fseeko file %s: %s. 
" - "Reread from start.", filename, - strerror(errno)); - } + TASKLIST(&e)->task_type = task_del_pattern; + p = (char*)TASKLIST(&e)->zname; + memmove(p, name, strlen(name)+1); + udb_ptr_unlink(&e, udb); +} + +void task_new_opt_change(udb_base* udb, udb_ptr* last, nsd_options_t* opt) +{ + udb_ptr e; + DEBUG(DEBUG_IPC,1, (LOG_INFO, "add task opt_change")); + if(!task_create_new_elem(udb, last, &e, sizeof(struct task_list_d), + NULL)) { + log_msg(LOG_ERR, "tasklist: out of space, cannot add o_c"); + return; } + TASKLIST(&e)->task_type = task_opt_change; +#ifdef RATELIMIT + TASKLIST(&e)->oldserial = opt->rrl_ratelimit; + TASKLIST(&e)->newserial = opt->rrl_whitelist_ratelimit; + TASKLIST(&e)->yesno = (uint64_t) opt->rrl_slip; +#else + (void)opt; +#endif + udb_ptr_unlink(&e, udb); +} - startpos = ftello(df); - if(startpos == -1) { - log_msg(LOG_INFO, "could not ftello: %s.", strerror(errno)); - region_destroy(data->region); - fclose(df); +int +task_new_apply_xfr(udb_base* udb, udb_ptr* last, const dname_type* dname, + uint32_t old_serial, uint32_t new_serial, uint64_t filenumber) +{ + udb_ptr e; + DEBUG(DEBUG_IPC,1, (LOG_INFO, "add task apply_xfr")); + if(!task_create_new_elem(udb, last, &e, sizeof(struct task_list_d) + +dname_total_size(dname), dname)) { + log_msg(LOG_ERR, "tasklist: out of space, cannot add applyxfr"); return 0; } + TASKLIST(&e)->oldserial = old_serial; + TASKLIST(&e)->newserial = new_serial; + TASKLIST(&e)->yesno = filenumber; + TASKLIST(&e)->task_type = task_apply_xfr; + udb_ptr_unlink(&e, udb); + return 1; +} - DEBUG(DEBUG_XFRD,1, (LOG_INFO, "start of diff file read at pos %u", - (uint32_t) db->diff_pos)); - while(diff_read_32(df, &type)) - { - DEBUG(DEBUG_XFRD,2, (LOG_INFO, "iter loop")); - - /* read timestamp */ - if(!diff_read_32(df, ×tamp[0]) || - !diff_read_32(df, ×tamp[1])) { - log_msg(LOG_INFO, "could not read timestamp: %s.", - strerror(errno)); - region_destroy(data->region); - fclose(df); - return 0; - } +void +task_process_expire(namedb_type* db, struct task_list_d* task) +{ + uint8_t ok; + zone_type* z = namedb_find_zone(db, task->zname); + assert(task->task_type == task_expire); + if(!z) { + DEBUG(DEBUG_IPC, 1, (LOG_WARNING, "zone %s %s but not in zonetree", + dname_to_string(task->zname, NULL), + task->yesno?"expired":"unexpired")); + return; + } + DEBUG(DEBUG_IPC,1, (LOG_INFO, "xfrd: expire task zone %s %s", + dname_to_string(task->zname,0), + task->yesno?"expired":"unexpired")); + /* find zone, set expire flag */ + ok = !task->yesno; + /* only update zone->is_ok if needed to minimize copy-on-write + * of memory pages shared after fork() */ + if(ok && !z->is_ok) + z->is_ok = 1; + else if(!ok && z->is_ok) + z->is_ok = 0; +} - if(!read_process_part(db, df, type, opt, data, log, - child_count, &startpos)) - { - log_msg(LOG_INFO, "error processing diff file"); - region_destroy(data->region); - fclose(df); - return 0; - } - startpos = ftello(df); - if(startpos == -1) { - log_msg(LOG_INFO, "could not ftello: %s.", strerror(errno)); - region_destroy(data->region); - fclose(df); - return 0; - } +static void +task_process_set_verbosity(struct task_list_d* task) +{ + DEBUG(DEBUG_IPC,1, (LOG_INFO, "verbosity task %d", (int)task->yesno)); + verbosity = task->yesno; +} + +static void +task_process_checkzones(struct nsd* nsd, udb_base* udb, udb_ptr* last_task, + struct task_list_d* task) +{ + /* on SIGHUP check if zone-text-files changed and if so, + * reread. 
When from xfrd-reload, no need to fstat the files */ + if(task->yesno) { + zone_options_t* zo = zone_options_find(nsd->options, + task->zname); + if(zo) + namedb_check_zonefile(nsd->db, udb, last_task, zo); + } else { + /* check all zones */ + namedb_check_zonefiles(nsd->db, nsd->options, udb, last_task); } - DEBUG(DEBUG_XFRD,1, (LOG_INFO, "end of diff file read")); +} - if(find_smallest_offset(data, &db->diff_pos)) { - /* can skip to the first unused element */ - DEBUG(DEBUG_XFRD,2, (LOG_INFO, "next time skip diff file")); - db->diff_skip = 1; +static void +task_process_writezones(struct nsd* nsd, struct task_list_d* task) +{ + if(task->yesno) { + zone_options_t* zo = zone_options_find(nsd->options, + task->zname); + if(zo) + namedb_write_zonefile(nsd->db, zo); } else { - /* all processed, can skip to here next time */ - DEBUG(DEBUG_XFRD,2, (LOG_INFO, "next time skip diff file")); - db->diff_skip = 1; - db->diff_pos = ftello(df); - if(db->diff_pos == -1) { - log_msg(LOG_INFO, "could not ftello: %s.", - strerror(errno)); - db->diff_skip = 0; - } + namedb_write_zonefiles(nsd->db, nsd->options); } +} - region_destroy(data->region); - fclose(df); - return 1; +static void +task_process_add_zone(struct nsd* nsd, udb_base* udb, udb_ptr* last_task, + struct task_list_d* task) +{ + zone_type* z; + const dname_type* zdname; + const char* zname = (const char*)task->zname; + const char* pname = zname + strlen(zname)+1; + DEBUG(DEBUG_IPC,1, (LOG_INFO, "addzone task %s %s", zname, pname)); + zdname = dname_parse(nsd->db->region, zname); + if(!zdname) { + log_msg(LOG_ERR, "can not parse zone name %s", zname); + return; + } + /* create zone */ + z = find_or_create_zone(nsd->db, zdname, nsd->options, zname, pname); + if(!z) { + region_recycle(nsd->db->region, (void*)zdname, + dname_total_size(zdname)); + log_msg(LOG_ERR, "can not add zone %s %s", zname, pname); + return; + } + /* if zone is empty, attempt to read the zonefile from disk (if any) */ + if(!z->soa_rrset && z->opts->pattern->zonefile) { + namedb_read_zonefile(nsd->db, z, udb, last_task); + } } -static int diff_broken(FILE *df, off_t* break_pos) +static void +task_process_del_zone(struct nsd* nsd, struct task_list_d* task) { - uint32_t type, len, len2; - *break_pos = ftello(df); + udb_ptr udbz; + zone_type* zone; + zone_options_t* zopt; + DEBUG(DEBUG_IPC,1, (LOG_INFO, "delzone task %s", dname_to_string( + task->zname, NULL))); + zone = namedb_find_zone(nsd->db, task->zname); + if(!zone) + return; - /* try to read and validate parts of the file */ - while(diff_read_32(df, &type)) /* cannot read type is no error, normal EOF */ - { - /* check type */ - if(type != DIFF_PART_IXFR && type != DIFF_PART_SURE) - return 1; - /* check length */ - if(!diff_read_32(df, &len)) - return 1; /* EOF inside the part is error */ - if(fseeko(df, len, SEEK_CUR) == -1) - { - log_msg(LOG_INFO, "fseeko failed: %s", strerror(errno)); - return 1; - } - /* fseek clears EOF flag, but try reading length value, - if EOF, the part is truncated */ - if(!diff_read_32(df, &len2)) - return 1; - if(len != len2) - return 1; /* bad part, lengths must agree */ - /* this part is ok */ - *break_pos = ftello(df); +#ifdef NSEC3 + nsec3_hash_tree_clear(zone); +#endif + delete_zone_rrs(nsd->db, zone); + if(udb_zone_search(nsd->db->udb, &udbz, dname_name(task->zname), + task->zname->name_size)) { + udb_zone_delete(nsd->db->udb, &udbz); + udb_ptr_unlink(&udbz, nsd->db->udb); } - return 0; +#ifdef NSEC3 + nsec3_clear_precompile(nsd->db, zone); + zone->nsec3_param = NULL; +#endif /* NSEC3 */ + 
+ /* remove from zonetree, apex, soa */ + zopt = zone->opts; + namedb_zone_delete(nsd->db, zone); + /* remove from options (zone_list already edited by xfrd) */ + zone_options_delete(nsd->options, zopt); +} + +static void +task_process_add_key(struct nsd* nsd, struct task_list_d* task) +{ + key_options_t key; + key.name = (char*)task->zname; + DEBUG(DEBUG_IPC,1, (LOG_INFO, "addkey task %s", key.name)); + key.algorithm = key.name + strlen(key.name)+1; + key.secret = key.algorithm + strlen(key.algorithm)+1; + key_options_add_modify(nsd->options, &key); + memset(key.secret, 0xdd, strlen(key.secret)); /* wipe secret */ } -void diff_snip_garbage(namedb_type* db, nsd_options_t* opt) +static void +task_process_del_key(struct nsd* nsd, struct task_list_d* task) { - off_t break_pos; - const char* filename = opt->difffile; - FILE *df; + char* name = (char*)task->zname; + DEBUG(DEBUG_IPC,1, (LOG_INFO, "delkey task %s", name)); + /* this is reload and nothing is using the TSIG key right now */ + key_options_remove(nsd->options, name); +} - /* open file here and keep open, so it cannot change under our nose */ - df = fopen(filename, "r+"); - if(!df) { - DEBUG(DEBUG_XFRD,1, (LOG_INFO, "could not open file %s for garbage collecting: %s", - filename, strerror(errno))); +static void +task_process_add_pattern(struct nsd* nsd, struct task_list_d* task) +{ + region_type* temp = region_create(xalloc, free); + buffer_type buffer; + pattern_options_t *pat; + buffer_create_from(&buffer, task->zname, task->yesno); + pat = pattern_options_unmarshal(temp, &buffer); + DEBUG(DEBUG_IPC,1, (LOG_INFO, "addpattern task %s", pat->pname)); + pattern_options_add_modify(nsd->options, pat); + region_destroy(temp); +} + +static void +task_process_del_pattern(struct nsd* nsd, struct task_list_d* task) +{ + char* name = (char*)task->zname; + DEBUG(DEBUG_IPC,1, (LOG_INFO, "delpattern task %s", name)); + pattern_options_remove(nsd->options, name); +} + +static void +task_process_opt_change(struct nsd* nsd, struct task_list_d* task) +{ + DEBUG(DEBUG_IPC,1, (LOG_INFO, "optchange task")); +#ifdef RATELIMIT + nsd->options->rrl_ratelimit = task->oldserial; + nsd->options->rrl_whitelist_ratelimit = task->newserial; + nsd->options->rrl_slip = task->yesno; + rrl_set_limit(nsd->options->rrl_ratelimit, nsd->options->rrl_whitelist_ratelimit, + nsd->options->rrl_slip); +#else + (void)nsd; (void)task; +#endif +} + +static void +task_process_apply_xfr(struct nsd* nsd, udb_base* udb, udb_ptr *last_task, + udb_ptr* task) +{ + /* we have to use an udb_ptr task here, because the apply_xfr procedure + * appends soa_info which may remap and change the pointer. 
*/ + zone_type* zone; + FILE* df; + DEBUG(DEBUG_IPC,1, (LOG_INFO, "applyxfr task %s", dname_to_string( + TASKLIST(task)->zname, NULL))); + zone = namedb_find_zone(nsd->db, TASKLIST(task)->zname); + if(!zone) { + /* assume the zone has been deleted and a zone transfer was + * still waiting to be processed */ return; } - /* and skip into file, since nsd does not read anything before the pos */ - if(db->diff_skip) { - DEBUG(DEBUG_XFRD,1, (LOG_INFO, "garbage collect skip diff file")); - if(fseeko(df, db->diff_pos, SEEK_SET)==-1) { - log_msg(LOG_INFO, "could not fseeko file %s: %s.", - filename, strerror(errno)); - fclose(df); - return; - } + /* apply the XFR */ + /* oldserial, newserial, yesno is filenumber */ + df = xfrd_open_xfrfile(nsd, TASKLIST(task)->yesno, "r"); + if(!df) { + /* could not open file to update */ + /* there is no reply to xfrd failed-update, + * because xfrd has a scan for apply-failures. */ + return; } - - /* detect break point */ - if(diff_broken(df, &break_pos)) - { - /* snip off at break_pos */ - DEBUG(DEBUG_XFRD,1, (LOG_INFO, "snipping off trailing partial part of %s", - filename)); - if(ftruncate(fileno(df), break_pos) == -1) - log_msg(LOG_ERR, "ftruncate %s failed: %s", - filename, strerror(errno)); + /* read and apply zone transfer */ + if(!apply_ixfr_for_zone(nsd, zone, df, nsd->options, udb, + last_task, TASKLIST(task)->yesno)) { + /* there is no reply to xfrd failed-update, + * because xfrd has a scan for apply-failures. */ } fclose(df); + xfrd_unlink_xfrfile(nsd, TASKLIST(task)->yesno); +} + + +void task_process_in_reload(struct nsd* nsd, udb_base* udb, udb_ptr *last_task, + udb_ptr* task) +{ + switch(TASKLIST(task)->task_type) { + case task_expire: + task_process_expire(nsd->db, TASKLIST(task)); + break; + case task_check_zonefiles: + task_process_checkzones(nsd, udb, last_task, TASKLIST(task)); + break; + case task_write_zonefiles: + task_process_writezones(nsd, TASKLIST(task)); + break; + case task_set_verbosity: + task_process_set_verbosity(TASKLIST(task)); + break; + case task_add_zone: + task_process_add_zone(nsd, udb, last_task, TASKLIST(task)); + break; + case task_del_zone: + task_process_del_zone(nsd, TASKLIST(task)); + break; + case task_add_key: + task_process_add_key(nsd, TASKLIST(task)); + break; + case task_del_key: + task_process_del_key(nsd, TASKLIST(task)); + break; + case task_add_pattern: + task_process_add_pattern(nsd, TASKLIST(task)); + break; + case task_del_pattern: + task_process_del_pattern(nsd, TASKLIST(task)); + break; + case task_opt_change: + task_process_opt_change(nsd, TASKLIST(task)); + break; + case task_apply_xfr: + task_process_apply_xfr(nsd, udb, last_task, task); + break; + default: + log_msg(LOG_WARNING, "unhandled task in reload type %d", + (int)TASKLIST(task)->task_type); + break; + } + udb_ptr_free_space(task, udb, TASKLIST(task)->size); } diff --git a/usr.sbin/nsd/difffile.h b/usr.sbin/nsd/difffile.h index d54c629b5a7..d5f2cb8833f 100644 --- a/usr.sbin/nsd/difffile.h +++ b/usr.sbin/nsd/difffile.h @@ -1,7 +1,7 @@ /* * difffile.h - nsd.diff file handling header file. Read/write diff files. * - * Copyright (c) 2001-2011, NLnet Labs. All rights reserved. + * Copyright (c) 2001-2006, NLnet Labs. All rights reserved. * * See LICENSE for the license. 
* @@ -9,62 +9,123 @@ #ifndef DIFFFILE_H #define DIFFFILE_H -#include "config.h" #include "rbtree.h" #include "namedb.h" #include "options.h" +#include "udb.h" +struct nsd; +struct nsdst; -#define DIFF_PART_IXFR ('I'<<24 | 'X'<<16 | 'F'<<8 | 'R') -#define DIFF_PART_SURE ('S'<<24 | 'U'<<16 | 'R'<<8 | 'E') - -/* - * Used to pass commit logs - */ -struct diff_log { - char* zone_name; - char* error; - char* comment; - struct diff_log* next; -}; +#define DIFF_PART_XXFR ('X'<<24 | 'X'<<16 | 'F'<<8 | 'R') +#define DIFF_PART_XFRF ('X'<<24 | 'F'<<16 | 'R'<<8 | 'F') /* write an xfr packet data to the diff file, type=IXFR. - The diff file is created if necessary. */ -void diff_write_packet(const char* zone, uint32_t new_serial, uint16_t id, - uint32_t seq_nr, uint8_t* data, size_t len, nsd_options_t* opt); + The diff file is created if necessary, with initial header(notcommitted). */ +void diff_write_packet(const char* zone, const char* pat, uint32_t old_serial, + uint32_t new_serial, uint32_t seq_nr, uint8_t* data, size_t len, + struct nsd* nsd, uint64_t filenumber); /* - * Write a commit packet to the diff file, type=SURE. - * The zone data (preceding ixfr packets) are committed. - * See NSD-DIFFFILE for meaning of the arguments. + * Overwrite header of diff file with committed vale and other data. + * append log string. */ void diff_write_commit(const char* zone, uint32_t old_serial, - uint32_t new_serial, uint16_t id, uint32_t num_parts, - uint8_t commit, const char* log_msg, - nsd_options_t* opt); - -/* check if the crc in the nsd.db is the same in memory as on disk. - returns 1 if different. 0 if the same. returns -1 on error. */ -int db_crc_different(namedb_type* db); - -/* read the diff file and apply to the database in memory. - It will attempt to skip bad data. - If you pass a non-null value log, log comments are alloced in namedb.region - then, *log must be 0 on start of call (entries are prepended). - returns 0 on an unrecoverable error. */ -int diff_read_file(namedb_type* db, nsd_options_t* opt, struct diff_log** log, - size_t child_count); - -/* check the diff file for garbage at the end (bad type, partial write) - * and snip it off. - */ -void diff_snip_garbage(namedb_type* db, nsd_options_t* opt); + uint32_t new_serial, uint32_t num_parts, uint8_t commit, + const char* log_msg, struct nsd* nsd, uint64_t filenumber); /* * These functions read parts of the diff file. 
*/ int diff_read_32(FILE *in, uint32_t* result); -int diff_read_16(FILE *in, uint16_t* result); int diff_read_8(FILE *in, uint8_t* result); int diff_read_str(FILE* in, char* buf, size_t len); +/* delete the RRs for a zone from memory */ +void delete_zone_rrs(namedb_type* db, zone_type* zone); +/* delete an RR */ +int delete_RR(namedb_type* db, const dname_type* dname, + uint16_t type, uint16_t klass, + buffer_type* packet, size_t rdatalen, zone_type *zone, + region_type* temp_region, struct udb_ptr* udbz); +/* add an RR */ +int add_RR(namedb_type* db, const dname_type* dname, + uint16_t type, uint16_t klass, uint32_t ttl, + buffer_type* packet, size_t rdatalen, zone_type *zone, + struct udb_ptr* udbz); + +/* task udb structure */ +struct task_list_d { + /** next task in list */ + udb_rel_ptr next; + /** task type */ + enum { + /** expire or un-expire a zone */ + task_expire, + /** apply an ixfr or axfr to a zone */ + task_apply_xfr, + /** soa info for zone */ + task_soa_info, + /** check mtime of zonefiles and read them, done on SIGHUP */ + task_check_zonefiles, + /** write zonefiles (if changed) */ + task_write_zonefiles, + /** set verbosity */ + task_set_verbosity, + /** statistic info */ + task_stat_info, + /** add a zone */ + task_add_zone, + /** delete zone */ + task_del_zone, + /** add TSIG key */ + task_add_key, + /** delete TSIG key */ + task_del_key, + /** add pattern */ + task_add_pattern, + /** delete pattern */ + task_del_pattern, + /** options change */ + task_opt_change + } task_type; + uint32_t size; /* size of this struct */ + + /** soainfo: zonename dname, soaRR wireform */ + /** expire: zonename, boolyesno */ + /** apply_xfr: zonename, serials, yesno is filenamecounter */ + uint32_t oldserial, newserial; + /** general variable. for some used to see if zname is present. 
*/ + uint64_t yesno; + struct dname zname[0]; +}; +#define TASKLIST(ptr) ((struct task_list_d*)UDB_PTR(ptr)) +/** create udb for tasks */ +struct udb_base* task_file_create(const char* file); +void task_remap(udb_base* udb); +void task_process_sync(udb_base* udb); +void task_clear(udb_base* udb); +void task_new_soainfo(udb_base* udb, udb_ptr* last, struct zone* z); +void task_new_expire(udb_base* udb, udb_ptr* last, + const struct dname* z, int expired); +void* task_new_stat_info(udb_base* udb, udb_ptr* last, struct nsdst* stat, + size_t child_count); +void task_new_check_zonefiles(udb_base* udb, udb_ptr* last, + const dname_type* zone); +void task_new_write_zonefiles(udb_base* udb, udb_ptr* last, + const dname_type* zone); +void task_new_set_verbosity(udb_base* udb, udb_ptr* last, int v); +void task_new_add_zone(udb_base* udb, udb_ptr* last, const char* zone, + const char* pattern); +void task_new_del_zone(udb_base* udb, udb_ptr* last, const dname_type* dname); +void task_new_add_key(udb_base* udb, udb_ptr* last, key_options_t* key); +void task_new_del_key(udb_base* udb, udb_ptr* last, const char* name); +void task_new_add_pattern(udb_base* udb, udb_ptr* last, pattern_options_t* p); +void task_new_del_pattern(udb_base* udb, udb_ptr* last, const char* name); +void task_new_opt_change(udb_base* udb, udb_ptr* last, nsd_options_t* opt); +int task_new_apply_xfr(udb_base* udb, udb_ptr* last, const dname_type* zone, + uint32_t old_serial, uint32_t new_serial, uint64_t filenumber); +void task_process_in_reload(struct nsd* nsd, udb_base* udb, udb_ptr *last_task, + udb_ptr* task); +void task_process_expire(namedb_type* db, struct task_list_d* task); + #endif /* DIFFFILE_H */ diff --git a/usr.sbin/nsd/dname.h b/usr.sbin/nsd/dname.h index fccc3ee2967..a9aa15ad177 100644 --- a/usr.sbin/nsd/dname.h +++ b/usr.sbin/nsd/dname.h @@ -1,7 +1,7 @@ /* * dname.h -- Domain name handling. * - * Copyright (c) 2001-2011, NLnet Labs. All rights reserved. + * Copyright (c) 2001-2006, NLnet Labs. All rights reserved. * * See LICENSE for the license. * @@ -179,9 +179,9 @@ dname_label(const dname_type *dname, uint8_t label) * Return < 0 if LEFT < RIGHT, 0 if LEFT == RIGHT, and > 0 if LEFT > * RIGHT. The comparison is case sensitive. * - * Pre: vleft != NULL && vright != NULL + * Pre: left != NULL && right != NULL */ -int dname_compare(const void *vleft, const void *right); +int dname_compare(const dname_type *left, const dname_type *right); /* @@ -346,21 +346,6 @@ label_next(const uint8_t *label) const char *dname_to_string(const dname_type *dname, const dname_type *origin); -/* - * Convert DNAME to its string representation. This is a reentrant - * version of dname_to_string. The buf argument is a pointer to a - * user defined result buffer capable of holding the string representation - * of a DNAME. Due to escape sequences and such, this buffer is recommeneded - * to be at least 5 * MAXDOMAINLEN in size. - * - * If ORIGIN is provided and DNAME is a subdomain of ORIGIN the dname - * will be represented relative to ORIGIN. - * - * Pre: dname != NULL - */ -const char *dname_to_string_r(const dname_type *dname, - const dname_type *origin, - char *buf); /* * Create a dname containing the single label specified by STR @@ -389,13 +374,11 @@ const dname_type *dname_replace(region_type* region, const dname_type* src, const dname_type* dest); -#ifndef FULL_PREHASH -/** - * Create a dname representing the wildcard form of the passed dname. 
- */ -int dname_make_wildcard(struct region *region, - struct dname const *dname, - struct dname const **wildcard); -#endif +/** Convert uncompressed wireformat dname to a string */ +char* wiredname2str(const uint8_t* dname); +/** convert uncompressed label to string */ +char* wirelabel2str(const uint8_t* label); +/** check if two uncompressed dnames of the same total length are equal */ +int dname_equal_nocase(uint8_t* a, uint8_t* b, uint16_t len); #endif /* _DNAME_H_ */ diff --git a/usr.sbin/nsd/edns.c b/usr.sbin/nsd/edns.c index b69873f5fa1..57c2e6c6634 100644 --- a/usr.sbin/nsd/edns.c +++ b/usr.sbin/nsd/edns.c @@ -1,7 +1,7 @@ /* * edns.c -- EDNS definitions (RFC 2671). * - * Copyright (c) 2001-2011, NLnet Labs. All rights reserved. + * Copyright (c) 2001-2006, NLnet Labs. All rights reserved. * * See LICENSE for the license. * diff --git a/usr.sbin/nsd/edns.h b/usr.sbin/nsd/edns.h index 8de1b685f01..b8643e954e4 100644 --- a/usr.sbin/nsd/edns.h +++ b/usr.sbin/nsd/edns.h @@ -1,7 +1,7 @@ /* * edns.h -- EDNS definitions (RFC 2671). * - * Copyright (c) 2001-2011, NLnet Labs. All rights reserved. + * Copyright (c) 2001-2006, NLnet Labs. All rights reserved. * * See LICENSE for the license. * diff --git a/usr.sbin/nsd/ipc.c b/usr.sbin/nsd/ipc.c index 28e1cc5e7ec..141b0f3a83d 100644 --- a/usr.sbin/nsd/ipc.c +++ b/usr.sbin/nsd/ipc.c @@ -1,7 +1,7 @@ /* * ipc.c - Interprocess communication routines. Handlers read and write. * - * Copyright (c) 2001-2011, NLnet Labs. All rights reserved. + * Copyright (c) 2001-2006, NLnet Labs. All rights reserved. * * See LICENSE for the license. * @@ -11,6 +11,7 @@ #include <errno.h> #include <unistd.h> #include <stdlib.h> +#include <fcntl.h> #include "ipc.h" #include "buffer.h" #include "xfrd-tcp.h" @@ -18,59 +19,16 @@ #include "namedb.h" #include "xfrd.h" #include "xfrd-notify.h" +#include "difffile.h" -/* set is_ok for the zone according to the zone message */ -static zone_type* handle_xfrd_zone_state(struct nsd* nsd, buffer_type* packet); -/* write ipc ZONE_STATE message into the buffer */ -static void write_zone_state_packet(buffer_type* packet, zone_type* zone); /* attempt to send NSD_STATS command to child fd */ static void send_stat_to_child(struct main_ipc_handler_data* data, int fd); -/* write IPC expire notification msg to a buffer */ -static void xfrd_write_expire_notification(buffer_type* buffer, xfrd_zone_t* zone); /* send reload request over the IPC channel */ static void xfrd_send_reload_req(xfrd_state_t* xfrd); /* send quit request over the IPC channel */ static void xfrd_send_quit_req(xfrd_state_t* xfrd); -/* get SOA INFO out of IPC packet buffer */ -static void xfrd_handle_ipc_SOAINFO(xfrd_state_t* xfrd, buffer_type* packet); /* perform read part of handle ipc for xfrd */ -static void xfrd_handle_ipc_read(netio_handler_type *handler, xfrd_state_t* xfrd); - -static zone_type* -handle_xfrd_zone_state(struct nsd* nsd, buffer_type* packet) -{ - uint8_t ok; - const dname_type *dname; - domain_type *domain; - zone_type *zone; - - ok = buffer_read_u8(packet); - dname = (dname_type*)buffer_current(packet); - DEBUG(DEBUG_IPC,1, (LOG_INFO, "handler zone state %s is %s", - dname_to_string(dname, NULL), ok?"ok":"expired")); - /* find in zone_types, if does not exist, we cannot serve anyway */ - /* find zone in config, since that one always exists */ - domain = domain_table_find(nsd->db->domains, dname); - if(!domain) { - DEBUG(DEBUG_IPC,1, (LOG_INFO, "zone state msg, empty zone (domain %s)", - dname_to_string(dname, NULL))); - return NULL; - } - zone = 
domain_find_zone(domain); - if(!zone || dname_compare(domain_dname(zone->apex), dname) != 0) { - DEBUG(DEBUG_IPC,1, (LOG_INFO, "zone state msg, empty zone (zone %s)", - dname_to_string(dname, NULL))); - return NULL; - } - assert(zone); - /* only update zone->is_ok if needed to minimize copy-on-write - of memory pages shared after fork() */ - if(ok && !zone->is_ok) - zone->is_ok = 1; - if(!ok && zone->is_ok) - zone->is_ok = 0; - return zone; -} +static void xfrd_handle_ipc_read(struct event* handler, xfrd_state_t* xfrd); static void ipc_child_quit(struct nsd* nsd) @@ -80,42 +38,27 @@ ipc_child_quit(struct nsd* nsd) #ifdef BIND8_STATS bind8_stats(nsd); #endif /* BIND8_STATS */ + +#if 0 /* OS collects memory pages */ + event_base_free(event_base); + region_destroy(server_region); +#endif server_shutdown(nsd); exit(0); } void -child_handle_parent_command(netio_type *ATTR_UNUSED(netio), - netio_handler_type *handler, - netio_event_types_type event_types) +child_handle_parent_command(int fd, short event, void* arg) { sig_atomic_t mode; int len; struct ipc_handler_conn_data *data = - (struct ipc_handler_conn_data *) handler->user_data; - if (!(event_types & NETIO_EVENT_READ)) { - return; - } - - if(data->conn->is_reading) { - int ret = conn_read(data->conn); - if(ret == -1) { - log_msg(LOG_ERR, "handle_parent_command: error in conn_read: %s", - strerror(errno)); - data->conn->is_reading = 0; - return; - } - if(ret == 0) { - return; /* continue later */ - } - /* completed */ - data->conn->is_reading = 0; - buffer_flip(data->conn->packet); - (void)handle_xfrd_zone_state(data->nsd, data->conn->packet); + (struct ipc_handler_conn_data *) arg; + if (!(event & EV_READ)) { return; } - if ((len = read(handler->fd, &mode, sizeof(mode))) == -1) { + if ((len = read(fd, &mode, sizeof(mode))) == -1) { log_msg(LOG_ERR, "handle_parent_command: read: %s", strerror(errno)); return; @@ -123,7 +66,7 @@ child_handle_parent_command(netio_type *ATTR_UNUSED(netio), if (len == 0) { /* parent closed the connection. 
Quit */ - data->nsd->mode = NSD_QUIT; + ipc_child_quit(data->nsd); return; } @@ -139,15 +82,22 @@ child_handle_parent_command(netio_type *ATTR_UNUSED(netio), server_close_all_sockets(data->nsd->udp, data->nsd->ifs); server_close_all_sockets(data->nsd->tcp, data->nsd->ifs); /* mode == NSD_QUIT_CHILD */ - (void)write(handler->fd, &mode, sizeof(mode)); + (void)write(fd, &mode, sizeof(mode)); ipc_child_quit(data->nsd); break; - case NSD_ZONE_STATE: - data->conn->is_reading = 1; - data->conn->total_bytes = 0; - data->conn->msglen = 0; - data->conn->fd = handler->fd; - buffer_clear(data->conn->packet); + case NSD_QUIT_WITH_STATS: +#ifdef BIND8_STATS + DEBUG(DEBUG_IPC, 2, (LOG_INFO, "quit QUIT_WITH_STATS")); + /* reply with ack and stats and then quit */ + if(!write_socket(fd, &mode, sizeof(mode))) { + log_msg(LOG_ERR, "cannot write quitwst to parent"); + } + if(!write_socket(fd, &data->nsd->st, sizeof(data->nsd->st))) { + log_msg(LOG_ERR, "cannot write stats to parent"); + } + fsync(fd); +#endif /* BIND8_STATS */ + ipc_child_quit(data->nsd); break; default: log_msg(LOG_ERR, "handle_parent_command: bad mode %d", @@ -169,38 +119,6 @@ parent_handle_xfrd_command(netio_type *ATTR_UNUSED(netio), return; } - if(data->conn->is_reading) { - /* handle ZONE_STATE forward to children */ - int ret = conn_read(data->conn); - size_t i; - zone_type* zone; - if(ret == -1) { - log_msg(LOG_ERR, "main xfrd listener: error in conn_read: %s", - strerror(errno)); - data->conn->is_reading = 0; - return; - } - if(ret == 0) { - return; /* continue later */ - } - /* completed */ - data->conn->is_reading = 0; - buffer_flip(data->conn->packet); - zone = handle_xfrd_zone_state(data->nsd, data->conn->packet); - if(!zone) - return; - /* forward to all children */ - for (i = 0; i < data->nsd->child_count; ++i) { - if(!zone->dirty[i]) { - zone->dirty[i] = 1; - stack_push(data->nsd->children[i].dirty_zones, zone); - data->nsd->children[i].handler->event_types |= - NETIO_EVENT_WRITE; - } - } - return; - } - if ((len = read(handler->fd, &mode, sizeof(mode))) == -1) { log_msg(LOG_ERR, "handle_xfrd_command: read: %s", strerror(errno)); @@ -212,6 +130,7 @@ parent_handle_xfrd_command(netio_type *ATTR_UNUSED(netio), DEBUG(DEBUG_IPC,1, (LOG_INFO, "handle_xfrd_command: xfrd closed channel.")); close(handler->fd); handler->fd = -1; + data->nsd->mode = NSD_SHUTDOWN; return; } @@ -221,18 +140,15 @@ parent_handle_xfrd_command(netio_type *ATTR_UNUSED(netio), data->nsd->signal_hint_reload = 1; break; case NSD_QUIT: + case NSD_SHUTDOWN: data->nsd->mode = mode; break; + case NSD_STATS: + data->nsd->signal_hint_stats = 1; + break; case NSD_REAP_CHILDREN: data->nsd->signal_hint_child = 1; break; - case NSD_ZONE_STATE: - data->conn->is_reading = 1; - data->conn->total_bytes = 0; - data->conn->msglen = 0; - data->conn->fd = handler->fd; - buffer_clear(data->conn->packet); - break; default: log_msg(LOG_ERR, "handle_xfrd_command: bad mode %d", (int) mode); @@ -241,27 +157,6 @@ parent_handle_xfrd_command(netio_type *ATTR_UNUSED(netio), } static void -write_zone_state_packet(buffer_type* packet, zone_type* zone) -{ - sig_atomic_t cmd = NSD_ZONE_STATE; - uint8_t ok = zone->is_ok; - uint16_t sz; - if(!zone->apex) { - return; - } - sz = dname_total_size(domain_dname(zone->apex)) + 1; - sz = htons(sz); - - buffer_clear(packet); - buffer_write(packet, &cmd, sizeof(cmd)); - buffer_write(packet, &sz, sizeof(sz)); - buffer_write(packet, &ok, sizeof(ok)); - buffer_write(packet, domain_dname(zone->apex), - dname_total_size(domain_dname(zone->apex))); - 
buffer_flip(packet); -} - -static void send_stat_to_child(struct main_ipc_handler_data* data, int fd) { sig_atomic_t cmd = NSD_STATS; @@ -275,6 +170,7 @@ send_stat_to_child(struct main_ipc_handler_data* data, int fd) data->child->need_to_send_STATS = 0; } +#ifndef NDEBUG int packet_read_query_section(buffer_type *packet, uint8_t* dest, uint16_t* qtype, uint16_t* qclass); static void debug_print_fwd_name(int ATTR_UNUSED(len), buffer_type* packet, int acl_num) @@ -297,11 +193,16 @@ debug_print_fwd_name(int ATTR_UNUSED(len), buffer_type* packet, int acl_num) buffer_set_position(packet, bufpos); region_destroy(tempregion); } +#endif static void send_quit_to_child(struct main_ipc_handler_data* data, int fd) { +#ifdef BIND8_STATS + sig_atomic_t cmd = NSD_QUIT_WITH_STATS; +#else sig_atomic_t cmd = NSD_QUIT; +#endif if(write(fd, &cmd, sizeof(cmd)) == -1) { if(errno == EAGAIN || errno == EINTR) return; /* try again later */ @@ -314,6 +215,75 @@ send_quit_to_child(struct main_ipc_handler_data* data, int fd) (int)data->child->pid)); } +/** the child is done, mark it as exited */ +static void +child_is_done(struct nsd* nsd, int fd) +{ + size_t i; + if(fd != -1) close(fd); + for(i=0; i<nsd->child_count; ++i) + if(nsd->children[i].child_fd == fd) { + nsd->children[i].child_fd = -1; + nsd->children[i].has_exited = 1; + nsd->children[i].handler->fd = -1; + DEBUG(DEBUG_IPC,1, (LOG_INFO, "server %d is done", + (int)nsd->children[i].pid)); + } + parent_check_all_children_exited(nsd); +} + +#ifdef BIND8_STATS +/** add stats to total */ +void +stats_add(struct nsdst* total, struct nsdst* s) +{ + unsigned i; + for(i=0; i<sizeof(total->qtype)/sizeof(stc_t); i++) + total->qtype[i] += s->qtype[i]; + for(i=0; i<sizeof(total->qclass)/sizeof(stc_t); i++) + total->qclass[i] += s->qclass[i]; + total->qudp += s->qudp; + total->qudp6 += s->qudp6; + total->ctcp += s->ctcp; + total->ctcp6 += s->ctcp6; + for(i=0; i<sizeof(total->rcode)/sizeof(stc_t); i++) + total->rcode[i] += s->rcode[i]; + for(i=0; i<sizeof(total->opcode)/sizeof(stc_t); i++) + total->opcode[i] += s->opcode[i]; + total->dropped += s->dropped; + total->truncated += s->truncated; + total->wrongzone += s->wrongzone; + total->txerr += s->txerr; + total->rxerr += s->rxerr; + total->edns += s->edns; + total->ednserr += s->ednserr; + total->raxfr += s->raxfr; + total->nona += s->nona; + + total->db_disk = s->db_disk; + total->db_mem = s->db_mem; +} + +#define FINAL_STATS_TIMEOUT 10 /* seconds */ +static void +read_child_stats(struct nsd* nsd, struct nsd_child* child, int fd) +{ + struct nsdst s; + errno=0; + if(block_read(nsd, fd, &s, sizeof(s), FINAL_STATS_TIMEOUT)!=sizeof(s)) { + log_msg(LOG_ERR, "problems reading finalstats from server " + "%d: %s", (int)child->pid, strerror(errno)); + } else { + stats_add(&nsd->st, &s); + child->query_count = s.qudp + s.qudp6 + s.ctcp + s.ctcp6; + /* we know that the child is going to close the connection + * now (this is an ACK of the QUIT_W_STATS so we know the + * child is done, no longer sending e.g. NOTIFY contents) */ + child_is_done(nsd, fd); + } +} +#endif /* BIND8_STATS */ + void parent_handle_child_command(netio_type *ATTR_UNUSED(netio), netio_handler_type *handler, @@ -326,43 +296,14 @@ parent_handle_child_command(netio_type *ATTR_UNUSED(netio), /* do a nonblocking write to the child if it is ready. 
*/ if (event_types & NETIO_EVENT_WRITE) { - if(!data->busy_writing_zone_state && - !data->child->need_to_send_STATS && - !data->child->need_to_send_QUIT && - !data->child->need_to_exit && - data->child->dirty_zones->num > 0) { - /* create packet from next dirty zone */ - zone_type* zone = (zone_type*)stack_pop(data->child->dirty_zones); - assert(zone); - zone->dirty[data->child_num] = 0; - data->busy_writing_zone_state = 1; - write_zone_state_packet(data->write_conn->packet, zone); - data->write_conn->msglen = buffer_limit(data->write_conn->packet); - data->write_conn->total_bytes = sizeof(uint16_t); /* len bytes already in packet */ - data->write_conn->fd = handler->fd; - } - if(data->busy_writing_zone_state) { - /* write more of packet */ - int ret = conn_write(data->write_conn); - if(ret == -1) { - log_msg(LOG_ERR, "handle_child_cmd %d: could not write: %s", - (int)data->child->pid, strerror(errno)); - data->busy_writing_zone_state = 0; - } else if(ret == 1) { - data->busy_writing_zone_state = 0; /* completed */ - } - } else if(data->child->need_to_send_STATS && - !data->child->need_to_exit) { + if(data->child->need_to_send_STATS && + !data->child->need_to_exit) { send_stat_to_child(data, handler->fd); } else if(data->child->need_to_send_QUIT) { send_quit_to_child(data, handler->fd); if(!data->child->need_to_send_QUIT) handler->event_types = NETIO_EVENT_READ; - } - if(!data->busy_writing_zone_state && - !data->child->need_to_send_STATS && - !data->child->need_to_send_QUIT && - data->child->dirty_zones->num == 0) { + } else { handler->event_types = NETIO_EVENT_READ; } } @@ -468,18 +409,7 @@ parent_handle_child_command(netio_type *ATTR_UNUSED(netio), } if (len == 0) { - size_t i; - if(handler->fd != -1) close(handler->fd); - for(i=0; i<data->nsd->child_count; ++i) - if(data->nsd->children[i].child_fd == handler->fd) { - data->nsd->children[i].child_fd = -1; - data->nsd->children[i].has_exited = 1; - DEBUG(DEBUG_IPC,1, (LOG_INFO, - "server %d closed cmd channel", - (int) data->nsd->children[i].pid)); - } - handler->fd = -1; - parent_check_all_children_exited(data->nsd); + child_is_done(data->nsd, handler->fd); return; } @@ -487,6 +417,11 @@ parent_handle_child_command(netio_type *ATTR_UNUSED(netio), case NSD_QUIT: data->nsd->mode = mode; break; +#ifdef BIND8_STATS + case NSD_QUIT_WITH_STATS: + read_child_stats(data->nsd, data->child, handler->fd); + break; +#endif /* BIND8_STATS */ case NSD_STATS: data->nsd->signal_hint_stats = 1; break; @@ -573,33 +508,16 @@ parent_handle_reload_command(netio_type *ATTR_UNUSED(netio), } static void -xfrd_write_expire_notification(buffer_type* buffer, xfrd_zone_t* zone) -{ - sig_atomic_t cmd = NSD_ZONE_STATE; - uint8_t ok = 1; - uint16_t sz = dname_total_size(zone->apex) + 1; - sz = htons(sz); - if(zone->state == xfrd_zone_expired) - ok = 0; - - DEBUG(DEBUG_IPC,1, (LOG_INFO, - "xfrd encoding ipc zone state msg for zone %s state %d.", - zone->apex_str, (int)zone->state)); - - buffer_clear(buffer); - buffer_write(buffer, &cmd, sizeof(cmd)); - buffer_write(buffer, &sz, sizeof(sz)); - buffer_write(buffer, &ok, sizeof(ok)); - buffer_write(buffer, zone->apex, dname_total_size(zone->apex)); - buffer_flip(buffer); -} - -static void xfrd_send_reload_req(xfrd_state_t* xfrd) { sig_atomic_t req = NSD_RELOAD; + uint64_t p = xfrd->last_task->data; + udb_ptr_unlink(xfrd->last_task, xfrd->nsd->task[xfrd->nsd->mytask]); + task_process_sync(xfrd->nsd->task[xfrd->nsd->mytask]); /* ask server_main for a reload */ - if(write(xfrd->ipc_handler.fd, &req, sizeof(req)) == -1) { + 
if(write(xfrd->ipc_handler.ev_fd, &req, sizeof(req)) == -1) { + udb_ptr_init(xfrd->last_task, xfrd->nsd->task[xfrd->nsd->mytask]); + udb_ptr_set(xfrd->last_task, xfrd->nsd->task[xfrd->nsd->mytask], p); if(errno == EAGAIN || errno == EINTR) return; /* try again later */ log_msg(LOG_ERR, "xfrd: problems sending reload command: %s", @@ -607,21 +525,55 @@ xfrd_send_reload_req(xfrd_state_t* xfrd) return; } DEBUG(DEBUG_IPC,1, (LOG_INFO, "xfrd: asked nsd to reload new updates")); + /* swapped task to other side, start to use other task udb. */ + xfrd->nsd->mytask = 1 - xfrd->nsd->mytask; + task_remap(xfrd->nsd->task[xfrd->nsd->mytask]); + udb_ptr_init(xfrd->last_task, xfrd->nsd->task[xfrd->nsd->mytask]); + assert(udb_base_get_userdata(xfrd->nsd->task[xfrd->nsd->mytask])->data == 0); + xfrd_prepare_zones_for_reload(); xfrd->reload_cmd_last_sent = xfrd_time(); xfrd->need_to_send_reload = 0; xfrd->can_send_reload = 0; } +void +ipc_xfrd_set_listening(struct xfrd_state* xfrd, short mode) +{ + int fd = xfrd->ipc_handler.ev_fd; + struct event_base* base = xfrd->event_base; + event_del(&xfrd->ipc_handler); + event_set(&xfrd->ipc_handler, fd, mode, xfrd_handle_ipc, xfrd); + if(event_base_set(base, &xfrd->ipc_handler) != 0) + log_msg(LOG_ERR, "ipc: cannot set event_base"); + /* no timeout for IPC events */ + if(event_add(&xfrd->ipc_handler, NULL) != 0) + log_msg(LOG_ERR, "ipc: cannot add event"); + xfrd->ipc_handler_flags = mode; +} + +static void +xfrd_send_shutdown_req(xfrd_state_t* xfrd) +{ + sig_atomic_t cmd = NSD_SHUTDOWN; + xfrd->ipc_send_blocked = 1; + ipc_xfrd_set_listening(xfrd, EV_PERSIST|EV_READ); + DEBUG(DEBUG_IPC,1, (LOG_INFO, "xfrd: ipc send shutdown")); + if(!write_socket(xfrd->ipc_handler.ev_fd, &cmd, sizeof(cmd))) { + log_msg(LOG_ERR, "xfrd: error writing shutdown to main: %s", + strerror(errno)); + } + xfrd->need_to_send_shutdown = 0; +} + static void xfrd_send_quit_req(xfrd_state_t* xfrd) { sig_atomic_t cmd = NSD_QUIT; xfrd->ipc_send_blocked = 1; - xfrd->ipc_handler.event_types &= (~NETIO_EVENT_WRITE); - xfrd->sending_zone_state = 0; + ipc_xfrd_set_listening(xfrd, EV_PERSIST|EV_READ); DEBUG(DEBUG_IPC,1, (LOG_INFO, "xfrd: ipc send ackreload(quit)")); - if(!write_socket(xfrd->ipc_handler.fd, &cmd, sizeof(cmd))) { + if(!write_socket(xfrd->ipc_handler.ev_fd, &cmd, sizeof(cmd))) { log_msg(LOG_ERR, "xfrd: error writing ack to main: %s", strerror(errno)); } @@ -629,119 +581,55 @@ xfrd_send_quit_req(xfrd_state_t* xfrd) } static void -xfrd_handle_ipc_SOAINFO(xfrd_state_t* xfrd, buffer_type* packet) +xfrd_send_stats(xfrd_state_t* xfrd) { - xfrd_soa_t soa; - xfrd_soa_t* soa_ptr = &soa; - xfrd_zone_t* zone; - /* dname is sent in memory format */ - const dname_type* dname = (const dname_type*)buffer_begin(packet); - - /* find zone and decode SOA */ - zone = (xfrd_zone_t*)rbtree_search(xfrd->zones, dname); - buffer_skip(packet, dname_total_size(dname)); - - if(!buffer_available(packet, sizeof(uint32_t)*6 + sizeof(uint8_t)*2)) { - /* NSD has zone without any info */ - DEBUG(DEBUG_IPC,1, (LOG_INFO, "SOAINFO for %s lost zone", - dname_to_string(dname,0))); - soa_ptr = NULL; - } else { - /* read soa info */ - memset(&soa, 0, sizeof(soa)); - /* left out type, klass, count for speed */ - soa.type = htons(TYPE_SOA); - soa.klass = htons(CLASS_IN); - soa.ttl = htonl(buffer_read_u32(packet)); - soa.rdata_count = htons(7); - soa.prim_ns[0] = buffer_read_u8(packet); - if(!buffer_available(packet, soa.prim_ns[0])) - return; - buffer_read(packet, soa.prim_ns+1, soa.prim_ns[0]); - soa.email[0] = 
buffer_read_u8(packet); - if(!buffer_available(packet, soa.email[0])) - return; - buffer_read(packet, soa.email+1, soa.email[0]); - - soa.serial = htonl(buffer_read_u32(packet)); - soa.refresh = htonl(buffer_read_u32(packet)); - soa.retry = htonl(buffer_read_u32(packet)); - soa.expire = htonl(buffer_read_u32(packet)); - soa.minimum = htonl(buffer_read_u32(packet)); - DEBUG(DEBUG_IPC,1, (LOG_INFO, "SOAINFO for %s %u", - dname_to_string(dname,0), ntohl(soa.serial))); - } - - if(!zone) { - DEBUG(DEBUG_IPC,1, (LOG_INFO, "xfrd: zone %s master zone updated", - dname_to_string(dname,0))); - notify_handle_master_zone_soainfo(xfrd->notify_zones, - dname, soa_ptr); - return; + sig_atomic_t cmd = NSD_STATS; + DEBUG(DEBUG_IPC,1, (LOG_INFO, "xfrd: ipc send stats")); + if(!write_socket(xfrd->ipc_handler.ev_fd, &cmd, sizeof(cmd))) { + log_msg(LOG_ERR, "xfrd: error writing stats to main: %s", + strerror(errno)); } - xfrd_handle_incoming_soa(zone, soa_ptr, xfrd_time()); + xfrd->need_to_send_stats = 0; } void -xfrd_handle_ipc(netio_type* ATTR_UNUSED(netio), - netio_handler_type *handler, - netio_event_types_type event_types) +xfrd_handle_ipc(int ATTR_UNUSED(fd), short event, void* arg) { - xfrd_state_t* xfrd = (xfrd_state_t*)handler->user_data; - if ((event_types & NETIO_EVENT_READ)) + xfrd_state_t* xfrd = (xfrd_state_t*)arg; + if ((event & EV_READ)) { /* first attempt to read as a signal from main * could block further send operations */ - xfrd_handle_ipc_read(handler, xfrd); + xfrd_handle_ipc_read(&xfrd->ipc_handler, xfrd); } - if ((event_types & NETIO_EVENT_WRITE)) + if ((event & EV_WRITE)) { - if(xfrd->ipc_send_blocked) { /* wait for SOA_END */ - handler->event_types = NETIO_EVENT_READ; + if(xfrd->ipc_send_blocked) { /* wait for RELOAD_DONE */ + ipc_xfrd_set_listening(xfrd, EV_PERSIST|EV_READ); return; } - /* if necessary prepare a packet */ - if(!(xfrd->can_send_reload && xfrd->need_to_send_reload) && - !xfrd->need_to_send_quit && - !xfrd->sending_zone_state && - xfrd->dirty_zones->num > 0) { - xfrd_zone_t* zone = (xfrd_zone_t*)stack_pop(xfrd->dirty_zones); - assert(zone); - zone->dirty = 0; - xfrd->sending_zone_state = 1; - xfrd_write_expire_notification(xfrd->ipc_conn_write->packet, zone); - xfrd->ipc_conn_write->msglen = buffer_limit(xfrd->ipc_conn_write->packet); - /* skip length bytes; they are encoded in the packet, after cmd */ - xfrd->ipc_conn_write->total_bytes = sizeof(uint16_t); - } - /* write a bit */ - if(xfrd->sending_zone_state) { - /* call conn_write */ - int ret = conn_write(xfrd->ipc_conn_write); - if(ret == -1) { - log_msg(LOG_ERR, "xfrd: error in write ipc: %s", strerror(errno)); - xfrd->sending_zone_state = 0; - } - else if(ret == 1) { /* done */ - xfrd->sending_zone_state = 0; - } + if(xfrd->need_to_send_shutdown) { + xfrd_send_shutdown_req(xfrd); } else if(xfrd->need_to_send_quit) { xfrd_send_quit_req(xfrd); } else if(xfrd->can_send_reload && xfrd->need_to_send_reload) { xfrd_send_reload_req(xfrd); + } else if(xfrd->need_to_send_stats) { + xfrd_send_stats(xfrd); } if(!(xfrd->can_send_reload && xfrd->need_to_send_reload) && + !xfrd->need_to_send_shutdown && !xfrd->need_to_send_quit && - !xfrd->sending_zone_state && - xfrd->dirty_zones->num == 0) { - handler->event_types = NETIO_EVENT_READ; /* disable writing for now */ + !xfrd->need_to_send_stats) { + /* disable writing for now */ + ipc_xfrd_set_listening(xfrd, EV_PERSIST|EV_READ); } } } static void -xfrd_handle_ipc_read(netio_handler_type *handler, xfrd_state_t* xfrd) +xfrd_handle_ipc_read(struct event* handler, xfrd_state_t* 
xfrd) { sig_atomic_t cmd; int len; @@ -770,6 +658,7 @@ xfrd_handle_ipc_read(netio_handler_type *handler, xfrd_state_t* xfrd) } if(xfrd->ipc_conn->is_reading) { /* reading an IPC message */ + buffer_type* tmp; int ret = conn_read(xfrd->ipc_conn); if(ret == -1) { log_msg(LOG_ERR, "xfrd: error in read ipc: %s", strerror(errno)); @@ -779,26 +668,22 @@ xfrd_handle_ipc_read(netio_handler_type *handler, xfrd_state_t* xfrd) if(ret == 0) return; buffer_flip(xfrd->ipc_conn->packet); - if(xfrd->ipc_is_soa) { - xfrd->ipc_conn->is_reading = 0; - xfrd_handle_ipc_SOAINFO(xfrd, xfrd->ipc_conn->packet); - } else { - /* use ipc_conn to read remaining data as well */ - buffer_type* tmp = xfrd->ipc_pass; - xfrd->ipc_conn->is_reading=2; - xfrd->ipc_pass = xfrd->ipc_conn->packet; - xfrd->ipc_conn->packet = tmp; - xfrd->ipc_conn->total_bytes = sizeof(xfrd->ipc_conn->msglen); - xfrd->ipc_conn->msglen = 2*sizeof(uint32_t); - buffer_clear(xfrd->ipc_conn->packet); - buffer_set_limit(xfrd->ipc_conn->packet, xfrd->ipc_conn->msglen); - } + /* use ipc_conn to read remaining data as well */ + tmp = xfrd->ipc_pass; + xfrd->ipc_conn->is_reading=2; + xfrd->ipc_pass = xfrd->ipc_conn->packet; + xfrd->ipc_conn->packet = tmp; + xfrd->ipc_conn->total_bytes = sizeof(xfrd->ipc_conn->msglen); + xfrd->ipc_conn->msglen = 2*sizeof(uint32_t); + buffer_clear(xfrd->ipc_conn->packet); + buffer_set_limit(xfrd->ipc_conn->packet, xfrd->ipc_conn->msglen); return; } - if((len = read(handler->fd, &cmd, sizeof(cmd))) == -1) { - log_msg(LOG_ERR, "xfrd_handle_ipc: read: %s", - strerror(errno)); + if((len = read(handler->ev_fd, &cmd, sizeof(cmd))) == -1) { + if(errno != EINTR && errno != EAGAIN) + log_msg(LOG_ERR, "xfrd_handle_ipc: read: %s", + strerror(errno)); return; } if(len == 0) @@ -812,48 +697,49 @@ xfrd_handle_ipc_read(netio_handler_type *handler, xfrd_state_t* xfrd) switch(cmd) { case NSD_QUIT: case NSD_SHUTDOWN: - DEBUG(DEBUG_IPC,1, (LOG_INFO, "xfrd: main send shutdown cmd.")); + DEBUG(DEBUG_IPC,1, (LOG_INFO, "xfrd: main sent shutdown cmd.")); xfrd->shutdown = 1; break; - case NSD_SOA_BEGIN: - DEBUG(DEBUG_IPC,1, (LOG_INFO, "xfrd: ipc recv SOA_BEGIN")); - /* reload starts sending SOA INFOs; don't block */ - xfrd->parent_soa_info_pass = 1; - /* reset the nonblocking ipc write; - the new parent does not want half a packet */ - xfrd->sending_zone_state = 0; - break; - case NSD_SOA_INFO: - DEBUG(DEBUG_IPC,1, (LOG_INFO, "xfrd: ipc recv SOA_INFO")); - assert(xfrd->parent_soa_info_pass); - xfrd->ipc_is_soa = 1; - xfrd->ipc_conn->is_reading = 1; - break; - case NSD_SOA_END: + case NSD_RELOAD_DONE: /* reload has finished */ - DEBUG(DEBUG_IPC,1, (LOG_INFO, "xfrd: ipc recv SOA_END")); + DEBUG(DEBUG_IPC,1, (LOG_INFO, "xfrd: ipc recv RELOAD_DONE")); +#ifdef BIND8_STATS + if(block_read(NULL, handler->ev_fd, &xfrd->reload_pid, + sizeof(pid_t), -1) != sizeof(pid_t)) { + log_msg(LOG_ERR, "xfrd cannot get reload_pid"); + } +#endif /* BIND8_STATS */ + /* read the not-mytask for the results and soainfo */ + xfrd_process_task_result(xfrd, + xfrd->nsd->task[1-xfrd->nsd->mytask]); + /* reset the IPC, (and the nonblocking ipc write; + the new parent does not want half a packet) */ xfrd->can_send_reload = 1; - xfrd->parent_soa_info_pass = 0; xfrd->ipc_send_blocked = 0; - handler->event_types |= NETIO_EVENT_WRITE; + ipc_xfrd_set_listening(xfrd, EV_PERSIST|EV_READ|EV_WRITE); xfrd_reopen_logfile(); xfrd_check_failed_updates(); - xfrd_send_expy_all_zones(); break; case NSD_PASS_TO_XFRD: DEBUG(DEBUG_IPC,1, (LOG_INFO, "xfrd: ipc recv PASS_TO_XFRD")); - xfrd->ipc_is_soa 
= 0; xfrd->ipc_conn->is_reading = 1; break; + case NSD_RELOAD_REQ: + DEBUG(DEBUG_IPC,1, (LOG_INFO, "xfrd: ipc recv RELOAD_REQ")); + /* make reload happen, right away, and schedule file check */ + task_new_check_zonefiles(xfrd->nsd->task[xfrd->nsd->mytask], + xfrd->last_task, NULL); + xfrd_set_reload_now(xfrd); + break; case NSD_RELOAD: /* main tells us that reload is done, stop ipc send to main */ DEBUG(DEBUG_IPC,1, (LOG_INFO, "xfrd: ipc recv RELOAD")); - handler->event_types |= NETIO_EVENT_WRITE; + ipc_xfrd_set_listening(xfrd, EV_PERSIST|EV_READ|EV_WRITE); xfrd->need_to_send_quit = 1; break; default: log_msg(LOG_ERR, "xfrd_handle_ipc: bad mode %d (%d)", (int)cmd, - ntohl(cmd)); + (int)ntohl(cmd)); break; } @@ -864,4 +750,3 @@ xfrd_handle_ipc_read(netio_handler_type *handler, xfrd_state_t* xfrd) buffer_clear(xfrd->ipc_conn->packet); } } - diff --git a/usr.sbin/nsd/ipc.h b/usr.sbin/nsd/ipc.h index 0bd02e32b72..cb27db46063 100644 --- a/usr.sbin/nsd/ipc.h +++ b/usr.sbin/nsd/ipc.h @@ -1,7 +1,7 @@ /* * ipc.h - Interprocess communication routines. Handlers read and write. * - * Copyright (c) 2001-2011, NLnet Labs. All rights reserved. + * Copyright (c) 2001-2006, NLnet Labs. All rights reserved. * * See LICENSE for the license. * @@ -10,12 +10,14 @@ #ifndef NSD_IPC_H #define NSD_IPC_H -#include "config.h" #include "netio.h" struct buffer; struct nsd; struct nsd_child; struct xfrd_tcp; +struct xfrd_state; +struct nsdst; +struct event; /* * Data for the server_main IPC handler @@ -35,10 +37,6 @@ struct main_ipc_handler_data uint16_t total_bytes; uint32_t acl_num; int32_t acl_xfr; - - /* writing data, connection and state */ - uint8_t busy_writing_zone_state; - struct xfrd_tcp *write_conn; }; /* @@ -78,17 +76,21 @@ void parent_handle_child_command(netio_type *netio, * Routine used by server_child. * Handle a command received from the parent process. */ -void child_handle_parent_command(netio_type *netio, - netio_handler_type *handler, netio_event_types_type event_types); +void child_handle_parent_command(int fd, short event, void* arg); /* * Routine used by xfrd * Handle interprocess communication with parent process, read and write. */ -void xfrd_handle_ipc(netio_type *netio, - netio_handler_type *handler, netio_event_types_type event_types); +void xfrd_handle_ipc(int fd, short event, void* arg); /* check if all children have exited in an orderly fashion and set mode */ void parent_check_all_children_exited(struct nsd* nsd); +/** add stats to total */ +void stats_add(struct nsdst* total, struct nsdst* s); + +/** set event to listen to given mode, no timeout, must be added already */ +void ipc_xfrd_set_listening(struct xfrd_state* xfrd, short mode); + #endif /* NSD_IPC_H */ diff --git a/usr.sbin/nsd/iterated_hash.c b/usr.sbin/nsd/iterated_hash.c index 4211f503c25..e8606a3b06c 100644 --- a/usr.sbin/nsd/iterated_hash.c +++ b/usr.sbin/nsd/iterated_hash.c @@ -1,7 +1,7 @@ /* * iterated_hash.c -- nsec3 hash calculation. * - * Copyright (c) 2001-2011, NLnet Labs. All rights reserved. + * Copyright (c) 2001-2006, NLnet Labs. All rights reserved. * * See LICENSE for the license. * diff --git a/usr.sbin/nsd/iterated_hash.h b/usr.sbin/nsd/iterated_hash.h index 2a6bef399d0..9997e62598b 100644 --- a/usr.sbin/nsd/iterated_hash.h +++ b/usr.sbin/nsd/iterated_hash.h @@ -1,7 +1,7 @@ /* * iterated_hash.h -- nsec3 hash calculation. * - * Copyright (c) 2001-2011, NLnet Labs. All rights reserved. + * Copyright (c) 2001-2006, NLnet Labs. All rights reserved. * * See LICENSE for the license. 
* @@ -10,10 +10,11 @@ #ifndef ITERATED_HASH_H #define ITERATED_HASH_H -#include "config.h" #ifdef NSEC3 #include <openssl/sha.h> +#define NSEC3_SHA1_HASH 1 /* same type code as DS hash */ + int iterated_hash(unsigned char out[SHA_DIGEST_LENGTH], const unsigned char *salt,int saltlength, const unsigned char *in,int inlength,int iterations); diff --git a/usr.sbin/nsd/mini_event.c b/usr.sbin/nsd/mini_event.c new file mode 100644 index 00000000000..4048bcfae89 --- /dev/null +++ b/usr.sbin/nsd/mini_event.c @@ -0,0 +1,446 @@ +/* + * mini_event.c - implementation of part of libevent api, portably. + * + * Copyright (c) 2007, NLnet Labs. All rights reserved. + * + * This software is open source. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * Neither the name of the NLNET LABS nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + */ + +/** + * \file + * fake libevent implementation. Less broad in functionality, and only + * supports select(2). 
+ */ + +#include "config.h" +#ifdef HAVE_TIME_H +#include <time.h> +#endif +#include <string.h> +#include <errno.h> +#include <sys/time.h> + +#if defined(USE_MINI_EVENT) && !defined(USE_WINSOCK) +#ifdef HAVE_WINSOCK2_H +#define FD_SET_T (u_int) +#else +#define FD_SET_T +#endif + +#include <signal.h> +#include "mini_event.h" +#include "util.h" + +/** compare events in tree, based on timevalue, ptr for uniqueness */ +int +mini_ev_cmp(const void* a, const void* b) +{ + const struct event* e = (const struct event*)a; + const struct event* f = (const struct event*)b; + if(e->ev_timeout.tv_sec < f->ev_timeout.tv_sec) + return -1; + if(e->ev_timeout.tv_sec > f->ev_timeout.tv_sec) + return 1; + if(e->ev_timeout.tv_usec < f->ev_timeout.tv_usec) + return -1; + if(e->ev_timeout.tv_usec > f->ev_timeout.tv_usec) + return 1; + if(e < f) + return -1; + if(e > f) + return 1; + return 0; +} + +/** set time */ +static int +settime(struct event_base* base) +{ + if(gettimeofday(base->time_tv, NULL) < 0) { + return -1; + } +#ifndef S_SPLINT_S + *base->time_secs = (time_t)base->time_tv->tv_sec; +#endif + return 0; +} + +/** create event base */ +void * +event_init(time_t* time_secs, struct timeval* time_tv) +{ + struct event_base* base = (struct event_base*)malloc( + sizeof(struct event_base)); + if(!base) + return NULL; + memset(base, 0, sizeof(*base)); + base->region = region_create(xalloc, free); + if(!base->region) { + free(base); + return NULL; + } + base->time_secs = time_secs; + base->time_tv = time_tv; + if(settime(base) < 0) { + event_base_free(base); + return NULL; + } + base->times = rbtree_create(base->region, mini_ev_cmp); + if(!base->times) { + event_base_free(base); + return NULL; + } + base->capfd = MAX_FDS; +#ifdef FD_SETSIZE + if((int)FD_SETSIZE < base->capfd) + base->capfd = (int)FD_SETSIZE; +#endif + base->fds = (struct event**)calloc((size_t)base->capfd, + sizeof(struct event*)); + if(!base->fds) { + event_base_free(base); + return NULL; + } + base->signals = (struct event**)calloc(MAX_SIG, sizeof(struct event*)); + if(!base->signals) { + event_base_free(base); + return NULL; + } +#ifndef S_SPLINT_S + FD_ZERO(&base->reads); + FD_ZERO(&base->writes); +#endif + return base; +} + +/** get version */ +const char * +event_get_version(void) +{ + return "mini-event-"PACKAGE_VERSION; +} + +/** get polling method, select */ +const char * +event_get_method(void) +{ + return "select"; +} + +/** call timeouts handlers, and return how long to wait for next one or -1 */ +static int +handle_timeouts(struct event_base* base, struct timeval* now, + struct timeval* wait) +{ + struct event* p; + int tofired = 0; +#ifndef S_SPLINT_S + wait->tv_sec = (time_t)-1; +#endif + + while((rbnode_t*)(p = (struct event*)rbtree_first(base->times)) + !=RBTREE_NULL) { +#ifndef S_SPLINT_S + if(p->ev_timeout.tv_sec > now->tv_sec || + (p->ev_timeout.tv_sec==now->tv_sec && + p->ev_timeout.tv_usec > now->tv_usec)) { + /* there is a next larger timeout. 
wait for it */ + wait->tv_sec = p->ev_timeout.tv_sec - now->tv_sec; + if(now->tv_usec > p->ev_timeout.tv_usec) { + wait->tv_sec--; + wait->tv_usec = 1000000 - (now->tv_usec - + p->ev_timeout.tv_usec); + } else { + wait->tv_usec = p->ev_timeout.tv_usec + - now->tv_usec; + } + return tofired; + } +#endif + /* event times out, remove it */ + tofired = 1; + (void)rbtree_delete(base->times, p); + p->ev_flags &= ~EV_TIMEOUT; + (*p->ev_callback)(p->ev_fd, EV_TIMEOUT, p->ev_arg); + } + return tofired; +} + +/** call select and callbacks for that */ +static int +handle_select(struct event_base* base, struct timeval* wait) +{ + fd_set r, w; + int ret, i; + +#ifndef S_SPLINT_S + if(wait->tv_sec==(time_t)-1) + wait = NULL; +#endif + memmove(&r, &base->reads, sizeof(fd_set)); + memmove(&w, &base->writes, sizeof(fd_set)); + memmove(&base->ready, &base->content, sizeof(fd_set)); + + if((ret = select(base->maxfd+1, &r, &w, NULL, wait)) == -1) { + ret = errno; + if(settime(base) < 0) + return -1; + errno = ret; + if(ret == EAGAIN || ret == EINTR) + return 0; + return -1; + } + if(settime(base) < 0) + return -1; + + for(i=0; i<base->maxfd+1; i++) { + short bits = 0; + if(!base->fds[i] || !(FD_ISSET(i, &base->ready))) { + continue; + } + if(FD_ISSET(i, &r)) { + bits |= EV_READ; + ret--; + } + if(FD_ISSET(i, &w)) { + bits |= EV_WRITE; + ret--; + } + bits &= base->fds[i]->ev_flags; + if(bits) { + (*base->fds[i]->ev_callback)(base->fds[i]->ev_fd, + bits, base->fds[i]->ev_arg); + if(ret==0) + break; + } + } + return 0; +} + +/** run select once */ +int +event_base_loop(struct event_base* base, int flags) +{ + struct timeval wait; + if(!(flags & EVLOOP_ONCE)) + return event_base_dispatch(base); + /* see if timeouts need handling */ + if(handle_timeouts(base, base->time_tv, &wait)) + return 0; /* there were timeouts, end of loop */ + if(base->need_to_exit) + return 0; + /* do select */ + if(handle_select(base, &wait) < 0) { + if(base->need_to_exit) + return 0; + return -1; + } + return 0; +} + +/** run select in a loop */ +int +event_base_dispatch(struct event_base* base) +{ + struct timeval wait; + if(settime(base) < 0) + return -1; + while(!base->need_to_exit) + { + /* see if timeouts need handling */ + (void)handle_timeouts(base, base->time_tv, &wait); + if(base->need_to_exit) + return 0; + /* do select */ + if(handle_select(base, &wait) < 0) { + if(base->need_to_exit) + return 0; + return -1; + } + } + return 0; +} + +/** exit that loop */ +int +event_base_loopexit(struct event_base* base, + struct timeval* ATTR_UNUSED(tv)) +{ + base->need_to_exit = 1; + return 0; +} + +/* free event base, free events yourself */ +void +event_base_free(struct event_base* base) +{ + if(!base) + return; + if(base->times) + free(base->times); + if(base->fds) + free(base->fds); + if(base->signals) + free(base->signals); + region_destroy(base->region); + free(base); +} + +/** set content of event */ +void +event_set(struct event* ev, int fd, short bits, + void (*cb)(int, short, void *), void* arg) +{ + ev->node.key = ev; + ev->ev_fd = fd; + ev->ev_flags = bits; + ev->ev_callback = cb; + ev->ev_arg = arg; + ev->added = 0; +} + +/* add event to a base */ +int +event_base_set(struct event_base* base, struct event* ev) +{ + ev->ev_base = base; + ev->added = 0; + return 0; +} + +/* add event to make it active, you may not change it with event_set anymore */ +int +event_add(struct event* ev, struct timeval* tv) +{ + if(ev->added) + event_del(ev); + if(ev->ev_fd != -1 && ev->ev_fd >= ev->ev_base->capfd) + return -1; + if( 
(ev->ev_flags&(EV_READ|EV_WRITE)) && ev->ev_fd != -1) { + ev->ev_base->fds[ev->ev_fd] = ev; + if(ev->ev_flags&EV_READ) { + FD_SET(FD_SET_T ev->ev_fd, &ev->ev_base->reads); + } + if(ev->ev_flags&EV_WRITE) { + FD_SET(FD_SET_T ev->ev_fd, &ev->ev_base->writes); + } + FD_SET(FD_SET_T ev->ev_fd, &ev->ev_base->content); + FD_CLR(FD_SET_T ev->ev_fd, &ev->ev_base->ready); + if(ev->ev_fd > ev->ev_base->maxfd) + ev->ev_base->maxfd = ev->ev_fd; + } + if(tv && (ev->ev_flags&EV_TIMEOUT)) { +#ifndef S_SPLINT_S + struct timeval* now = ev->ev_base->time_tv; + ev->ev_timeout.tv_sec = tv->tv_sec + now->tv_sec; + ev->ev_timeout.tv_usec = tv->tv_usec + now->tv_usec; + while(ev->ev_timeout.tv_usec > 1000000) { + ev->ev_timeout.tv_usec -= 1000000; + ev->ev_timeout.tv_sec++; + } +#endif + (void)rbtree_insert(ev->ev_base->times, &ev->node); + } + ev->added = 1; + return 0; +} + +/* remove event, you may change it again */ +int +event_del(struct event* ev) +{ + if(ev->ev_fd != -1 && ev->ev_fd >= ev->ev_base->capfd) + return -1; + if((ev->ev_flags&EV_TIMEOUT)) + (void)rbtree_delete(ev->ev_base->times, &ev->node); + if((ev->ev_flags&(EV_READ|EV_WRITE)) && ev->ev_fd != -1) { + ev->ev_base->fds[ev->ev_fd] = NULL; + FD_CLR(FD_SET_T ev->ev_fd, &ev->ev_base->reads); + FD_CLR(FD_SET_T ev->ev_fd, &ev->ev_base->writes); + FD_CLR(FD_SET_T ev->ev_fd, &ev->ev_base->ready); + FD_CLR(FD_SET_T ev->ev_fd, &ev->ev_base->content); + } + ev->added = 0; + return 0; +} + +/** which base gets to handle signals */ +static struct event_base* signal_base = NULL; + +/** signal handler */ +static RETSIGTYPE +sigh(int sig) +{ + struct event* ev; + if(!signal_base || sig < 0 || sig >= MAX_SIG) + return; + ev = signal_base->signals[sig]; + if(!ev) + return; + (*ev->ev_callback)(sig, EV_SIGNAL, ev->ev_arg); +} + +/** install signal handler */ +int +signal_add(struct event* ev, struct timeval* ATTR_UNUSED(tv)) +{ + struct sigaction action; + if(ev->ev_fd == -1 || ev->ev_fd >= MAX_SIG) + return -1; + signal_base = ev->ev_base; + ev->ev_base->signals[ev->ev_fd] = ev; + ev->added = 1; + action.sa_handler = sigh; + sigfillset(&action.sa_mask); + action.sa_flags = 0; + return sigaction(ev->ev_fd, &action, NULL); +} + +/** remove signal handler */ +int +signal_del(struct event* ev) +{ + if(ev->ev_fd == -1 || ev->ev_fd >= MAX_SIG) + return -1; + ev->ev_base->signals[ev->ev_fd] = NULL; + ev->added = 0; + return 0; +} + +#else /* USE_MINI_EVENT */ +#ifndef USE_WINSOCK +int +mini_ev_cmp(const void* ATTR_UNUSED(a), const void* ATTR_UNUSED(b)) +{ + return 0; +} +#endif /* not USE_WINSOCK */ +#endif /* USE_MINI_EVENT */ diff --git a/usr.sbin/nsd/mini_event.h b/usr.sbin/nsd/mini_event.h new file mode 100644 index 00000000000..b40983b6479 --- /dev/null +++ b/usr.sbin/nsd/mini_event.h @@ -0,0 +1,183 @@ +/* + * mini-event.h - micro implementation of libevent api, using select() only. + * + * Copyright (c) 2007, NLnet Labs. All rights reserved. + * + * This software is open source. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * + * Neither the name of the NLNET LABS nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file + * This file implements part of the event(3) libevent api. + * The back end is only select. Max number of fds is limited. + * Max number of signals is limited, one handler per signal only. + * And one handler per fd. + * + * Although limited to select() and a max (1024) open fds, it + * is efficient: + * o dispatch call caches fd_sets to use. + * o handler calling takes time ~ to the number of fds. + * o timeouts are stored in a redblack tree, sorted, so take log(n). + * Timeouts are only accurate to the second (no subsecond accuracy). + * To avoid cpu hogging, fractional timeouts are rounded up to a whole second. + */ + +#ifndef MINI_EVENT_H +#define MINI_EVENT_H +struct region; + +#if defined(USE_MINI_EVENT) && !defined(USE_WINSOCK) + +#ifndef HAVE_EVENT_BASE_FREE +#define HAVE_EVENT_BASE_FREE +#endif + +/** event timeout */ +#define EV_TIMEOUT 0x01 +/** event fd readable */ +#define EV_READ 0x02 +/** event fd writable */ +#define EV_WRITE 0x04 +/** event signal */ +#define EV_SIGNAL 0x08 +/** event must persist */ +#define EV_PERSIST 0x10 + +/* needs our redblack tree */ +#include "rbtree.h" + +/** max number of file descriptors to support */ +#define MAX_FDS 1024 +/** max number of signals to support */ +#define MAX_SIG 32 + +/** event base */ +struct event_base +{ + /** sorted by timeout (absolute), ptr */ + rbtree_t* times; + /** array of 0 - maxfd of ptr to event for it */ + struct event** fds; + /** max fd in use */ + int maxfd; + /** capacity - size of the fds array */ + int capfd; + /* fdset for read write, for fds ready, and added */ + fd_set + /** fds for reading */ + reads, + /** fds for writing */ + writes, + /** fds determined ready for use */ + ready, + /** ready plus newly added events. */ + content; + /** array of 0 - maxsig of ptr to event for it */ + struct event** signals; + /** if we need to exit */ + int need_to_exit; + /** where to store time in seconds */ + time_t* time_secs; + /** where to store time in microseconds */ + struct timeval* time_tv; + /** region for allocation */ + struct region* region; +}; + +/** + * Event structure. Has some of the event elements. + */ +struct event { + /** node in timeout rbtree */ + rbnode_t node; + /** is event already added */ + int added; + + /** event base it belongs to */ + struct event_base *ev_base; + /** fd to poll or -1 for timeouts. signal number for sigs. */ + int ev_fd; + /** what events this event is interested in, see EV_.. above. 
*/ + short ev_flags; + /** timeout value */ + struct timeval ev_timeout; + + /** callback to call: fd, eventbits, userarg */ + void (*ev_callback)(int, short, void *arg); + /** callback user arg */ + void *ev_arg; +}; + +/* function prototypes (some are as they appear in event.h) */ +/** create event base */ +void *event_init(time_t* time_secs, struct timeval* time_tv); +/** get version */ +const char *event_get_version(void); +/** get polling method, select */ +const char *event_get_method(void); +/** run select in a loop */ +int event_base_dispatch(struct event_base *); +/** exit that loop */ +int event_base_loopexit(struct event_base *, struct timeval *); +/** run select once */ +#define EVLOOP_ONCE 1 +int event_base_loop(struct event_base* base, int flags); +/** free event base. Free events yourself */ +void event_base_free(struct event_base *); +/** set content of event */ +void event_set(struct event *, int, short, void (*)(int, short, void *), void *); +/** add event to a base. You *must* call this for every event. */ +int event_base_set(struct event_base *, struct event *); +/** add event to make it active. You may not change it with event_set anymore */ +int event_add(struct event *, struct timeval *); +/** remove event. You may change it again */ +int event_del(struct event *); + +/** add a timer */ +#define evtimer_add(ev, tv) event_add(ev, tv) +/** remove a timer */ +#define evtimer_del(ev) event_del(ev) + +/* uses different implementation. Cannot mix fd/timeouts and signals inside + * the same struct event. create several event structs for that. */ +/** install signal handler */ +int signal_add(struct event *, struct timeval *); +/** set signal event contents */ +#define signal_set(ev, x, cb, arg) \ + event_set(ev, x, EV_SIGNAL|EV_PERSIST, cb, arg) +/** remove signal handler */ +int signal_del(struct event *); + +#endif /* USE_MINI_EVENT and not USE_WINSOCK */ + +/** compare events in tree, based on timevalue, ptr for uniqueness */ +int mini_ev_cmp(const void* a, const void* b); + +#endif /* MINI_EVENT_H */ diff --git a/usr.sbin/nsd/namedb.c b/usr.sbin/nsd/namedb.c index 5ed3b31baf6..5ffb6ff10dd 100644 --- a/usr.sbin/nsd/namedb.c +++ b/usr.sbin/nsd/namedb.c @@ -1,7 +1,7 @@ /* * namedb.c -- common namedb operations. * - * Copyright (c) 2001-2011, NLnet Labs. All rights reserved. + * Copyright (c) 2001-2006, NLnet Labs. All rights reserved. * * See LICENSE for the license. 
* @@ -16,14 +16,14 @@ #include <limits.h> #include <stdio.h> #include <string.h> -#include <errno.h> #include "namedb.h" +#include "nsec3.h" static domain_type * -allocate_domain_info(domain_table_type *table, - const dname_type *dname, - domain_type *parent) +allocate_domain_info(domain_table_type* table, + const dname_type* dname, + domain_type* parent) { domain_type *result; @@ -33,70 +33,312 @@ allocate_domain_info(domain_table_type *table, result = (domain_type *) region_alloc(table->region, sizeof(domain_type)); - result->node.key = dname_partial_copy( + result->dname = dname_partial_copy( table->region, dname, domain_dname(parent)->label_count + 1); result->parent = parent; - result->nextdiff = NULL; result->wildcard_child_closest_match = result; result->rrsets = NULL; - result->number = 0; + result->usage = 0; #ifdef NSEC3 - result->nsec3_cover = NULL; -#ifdef FULL_PREHASH - result->nsec3_wcard_child_cover = NULL; - result->nsec3_ds_parent_cover = NULL; - result->nsec3_lookup = NULL; - result->nsec3_is_exact = 0; - result->nsec3_ds_parent_is_exact = 0; -#endif /* FULL_PREHASH */ -#endif /* NSEC3 */ + result->nsec3 = NULL; +#endif result->is_existing = 0; result->is_apex = 0; - result->has_SOA = 0; + assert(table->numlist_last); /* it exists because root exists */ + /* push this domain at the end of the numlist */ + result->number = table->numlist_last->number+1; + result->numlist_next = NULL; + result->numlist_prev = table->numlist_last; + table->numlist_last->numlist_next = result; + table->numlist_last = result; return result; } +#ifdef NSEC3 +void +allocate_domain_nsec3(domain_table_type* table, domain_type* result) +{ + if(result->nsec3) + return; + result->nsec3 = (struct nsec3_domain_data*) region_alloc(table->region, + sizeof(struct nsec3_domain_data)); + result->nsec3->nsec3_cover = NULL; + result->nsec3->nsec3_wcard_child_cover = NULL; + result->nsec3->nsec3_ds_parent_cover = NULL; + result->nsec3->nsec3_is_exact = 0; + result->nsec3->nsec3_ds_parent_is_exact = 0; + result->nsec3->have_nsec3_hash = 0; + result->nsec3->have_nsec3_wc_hash = 0; + result->nsec3->have_nsec3_ds_parent_hash = 0; + result->nsec3->prehash_prev = NULL; + result->nsec3->prehash_next = NULL; + result->nsec3->nsec3_node.key = NULL; + result->nsec3->hash_node.key = NULL; + result->nsec3->wchash_node.key = NULL; + result->nsec3->dshash_node.key = NULL; +} +#endif /* NSEC3 */ + +/** make the domain last in the numlist, changes numbers of domains */ +static void +numlist_make_last(domain_table_type* table, domain_type* domain) +{ + size_t sw; + domain_type* last = table->numlist_last; + if(domain == last) + return; + /* swap numbers with the last element */ + sw = domain->number; + domain->number = last->number; + last->number = sw; + /* swap list position with the last element */ + assert(domain->numlist_next); + assert(last->numlist_prev); + if(domain->numlist_next != last) { + /* case 1: there are nodes between domain .. 
last */ + domain_type* span_start = domain->numlist_next; + domain_type* span_end = last->numlist_prev; + /* these assignments walk the new list from start to end */ + if(domain->numlist_prev) + domain->numlist_prev->numlist_next = last; + last->numlist_prev = domain->numlist_prev; + last->numlist_next = span_start; + span_start->numlist_prev = last; + span_end->numlist_next = domain; + domain->numlist_prev = span_end; + domain->numlist_next = NULL; + } else { + /* case 2: domain and last are neighbors */ + /* these assignments walk the new list from start to end */ + if(domain->numlist_prev) + domain->numlist_prev->numlist_next = last; + last->numlist_prev = domain->numlist_prev; + last->numlist_next = domain; + domain->numlist_prev = last; + domain->numlist_next = NULL; + } + table->numlist_last = domain; +} + +/** pop the biggest domain off the numlist */ +static domain_type* +numlist_pop_last(domain_table_type* table) +{ + domain_type* d = table->numlist_last; + table->numlist_last = table->numlist_last->numlist_prev; + if(table->numlist_last) + table->numlist_last->numlist_next = NULL; + return d; +} + +/** see if a domain is eligible to be deleted, and thus is not used */ +static int +domain_can_be_deleted(domain_type* domain) +{ + domain_type* n; + /* it has data or it has usage, do not delete it */ + if(domain->rrsets) return 0; + if(domain->usage) return 0; + n = domain_next(domain); + /* it has children domains, do not delete it */ + if(n && domain_is_subdomain(n, domain)) + return 0; + return 1; +} + +#ifdef NSEC3 +/** see if domain is on the prehash list */ +int domain_is_prehash(domain_table_type* table, domain_type* domain) +{ + if(domain->nsec3 + && (domain->nsec3->prehash_prev || domain->nsec3->prehash_next)) + return 1; + return (table->prehash_list == domain); +} + +/** remove domain node from NSEC3 tree in hash space */ +void +zone_del_domain_in_hash_tree(rbtree_t* tree, rbnode_t* node) +{ + if(!node->key) + return; + rbtree_delete(tree, node->key); + /* note that domain is no longer in the tree */ + node->key = NULL; +} + +/** clear the prehash list */ +void prehash_clear(domain_table_type* table) +{ + domain_type* d = table->prehash_list, *n; + while(d) { + n = d->nsec3->prehash_next; + d->nsec3->prehash_prev = NULL; + d->nsec3->prehash_next = NULL; + d = n; + } + table->prehash_list = NULL; +} + +/** add domain to prehash list */ +void +prehash_add(domain_table_type* table, domain_type* domain) +{ + if(domain_is_prehash(table, domain)) + return; + allocate_domain_nsec3(table, domain); + domain->nsec3->prehash_next = table->prehash_list; + if(table->prehash_list) + table->prehash_list->nsec3->prehash_prev = domain; + table->prehash_list = domain; +} + +/** remove domain from prehash list */ +void +prehash_del(domain_table_type* table, domain_type* domain) +{ + if(domain->nsec3->prehash_next) + domain->nsec3->prehash_next->nsec3->prehash_prev = + domain->nsec3->prehash_prev; + if(domain->nsec3->prehash_prev) + domain->nsec3->prehash_prev->nsec3->prehash_next = + domain->nsec3->prehash_next; + else table->prehash_list = domain->nsec3->prehash_next; + domain->nsec3->prehash_next = NULL; + domain->nsec3->prehash_prev = NULL; +} +#endif /* NSEC3 */ + +/** perform domain name deletion */ +static void +do_deldomain(namedb_type* db, domain_type* domain) +{ + assert(domain && domain->parent); /* exists and not root */ + /* first adjust the number list so that domain is the last one */ + numlist_make_last(db->domains, domain); + /* pop off the domain from the number list */ + 
(void)numlist_pop_last(db->domains); + +#ifdef NSEC3 + /* if on prehash list, remove from prehash */ + if(domain_is_prehash(db->domains, domain)) + prehash_del(db->domains, domain); + + /* see if nsec3-nodes are used */ + if(domain->nsec3) { + if(domain->nsec3->nsec3_node.key) + zone_del_domain_in_hash_tree(nsec3_tree_zone(db, domain) + ->nsec3tree, &domain->nsec3->nsec3_node); + if(domain->nsec3->hash_node.key) + zone_del_domain_in_hash_tree(nsec3_tree_zone(db, domain) + ->hashtree, &domain->nsec3->hash_node); + if(domain->nsec3->wchash_node.key) + zone_del_domain_in_hash_tree(nsec3_tree_zone(db, domain) + ->wchashtree, &domain->nsec3->wchash_node); + if(domain->nsec3->dshash_node.key) + zone_del_domain_in_hash_tree(nsec3_tree_dszone(db, domain) + ->dshashtree, &domain->nsec3->dshash_node); + region_recycle(db->domains->region, domain->nsec3, + sizeof(struct nsec3_domain_data)); + } +#endif /* NSEC3 */ + + /* see if this domain is someones wildcard-child-closest-match, + * which can only be the parent, and then it should use the + * one-smaller than this domain as closest-match. */ + if(domain->parent->wildcard_child_closest_match == domain) + domain->parent->wildcard_child_closest_match = + domain_previous(domain); + + /* actual removal */ + radix_delete(db->domains->nametree, domain->rnode); + region_recycle(db->domains->region, (dname_type*)domain->dname, + dname_total_size(domain->dname)); + region_recycle(db->domains->region, domain, sizeof(domain_type)); +} + +void +domain_table_deldomain(namedb_type* db, domain_type* domain) +{ + while(domain_can_be_deleted(domain)) { + /* delete it */ + do_deldomain(db, domain); + /* test parent */ + domain = domain->parent; + } +} + +/** clear hash tree */ +void +hash_tree_clear(rbtree_t* tree) +{ + rbnode_t* n; + if(!tree) return; + + /* note that elements are no longer in the tree */ + for(n=rbtree_first(tree); n!=RBTREE_NULL; n=rbtree_next(n)) { + n->key = NULL; + } + tree->count = 0; + tree->root = RBTREE_NULL; +} + +void hash_tree_delete(region_type* region, rbtree_t* tree) +{ + region_recycle(region, tree, sizeof(rbtree_t)); +} + +/** add domain nsec3 node to hashedspace tree */ +void zone_add_domain_in_hash_tree(region_type* region, rbtree_t** tree, + int (*cmpf)(const void*, const void*), + domain_type* domain, rbnode_t* node) +{ + if(!*tree) + *tree = rbtree_create(region, cmpf); + memset(node, 0, sizeof(rbnode_t)); + node->key = domain; + rbtree_insert(*tree, node); +} + domain_table_type * -domain_table_create(region_type *region) +domain_table_create(region_type* region) { - const dname_type *origin; - domain_table_type *result; - domain_type *root; + const dname_type* origin; + domain_table_type* result; + domain_type* root; assert(region); origin = dname_make(region, (uint8_t *) "", 0); root = (domain_type *) region_alloc(region, sizeof(domain_type)); - root->node.key = origin; + root->dname = origin; root->parent = NULL; - root->nextdiff = NULL; root->wildcard_child_closest_match = root; root->rrsets = NULL; root->number = 1; /* 0 is used for after header */ + root->usage = 1; /* do not delete root, ever */ root->is_existing = 0; root->is_apex = 0; - root->has_SOA = 0; + root->numlist_prev = NULL; + root->numlist_next = NULL; #ifdef NSEC3 - root->nsec3_cover = NULL; -#ifdef FULL_PREHASH - root->nsec3_is_exact = 0; - root->nsec3_ds_parent_is_exact = 0; - root->nsec3_wcard_child_cover = NULL; - root->nsec3_ds_parent_cover = NULL; - root->nsec3_lookup = NULL; -#endif /* FULL_PREHASH */ -#endif /* NSEC3 */ + root->nsec3 = NULL; +#endif 
result = (domain_table_type *) region_alloc(region, sizeof(domain_table_type)); result->region = region; - result->names_to_domains = rbtree_create( - region, (int (*)(const void *, const void *)) dname_compare); - rbtree_insert(result->names_to_domains, (rbnode_t *) root); + result->nametree = radix_tree_create(region); + root->rnode = radname_insert(result->nametree, dname_name(root->dname), + root->dname->name_size, root); result->root = root; + result->numlist_last = root; +#ifdef NSEC3 + result->prehash_list = NULL; +#endif return result; } @@ -115,7 +357,9 @@ domain_table_search(domain_table_type *table, assert(closest_match); assert(closest_encloser); - exact = rbtree_find_less_equal(table->names_to_domains, dname, (rbnode_t **) closest_match); + exact = radname_find_less_equal(table->nametree, dname_name(dname), + dname->name_size, (struct radnode**)closest_match); + *closest_match = (domain_type*)((*(struct radnode**)closest_match)->elem); assert(*closest_match); *closest_encloser = *closest_match; @@ -135,11 +379,11 @@ domain_table_search(domain_table_type *table, } domain_type * -domain_table_find(domain_table_type *table, - const dname_type *dname) +domain_table_find(domain_table_type* table, + const dname_type* dname) { - domain_type *closest_match; - domain_type *closest_encloser; + domain_type* closest_match; + domain_type* closest_encloser; int exact; exact = domain_table_search( @@ -149,12 +393,12 @@ domain_table_find(domain_table_type *table, domain_type * -domain_table_insert(domain_table_type *table, - const dname_type *dname) +domain_table_insert(domain_table_type* table, + const dname_type* dname) { - domain_type *closest_match; - domain_type *closest_encloser; - domain_type *result; + domain_type* closest_match; + domain_type* closest_encloser; + domain_type* result; int exact; assert(table); @@ -172,8 +416,9 @@ domain_table_insert(domain_table_type *table, result = allocate_domain_info(table, dname, closest_encloser); - rbtree_insert(table->names_to_domains, (rbnode_t *) result); - result->number = table->names_to_domains->count; + result->rnode = radname_insert(table->nametree, + dname_name(result->dname), + result->dname->name_size, result); /* * If the newly added domain name is larger @@ -199,26 +444,21 @@ domain_table_insert(domain_table_type *table, } int -domain_table_iterate(domain_table_type *table, +domain_table_iterate(domain_table_type* table, domain_table_iterator_type iterator, - void *user_data) + void* user_data) { - const void *dname; - void *node; int error = 0; - - assert(table); - - RBTREE_WALK(table->names_to_domains, dname, node) { - error += iterator((domain_type *) node, user_data); + struct radnode* n; + for(n = radix_first(table->nametree); n; n = radix_next(n)) { + error += iterator((domain_type*)n->elem, user_data); } - return error; } void -domain_add_rrset(domain_type *domain, rrset_type *rrset) +domain_add_rrset(domain_type* domain, rrset_type* rrset) { #if 0 /* fast */ rrset->next = domain->rrsets; @@ -240,9 +480,9 @@ domain_add_rrset(domain_type *domain, rrset_type *rrset) rrset_type * -domain_find_rrset(domain_type *domain, zone_type *zone, uint16_t type) +domain_find_rrset(domain_type* domain, zone_type* zone, uint16_t type) { - rrset_type *result = domain->rrsets; + rrset_type* result = domain->rrsets; while (result) { if (result->zone == zone && rrset_rrtype(result) == type) { @@ -254,9 +494,9 @@ domain_find_rrset(domain_type *domain, zone_type *zone, uint16_t type) } rrset_type * -domain_find_any_rrset(domain_type *domain, 
zone_type *zone) +domain_find_any_rrset(domain_type* domain, zone_type* zone) { - rrset_type *result = domain->rrsets; + rrset_type* result = domain->rrsets; while (result) { if (result->zone == zone) { @@ -268,9 +508,9 @@ domain_find_any_rrset(domain_type *domain, zone_type *zone) } zone_type * -domain_find_zone(domain_type *domain) +domain_find_zone(domain_type* domain) { - rrset_type *rrset; + rrset_type* rrset; while (domain) { for (rrset = domain->rrsets; rrset; rrset = rrset->next) { if (rrset_rrtype(rrset) == TYPE_SOA) { @@ -282,22 +522,10 @@ domain_find_zone(domain_type *domain) return NULL; } -#ifndef FULL_PREHASH -domain_type * -domain_find_zone_apex(domain_type *domain) { - while (domain != NULL) { - if (domain->has_SOA != 0) - return domain; - domain = domain->parent; - } - return NULL; -} -#endif /* !FULL_PREHASH */ - zone_type * -domain_find_parent_zone(zone_type *zone) +domain_find_parent_zone(zone_type* zone) { - rrset_type *rrset; + rrset_type* rrset; assert(zone); @@ -310,7 +538,7 @@ domain_find_parent_zone(zone_type *zone) } domain_type * -domain_find_ns_rrsets(domain_type *domain, zone_type *zone, rrset_type **ns) +domain_find_ns_rrsets(domain_type* domain, zone_type* zone, rrset_type **ns) { while (domain && domain != zone->apex) { *ns = domain_find_rrset(domain, zone, TYPE_NS); @@ -324,18 +552,18 @@ domain_find_ns_rrsets(domain_type *domain, zone_type *zone, rrset_type **ns) } int -domain_is_glue(domain_type *domain, zone_type *zone) +domain_is_glue(domain_type* domain, zone_type* zone) { - rrset_type *unused; - domain_type *ns_domain = domain_find_ns_rrsets(domain, zone, &unused); + rrset_type* unused; + domain_type* ns_domain = domain_find_ns_rrsets(domain, zone, &unused); return (ns_domain != NULL && domain_find_rrset(ns_domain, zone, TYPE_SOA) == NULL); } domain_type * -domain_wildcard_child(domain_type *domain) +domain_wildcard_child(domain_type* domain) { - domain_type *wildcard_child; + domain_type* wildcard_child; assert(domain); assert(domain->wildcard_child_closest_match); @@ -351,14 +579,14 @@ domain_wildcard_child(domain_type *domain) } int -zone_is_secure(zone_type *zone) +zone_is_secure(zone_type* zone) { assert(zone); return zone->is_secure; } uint16_t -rr_rrsig_type_covered(rr_type *rr) +rr_rrsig_type_covered(rr_type* rr) { assert(rr->type == TYPE_RRSIG); assert(rr->rdata_count > 0); @@ -368,20 +596,16 @@ rr_rrsig_type_covered(rr_type *rr) } zone_type * -namedb_find_zone(namedb_type *db, domain_type *domain) +namedb_find_zone(namedb_type* db, const dname_type* dname) { - zone_type *zone; - - for (zone = db->zones; zone; zone = zone->next) { - if (zone->apex == domain) - break; - } - - return zone; + struct radnode* n = radname_search(db->zonetree, dname_name(dname), + dname->name_size); + if(n) return (zone_type*)n->elem; + return NULL; } rrset_type * -domain_find_non_cname_rrset(domain_type *domain, zone_type *zone) +domain_find_non_cname_rrset(domain_type* domain, zone_type* zone) { /* find any rrset type that is not allowed next to a CNAME */ /* nothing is allowed next to a CNAME, except RRSIG, NSEC, NSEC3 */ @@ -402,251 +626,12 @@ domain_find_non_cname_rrset(domain_type *domain, zone_type *zone) return NULL; } -/** - * Create namedb. 
- * - */ -struct namedb * -namedb_create(void) -{ - struct namedb *db = NULL; - region_type *region = NULL; -#ifdef NSEC3 -#ifndef FULL_PREHASH - region_type *nsec3_region = NULL; - region_type *nsec3_mod_region = NULL; -#endif /* !FULL_PREHASH */ -#endif /* NSEC3 */ - -#ifdef USE_MMAP_ALLOC - region = region_create_custom(mmap_alloc, mmap_free, - MMAP_ALLOC_CHUNK_SIZE, MMAP_ALLOC_LARGE_OBJECT_SIZE, - MMAP_ALLOC_INITIAL_CLEANUP_SIZE, 1); -#else /* !USE_MMAP_ALLOC */ - region = region_create_custom(xalloc, free, - DEFAULT_CHUNK_SIZE, DEFAULT_LARGE_OBJECT_SIZE, - DEFAULT_INITIAL_CLEANUP_SIZE, 1); -#endif /* !USE_MMAP_ALLOC */ - if (region == NULL) - return NULL; - -#ifdef NSEC3 -#ifndef FULL_PREHASH -#ifdef USE_MMAP_ALLOC - nsec3_region = region_create_custom(mmap_alloc, mmap_free, - MMAP_ALLOC_CHUNK_SIZE, MMAP_ALLOC_LARGE_OBJECT_SIZE, - MMAP_ALLOC_INITIAL_CLEANUP_SIZE, 1); -#else /* !USE_MMAP_ALLOC */ - nsec3_region = region_create_custom(xalloc, free, - DEFAULT_CHUNK_SIZE, DEFAULT_LARGE_OBJECT_SIZE, - DEFAULT_INITIAL_CLEANUP_SIZE, 1); -#endif /* !USE_MMAP_ALLOC */ - if (nsec3_region == NULL) { - region_destroy(region); - return NULL; - } -#ifdef USE_MMAP_ALLOC - nsec3_mod_region = region_create_custom(mmap_alloc, mmap_free, - MMAP_ALLOC_CHUNK_SIZE, MMAP_ALLOC_LARGE_OBJECT_SIZE, - MMAP_ALLOC_INITIAL_CLEANUP_SIZE, 1); -#else /* !USE_MMAP_ALLOC */ - nsec3_mod_region = region_create_custom(xalloc, free, - DEFAULT_CHUNK_SIZE, DEFAULT_LARGE_OBJECT_SIZE, - DEFAULT_INITIAL_CLEANUP_SIZE, 1); -#endif /* !USE_MMAP_ALLOC */ - if (nsec3_mod_region == NULL) { - region_destroy(region); - region_destroy(nsec3_region); - return NULL; - } -#endif /* !FULL_PREHASH */ -#endif /* NSEC3 */ - - /* Make a new structure... */ - db = (namedb_type *) region_alloc(region, sizeof(namedb_type)); - db->region = region; -#ifdef NSEC3 -#ifndef FULL_PREHASH - db->nsec3_region = nsec3_region; - db->nsec3_mod_region = nsec3_mod_region; - db->nsec3_mod_domains = NULL; -#endif /* !FULL_PREHASH */ -#endif /* NSEC3 */ - db->domains = domain_table_create(region); - db->zones = NULL; - db->zone_count = 0; - db->filename = NULL; - db->fd = NULL; - db->crc = ~0; - db->crc_pos = 0; - db->diff_skip = 0; - db->diff_pos = 0; - return db; -} - -/** - * Destroy namedb. 
- * - */ -void -namedb_destroy(struct namedb *db) -{ -#ifdef NSEC3 -#ifndef FULL_PREHASH - region_destroy(db->nsec3_mod_region); - db->nsec3_mod_region = NULL; - db->nsec3_mod_domains = NULL; - region_destroy(db->nsec3_region); - db->nsec3_region = NULL; -#endif /* !FULL_PREHASH */ -#endif /* NSEC3 */ - region_destroy(db->region); -} - - -#ifdef NSEC3 -#ifndef FULL_PREHASH -int -zone_nsec3_domains_create(struct namedb *db, struct zone *zone) -{ - if ((db == NULL) || (zone == NULL)) - return EINVAL; - if (zone->nsec3_domains != NULL) - return 0; - zone->nsec3_domains = rbtree_create(db->nsec3_region, - dname_compare); - if (zone->nsec3_domains == NULL) - return ENOMEM; - return 0; -} - -int -zone_nsec3_domains_destroy(struct namedb *db, struct zone *zone) -{ - rbnode_t *node; - if ((db == NULL) || (zone == NULL)) - return EINVAL; - if (zone->nsec3_domains == NULL) - return 0; - - node = rbtree_postorder_first(zone->nsec3_domains->root); - while (node != RBTREE_NULL) { - struct nsec3_domain *nsec3_domain = - (struct nsec3_domain *) node; - node = rbtree_postorder_next(node); - - if (nsec3_domain->covers != NULL) { - nsec3_domain->covers->nsec3_cover = NULL; - } - region_recycle(db->nsec3_region, nsec3_domain, - sizeof(*nsec3_domain)); - } - region_recycle(db->nsec3_region, zone->nsec3_domains, - sizeof(*(zone->nsec3_domains))); - zone->nsec3_domains = NULL; - return 0; -} - - int -namedb_add_nsec3_domain(struct namedb *db, struct domain *domain, - struct zone *zone) +namedb_lookup(struct namedb* db, + const dname_type* dname, + domain_type **closest_match, + domain_type **closest_encloser) { - struct nsec3_domain *nsec3_domain; - if (zone->nsec3_domains == NULL) - return 0; - nsec3_domain = (struct nsec3_domain *) region_alloc(db->nsec3_region, - sizeof(*nsec3_domain)); - if (nsec3_domain == NULL) - return ENOMEM; - nsec3_domain->node.key = domain_dname(domain); - nsec3_domain->nsec3_domain = domain; - nsec3_domain->covers = NULL; - if (rbtree_insert(zone->nsec3_domains, (rbnode_t *) nsec3_domain) == NULL) { - region_recycle(db->nsec3_region, nsec3_domain, sizeof(*nsec3_domain)); - } - return 0; + return domain_table_search( + db->domains, dname, closest_match, closest_encloser); } - - -int -namedb_del_nsec3_domain(struct namedb *db, struct domain *domain, - struct zone *zone) -{ - rbnode_t *node; - struct nsec3_domain *nsec3_domain; - int error = 0; - - if (zone->nsec3_domains == NULL) - return 0; - - node = rbtree_delete(zone->nsec3_domains, domain_dname(domain)); - if (node == NULL) - return 0; - - nsec3_domain = (struct nsec3_domain *) node; - if (nsec3_domain->covers != NULL) { - /* - * It is possible that this NSEC3 domain was modified - * due to the addition/deletion of another NSEC3 domain. - * Make sure it gets added to the NSEC3 list later by - * making sure it's covered domain is added to the - * NSEC3 mod list. 
S64#3441 - */ - error = namedb_add_nsec3_mod_domain(db, nsec3_domain->covers); - nsec3_domain->covers->nsec3_cover = NULL; - nsec3_domain->covers = NULL; - } - region_recycle(db->nsec3_region, nsec3_domain, sizeof(*nsec3_domain)); - return error; -} - - -int -namedb_nsec3_mod_domains_create(struct namedb *db) -{ - if (db == NULL) - return EINVAL; - namedb_nsec3_mod_domains_destroy(db); - - db->nsec3_mod_domains = rbtree_create(db->nsec3_mod_region, dname_compare); - if (db->nsec3_mod_domains == NULL) - return ENOMEM; - return 0; -} - - -int -namedb_nsec3_mod_domains_destroy(struct namedb *db) -{ - if (db == NULL) - return EINVAL; - if (db->nsec3_mod_domains == NULL) - return 0; - region_free_all(db->nsec3_mod_region); - db->nsec3_mod_domains = NULL; - return 0; -} - -int -namedb_add_nsec3_mod_domain(struct namedb *db, struct domain *domain) -{ - struct nsec3_mod_domain *nsec3_mod_domain; - nsec3_mod_domain = (struct nsec3_mod_domain *) - region_alloc(db->nsec3_mod_region, sizeof(*nsec3_mod_domain)); - if (nsec3_mod_domain == NULL) { - log_msg(LOG_ERR, - "memory allocation failure on modified domain"); - return ENOMEM; - } - nsec3_mod_domain->node.key = domain_dname(domain); - nsec3_mod_domain->domain = domain; - - if (rbtree_insert(db->nsec3_mod_domains, (rbnode_t *) nsec3_mod_domain) == NULL) { - region_recycle(db->nsec3_mod_region, nsec3_mod_domain, - sizeof(*nsec3_mod_domain)); - } - return 0; -} -#endif /* !FULL_PREHASH */ -#endif /* NSEC3 */ diff --git a/usr.sbin/nsd/namedb.h b/usr.sbin/nsd/namedb.h index e50986a0fa4..dc0cb3fbff7 100644 --- a/usr.sbin/nsd/namedb.h +++ b/usr.sbin/nsd/namedb.h @@ -1,7 +1,7 @@ /* * namedb.h -- nsd(8) internal namespace database definitions * - * Copyright (c) 2001-2011, NLnet Labs. All rights reserved. + * Copyright (c) 2001-2006, NLnet Labs. All rights reserved. * * See LICENSE for the license. * @@ -14,13 +14,12 @@ #include "dname.h" #include "dns.h" +#include "radtree.h" #include "rbtree.h" -#include "util.h" struct zone_options; struct nsd_options; - -#define NAMEDB_MAGIC "NSDdbV08" -#define NAMEDB_MAGIC_SIZE 8 +struct udb_base; +struct udb_ptr; typedef union rdata_atom rdata_atom_type; typedef struct rrset rrset_type; @@ -35,102 +34,104 @@ typedef struct zone zone_type; struct domain_table { - region_type *region; - rbtree_t *names_to_domains; - domain_type *root; + region_type* region; + struct radtree *nametree; + domain_type* root; + /* ptr to biggest domain.number and last in list. + * the root is the lowest and first in the list. */ + domain_type *numlist_last; +#ifdef NSEC3 + /* the prehash list, start of the list */ + domain_type* prehash_list; +#endif /* NSEC3 */ }; -struct domain -{ - rbnode_t node; - domain_type *parent; - domain_type *nextdiff; - domain_type *wildcard_child_closest_match; - rrset_type *rrsets; #ifdef NSEC3 - domain_type *nsec3_cover; /* != NULL is exact cover */ -#ifdef FULL_PREHASH - /* (if nsec3 chain complete) nsec_cover is always the covering nsec3 - record */ +struct nsec3_domain_data { + /* (if nsec3 chain complete) always the covering nsec3 record */ + domain_type* nsec3_cover; /* the nsec3 that covers the wildcard child of this domain. */ - domain_type *nsec3_wcard_child_cover; + domain_type* nsec3_wcard_child_cover; /* for the DS case we must answer on the parent side of zone cut */ - domain_type *nsec3_ds_parent_cover; - /* the NSEC3 domain that has a hash-base32 <= than this dname. 
*/ - /* or NULL (no smaller one within this zone) - * this variable is used to look up the NSEC3 record that matches - * or covers a given b64-encoded-hash-string domain name. - * The result of the lookup is stored in the *_cover variables. - * The variable makes it possible to perform a rbtree lookup for - * a name, then take this 'jump' to the previous element that contains - * an NSEC3 record, with hopefully the correct parameters. */ - domain_type *nsec3_lookup; -#endif /* FULL_PREHASH */ + domain_type* nsec3_ds_parent_cover; + /* NSEC3 domains to prehash, prev and next on the list or cleared */ + domain_type* prehash_prev, *prehash_next; + /* entry in the nsec3tree (for NSEC3s in the chain in use) */ + rbnode_t nsec3_node; + /* entry in the hashtree (for precompiled domains) */ + rbnode_t hash_node; + /* entry in the wchashtree (the wildcard precompile) */ + rbnode_t wchash_node; + /* entry in the dshashtree (the parent ds precompile) */ + rbnode_t dshash_node; + + /* nsec3 hash */ + uint8_t nsec3_hash[NSEC3_HASH_LEN]; + /* nsec3 hash of wildcard before this name */ + uint8_t nsec3_wc_hash[NSEC3_HASH_LEN]; + /* parent-side DS hash */ + uint8_t nsec3_ds_parent_hash[NSEC3_HASH_LEN]; + /* if the nsec3 hash is available */ + unsigned have_nsec3_hash : 1; + unsigned have_nsec3_wc_hash : 1; + unsigned have_nsec3_ds_parent_hash : 1; + /* if the domain has an NSEC3 for it, use cover ptr to get it. */ + unsigned nsec3_is_exact : 1; + /* same but on parent side */ + unsigned nsec3_ds_parent_is_exact : 1; +}; #endif /* NSEC3 */ - uint32_t number; /* Unique domain name number. */ + +struct domain +{ + struct radnode* rnode; + const dname_type* dname; + domain_type* parent; + domain_type* wildcard_child_closest_match; + rrset_type* rrsets; +#ifdef NSEC3 + struct nsec3_domain_data* nsec3; +#endif + /* double-linked list sorted by domain.number */ + domain_type* numlist_prev, *numlist_next; + size_t number; /* Unique domain name number. */ + size_t usage; /* number of ptrs to this from RRs(in rdata) and + from zone-apex pointers, also the root has one + more to make sure it cannot be deleted. */ /* * This domain name exists (see wildcard clarification draft). */ unsigned is_existing : 1; unsigned is_apex : 1; - unsigned has_SOA : 1; -#ifdef NSEC3 -#ifdef FULL_PREHASH - /* if the domain has an NSEC3 for it, use cover ptr to get it. 
*/ - unsigned nsec3_is_exact : 1; - /* same but on parent side */ - unsigned nsec3_ds_parent_is_exact : 1; -#endif /* FULL_PREHASH */ -#endif /* NSEC3 */ }; struct zone { - zone_type *next; - domain_type *apex; - rrset_type *soa_rrset; - rrset_type *soa_nx_rrset; /* see bug #103 */ - rrset_type *ns_rrset; + struct radnode *node; /* this entry in zonetree */ + domain_type* apex; + rrset_type* soa_rrset; + rrset_type* soa_nx_rrset; /* see bug #103 */ + rrset_type* ns_rrset; #ifdef NSEC3 - rr_type *nsec3_soa_rr; /* rrset with SOA bit set */ - domain_type *nsec3_last; /* last domain with nsec3, wraps */ -#ifndef FULL_PREHASH - rbtree_t *nsec3_domains; -#endif /* !FULL_PREHASH */ -#endif /* NSEC3 */ - -#if defined(BIND8_STATS) && defined(USE_ZONE_STATS) - struct nsdst st; -#endif /* defined(BIND8_STATS) && defined(USE_ZONE_STATS) */ - - struct zone_options *opts; - uint32_t number; - uint8_t* dirty; /* array of dirty-flags, per child */ + rr_type* nsec3_param; /* NSEC3PARAM RR of chain in use or NULL */ + domain_type* nsec3_last; /* last domain with nsec3, wraps */ + /* in these trees, the root contains an elem ptr to the radtree* */ + rbtree_t* nsec3tree; /* tree with relevant NSEC3 domains */ + rbtree_t* hashtree; /* tree, hashed NSEC3precompiled domains */ + rbtree_t* wchashtree; /* tree, wildcard hashed domains */ + rbtree_t* dshashtree; /* tree, ds-parent-hash domains */ +#endif + struct zone_options* opts; unsigned is_secure : 1; /* zone uses DNSSEC */ - unsigned updated : 1; /* zone SOA was updated */ unsigned is_ok : 1; /* zone has not expired. */ + unsigned is_changed : 1; /* zone was changed by AXFR */ }; -#ifdef NSEC3 -#ifndef FULL_PREHASH -struct nsec3_domain { - rbnode_t node; - struct domain *nsec3_domain; - struct domain *covers; -}; - -struct nsec3_mod_domain { - rbnode_t node; - struct domain *domain; -}; -#endif /* !FULL_PREHASH */ -#endif /* NSEC3 */ - /* a RR in DNS */ struct rr { - domain_type *owner; - rdata_atom_type *rdatas; + domain_type* owner; + rdata_atom_type* rdatas; uint32_t ttl; uint16_t type; uint16_t klass; @@ -143,9 +144,9 @@ struct rr { */ struct rrset { - rrset_type *next; - zone_type *zone; - rr_type *rrs; + rrset_type* next; + zone_type* zone; + rr_type* rrs; uint16_t rr_count; }; @@ -157,10 +158,10 @@ struct rrset union rdata_atom { /* RDATA_WF_COMPRESSED_DNAME, RDATA_WF_UNCOMPRESSED_DNAME */ - domain_type *domain; + domain_type* domain; /* Default. */ - uint16_t *data; + uint16_t* data; }; /* @@ -171,8 +172,8 @@ domain_table_type *domain_table_create(region_type *region); /* * Search the domain table for a match and the closest encloser. */ -int domain_table_search(domain_table_type *table, - const dname_type *dname, +int domain_table_search(domain_table_type* table, + const dname_type* dname, domain_type **closest_match, domain_type **closest_encloser); @@ -181,17 +182,17 @@ int domain_table_search(domain_table_type *table, * root domain). */ static inline uint32_t -domain_table_count(domain_table_type *table) +domain_table_count(domain_table_type* table) { - return table->names_to_domains->count; + return table->nametree->count; } /* * Find the specified dname in the domain_table. NULL is returned if * there is no exact match. */ -domain_type *domain_table_find(domain_table_type *table, - const dname_type *dname); +domain_type* domain_table_find(domain_table_type* table, + const dname_type* dname); /* * Insert a domain name in the domain table. 
If the domain name is @@ -203,6 +204,17 @@ domain_type *domain_table_find(domain_table_type *table, domain_type *domain_table_insert(domain_table_type *table, const dname_type *dname); +/* put domain into nsec3 hash space tree */ +void zone_add_domain_in_hash_tree(region_type* region, rbtree_t** tree, + int (*cmpf)(const void*, const void*), domain_type* domain, + rbnode_t* node); +void zone_del_domain_in_hash_tree(rbtree_t* tree, rbnode_t* node); +void hash_tree_clear(rbtree_t* tree); +void hash_tree_delete(region_type* region, rbtree_t* tree); +void prehash_clear(domain_table_type* table); +void prehash_add(domain_table_type* table, domain_type* domain); +void prehash_del(domain_table_type* table, domain_type* domain); +int domain_is_prehash(domain_table_type* table, domain_type* domain); /* * Iterate over all the domain names in the domain tree. @@ -210,87 +222,80 @@ domain_type *domain_table_insert(domain_table_type *table, typedef int (*domain_table_iterator_type)(domain_type *node, void *user_data); -int domain_table_iterate(domain_table_type *table, +int domain_table_iterate(domain_table_type* table, domain_table_iterator_type iterator, - void *user_data); + void* user_data); /* * Add an RRset to the specified domain. Updates the is_existing flag * as required. */ -void domain_add_rrset(domain_type *domain, rrset_type *rrset); +void domain_add_rrset(domain_type* domain, rrset_type* rrset); -rrset_type *domain_find_rrset(domain_type *domain, zone_type *zone, uint16_t type); -rrset_type *domain_find_any_rrset(domain_type *domain, zone_type *zone); +rrset_type* domain_find_rrset(domain_type* domain, zone_type* zone, uint16_t type); +rrset_type* domain_find_any_rrset(domain_type* domain, zone_type* zone); -zone_type *domain_find_zone(domain_type *domain); -zone_type *domain_find_parent_zone(zone_type *zone); +zone_type* domain_find_zone(domain_type* domain); +zone_type* domain_find_parent_zone(zone_type* zone); -#ifndef FULL_PREHASH -domain_type *domain_find_zone_apex(domain_type *domain); -#endif /* !FULL_PREHASH */ -domain_type *domain_find_ns_rrsets(domain_type *domain, zone_type *zone, rrset_type **ns); +domain_type* domain_find_ns_rrsets(domain_type* domain, zone_type* zone, rrset_type **ns); -int domain_is_glue(domain_type *domain, zone_type *zone); +int domain_is_glue(domain_type* domain, zone_type* zone); -rrset_type *domain_find_non_cname_rrset(domain_type *domain, zone_type *zone); +rrset_type* domain_find_non_cname_rrset(domain_type* domain, zone_type* zone); -domain_type *domain_wildcard_child(domain_type *domain); +domain_type* domain_wildcard_child(domain_type* domain); -int zone_is_secure(zone_type *zone); +int zone_is_secure(zone_type* zone); static inline const dname_type * -domain_dname(domain_type *domain) +domain_dname(domain_type* domain) { - return (const dname_type *) domain->node.key; + return domain->dname; } static inline domain_type * -domain_previous(domain_type *domain) +domain_previous(domain_type* domain) { - rbnode_t *prev = rbtree_previous((rbnode_t *) domain); - return prev == RBTREE_NULL ? NULL : (domain_type *) prev; + struct radnode* prev = radix_prev(domain->rnode); + return prev == NULL ? NULL : (domain_type*)prev->elem; } static inline domain_type * -domain_next(domain_type *domain) +domain_next(domain_type* domain) { - rbnode_t *next = rbtree_next((rbnode_t *) domain); - return next == RBTREE_NULL ? NULL : (domain_type *) next; + struct radnode* next = radix_next(domain->rnode); + return next == NULL ? 
NULL : (domain_type*)next->elem; } +/* easy comparison for subdomain, true if d1 is subdomain of d2. */ +static inline int domain_is_subdomain(domain_type* d1, domain_type* d2) +{ return dname_is_subdomain(domain_dname(d1), domain_dname(d2)); } +/* easy printout, to static buffer of dname_to_string, fqdn. */ +static inline const char* domain_to_string(domain_type* domain) +{ return dname_to_string(domain_dname(domain), NULL); } + /* * The type covered by the signature in the specified RRSIG RR. */ -uint16_t rr_rrsig_type_covered(rr_type *rr); +uint16_t rr_rrsig_type_covered(rr_type* rr); typedef struct namedb namedb_type; struct namedb { - region_type *region; -#ifdef NSEC3 -#ifndef FULL_PREHASH - region_type *nsec3_region; - region_type *nsec3_mod_region; - rbtree_t *nsec3_mod_domains; -#endif /* !FULL_PREHASH */ -#endif /* NSEC3 */ - domain_table_type *domains; - zone_type *zones; - size_t zone_count; - char *filename; - FILE *fd; + region_type* region; + domain_table_type* domains; + struct radtree* zonetree; + struct udb_base* udb; /* the timestamp on the ixfr.db file */ struct timeval diff_timestamp; - /* the CRC on the nsd.db file and position of CRC in the db file */ - uint32_t crc; - off_t crc_pos; /* if diff_skip=1, diff_pos contains the nsd.diff place to continue */ uint8_t diff_skip; off_t diff_pos; }; static inline int rdata_atom_is_domain(uint16_t type, size_t index); +static inline int rdata_atom_is_literal_domain(uint16_t type, size_t index); static inline domain_type * rdata_atom_domain(rdata_atom_type atom) @@ -311,27 +316,48 @@ rdata_atom_data(rdata_atom_type atom) } +/* Find the zone for the specified dname in DB. */ +zone_type *namedb_find_zone(namedb_type *db, const dname_type *dname); /* - * Find the zone for the specified DOMAIN in DB. + * Delete a domain name from the domain table. Removes dname_info node. + * Only deletes if usage is 0, has no rrsets and no children. Checks parents + * for deletion as well. Adjusts numberlist(domain.number), and + * wcard_child closest match. 
*/ -zone_type *namedb_find_zone(namedb_type *db, domain_type *domain); - -/* dbcreate.c */ -struct namedb *namedb_new(const char *filename); -int namedb_save(struct namedb *db); -void namedb_discard(struct namedb *db); +void domain_table_deldomain(namedb_type* db, domain_type* domain); +/** dbcreate.c */ +int udb_write_rr(struct udb_base* udb, struct udb_ptr* z, rr_type* rr); +void udb_del_rr(struct udb_base* udb, struct udb_ptr* z, rr_type* rr); +int write_zone_to_udb(struct udb_base* udb, zone_type* zone, time_t mtime); +/** marshal rdata into buffer, must be MAX_RDLENGTH in size */ +size_t rr_marshal_rdata(rr_type* rr, uint8_t* rdata, size_t sz); /* dbaccess.c */ -int namedb_lookup (struct namedb *db, - const dname_type *dname, +int namedb_lookup (struct namedb* db, + const dname_type* dname, domain_type **closest_match, domain_type **closest_encloser); /* pass number of children (to alloc in dirty array */ -struct namedb *namedb_open(const char *filename, struct nsd_options* opt, - size_t num_children); -void namedb_fd_close(struct namedb *db); -void namedb_close(struct namedb *db); +struct namedb *namedb_open(const char *filename, struct nsd_options* opt); +void namedb_close_udb(struct namedb* db); +void namedb_close(struct namedb* db); +void namedb_check_zonefiles(struct namedb* db, struct nsd_options* opt, + struct udb_base* taskudb, struct udb_ptr* last_task); +void namedb_check_zonefile(struct namedb* db, struct udb_base* taskudb, + struct udb_ptr* last_task, struct zone_options* zo); +/** zone one zonefile into memory and revert on parse error, write to udb */ +void namedb_read_zonefile(struct namedb* db, struct zone* zone, + struct udb_base* taskudb, struct udb_ptr* last_task); +void apex_rrset_checks(struct namedb* db, rrset_type* rrset, + domain_type* domain); +zone_type* namedb_zone_create(namedb_type* db, const dname_type* dname, + struct zone_options* zopt); +void namedb_zone_delete(namedb_type* db, zone_type* zone); +void namedb_write_zonefile(namedb_type* db, struct zone_options* zopt); +void namedb_write_zonefiles(namedb_type* db, struct nsd_options* options); +int create_dirs(const char* path); +void allocate_domain_nsec3(domain_table_type *table, domain_type *result); static inline int rdata_atom_is_domain(uint16_t type, size_t index) @@ -343,6 +369,15 @@ rdata_atom_is_domain(uint16_t type, size_t index) || descriptor->wireformat[index] == RDATA_WF_UNCOMPRESSED_DNAME)); } +static inline int +rdata_atom_is_literal_domain(uint16_t type, size_t index) +{ + const rrtype_descriptor_type *descriptor + = rrtype_descriptor_by_type(type); + return (index < descriptor->maximum + && (descriptor->wireformat[index] == RDATA_WF_LITERAL_DNAME)); +} + static inline rdata_wireformat_type rdata_atom_wireformat_type(uint16_t type, size_t index) { @@ -353,7 +388,7 @@ rdata_atom_wireformat_type(uint16_t type, size_t index) } static inline uint16_t -rrset_rrtype(rrset_type *rrset) +rrset_rrtype(rrset_type* rrset) { assert(rrset); assert(rrset->rr_count > 0); @@ -361,35 +396,11 @@ rrset_rrtype(rrset_type *rrset) } static inline uint16_t -rrset_rrclass(rrset_type *rrset) +rrset_rrclass(rrset_type* rrset) { assert(rrset); assert(rrset->rr_count > 0); return rrset->rrs[0].klass; } -/** - * Allocate and initialize a struct namedb. - * Returns a pointer to a valid struct namedb or NULL on failure. - */ -struct namedb * namedb_create(void); - -/** - * Destroy a struct namedb created using the namedb_create function. - * Frees all regions associated with the namedb structure. 
- */ -void namedb_destroy(struct namedb *db); - -#ifdef NSEC3 -#ifndef FULL_PREHASH -int zone_nsec3_domains_create(struct namedb *db, struct zone *zone); -int zone_nsec3_domains_destroy(struct namedb *db, struct zone *zone); -int namedb_add_nsec3_domain(struct namedb *db, struct domain *domain, struct zone *zone); -int namedb_del_nsec3_domain(struct namedb *db, struct domain *domain, struct zone *zone); -int namedb_nsec3_mod_domains_create(struct namedb *db); -int namedb_nsec3_mod_domains_destroy(struct namedb *db); -int namedb_add_nsec3_mod_domain(struct namedb *db, struct domain *domain); -#endif /* !FULL_PREHASH */ -#endif /* NSEC3 */ - #endif diff --git a/usr.sbin/nsd/netio.c b/usr.sbin/nsd/netio.c index 2c64b6d1f67..ad8ee16ee60 100644 --- a/usr.sbin/nsd/netio.c +++ b/usr.sbin/nsd/netio.c @@ -1,7 +1,7 @@ /* * netio.c -- network I/O support. * - * Copyright (c) 2001-2011, NLnet Labs. All rights reserved. + * Copyright (c) 2001-2006, NLnet Labs. All rights reserved. * * See LICENSE for the license. * @@ -151,32 +151,7 @@ netio_dispatch(netio_type *netio, const struct timespec *timeout, const sigset_t max_fd = handler->fd; } if (handler->event_types & NETIO_EVENT_READ) { - extern int slowaccept; - extern struct timespec slowaccept_timeout; - - if ((handler->event_types & NETIO_EVENT_ACCEPT) && slowaccept) { - if (timespec_compare(&slowaccept_timeout, netio_current_time(netio)) < 0) { - slowaccept = 0; - } - if (slowaccept) { - /** Timeout after slowaccept timeout. */ - struct timespec relative; - relative.tv_sec = slowaccept_timeout.tv_sec; - relative.tv_nsec = slowaccept_timeout.tv_nsec; - timespec_subtract(&relative, netio_current_time(netio)); - if (!have_timeout || - timespec_compare(&relative, &minimum_timeout) < 0) { - have_timeout = 1; - minimum_timeout.tv_sec = relative.tv_sec; - minimum_timeout.tv_nsec = relative.tv_nsec; - } - } else { - FD_SET(handler->fd, &readfds); - } - } else { - /* Not accept event or not slow accept */ - FD_SET(handler->fd, &readfds); - } + FD_SET(handler->fd, &readfds); } if (handler->event_types & NETIO_EVENT_WRITE) { FD_SET(handler->fd, &writefds); diff --git a/usr.sbin/nsd/netio.h b/usr.sbin/nsd/netio.h index c6686afc26f..c8299b97adb 100644 --- a/usr.sbin/nsd/netio.h +++ b/usr.sbin/nsd/netio.h @@ -1,7 +1,7 @@ /* * netio.h -- network I/O support. * - * Copyright (c) 2001-2011, NLnet Labs. All rights reserved. + * Copyright (c) 2001-2006, NLnet Labs. All rights reserved. * * See LICENSE for the license. * @@ -50,8 +50,6 @@ #include "region-allocator.h" -#define NETIO_SLOW_ACCEPT_TIMEOUT 2 /* in seconds */ - /* * The type of events a handler is interested in. These can be OR'ed * together to specify multiple event types. @@ -62,7 +60,6 @@ enum netio_event_types { NETIO_EVENT_WRITE = 2, NETIO_EVENT_EXCEPT = 4, NETIO_EVENT_TIMEOUT = 8, - NETIO_EVENT_ACCEPT = 16 }; typedef enum netio_event_types netio_event_types_type; diff --git a/usr.sbin/nsd/nsd-control-setup.sh.in b/usr.sbin/nsd/nsd-control-setup.sh.in new file mode 100755 index 00000000000..394afb40c41 --- /dev/null +++ b/usr.sbin/nsd/nsd-control-setup.sh.in @@ -0,0 +1,160 @@ +#!/bin/sh +# +# nsd-control-setup.sh - set up SSL certificates for nsd-control +# +# Copyright (c) 2011, NLnet Labs. All rights reserved. +# +# This software is open source. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# Neither the name of the NLNET LABS nor the names of its contributors may +# be used to endorse or promote products derived from this software without +# specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +# settings: + +# directory for files +DESTDIR=@configdir@ + +# issuer and subject name for certificates +SERVERNAME=nsd +CLIENTNAME=nsd-control + +# validity period for certificates +DAYS=7200 + +# size of keys in bits +BITS=1536 + +# hash algorithm +HASH=sha256 + +# base name for nsd server keys +SVR_BASE=nsd_server + +# base name for nsd-control keys +CTL_BASE=nsd_control + +# we want -rw-r--- access (say you run this as root: grp=yes (server), all=no). +umask 0026 + +# end of options + +# functions: +error ( ) { + echo "$0 fatal error: $1" + exit 1 +} + +# check arguments: +while test $# -ne 0; do + case $1 in + -d) + if test $# -eq 1; then error "need argument for -d"; fi + DESTDIR="$2" + shift + ;; + *) + echo "nsd-control-setup.sh - setup SSL keys for nsd-control" + echo " -d dir use directory to store keys and certificates." + echo " default: $DESTDIR" + exit 1 + ;; + esac + shift +done + +# go!: +echo "setup in directory $DESTDIR" +cd "$DESTDIR" || error "could not cd to $DESTDIR" + +# create certificate keys; do not recreate if they already exist. 
+if test -f $SVR_BASE.key; then + echo "$SVR_BASE.key exists" +else + echo "generating $SVR_BASE.key" + openssl genrsa -out $SVR_BASE.key $BITS || error "could not genrsa" +fi +if test -f $CTL_BASE.key; then + echo "$CTL_BASE.key exists" +else + echo "generating $CTL_BASE.key" + openssl genrsa -out $CTL_BASE.key $BITS || error "could not genrsa" +fi + +# create self-signed cert for server +cat >request.cfg <<EOF +[req] +default_bits=$BITS +default_md=$HASH +prompt=no +distinguished_name=req_distinguished_name + +[req_distinguished_name] +commonName=$SERVERNAME +EOF +test -f request.cfg || error "could not create request.cfg" + +echo "create $SVR_BASE.pem (self signed certificate)" +openssl req -key $SVR_BASE.key -config request.cfg -new -x509 -days $DAYS -out $SVR_BASE.pem || error "could not create $SVR_BASE.pem" +# create trusted usage pem +openssl x509 -in $SVR_BASE.pem -addtrust serverAuth -out $SVR_BASE"_trust.pem" + +# create client request and sign it, piped +cat >request.cfg <<EOF +[req] +default_bits=$BITS +default_md=$HASH +prompt=no +distinguished_name=req_distinguished_name + +[req_distinguished_name] +commonName=$CLIENTNAME +EOF +test -f request.cfg || error "could not create request.cfg" + +echo "create $CTL_BASE.pem (signed client certificate)" +openssl req -key $CTL_BASE.key -config request.cfg -new | openssl x509 -req -days $DAYS -CA $SVR_BASE"_trust.pem" -CAkey $SVR_BASE.key -CAcreateserial -$HASH -out $CTL_BASE.pem +test -f $CTL_BASE.pem || error "could not create $CTL_BASE.pem" +# create trusted usage pem +# openssl x509 -in $CTL_BASE.pem -addtrust clientAuth -out $CTL_BASE"_trust.pem" + +# see details with openssl x509 -noout -text < $SVR_BASE.pem +# echo "create $CTL_BASE""_browser.pfx (web client certificate)" +# echo "create webbrowser PKCS#12 .PFX certificate file. In Firefox import in:" +# echo "preferences - advanced - encryption - view certificates - your certs" +# echo "empty password is used, simply click OK on the password dialog box." +# openssl pkcs12 -export -in $CTL_BASE"_trust.pem" -inkey $CTL_BASE.key -name "nsd remote control client cert" -out $CTL_BASE"_browser.pfx" -password "pass:" || error "could not create browser certificate" + +# remove unused permissions +chmod o-rw $SVR_BASE.pem $SVR_BASE.key $CTL_BASE.pem $CTL_BASE.key + +# remove crap +rm -f request.cfg +rm -f $CTL_BASE"_trust.pem" $SVR_BASE"_trust.pem" $SVR_BASE"_trust.srl" + +echo "Setup success. Certificates created. Enable in nsd.conf file to use" + +exit 0 diff --git a/usr.sbin/nsd/nsd-control.8.in b/usr.sbin/nsd/nsd-control.8.in new file mode 100644 index 00000000000..bf610f1097b --- /dev/null +++ b/usr.sbin/nsd/nsd-control.8.in @@ -0,0 +1,245 @@ +.TH "nsd\-control" "8" "Oct 29, 2013" "NLnet Labs" "nsd 4.0.0" +.\" Copyright (c) 2011, NLnet Labs. All rights reserved. +.\" See LICENSE for the license. +.SH "NAME" +.LP +.B nsd\-control, +.B nsd\-control\-setup +\- NSD remote server control utility. +.SH "SYNOPSIS" +.B nsd\-control +.RB [ \-c +.IR cfgfile ] +.RB [ \-s +.IR server ] +.IR command +.SH "DESCRIPTION" +.B nsd\-control +performs remote administration on the \fInsd\fR(8) DNS server. It reads +the configuration file, contacts the nsd server over SSL, sends the +command and displays the result. +.P +The available options are: +.TP +.B \-h +Show the version and commandline option help. +.TP +.B \-c \fIcfgfile +The config file to read with settings. If not given the default +config file @nsdconfigfile@ is used. 
+.TP +.B \-s \fIserver[@port] +IPv4 or IPv6 address of the server to contact. If not given, the +address is read from the config file. +.SH "COMMANDS" +There are several commands that the server understands. +.TP +.B start +Start the server. Simply execs \fInsd\fR(8). The nsd executable +is searched for in the \fBPATH\fR set in the environment. It is started +with the config file specified using \fI\-c\fR or the default config file. +.TP +.B stop +Stop the server. The server daemon exits. +.TP +.B reload [<zone>] +Reload zonefiles and reopen logfile. Without argument reads changed +zonefiles. With argument reads the zonefile for the given zone and +loads it. +.TP +.B reconfig +Reload nsd.conf and apply changes to TSIG keys and configuration patterns, +and apply the changes to add and remove zones that are mentioned in the config. +Other changes are not applied, such as listening ip address and port and chroot. +The pattern updates means that the configuration options for +zones (request\-xfr, zonefile, notify, ...) are updated. Also new +patterns are available for use with the addzone command. +.TP +.B repattern +Same as the reconfig option. +.TP +.B log_reopen +Reopen the logfile, for log rotate that wants to move the logfile away +and create a new logfile. The log can also be reopened with kill \-HUP +(which also reloads all zonefiles). +.TP +.B status +Display server status. Exit code 3 if not running (the connection to the +port is refused), 1 on error, 0 if running. +.TP +.B stats +Output a sequence of name=value lines with statistics information, requires +NSD to be compiled with this option enabled. +.TP +.B stats_noreset +Same as stats, but does not zero the counters. +.TP +.B addzone <zone name> <pattern name> +Add a new zone to the running server. The zone is added to the zonelist +file on disk, so it stays after a restart. The pattern name determines +the options for the new zone. For slave zones a zone transfer is +immediately attempted. For zones with a zonefile, the zone file is +attempted to be read in. +.TP +.B delzone <zone name> +Remove the zone from the running server. The zone is removed from the +zonelist file on disk, from the nsd.db file and from the memory. If it +had a zonefile, this remains (but may be outdated). Zones configured +inside nsd.conf itself cannot be removed this way because the daemon +does not write to the nsd.conf file, you need to add such zones to the +zonelist file to be able to delete them with the delzone command. +.TP +.B write [<zone>] +Write zonefiles to disk, or the given zonefile to disk. Zones that have +changed (via AXFR or IXFR) are written, or if the zonefile has not been +created yet then it is created. Directory components of the zonefile +path are created if necessary. +.TP +.B notify [<zone>] +Send NOTIFY messages to slave servers. Sends to the IP addresses +configured in the 'notify:' lists for the master zones hosted on this +server. Usually NSD sends NOTIFY messages right away when a master zone +serial is updated. If a zone is given, notifies are sent for that zone. +These slave servers are supposed to initiate a zone transfer request +later (to this server or another master), this can be allowed via +the 'provide\-xfr:' acl list configuration. +.TP +.B transfer [<zone>] +Attempt to update slave zones that are hosted on this server by contacting +the masters. The masters are configured via 'request\-xfr:' lists. +If a zone is given, that zone is updated. 
Usually NSD receives a NOTIFY +from the masters (configured via 'allow\-notify:' acl list) that a new zone +serial has to be transferred. +.TP +.B force_transfer [<zone>] +Force update slave zones that are hosted on this server. Even if the +master hosts the same serial number of the zone, a full AXFR is performed +to fetch it. If you want to use IXFR and check that the serial number +increases, use the 'transfer' command. +.TP +.B zonestatus [<zone>] +Print state of the zone, the serial numbers and since when they have +been acquired. Also prints the notify action (to which server), and +zone transfer (and from which master) if there is activity right now. +.TP +.B serverpid +Prints the PID of the server process. This is used for statistics (and +only works when NSD is compiled with statistics enabled). This pid is +not for sending unix signals, use the pid from nsd.pid for that, that pid +is also stable. +.TP +.B verbosity <number> +Change logging verbosity. +.SH "EXIT CODE" +The nsd\-control program exits with status code 1 on error, 0 on success. +.SH "SET UP" +The setup requires a self\-signed certificate and private keys for both +the server and client. The script \fInsd\-control\-setup\fR generates +these in the default run directory, or with \-d in another directory. +If you change the access control permissions on the key files you can decide +who can use nsd\-control, by default owner and group but not all users. +The script preserves private keys present in the directory. +After running the script as root, turn on \fBcontrol\-enable\fR in +\fInsd.conf\fR. +.SH "STATISTIC COUNTERS" +The \fIstats\fR command shows a number of statistic counters. +.TP +.I num.queries +number of queries received (the tcp and udp queries added up). +.TP +.I serverX.queries +number of queries handled by the server process. The number of +server processes is set with the config statement \fBserver\-count\fR. +.TP +.I time.boot +uptime in seconds since the server was started. With fractional seconds. +.TP +.I time.elapsed +time since the last stats report, in seconds. With fractional seconds. +Can be zero if polled quickly and the previous stats command resets the +counters, so that the next gets a fully zero, and zero elapsed time, report. +.TP +.I size.db.disk +size of nsd.db on disk, in bytes. +.TP +.I size.db.mem +size of the DNS database in memory, in bytes. +.TP +.I size.xfrd.mem +size of memory for zone transfers and notifies in xfrd process, excludes +TSIG data, in bytes. +.TP +.I size.config.disk +size of zonelist file on disk, excludes the nsd.conf size, in bytes. +.TP +.I size.config.mem +size of config data in memory, kept twice in server and xfrd process, +in bytes. +.TP +.I num.type.X +number of queries with this query type. +.TP +.I num.opcode.X +number of queries with this opcode. +.TP +.I num.class.X +number of queries with this query class. +.TP +.I num.rcode.X +number of answers that carried this return code. +.TP +.I num.edns +number of queries with EDNS OPT. +.TP +.I num.ednserr +number of queries which failed EDNS parse. +.TP +.I num.udp +number of queries over UDP ip4. +.TP +.I num.udp6 +number of queries over UDP ip6. +.TP +.I num.tcp +number of connections over TCP ip4. +.TP +.I num.tcp6 +number of connections over TCP ip6. +.TP +.I num.answer_wo_aa +number of answers with NOERROR rcode and without AA flag, this includes the referrals. +.TP +.I num.rxerr +number of queries for which the receive failed. +.TP +.I num.txerr +number of answers for which the transmit failed. 
+.TP +.I num.raxfr +number of AXFR requests from clients (that got served with reply). +.TP +.I num.truncated +number of answers with TC flag set. +.TP +.I num.dropped +number of queries that were dropped because they failed sanity check. +.TP +.I zone.master +number of master zones served. These are zones with no 'request\-xfr:' +entries. +.TP +.I zone.slave +number of slave zones served. These are zones with 'request\-xfr' +entries. +.SH "FILES" +.TP +.I @nsdconfigfile@ +nsd configuration file. +.TP +.I @configdir@ +directory with private keys (nsd_server.key and nsd_control.key) and +self\-signed certificates (nsd_server.pem and nsd_control.pem). +.SH "SEE ALSO" +\fInsd.conf\fR(5), +\fInsd\fR(8), +\fInsd\-checkconf\fR(8) diff --git a/usr.sbin/nsd/nsd-control.c b/usr.sbin/nsd/nsd-control.c new file mode 100644 index 00000000000..e9551851f17 --- /dev/null +++ b/usr.sbin/nsd/nsd-control.c @@ -0,0 +1,415 @@ +/* + * nsd-control.c - remote control utility for nsd. + * + * Copyright (c) 2011, NLnet Labs. All rights reserved. + * + * This software is open source. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * Neither the name of the NLNET LABS nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file + * + * The remote control utility contacts the nsd server over ssl and + * sends the command, receives the answer, and displays the result + * from the commandline. + */ + +#include "config.h" +#ifdef HAVE_SSL + +#include <sys/types.h> +#include <unistd.h> +#include <string.h> +#ifdef HAVE_OPENSSL_SSL_H +#include <openssl/ssl.h> +#endif +#ifdef HAVE_OPENSSL_ERR_H +#include <openssl/err.h> +#endif +#ifdef HAVE_OPENSSL_RAND_H +#include <openssl/rand.h> +#endif +#include "util.h" +#include "tsig.h" +#include "options.h" + +/** Give nsd-control usage, and exit (1). */ +static void +usage() +{ + printf("Usage: nsd-control [options] command\n"); + printf(" Remote control utility for nsd server.\n"); + printf("Version %s. 
Report bugs to <%s>.\n", + PACKAGE_VERSION, PACKAGE_BUGREPORT); + printf("Options:\n"); + printf(" -c file config file, default is %s\n", CONFIGFILE); + printf(" -s ip[@port] server address, if omitted config is used.\n"); + printf(" -h show this usage help.\n"); + printf("Commands:\n"); + printf(" start start server; runs nsd(8)\n"); + printf(" stop stops the server\n"); + printf(" reload [<zone>] reload modified zonefiles from disk\n"); + printf(" reconfig reload the config file\n"); + printf(" repattern the same as reconfig\n"); + printf(" log_reopen reopen logfile (for log rotate)\n"); + printf(" status display status of server\n"); + printf(" stats print statistics\n"); + printf(" stats_noreset peek at statistics\n"); + printf(" addzone <name> <pattern> add a new zone\n"); + printf(" delzone <name> remove a zone\n"); + printf(" write [<zone>] write changed zonefiles to disk\n"); + printf(" notify [<zone>] send NOTIFY messages to slave servers\n"); + printf(" transfer [<zone>] try to update slave zones to newer serial\n"); + printf(" force_transfer [<zone>] update slave zones with AXFR, no serial check\n"); + printf(" zonestatus [<zone>] print state, serial, activity\n"); + printf(" serverpid get pid of server process\n"); + printf(" verbosity <number> change logging detail\n"); + exit(1); +} + +/** exit with ssl error */ +static void ssl_err(const char* s) +{ + fprintf(stderr, "error: %s\n", s); + ERR_print_errors_fp(stderr); + exit(1); +} + +/** setup SSL context */ +static SSL_CTX* +setup_ctx(nsd_options_t* cfg) +{ + char* s_cert, *c_key, *c_cert; + SSL_CTX* ctx; + + s_cert = cfg->server_cert_file; + c_key = cfg->control_key_file; + c_cert = cfg->control_cert_file; + + /* filenames may be relative to zonesdir */ + if (cfg->zonesdir && cfg->zonesdir[0] && + (s_cert[0] != '/' || c_key[0] != '/' || c_cert[0] != '/')) { + if(chdir(cfg->zonesdir)) + ssl_err("could not chdir to zonesdir"); + } + + ctx = SSL_CTX_new(SSLv23_client_method()); + if(!ctx) + ssl_err("could not allocate SSL_CTX pointer"); + if(!(SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv2) & SSL_OP_NO_SSLv2)) + ssl_err("could not set SSL_OP_NO_SSLv2"); + if(!SSL_CTX_use_certificate_file(ctx,c_cert,SSL_FILETYPE_PEM) || + !SSL_CTX_use_PrivateKey_file(ctx,c_key,SSL_FILETYPE_PEM) + || !SSL_CTX_check_private_key(ctx)) + ssl_err("Error setting up SSL_CTX client key and cert"); + if (SSL_CTX_load_verify_locations(ctx, s_cert, NULL) != 1) + ssl_err("Error setting up SSL_CTX verify, server cert"); + SSL_CTX_set_verify(ctx, SSL_VERIFY_PEER, NULL); + + return ctx; +} + +/** contact the server with TCP connect */ +static int +contact_server(const char* svr, nsd_options_t* cfg, int statuscmd) +{ +#ifdef INET6 + struct sockaddr_storage addr; +#else + struct sockaddr_in addr; +#endif + socklen_t addrlen; + int fd; + int port = cfg->control_port; + /* use svr or a config entry */ + if(!svr) { + if(cfg->control_interface) + svr = cfg->control_interface->address; + else svr = "127.0.0.1"; + /* config 0 addr (everything), means ask localhost */ + if(strcmp(svr, "0.0.0.0") == 0) + svr = "127.0.0.1"; + else if(strcmp(svr, "::0") == 0 || + strcmp(svr, "0::0") == 0 || + strcmp(svr, "0::") == 0 || + strcmp(svr, "::") == 0) + svr = "::1"; + } + if(strchr(svr, '@')) { + char* ps = strchr(svr, '@'); + *ps++ = 0; + port = atoi(ps); + if(!port) { + fprintf(stderr, "could not parse port %s\n", ps); + exit(1); + } + } + if(strchr(svr, ':')) { + struct sockaddr_in6 sa; + addrlen = (socklen_t)sizeof(struct sockaddr_in6); + memset(&sa, 0, addrlen); + sa.sin6_family 
= AF_INET6; + sa.sin6_port = (in_port_t)htons((uint16_t)port); + if(inet_pton((int)sa.sin6_family, svr, &sa.sin6_addr) <= 0) { + fprintf(stderr, "could not parse IP: %s\n", svr); + exit(1); + } + memcpy(&addr, &sa, addrlen); + } else { /* ip4 */ + struct sockaddr_in sa; + addrlen = (socklen_t)sizeof(struct sockaddr_in); + memset(&sa, 0, addrlen); + sa.sin_family = AF_INET; + sa.sin_port = (in_port_t)htons((uint16_t)port); + if(inet_pton((int)sa.sin_family, svr, &sa.sin_addr) <= 0) { + fprintf(stderr, "could not parse IP: %s\n", svr); + exit(1); + } + memcpy(&addr, &sa, addrlen); + } + + fd = socket(strchr(svr, ':')?AF_INET6:AF_INET, SOCK_STREAM, 0); + if(fd == -1) { + fprintf(stderr, "socket: %s\n", strerror(errno)); + exit(1); + } + if(connect(fd, (struct sockaddr*)&addr, addrlen) < 0) { + fprintf(stderr, "error: connect (%s@%d): %s\n", svr, port, + strerror(errno)); + if(errno == ECONNREFUSED && statuscmd) { + printf("nsd is stopped\n"); + exit(3); + } + exit(1); + } + return fd; +} + +/** setup SSL on the connection */ +static SSL* +setup_ssl(SSL_CTX* ctx, int fd) +{ + SSL* ssl; + X509* x; + int r; + + ssl = SSL_new(ctx); + if(!ssl) + ssl_err("could not SSL_new"); + SSL_set_connect_state(ssl); + (void)SSL_set_mode(ssl, SSL_MODE_AUTO_RETRY); + if(!SSL_set_fd(ssl, fd)) + ssl_err("could not SSL_set_fd"); + while(1) { + ERR_clear_error(); + if( (r=SSL_do_handshake(ssl)) == 1) + break; + r = SSL_get_error(ssl, r); + if(r != SSL_ERROR_WANT_READ && r != SSL_ERROR_WANT_WRITE) + ssl_err("SSL handshake failed"); + /* wants to be called again */ + } + + /* check authenticity of server */ + if(SSL_get_verify_result(ssl) != X509_V_OK) + ssl_err("SSL verification failed"); + x = SSL_get_peer_certificate(ssl); + if(!x) + ssl_err("Server presented no peer certificate"); + X509_free(x); + return ssl; +} + +/** send stdin to server */ +static void +send_file(SSL* ssl, FILE* in, char* buf, size_t sz) +{ + while(fgets(buf, (int)sz, in)) { + if(SSL_write(ssl, buf, (int)strlen(buf)) <= 0) + ssl_err("could not SSL_write contents"); + } +} + +/** send command and display result */ +static int +go_cmd(SSL* ssl, int argc, char* argv[]) +{ + char pre[10]; + const char* space=" "; + const char* newline="\n"; + int was_error = 0, first_line = 1; + int r, i; + char buf[1024]; + snprintf(pre, sizeof(pre), "NSDCT%d ", NSD_CONTROL_VERSION); + if(SSL_write(ssl, pre, (int)strlen(pre)) <= 0) + ssl_err("could not SSL_write"); + for(i=0; i<argc; i++) { + if(SSL_write(ssl, space, (int)strlen(space)) <= 0) + ssl_err("could not SSL_write"); + if(SSL_write(ssl, argv[i], (int)strlen(argv[i])) <= 0) + ssl_err("could not SSL_write"); + } + if(SSL_write(ssl, newline, (int)strlen(newline)) <= 0) + ssl_err("could not SSL_write"); + + /* TODO remove or use file upload */ + if(argc == 1 && strcmp(argv[0], "load_cache") == 0) { + send_file(ssl, stdin, buf, sizeof(buf)); + } + + while(1) { + ERR_clear_error(); + if((r = SSL_read(ssl, buf, (int)sizeof(buf)-1)) <= 0) { + if(SSL_get_error(ssl, r) == SSL_ERROR_ZERO_RETURN) { + /* EOF */ + break; + } + ssl_err("could not SSL_read"); + } + buf[r] = 0; + printf("%s", buf); + if(first_line && strncmp(buf, "error", 5) == 0) + was_error = 1; + first_line = 0; + } + return was_error; +} + +/** go ahead and read config, contact server and perform command and display */ +static int +go(const char* cfgfile, char* svr, int argc, char* argv[]) +{ + nsd_options_t* opt; + int fd, ret; + SSL_CTX* ctx; + SSL* ssl; + + /* read config */ + if(!(opt = nsd_options_create(region_create(xalloc, free)))) { + 
fprintf(stderr, "out of memory\n"); + exit(1); + } + tsig_init(opt->region); + if(!parse_options_file(opt, cfgfile, NULL, NULL)) { + fprintf(stderr, "could not read config file\n"); + exit(1); + } + if(!opt->control_enable) + fprintf(stderr, "warning: control-enable is 'no' in the config file.\n"); + ctx = setup_ctx(opt); + + /* contact server */ + fd = contact_server(svr, opt, argc>0&&strcmp(argv[0],"status")==0); + ssl = setup_ssl(ctx, fd); + + /* send command */ + ret = go_cmd(ssl, argc, argv); + + SSL_free(ssl); + close(fd); + SSL_CTX_free(ctx); + region_destroy(opt->region); + return ret; +} + +/** getopt global, in case header files fail to declare it. */ +extern int optind; +/** getopt global, in case header files fail to declare it. */ +extern char* optarg; + +/** Main routine for nsd-control */ +int main(int argc, char* argv[]) +{ + int c; + const char* cfgfile = CONFIGFILE; + char* svr = NULL; +#ifdef USE_WINSOCK + int r; + WSADATA wsa_data; +#endif + log_init("nsd-control"); + + ERR_load_crypto_strings(); + ERR_load_SSL_strings(); + OpenSSL_add_all_algorithms(); + (void)SSL_library_init(); + + if(!RAND_status()) { + /* try to seed it */ + unsigned char buf[256]; + unsigned int v, seed=(unsigned)time(NULL) ^ (unsigned)getpid(); + size_t i; + v = seed; + for(i=0; i<256/sizeof(v); i++) { + memmove(buf+i*sizeof(v), &v, sizeof(v)); + v = v*seed + (unsigned int)i; + } + RAND_seed(buf, 256); + fprintf(stderr, "warning: no entropy, seeding openssl PRNG with time\n"); + } + + /* parse the options */ + while( (c=getopt(argc, argv, "c:s:h")) != -1) { + switch(c) { + case 'c': + cfgfile = optarg; + break; + case 's': + svr = optarg; + break; + case '?': + case 'h': + default: + usage(); + } + } + argc -= optind; + argv += optind; + if(argc == 0) + usage(); + if(argc >= 1 && strcmp(argv[0], "start")==0) { + if(execl(NSD_START_PATH, "nsd", "-c", cfgfile, + (char*)NULL) < 0) { + fprintf(stderr, "could not exec %s: %s\n", + NSD_START_PATH, strerror(errno)); + exit(1); + } + } + + return go(cfgfile, svr, argc, argv); +} + +#else /* HAVE_SSL */ +int main(void) +{ + printf("error: NSD was compiled without SSL.\n"); + return 1; +} +#endif /* HAVE_SSL */ diff --git a/usr.sbin/nsd/nsd-mem.c b/usr.sbin/nsd/nsd-mem.c new file mode 100644 index 00000000000..0981eafef73 --- /dev/null +++ b/usr.sbin/nsd/nsd-mem.c @@ -0,0 +1,360 @@ +/* + * nsd-mem.c -- nsd-mem(8) + * + * Copyright (c) 2013, NLnet Labs. All rights reserved. + * + * See LICENSE for the license. + * + */ + +#include "config.h" + +#include <assert.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> +#include <unistd.h> +#include <errno.h> + +#include "nsd.h" +#include "tsig.h" +#include "options.h" +#include "namedb.h" +#include "udb.h" +#include "udbzone.h" +#include "util.h" + +static void error(const char *format, ...) ATTR_FORMAT(printf, 1, 2); + +/* + * Print the help text. + * + */ +static void +usage (void) +{ + fprintf(stderr, "Usage: nsd-mem [-c configfile]\n"); + fprintf(stderr, "Version %s. Report bugs to <%s>.\n", + PACKAGE_VERSION, PACKAGE_BUGREPORT); +} + +/* + * Something went wrong, give error messages and exit. + * + */ +static void +error(const char *format, ...) 
+{ + va_list args; + va_start(args, format); + log_vmsg(LOG_ERR, format, args); + va_end(args); + exit(1); +} + +/* zone memory structure */ +struct zone_mem { + /* size of data (allocated in db.region) */ + size_t data; + /* unused space (in db.region) due to alignment */ + size_t data_unused; + /* udb data allocated */ + size_t udb_data; + /* udb overhead (chunk2**x - data) */ + size_t udb_overhead; + + /* count of number of domains */ + size_t domaincount; +}; + +/* total memory structure */ +struct tot_mem { + /* size of data (allocated in db.region) */ + size_t data; + /* unused space (in db.region) due to alignment */ + size_t data_unused; + /* udb data allocated */ + size_t udb_data; + /* udb overhead (chunk2**x - data) */ + size_t udb_overhead; + + /* count of number of domains */ + size_t domaincount; + + /* options data */ + size_t opt_data; + /* unused in options region */ + size_t opt_unused; + /* dname compression table */ + size_t compresstable; +#ifdef RATELIMIT + /* size of rrl tables */ + size_t rrl; +#endif + + /* total ram usage */ + size_t ram; + /* total nsd.db disk usage */ + size_t disk; +}; + +static void +account_zone(struct namedb* db, struct zone_mem* zmem) +{ + zmem->data = region_get_mem(db->region); + zmem->data_unused = region_get_mem_unused(db->region); + zmem->udb_data = (size_t)db->udb->alloc->disk->stat_data; + zmem->udb_overhead = (size_t)(db->udb->alloc->disk->stat_alloc - + db->udb->alloc->disk->stat_data); + zmem->domaincount = db->domains->nametree->count; +} + +static void +pretty_mem(size_t x, const char* s) +{ + char buf[32]; + memset(buf, 0, sizeof(buf)); + if(snprintf(buf, sizeof(buf), "%12lld", (long long)x) > 12) { + printf("%12lld %s\n", (long long)x, s); + return; + } + printf("%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c %s\n", + buf[0], buf[1], buf[2], (buf[2]==' '?' ':'.'), + buf[3], buf[4], buf[5], (buf[5]==' '?' ':'.'), + buf[6], buf[7], buf[8], (buf[8]==' '?' 
':'.'), + buf[9], buf[10], buf[11], s); +} + +static void +print_zone_mem(struct zone_mem* z) +{ + pretty_mem(z->data, "zone data"); + pretty_mem(z->data_unused, "zone unused space (due to alignment)"); + pretty_mem(z->udb_data, "data in nsd.db"); + pretty_mem(z->udb_overhead, "overhead in nsd.db"); +} + +static void +account_total(nsd_options_t* opt, struct tot_mem* t) +{ + t->opt_data = region_get_mem(opt->region); + t->opt_unused = region_get_mem_unused(opt->region); + t->compresstable = sizeof(uint16_t) * + (t->domaincount + 1 + EXTRA_DOMAIN_NUMBERS); + t->compresstable *= opt->server_count; + +#ifdef RATELIMIT +#define SIZE_RRL_BUCKET (8 + 4 + 4 + 4 + 4 + 2) + t->rrl = opt->rrl_size * SIZE_RRL_BUCKET; + t->rrl *= opt->server_count; +#endif + + t->ram = t->data + t->data_unused + t->opt_data + t->opt_unused + + t->compresstable; +#ifdef RATELIMIT + t->ram += t->rrl; +#endif + t->disk = t->udb_data + t->udb_overhead; +} + +static void +print_tot_mem(struct tot_mem* t) +{ + printf("\ntotal\n"); + pretty_mem(t->data, "data"); + pretty_mem(t->data_unused, "unused space (due to alignment)"); + pretty_mem(t->opt_data, "options"); + pretty_mem(t->opt_unused, "options unused space (due to alignment)"); + pretty_mem(t->compresstable, "name table (depends on servercount)"); +#ifdef RATELIMIT + pretty_mem(t->rrl, "RRL table (depends on servercount)"); +#endif + pretty_mem(t->udb_data, "data in nsd.db"); + pretty_mem(t->udb_overhead, "overhead in nsd.db"); + printf("\nsummary\n"); + + pretty_mem(t->ram, "ram usage (excl space for buffers)"); + pretty_mem(t->disk, "disk usage (excl 12% space claimed for growth)"); +} + +static void +add_mem(struct tot_mem* t, struct zone_mem* z) +{ + t->data += z->data; + t->data_unused += z->data_unused; + t->udb_data += z->udb_data; + t->udb_overhead += z->udb_overhead; + t->domaincount += z->domaincount; +} + +static void +check_zone_mem(const char* tf, const char* df, zone_options_t* zo, + nsd_options_t* opt, struct tot_mem* totmem) +{ + struct namedb* db; + const dname_type* dname = (const dname_type*)zo->node.key; + zone_type* zone; + struct udb_base* taskudb; + udb_ptr last_task; + struct zone_mem zmem; + + printf("zone %s\n", zo->name); + + /* init*/ + memset(&zmem, 0, sizeof(zmem)); + db = namedb_open(df, opt); + if(!db) error("cannot open %s: %s", df, strerror(errno)); + zone = namedb_zone_create(db, dname, zo); + taskudb = udb_base_create_new(tf, &namedb_walkfunc, NULL); + udb_ptr_init(&last_task, taskudb); + + /* read the zone */ + namedb_read_zonefile(db, zone, taskudb, &last_task); + + /* account the memory for this zone */ + account_zone(db, &zmem); + + /* pretty print the memory for this zone */ + print_zone_mem(&zmem); + + /* delete the zone from memory */ + namedb_close(db); + udb_base_free(taskudb); + unlink(df); + unlink(tf); + + /* add up totals */ + add_mem(totmem, &zmem); +} + +static void +check_mem(nsd_options_t* opt) +{ + struct tot_mem totmem; + zone_options_t* zo; + char tf[512]; + char df[512]; + memset(&totmem, 0, sizeof(totmem)); + snprintf(tf, sizeof(tf), "./nsd-mem-task-%u.db", (unsigned)getpid()); + snprintf(df, sizeof(df), "./nsd-mem-db-%u.db", (unsigned)getpid()); + + /* read all zones and account memory */ + RBTREE_FOR(zo, zone_options_t*, opt->zone_options) { + check_zone_mem(tf, df, zo, opt, &totmem); + } + + /* calculate more total statistics */ + account_total(opt, &totmem); + /* print statistics */ + print_tot_mem(&totmem); + + /* final advice */ + printf("\nFinal advice estimate:\n"); + printf("(The partial mmap causes 
reload&AXFR to take longer(disk access))\n"); + pretty_mem(totmem.ram + totmem.disk, "data and big mmap"); + pretty_mem(totmem.ram + totmem.disk/6, "data and partial mmap"); +} + +/* dummy functions to link */ +struct nsd; +int writepid(struct nsd * ATTR_UNUSED(nsd)) +{ + return 0; +} +void unlinkpid(const char * ATTR_UNUSED(file)) +{ +} +void bind8_stats(struct nsd * ATTR_UNUSED(nsd)) +{ +} + +void sig_handler(int ATTR_UNUSED(sig)) +{ +} + +extern char *optarg; +extern int optind; + +int +main(int argc, char *argv[]) +{ + /* Scratch variables... */ + int c; + struct nsd nsd; + const char *configfile = CONFIGFILE; + memset(&nsd, 0, sizeof(nsd)); + + log_init("nsd-mem"); + + /* Parse the command line... */ + while ((c = getopt(argc, argv, "c:h" + )) != -1) { + switch (c) { + case 'c': + configfile = optarg; + break; + case 'h': + usage(); + exit(0); + case '?': + default: + usage(); + exit(1); + } + } + argc -= optind; + argv += optind; + + /* Commandline parse error */ + if (argc != 0) { + usage(); + exit(1); + } + + /* Read options */ + nsd.options = nsd_options_create(region_create_custom(xalloc, free, + DEFAULT_CHUNK_SIZE, DEFAULT_LARGE_OBJECT_SIZE, + DEFAULT_INITIAL_CLEANUP_SIZE, 1)); + tsig_init(nsd.options->region); + if(!parse_options_file(nsd.options, configfile, NULL, NULL)) { + error("could not read config: %s\n", configfile); + } + if(!parse_zone_list_file(nsd.options)) { + error("could not read zonelist file %s\n", + nsd.options->zonelistfile); + } + if (verbosity == 0) + verbosity = nsd.options->verbosity; + +#ifdef HAVE_CHROOT + if(nsd.chrootdir == 0) nsd.chrootdir = nsd.options->chroot; +#ifdef CHROOTDIR + /* if still no chrootdir, fallback to default */ + if(nsd.chrootdir == 0) nsd.chrootdir = CHROOTDIR; +#endif /* CHROOTDIR */ +#endif /* HAVE_CHROOT */ + if(nsd.options->zonesdir && nsd.options->zonesdir[0]) { + if(chdir(nsd.options->zonesdir)) { + error("cannot chdir to '%s': %s", + nsd.options->zonesdir, strerror(errno)); + } + DEBUG(DEBUG_IPC,1, (LOG_INFO, "changed directory to %s", + nsd.options->zonesdir)); + } + + /* Chroot */ +#ifdef HAVE_CHROOT + if (nsd.chrootdir && strlen(nsd.chrootdir)) { + if(chdir(nsd.chrootdir)) { + error("unable to chdir to chroot: %s", strerror(errno)); + } + DEBUG(DEBUG_IPC,1, (LOG_INFO, "changed root directory to %s", + nsd.chrootdir)); + } +#endif /* HAVE_CHROOT */ + + check_mem(nsd.options); + + exit(0); +} diff --git a/usr.sbin/nsd/nsd.conf.sample.in b/usr.sbin/nsd/nsd.conf.sample.in index fe1a4874c5c..002d40e7065 100644 --- a/usr.sbin/nsd/nsd.conf.sample.in +++ b/usr.sbin/nsd/nsd.conf.sample.in @@ -8,10 +8,15 @@ # This is a comment. # Sample configuration file +# include: "file" # include that file's text over here. # options for the nsd server server: - # uncomment to specify specific interfaces to bind (default wildcard interface). + # Number of NSD servers to fork. Put the number of CPUs to use here. + # server-count: 1 + + # uncomment to specify specific interfaces to bind (default are the + # wildcard interfaces 0.0.0.0 and ::0). # ip-address: 1.2.3.4 # ip-address: 1.2.3.4@5678 # ip-address: 12fe::8ef0 @@ -19,36 +24,65 @@ server: # Allow binding to non local addresses. Default no. # ip-transparent: no - # don't answer VERSION.BIND and VERSION.SERVER CHAOS class queries - # hide-version: no - # enable debug mode, does not fork daemon process into the background. 
# debug-mode: no - # listen only on IPv4 connections - # ip4-only: no + # listen on IPv4 connections + # do-ip4: yes + + # listen on IPv6 connections + # do-ip6: yes + + # port to answer queries on. default is 53. + # port: 53 + + # Verbosity level. + # verbosity: 0 - # listen only on IPv6 connections - # ip6-only: no + # After binding socket, drop user privileges. + # can be a username, id or id.gid. + # username: @user@ + + # Run NSD in a chroot-jail. + # make sure to have pidfile and database reachable from there. + # by default, no chroot-jail is used. + # chroot: "@configdir@" + + # The directory for zonefile: files. The daemon chdirs here. + # zonesdir: "@zonesdir@" + # the list of dynamically added zones. + # zonelistfile: "@zonelistfile@" + # the database to use # database: "@dbfile@" + # log messages to file. Default to stderr and syslog (with + # facility LOG_DAEMON). stderr disappears when daemon goes to bg. + # logfile: "@logfile@" + + # File to store pid for nsd in. + # pidfile: "@pidfile@" + + # The file where secondary zone refresh and expire timeouts are kept. + # If you delete this file, all secondary zones are forced to be + # 'refreshing' (as if nsd got a notify). + # xfrdfile: "@xfrdfile@" + + # The directory where zone transfers are stored, in a subdir of it. + # xfrdir: "@xfrdir@" + + # don't answer VERSION.BIND and VERSION.SERVER CHAOS class queries + # hide-version: no + # identify the server (CH TXT ID.SERVER entry). # identity: "unidentified server" # NSID identity (hex string). default disabled. # nsid: "aabbccdd" - # log messages to file. Default to stderr and syslog (with facility LOG_DAEMON). - # logfile: "@logfile@" - - # Number of NSD servers to fork. - # server-count: 1 - # Maximum number of concurrent TCP connections per server. - # This option should have a value below 1000. - # tcp-count: 10 + # tcp-count: 100 # Maximum number of queries served on a single TCP connection. # By default 0, which means no maximum. @@ -63,44 +97,14 @@ server: # Preferred EDNS buffer size for IPv6. # ipv6-edns-size: 4096 - # File to store pid for nsd in. - # pidfile: "@pidfile@" - - # port to answer queries on. default is 53. - # port: 53 - - # statistics are produced every number of seconds. + # statistics are produced every number of seconds. Prints to log. # statistics: 3600 - # if per zone statistics is enabled, file to store statistics. - # zone-stats-file: "@zonestatsfile@" - - # Run NSD in a chroot-jail. - # make sure to have pidfile and database reachable from there. - # by default, no chroot-jail is used. - # chroot: "@configdir@" - - # After binding socket, drop user privileges. - # can be a username, id or id.gid. - # username: @user@ - - # The directory for zonefile: files. - # zonesdir: "@zonesdir@" - - # The file where incoming zone transfers are stored. - # run nsd-patch to update zone files, then you can safely delete it. - # difffile: "@difffile@" - - # The file where secondary zone refresh and expire timeouts are kept. - # If you delete this file, all secondary zones are forced to be - # 'refreshing' (as if nsd got a notify). - # xfrdfile: "@xfrdfile@" - # Number of seconds between reloads triggered by xfrd. - # xfrd-reload-timeout: 10 + # xfrd-reload-timeout: 1 - # Verbosity level. - # verbosity: 0 + # check mtime of all zone files on start and sighup + # zonefiles-check: yes # RRLconfig # Response Rate Limiting, size of the hashtable. Default 1000000. 
@@ -132,79 +136,113 @@ server: # rrl-whitelist-ratelimit: 2000 # RRLend -# key for zone 1 -key: - name: mskey - algorithm: hmac-md5 - secret: "K2tf3TRjvQkVCmJF3/Z9vA==" +# Remote control config section. +remote-control: + # Enable remote control with nsd-control(8) here. + # set up the keys and certificates with nsd-control-setup. + # control-enable: no -# Sample zone 1 -zone: - name: "example.com" - zonefile: "example.com.zone" + # what interfaces are listened to for control, default is on localhost. + # control-interface: 127.0.0.1 + # control-interface: ::1 - # This is a slave zone. Masters are listed below. - # If no access control elements are provided, this zone - # will not be served to/from other servers. + # port number for remote control operations (uses TLS over TCP). + # control-port: 8952 - # master 1 - allow-notify: 168.192.44.42 mskey - request-xfr: 168.192.44.42 mskey + # nsd server key file for remote control. + # server-key-file: "@configdir@/nsd_server.key" - # master 2 - allow-notify: 10.0.0.11 NOKEY - request-xfr: 10.0.0.11 NOKEY + # nsd server certificate file for remote control. + # server-cert-file: "@configdir@/nsd_server.pem" - # By default, a slave will request a zone transfer with IXFR/TCP. - # If you want to make use of IXFR/UDP use - allow-notify: 10.0.0.12 NOKEY - request-xfr: UDP 10.0.0.12 NOKEY + # nsd-control key file. + # control-key-file: "@configdir@/nsd_control.key" - # for a master that only speaks AXFR (like NSD) use - allow-notify: 10.0.0.13 NOKEY - request-xfr: AXFR 10.0.0.13 NOKEY + # nsd-control certificate file. + # control-cert-file: "@configdir@/nsd_control.pem" - # Attention: You cannot use UDP and AXFR together. AXFR is always over - # TCP. If you use UDP, we higly recommend you to deploy TSIG. - # Allow AXFR fallback if the master does not support IXFR. Default - # is yes. - allow-axfr-fallback: "yes" +# Secret keys for TSIGs that secure zone transfers. +# You could include: "secret.keys" and put the 'key:' statements in there, +# and give that file special access control permissions. +# +# key: + # The key name is sent to the other party, it must be the same + #name: "keyname" + # algorithm hmac-md5, or hmac-sha1, or hmac-sha256 (if compiled in) + #algorithm: hmac-sha256 + # secret material, must be the same as the other party uses. + # base64 encoded random number. + # e.g. from dd if=/dev/random of=/dev/stdout count=1 bs=32 | base64 + #secret: "K2tf3TRjvQkVCmJF3/Z9vA==" + + +# Patterns have zone configuration and they are shared by one or more zones. +# +# pattern: + # name by which the pattern is referred to + #name: "myzones" + # the zonefile for the zones that use this pattern. + # if relative then from the zonesdir (inside the chroot). + # the name is processed: %s - zone name (as appears in zone:name). + # %1 - first character of zone name, %2 second, %3 third. + # %z - topleveldomain label of zone, %y, %x next labels in name. + # if label or character does not exist you get a dot '.'. + # for example "%s.zone" or "zones/%1/%2/%3/%s" or "secondary/%z/%s" + #zonefile: "%s.zone" + + # If no master and slave access control elements are provided, + # this zone will not be served to/from other servers. + + # A master zone needs notify: and provide-xfr: lists. A slave + # may also allow zone transfer (for debug or other secondaries). + # notify these slaves when the master zone changes, address TSIG|NOKEY + # IP can be ipv4 and ipv6, with @port for a nondefault port number. 
+ #notify: 192.0.2.1 NOKEY + # allow these IPs and TSIG to transfer zones, addr TSIG|NOKEY|BLOCKED + # address range 192.0.2.0/24, 1.2.3.4&255.255.0.0, 3.0.2.20-3.0.2.40 + #provide-xfr: 192.0.2.0/24 my_tsig_key_name + # set the number of retries for notify. + #notify-retry: 5 # uncomment to provide AXFR to all the world # provide-xfr: 0.0.0.0/0 NOKEY # provide-xfr: ::0/0 NOKEY + # A slave zone needs allow-notify: and request-xfr: lists. + #allow-notify: 2001:db8::0/64 my_tsig_key_name + # By default, a slave will request a zone transfer with IXFR/TCP. + # If you want to make use of IXFR/UDP use: UDP addr tsigkey + # for a master that only speaks AXFR (like NSD) use AXFR addr tsigkey + #request-xfr: 192.0.2.2 the_tsig_key_name + # Attention: You cannot use UDP and AXFR together. AXFR is always over + # TCP. If you use UDP, we higly recommend you to deploy TSIG. + # Allow AXFR fallback if the master does not support IXFR. Default + # is yes. + #allow-axfr-fallback: yes # set local interface for sending zone transfer requests. - outgoing-interface: 10.0.0.10 - -# Sample zone 2 -zone: - name: "example.net" - zonefile: "example.net.signed.zone" - - # This is a master zone. Slaves are listed below. - # If no access control elements are provided, this zone - # will not be served to/from other servers. + # default is let the OS choose. + #outgoing-interface: 10.0.0.10 - # secondary 1. Uses port 5300. - notify: 10.0.0.14@5300 sec1_key - provide-xfr: 10.0.0.14@5300 sec1_key + # if you give another pattern name here, at this point the settings + # from that pattern are inserted into this one (as if it were a + # macro). The statement can be given in between other statements, + # because the order of access control elements can make a difference + # (which master to request from first, which slave to notify first). + #include-pattern: "common-masters" - # secondary 2. - notify: 10.11.12.14 sec2_key - provide-xfr: 10.11.12.14 sec2_key - - # also provide xfr to operator's network. - provide-xfr: 169.192.85.0/24 NOKEY - # uncomment to disable xfr for the address. - # provide-xfr: 169.192.85.66 BLOCKED - - # set the number of retries for notify. - notify-retry: 5 - # set local interface for sending notifies - outgoing-interface: 10.0.0.15 +# Fixed zone entries. Here you can config zones that cannot be deleted. +# Zones that are dynamically added and deleted are put in the zonelist file. +# +# zone: + # name: "example.com" + # you can give a pattern here, all the settings from that pattern + # are then inserted at this point + # include-pattern: "master" + # You can also specify (additional) options directly for this zone. + # zonefile: "example.com.zone" + # request-xfr: 192.0.2.1 example.com.key # RRLconfig # Response Rate Limiting, whitelist types @@ -220,20 +258,3 @@ zone: # rrl-whitelist: all # RRLend - -# keys for zone 2 -key: - name: "sec1_key" - algorithm: hmac-md5 - secret: "6KM6qiKfwfEpamEq72HQdA==" - -key: - name: sec2_key - algorithm: hmac-sha1 - secret: "m83H2x8R0zbDf3yRKhrqgw==" - -key: - name: sec3_key - algorithm: hmac-sha256 - secret: "m83H2x8R0zbDf3yRKhrqgw==" - diff --git a/usr.sbin/nsd/nsd.h b/usr.sbin/nsd/nsd.h index 2dd4676937e..955fc4fbae2 100644 --- a/usr.sbin/nsd/nsd.h +++ b/usr.sbin/nsd/nsd.h @@ -1,7 +1,7 @@ /* * nsd.h -- nsd(8) definitions and prototypes * - * Copyright (c) 2001-2011, NLnet Labs. All rights reserved. + * Copyright (c) 2001-2006, NLnet Labs. All rights reserved. * * See LICENSE for the license. 
* @@ -14,9 +14,10 @@ #include "dns.h" #include "edns.h" -#include "util.h" struct netio_handler; struct nsd_options; +struct udb_base; +struct daemon_remote; /* The NSD runtime states and NSD ipc command values */ #define NSD_RUN 0 @@ -26,42 +27,38 @@ struct nsd_options; #define NSD_REAP_CHILDREN 4 #define NSD_QUIT 5 /* - * NSD_SOA_INFO is followed by u16(len in network byte order), dname, - * and then nothing (no info) or soa info. - */ -#define NSD_SOA_INFO 6 -/* * PASS_TO_XFRD is followed by the u16(len in network order) and * then network packet contents. packet is a notify(acl checked), or * xfr reply from a master(acl checked). * followed by u32(acl number that matched from notify/xfr acl). */ -#define NSD_PASS_TO_XFRD 7 +#define NSD_PASS_TO_XFRD 6 /* - * NSD_ZONE_STATE is followed by u16(len in network byte order), - * octet 0: zone is expired, 1: zone ok. and dname of zone. + * RELOAD_REQ is sent when parent receives a SIGHUP and tells + * xfrd that it wants to initiate a reload (and thus task swap). */ -#define NSD_ZONE_STATE 8 +#define NSD_RELOAD_REQ 7 /* - * SOA BEGIN is sent at the start of a reload SOA_INFO pass - * xfrd will not send to the parent (deadlock prevention). - */ -#define NSD_SOA_BEGIN 9 -/* - * SOA END is sent at the end of a reload SOA_INFO pass. + * RELOAD_DONE is sent at the end of a reload pass. * xfrd then knows that reload phase is over. */ -#define NSD_SOA_END 10 +#define NSD_RELOAD_DONE 8 /* * QUIT_SYNC is sent to signify a synchronisation of ipc * channel content during reload */ -#define NSD_QUIT_SYNC 11 +#define NSD_QUIT_SYNC 9 +/* + * QUIT_WITH_STATS is sent during a reload when BIND8_STATS is defined, + * from parent to children. The stats are transferred too from child to + * parent with this commandvalue, when the child is exiting. + */ +#define NSD_QUIT_WITH_STATS 10 /* * QUIT_CHILD is sent at exit, to make sure the child has exited so that * port53 is free when all of nsd's processes have exited at shutdown time */ -#define NSD_QUIT_CHILD 12 +#define NSD_QUIT_CHILD 11 #define NSD_SERVER_MAIN 0x0U #define NSD_SERVER_UDP 0x1U @@ -76,29 +73,20 @@ struct nsd_options; #ifdef BIND8_STATS -#define LASTELEM(arr) (sizeof(arr) / sizeof(arr[0]) - 1) - -#define STATUP(nsd, stc) nsd->st.stc++ -#define STATUP2(nsd, stc, i) nsd->st.stc[(i) <= (LASTELEM(nsd->st.stc) - 1) ? i : LASTELEM(nsd->st.stc)]++ - -# ifdef USE_ZONE_STATS +/* Counter for statistics */ +typedef unsigned long stc_t; -# define ZTATUP(zone, stc) zone->st.stc++ -# define ZTATUP2(zone, stc, i) zone->st.stc[(i) <= (LASTELEM(zone->st.stc) - 1) ? i : LASTELEM(zone->st.stc)]++ - -# else - -# define ZTATUP(zone, stc) /* Nothing */ -# define ZTATUP2(zone, stc, i) /* Nothing */ +#define LASTELEM(arr) (sizeof(arr) / sizeof(arr[0]) - 1) -# endif /* USE_ZONE_STATS */ +#define STATUP(nsd, stc) nsd->st.stc++ +/* #define STATUP2(nsd, stc, i) ((i) <= (LASTELEM(nsd->st.stc) - 1)) ? nsd->st.stc[(i)]++ : \ + nsd->st.stc[LASTELEM(nsd->st.stc)]++ */ -#else /* BIND8_STATS */ +#define STATUP2(nsd, stc, i) nsd->st.stc[(i) <= (LASTELEM(nsd->st.stc) - 1) ? 
i : LASTELEM(nsd->st.stc)]++ +#else /* BIND8_STATS */ #define STATUP(nsd, stc) /* Nothing */ #define STATUP2(nsd, stc, i) /* Nothing */ -#define ZTATUP(zone, stc) /* Nothing */ -#define ZTATUP2(zone, stc, i) /* Nothing */ #endif /* BIND8_STATS */ @@ -133,12 +121,15 @@ struct nsd_child */ uint8_t need_to_send_STATS, need_to_send_QUIT; uint8_t need_to_exit, has_exited; - stack_type* dirty_zones; /* stack of type zone_type* */ /* * The handler for handling the commands from the child. */ struct netio_handler* handler; + +#ifdef BIND8_STATS + stc_t query_count; +#endif }; /* NSD configuration and run-time variables */ @@ -153,6 +144,7 @@ struct nsd /* Run-time variables */ pid_t pid; volatile sig_atomic_t mode; + volatile sig_atomic_t signal_hint_reload_hup; volatile sig_atomic_t signal_hint_reload; volatile sig_atomic_t signal_hint_child; volatile sig_atomic_t signal_hint_quit; @@ -170,12 +162,15 @@ struct nsd /* NULL if this is the parent process. */ struct nsd_child *this_child; + /* mmaps with data exchange from xfrd and reload */ + struct udb_base* task[2]; + int mytask; /* the base used by this process */ + struct netio_handler* xfrd_listener; + struct daemon_remote* rc; + /* Configuration */ const char *dbfile; const char *pidfile; -#ifdef USE_ZONE_STATS - const char *zonestatsfile; -#endif const char *log_filename; const char *username; uid_t uid; @@ -210,7 +205,20 @@ struct nsd size_t ipv6_edns_size; #ifdef BIND8_STATS - struct nsdst st; + + struct nsdst { + time_t boot; + int period; /* Produce statistics dump every st_period seconds */ + stc_t qtype[257]; /* Counters per qtype */ + stc_t qclass[4]; /* Class IN or Class CH or other */ + stc_t qudp, qudp6; /* Number of queries udp and udp6 */ + stc_t ctcp, ctcp6; /* Number of tcp and tcp6 connections */ + stc_t rcode[17], opcode[6]; /* Rcodes & opcodes */ + /* Dropped, truncated, queries for nonconfigured zone, tx errors */ + stc_t dropped, truncated, wrongzone, txerr, rxerr; + stc_t edns, ednserr, raxfr, nona; + uint64_t db_disk, db_mem; + } st; #endif /* BIND8_STATS */ struct nsd_options* options; @@ -230,7 +238,16 @@ void server_main(struct nsd *nsd); void server_child(struct nsd *nsd); void server_shutdown(struct nsd *nsd); void server_close_all_sockets(struct nsd_socket sockets[], size_t n); +struct event_base* nsd_child_event_base(void); /* extra domain numbers for temporary domains */ #define EXTRA_DOMAIN_NUMBERS 1024 +#define SLOW_ACCEPT_TIMEOUT 2 /* in seconds */ +/* allocate and init xfrd variables */ +void server_prepare_xfrd(struct nsd *nsd); +/* start xfrdaemon (again) */ +void server_start_xfrd(struct nsd *nsd, int del_db, int reload_active); +/* send SOA serial numbers to xfrd */ +void server_send_soa_xfrd(struct nsd *nsd, int shortsoa); +ssize_t block_read(struct nsd* nsd, int s, void* p, ssize_t sz, int timeout); #endif /* _NSD_H_ */ diff --git a/usr.sbin/nsd/nsec3.h b/usr.sbin/nsd/nsec3.h index d55b4825394..96c4367ff33 100644 --- a/usr.sbin/nsd/nsec3.h +++ b/usr.sbin/nsd/nsec3.h @@ -1,7 +1,7 @@ /* * nsec3.h -- nsec3 handling. * - * Copyright (c) 2001-2011, NLnet Labs. All rights reserved. + * Copyright (c) 2001-2006, NLnet Labs. All rights reserved. * * See LICENSE for the license. 
* @@ -9,9 +9,8 @@ #ifndef NSEC3_H #define NSEC3_H -#include "config.h" #ifdef NSEC3 - +struct udb_ptr; struct domain; struct dname; struct region; @@ -19,40 +18,23 @@ struct zone; struct namedb; struct query; struct answer; -#ifndef FULL_PREHASH struct rr; -struct nsec3_domain; -#endif /* - * Create the hashed name of the nsec3 record - * for the given dname. + * calculate prehash information for zone. */ -const struct dname *nsec3_hash_dname(struct region *region, - struct zone *zone, const struct dname *dname); - +void prehash_zone(struct namedb* db, struct zone* zone); /* - * calculate prehash information for all zones, - * selects only updated=1 zones if bool set. + * calculate prehash for zone, assumes no partial precompile or prehashlist */ -void prehash(struct namedb* db, int updated_only); -#ifndef FULL_PREHASH -void prehash_zone(struct namedb *db, struct zone *zone); -void prehash_zone_incremental(struct namedb *db, struct zone *zone); -#endif +void prehash_zone_complete(struct namedb* db, struct zone* zone); /* - * finds nsec3 that covers the given domain dname. + * finds nsec3 that covers the given domain hash. * returns true if the find is exact. - * hashname is the already hashed dname for the NSEC3. */ -#ifdef FULL_PREHASH -int nsec3_find_cover(struct namedb* db, struct zone* zone, - const struct dname* hashname, struct domain** result); -#else -int nsec3_find_cover(struct namedb* ATTR_UNUSED(db), struct zone* zone, - const struct dname* hashname, struct nsec3_domain** result); -#endif +int nsec3_find_cover(struct zone* zone, uint8_t* hash, size_t hashlen, + struct domain** result); /* * _answer_ Routines used to add the correct nsec3 record to a query answer. @@ -62,9 +44,8 @@ int nsec3_find_cover(struct namedb* ATTR_UNUSED(db), struct zone* zone, * add proof for wildcards that the name below the wildcard.parent * does not exist */ -void nsec3_answer_wildcard(struct query *query, struct answer *answer, - struct domain *wildcard, struct namedb* db, - const struct dname *qname); +void nsec3_answer_wildcard(struct query* query, struct answer* answer, + struct domain* wildcard, const struct dname* qname); /* * add NSEC3 to provide domain name but not rrset exists, @@ -84,7 +65,7 @@ void nsec3_answer_delegation(struct query *query, struct answer *answer); */ void nsec3_answer_authoritative(struct domain** match, struct query *query, struct answer *answer, struct domain* closest_encloser, - struct namedb* db, const struct dname* qname); + const struct dname* qname); /* * True if domain is a NSEC3 (+RRSIG) data only variety. 
@@ -92,5 +73,48 @@ void nsec3_answer_authoritative(struct domain** match, struct query *query, */ int domain_has_only_NSEC3(struct domain* domain, struct zone* zone); +/* get hashed bytes */ +void nsec3_hash_and_store(struct zone* zone, const struct dname* dname, + uint8_t* store); +/* see if NSEC3 record uses the params in use for the zone */ +int nsec3_rr_uses_params(struct rr* rr, struct zone* zone); +/* number of NSEC3s that are in the zone chain */ +int nsec3_in_chain_count(struct domain* domain, struct zone* zone); +/* find previous NSEC3, or, lastinzone, or, NULL */ +struct domain* nsec3_chain_find_prev(struct zone* zone, struct domain* domain); +/* clear nsec3 precompile for the zone */ +void nsec3_clear_precompile(struct namedb* db, struct zone* zone); +/* if domain is part of nsec3hashed domains of a zone */ +int nsec3_domain_part_of_zone(struct domain* d, struct zone* z); +/* condition when a domain is precompiled */ +int nsec3_condition_hash(struct domain* d, struct zone* z); +/* condition when a domain is ds precompiled */ +int nsec3_condition_dshash(struct domain* d, struct zone* z); +/* set nsec3param for this zone or NULL if no NSEC3 available */ +void nsec3_find_zone_param(struct namedb* db, struct zone* zone, + struct udb_ptr* z); +/* hash domain and wcchild, and lookup nsec3 in tree, and precompile */ +void nsec3_precompile_domain(struct namedb* db, struct domain* domain, + struct zone* zone, struct region* tmpregion); +/* hash ds_parent_cover, and lookup nsec3 and precompile */ +void nsec3_precompile_domain_ds(struct namedb* db, struct domain* domain, + struct zone* zone); +/* put nsec3 into nsec3tree and adjust zonelast */ +void nsec3_precompile_nsec3rr(struct namedb* db, struct domain* domain, + struct zone* zone); +/* precompile entire zone, assumes all is null at start */ +void nsec3_precompile_newparam(struct namedb* db, struct zone* zone); +/* create b32.zone for a hash, allocated in the region */ +const struct dname* nsec3_b32_create(struct region* region, struct zone* zone, + unsigned char* hash); +/* create trees for nsec3 updates and lookups in zone */ +void nsec3_zone_trees_create(struct region* region, struct zone* zone); +/* clear trees for nsec3 in zone */ +void nsec3_hash_tree_clear(struct zone* zone); +/* lookup zone that contains domain's nsec3 trees */ +struct zone* nsec3_tree_zone(struct namedb* db, struct domain* domain); +/* lookup zone that contains domain's ds tree */ +struct zone* nsec3_tree_dszone(struct namedb* db, struct domain* domain); + #endif /* NSEC3 */ #endif /* NSEC3_H*/ diff --git a/usr.sbin/nsd/options.c b/usr.sbin/nsd/options.c index 39cfa610864..221a0f7eb35 100644 --- a/usr.sbin/nsd/options.c +++ b/usr.sbin/nsd/options.c @@ -1,7 +1,7 @@ /* * options.c -- options functions. * - * Copyright (c) 2001-2011, NLnet Labs. All rights reserved. + * Copyright (c) 2001-2006, NLnet Labs. All rights reserved. * * See LICENSE for the license. 
* @@ -13,10 +13,11 @@ #include "options.h" #include "query.h" #include "tsig.h" +#include "difffile.h" #include "rrl.h" #include "configyyrename.h" -nsd_options_t* nsd_options = 0; +#include "configparser.h" config_parser_state_t* cfg_parser = 0; extern FILE* c_in, *c_out; int c_parse(void); @@ -24,28 +25,36 @@ int c_lex(void); int c_wrap(void); void c_error(const char *message); -nsd_options_t* nsd_options_create(region_type* region) +static int +rbtree_strcmp(const void* p1, const void* p2) +{ + return strcmp((const char*)p1, (const char*)p2); +} + +nsd_options_t* +nsd_options_create(region_type* region) { nsd_options_t* opt; opt = (nsd_options_t*)region_alloc(region, sizeof(nsd_options_t)); opt->region = region; opt->zone_options = rbtree_create(region, (int (*)(const void *, const void *)) dname_compare); - opt->keys = NULL; - opt->numkeys = 0; + opt->configfile = NULL; + opt->patterns = rbtree_create(region, rbtree_strcmp); + opt->keys = rbtree_create(region, rbtree_strcmp); opt->ip_addresses = NULL; opt->ip_transparent = 0; opt->debug_mode = 0; opt->verbosity = 0; opt->hide_version = 0; - opt->ip4_only = 0; - opt->ip6_only = 0; + opt->do_ip4 = 1; + opt->do_ip6 = 1; opt->database = DBFILE; opt->identity = 0; opt->nsid = 0; opt->logfile = 0; opt->server_count = 1; - opt->tcp_count = 10; + opt->tcp_count = 100; opt->tcp_query_count = 0; opt->tcp_timeout = TCP_TIMEOUT; opt->ipv4_edns_size = EDNS_MAX_MESSAGE_LEN; @@ -54,17 +63,12 @@ nsd_options_t* nsd_options_create(region_type* region) opt->port = UDP_PORT; /* deprecated? opt->port = TCP_PORT; */ opt->statistics = 0; -#ifdef USE_ZONE_STATS - opt->zonestatsfile = ZONESTATSFILE; -#else - opt->zonestatsfile = 0; -#endif opt->chroot = 0; opt->username = USER; opt->zonesdir = ZONESDIR; - opt->difffile = DIFFFILE; opt->xfrdfile = XFRDFILE; - opt->xfrd_reload_timeout = 10; + opt->xfrdir = XFRDIR; + opt->zonelistfile = ZONELISTFILE; #ifdef RATELIMIT opt->rrl_size = RRL_BUCKETS; opt->rrl_ratelimit = RRL_LIMIT/2; @@ -73,11 +77,20 @@ nsd_options_t* nsd_options_create(region_type* region) opt->rrl_ipv6_prefix_length = RRL_IPV6_PREFIX_LENGTH; opt->rrl_whitelist_ratelimit = RRL_WLIST_LIMIT/2; #endif - nsd_options = opt; + opt->zonefiles_check = 1; + opt->xfrd_reload_timeout = 1; + opt->control_enable = 0; + opt->control_interface = NULL; + opt->control_port = NSD_CONTROL_PORT; + opt->server_key_file = CONFIGDIR"/nsd_server.key"; + opt->server_cert_file = CONFIGDIR"/nsd_server.pem"; + opt->control_key_file = CONFIGDIR"/nsd_control.key"; + opt->control_cert_file = CONFIGDIR"/nsd_control.pem"; return opt; } -int nsd_options_insert_zone(nsd_options_t* opt, zone_options_t* zone) +int +nsd_options_insert_zone(nsd_options_t* opt, zone_options_t* zone) { /* create dname for lookup */ const dname_type* dname = dname_parse(opt->region, zone->name); @@ -89,23 +102,40 @@ int nsd_options_insert_zone(nsd_options_t* opt, zone_options_t* zone) return 1; } -int parse_options_file(nsd_options_t* opt, const char* file) +int +nsd_options_insert_pattern(nsd_options_t* opt, pattern_options_t* pat) +{ + if(!pat->pname) + return 0; + pat->node.key = pat->pname; + if(!rbtree_insert(opt->patterns, (rbnode_t*)pat)) + return 0; + return 1; +} + +int +parse_options_file(nsd_options_t* opt, const char* file, + void (*err)(void*,const char*), void* err_arg) { FILE *in = 0; - zone_options_t* zone; + pattern_options_t* pat; acl_options_t* acl; - if(!cfg_parser) + if(!cfg_parser) { cfg_parser = (config_parser_state_t*)region_alloc( opt->region, sizeof(config_parser_state_t)); + 
cfg_parser->chroot = 0; + } + cfg_parser->err = err; + cfg_parser->err_arg = err_arg; cfg_parser->filename = file; cfg_parser->line = 1; cfg_parser->errors = 0; + cfg_parser->server_settings_seen = 0; cfg_parser->opt = opt; + cfg_parser->current_pattern = 0; cfg_parser->current_zone = 0; - cfg_parser->current_key = opt->keys; - while(cfg_parser->current_key && cfg_parser->current_key->next) - cfg_parser->current_key = cfg_parser->current_key->next; + cfg_parser->current_key = 0; cfg_parser->current_ip_address_option = opt->ip_addresses; while(cfg_parser->current_ip_address_option && cfg_parser->current_ip_address_option->next) cfg_parser->current_ip_address_option = cfg_parser->current_ip_address_option->next; @@ -113,16 +143,34 @@ int parse_options_file(nsd_options_t* opt, const char* file) cfg_parser->current_request_xfr = 0; cfg_parser->current_notify = 0; cfg_parser->current_provide_xfr = 0; - + in = fopen(cfg_parser->filename, "r"); if(!in) { - fprintf(stderr, "Could not open %s: %s\n", file, strerror(errno)); + if(err) { + char m[MAXSYSLOGMSGLEN]; + snprintf(m, sizeof(m), "Could not open %s: %s\n", + file, strerror(errno)); + err(err_arg, m); + } else { + fprintf(stderr, "Could not open %s: %s\n", + file, strerror(errno)); + } return 0; } c_in = in; c_parse(); fclose(in); + opt->configfile = region_strdup(opt->region, file); + if(cfg_parser->current_pattern) { + if(!cfg_parser->current_pattern->pname) + c_error("last pattern has no name"); + else { + if(!nsd_options_insert_pattern(cfg_parser->opt, + cfg_parser->current_pattern)) + c_error("duplicate pattern"); + } + } if(cfg_parser->current_zone) { if(!cfg_parser->current_zone->name) c_error("last zone has no name"); @@ -131,83 +179,499 @@ int parse_options_file(nsd_options_t* opt, const char* file) cfg_parser->current_zone)) c_error("duplicate zone"); } - if(!cfg_parser->current_zone->zonefile) - c_error("last zone has no zonefile"); + if(!cfg_parser->current_zone->pattern) + c_error("last zone has no pattern"); } - if(opt->keys) + if(cfg_parser->current_key) { - if(!opt->keys->name) + if(!cfg_parser->current_key->name) c_error("last key has no name"); - if(!opt->keys->algorithm) + if(!cfg_parser->current_key->algorithm) c_error("last key has no algorithm"); - if(!opt->keys->secret) + if(!cfg_parser->current_key->secret) c_error("last key has no secret blob"); + key_options_insert(opt, cfg_parser->current_key); } - RBTREE_FOR(zone, zone_options_t*, opt->zone_options) + RBTREE_FOR(pat, pattern_options_t*, opt->patterns) { - if(!zone->name) - continue; - if(!zone->zonefile) - continue; /* lookup keys for acls */ - for(acl=zone->allow_notify; acl; acl=acl->next) + for(acl=pat->allow_notify; acl; acl=acl->next) { if(acl->nokey || acl->blocked) continue; acl->key_options = key_options_find(opt, acl->key_name); if(!acl->key_options) - c_error_msg("key %s in zone %s could not be found", - acl->key_name, zone->name); + c_error_msg("key %s in pattern %s could not be found", + acl->key_name, pat->pname); } - for(acl=zone->notify; acl; acl=acl->next) + for(acl=pat->notify; acl; acl=acl->next) { if(acl->nokey || acl->blocked) continue; acl->key_options = key_options_find(opt, acl->key_name); if(!acl->key_options) - c_error_msg("key %s in zone %s could not be found", - acl->key_name, zone->name); + c_error_msg("key %s in pattern %s could not be found", + acl->key_name, pat->pname); } - for(acl=zone->request_xfr; acl; acl=acl->next) + for(acl=pat->request_xfr; acl; acl=acl->next) { if(acl->nokey || acl->blocked) continue; acl->key_options = 
key_options_find(opt, acl->key_name); if(!acl->key_options) - c_error_msg("key %s in zone %s could not be found", - acl->key_name, zone->name); + c_error_msg("key %s in pattern %s could not be found", + acl->key_name, pat->pname); } - for(acl=zone->provide_xfr; acl; acl=acl->next) + for(acl=pat->provide_xfr; acl; acl=acl->next) { if(acl->nokey || acl->blocked) continue; acl->key_options = key_options_find(opt, acl->key_name); if(!acl->key_options) - c_error_msg("key %s in zone %s could not be found", - acl->key_name, zone->name); + c_error_msg("key %s in pattern %s could not be found", + acl->key_name, pat->pname); } } if(cfg_parser->errors > 0) { - fprintf(stderr, "read %s failed: %d errors in configuration file\n", - cfg_parser->filename, - cfg_parser->errors); + if(err) { + char m[MAXSYSLOGMSGLEN]; + snprintf(m, sizeof(m), "read %s failed: %d errors in " + "configuration file\n", cfg_parser->filename, + cfg_parser->errors); + err(err_arg, m); + } else { + fprintf(stderr, "read %s failed: %d errors in " + "configuration file\n", cfg_parser->filename, + cfg_parser->errors); + } return 0; } return 1; } -void c_error_va_list(const char *fmt, va_list args) +#define ZONELIST_HEADER "# NSD zone list\n# name pattern\n" +static int +comp_zonebucket(const void* a, const void* b) +{ + return *(const int*)b - *(const int*)a; +} + +/* insert free entry into zonelist free buckets */ +static void +zone_list_free_insert(nsd_options_t* opt, int linesize, off_t off) +{ + struct zonelist_free* e; + struct zonelist_bucket* b = (struct zonelist_bucket*)rbtree_search( + opt->zonefree, &linesize); + if(!b) { + b = region_alloc_zero(opt->region, sizeof(*b)); + b->linesize = linesize; + b->node = *RBTREE_NULL; + b->node.key = &b->linesize; + rbtree_insert(opt->zonefree, &b->node); + } + e = (struct zonelist_free*)region_alloc_zero(opt->region, sizeof(*e)); + e->next = b->list; + b->list = e; + e->off = off; + opt->zonefree_number++; +} + +zone_options_t* +zone_list_zone_insert(nsd_options_t* opt, const char* nm, const char* patnm, + int linesize, off_t off) +{ + pattern_options_t* pat = pattern_options_find(opt, patnm); + zone_options_t* zone; + if(!pat) { + log_msg(LOG_ERR, "pattern does not exist for zone %s " + "pattern %s", nm, patnm); + return NULL; + } + zone = zone_options_create(opt->region); + zone->part_of_config = 0; + zone->name = region_strdup(opt->region, nm); + zone->linesize = linesize; + zone->off = off; + zone->pattern = pat; + if(!nsd_options_insert_zone(opt, zone)) { + log_msg(LOG_ERR, "bad domain name or duplicate zone '%s' " + "pattern %s", nm, patnm); + region_recycle(opt->region, (void*)zone->name, strlen(nm)+1); + region_recycle(opt->region, zone, sizeof(*zone)); + return NULL; + } + return zone; +} + +int +parse_zone_list_file(nsd_options_t* opt) +{ + /* zonelist looks like this: + # name pattern + add example.com master + del example.net slave + add foo.bar.nl slave + add rutabaga.uk config + */ + char buf[1024]; + + /* create empty data structures */ + opt->zonefree = rbtree_create(opt->region, comp_zonebucket); + opt->zonelist = NULL; + opt->zonefree_number = 0; + opt->zonelist_off = 0; + + /* try to open the zonelist file, an empty or nonexist file is OK */ + opt->zonelist = fopen(opt->zonelistfile, "r+"); + if(!opt->zonelist) { + if(errno == ENOENT) + return 1; /* file does not exist, it is created later */ + log_msg(LOG_ERR, "could not open zone list %s: %s", opt->zonelistfile, + strerror(errno)); + return 0; + } + /* read header */ + buf[strlen(ZONELIST_HEADER)] = 0; + 
if(fread(buf, 1, strlen(ZONELIST_HEADER), opt->zonelist) != + strlen(ZONELIST_HEADER) || strncmp(buf, ZONELIST_HEADER, + strlen(ZONELIST_HEADER)) != 0) { + log_msg(LOG_ERR, "zone list %s contains bad header\n", opt->zonelistfile); + fclose(opt->zonelist); + opt->zonelist = NULL; + return 0; + } + + /* read entries in file */ + while(fgets(buf, sizeof(buf), opt->zonelist)) { + /* skip comments and empty lines */ + if(buf[0] == 0 || buf[0] == '\n' || buf[0] == '#') + continue; + if(strncmp(buf, "add ", 4) == 0) { + int linesize = strlen(buf); + /* parse the 'add' line */ + /* pick last space on the line, so that the domain + * name can have a space in it (but not the pattern)*/ + char* space = strrchr(buf+4, ' '); + char* nm, *patnm; + if(!space) { + /* parse error */ + log_msg(LOG_ERR, "parse error in %s: '%s'", + opt->zonelistfile, buf); + continue; + } + nm = buf+4; + *space = 0; + patnm = space+1; + if(linesize && buf[linesize-1] == '\n') + buf[linesize-1] = 0; + + /* store offset and line size for zone entry */ + /* and create zone entry in zonetree */ + (void)zone_list_zone_insert(opt, nm, patnm, linesize, + ftello(opt->zonelist)-linesize); + } else if(strncmp(buf, "del ", 4) == 0) { + /* store offset and line size for deleted entry */ + int linesize = strlen(buf); + zone_list_free_insert(opt, linesize, + ftello(opt->zonelist)-linesize); + } else { + log_msg(LOG_WARNING, "bad data in %s, '%s'", opt->zonelistfile, + buf); + } + } + /* store EOF offset */ + opt->zonelist_off = ftello(opt->zonelist); + return 1; +} + +void +zone_options_delete(nsd_options_t* opt, zone_options_t* zone) +{ + rbtree_delete(opt->zone_options, zone->node.key); + region_recycle(opt->region, (void*)zone->node.key, dname_total_size( + (dname_type*)zone->node.key)); + region_recycle(opt->region, zone, sizeof(*zone)); +} + +/* add a new zone to the zonelist */ +zone_options_t* +zone_list_add(nsd_options_t* opt, const char* zname, const char* pname) +{ + int r; + struct zonelist_free* e; + struct zonelist_bucket* b; + int linesize = 6 + strlen(zname) + strlen(pname); + /* create zone entry */ + zone_options_t* zone = zone_list_zone_insert(opt, zname, pname, + linesize, 0); + if(!zone) + return NULL; + + /* use free entry or append to file or create new file */ + if(!opt->zonelist || opt->zonelist_off == 0) { + /* create new file */ + if(opt->zonelist) fclose(opt->zonelist); + opt->zonelist = fopen(opt->zonelistfile, "w+"); + if(!opt->zonelist) { + log_msg(LOG_ERR, "could not create zone list %s: %s", + opt->zonelistfile, strerror(errno)); + log_msg(LOG_ERR, "zone %s could not be added", zname); + zone_options_delete(opt, zone); + return NULL; + } + r = fprintf(opt->zonelist, ZONELIST_HEADER); + if(r != strlen(ZONELIST_HEADER)) { + if(r == -1) + log_msg(LOG_ERR, "could not write to %s: %s", + opt->zonelistfile, strerror(errno)); + else log_msg(LOG_ERR, "partial write to %s: disk full", + opt->zonelistfile); + log_msg(LOG_ERR, "zone %s could not be added", zname); + zone_options_delete(opt, zone); + return NULL; + } + zone->off = ftello(opt->zonelist); + if(zone->off == -1) + log_msg(LOG_ERR, "ftello(%s): %s", opt->zonelistfile, strerror(errno)); + r = fprintf(opt->zonelist, "add %s %s\n", zname, pname); + if(r != zone->linesize) { + if(r == -1) + log_msg(LOG_ERR, "could not write to %s: %s", + opt->zonelistfile, strerror(errno)); + else log_msg(LOG_ERR, "partial write to %s: disk full", + opt->zonelistfile); + log_msg(LOG_ERR, "zone %s could not be added", zname); + zone_options_delete(opt, zone); + return NULL; + } 
+ opt->zonelist_off = ftello(opt->zonelist); + if(opt->zonelist_off == -1) + log_msg(LOG_ERR, "ftello(%s): %s", opt->zonelistfile, strerror(errno)); + if(fflush(opt->zonelist) != 0) { + log_msg(LOG_ERR, "fflush %s: %s", opt->zonelistfile, strerror(errno)); + } + return zone; + } + b = (struct zonelist_bucket*)rbtree_search(opt->zonefree, + &zone->linesize); + if(!b || b->list == NULL) { + /* no empty place, append to file */ + zone->off = opt->zonelist_off; + if(fseeko(opt->zonelist, zone->off, SEEK_SET) == -1) { + log_msg(LOG_ERR, "fseeko(%s): %s", opt->zonelistfile, strerror(errno)); + log_msg(LOG_ERR, "zone %s could not be added", zname); + zone_options_delete(opt, zone); + return NULL; + } + r = fprintf(opt->zonelist, "add %s %s\n", zname, pname); + if(r != zone->linesize) { + if(r == -1) + log_msg(LOG_ERR, "could not write to %s: %s", + opt->zonelistfile, strerror(errno)); + else log_msg(LOG_ERR, "partial write to %s: disk full", + opt->zonelistfile); + log_msg(LOG_ERR, "zone %s could not be added", zname); + zone_options_delete(opt, zone); + return NULL; + } + opt->zonelist_off += linesize; + if(fflush(opt->zonelist) != 0) { + log_msg(LOG_ERR, "fflush %s: %s", opt->zonelistfile, strerror(errno)); + } + return zone; + } + /* reuse empty spot */ + e = b->list; + zone->off = e->off; + if(fseeko(opt->zonelist, zone->off, SEEK_SET) == -1) { + log_msg(LOG_ERR, "fseeko(%s): %s", opt->zonelistfile, strerror(errno)); + log_msg(LOG_ERR, "zone %s could not be added", zname); + zone_options_delete(opt, zone); + return NULL; + } + r = fprintf(opt->zonelist, "add %s %s\n", zname, pname); + if(r != zone->linesize) { + if(r == -1) + log_msg(LOG_ERR, "could not write to %s: %s", + opt->zonelistfile, strerror(errno)); + else log_msg(LOG_ERR, "partial write to %s: disk full", + opt->zonelistfile); + log_msg(LOG_ERR, "zone %s could not be added", zname); + zone_options_delete(opt, zone); + return NULL; + } + if(fflush(opt->zonelist) != 0) { + log_msg(LOG_ERR, "fflush %s: %s", opt->zonelistfile, strerror(errno)); + } + + /* snip off and recycle element */ + b->list = e->next; + region_recycle(opt->region, e, sizeof(*e)); + if(b->list == NULL) { + rbtree_delete(opt->zonefree, &b->linesize); + region_recycle(opt->region, b, sizeof(*b)); + } + opt->zonefree_number--; + return zone; +} + +/* remove a zone on the zonelist */ +void +zone_list_del(nsd_options_t* opt, zone_options_t* zone) +{ + /* put its space onto the free entry */ + if(fseeko(opt->zonelist, zone->off, SEEK_SET) == -1) { + log_msg(LOG_ERR, "fseeko(%s): %s", opt->zonelistfile, strerror(errno)); + return; + } + fprintf(opt->zonelist, "del"); + zone_list_free_insert(opt, zone->linesize, zone->off); + + /* remove zone_options_t */ + zone_options_delete(opt, zone); + + /* see if we need to compact: it is going to halve the zonelist */ + if(opt->zonefree_number > opt->zone_options->count) { + zone_list_compact(opt); + } else { + if(fflush(opt->zonelist) != 0) { + log_msg(LOG_ERR, "fflush %s: %s", opt->zonelistfile, strerror(errno)); + } + } +} +/* postorder delete of zonelist free space tree */ +static void +delbucket(region_type* region, struct zonelist_bucket* b) +{ + struct zonelist_free* e, *f; + if(!b || (rbnode_t*)b==RBTREE_NULL) + return; + delbucket(region, (struct zonelist_bucket*)b->node.left); + delbucket(region, (struct zonelist_bucket*)b->node.right); + e = b->list; + while(e) { + f = e->next; + region_recycle(region, e, sizeof(*e)); + e = f; + } + region_recycle(region, b, sizeof(*b)); +} + +/* compact zonelist file */ +void 
+zone_list_compact(nsd_options_t* opt) +{ + char outname[1024]; + FILE* out; + zone_options_t* zone; + off_t off; + int r; + snprintf(outname, sizeof(outname), "%s~", opt->zonelistfile); + /* useful, when : count-of-free > count-of-used */ + /* write zonelist to zonelist~ */ + out = fopen(outname, "w+"); + if(!out) { + log_msg(LOG_ERR, "could not open %s: %s", outname, strerror(errno)); + return; + } + r = fprintf(out, ZONELIST_HEADER); + if(r == -1) { + log_msg(LOG_ERR, "write %s failed: %s", outname, + strerror(errno)); + fclose(out); + return; + } else if(r != strlen(ZONELIST_HEADER)) { + log_msg(LOG_ERR, "write %s was partial: disk full", + outname); + fclose(out); + return; + } + off = ftello(out); + if(off == -1) { + log_msg(LOG_ERR, "ftello(%s): %s", outname, strerror(errno)); + fclose(out); + return; + } + RBTREE_FOR(zone, zone_options_t*, opt->zone_options) { + if(zone->part_of_config) + continue; + r = fprintf(out, "add %s %s\n", zone->name, + zone->pattern->pname); + if(r < 0) { + log_msg(LOG_ERR, "write %s failed: %s", outname, + strerror(errno)); + fclose(out); + return; + } else if(r != zone->linesize) { + log_msg(LOG_ERR, "write %s was partial: disk full", + outname); + fclose(out); + return; + } + } + if(fflush(out) != 0) { + log_msg(LOG_ERR, "fflush %s: %s", outname, strerror(errno)); + } + + /* rename zonelist~ onto zonelist */ + if(rename(outname, opt->zonelistfile) == -1) { + log_msg(LOG_ERR, "rename(%s to %s) failed: %s", + outname, opt->zonelistfile, strerror(errno)); + fclose(out); + return; + } + fclose(opt->zonelist); + /* set offsets */ + RBTREE_FOR(zone, zone_options_t*, opt->zone_options) { + if(zone->part_of_config) + continue; + zone->off = off; + off += zone->linesize; + } + /* empty the free tree */ + delbucket(opt->region, (struct zonelist_bucket*)opt->zonefree->root); + opt->zonefree->root = RBTREE_NULL; + opt->zonefree->count = 0; + opt->zonefree_number = 0; + /* finish */ + opt->zonelist = out; + opt->zonelist_off = off; +} + +/* close zonelist file */ +void +zone_list_close(nsd_options_t* opt) +{ + fclose(opt->zonelist); + opt->zonelist = NULL; +} + + +void +c_error_va_list(const char* fmt, va_list args) { cfg_parser->errors++; + if(cfg_parser->err) { + char m[MAXSYSLOGMSGLEN]; + snprintf(m, sizeof(m), "%s:%d: error: ", cfg_parser->filename, + cfg_parser->line); + (*cfg_parser->err)(cfg_parser->err_arg, m); + vsnprintf(m, sizeof(m), fmt, args); + (*cfg_parser->err)(cfg_parser->err_arg, m); + (*cfg_parser->err)(cfg_parser->err_arg, "\n"); + return; + } fprintf(stderr, "%s:%d: error: ", cfg_parser->filename, cfg_parser->line); vfprintf(stderr, fmt, args); fprintf(stderr, "\n"); } -void c_error_msg(const char* fmt, ...) +void +c_error_msg(const char* fmt, ...) { va_list args; va_start(args, fmt); @@ -215,62 +679,546 @@ void c_error_msg(const char* fmt, ...) 
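[Editorial aside, not part of the imported sources.] The zone-list code above keeps "del " lines as a free list, bucketed by line length, so later additions of the same size can overwrite them in place; the file is rewritten (compacted) only once free lines outnumber live zones. A hedged usage sketch built from the functions declared in options.h; the zone name and the existence of a pattern called "slave" are assumptions for illustration:

	/* illustrative sketch only */
	if(parse_zone_list_file(opt)) {
		/* appends "add example.org slave\n", or reuses a free slot
		 * of the same line length if one is available */
		zone_options_t* z = zone_list_add(opt, "example.org", "slave");
		if(z) {
			/* overwrites the entry's "add" with "del" and keeps
			 * the line for reuse; may trigger compaction */
			zone_list_del(opt, z);
		}
		zone_list_close(opt);
	}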
va_end(args); } -void c_error(const char *str) +void +c_error(const char* str) { - cfg_parser->errors++; - fprintf(stderr, "%s:%d: error: %s\n", cfg_parser->filename, - cfg_parser->line, str); + c_error_msg("%s", str); } -int c_wrap() +int +c_wrap() { return 1; } -zone_options_t* zone_options_create(region_type* region) +zone_options_t* +zone_options_create(region_type* region) { zone_options_t* zone; zone = (zone_options_t*)region_alloc(region, sizeof(zone_options_t)); zone->node = *RBTREE_NULL; zone->name = 0; - zone->zonefile = 0; - zone->allow_notify = 0; - zone->request_xfr = 0; - zone->notify = 0; - zone->notify_retry = 5; - zone->provide_xfr = 0; - zone->outgoing_interface = 0; - zone->allow_axfr_fallback = 1; + zone->pattern = 0; + zone->part_of_config = 0; + return zone; +} + +/* true is booleans are the same truth value */ +#define booleq(x,y) ( ((x) && (y)) || (!(x) && !(y)) ) + +int +acl_equal(acl_options_t* p, acl_options_t* q) +{ + if(!booleq(p->use_axfr_only, q->use_axfr_only)) return 0; + if(!booleq(p->allow_udp, q->allow_udp)) return 0; + if(strcmp(p->ip_address_spec, q->ip_address_spec)!=0) return 0; + /* the ip6, port, addr, mask, type: are derived from the ip_address_spec */ + if(!booleq(p->nokey, q->nokey)) return 0; + if(!booleq(p->blocked, q->blocked)) return 0; + if(p->key_name && q->key_name) { + if(strcmp(p->key_name, q->key_name)!=0) return 0; + } else if(p->key_name && !q->key_name) return 0; + else if(!p->key_name && q->key_name) return 0; + /* key_options is derived from key_name */ + return 1; +} + +int +acl_list_equal(acl_options_t* p, acl_options_t* q) +{ + /* must be same and in same order */ + while(p && q) { + if(!acl_equal(p, q)) + return 0; + p = p->next; + q = q->next; + } + if(!p && !q) return 1; + /* different lengths */ + return 0; +} + +pattern_options_t* +pattern_options_create(region_type* region) +{ + pattern_options_t* p; + p = (pattern_options_t*)region_alloc(region, sizeof(pattern_options_t)); + p->node = *RBTREE_NULL; + p->pname = 0; + p->zonefile = 0; + p->allow_notify = 0; + p->request_xfr = 0; + p->notify = 0; + p->provide_xfr = 0; + p->outgoing_interface = 0; + p->notify_retry = 5; + p->notify_retry_is_default = 1; + p->allow_axfr_fallback = 1; + p->allow_axfr_fallback_is_default = 1; + p->implicit = 0; + p->xfrd_flags = 0; #ifdef RATELIMIT - zone->rrl_whitelist = 0; + p->rrl_whitelist = 0; #endif - return zone; + return p; +} + +static void +acl_delete(region_type* region, acl_options_t* acl) +{ + if(acl->ip_address_spec) + region_recycle(region, (void*)acl->ip_address_spec, + strlen(acl->ip_address_spec)+1); + if(acl->key_name) + region_recycle(region, (void*)acl->key_name, + strlen(acl->key_name)+1); + /* key_options is a convenience pointer, not owned by the acl */ + region_recycle(region, acl, sizeof(*acl)); +} + +static void +acl_list_delete(region_type* region, acl_options_t* list) +{ + acl_options_t* n; + while(list) { + n = list->next; + acl_delete(region, list); + list = n; + } } -key_options_t* key_options_create(region_type* region) +void +pattern_options_remove(nsd_options_t* opt, const char* name) +{ + pattern_options_t* p = (pattern_options_t*)rbtree_delete( + opt->patterns, name); + /* delete p and its contents */ + if (!p) + return; + if(p->pname) + region_recycle(opt->region, (void*)p->pname, + strlen(p->pname)+1); + if(p->zonefile) + region_recycle(opt->region, (void*)p->zonefile, + strlen(p->zonefile)+1); + acl_list_delete(opt->region, p->allow_notify); + acl_list_delete(opt->region, p->request_xfr); + 
acl_list_delete(opt->region, p->notify); + acl_list_delete(opt->region, p->provide_xfr); + acl_list_delete(opt->region, p->outgoing_interface); + + region_recycle(opt->region, p, sizeof(pattern_options_t)); +} + +static acl_options_t* +copy_acl(region_type* region, acl_options_t* a) +{ + acl_options_t* b; + if(!a) return NULL; + b = (acl_options_t*)region_alloc(region, sizeof(*b)); + /* copy the whole lot */ + *b = *a; + /* fix the pointers */ + if(a->ip_address_spec) + b->ip_address_spec = region_strdup(region, a->ip_address_spec); + if(a->key_name) + b->key_name = region_strdup(region, a->key_name); + b->next = NULL; + b->key_options = NULL; + return b; +} + +static acl_options_t* +copy_acl_list(nsd_options_t* opt, acl_options_t* a) +{ + acl_options_t* b, *blast = NULL, *blist = NULL; + while(a) { + b = copy_acl(opt->region, a); + /* fixup key_options */ + if(b->key_name) + b->key_options = key_options_find(opt, b->key_name); + else b->key_options = NULL; + + /* link as last into list */ + b->next = NULL; + if(!blist) blist = b; + else blast->next = b; + blast = b; + + a = a->next; + } + return blist; +} + +static void +copy_changed_acl(nsd_options_t* opt, acl_options_t** orig, + acl_options_t* anew) +{ + if(!acl_list_equal(*orig, anew)) { + acl_list_delete(opt->region, *orig); + *orig = copy_acl_list(opt, anew); + } +} + +static void +copy_pat_fixed(region_type* region, pattern_options_t* orig, + pattern_options_t* p) +{ + orig->allow_axfr_fallback = p->allow_axfr_fallback; + orig->allow_axfr_fallback_is_default = + p->allow_axfr_fallback_is_default; + orig->notify_retry = p->notify_retry; + orig->notify_retry_is_default = p->notify_retry_is_default; + orig->implicit = p->implicit; + if(p->zonefile) + orig->zonefile = region_strdup(region, p->zonefile); + else orig->zonefile = NULL; +#ifdef RATELIMIT + orig->rrl_whitelist = p->rrl_whitelist; +#endif +} + +void +pattern_options_add_modify(nsd_options_t* opt, pattern_options_t* p) +{ + pattern_options_t* orig = pattern_options_find(opt, p->pname); + if(!orig) { + /* needs to be copied to opt region */ + orig = pattern_options_create(opt->region); + orig->pname = region_strdup(opt->region, p->pname); + copy_pat_fixed(opt->region, orig, p); + orig->allow_notify = copy_acl_list(opt, p->allow_notify); + orig->request_xfr = copy_acl_list(opt, p->request_xfr); + orig->notify = copy_acl_list(opt, p->notify); + orig->provide_xfr = copy_acl_list(opt, p->provide_xfr); + orig->outgoing_interface = copy_acl_list(opt, + p->outgoing_interface); + nsd_options_insert_pattern(opt, orig); + } else { + /* modify in place so pointers stay valid (and copy + into region). Do not touch unchanged acls. 
*/ + if(orig->zonefile) + region_recycle(opt->region, (char*)orig->zonefile, + strlen(orig->zonefile)+1); + copy_pat_fixed(opt->region, orig, p); + copy_changed_acl(opt, &orig->allow_notify, p->allow_notify); + copy_changed_acl(opt, &orig->request_xfr, p->request_xfr); + copy_changed_acl(opt, &orig->notify, p->notify); + copy_changed_acl(opt, &orig->provide_xfr, p->provide_xfr); + copy_changed_acl(opt, &orig->outgoing_interface, + p->outgoing_interface); + } +} + +pattern_options_t* +pattern_options_find(nsd_options_t* opt, const char* name) +{ + return (pattern_options_t*)rbtree_search(opt->patterns, name); +} + +int +pattern_options_equal(pattern_options_t* p, pattern_options_t* q) +{ + if(strcmp(p->pname, q->pname) != 0) return 0; + if(!p->zonefile && q->zonefile) return 0; + else if(p->zonefile && !q->zonefile) return 0; + else if(p->zonefile && q->zonefile) { + if(strcmp(p->zonefile, q->zonefile) != 0) return 0; + } + if(!booleq(p->allow_axfr_fallback, q->allow_axfr_fallback)) return 0; + if(!booleq(p->allow_axfr_fallback_is_default, + q->allow_axfr_fallback_is_default)) return 0; + if(p->notify_retry != q->notify_retry) return 0; + if(!booleq(p->notify_retry_is_default, + q->notify_retry_is_default)) return 0; + if(!booleq(p->implicit, q->implicit)) return 0; + if(!acl_list_equal(p->allow_notify, q->allow_notify)) return 0; + if(!acl_list_equal(p->request_xfr, q->request_xfr)) return 0; + if(!acl_list_equal(p->notify, q->notify)) return 0; + if(!acl_list_equal(p->provide_xfr, q->provide_xfr)) return 0; + if(!acl_list_equal(p->outgoing_interface, q->outgoing_interface)) + return 0; +#ifdef RATELIMIT + if(p->rrl_whitelist != q->rrl_whitelist) return 0; +#endif + return 1; +} + +static void +marshal_u8(struct buffer* b, uint8_t v) +{ + buffer_reserve(b, 1); + buffer_write_u8(b, v); +} + +static uint8_t +unmarshal_u8(struct buffer* b) +{ + return buffer_read_u8(b); +} + +#ifdef RATELIMIT +static void +marshal_u16(struct buffer* b, uint16_t v) +{ + buffer_reserve(b, 2); + buffer_write_u16(b, v); +} +#endif + +#ifdef RATELIMIT +static uint16_t +unmarshal_u16(struct buffer* b) +{ + return buffer_read_u16(b); +} +#endif + +static void +marshal_str(struct buffer* b, const char* s) +{ + if(!s) marshal_u8(b, 0); + else { + size_t len = strlen(s); + marshal_u8(b, 1); + buffer_reserve(b, len+1); + buffer_write(b, s, len+1); + } +} + +static char* +unmarshal_str(region_type* r, struct buffer* b) +{ + uint8_t nonnull = unmarshal_u8(b); + if(nonnull) { + char* result = region_strdup(r, (char*)buffer_current(b)); + size_t len = strlen((char*)buffer_current(b)); + buffer_skip(b, len+1); + return result; + } else return NULL; +} + +static void +marshal_acl(struct buffer* b, acl_options_t* acl) +{ + buffer_reserve(b, sizeof(*acl)); + buffer_write(b, acl, sizeof(*acl)); + marshal_str(b, acl->ip_address_spec); + marshal_str(b, acl->key_name); +} + +static acl_options_t* +unmarshal_acl(region_type* r, struct buffer* b) +{ + acl_options_t* acl = (acl_options_t*)region_alloc(r, sizeof(*acl)); + buffer_read(b, acl, sizeof(*acl)); + acl->next = NULL; + acl->key_options = NULL; + acl->ip_address_spec = unmarshal_str(r, b); + acl->key_name = unmarshal_str(r, b); + return acl; +} + +static void +marshal_acl_list(struct buffer* b, acl_options_t* list) +{ + while(list) { + marshal_u8(b, 1); /* is there a next one marker */ + marshal_acl(b, list); + list = list->next; + } + marshal_u8(b, 0); /* end of list marker */ +} + +static acl_options_t* +unmarshal_acl_list(region_type* r, struct buffer* b) +{ + acl_options_t* 
a, *last=NULL, *list=NULL; + while(unmarshal_u8(b)) { + a = unmarshal_acl(r, b); + /* link in */ + a->next = NULL; + if(!list) list = a; + else last->next = a; + last = a; + } + return list; +} + +void +pattern_options_marshal(struct buffer* b, pattern_options_t* p) +{ + marshal_str(b, p->pname); + marshal_str(b, p->zonefile); +#ifdef RATELIMIT + marshal_u16(b, p->rrl_whitelist); +#endif + marshal_u8(b, p->allow_axfr_fallback); + marshal_u8(b, p->allow_axfr_fallback_is_default); + marshal_u8(b, p->notify_retry); + marshal_u8(b, p->notify_retry_is_default); + marshal_u8(b, p->implicit); + marshal_acl_list(b, p->allow_notify); + marshal_acl_list(b, p->request_xfr); + marshal_acl_list(b, p->notify); + marshal_acl_list(b, p->provide_xfr); + marshal_acl_list(b, p->outgoing_interface); +} + +pattern_options_t* +pattern_options_unmarshal(region_type* r, struct buffer* b) +{ + pattern_options_t* p = pattern_options_create(r); + p->pname = unmarshal_str(r, b); + p->zonefile = unmarshal_str(r, b); +#ifdef RATELIMIT + p->rrl_whitelist = unmarshal_u16(b); +#endif + p->allow_axfr_fallback = unmarshal_u8(b); + p->allow_axfr_fallback_is_default = unmarshal_u8(b); + p->notify_retry = unmarshal_u8(b); + p->notify_retry_is_default = unmarshal_u8(b); + p->implicit = unmarshal_u8(b); + p->allow_notify = unmarshal_acl_list(r, b); + p->request_xfr = unmarshal_acl_list(r, b); + p->notify = unmarshal_acl_list(r, b); + p->provide_xfr = unmarshal_acl_list(r, b); + p->outgoing_interface = unmarshal_acl_list(r, b); + return p; +} + +key_options_t* +key_options_create(region_type* region) { key_options_t* key; - key = (key_options_t*)region_alloc(region, sizeof(key_options_t)); - key->name = 0; - key->next = 0; - key->algorithm = 0; - key->secret = 0; - key->tsig_key = 0; + key = (key_options_t*)region_alloc_zero(region, sizeof(key_options_t)); return key; } -key_options_t* key_options_find(nsd_options_t* opt, const char* name) +void +key_options_insert(nsd_options_t* opt, key_options_t* key) +{ + if(!key->name) return; + key->node.key = key->name; + (void)rbtree_insert(opt->keys, &key->node); +} + +key_options_t* +key_options_find(nsd_options_t* opt, const char* name) { - key_options_t* key = opt->keys; - while(key) { - if(strcmp(key->name, name)==0) - return key; - key = key->next; + return (key_options_t*)rbtree_search(opt->keys, name); +} + +/** remove tsig_key contents */ +void +key_options_desetup(region_type* region, key_options_t* key) +{ + /* keep tsig_key pointer so that existing references keep valid */ + if(!key->tsig_key) + return; + /* name stays the same */ + if(key->tsig_key->data) { + /* wipe secret! 
*/ + memset(key->tsig_key->data, 0xdd, key->tsig_key->size); + region_recycle(region, key->tsig_key->data, + key->tsig_key->size); + key->tsig_key->data = NULL; + key->tsig_key->size = 0; } - return 0; } -int acl_check_incoming(acl_options_t* acl, struct query* q, +/** add tsig_key contents */ +void +key_options_setup(region_type* region, key_options_t* key) +{ + uint8_t data[16384]; /* 16KB */ + int size; + if(!key->tsig_key) { + /* create it */ + key->tsig_key = (tsig_key_type *) region_alloc(region, + sizeof(tsig_key_type)); + /* create name */ + key->tsig_key->name = dname_parse(region, key->name); + if(!key->tsig_key->name) { + log_msg(LOG_ERR, "Failed to parse tsig key name %s", + key->name); + /* key and base64 were checked during syntax parse */ + exit(1); + } + key->tsig_key->size = 0; + key->tsig_key->data = NULL; + } + size = b64_pton(key->secret, data, sizeof(data)); + if(size == -1) { + log_msg(LOG_ERR, "Failed to parse tsig key data %s", + key->name); + /* key and base64 were checked during syntax parse */ + exit(1); + } + key->tsig_key->size = size; + key->tsig_key->data = (uint8_t *)region_alloc_init(region, data, size); +} + +void +key_options_remove(nsd_options_t* opt, const char* name) +{ + key_options_t* k = key_options_find(opt, name); + if(!k) return; + (void)rbtree_delete(opt->keys, name); + if(k->name) + region_recycle(opt->region, k->name, strlen(k->name)+1); + if(k->algorithm) + region_recycle(opt->region, k->algorithm, strlen(k->algorithm)+1); + if(k->secret) { + memset(k->secret, 0xdd, strlen(k->secret)); /* wipe secret! */ + region_recycle(opt->region, k->secret, strlen(k->secret)+1); + } + if(k->tsig_key) { + tsig_del_key(k->tsig_key); + if(k->tsig_key->name) + region_recycle(opt->region, (void*)k->tsig_key->name, + dname_total_size(k->tsig_key->name)); + key_options_desetup(opt->region, k); + region_recycle(opt->region, k->tsig_key, sizeof(tsig_key_type)); + } + region_recycle(opt->region, k, sizeof(key_options_t)); +} + +int +key_options_equal(key_options_t* p, key_options_t* q) +{ + return strcmp(p->name, q->name)==0 && strcmp(p->algorithm, + q->algorithm)==0 && strcmp(p->secret, q->secret)==0; +} + +void +key_options_add_modify(nsd_options_t* opt, key_options_t* key) +{ + key_options_t* orig = key_options_find(opt, key->name); + if(!orig) { + /* needs to be copied to opt region */ + orig = key_options_create(opt->region); + orig->name = region_strdup(opt->region, key->name); + orig->algorithm = region_strdup(opt->region, key->algorithm); + orig->secret = region_strdup(opt->region, key->secret); + key_options_setup(opt->region, orig); + tsig_add_key(orig->tsig_key); + key_options_insert(opt, orig); + } else { + /* modify entries in existing key, and copy to opt region */ + key_options_desetup(opt->region, orig); + region_recycle(opt->region, orig->algorithm, + strlen(orig->algorithm)+1); + orig->algorithm = region_strdup(opt->region, key->algorithm); + region_recycle(opt->region, orig->secret, + strlen(orig->secret)+1); + orig->secret = region_strdup(opt->region, key->secret); + key_options_setup(opt->region, orig); + } +} + +int +acl_check_incoming(acl_options_t* acl, struct query* q, acl_options_t** reason) { /* check each acl element. 
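[Editorial aside, not part of the imported sources.] pattern_options_marshal/unmarshal above define a simple tagged byte format: a presence byte before each string, a continuation byte before each ACL element, plus the fixed fields. A minimal round-trip sketch; buffer_flip() is assumed to exist in buffer.h and is not shown in this diff:

	/* illustrative sketch only */
	static int
	pattern_roundtrip_sketch(region_type* r, struct buffer* b,
		pattern_options_t* p)
	{
		pattern_options_t* q;
		pattern_options_marshal(b, p);      /* write tagged fields */
		buffer_flip(b);                     /* assumed buffer.h helper */
		q = pattern_options_unmarshal(r, b);
		return pattern_options_equal(p, q); /* 1 if the copy matches */
	}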
@@ -311,7 +1259,8 @@ int acl_check_incoming(acl_options_t* acl, struct query* q, } #ifdef INET6 -int acl_addr_matches_ipv6host(acl_options_t* acl, struct sockaddr_storage* addr_storage, unsigned int port) +int +acl_addr_matches_ipv6host(acl_options_t* acl, struct sockaddr_storage* addr_storage, unsigned int port) { struct sockaddr_in6* addr = (struct sockaddr_in6*)addr_storage; if(acl->port != 0 && acl->port != port) @@ -339,7 +1288,8 @@ int acl_addr_matches_ipv6host(acl_options_t* acl, struct sockaddr_storage* addr_ } #endif -int acl_addr_matches_ipv4host(acl_options_t* acl, struct sockaddr_in* addr, unsigned int port) +int +acl_addr_matches_ipv4host(acl_options_t* acl, struct sockaddr_in* addr, unsigned int port) { if(acl->port != 0 && acl->port != port) return 0; @@ -365,7 +1315,8 @@ int acl_addr_matches_ipv4host(acl_options_t* acl, struct sockaddr_in* addr, unsi return 1; } -int acl_addr_matches_host(acl_options_t* acl, acl_options_t* host) +int +acl_addr_matches_host(acl_options_t* acl, acl_options_t* host) { if(acl->is_ipv6) { @@ -387,7 +1338,8 @@ int acl_addr_matches_host(acl_options_t* acl, acl_options_t* host) return 0; } -int acl_addr_matches(acl_options_t* acl, struct query* q) +int +acl_addr_matches(acl_options_t* acl, struct query* q) { if(acl->is_ipv6) { @@ -411,7 +1363,8 @@ int acl_addr_matches(acl_options_t* acl, struct query* q) return 0; } -int acl_addr_match_mask(uint32_t* a, uint32_t* b, uint32_t* mask, size_t sz) +int +acl_addr_match_mask(uint32_t* a, uint32_t* b, uint32_t* mask, size_t sz) { size_t i; #ifndef NDEBUG @@ -427,7 +1380,8 @@ int acl_addr_match_mask(uint32_t* a, uint32_t* b, uint32_t* mask, size_t sz) return 1; } -int acl_addr_match_range(uint32_t* minval, uint32_t* x, uint32_t* maxval, size_t sz) +int +acl_addr_match_range(uint32_t* minval, uint32_t* x, uint32_t* maxval, size_t sz) { size_t i; uint8_t checkmin = 1, checkmax = 1; @@ -456,7 +1410,8 @@ int acl_addr_match_range(uint32_t* minval, uint32_t* x, uint32_t* maxval, size_t return 1; } -int acl_key_matches(acl_options_t* acl, struct query* q) +int +acl_key_matches(acl_options_t* acl, struct query* q) { if(acl->blocked) return 1; @@ -527,42 +1482,93 @@ acl_same_host(acl_options_t* a, acl_options_t* b) } #if defined(HAVE_SSL) -void key_options_tsig_add(nsd_options_t* opt) +void +key_options_tsig_add(nsd_options_t* opt) { key_options_t* optkey; - uint8_t data[4000]; - tsig_key_type* tsigkey; - const dname_type* dname; - int size; - - for(optkey = opt->keys; optkey; optkey = optkey->next) - { - dname = dname_parse(opt->region, optkey->name); - if(!dname) { - log_msg(LOG_ERR, "Failed to parse tsig key name %s", optkey->name); - continue; - } - size = b64_pton(optkey->secret, data, sizeof(data)); - if(size == -1) { - log_msg(LOG_ERR, "Failed to parse tsig key data %s", optkey->name); - continue; - } - tsigkey = (tsig_key_type *) region_alloc(opt->region, sizeof(tsig_key_type)); - tsigkey->name = dname; - tsigkey->size = size; - tsigkey->data = (uint8_t *) region_alloc_init(opt->region, data, tsigkey->size); - tsig_add_key(tsigkey); - optkey->tsig_key = tsigkey; + RBTREE_FOR(optkey, key_options_t*, opt->keys) { + key_options_setup(opt->region, optkey); + tsig_add_key(optkey->tsig_key); } } #endif -int zone_is_slave(zone_options_t* opt) +int +zone_is_slave(zone_options_t* opt) +{ + return opt && opt->pattern && opt->pattern->request_xfr != 0; +} + +/* get a character in string (or replacement char if not long enough) */ +static const char* +get_char(const char* str, size_t i) +{ + static char res[2]; + if(i 
>= strlen(str)) + return "."; + res[0] = str[i]; + res[1] = 0; + return res; +} +/* get end label of the zone name (or .) */ +static const char* +get_end_label(zone_options_t* zone, int i) +{ + const dname_type* d = (const dname_type*)zone->node.key; + if(i >= d->label_count) { + return "."; + } + return wirelabel2str(dname_label(d, i)); +} +/* replace occurrences of one with two */ +void +replace_str(char* str, size_t len, const char* one, const char* two) +{ + char* pos; + char* at = str; + while( (pos=strstr(at, one)) ) { + if(strlen(str)+strlen(two)-strlen(one) >= len) + return; /* no more space to replace */ + /* stuff before pos is fine */ + /* move the stuff after pos to make space for two, add + * one to length of remainder to also copy the 0 byte end */ + memmove(pos+strlen(two), pos+strlen(one), + strlen(pos+strlen(one))+1); + /* copy in two */ + memmove(pos, two, strlen(two)); + /* at is end of the newly inserted two (avoids recursion if + * two contains one) */ + at = pos+strlen(two); + } +} + +const char* +config_make_zonefile(zone_options_t* zone) { - return opt->request_xfr != 0; + static char f[1024]; + /* if not a template, return as-is */ + if(!strchr(zone->pattern->zonefile, '%')) + return zone->pattern->zonefile; + strlcpy(f, zone->pattern->zonefile, sizeof(f)); + if(strstr(f, "%1")) + replace_str(f, sizeof(f), "%1", get_char(zone->name, 0)); + if(strstr(f, "%2")) + replace_str(f, sizeof(f), "%2", get_char(zone->name, 1)); + if(strstr(f, "%3")) + replace_str(f, sizeof(f), "%3", get_char(zone->name, 2)); + if(strstr(f, "%z")) + replace_str(f, sizeof(f), "%z", get_end_label(zone, 1)); + if(strstr(f, "%y")) + replace_str(f, sizeof(f), "%y", get_end_label(zone, 2)); + if(strstr(f, "%x")) + replace_str(f, sizeof(f), "%x", get_end_label(zone, 3)); + if(strstr(f, "%s")) + replace_str(f, sizeof(f), "%s", zone->name); + return f; } -zone_options_t* zone_options_find(nsd_options_t* opt, const struct dname* apex) +zone_options_t* +zone_options_find(nsd_options_t* opt, const struct dname* apex) { return (zone_options_t*) rbtree_search(opt->zone_options, apex); } @@ -583,7 +1589,8 @@ acl_find_num(acl_options_t* acl, int num) } /* true if ipv6 address, false if ipv4 */ -int parse_acl_is_ipv6(const char* p) +int +parse_acl_is_ipv6(const char* p) { /* see if addr is ipv6 or ipv4 -- by : and . */ while(*p) { @@ -595,7 +1602,8 @@ int parse_acl_is_ipv6(const char* p) } /* returns range type. 
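[Editorial aside, not part of the imported sources.] config_make_zonefile above expands printf-like markers in the pattern's zonefile string: going by the replace_str calls, "%s" becomes the zone name and "%1".."%3" the first characters of that name, so for a hypothetical zone "example.com" a zonefile of "%1/%s.zone" would expand to "e/example.com.zone". A hedged illustration:

	/* illustrative sketch only; zone "example.com" with pattern
	 * zonefile "%1/%s.zone" is an assumed example */
	const char* f = config_make_zonefile(zone);
	/* f now points to static storage holding "e/example.com.zone";
	 * copy it before the next call if it must be kept */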
mask is the 2nd part of the range */ -int parse_acl_range_type(char* ip, char** mask) +int +parse_acl_range_type(char* ip, char** mask) { char *p; if((p=strchr(ip, '&'))!=0) { @@ -618,7 +1626,8 @@ int parse_acl_range_type(char* ip, char** mask) } /* parses subnet mask, fills 0 mask as well */ -void parse_acl_range_subnet(char* p, void* addr, int maxbits) +void +parse_acl_range_subnet(char* p, void* addr, int maxbits) { int subnet_bits = atoi(p); uint8_t* addr_bytes = (uint8_t*)addr; @@ -641,7 +1650,8 @@ void parse_acl_range_subnet(char* p, void* addr, int maxbits) } } -acl_options_t* parse_acl_info(region_type* region, char* ip, const char* key) +acl_options_t* +parse_acl_info(region_type* region, char* ip, const char* key) { char* p; acl_options_t* acl = (acl_options_t*)region_alloc(region, sizeof(acl_options_t)); @@ -704,7 +1714,62 @@ acl_options_t* parse_acl_info(region_type* region, char* ip, const char* key) return acl; } -void nsd_options_destroy(nsd_options_t* opt) +/* copy acl list at end of parser start, update current */ +static +void append_acl(acl_options_t** start, acl_options_t** cur, + acl_options_t* list) +{ + while(list) { + acl_options_t* acl = copy_acl(cfg_parser->opt->region, list); + acl->next = NULL; + if(*cur) + (*cur)->next = acl; + else *start = acl; + *cur = acl; + list = list->next; + } +} + +void +config_apply_pattern(const char* name) +{ + /* find the pattern */ + pattern_options_t* pat = pattern_options_find(cfg_parser->opt, name); + pattern_options_t* a = cfg_parser->current_pattern; + if(!pat) { + c_error_msg("could not find pattern %s", name); + return; + } + + /* apply settings */ + if(pat->zonefile) + a->zonefile = region_strdup(cfg_parser->opt->region, + pat->zonefile); + if(!pat->allow_axfr_fallback_is_default) { + a->allow_axfr_fallback = pat->allow_axfr_fallback; + a->allow_axfr_fallback_is_default = 0; + } + if(!pat->notify_retry_is_default) { + a->notify_retry = pat->notify_retry; + a->notify_retry_is_default = 0; + } +#ifdef RATELIMIT + a->rrl_whitelist |= pat->rrl_whitelist; +#endif + /* append acl items */ + append_acl(&a->allow_notify, &cfg_parser->current_allow_notify, + pat->allow_notify); + append_acl(&a->request_xfr, &cfg_parser->current_request_xfr, + pat->request_xfr); + append_acl(&a->notify, &cfg_parser->current_notify, pat->notify); + append_acl(&a->provide_xfr, &cfg_parser->current_provide_xfr, + pat->provide_xfr); + append_acl(&a->outgoing_interface, &cfg_parser-> + current_outgoing_interface, pat->outgoing_interface); +} + +void +nsd_options_destroy(nsd_options_t* opt) { region_destroy(opt->region); } diff --git a/usr.sbin/nsd/options.h b/usr.sbin/nsd/options.h index cab7d5749cf..4cad972a683 100644 --- a/usr.sbin/nsd/options.h +++ b/usr.sbin/nsd/options.h @@ -1,7 +1,7 @@ /* * options.h -- nsd.conf options definitions and prototypes * - * Copyright (c) 2001-2011, NLnet Labs. All rights reserved. + * Copyright (c) 2001-2006, NLnet Labs. All rights reserved. * * See LICENSE for the license. * @@ -17,8 +17,10 @@ struct query; struct dname; struct tsig_key; +struct buffer; typedef struct nsd_options nsd_options_t; +typedef struct pattern_options pattern_options_t; typedef struct zone_options zone_options_t; typedef struct ipaddress_option ip_address_option_t; typedef struct acl_options acl_options_t; @@ -28,12 +30,24 @@ typedef struct config_parser_state config_parser_state_t; * Options global for nsd. 
*/ struct nsd_options { + /* config file name */ + char* configfile; /* options for zones, by apex, contains zone_options_t */ rbtree_t* zone_options; + /* patterns, by name, contains pattern_options_t */ + rbtree_t* patterns; - /* list of keys defined */ - key_options_t* keys; - size_t numkeys; + /* free space in zonelist file, contains zonelist_bucket */ + rbtree_t* zonefree; + /* number of free space lines in zonelist file */ + size_t zonefree_number; + /* zonelist file if open */ + FILE* zonelist; + /* last offset in file (or 0 if none) */ + off_t zonelist_off; + + /* rbtree of keys defined, by name */ + rbtree_t* keys; /* list of ip adresses to bind to (or NULL for all) */ ip_address_option_t* ip_addresses; @@ -42,8 +56,8 @@ struct nsd_options { int debug_mode; int verbosity; int hide_version; - int ip4_only; - int ip6_only; + int do_ip4; + int do_ip6; const char* database; const char* identity; const char* logfile; @@ -56,14 +70,30 @@ struct nsd_options { const char* pidfile; const char* port; int statistics; - const char* zonestatsfile; const char* chroot; const char* username; const char* zonesdir; - const char* difffile; const char* xfrdfile; + const char* xfrdir; + const char* zonelistfile; const char* nsid; int xfrd_reload_timeout; + int zonefiles_check; + + /** remote control section. enable toggle. */ + int control_enable; + /** the interfaces the remote control should listen on */ + ip_address_option_t* control_interface; + /** port number for the control port */ + int control_port; + /** private key file for server */ + char* server_key_file; + /** certificate file for server */ + char* server_cert_file; + /** private key file for nsd-control */ + char* control_key_file; + /** certificate file for nsd-control */ + char* control_cert_file; #ifdef RATELIMIT /** number of buckets in rrl hashtable */ @@ -88,14 +118,11 @@ struct ipaddress_option { }; /* - * Options for a zone + * Pattern of zone options, used to contain options for zone(s). 
*/ -struct zone_options { - /* key is dname of apex */ +struct pattern_options { rbnode_t node; - - /* is apex of the zone */ - const char* name; + const char* pname; /* name of the pattern, key of rbtree */ const char* zonefile; acl_options_t* allow_notify; acl_options_t* request_xfr; @@ -106,7 +133,32 @@ struct zone_options { uint16_t rrl_whitelist; /* bitmap with rrl types */ #endif uint8_t allow_axfr_fallback; + uint8_t allow_axfr_fallback_is_default; uint8_t notify_retry; + uint8_t notify_retry_is_default; + uint8_t implicit; /* pattern is implicit, part_of_config zone used */ + uint8_t xfrd_flags; +}; + +#define PATTERN_IMPLICIT_MARKER "_implicit_" + +/* + * Options for a zone + */ +struct zone_options { + /* key is dname of apex */ + rbnode_t node; + + /* is apex of the zone */ + const char* name; + /* if not part of config, the offset and linesize of zonelist entry */ + off_t off; + int linesize; + /* pattern for the zone options, if zone is part_of_config, this is + * a anonymous pattern created in-place */ + pattern_options_t* pattern; + /* zone is fixed into the main config, not in zonelist, cannot delete */ + uint8_t part_of_config; }; union acl_addr_storage { @@ -125,10 +177,10 @@ struct acl_options { acl_options_t* next; /* options */ - uint8_t use_axfr_only; - uint8_t allow_udp; time_t ixfr_disabled; int bad_xfr_count; + uint8_t use_axfr_only; + uint8_t allow_udp; /* ip address range */ const char* ip_address_spec; @@ -154,21 +206,36 @@ struct acl_options { * Key definition */ struct key_options { - key_options_t* next; - const char* name; - const char* algorithm; - const char* secret; + rbnode_t node; /* key of tree is name */ + char* name; + char* algorithm; + char* secret; struct tsig_key* tsig_key; }; +/** zone list free space */ +struct zonelist_free { + struct zonelist_free* next; + off_t off; +}; +/** zonelist free bucket for a particular line length */ +struct zonelist_bucket { + rbnode_t node; /* key is ptr to linesize */ + int linesize; + struct zonelist_free* list; +}; + /* * Used during options parsing */ struct config_parser_state { const char* filename; + const char* chroot; int line; int errors; + int server_settings_seen; nsd_options_t* opt; + pattern_options_t* current_pattern; zone_options_t* current_zone; key_options_t* current_key; ip_address_option_t* current_ip_address_option; @@ -177,6 +244,8 @@ struct config_parser_state { acl_options_t* current_notify; acl_options_t* current_provide_xfr; acl_options_t* current_outgoing_interface; + void (*err)(void*,const char*); + void* err_arg; }; extern config_parser_state_t* cfg_parser; @@ -188,14 +257,41 @@ static inline size_t nsd_options_num_zones(nsd_options_t* opt) { return opt->zone_options->count; } /* insert a zone into the main options tree, returns 0 on error */ int nsd_options_insert_zone(nsd_options_t* opt, zone_options_t* zone); +/* insert a pattern into the main options tree, returns 0 on error */ +int nsd_options_insert_pattern(nsd_options_t* opt, pattern_options_t* pat); -/* parses options file. Returns false on failure */ -int parse_options_file(nsd_options_t* opt, const char* file); +/* parses options file. Returns false on failure. callback, if nonNULL, + * gets called with error strings, default prints. 
*/ +int parse_options_file(nsd_options_t* opt, const char* file, + void (*err)(void*,const char*), void* err_arg); zone_options_t* zone_options_create(region_type* region); +void zone_options_delete(nsd_options_t* opt, zone_options_t* zone); /* find a zone by apex domain name, or NULL if not found. */ zone_options_t* zone_options_find(nsd_options_t* opt, const struct dname* apex); +pattern_options_t* pattern_options_create(region_type* region); +pattern_options_t* pattern_options_find(nsd_options_t* opt, const char* name); +int pattern_options_equal(pattern_options_t* p, pattern_options_t* q); +void pattern_options_remove(nsd_options_t* opt, const char* name); +void pattern_options_add_modify(nsd_options_t* opt, pattern_options_t* p); +void pattern_options_marshal(struct buffer* buffer, pattern_options_t* p); +pattern_options_t* pattern_options_unmarshal(region_type* r, struct buffer* b); key_options_t* key_options_create(region_type* region); +void key_options_insert(nsd_options_t* opt, key_options_t* key); key_options_t* key_options_find(nsd_options_t* opt, const char* name); +void key_options_remove(nsd_options_t* opt, const char* name); +int key_options_equal(key_options_t* p, key_options_t* q); +void key_options_add_modify(nsd_options_t* opt, key_options_t* key); +/* read in zone list file. Returns false on failure */ +int parse_zone_list_file(nsd_options_t* opt); +/* create zone entry and add to the zonelist file */ +zone_options_t* zone_list_add(nsd_options_t* opt, const char* zname, + const char* pname); +/* create zonelist entry, do not insert in file (called by _add) */ +zone_options_t* zone_list_zone_insert(nsd_options_t* opt, const char* nm, + const char* patnm, int linesize, off_t off); +void zone_list_del(nsd_options_t* opt, zone_options_t* zone); +void zone_list_compact(nsd_options_t* opt); +void zone_list_close(nsd_options_t* opt); #if defined(HAVE_SSL) /* tsig must be inited, adds all keys in options to tsig. */ @@ -218,8 +314,18 @@ int acl_same_host(acl_options_t* a, acl_options_t* b); /* find acl by number in the list */ acl_options_t* acl_find_num(acl_options_t* acl, int num); +/* see if two acl lists are the same (same elements in same order, or empty) */ +int acl_list_equal(acl_options_t* p, acl_options_t* q); +/* see if two acl are the same */ +int acl_equal(acl_options_t* p, acl_options_t* q); + /* see if a zone is a slave or a master zone */ int zone_is_slave(zone_options_t* opt); +/* create zonefile name, returns static pointer (perhaps to options data) */ +const char* config_make_zonefile(zone_options_t* zone); + +#define ZONEC_PCT_TIME 5 /* seconds, then it starts to print pcts */ +#define ZONEC_PCT_COUNT 100000 /* elements before pct check is done */ /* parsing helpers */ void c_error(const char* msg); @@ -233,5 +339,9 @@ int parse_acl_range_type(char* ip, char** mask); void parse_acl_range_subnet(char* p, void* addr, int maxbits); /* clean up options */ void nsd_options_destroy(nsd_options_t* opt); +/* replace occurrences of one with two in buf, pass length of buffer */ +void replace_str(char* buf, size_t len, const char* one, const char* two); +/* apply pattern to the existing pattern in the parser */ +void config_apply_pattern(const char* name); #endif /* OPTIONS_H */ diff --git a/usr.sbin/nsd/packet.c b/usr.sbin/nsd/packet.c index a4ab76e9511..4cba1600c8f 100644 --- a/usr.sbin/nsd/packet.c +++ b/usr.sbin/nsd/packet.c @@ -1,7 +1,7 @@ /* * packet.c -- low-level DNS packet encoding and decoding functions. * - * Copyright (c) 2001-2011, NLnet Labs. 
All rights reserved. + * Copyright (c) 2001-2006, NLnet Labs. All rights reserved. * * See LICENSE for the license. * @@ -22,7 +22,7 @@ encode_dname(query_type *q, domain_type *domain) query_put_dname_offset(q, domain, buffer_position(q->packet)); DEBUG(DEBUG_NAME_COMPRESSION, 2, (LOG_INFO, "dname: %s, number: %lu, offset: %u\n", - dname_to_string(domain_dname(domain), NULL), + domain_to_string(domain), (unsigned long) domain->number, query_get_dname_offset(q, domain))); buffer_write(q->packet, dname_name(domain_dname(domain)), @@ -32,7 +32,7 @@ encode_dname(query_type *q, domain_type *domain) if (domain->parent) { DEBUG(DEBUG_NAME_COMPRESSION, 2, (LOG_INFO, "dname: %s, number: %lu, pointer: %u\n", - dname_to_string(domain_dname(domain), NULL), + domain_to_string(domain), (unsigned long) domain->number, query_get_dname_offset(q, domain))); assert(query_get_dname_offset(q, domain) <= MAX_COMPRESSION_OFFSET); diff --git a/usr.sbin/nsd/packet.h b/usr.sbin/nsd/packet.h index c9a34061b74..2efa288b91e 100644 --- a/usr.sbin/nsd/packet.h +++ b/usr.sbin/nsd/packet.h @@ -1,7 +1,7 @@ /* * packet.h -- low-level DNS packet encoding and decoding functions. * - * Copyright (c) 2001-2011, NLnet Labs. All rights reserved. + * Copyright (c) 2001-2006, NLnet Labs. All rights reserved. * * See LICENSE for the license. * diff --git a/usr.sbin/nsd/query.h b/usr.sbin/nsd/query.h index 24fafd447ca..4ff21f770c5 100644 --- a/usr.sbin/nsd/query.h +++ b/usr.sbin/nsd/query.h @@ -1,7 +1,7 @@ /* * query.h -- manipulation with the queries * - * Copyright (c) 2001-2011, NLnet Labs. All rights reserved. + * Copyright (c) 2001-2006, NLnet Labs. All rights reserved. * * See LICENSE for the license. * @@ -76,9 +76,6 @@ struct query { /* The zone used to answer the query. */ zone_type *zone; - /* The domain used to answer the query. */ - domain_type *domain; - /* The delegation domain, if any. */ domain_type *delegation_domain; @@ -106,10 +103,10 @@ struct query { * query name when generated from a wildcard record. */ uint16_t *compressed_dname_offsets; - uint32_t compressed_dname_offsets_size; + size_t compressed_dname_offsets_size; /* number of temporary domains used for the query */ - uint32_t number_temporary_domains; + size_t number_temporary_domains; /* * Used for AXFR processing. @@ -175,7 +172,7 @@ void query_add_compression_domain(struct query *query, */ query_type *query_create(region_type *region, uint16_t *compressed_dname_offsets, - uint32_t compressed_dname_size); + size_t compressed_dname_size); /* * Reset a query structure so it is ready for receiving and processing diff --git a/usr.sbin/nsd/radtree.c b/usr.sbin/nsd/radtree.c new file mode 100644 index 00000000000..11bfb4fcd30 --- /dev/null +++ b/usr.sbin/nsd/radtree.c @@ -0,0 +1,1411 @@ +/* + * radtree -- generic radix tree for binary strings. + * + * Copyright (c) 2010, NLnet Labs. See LICENSE for license. 
+ */ +#include "config.h" +#include <assert.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <time.h> +#include "radtree.h" +#include "util.h" +#include "region-allocator.h" + +#include <stdio.h> +#include <ctype.h> + +struct radtree* radix_tree_create(struct region* region) +{ + struct radtree* rt = (struct radtree*)region_alloc(region, sizeof(*rt)); + if(!rt) return NULL; + rt->region = region; + radix_tree_init(rt); + return rt; +} + +void radix_tree_init(struct radtree* rt) +{ + rt->root = NULL; + rt->count = 0; +} + +/** delete radnodes in postorder recursion */ +static void radnode_del_postorder(struct region* region, struct radnode* n) +{ + unsigned i; + if(!n) return; + for(i=0; i<n->len; i++) { + radnode_del_postorder(region, n->array[i].node); + region_recycle(region, n->array[i].str, n->array[i].len); + } + region_recycle(region, n->array, n->capacity*sizeof(struct radsel)); + region_recycle(region, n, sizeof(*n)); +} + +void radix_tree_clear(struct radtree* rt) +{ + radnode_del_postorder(rt->region, rt->root); + rt->root = NULL; + rt->count = 0; +} + +void radix_tree_delete(struct radtree* rt) +{ + if(!rt) return; + radix_tree_clear(rt); + region_recycle(rt->region, rt, sizeof(*rt)); +} + +/** return last elem-containing node in this subtree (excl self) */ +static struct radnode* +radnode_last_in_subtree(struct radnode* n) +{ + int idx; + /* try last entry in array first */ + for(idx=((int)n->len)-1; idx >= 0; idx--) { + if(n->array[idx].node) { + /* does it have entries in its subtrees? */ + if(n->array[idx].node->len > 0) { + struct radnode* s = radnode_last_in_subtree( + n->array[idx].node); + if(s) return s; + } + /* no, does it have an entry itself? */ + if(n->array[idx].node->elem) + return n->array[idx].node; + } + } + return NULL; +} + +/** last in subtree, incl self */ +static struct radnode* +radnode_last_in_subtree_incl_self(struct radnode* n) +{ + struct radnode* s = radnode_last_in_subtree(n); + if(s) return s; + if(n->elem) return n; + return NULL; +} + +/** return first elem-containing node in this subtree (excl self) */ +static struct radnode* +radnode_first_in_subtree(struct radnode* n) +{ + unsigned idx; + struct radnode* s; + /* try every subnode */ + for(idx=0; idx<n->len; idx++) { + if(n->array[idx].node) { + /* does it have elem itself? */ + if(n->array[idx].node->elem) + return n->array[idx].node; + /* try its subtrees */ + if((s=radnode_first_in_subtree(n->array[idx].node))!=0) + return s; + } + } + return NULL; +} + +/** Find an entry in arrays from idx-1 to 0 */ +static struct radnode* +radnode_find_prev_from_idx(struct radnode* n, unsigned from) +{ + unsigned idx = from; + while(idx > 0) { + idx --; + if(n->array[idx].node) { + struct radnode* s = radnode_last_in_subtree_incl_self( + n->array[idx].node); + if(s) return s; + } + } + return NULL; +} + +/** + * Find a prefix of the key, in whole-nodes. + * Finds the longest prefix that corresponds to a whole radnode entry. + * There may be a slightly longer prefix in one of the array elements. + * @param result: the longest prefix, the entry itself if *respos==len, + * otherwise an array entry, residx. + * @param respos: pos in string where next unmatched byte is, if == len an + * exact match has been found. If == 0 then a "" match was found. + * @return false if no prefix found, not even the root "" prefix. 
+ */ +static int radix_find_prefix_node(struct radtree* rt, uint8_t* k, + radstrlen_t len, struct radnode** result, radstrlen_t* respos) +{ + struct radnode* n = rt->root; + radstrlen_t pos = 0; + uint8_t byte; + *respos = 0; + *result = n; + if(!n) return 0; + while(n) { + if(pos == len) { + return 1; + } + byte = k[pos]; + if(byte < n->offset) { + return 1; + } + byte -= n->offset; + if(byte >= n->len) { + return 1; + } + pos++; + if(n->array[byte].len != 0) { + /* must match additional string */ + if(pos+n->array[byte].len > len) { + return 1; + } + if(memcmp(&k[pos], n->array[byte].str, + n->array[byte].len) != 0) { + return 1; + } + pos += n->array[byte].len; + } + n = n->array[byte].node; + if(!n) return 1; + *respos = pos; + *result = n; + } + return 1; +} + +/** grow array to at least the given size, offset unchanged */ +static int +radnode_array_grow(struct region* region, struct radnode* n, unsigned want) +{ + unsigned ns = ((unsigned)n->capacity)*2; + struct radsel* a; + assert(want <= 256); /* cannot be more, range of uint8 */ + if(want > ns) + ns = want; + if(ns > 256) ns = 256; + /* we do not use realloc, because we want to keep the old array + * in case alloc fails, so that the tree is still usable */ + a = (struct radsel*)region_alloc(region, ns*sizeof(struct radsel)); + if(!a) return 0; + assert(n->len <= n->capacity); + assert(n->capacity < ns); + memcpy(&a[0], &n->array[0], n->len*sizeof(struct radsel)); + region_recycle(region, n->array, n->capacity*sizeof(struct radsel)); + n->array = a; + n->capacity = ns; + return 1; +} + +/** make space in radnode array for another byte */ +static int +radnode_array_space(struct region* region, struct radnode* n, uint8_t byte) +{ + /* is there an array? */ + if(!n->array || n->capacity == 0) { + n->array = (struct radsel*)region_alloc(region, + sizeof(struct radsel)); + if(!n->array) return 0; + memset(&n->array[0], 0, sizeof(struct radsel)); + n->len = 1; + n->capacity = 1; + n->offset = byte; + /* is the array unused? */ + } else if(n->len == 0 && n->capacity != 0) { + n->len = 1; + n->offset = byte; + memset(&n->array[0], 0, sizeof(struct radsel)); + /* is it below the offset? */ + } else if(byte < n->offset) { + /* is capacity enough? */ + unsigned idx; + unsigned need = n->offset-byte; + if(n->len+need > n->capacity) { + /* grow array */ + if(!radnode_array_grow(region, n, n->len+need)) + return 0; + } + /* reshuffle items to end */ + memmove(&n->array[need], &n->array[0], + n->len*sizeof(struct radsel)); + /* fixup pidx */ + for(idx = 0; idx < n->len; idx++) { + if(n->array[idx+need].node) + n->array[idx+need].node->pidx = idx+need; + } + /* zero the first */ + memset(&n->array[0], 0, need*sizeof(struct radsel)); + n->len += need; + n->offset = byte; + /* is it above the max? */ + } else if(byte-n->offset >= n->len) { + /* is capacity enough? 
*/ + unsigned need = (byte-n->offset) - n->len + 1; + /* grow array */ + if(n->len + need > n->capacity) { + if(!radnode_array_grow(region, n, n->len+need)) + return 0; + } + /* zero added entries */ + memset(&n->array[n->len], 0, need*sizeof(struct radsel)); + /* grow length */ + n->len += need; + } + return 1; +} + +/** create a prefix in the array strs */ +static int +radsel_str_create(struct region* region, struct radsel* r, uint8_t* k, + radstrlen_t pos, radstrlen_t len) +{ + r->str = (uint8_t*)region_alloc(region, sizeof(uint8_t)*(len-pos)); + if(!r->str) + return 0; /* out of memory */ + memmove(r->str, k+pos, len-pos); + r->len = len-pos; + return 1; +} + +/** see if one byte string p is a prefix of another x (equality is true) */ +static int +bstr_is_prefix(uint8_t* p, radstrlen_t plen, uint8_t* x, radstrlen_t xlen) +{ + /* if plen is zero, it is an (empty) prefix */ + if(plen == 0) + return 1; + /* if so, p must be shorter */ + if(plen > xlen) + return 0; + return (memcmp(p, x, plen) == 0); +} + +/** number of bytes in common for the two strings */ +static radstrlen_t +bstr_common(uint8_t* x, radstrlen_t xlen, uint8_t* y, radstrlen_t ylen) +{ + unsigned i, max = ((xlen<ylen)?xlen:ylen); + for(i=0; i<max; i++) { + if(x[i] != y[i]) + return i; + } + return max; +} + + +int +bstr_is_prefix_ext(uint8_t* p, radstrlen_t plen, uint8_t* x, radstrlen_t xlen) +{ + return bstr_is_prefix(p, plen, x, xlen); +} + +radstrlen_t +bstr_common_ext(uint8_t* x, radstrlen_t xlen, uint8_t* y, radstrlen_t ylen) +{ + return bstr_common(x, xlen, y, ylen); +} + +/** allocate remainder from prefixes for a split: + * plen: len prefix, l: longer bstring, llen: length of l. */ +static int +radsel_prefix_remainder(struct region* region, radstrlen_t plen, + uint8_t* l, radstrlen_t llen, + uint8_t** s, radstrlen_t* slen) +{ + *slen = llen - plen; + *s = (uint8_t*)region_alloc(region, (*slen)*sizeof(uint8_t)); + if(!*s) + return 0; + memmove(*s, l+plen, llen-plen); + return 1; +} + +/** radsel create a split when two nodes have shared prefix. + * @param r: radsel that gets changed, it contains a node. + * @param k: key byte string + * @param pos: position where the string enters the radsel (e.g. r.str) + * @param len: length of k. + * @param add: additional node for the string k. + * removed by called on failure. + * @return false on alloc failure, no changes made. 
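A quick illustration (not from the import) of the two exported byte-string helpers defined just above; note that bstr_is_prefix treats equal strings as a prefix match:

#include <assert.h>
#include <stdint.h>
#include "radtree.h"

static void
bstr_helpers_sketch(void)
{
	uint8_t a[] = { 'a', 'b', 'c' };
	uint8_t b[] = { 'a', 'b', 'd', 'e' };
	assert(bstr_common_ext(a, 3, b, 4) == 2);    /* "ab" is shared */
	assert(bstr_is_prefix_ext(a, 2, b, 4));      /* "ab" prefixes "abde" */
	assert(bstr_is_prefix_ext(a, 3, a, 3));      /* equality counts as prefix */
	assert(!bstr_is_prefix_ext(b, 4, a, 3));     /* the longer string does not */
}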
+ */ +static int +radsel_split(struct region* region, struct radsel* r, uint8_t* k, + radstrlen_t pos, radstrlen_t len, struct radnode* add) +{ + uint8_t* addstr = k+pos; + radstrlen_t addlen = len-pos; + if(bstr_is_prefix(addstr, addlen, r->str, r->len)) { + uint8_t* split_str=NULL, *dupstr=NULL; + radstrlen_t split_len=0; + /* 'add' is a prefix of r.node */ + /* also for empty addstr */ + /* set it up so that the 'add' node has r.node as child */ + /* so, r.node gets moved below the 'add' node, but we do + * this so that the r.node stays the same pointer for its + * key name */ + assert(addlen != r->len); + assert(addlen < r->len); + if(r->len-addlen > 1) { + /* shift one because a char is in the lookup array */ + if(!radsel_prefix_remainder(region, addlen+1, r->str, + r->len, &split_str, &split_len)) + return 0; + } + if(addlen != 0) { + dupstr = (uint8_t*)region_alloc(region, + addlen*sizeof(uint8_t)); + if(!dupstr) { + region_recycle(region, split_str, split_len); + return 0; + } + memcpy(dupstr, addstr, addlen); + } + if(!radnode_array_space(region, add, r->str[addlen])) { + region_recycle(region, split_str, split_len); + region_recycle(region, dupstr, addlen); + return 0; + } + /* alloc succeeded, now link it in */ + add->parent = r->node->parent; + add->pidx = r->node->pidx; + add->array[0].node = r->node; + add->array[0].str = split_str; + add->array[0].len = split_len; + r->node->parent = add; + r->node->pidx = 0; + + r->node = add; + region_recycle(region, r->str, r->len); + r->str = dupstr; + r->len = addlen; + } else if(bstr_is_prefix(r->str, r->len, addstr, addlen)) { + uint8_t* split_str = NULL; + radstrlen_t split_len = 0; + /* r.node is a prefix of 'add' */ + /* set it up so that the 'r.node' has 'add' as child */ + /* and basically, r.node is already completely fine, + * we only need to create a node as its child */ + assert(addlen != r->len); + assert(r->len < addlen); + if(addlen-r->len > 1) { + /* shift one because a character goes into array */ + if(!radsel_prefix_remainder(region, r->len+1, addstr, + addlen, &split_str, &split_len)) + return 0; + } + if(!radnode_array_space(region, r->node, addstr[r->len])) { + region_recycle(region, split_str, split_len); + return 0; + } + /* alloc succeeded, now link it in */ + add->parent = r->node; + add->pidx = addstr[r->len] - r->node->offset; + r->node->array[add->pidx].node = add; + r->node->array[add->pidx].str = split_str; + r->node->array[add->pidx].len = split_len; + } else { + /* okay we need to create a new node that chooses between + * the nodes 'add' and r.node + * We do this so that r.node stays the same pointer for its + * key name. 
*/ + struct radnode* com; + uint8_t* common_str=NULL, *s1_str=NULL, *s2_str=NULL; + radstrlen_t common_len, s1_len=0, s2_len=0; + common_len = bstr_common(r->str, r->len, addstr, addlen); + assert(common_len < r->len); + assert(common_len < addlen); + + /* create the new node for choice */ + com = (struct radnode*)region_alloc_zero(region, sizeof(*com)); + if(!com) return 0; /* out of memory */ + + /* create the two substrings for subchoices */ + if(r->len-common_len > 1) { + /* shift by one char because it goes in lookup array */ + if(!radsel_prefix_remainder(region, common_len+1, + r->str, r->len, &s1_str, &s1_len)) { + region_recycle(region, com, sizeof(*com)); + return 0; + } + } + if(addlen-common_len > 1) { + if(!radsel_prefix_remainder(region, common_len+1, + addstr, addlen, &s2_str, &s2_len)) { + region_recycle(region, com, sizeof(*com)); + region_recycle(region, s1_str, s1_len); + return 0; + } + } + + /* create the shared prefix to go in r */ + if(common_len > 0) { + common_str = (uint8_t*)region_alloc(region, + common_len*sizeof(uint8_t*)); + if(!common_str) { + region_recycle(region, com, sizeof(*com)); + region_recycle(region, s1_str, s1_len); + region_recycle(region, s2_str, s2_len); + return 0; + } + memcpy(common_str, addstr, common_len); + } + + /* make space in the common node array */ + if(!radnode_array_space(region, com, r->str[common_len]) || + !radnode_array_space(region, com, addstr[common_len])) { + region_recycle(region, com->array, com->capacity*sizeof(struct radsel)); + region_recycle(region, com, sizeof(*com)); + region_recycle(region, common_str, common_len); + region_recycle(region, s1_str, s1_len); + region_recycle(region, s2_str, s2_len); + return 0; + } + + /* allocs succeeded, proceed to link it all up */ + com->parent = r->node->parent; + com->pidx = r->node->pidx; + r->node->parent = com; + r->node->pidx = r->str[common_len]-com->offset; + add->parent = com; + add->pidx = addstr[common_len]-com->offset; + com->array[r->node->pidx].node = r->node; + com->array[r->node->pidx].str = s1_str; + com->array[r->node->pidx].len = s1_len; + com->array[add->pidx].node = add; + com->array[add->pidx].str = s2_str; + com->array[add->pidx].len = s2_len; + region_recycle(region, r->str, r->len); + r->str = common_str; + r->len = common_len; + r->node = com; + } + return 1; +} + +struct radnode* radix_insert(struct radtree* rt, uint8_t* k, radstrlen_t len, + void* elem) +{ + struct radnode* n; + radstrlen_t pos = 0; + /* create new element to add */ + struct radnode* add = (struct radnode*)region_alloc_zero(rt->region, + sizeof(*add)); + if(!add) return NULL; /* out of memory */ + add->elem = elem; + + /* find out where to add it */ + if(!radix_find_prefix_node(rt, k, len, &n, &pos)) { + /* new root */ + assert(rt->root == NULL); + if(len == 0) { + rt->root = add; + } else { + /* add a root to point to new node */ + n = (struct radnode*)region_alloc_zero(rt->region, + sizeof(*n)); + if(!n) return NULL; + if(!radnode_array_space(rt->region, n, k[0])) { + region_recycle(rt->region, n->array, + n->capacity*sizeof(struct radsel)); + region_recycle(rt->region, n, sizeof(*n)); + region_recycle(rt->region, add, sizeof(*add)); + return NULL; + } + add->parent = n; + add->pidx = 0; + n->array[0].node = add; + if(len > 1) { + if(!radsel_prefix_remainder(rt->region, 1, k, len, + &n->array[0].str, &n->array[0].len)) { + region_recycle(rt->region, n->array, + n->capacity*sizeof(struct radsel)); + region_recycle(rt->region, n, sizeof(*n)); + region_recycle(rt->region, add, 
sizeof(*add)); + return NULL; + } + } + rt->root = n; + } + } else if(pos == len) { + /* found an exact match */ + if(n->elem) { + /* already exists, failure */ + region_recycle(rt->region, add, sizeof(*add)); + return NULL; + } + n->elem = elem; + region_recycle(rt->region, add, sizeof(*add)); + add = n; + } else { + /* n is a node which can accomodate */ + uint8_t byte; + assert(pos < len); + byte = k[pos]; + + /* see if it falls outside of array */ + if(byte < n->offset || byte-n->offset >= n->len) { + /* make space in the array for it; adjusts offset */ + if(!radnode_array_space(rt->region, n, byte)) { + region_recycle(rt->region, add, sizeof(*add)); + return NULL; + } + assert(byte>=n->offset && byte-n->offset<n->len); + byte -= n->offset; + /* see if more prefix needs to be split off */ + if(pos+1 < len) { + if(!radsel_str_create(rt->region, &n->array[byte], + k, pos+1, len)) { + region_recycle(rt->region, add, sizeof(*add)); + return NULL; + } + } + /* insert the new node in the new bucket */ + add->parent = n; + add->pidx = byte; + n->array[byte].node = add; + /* so a bucket exists and byte falls in it */ + } else if(n->array[byte-n->offset].node == NULL) { + /* use existing bucket */ + byte -= n->offset; + if(pos+1 < len) { + /* split off more prefix */ + if(!radsel_str_create(rt->region, &n->array[byte], + k, pos+1, len)) { + region_recycle(rt->region, add, sizeof(*add)); + return NULL; + } + } + /* insert the new node in the new bucket */ + add->parent = n; + add->pidx = byte; + n->array[byte].node = add; + } else { + /* use bucket but it has a shared prefix, + * split that out and create a new intermediate + * node to split out between the two. + * One of the two might exactmatch the new + * intermediate node */ + if(!radsel_split(rt->region, &n->array[byte-n->offset], + k, pos+1, len, add)) { + region_recycle(rt->region, add, sizeof(*add)); + return NULL; + } + } + } + + rt->count ++; + return add; +} + +/** Delete a radnode */ +static void radnode_delete(struct region* region, struct radnode* n) +{ + unsigned i; + if(!n) return; + for(i=0; i<n->len; i++) { + /* safe to free NULL str */ + region_recycle(region, n->array[i].str, n->array[i].len); + } + region_recycle(region, n->array, n->capacity*sizeof(struct radsel)); + region_recycle(region, n, sizeof(*n)); +} + +/** Cleanup node with one child, it is removed and joined into parent[x] str */ +static int +radnode_cleanup_onechild(struct region* region, struct radnode* n, + struct radnode* par) +{ + uint8_t* join; + radstrlen_t joinlen; + uint8_t pidx = n->pidx; + struct radnode* child = n->array[0].node; + /* node had one child, merge them into the parent. */ + /* keep the child node, so its pointers stay valid. 
*/ + + /* at parent, append child->str to array str */ + assert(pidx < par->len); + joinlen = par->array[pidx].len + n->array[0].len + 1; + join = (uint8_t*)region_alloc(region, joinlen*sizeof(uint8_t)); + if(!join) { + /* cleanup failed due to out of memory */ + /* the tree is inefficient, with node n still existing */ + return 0; + } + /* we know that .str and join are malloced, thus aligned */ + memcpy(join, par->array[pidx].str, par->array[pidx].len); + /* the array lookup is gone, put its character in the lookup string*/ + join[par->array[pidx].len] = child->pidx + n->offset; + /* but join+len may not be aligned */ + memmove(join+par->array[pidx].len+1, n->array[0].str, n->array[0].len); + region_recycle(region, par->array[pidx].str, par->array[pidx].len); + par->array[pidx].str = join; + par->array[pidx].len = joinlen; + /* and set the node to our child. */ + par->array[pidx].node = child; + child->parent = par; + child->pidx = pidx; + /* we are unlinked, delete our node */ + radnode_delete(region, n); + return 1; +} + +/** remove array of nodes */ +static void +radnode_array_clean_all(struct region* region, struct radnode* n) +{ + n->offset = 0; + n->len = 0; + /* shrink capacity */ + region_recycle(region, n->array, n->capacity*sizeof(struct radsel)); + n->array = NULL; + n->capacity = 0; +} + +/** see if capacity can be reduced for the given node array */ +static void +radnode_array_reduce_if_needed(struct region* region, struct radnode* n) +{ + if(n->len <= n->capacity/2 && n->len != n->capacity) { + struct radsel* a = (struct radsel*)region_alloc(region, + sizeof(*a)*n->len); + if(!a) return; + memcpy(a, n->array, sizeof(*a)*n->len); + region_recycle(region, n->array, n->capacity*sizeof(*a)); + n->array = a; + n->capacity = n->len; + } +} + +/** remove NULL nodes from front of array */ +static void +radnode_array_clean_front(struct region* region, struct radnode* n) +{ + /* move them up and adjust offset */ + unsigned idx, shuf = 0; + /* remove until a nonNULL entry */ + while(shuf < n->len && n->array[shuf].node == NULL) + shuf++; + if(shuf == 0) + return; + if(shuf == n->len) { + /* the array is empty, the tree is inefficient */ + radnode_array_clean_all(region, n); + return; + } + assert(shuf < n->len); + assert((int)shuf <= 255-(int)n->offset); + memmove(&n->array[0], &n->array[shuf], + (n->len - shuf)*sizeof(struct radsel)); + n->offset += shuf; + n->len -= shuf; + for(idx=0; idx<n->len; idx++) + if(n->array[idx].node) + n->array[idx].node->pidx = idx; + /* see if capacity can be reduced */ + radnode_array_reduce_if_needed(region, n); +} + +/** remove NULL nodes from end of array */ +static void +radnode_array_clean_end(struct region* region, struct radnode* n) +{ + /* shorten it */ + unsigned shuf = 0; + /* remove until a nonNULL entry */ + while(shuf < n->len && n->array[n->len-1-shuf].node == NULL) + shuf++; + if(shuf == 0) + return; + if(shuf == n->len) { + /* the array is empty, the tree is inefficient */ + radnode_array_clean_all(region, n); + return; + } + assert(shuf < n->len); + n->len -= shuf; + /* array elements can stay where they are */ + /* see if capacity can be reduced */ + radnode_array_reduce_if_needed(region, n); +} + +/** clean up radnode leaf, where we know it has a parent */ +static void +radnode_cleanup_leaf(struct region* region, struct radnode* n, + struct radnode* par) +{ + uint8_t pidx; + /* node was a leaf */ + /* delete leaf node, but store parent+idx */ + pidx = n->pidx; + radnode_delete(region, n); + + /* set parent+idx entry to NULL str and 
node.*/ + assert(pidx < par->len); + region_recycle(region, par->array[pidx].str, par->array[pidx].len); + par->array[pidx].str = NULL; + par->array[pidx].len = 0; + par->array[pidx].node = NULL; + + /* see if par offset or len must be adjusted */ + if(par->len == 1) { + /* removed final element from array */ + radnode_array_clean_all(region, par); + } else if(pidx == 0) { + /* removed first element from array */ + radnode_array_clean_front(region, par); + } else if(pidx == par->len-1) { + /* removed last element from array */ + radnode_array_clean_end(region, par); + } +} + +/** + * Cleanup a radix node that was made smaller, see if it can + * be merged with others. + * @param rt: tree to remove root if needed. + * @param n: node to cleanup + * @return false on alloc failure. + */ +static int +radnode_cleanup(struct radtree* rt, struct radnode* n) +{ + while(n) { + if(n->elem) { + /* cannot delete node with a data element */ + return 1; + } else if(n->len == 1 && n->parent) { + return radnode_cleanup_onechild(rt->region, n, n->parent); + } else if(n->len == 0) { + struct radnode* par = n->parent; + if(!par) { + /* root deleted */ + radnode_delete(rt->region, n); + rt->root = NULL; + return 1; + } + /* remove and delete the leaf node */ + radnode_cleanup_leaf(rt->region, n, par); + /* see if parent can now be cleaned up */ + n = par; + } else { + /* node cannot be cleaned up */ + return 1; + } + } + /* ENOTREACH */ + return 1; +} + +void radix_delete(struct radtree* rt, struct radnode* n) +{ + if(!n) return; + n->elem = NULL; + rt->count --; + if(!radnode_cleanup(rt, n)) { + /* out of memory in cleanup. the elem ptr is NULL, but + * the radix tree could be inefficient. */ + } +} + +struct radnode* radix_search(struct radtree* rt, uint8_t* k, radstrlen_t len) +{ + struct radnode* n = rt->root; + radstrlen_t pos = 0; + uint8_t byte; + while(n) { + if(pos == len) + return n->elem?n:NULL; + byte = k[pos]; + if(byte < n->offset) + return NULL; + byte -= n->offset; + if(byte >= n->len) + return NULL; + pos++; + if(n->array[byte].len != 0) { + /* must match additional string */ + if(pos+n->array[byte].len > len) + return NULL; /* no match */ + if(memcmp(&k[pos], n->array[byte].str, + n->array[byte].len) != 0) + return NULL; /* no match */ + pos += n->array[byte].len; + } + n = n->array[byte].node; + } + return NULL; +} + +/** return self or a previous element */ +static int ret_self_or_prev(struct radnode* n, struct radnode** result) +{ + if(n->elem) + *result = n; + else *result = radix_prev(n); + return 0; +} + +int radix_find_less_equal(struct radtree* rt, uint8_t* k, radstrlen_t len, + struct radnode** result) +{ + struct radnode* n = rt->root; + radstrlen_t pos = 0; + uint8_t byte; + int r; + if(!n) { + /* empty tree */ + *result = NULL; + return 0; + } + while(pos < len) { + byte = k[pos]; + if(byte < n->offset) { + /* so the previous is the element itself */ + /* or something before this element */ + return ret_self_or_prev(n, result); + } + byte -= n->offset; + if(byte >= n->len) { + /* so, the previous is the last of array, or itself */ + /* or something before this element */ + if((*result=radnode_last_in_subtree_incl_self(n))==0) + *result = radix_prev(n); + return 0; + } + pos++; + if(!n->array[byte].node) { + /* no match */ + /* Find an entry in arrays from byte-1 to 0 */ + *result = radnode_find_prev_from_idx(n, byte); + if(*result) + return 0; + /* this entry or something before it */ + return ret_self_or_prev(n, result); + } + if(n->array[byte].len != 0) { + /* must match 
additional string */ + if(pos+n->array[byte].len > len) { + /* the additional string is longer than key*/ + if( (memcmp(&k[pos], n->array[byte].str, + len-pos)) <= 0) { + /* and the key is before this node */ + *result = radix_prev(n->array[byte].node); + } else { + /* the key is after the additional + * string, thus everything in that + * subtree is smaller. */ + *result=radnode_last_in_subtree_incl_self(n->array[byte].node); + /* if somehow that is NULL, + * then we have an inefficient tree: + * byte+1 is larger than us, so find + * something in byte-1 and before */ + if(!*result) + *result = radix_prev(n->array[byte].node); + } + return 0; /* no match */ + } + if( (r=memcmp(&k[pos], n->array[byte].str, + n->array[byte].len)) < 0) { + *result = radix_prev(n->array[byte].node); + return 0; /* no match */ + } else if(r > 0) { + /* the key is larger than the additional + * string, thus everything in that subtree + * is smaller */ + *result=radnode_last_in_subtree_incl_self(n->array[byte].node); + /* if we have an inefficient tree */ + if(!*result) *result = radix_prev(n->array[byte].node); + return 0; /* no match */ + } + pos += n->array[byte].len; + } + n = n->array[byte].node; + } + if(n->elem) { + /* exact match */ + *result = n; + return 1; + } + /* there is a node which is an exact match, but it has no element */ + *result = radix_prev(n); + return 0; +} + + +struct radnode* radix_first(struct radtree* rt) +{ + struct radnode* n; + if(!rt || !rt->root) return NULL; + n = rt->root; + if(n->elem) return n; + return radix_next(n); +} + +struct radnode* radix_last(struct radtree* rt) +{ + if(!rt || !rt->root) return NULL; + return radnode_last_in_subtree_incl_self(rt->root); +} + +struct radnode* radix_next(struct radnode* n) +{ + if(n->len) { + /* go down */ + struct radnode* s = radnode_first_in_subtree(n); + if(s) return s; + } + /* go up - the parent->elem is not useful, because it is before us */ + while(n->parent) { + unsigned idx = n->pidx; + n = n->parent; + idx++; + for(; idx < n->len; idx++) { + /* go down the next branch */ + if(n->array[idx].node) { + struct radnode* s; + /* node itself */ + if(n->array[idx].node->elem) + return n->array[idx].node; + /* or subtree */ + s = radnode_first_in_subtree( + n->array[idx].node); + if(s) return s; + } + } + } + return NULL; +} + +struct radnode* radix_prev(struct radnode* n) +{ + /* must go up, since all array nodes are after this node */ + while(n->parent) { + uint8_t idx = n->pidx; + struct radnode* s; + n = n->parent; + assert(n->len > 0); /* since we are a child */ + /* see if there are elements in previous branches there */ + s = radnode_find_prev_from_idx(n, idx); + if(s) return s; + /* the current node is before the array */ + if(n->elem) + return n; + } + return NULL; +} + +/** convert one character from domain-name to radname */ +static uint8_t char_d2r(uint8_t c) +{ + if(c < 'A') return c+1; /* make space for 00 */ + else if(c <= 'Z') return c-'A'+'a'; /* lowercase */ + else return c; +} + +/** convert one character from radname to domain-name (still lowercased) */ +static uint8_t char_r2d(uint8_t c) +{ + assert(c != 0); /* end of label */ + if(c <= 'A') return c-1; + else return c; +} + +/** copy and convert a range of characters */ +static void cpy_d2r(uint8_t* to, const uint8_t* from, int len) +{ + int i; + for(i=0; i<len; i++) + to[i] = char_d2r(from[i]); +} + +/** copy and convert a range of characters */ +static void cpy_r2d(uint8_t* to, uint8_t* from, uint8_t len) +{ + uint8_t i; + for(i=0; i<len; i++) + to[i] = 
char_r2d(from[i]); +} + +/* radname code: domain to radix-bstring */ +void radname_d2r(uint8_t* k, radstrlen_t* len, const uint8_t* dname, + size_t dlen) +{ + /* the domain name is converted as follows, + * to preserve the normal (NSEC) ordering of domain names. + * lowercased, and 'end-of-label' is a '00' byte, + * bytes 00-'A' are +1 moved to make space for 00 byte. + * final root label is not appended (string ends). + * because the only allowed empty label is the final root label, + * we can also remove the last 00 label-end. + * The total result length is one-or-two less than the dname. + * + * examples (numbers are bytes, letters are ascii): + * - root: dname: 0, radname: '' + * - nl.: dname: 3nl0, radname: 'nl' + * - labs.nl: dname 4labs3nl0, radname: 'nl0labs' + * - x.labs.nl: dname 1x4labs3nl0, radname: 'nl0labs0x' + */ + + /* conversion by putting the label starts on a stack */ + const uint8_t* labstart[130]; + unsigned int lab = 0, kpos, dpos = 0; + /* sufficient space */ + assert(k && dname); + assert(dlen <= 256); /* and therefore not more than 128 labels */ + assert(*len >= dlen); + assert(dlen > 0); /* even root label has dlen=1 */ + + /* root */ + if(dlen == 1) { + assert(dname[0] == 0); + *len = 0; + return; + } + + /* walk through domain name and remember label positions */ + do { + /* compression pointers not allowed */ + if((dname[dpos] & 0xc0)) { + *len = 0; + return; /* format error */ + } + labstart[lab++] = &dname[dpos]; + if(dpos + dname[dpos] + 1 >= dlen) { + *len = 0; + return; /* format error */ + } + /* skip the label contents */ + dpos += dname[dpos]; + dpos ++; + } while(dname[dpos] != 0); + /* exit condition makes root label not in labelstart stack */ + /* because the root was handled before, we know there is some text */ + assert(lab > 0); + lab-=1; + kpos = *labstart[lab]; + cpy_d2r(k, labstart[lab]+1, kpos); + /* if there are more labels, copy them over */ + while(lab) { + /* put 'end-of-label' 00 to end previous label */ + k[kpos++]=0; + /* append the label */ + lab--; + cpy_d2r(k+kpos, labstart[lab]+1, *labstart[lab]); + kpos += *labstart[lab]; + } + /* done */ + assert(kpos == dlen-2); /* no rootlabel, one less label-marker */ + *len = kpos; +} + +/* radname code: radix-bstring to domain */ +void radname_r2d(uint8_t* k, radstrlen_t len, uint8_t* dname, size_t* dlen) +{ + /* find labels and push on stack */ + uint8_t* labstart[130]; + uint8_t lablen[130]; + unsigned int lab = 0, dpos, kpos = 0; + /* sufficient space */ + assert(k && dname); + assert((size_t)*dlen >= (size_t)len+2); + assert(len <= 256); + /* root label */ + if(len == 0) { + assert(*dlen > 0); + dname[0]=0; + *dlen=1; + return; + } + /* find labels */ + while(kpos < len) { + lablen[lab]=0; + labstart[lab]=&k[kpos]; + /* skip to next label */ + while(kpos < len && k[kpos] != 0) { + lablen[lab]++; + kpos++; + } + lab++; + /* skip 00 byte for label-end */ + if(kpos < len) { + assert(k[kpos] == 0); + kpos++; + } + } + /* copy the labels over to the domain name */ + dpos = 0; + while(lab) { + lab--; + /* label length */ + dname[dpos++] = lablen[lab]; + /* label content */ + cpy_r2d(dname+dpos, labstart[lab], lablen[lab]); + dpos += lablen[lab]; + } + /* append root label */ + dname[dpos++] = 0; + /* assert the domain name is wellformed */ + assert((int)dpos == (int)len+2); + assert(dname[dpos-1] == 0); /* ends with root label */ + *dlen = dpos; +} + +/** insert by domain name */ +struct radnode* +radname_insert(struct radtree* rt, const uint8_t* d, size_t max, void* elem) +{ + /* convert and 
insert */ + uint8_t radname[300]; + radstrlen_t len = (radstrlen_t)sizeof(radname); + if(max > sizeof(radname)) + return NULL; /* too long */ + radname_d2r(radname, &len, d, max); + return radix_insert(rt, radname, len, elem); +} + +/** delete by domain name */ +void +radname_delete(struct radtree* rt, const uint8_t* d, size_t max) +{ + /* search and remove */ + struct radnode* n = radname_search(rt, d, max); + if(n) radix_delete(rt, n); +} + +/* search for exact match of domain name, converted to radname in tree */ +struct radnode* radname_search(struct radtree* rt, const uint8_t* d, + size_t max) +{ + /* stack of labels in the domain name */ + const uint8_t* labstart[130]; + unsigned int lab, dpos, lpos; + struct radnode* n = rt->root; + uint8_t byte; + radstrlen_t i; + uint8_t b; + + /* search for root? it is '' */ + if(max < 1) + return NULL; + if(d[0] == 0) { + if(!n) return NULL; + return n->elem?n:NULL; + } + + /* find labels stack in domain name */ + lab = 0; + dpos = 0; + /* must have one label, since root is specialcased */ + do { + if((d[dpos] & 0xc0)) + return NULL; /* compression ptrs not allowed error */ + labstart[lab++] = &d[dpos]; + if(dpos + d[dpos] + 1 >= max) + return NULL; /* format error: outside of bounds */ + /* skip the label contents */ + dpos += d[dpos]; + dpos ++; + } while(d[dpos] != 0); + /* exit condition makes that root label is not in the labstarts */ + /* now: dpos+1 is length of domain name. lab is number of labels-1 */ + + /* start processing at the last label */ + lab-=1; + lpos = 0; + while(n) { + /* fetch next byte this label */ + if(lpos < *labstart[lab]) + /* lpos+1 to skip labelstart, lpos++ to move forward */ + byte = char_d2r(labstart[lab][++lpos]); + else { + if(lab == 0) /* last label - we're done */ + return n->elem?n:NULL; + /* next label, search for byte 00 */ + lpos = 0; + lab--; + byte = 0; + } + /* find that byte in the array */ + if(byte < n->offset) + return NULL; + byte -= n->offset; + if(byte >= n->len) + return NULL; + if(n->array[byte].len != 0) { + /* must match additional string */ + /* see how many bytes we need and start matching them*/ + for(i=0; i<n->array[byte].len; i++) { + /* next byte to match */ + if(lpos < *labstart[lab]) + b = char_d2r(labstart[lab][++lpos]); + else { + /* if last label, no match since + * we are in the additional string */ + if(lab == 0) + return NULL; + /* next label, search for byte 00 */ + lpos = 0; + lab--; + b = 0; + } + if(n->array[byte].str[i] != b) + return NULL; /* not matched */ + } + } + n = n->array[byte].node; + } + return NULL; +} + +/* find domain name or smaller or equal domain name in radix tree */ +int radname_find_less_equal(struct radtree* rt, const uint8_t* d, size_t max, + struct radnode** result) +{ + /* stack of labels in the domain name */ + const uint8_t* labstart[130]; + unsigned int lab, dpos, lpos; + struct radnode* n = rt->root; + uint8_t byte; + radstrlen_t i; + uint8_t b; + + /* empty tree */ + if(!n) { + *result = NULL; + return 0; + } + + /* search for root? 
it is '' */ + if(max < 1) { + *result = NULL; + return 0; /* parse error, out of bounds */ + } + if(d[0] == 0) { + if(n->elem) { + *result = n; + return 1; + } + /* no smaller element than the root */ + *result = NULL; + return 0; + } + + /* find labels stack in domain name */ + lab = 0; + dpos = 0; + /* must have one label, since root is specialcased */ + do { + if((d[dpos] & 0xc0)) { + *result = NULL; + return 0; /* compression ptrs not allowed error */ + } + labstart[lab++] = &d[dpos]; + if(dpos + d[dpos] + 1 >= max) { + *result = NULL; /* format error: outside of bounds */ + return 0; + } + /* skip the label contents */ + dpos += d[dpos]; + dpos ++; + } while(d[dpos] != 0); + /* exit condition makes that root label is not in the labstarts */ + /* now: dpos+1 is length of domain name. lab is number of labels-1 */ + + /* start processing at the last label */ + lab-=1; + lpos = 0; + while(1) { + /* fetch next byte this label */ + if(lpos < *labstart[lab]) + /* lpos+1 to skip labelstart, lpos++ to move forward */ + byte = char_d2r(labstart[lab][++lpos]); + else { + if(lab == 0) { + /* last label - we're done */ + /* exact match */ + if(n->elem) { + *result = n; + return 1; + } + /* there is a node which is an exact match, + * but there no element in it */ + *result = radix_prev(n); + return 0; + } + /* next label, search for byte 0 the label separator */ + lpos = 0; + lab--; + byte = 0; + } + /* find that byte in the array */ + if(byte < n->offset) + /* so the previous is the element itself */ + /* or something before this element */ + return ret_self_or_prev(n, result); + byte -= n->offset; + if(byte >= n->len) { + /* so, the previous is the last of array, or itself */ + /* or something before this element */ + *result = radnode_last_in_subtree_incl_self(n); + if(!*result) + *result = radix_prev(n); + return 0; + } + if(!n->array[byte].node) { + /* no match */ + /* Find an entry in arrays from byte-1 to 0 */ + *result = radnode_find_prev_from_idx(n, byte); + if(*result) + return 0; + /* this entry or something before it */ + return ret_self_or_prev(n, result); + } + if(n->array[byte].len != 0) { + /* must match additional string */ + /* see how many bytes we need and start matching them*/ + for(i=0; i<n->array[byte].len; i++) { + /* next byte to match */ + if(lpos < *labstart[lab]) + b = char_d2r(labstart[lab][++lpos]); + else { + /* if last label, no match since + * we are in the additional string */ + if(lab == 0) { + /* dname ended, thus before + * this array element */ + *result =radix_prev( + n->array[byte].node); + return 0; + } + /* next label, search for byte 00 */ + lpos = 0; + lab--; + b = 0; + } + if(b < n->array[byte].str[i]) { + *result =radix_prev( + n->array[byte].node); + return 0; + } else if(b > n->array[byte].str[i]) { + /* the key is after the additional, + * so everything in its subtree is + * smaller */ + *result = radnode_last_in_subtree_incl_self(n->array[byte].node); + /* if that is NULL, we have an + * inefficient tree, find in byte-1*/ + if(!*result) + *result = radix_prev(n->array[byte].node); + return 0; + } + } + } + n = n->array[byte].node; + } + /* ENOTREACH */ + return 0; +} + diff --git a/usr.sbin/nsd/radtree.h b/usr.sbin/nsd/radtree.h new file mode 100644 index 00000000000..6f54de01641 --- /dev/null +++ b/usr.sbin/nsd/radtree.h @@ -0,0 +1,244 @@ +/* + * radtree -- generic radix tree for binary strings. + * + * Copyright (c) 2010, NLnet Labs. See LICENSE for license. 
+ */ +#ifndef RADTREE_H +#define RADTREE_H + +struct radnode; +struct region; + +/** length of the binary string */ +typedef uint16_t radstrlen_t; + +/** + * The radix tree + * + * The elements are stored based on binary strings(0-255) of a given length. + * They are sorted, a prefix is sorted before its suffixes. + * If you want to know the key string, you should store it yourself, the + * tree stores it in the parts necessary for lookup. + * For binary strings for domain names see the radname routines. + */ +struct radtree { + /** root node in tree */ + struct radnode* root; + /** count of number of elements */ + size_t count; + /** region for allocation */ + struct region* region; +}; + +/** + * A radix tree lookup node. + * The array is malloced separately from the radnode. + */ +struct radnode { + /** data element associated with the binary string up to this node */ + void* elem; + /** parent node (NULL for the root) */ + struct radnode* parent; + /** index in the parent lookup array */ + uint8_t pidx; + /** offset of the lookup array, add to [i] for lookups */ + uint8_t offset; + /** length of the lookup array */ + uint16_t len; + /** capacity of the lookup array (can be larger than length) */ + uint16_t capacity; + /** the lookup array by [byte-offset] */ + struct radsel* array; +}; + +/** + * radix select edge in array + */ +struct radsel { + /** additional string after the selection-byte for this edge. */ + uint8_t* str; + /** length of the additional string for this edge */ + radstrlen_t len; + /** node that deals with byte+str */ + struct radnode* node; +}; + +/** + * Create new radix tree + * @param region: where to allocate the tree. + * @return new tree or NULL on alloc failure. + */ +struct radtree* radix_tree_create(struct region* region); + +/** + * Init new radix tree. + * @param rt: radix tree to be initialized. + */ +void radix_tree_init(struct radtree* rt); + +/** + * Delete intermediate nodes from radix tree + * @param rt: radix tree to be initialized. + */ +void radix_tree_clear(struct radtree* rt); + +/** + * Delete radix tree. + * @param rt: radix tree to be deleted. + */ +void radix_tree_delete(struct radtree* rt); + + +/** + * Insert element into radix tree. + * @param rt: the radix tree. + * @param key: key string. + * @param len: length of key. + * @param elem: pointer to element data. + * @return NULL on failure - out of memory. + * NULL on failure - duplicate entry. + * On success the new radix node for this element. + */ +struct radnode* radix_insert(struct radtree* rt, uint8_t* k, radstrlen_t len, + void* elem); + +/** + * Delete element from radix tree. + * @param rt: the radix tree. + * @param n: radix node for that element. + * if NULL, nothing is deleted. + */ +void radix_delete(struct radtree* rt, struct radnode* n); + +/** + * Find radix element in tree. + * @param rt: the radix tree. + * @param key: key string. + * @param len: length of key. + * @return the radix node or NULL if not found. + */ +struct radnode* radix_search(struct radtree* rt, uint8_t* k, radstrlen_t len); + +/** + * Find radix element in tree, and if not found, find the closest smaller or + * equal element in the tree. + * @param rt: the radix tree. + * @param key: key string. + * @param len: length of key. + * @param result: returns the radix node or closest match (NULL if key is + * smaller than the smallest key in the tree). + * @return true if exact match, false if no match. 
+ */ +int radix_find_less_equal(struct radtree* rt, uint8_t* k, radstrlen_t len, + struct radnode** result); + +/** + * Return the first (smallest) element in the tree. + * @param rt: the radix tree. + * @return: first node or NULL if none. + */ +struct radnode* radix_first(struct radtree* rt); + +/** + * Return the last (largest) element in the tree. + * @param rt: the radix tree. + * @return: last node or NULL if none. + */ +struct radnode* radix_last(struct radtree* rt); + +/** + * Return the next element. + * @param n: the element to go from. + * @return: next node or NULL if none. + */ +struct radnode* radix_next(struct radnode* n); + +/** + * Return the previous element. + * @param n: the element to go from. + * @return: prev node or NULL if none. + */ +struct radnode* radix_prev(struct radnode* n); + +/* + * Perform a walk through all elements of the tree. + * node: variable of type struct radnode*. + * tree: pointer to the tree. + * for(node=radix_first(tree); node; node=radix_next(node)) +*/ + +/** + * Create a binary string to represent a domain name + * @param k: string buffer to store into + * @param len: output length, initially, the max, output the result. + * @param dname: the domain name to convert, in wireformat. + * @param dlen: length of space for dname. + */ +void radname_d2r(uint8_t* k, radstrlen_t* len, const uint8_t* dname, + size_t dlen); + +/** + * Convert a binary string back to a domain name. + * @param k: the binary string. + * @param len: length of k. + * @param dname: buffer to store domain name into. + * @param dlen: length of dname (including root label). + */ +void radname_r2d(uint8_t* k, radstrlen_t len, uint8_t* dname, size_t* dlen); + +/** + * Search the radix tree using a domain name. + * The name is internally converted to a radname. + * @param rt: tree + * @param d: domain name, no compression pointers allowed. + * @param max: max length to go from d. + * @return NULL on parse error or if not found. + */ +struct radnode* radname_search(struct radtree* rt, const uint8_t* d, + size_t max); + +/** + * Find radix element in tree by domain name, and if not found, + * find the closest smaller or equal element in the tree. + * The name is internally converted to a radname (same sorting order). + * @param rt: the radix tree. + * @param d: domain name, no compression pointers allowed. + * @param max: max length to go from d. + * @param result: returns the radix node or closest match (NULL if key is + * smaller than the smallest key in the tree). + * could result in NULL on a parse error as well (with return false). + * @return true if exact match, false if no match. + */ +int radname_find_less_equal(struct radtree* rt, const uint8_t* d, size_t max, + struct radnode** result); + +/** + * Insert radix element by domain name. + * @param rt: the radix tree + * @param d: domain name, no compression pointers. + * @param max: max length from d. + * @param elem: the element pointer to insert. + * @return NULL on failure - out of memory. + * NULL on failure - duplicate entry. + * NULL on failure - parse error. + * On success the radix node for this element. + */ +struct radnode* radname_insert(struct radtree* rt, const uint8_t* d, + size_t max, void* elem); + +/** + * Delete element by domain name from radix tree. + * @param rt: the radix tree. + * @param d: the domain name. If it is not in the tree nothing happens. + * @param max: max length. 
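A small round-trip sketch (illustrative only, not part of the imported sources) for the wireformat name labs.nl, showing the reversed-label encoding that radname_d2r produces and radname_r2d undoes:

#include <assert.h>
#include <stdint.h>
#include <string.h>
#include "radtree.h"

static void
radname_roundtrip_sketch(void)
{
	/* wireformat for labs.nl: [4] l a b s [2] n l [0] */
	const uint8_t dname[] = { 4,'l','a','b','s', 2,'n','l', 0 };
	uint8_t key[300], back[300];
	radstrlen_t klen = sizeof(key);
	size_t dlen = sizeof(back);

	radname_d2r(key, &klen, dname, sizeof(dname));
	/* result is "nl" 0 "labs": labels reversed, 00 separators, no root
	 * label, hence two bytes shorter than the wireformat name */
	assert(klen == sizeof(dname) - 2);

	radname_r2d(key, klen, back, &dlen);
	assert(dlen == sizeof(dname));
	assert(memcmp(back, dname, dlen) == 0);   /* input was already lowercase */
}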
+ */ +void radname_delete(struct radtree* rt, const uint8_t* d, size_t max); + +/** number of bytes in common in strings */ +radstrlen_t bstr_common_ext(uint8_t* x, radstrlen_t xlen, uint8_t* y, + radstrlen_t ylen); +/** true if one is prefix of the other */ +int bstr_is_prefix_ext(uint8_t* p, radstrlen_t plen, uint8_t* x, + radstrlen_t xlen); + +#endif /* RADTREE_H */ diff --git a/usr.sbin/nsd/rbtree.c b/usr.sbin/nsd/rbtree.c index c7d384fe66d..80f7bbb2b6e 100644 --- a/usr.sbin/nsd/rbtree.c +++ b/usr.sbin/nsd/rbtree.c @@ -1,7 +1,7 @@ /* * rbtree.c -- generic red black tree * - * Copyright (c) 2001-2011, NLnet Labs. All rights reserved. + * Copyright (c) 2001-2006, NLnet Labs. All rights reserved. * * See LICENSE for the license. * @@ -551,39 +551,3 @@ rbtree_previous(rbnode_t *node) } return node; } - - -/** - * Given an rbtree "root" node, find the first node under that tree in - * postorder. - */ -rbnode_t * -rbtree_postorder_first(rbnode_t *root) -{ - rbnode_t *node = root; - do { - while (node->left != RBTREE_NULL) { - node = node->left; - } - while ((node->left == RBTREE_NULL) && - (node->right != RBTREE_NULL)) { - node = node->right; - } - } while (node->left != node->right); - return node; -} - - -/** - * Given any node in an rbtree, find the next node in postorder. - */ -rbnode_t * -rbtree_postorder_next(rbnode_t *node) -{ - if ((node->parent->right == RBTREE_NULL) || - (node->parent->right == node)) - node = node->parent; - else - node = rbtree_postorder_first(node->parent->right); - return node; -} diff --git a/usr.sbin/nsd/rbtree.h b/usr.sbin/nsd/rbtree.h index 028d715397c..a381cf0788f 100644 --- a/usr.sbin/nsd/rbtree.h +++ b/usr.sbin/nsd/rbtree.h @@ -1,7 +1,7 @@ /* * rbtree.h -- generic red-black tree * - * Copyright (c) 2001-2011, NLnet Labs. All rights reserved. + * Copyright (c) 2001-2006, NLnet Labs. All rights reserved. * * See LICENSE for the license. * @@ -60,8 +60,6 @@ rbnode_t *rbtree_first(rbtree_t *rbtree); rbnode_t *rbtree_last(rbtree_t *rbtree); rbnode_t *rbtree_next(rbnode_t *rbtree); rbnode_t *rbtree_previous(rbnode_t *rbtree); -rbnode_t *rbtree_postorder_first(rbnode_t *root); -rbnode_t *rbtree_postorder_next(rbnode_t *node); #define RBTREE_WALK(rbtree, k, d) \ for((rbtree)->_node = rbtree_first(rbtree);\ diff --git a/usr.sbin/nsd/rdata.h b/usr.sbin/nsd/rdata.h index 0cddb16e0f4..0da8eab6ec0 100644 --- a/usr.sbin/nsd/rdata.h +++ b/usr.sbin/nsd/rdata.h @@ -1,7 +1,7 @@ /* * rdata.h -- RDATA conversion functions. * - * Copyright (c) 2001-2011, NLnet Labs. All rights reserved. + * Copyright (c) 2001-2006, NLnet Labs. All rights reserved. * * See LICENSE for the license. * diff --git a/usr.sbin/nsd/region-allocator.h b/usr.sbin/nsd/region-allocator.h index a047a1dfc5a..7a7bfe96f2a 100644 --- a/usr.sbin/nsd/region-allocator.h +++ b/usr.sbin/nsd/region-allocator.h @@ -1,7 +1,7 @@ /* * region-allocator.h -- region based memory allocator. * - * Copyright (c) 2001-2011, NLnet Labs. All rights reserved. + * Copyright (c) 2001-2006, NLnet Labs. All rights reserved. * * See LICENSE for the license. * @@ -129,6 +129,10 @@ void region_dump_stats(region_type *region, FILE *out); /* get size of recyclebin */ size_t region_get_recycle_size(region_type* region); +/* get size of region memory in use */ +size_t region_get_mem(region_type* region); +/* get size of region memory unused */ +size_t region_get_mem_unused(region_type* region); /* Debug print REGION statistics to LOG. 
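The two new region getters pair with the existing recycle-bin query; a trivial reporting sketch (not part of the import):

#include <stdio.h>
#include "region-allocator.h"

/* Print a one-line memory summary for a region. */
static void
report_region_mem(region_type* region)
{
	printf("region: %lu bytes in use, %lu unused, %lu in recycle bin\n",
		(unsigned long) region_get_mem(region),
		(unsigned long) region_get_mem_unused(region),
		(unsigned long) region_get_recycle_size(region));
}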
*/ void region_log_stats(region_type *region); diff --git a/usr.sbin/nsd/remote.c b/usr.sbin/nsd/remote.c new file mode 100644 index 00000000000..d4858d2202a --- /dev/null +++ b/usr.sbin/nsd/remote.c @@ -0,0 +1,1943 @@ +/* + * remote.c - remote control for the NSD daemon. + * + * Copyright (c) 2008, NLnet Labs. All rights reserved. + * + * This software is open source. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * Neither the name of the NLNET LABS nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file + * + * This file contains the remote control functionality for the daemon. + * The remote control can be performed using either the commandline + * nsd-control tool, or a SSLv3/TLS capable web browser. + * The channel is secured using SSLv3 or TLSv1, and certificates. + * Both the server and the client(control tool) have their own keys. + */ +#include "config.h" +#ifdef HAVE_SSL + +#ifdef HAVE_OPENSSL_SSL_H +#include "openssl/ssl.h" +#endif +#ifdef HAVE_OPENSSL_ERR_H +#include <openssl/err.h> +#endif +#include <ctype.h> +#include <unistd.h> +#include <assert.h> +#include <fcntl.h> +#ifndef USE_MINI_EVENT +#include <event.h> +#else +#include "mini_event.h" +#endif +#include "remote.h" +#include "util.h" +#include "xfrd.h" +#include "xfrd-notify.h" +#include "xfrd-tcp.h" +#include "nsd.h" +#include "options.h" +#include "difffile.h" +#include "xfrd.h" +#include "ipc.h" + +#ifdef HAVE_SYS_TYPES_H +# include <sys/types.h> +#endif +#ifdef HAVE_NETDB_H +#include <netdb.h> +#endif + +/** number of seconds timeout on incoming remote control handshake */ +#define REMOTE_CONTROL_TCP_TIMEOUT 120 + +/** repattern to master or slave */ +#define REPAT_SLAVE 1 +#define REPAT_MASTER 2 + +/** if you want zero to be inhibited in stats output. 
+ * it omits zeroes for types that have no acronym and unused-rcodes */ +const int inhibit_zero = 1; + +/** + * a busy control command connection, SSL state + * Defined here to keep the definition private, and keep SSL out of the .h + */ +struct rc_state { + /** the next item in list */ + struct rc_state* next, *prev; + /* if the event was added to the event_base */ + int event_added; + /** the commpoint */ + struct event c; + /** timeout for this state */ + struct timeval tval; + /** in the handshake part */ + enum { rc_none, rc_hs_read, rc_hs_write } shake_state; + /** the ssl state */ + SSL* ssl; + /** the rc this is part of */ + struct daemon_remote* rc; + /** stats list next item */ + struct rc_state* stats_next; + /** stats list indicator (0 is not part of stats list, 1 is stats, + * 2 is stats_noreset. */ + int in_stats_list; +}; + +/** + * list of events for accepting connections + */ +struct acceptlist { + struct acceptlist* next; + int event_added; + struct event c; +}; + +/** + * The remote control state. + */ +struct daemon_remote { + /** the master process for this remote control */ + struct xfrd_state* xfrd; + /** commpoints for accepting remote control connections */ + struct acceptlist* accept_list; + /** number of active commpoints that are handling remote control */ + int active; + /** max active commpoints */ + int max_active; + /** current commpoints busy; double linked, malloced */ + struct rc_state* busy_list; + /** commpoints waiting for stats to complete (also in busy_list) */ + struct rc_state* stats_list; + /** last time stats was reported */ + struct timeval stats_time, boot_time; + /** the SSL context for creating new SSL streams */ + SSL_CTX* ctx; +}; + +/** + * Print fixed line of text over ssl connection in blocking mode + * @param ssl: print to + * @param text: the text. + * @return false on connection failure. + */ +static int ssl_print_text(SSL* ssl, const char* text); + +/** + * printf style printing to the ssl connection + * @param ssl: the SSL connection to print to. Blocking. + * @param format: printf style format string. + * @return success or false on a network failure. + */ +static int ssl_printf(SSL* ssl, const char* format, ...) + ATTR_FORMAT(printf, 2, 3); + +/** + * Read until \n is encountered + * If SSL signals EOF, the string up to then is returned (without \n). + * @param ssl: the SSL connection to read from. blocking. + * @param buf: buffer to read to. + * @param max: size of buffer. + * @return false on connection failure. 
+ */ +static int ssl_read_line(SSL* ssl, char* buf, size_t max); + +/** perform the accept of a new remote control connection */ +static void +remote_accept_callback(int fd, short event, void* arg); + +/** perform remote control */ +static void +remote_control_callback(int fd, short event, void* arg); + + +/** ---- end of private defines ---- **/ + + +/** log ssl crypto err */ +static void +log_crypto_err(const char* str) +{ + /* error:[error code]:[library name]:[function name]:[reason string] */ + char buf[128]; + unsigned long e; + ERR_error_string_n(ERR_get_error(), buf, sizeof(buf)); + log_msg(LOG_ERR, "%s crypto %s", str, buf); + while( (e=ERR_get_error()) ) { + ERR_error_string_n(e, buf, sizeof(buf)); + log_msg(LOG_ERR, "and additionally crypto %s", buf); + } +} + +#ifdef BIND8_STATS +/** subtract timers and the values do not overflow or become negative */ +static void +timeval_subtract(struct timeval* d, const struct timeval* end, + const struct timeval* start) +{ +#ifndef S_SPLINT_S + time_t end_usec = end->tv_usec; + d->tv_sec = end->tv_sec - start->tv_sec; + if(end_usec < start->tv_usec) { + end_usec += 1000000; + d->tv_sec--; + } + d->tv_usec = end_usec - start->tv_usec; +#endif +} +#endif /* BIND8_STATS */ + +struct daemon_remote* +daemon_remote_create(nsd_options_t* cfg) +{ + char* s_cert; + char* s_key; + struct daemon_remote* rc = (struct daemon_remote*)xalloc_zero( + sizeof(*rc)); + rc->max_active = 10; + assert(cfg->control_enable); + + /* init SSL library */ + ERR_load_crypto_strings(); + ERR_load_SSL_strings(); + OpenSSL_add_all_algorithms(); + (void)SSL_library_init(); + + rc->ctx = SSL_CTX_new(SSLv23_server_method()); + if(!rc->ctx) { + log_crypto_err("could not SSL_CTX_new"); + free(rc); + return NULL; + } + /* no SSLv2 because has defects */ + if(!(SSL_CTX_set_options(rc->ctx, SSL_OP_NO_SSLv2) & SSL_OP_NO_SSLv2)){ + log_crypto_err("could not set SSL_OP_NO_SSLv2"); + daemon_remote_delete(rc); + return NULL; + } + s_cert = cfg->server_cert_file; + s_key = cfg->server_key_file; + VERBOSITY(2, (LOG_INFO, "setup SSL certificates")); + if (!SSL_CTX_use_certificate_file(rc->ctx,s_cert,SSL_FILETYPE_PEM)) { + log_msg(LOG_ERR, "Error for server-cert-file: %s", s_cert); + log_crypto_err("Error in SSL_CTX use_certificate_file"); + goto setup_error; + } + if(!SSL_CTX_use_PrivateKey_file(rc->ctx,s_key,SSL_FILETYPE_PEM)) { + log_msg(LOG_ERR, "Error for server-key-file: %s", s_key); + log_crypto_err("Error in SSL_CTX use_PrivateKey_file"); + goto setup_error; + } + if(!SSL_CTX_check_private_key(rc->ctx)) { + log_msg(LOG_ERR, "Error for server-key-file: %s", s_key); + log_crypto_err("Error in SSL_CTX check_private_key"); + goto setup_error; + } + if(!SSL_CTX_load_verify_locations(rc->ctx, s_cert, NULL)) { + log_crypto_err("Error setting up SSL_CTX verify locations"); + setup_error: + daemon_remote_delete(rc); + return NULL; + } + SSL_CTX_set_client_CA_list(rc->ctx, SSL_load_client_CA_file(s_cert)); + SSL_CTX_set_verify(rc->ctx, SSL_VERIFY_PEER, NULL); + + /* and try to open the ports */ + if(!daemon_remote_open_ports(rc, cfg)) { + log_msg(LOG_ERR, "could not open remote control port"); + goto setup_error; + } + + if(gettimeofday(&rc->boot_time, NULL) == -1) + log_msg(LOG_ERR, "gettimeofday: %s", strerror(errno)); + rc->stats_time = rc->boot_time; + + return rc; +} + +void daemon_remote_close(struct daemon_remote* rc) +{ + struct rc_state* p, *np; + struct acceptlist* h, *nh; + if(!rc) return; + + /* close listen sockets */ + h = rc->accept_list; + while(h) { + nh = h->next; + 
if(h->event_added) + event_del(&h->c); + close(h->c.ev_fd); + free(h); + h = nh; + } + rc->accept_list = NULL; + + /* close busy connection sockets */ + p = rc->busy_list; + while(p) { + np = p->next; + if(p->event_added) + event_del(&p->c); + if(p->ssl) + SSL_free(p->ssl); + close(p->c.ev_fd); + free(p); + p = np; + } + rc->busy_list = NULL; + rc->active = 0; +} + +void daemon_remote_delete(struct daemon_remote* rc) +{ + if(!rc) return; + daemon_remote_close(rc); + if(rc->ctx) { + SSL_CTX_free(rc->ctx); + } + free(rc); +} + +static int +create_tcp_accept_sock(struct addrinfo* addr, int* noproto) +{ +#if defined(SO_REUSEADDR) || (defined(INET6) && (defined(IPV6_V6ONLY) || defined(IPV6_USE_MIN_MTU) || defined(IPV6_MTU))) + int on = 1; +#endif + int s; + *noproto = 0; + if ((s = socket(addr->ai_family, addr->ai_socktype, 0)) == -1) { +#if defined(INET6) + if (addr->ai_family == AF_INET6 && + errno == EAFNOSUPPORT) { + *noproto = 1; + log_msg(LOG_WARNING, "fallback to TCP4, no IPv6: not supported"); + return -1; + } +#endif /* INET6 */ + log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno)); + return -1; + } +#ifdef SO_REUSEADDR + if (setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) < 0) { + log_msg(LOG_ERR, "setsockopt(..., SO_REUSEADDR, ...) failed: %s", strerror(errno)); + } +#endif /* SO_REUSEADDR */ +#if defined(INET6) && defined(IPV6_V6ONLY) + if (addr->ai_family == AF_INET6 && + setsockopt(s, IPPROTO_IPV6, IPV6_V6ONLY, &on, sizeof(on)) < 0) + { + log_msg(LOG_ERR, "setsockopt(..., IPV6_V6ONLY, ...) failed: %s", strerror(errno)); + return -1; + } +#endif + /* set it nonblocking */ + /* (StevensUNP p463), if tcp listening socket is blocking, then + it may block in accept, even if select() says readable. */ + if (fcntl(s, F_SETFL, O_NONBLOCK) == -1) { + log_msg(LOG_ERR, "cannot fcntl tcp: %s", strerror(errno)); + } + /* Bind it... */ + if (bind(s, (struct sockaddr *)addr->ai_addr, addr->ai_addrlen) != 0) { + log_msg(LOG_ERR, "can't bind tcp socket: %s", strerror(errno)); + return -1; + } + /* Listen to it... */ + if (listen(s, TCP_BACKLOG_REMOTE) == -1) { + log_msg(LOG_ERR, "can't listen: %s", strerror(errno)); + return -1; + } + return s; +} + +/** + * Add and open a new control port + * @param rc: rc with result list. + * @param ip: ip str + * @param nr: port nr + * @param noproto_is_err: if lack of protocol support is an error. + * @return false on failure. 
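For orientation, a client-side counterpart to the server setup in daemon_remote_create() above (illustrative only, not part of the import): the server presents server-cert-file and verifies clients against that same certificate, so a control client loads its own key pair and trusts the server certificate. The address, port and file names below are placeholders, and the command protocol spoken after the handshake is not shown here.

#include <openssl/ssl.h>
#include <openssl/err.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <string.h>
#include <unistd.h>

/* Connect to ip:port and complete the mutually authenticated TLS handshake.
 * Returns an established SSL* (caller uses SSL_write/SSL_read, then
 * SSL_shutdown), or NULL on failure; error reporting is trimmed. */
static SSL*
control_client_connect(const char* ip, int port, const char* client_cert,
	const char* client_key, const char* server_cert)
{
	struct sockaddr_in sa;
	int fd;
	SSL_CTX* ctx;
	SSL* ssl;

	SSL_library_init();
	SSL_load_error_strings();
	ctx = SSL_CTX_new(SSLv23_client_method());
	if(!ctx) return NULL;
	/* present the client certificate, verify the server certificate */
	if(!SSL_CTX_use_certificate_file(ctx, client_cert, SSL_FILETYPE_PEM) ||
	   !SSL_CTX_use_PrivateKey_file(ctx, client_key, SSL_FILETYPE_PEM) ||
	   !SSL_CTX_load_verify_locations(ctx, server_cert, NULL)) {
		SSL_CTX_free(ctx);
		return NULL;
	}
	SSL_CTX_set_verify(ctx, SSL_VERIFY_PEER, NULL);

	memset(&sa, 0, sizeof(sa));
	sa.sin_family = AF_INET;
	sa.sin_port = htons(port);
	inet_pton(AF_INET, ip, &sa.sin_addr);
	fd = socket(AF_INET, SOCK_STREAM, 0);
	if(fd == -1 || connect(fd, (struct sockaddr*)&sa, sizeof(sa)) == -1) {
		if(fd != -1) close(fd);
		SSL_CTX_free(ctx);
		return NULL;
	}

	ssl = SSL_new(ctx);
	SSL_CTX_free(ctx);                 /* the SSL keeps its own reference */
	if(!ssl || !SSL_set_fd(ssl, fd) || SSL_connect(ssl) <= 0) {
		if(ssl) SSL_free(ssl);
		close(fd);
		return NULL;
	}
	return ssl;
}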
+ */ +static int +add_open(struct daemon_remote* rc, const char* ip, int nr, int noproto_is_err) +{ + struct addrinfo hints; + struct addrinfo* res; + struct acceptlist* hl; + int noproto; + int fd, r; + char port[15]; + snprintf(port, sizeof(port), "%d", nr); + port[sizeof(port)-1]=0; + memset(&hints, 0, sizeof(hints)); + hints.ai_socktype = SOCK_STREAM; + hints.ai_flags = AI_PASSIVE | AI_NUMERICHOST; + if((r = getaddrinfo(ip, port, &hints, &res)) != 0 || !res) { + log_msg(LOG_ERR, "control interface %s:%s getaddrinfo: %s %s", + ip?ip:"default", port, gai_strerror(r), +#ifdef EAI_SYSTEM + r==EAI_SYSTEM?(char*)strerror(errno):"" +#else + "" +#endif + ); + return 0; + } + + /* open fd */ + fd = create_tcp_accept_sock(res, &noproto); + freeaddrinfo(res); + if(fd == -1 && noproto) { + if(!noproto_is_err) + return 1; /* return success, but do nothing */ + log_msg(LOG_ERR, "cannot open control interface %s %d : " + "protocol not supported", ip, nr); + return 0; + } + if(fd == -1) { + log_msg(LOG_ERR, "cannot open control interface %s %d", ip, nr); + return 0; + } + + /* alloc */ + hl = (struct acceptlist*)xalloc_zero(sizeof(*hl)); + hl->next = rc->accept_list; + rc->accept_list = hl; + + hl->c.ev_fd = fd; + hl->event_added = 0; + return 1; +} + +int +daemon_remote_open_ports(struct daemon_remote* rc, nsd_options_t* cfg) +{ + assert(cfg->control_enable && cfg->control_port); + if(cfg->control_interface) { + ip_address_option_t* p; + for(p = cfg->control_interface; p; p = p->next) { + if(!add_open(rc, p->address, cfg->control_port, 1)) { + return 0; + } + } + } else { + /* defaults */ + if(cfg->do_ip6 && !add_open(rc, "::1", cfg->control_port, 0)) { + return 0; + } + if(cfg->do_ip4 && + !add_open(rc, "127.0.0.1", cfg->control_port, 1)) { + return 0; + } + } + return 1; +} + +void +daemon_remote_attach(struct daemon_remote* rc, struct xfrd_state* xfrd) +{ + int fd; + struct acceptlist* p; + if(!rc) return; + rc->xfrd = xfrd; + for(p = rc->accept_list; p; p = p->next) { + /* add event */ + fd = p->c.ev_fd; + event_set(&p->c, fd, EV_PERSIST|EV_READ, remote_accept_callback, + rc); + if(event_base_set(xfrd->event_base, &p->c) != 0) + log_msg(LOG_ERR, "remote: cannot set event_base"); + if(event_add(&p->c, NULL) != 0) + log_msg(LOG_ERR, "remote: cannot add event"); + p->event_added = 1; + } +} + +static void +remote_accept_callback(int fd, short event, void* arg) +{ + struct daemon_remote *rc = (struct daemon_remote*)arg; + struct sockaddr_storage addr; + socklen_t addrlen; + int newfd; + struct rc_state* n; + + if (!(event & EV_READ)) { + return; + } + + /* perform the accept */ + addrlen = sizeof(addr); + newfd = accept(fd, (struct sockaddr*)&addr, &addrlen); + if(newfd == -1) { + if ( errno != EINTR + && errno != EWOULDBLOCK +#ifdef ECONNABORTED + && errno != ECONNABORTED +#endif /* ECONNABORTED */ +#ifdef EPROTO + && errno != EPROTO +#endif /* EPROTO */ + ) { + log_msg(LOG_ERR, "accept failed: %s", strerror(errno)); + } + return; + } + + /* create new commpoint unless we are servicing already */ + if(rc->active >= rc->max_active) { + log_msg(LOG_WARNING, "drop incoming remote control: " + "too many connections"); + close_exit: + close(newfd); + return; + } + if (fcntl(newfd, F_SETFL, O_NONBLOCK) == -1) { + log_msg(LOG_ERR, "fcntl failed: %s", strerror(errno)); + goto close_exit; + } + + /* setup state to service the remote control command */ + n = (struct rc_state*)calloc(1, sizeof(*n)); + if(!n) { + log_msg(LOG_ERR, "out of memory"); + goto close_exit; + } + + n->tval.tv_sec = 
REMOTE_CONTROL_TCP_TIMEOUT; + n->tval.tv_usec = 0L; + + event_set(&n->c, newfd, EV_PERSIST|EV_TIMEOUT|EV_READ, + remote_control_callback, n); + if(event_base_set(xfrd->event_base, &n->c) != 0) + log_msg(LOG_ERR, "remote_accept: cannot set event_base"); + if(event_add(&n->c, &n->tval) != 0) + log_msg(LOG_ERR, "remote_accept: cannot add event"); + n->event_added = 1; + + if(2 <= verbosity) { + char s[128]; + addr2str(&addr, s, sizeof(s)); + VERBOSITY(2, (LOG_INFO, "new control connection from %s", s)); + } + + n->shake_state = rc_hs_read; + n->ssl = SSL_new(rc->ctx); + if(!n->ssl) { + log_crypto_err("could not SSL_new"); + event_del(&n->c); + free(n); + goto close_exit; + } + SSL_set_accept_state(n->ssl); + (void)SSL_set_mode(n->ssl, SSL_MODE_AUTO_RETRY); + if(!SSL_set_fd(n->ssl, newfd)) { + log_crypto_err("could not SSL_set_fd"); + event_del(&n->c); + SSL_free(n->ssl); + free(n); + goto close_exit; + } + + n->rc = rc; + n->stats_next = NULL; + n->in_stats_list = 0; + n->prev = NULL; + n->next = rc->busy_list; + if(n->next) n->next->prev = n; + rc->busy_list = n; + rc->active ++; + + /* perform the first nonblocking read already, for windows, + * so it can return wouldblock. could be faster too. */ + remote_control_callback(newfd, EV_READ, n); +} + +/** delete from list */ +static void +state_list_remove_elem(struct rc_state** list, struct rc_state* todel) +{ + if(todel->prev) todel->prev->next = todel->next; + else *list = todel->next; + if(todel->next) todel->next->prev = todel->prev; +} + +/** delete from stats list */ +static void +stats_list_remove_elem(struct rc_state** list, struct rc_state* todel) +{ + while(*list) { + if( (*list) == todel) { + *list = (*list)->stats_next; + return; + } + list = &(*list)->stats_next; + } +} + +/** decrease active count and remove commpoint from busy list */ +static void +clean_point(struct daemon_remote* rc, struct rc_state* s) +{ + if(s->in_stats_list) + stats_list_remove_elem(&rc->stats_list, s); + state_list_remove_elem(&rc->busy_list, s); + rc->active --; + if(s->event_added) + event_del(&s->c); + if(s->ssl) { + SSL_shutdown(s->ssl); + SSL_free(s->ssl); + } + close(s->c.ev_fd); + free(s); +} + +static int +ssl_print_text(SSL* ssl, const char* text) +{ + int r; + if(!ssl) + return 0; + ERR_clear_error(); + if((r=SSL_write(ssl, text, (int)strlen(text))) <= 0) { + if(SSL_get_error(ssl, r) == SSL_ERROR_ZERO_RETURN) { + VERBOSITY(2, (LOG_WARNING, "in SSL_write, peer " + "closed connection")); + return 0; + } + log_crypto_err("could not SSL_write"); + return 0; + } + return 1; +} + +/** print text over the ssl connection */ +static int +ssl_print_vmsg(SSL* ssl, const char* format, va_list args) +{ + char msg[1024]; + vsnprintf(msg, sizeof(msg), format, args); + return ssl_print_text(ssl, msg); +} + +/** printf style printing to the ssl connection */ +static int +ssl_printf(SSL* ssl, const char* format, ...) 
+{ + va_list args; + int ret; + va_start(args, format); + ret = ssl_print_vmsg(ssl, format, args); + va_end(args); + return ret; +} + +static int +ssl_read_line(SSL* ssl, char* buf, size_t max) +{ + int r; + size_t len = 0; + if(!ssl) + return 0; + while(len < max) { + ERR_clear_error(); + if((r=SSL_read(ssl, buf+len, 1)) <= 0) { + if(SSL_get_error(ssl, r) == SSL_ERROR_ZERO_RETURN) { + buf[len] = 0; + return 1; + } + log_crypto_err("could not SSL_read"); + return 0; + } + if(buf[len] == '\n') { + /* return string without \n */ + buf[len] = 0; + return 1; + } + len++; + } + buf[max-1] = 0; + log_msg(LOG_ERR, "control line too long (%d): %s", (int)max, buf); + return 0; +} + +/** skip whitespace, return new pointer into string */ +static char* +skipwhite(char* str) +{ + /* EOS \0 is not a space */ + while( isspace(*str) ) + str++; + return str; +} + +/** send the OK to the control client */ +static void +send_ok(SSL* ssl) +{ + (void)ssl_printf(ssl, "ok\n"); +} + +/** get zone argument (if any) or NULL, false on error */ +static int +get_zone_arg(SSL* ssl, xfrd_state_t* xfrd, char* arg, + zone_options_t** zo) +{ + const dname_type* dname; + if(!arg[0]) { + /* no argument present, return NULL */ + *zo = NULL; + return 1; + } + dname = dname_parse(xfrd->region, arg); + if(!dname) { + ssl_printf(ssl, "error cannot parse zone name '%s'\n", arg); + *zo = NULL; + return 0; + } + *zo = zone_options_find(xfrd->nsd->options, dname); + region_recycle(xfrd->region, (void*)dname, dname_total_size(dname)); + if(!*zo) { + ssl_printf(ssl, "error zone %s not configured\n", arg); + return 0; + } + return 1; +} + +/** do the stop command */ +static void +do_stop(SSL* ssl, xfrd_state_t* xfrd) +{ + xfrd->need_to_send_shutdown = 1; + + if(!(xfrd->ipc_handler_flags&EV_WRITE)) { + ipc_xfrd_set_listening(xfrd, EV_PERSIST|EV_READ|EV_WRITE); + } + + send_ok(ssl); +} + +/** do the log_reopen command, it only needs reload_now */ +static void +do_log_reopen(SSL* ssl, xfrd_state_t* xfrd) +{ + xfrd_set_reload_now(xfrd); + send_ok(ssl); +} + +/** do the reload command */ +static void +do_reload(SSL* ssl, xfrd_state_t* xfrd, char* arg) +{ + zone_options_t* zo; + if(!get_zone_arg(ssl, xfrd, arg, &zo)) + return; + task_new_check_zonefiles(xfrd->nsd->task[xfrd->nsd->mytask], + xfrd->last_task, zo?(const dname_type*)zo->node.key:NULL); + xfrd_set_reload_now(xfrd); + send_ok(ssl); +} + +/** do the write command */ +static void +do_write(SSL* ssl, xfrd_state_t* xfrd, char* arg) +{ + zone_options_t* zo; + if(!get_zone_arg(ssl, xfrd, arg, &zo)) + return; + task_new_write_zonefiles(xfrd->nsd->task[xfrd->nsd->mytask], + xfrd->last_task, zo?(const dname_type*)zo->node.key:NULL); + xfrd_set_reload_now(xfrd); + send_ok(ssl); +} + +/** do the notify command */ +static void +do_notify(SSL* ssl, xfrd_state_t* xfrd, char* arg) +{ + zone_options_t* zo; + if(!get_zone_arg(ssl, xfrd, arg, &zo)) + return; + if(zo) { + struct notify_zone_t* n = (struct notify_zone_t*)rbtree_search( + xfrd->notify_zones, (const dname_type*)zo->node.key); + if(n) { + xfrd_notify_start(n); + send_ok(ssl); + } else { + ssl_printf(ssl, "error zone does not have notify\n"); + } + } else { + struct notify_zone_t* n; + RBTREE_FOR(n, struct notify_zone_t*, xfrd->notify_zones) { + xfrd_notify_start(n); + } + send_ok(ssl); + } +} + +/** do the transfer command */ +static void +do_transfer(SSL* ssl, xfrd_state_t* xfrd, char* arg) +{ + zone_options_t* zo; + xfrd_zone_t* zone; + if(!get_zone_arg(ssl, xfrd, arg, &zo)) + return; + if(zo) { + zone = 
(xfrd_zone_t*)rbtree_search(xfrd->zones, (const + dname_type*)zo->node.key); + if(zone) { + xfrd_handle_notify_and_start_xfr(zone, NULL); + send_ok(ssl); + } else { + ssl_printf(ssl, "error zone not slave\n"); + } + } else { + RBTREE_FOR(zone, xfrd_zone_t*, xfrd->zones) { + xfrd_handle_notify_and_start_xfr(zone, NULL); + } + ssl_printf(ssl, "ok, %u zones\n", (unsigned)xfrd->zones->count); + } +} + +/** force transfer a zone */ +static void +force_transfer_zone(xfrd_zone_t* zone) +{ + /* if in TCP transaction, stop it immediately. */ + if(zone->tcp_conn != -1) + xfrd_tcp_release(xfrd->tcp_set, zone); + else if(zone->zone_handler.ev_fd != -1) + xfrd_udp_release(zone); + /* pretend we not longer have it and force any + * zone to be downloaded (even same serial, w AXFR) */ + zone->soa_disk_acquired = 0; + xfrd_handle_notify_and_start_xfr(zone, NULL); +} + +/** do the force transfer command */ +static void +do_force_transfer(SSL* ssl, xfrd_state_t* xfrd, char* arg) +{ + zone_options_t* zo; + xfrd_zone_t* zone; + if(!get_zone_arg(ssl, xfrd, arg, &zo)) + return; + if(zo) { + zone = (xfrd_zone_t*)rbtree_search(xfrd->zones, (const + dname_type*)zo->node.key); + if(zone) { + force_transfer_zone(zone); + send_ok(ssl); + } else { + ssl_printf(ssl, "error zone not slave\n"); + } + } else { + RBTREE_FOR(zone, xfrd_zone_t*, xfrd->zones) { + force_transfer_zone(zone); + } + ssl_printf(ssl, "ok, %u zones\n", (unsigned)xfrd->zones->count); + } +} + +static int +print_soa_status(SSL* ssl, const char* str, xfrd_soa_t* soa, time_t acq) +{ + if(acq) { + if(!ssl_printf(ssl, " %s: \"%u since %s\"\n", str, + (unsigned)ntohl(soa->serial), xfrd_pretty_time(acq))) + return 0; + } else { + if(!ssl_printf(ssl, " %s: none\n", str)) + return 0; + } + return 1; +} + +/** print zonestatus for one domain */ +static int +print_zonestatus(SSL* ssl, xfrd_state_t* xfrd, zone_options_t* zo) +{ + xfrd_zone_t* xz = (xfrd_zone_t*)rbtree_search(xfrd->zones, + (const dname_type*)zo->node.key); + struct notify_zone_t* nz = (struct notify_zone_t*)rbtree_search( + xfrd->notify_zones, (const dname_type*)zo->node.key); + if(!ssl_printf(ssl, "zone: %s\n", zo->name)) + return 0; + if(!zo->part_of_config) { + if(!ssl_printf(ssl, " pattern: %s\n", zo->pattern->pname)) + return 0; + } + if(nz) { + if(nz->is_waiting) { + if(!ssl_printf(ssl, " notify: \"waiting-for-fd\"\n")) + return 0; + } else if(nz->notify_send_enable) { + if(!ssl_printf(ssl, " notify: \"sent try %d " + "to %s with serial %u\"\n", nz->notify_retry, + nz->notify_current->ip_address_spec, + (unsigned)ntohl(nz->current_soa->serial))) + return 0; + } + } + if(!xz) { + if(!ssl_printf(ssl, " state: master\n")) + return 0; + return 1; + } + if(!ssl_printf(ssl, " state: %s\n", + (xz->state == xfrd_zone_ok)?"ok":( + (xz->state == xfrd_zone_expired)?"expired":"refreshing"))) + return 0; + if(!print_soa_status(ssl, "served-serial", &xz->soa_nsd, + xz->soa_nsd_acquired)) + return 0; + if(!print_soa_status(ssl, "commit-serial", &xz->soa_disk, + xz->soa_disk_acquired)) + return 0; + if(xz->round_num != -1) { + if(!print_soa_status(ssl, "notified-serial", &xz->soa_notified, + xz->soa_notified_acquired)) + return 0; + } + + /* UDP */ + if(xz->udp_waiting) { + if(!ssl_printf(ssl, " transfer: \"waiting-for-UDP-fd\"\n")) + return 0; + } else if(xz->zone_handler.ev_fd != -1 && xz->tcp_conn == -1) { + if(!ssl_printf(ssl, " transfer: \"sent UDP to %s\"\n", + xz->master->ip_address_spec)) + return 0; + } + + /* TCP */ + if(xz->tcp_waiting) { + if(!ssl_printf(ssl, " transfer: 
\"waiting-for-TCP-fd\"\n")) + return 0; + } else if(xz->tcp_conn != -1) { + if(!ssl_printf(ssl, " transfer: \"TCP connected to %s\"\n", + xz->master->ip_address_spec)) + return 0; + } + + return 1; +} + +/** do the zonestatus command */ +static void +do_zonestatus(SSL* ssl, xfrd_state_t* xfrd, char* arg) +{ + zone_options_t* zo; + if(!get_zone_arg(ssl, xfrd, arg, &zo)) + return; + if(zo) (void)print_zonestatus(ssl, xfrd, zo); + else { + RBTREE_FOR(zo, zone_options_t*, + xfrd->nsd->options->zone_options) { + if(!print_zonestatus(ssl, xfrd, zo)) + return; + } + } +} + +/** do the verbosity command */ +static void +do_verbosity(SSL* ssl, char* str) +{ + int val = atoi(str); + if(strcmp(str, "") == 0) { + ssl_printf(ssl, "verbosity %d\n", verbosity); + return; + } + if(val == 0 && strcmp(str, "0") != 0) { + ssl_printf(ssl, "error in verbosity number syntax: %s\n", str); + return; + } + verbosity = val; + task_new_set_verbosity(xfrd->nsd->task[xfrd->nsd->mytask], + xfrd->last_task, val); + xfrd_set_reload_now(xfrd); + send_ok(ssl); +} + +/** find second argument, modifies string */ +static int +find_arg2(SSL* ssl, char* arg, char** arg2) +{ + char* as = strrchr(arg, ' '); + if(as) { + as[0]=0; + *arg2 = as+1; + while(isspace(*as) && as > arg) + as--; + as[0]=0; + return 1; + } + ssl_printf(ssl, "error could not find next argument " + "after %s\n", arg); + return 0; +} + +/** do the status command */ +static void +do_status(SSL* ssl, xfrd_state_t* xfrd) +{ + if(!ssl_printf(ssl, "version: %s\n", PACKAGE_VERSION)) + return; + if(!ssl_printf(ssl, "verbosity: %d\n", verbosity)) + return; +#ifdef RATELIMIT + if(!ssl_printf(ssl, "ratelimit: %d\n", + (int)xfrd->nsd->options->rrl_ratelimit)) + return; +#else + (void)xfrd; +#endif +} + +/** do the stats command */ +static void +do_stats(struct daemon_remote* rc, int peek, struct rc_state* rs) +{ +#ifdef BIND8_STATS + /* queue up to get stats after a reload is done (to gather statistics + * from the servers) */ + assert(!rs->in_stats_list); + if(peek) rs->in_stats_list = 2; + else rs->in_stats_list = 1; + rs->stats_next = rc->stats_list; + rc->stats_list = rs; + /* block the tcp waiting for the reload */ + event_del(&rs->c); + rs->event_added = 0; + /* force a reload */ + xfrd_set_reload_now(xfrd); +#else + (void)rc; (void)peek; + (void)ssl_printf(rs->ssl, "error no stats enabled at compile time\n"); +#endif /* BIND8_STATS */ +} + +/** do the addzone command */ +static void +do_addzone(SSL* ssl, xfrd_state_t* xfrd, char* arg) +{ + zone_options_t* zopt; + char* arg2 = NULL; + if(!find_arg2(ssl, arg, &arg2)) + return; + + /* if we add it to the xfrd now, then xfrd could download AXFR and + * store it and the NSD-reload would see it in the difffile before + * it sees the add-config task. + */ + /* thus: AXFRs and IXFRs must store the pattern name in the + * difffile, so that it can be added when the AXFR or IXFR is seen. 
+ */ + + /* check that the pattern exists */ + if(!rbtree_search(xfrd->nsd->options->patterns, arg2)) { + (void)ssl_printf(ssl, "error pattern does not exist\n"); + return; + } + + /* add to zonelist and adds to config in memory */ + zopt = zone_list_add(xfrd->nsd->options, arg, arg2); + if(!zopt) { + /* also dname parse error here */ + (void)ssl_printf(ssl, "error could not add zonelist entry\n"); + return; + } + /* make addzone task and schedule reload */ + task_new_add_zone(xfrd->nsd->task[xfrd->nsd->mytask], + xfrd->last_task, arg, arg2); + xfrd_set_reload_now(xfrd); + /* add to xfrd - notify (for master and slaves) */ + init_notify_send(xfrd->notify_zones, xfrd->region, zopt); + /* add to xfrd - slave */ + if(zone_is_slave(zopt)) { + xfrd_init_slave_zone(xfrd, zopt); + } + + send_ok(ssl); +} + +/** do the delzone command */ +static void +do_delzone(SSL* ssl, xfrd_state_t* xfrd, char* arg) +{ + const dname_type* dname; + zone_options_t* zopt; + + dname = dname_parse(xfrd->region, arg); + if(!dname) { + (void)ssl_printf(ssl, "error cannot parse zone name\n"); + return; + } + + /* see if we have the zone in question */ + zopt = zone_options_find(xfrd->nsd->options, dname); + if(!zopt) { + region_recycle(xfrd->region, (void*)dname, + dname_total_size(dname)); + /* nothing to do */ + if(!ssl_printf(ssl, "warning zone %s not present\n", arg)) + return; + send_ok(ssl); + return; + } + + /* see if it can be deleted */ + if(zopt->part_of_config) { + region_recycle(xfrd->region, (void*)dname, + dname_total_size(dname)); + (void)ssl_printf(ssl, "error zone defined in nsd.conf, " + "cannot delete it in this manner: remove it from " + "nsd.conf yourself and repattern\n"); + return; + } + + /* create deletion task */ + task_new_del_zone(xfrd->nsd->task[xfrd->nsd->mytask], + xfrd->last_task, dname); + xfrd_set_reload_now(xfrd); + /* delete it in xfrd */ + if(zone_is_slave(zopt)) { + xfrd_del_slave_zone(xfrd, dname); + } + xfrd_del_notify(xfrd, dname); + /* delete from config */ + zone_list_del(xfrd->nsd->options, zopt); + + region_recycle(xfrd->region, (void*)dname, dname_total_size(dname)); + send_ok(ssl); +} + +/** remove TSIG key from config and add task so that reload does too */ +static void remove_key(xfrd_state_t* xfrd, const char* kname) +{ + /* add task before deletion because the name string could be deleted */ + task_new_del_key(xfrd->nsd->task[xfrd->nsd->mytask], xfrd->last_task, + kname); + key_options_remove(xfrd->nsd->options, kname); + xfrd_set_reload_now(xfrd); /* this is executed when the current control + command ends, thus the entire config changes are bunched up */ +} + +/** add TSIG key to config and add task so that reload does too */ +static void add_key(xfrd_state_t* xfrd, key_options_t* k) +{ + key_options_add_modify(xfrd->nsd->options, k); + task_new_add_key(xfrd->nsd->task[xfrd->nsd->mytask], xfrd->last_task, + k); + xfrd_set_reload_now(xfrd); +} + +/** check if keys have changed */ +static void repat_keys(xfrd_state_t* xfrd, nsd_options_t* newopt) +{ + nsd_options_t* oldopt = xfrd->nsd->options; + key_options_t* k; + /* find deleted keys */ + k = (key_options_t*)rbtree_first(oldopt->keys); + while((rbnode_t*)k != RBTREE_NULL) { + key_options_t* next = (key_options_t*)rbtree_next( + (rbnode_t*)k); + if(!key_options_find(newopt, k->name)) + remove_key(xfrd, k->name); + k = next; + } + /* find added or changed keys */ + RBTREE_FOR(k, key_options_t*, newopt->keys) { + key_options_t* origk = key_options_find(oldopt, k->name); + if(!origk) + add_key(xfrd, k); + else 
if(!key_options_equal(k, origk)) + add_key(xfrd, k); + } +} + +/** find zone given the implicit pattern */ +static const dname_type* +parse_implicit_name(xfrd_state_t* xfrd,const char* pname) +{ + if(strncmp(pname, PATTERN_IMPLICIT_MARKER, + strlen(PATTERN_IMPLICIT_MARKER)) != 0) + return NULL; + return dname_parse(xfrd->region, pname + + strlen(PATTERN_IMPLICIT_MARKER)); +} + +/** remove cfgzone and add task so that reload does too */ +static void +remove_cfgzone(xfrd_state_t* xfrd, const char* pname) +{ + /* dname and find the zone for the implicit pattern */ + zone_options_t* zopt = NULL; + const dname_type* dname = parse_implicit_name(xfrd, pname); + if(!dname) { + /* should have a parseable name, but it did not */ + return; + } + + /* find the zone entry for the implicit pattern */ + zopt = zone_options_find(xfrd->nsd->options, dname); + if(!zopt) { + /* this should not happen; implicit pattern has zone entry */ + region_recycle(xfrd->region, (void*)dname, + dname_total_size(dname)); + return; + } + + /* create deletion task */ + task_new_del_zone(xfrd->nsd->task[xfrd->nsd->mytask], + xfrd->last_task, dname); + xfrd_set_reload_now(xfrd); + /* delete it in xfrd */ + if(zone_is_slave(zopt)) { + xfrd_del_slave_zone(xfrd, dname); + } + xfrd_del_notify(xfrd, dname); + + /* delete from zoneoptions */ + zone_options_delete(xfrd->nsd->options, zopt); + + /* recycle parsed dname */ + region_recycle(xfrd->region, (void*)dname, dname_total_size(dname)); +} + +/** add cfgzone and add task so that reload does too */ +static void +add_cfgzone(xfrd_state_t* xfrd, const char* pname) +{ + /* add to our zonelist */ + zone_options_t* zopt = zone_options_create(xfrd->nsd->options->region); + if(!zopt) + return; + zopt->part_of_config = 1; + zopt->name = region_strdup(xfrd->nsd->options->region, + pname + strlen(PATTERN_IMPLICIT_MARKER)); + zopt->pattern = pattern_options_find(xfrd->nsd->options, pname); + if(!zopt->name || !zopt->pattern) + return; + if(!nsd_options_insert_zone(xfrd->nsd->options, zopt)) { + log_msg(LOG_ERR, "bad domain name or duplicate zone '%s' " + "pattern %s", zopt->name, pname); + } + + /* make addzone task and schedule reload */ + task_new_add_zone(xfrd->nsd->task[xfrd->nsd->mytask], + xfrd->last_task, zopt->name, pname); + xfrd_set_reload_now(xfrd); + /* add to xfrd - notify (for master and slaves) */ + init_notify_send(xfrd->notify_zones, xfrd->region, zopt); + /* add to xfrd - slave */ + if(zone_is_slave(zopt)) { + xfrd_init_slave_zone(xfrd, zopt); + } +} + +/** remove pattern and add task so that reload does too */ +static void +remove_pat(xfrd_state_t* xfrd, const char* name) +{ + /* add task before deletion, because name-string could be deleted */ + task_new_del_pattern(xfrd->nsd->task[xfrd->nsd->mytask], + xfrd->last_task, name); + pattern_options_remove(xfrd->nsd->options, name); + xfrd_set_reload_now(xfrd); +} + +/** add pattern and add task so that reload does too */ +static void +add_pat(xfrd_state_t* xfrd, pattern_options_t* p) +{ + pattern_options_add_modify(xfrd->nsd->options, p); + task_new_add_pattern(xfrd->nsd->task[xfrd->nsd->mytask], + xfrd->last_task, p); + xfrd_set_reload_now(xfrd); +} + +/** interrupt zones that are using changed or removed patterns */ +static void +repat_interrupt_zones(xfrd_state_t* xfrd, nsd_options_t* newopt) +{ + /* if masterlist changed: + * interrupt slave zone (UDP or TCP) transfers. + * slave zones reset master to start of list. 
+ */ + xfrd_zone_t* xz; + struct notify_zone_t* nz; + RBTREE_FOR(xz, xfrd_zone_t*, xfrd->zones) { + pattern_options_t* oldp = xz->zone_options->pattern; + pattern_options_t* newp = pattern_options_find(newopt, + oldp->pname); + if(!newp || !acl_list_equal(oldp->request_xfr, + newp->request_xfr)) { + /* interrupt transfer */ + if(xz->tcp_conn != -1) { + xfrd_tcp_release(xfrd->tcp_set, xz); + xfrd_set_refresh_now(xz); + } else if(xz->zone_handler.ev_fd != -1) { + xfrd_udp_release(xz); + xfrd_set_refresh_now(xz); + } + xz->master = 0; + xz->master_num = 0; + xz->next_master = -1; + xz->round_num = 0; /* fresh set of retries */ + } + } + /* if notify list changed: + * interrupt notify that is busy. + * reset notify to start of list. (clear all other reset_notify) + */ + RBTREE_FOR(nz, struct notify_zone_t*, xfrd->notify_zones) { + pattern_options_t* oldp = nz->options->pattern; + pattern_options_t* newp = pattern_options_find(newopt, + oldp->pname); + if(!newp || !acl_list_equal(oldp->notify, newp->notify)) { + /* interrupt notify */ + if(nz->notify_send_enable) { + notify_disable(nz); + /* set to restart the notify after the + * pattern has been changed. */ + nz->notify_restart = 2; + } else { + nz->notify_restart = 1; + } + } else { + nz->notify_restart = 0; + } + } +} + +/** for notify, after the pattern changes, restart the affected notifies */ +static void +repat_interrupt_notify_start(xfrd_state_t* xfrd) +{ + struct notify_zone_t* nz; + RBTREE_FOR(nz, struct notify_zone_t*, xfrd->notify_zones) { + if(nz->notify_restart) { + if(nz->notify_current) + nz->notify_current = nz->options->pattern->notify; + if(nz->notify_restart == 2) { + if(nz->notify_restart) + xfrd_notify_start(nz); + } + } + } +} + +/** check if patterns have changed */ +static void +repat_patterns(xfrd_state_t* xfrd, nsd_options_t* newopt) +{ + /* zones that use changed patterns must have: + * - their AXFR/IXFR interrupted: try again, acl may have changed. + * if the old master/key still exists, OK, fix master-numptrs and + * keep going. Otherwise, stop xfer and reset TSIG. + * - send NOTIFY reset to start of NOTIFY list (and TSIG reset). 
+ */ + nsd_options_t* oldopt = xfrd->nsd->options; + pattern_options_t* p; + int search_zones = 0; + + repat_interrupt_zones(xfrd, newopt); + /* find deleted patterns */ + p = (pattern_options_t*)rbtree_first(oldopt->patterns); + while((rbnode_t*)p != RBTREE_NULL) { + pattern_options_t* next = (pattern_options_t*)rbtree_next( + (rbnode_t*)p); + if(!pattern_options_find(newopt, p->pname)) { + if(p->implicit) { + /* first remove its zone */ + VERBOSITY(1, (LOG_INFO, "zone removed from config: %s", p->pname + strlen(PATTERN_IMPLICIT_MARKER))); + remove_cfgzone(xfrd, p->pname); + } + remove_pat(xfrd, p->pname); + } + p = next; + } + /* find added or changed patterns */ + RBTREE_FOR(p, pattern_options_t*, newopt->patterns) { + pattern_options_t* origp = pattern_options_find(oldopt, + p->pname); + if(!origp) { + /* no zones can use it, no zone_interrupt needed */ + add_pat(xfrd, p); + if(p->implicit) { + VERBOSITY(1, (LOG_INFO, "zone added to config: %s", p->pname + strlen(PATTERN_IMPLICIT_MARKER))); + add_cfgzone(xfrd, p->pname); + } + } else if(!pattern_options_equal(p, origp)) { + uint8_t newstate = 0; + if (p->request_xfr && !origp->request_xfr) { + newstate = REPAT_SLAVE; + } else if (!p->request_xfr && origp->request_xfr) { + newstate = REPAT_MASTER; + } + add_pat(xfrd, p); + if (p->implicit && newstate) { + const dname_type* dname = + parse_implicit_name(xfrd, p->pname); + if (dname) { + if (newstate == REPAT_SLAVE) { + zone_options_t* zopt = + zone_options_find( + oldopt, dname); + if (zopt) { + xfrd_init_slave_zone( + xfrd, zopt); + } + } else if (newstate == REPAT_MASTER) { + xfrd_del_slave_zone(xfrd, + dname); + } + region_recycle(xfrd->region, + (void*)dname, + dname_total_size(dname)); + } + } else if(!p->implicit && newstate) { + /* search all zones with this pattern */ + search_zones = 1; + origp->xfrd_flags = newstate; + } + } + } + if (search_zones) { + zone_options_t* zone_opt; + /* search in oldopt because 1) it contains zonelist zones, + * and 2) you need oldopt(existing) to call xfrd_init */ + RBTREE_FOR(zone_opt, zone_options_t*, oldopt->zone_options) { + pattern_options_t* oldp = zone_opt->pattern; + if (!oldp->implicit) { + if (oldp->xfrd_flags == REPAT_SLAVE) { + /* xfrd needs stable reference so get + * it from the oldopt(modified) tree */ + xfrd_init_slave_zone(xfrd, zone_opt); + } else if (oldp->xfrd_flags == REPAT_MASTER) { + xfrd_del_slave_zone(xfrd, + (const dname_type*) + zone_opt->node.key); + } + oldp->xfrd_flags = 0; + } + } + } + repat_interrupt_notify_start(xfrd); +} + +/** true if options are different that can be set via repat. 
*/ +static int +repat_options_changed(xfrd_state_t* xfrd, nsd_options_t* newopt) +{ +#ifdef RATELIMIT + if(xfrd->nsd->options->rrl_ratelimit != newopt->rrl_ratelimit) + return 1; + if(xfrd->nsd->options->rrl_whitelist_ratelimit != newopt->rrl_whitelist_ratelimit) + return 1; + if(xfrd->nsd->options->rrl_slip != newopt->rrl_slip) + return 1; +#else + (void)xfrd; (void)newopt; +#endif + return 0; +} + +/** check if global options have changed */ +static void +repat_options(xfrd_state_t* xfrd, nsd_options_t* newopt) +{ + if(repat_options_changed(xfrd, newopt)) { + /* update our options */ +#ifdef RATELIMIT + xfrd->nsd->options->rrl_ratelimit = newopt->rrl_ratelimit; + xfrd->nsd->options->rrl_whitelist_ratelimit = newopt->rrl_whitelist_ratelimit; + xfrd->nsd->options->rrl_slip = newopt->rrl_slip; +#endif + task_new_opt_change(xfrd->nsd->task[xfrd->nsd->mytask], + xfrd->last_task, newopt); + xfrd_set_reload_now(xfrd); + } +} + +/** print errors over ssl, gets pointer-to-pointer to ssl, so it can set + * the pointer to NULL on failure and stop printing */ +static void +print_ssl_cfg_err(void* arg, const char* str) +{ + SSL** ssl = (SSL**)arg; + if(!*ssl) return; + if(!ssl_printf(*ssl, "%s", str)) + *ssl = NULL; /* failed, stop printing */ +} + +/** do the repattern command: reread config file and apply keys, patterns */ +static void +do_repattern(SSL* ssl, xfrd_state_t* xfrd) +{ + region_type* region = region_create(xalloc, free); + nsd_options_t* opt; + const char* cfgfile = xfrd->nsd->options->configfile; + + /* check chroot and configfile, if possible to reread */ + if(xfrd->nsd->chrootdir) { + size_t l = strlen(xfrd->nsd->chrootdir); + while(l>0 && xfrd->nsd->chrootdir[l-1] == '/') + --l; + if(strncmp(xfrd->nsd->chrootdir, cfgfile, l) != 0) { + ssl_printf(ssl, "error %s is not relative to %s: " + "chroot prevents reread of config\n", + cfgfile, xfrd->nsd->chrootdir); + region_destroy(region); + return; + } + cfgfile += l; + } + + ssl_printf(ssl, "reconfig start, read %s\n", cfgfile); + opt = nsd_options_create(region); + if(!parse_options_file(opt, cfgfile, &print_ssl_cfg_err, &ssl)) { + /* error already printed */ + region_destroy(region); + return; + } + /* check for differences in TSIG keys and patterns, and apply, + * first the keys, so that pattern->keyptr can be set right. 
*/ + repat_keys(xfrd, opt); + repat_patterns(xfrd, opt); + repat_options(xfrd, opt); + send_ok(ssl); + region_destroy(region); +} + +/** do the serverpid command: printout pid of server process */ +static void +do_serverpid(SSL* ssl, xfrd_state_t* xfrd) +{ + (void)ssl_printf(ssl, "%u\n", (unsigned)xfrd->reload_pid); +} + +/** check for name with end-of-string, space or tab after it */ +static int +cmdcmp(char* p, const char* cmd, size_t len) +{ + return strncmp(p,cmd,len)==0 && (p[len]==0||p[len]==' '||p[len]=='\t'); +} + +/** execute a remote control command */ +static void +execute_cmd(struct daemon_remote* rc, SSL* ssl, char* cmd, struct rc_state* rs) +{ + char* p = skipwhite(cmd); + /* compare command */ + if(cmdcmp(p, "stop", 4)) { + do_stop(ssl, rc->xfrd); + } else if(cmdcmp(p, "reload", 6)) { + do_reload(ssl, rc->xfrd, skipwhite(p+6)); + } else if(cmdcmp(p, "write", 5)) { + do_write(ssl, rc->xfrd, skipwhite(p+5)); + } else if(cmdcmp(p, "status", 6)) { + do_status(ssl, rc->xfrd); + } else if(cmdcmp(p, "stats_noreset", 13)) { + do_stats(rc, 1, rs); + } else if(cmdcmp(p, "stats", 5)) { + do_stats(rc, 0, rs); + } else if(cmdcmp(p, "log_reopen", 10)) { + do_log_reopen(ssl, rc->xfrd); + } else if(cmdcmp(p, "addzone", 7)) { + do_addzone(ssl, rc->xfrd, skipwhite(p+7)); + } else if(cmdcmp(p, "delzone", 7)) { + do_delzone(ssl, rc->xfrd, skipwhite(p+7)); + } else if(cmdcmp(p, "notify", 6)) { + do_notify(ssl, rc->xfrd, skipwhite(p+6)); + } else if(cmdcmp(p, "transfer", 8)) { + do_transfer(ssl, rc->xfrd, skipwhite(p+8)); + } else if(cmdcmp(p, "force_transfer", 14)) { + do_force_transfer(ssl, rc->xfrd, skipwhite(p+14)); + } else if(cmdcmp(p, "zonestatus", 10)) { + do_zonestatus(ssl, rc->xfrd, skipwhite(p+10)); + } else if(cmdcmp(p, "verbosity", 9)) { + do_verbosity(ssl, skipwhite(p+9)); + } else if(cmdcmp(p, "repattern", 9)) { + do_repattern(ssl, rc->xfrd); + } else if(cmdcmp(p, "reconfig", 8)) { + do_repattern(ssl, rc->xfrd); + } else if(cmdcmp(p, "serverpid", 9)) { + do_serverpid(ssl, rc->xfrd); + } else { + (void)ssl_printf(ssl, "error unknown command '%s'\n", p); + } +} + +/** handle remote control request */ +static void +handle_req(struct daemon_remote* rc, struct rc_state* s, SSL* ssl) +{ + int r; + char pre[10]; + char magic[8]; + char buf[1024]; + if (fcntl(s->c.ev_fd, F_SETFL, 0) == -1) { /* set blocking */ + log_msg(LOG_ERR, "cannot fcntl rc: %s", strerror(errno)); + } + + /* try to read magic UBCT[version]_space_ string */ + ERR_clear_error(); + if((r=SSL_read(ssl, magic, (int)sizeof(magic)-1)) <= 0) { + if(SSL_get_error(ssl, r) == SSL_ERROR_ZERO_RETURN) + return; + log_crypto_err("could not SSL_read"); + return; + } + magic[7] = 0; + if( r != 7 || strncmp(magic, "NSDCT", 5) != 0) { + VERBOSITY(2, (LOG_INFO, "control connection has bad header")); + /* probably wrong tool connected, ignore it completely */ + return; + } + + /* read the command line */ + if(!ssl_read_line(ssl, buf, sizeof(buf))) { + return; + } + snprintf(pre, sizeof(pre), "NSDCT%d ", NSD_CONTROL_VERSION); + if(strcmp(magic, pre) != 0) { + VERBOSITY(2, (LOG_INFO, "control connection had bad " + "version %s, cmd: %s", magic, buf)); + ssl_printf(ssl, "error version mismatch\n"); + return; + } + VERBOSITY(2, (LOG_INFO, "control cmd: %s", buf)); + + /* figure out what to do */ + execute_cmd(rc, ssl, buf, s); +} + +static void +remote_control_callback(int fd, short event, void* arg) +{ + struct rc_state* s = (struct rc_state*)arg; + struct daemon_remote* rc = s->rc; + int r; + if( (event&EV_TIMEOUT) ) { + log_msg(LOG_ERR, 
"remote control timed out"); + clean_point(rc, s); + return; + } + /* (continue to) setup the SSL connection */ + ERR_clear_error(); + r = SSL_do_handshake(s->ssl); + if(r != 1) { + int r2 = SSL_get_error(s->ssl, r); + if(r2 == SSL_ERROR_WANT_READ) { + if(s->shake_state == rc_hs_read) { + /* try again later */ + return; + } + s->shake_state = rc_hs_read; + event_del(&s->c); + event_set(&s->c, fd, EV_PERSIST|EV_TIMEOUT|EV_READ, + remote_control_callback, s); + if(event_base_set(xfrd->event_base, &s->c) != 0) + log_msg(LOG_ERR, "remote_accept: cannot set event_base"); + if(event_add(&s->c, &s->tval) != 0) + log_msg(LOG_ERR, "remote_accept: cannot add event"); + return; + } else if(r2 == SSL_ERROR_WANT_WRITE) { + if(s->shake_state == rc_hs_write) { + /* try again later */ + return; + } + s->shake_state = rc_hs_write; + event_del(&s->c); + event_set(&s->c, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE, + remote_control_callback, s); + if(event_base_set(xfrd->event_base, &s->c) != 0) + log_msg(LOG_ERR, "remote_accept: cannot set event_base"); + if(event_add(&s->c, &s->tval) != 0) + log_msg(LOG_ERR, "remote_accept: cannot add event"); + return; + } else { + if(r == 0) + log_msg(LOG_ERR, "remote control connection closed prematurely"); + log_crypto_err("remote control failed ssl"); + clean_point(rc, s); + return; + } + } + s->shake_state = rc_none; + + /* once handshake has completed, check authentication */ + if(SSL_get_verify_result(s->ssl) == X509_V_OK) { + X509* x = SSL_get_peer_certificate(s->ssl); + if(!x) { + VERBOSITY(2, (LOG_INFO, "remote control connection " + "provided no client certificate")); + clean_point(rc, s); + return; + } + VERBOSITY(3, (LOG_INFO, "remote control connection authenticated")); + X509_free(x); + } else { + VERBOSITY(2, (LOG_INFO, "remote control connection failed to " + "authenticate with client certificate")); + clean_point(rc, s); + return; + } + + /* if OK start to actually handle the request */ + handle_req(rc, s, s->ssl); + + if(!s->in_stats_list) { + VERBOSITY(3, (LOG_INFO, "remote control operation completed")); + clean_point(rc, s); + } +} + +#ifdef BIND8_STATS +static const char* +opcode2str(int o) +{ + switch(o) { + case OPCODE_QUERY: return "QUERY"; + case OPCODE_IQUERY: return "IQUERY"; + case OPCODE_STATUS: return "STATUS"; + case OPCODE_NOTIFY: return "NOTIFY"; + case OPCODE_UPDATE: return "UPDATE"; + default: return "OTHER"; + } +} + +/** print long number */ +static int +print_longnum(SSL* ssl, char* desc, uint64_t x) +{ + if(x > (uint64_t)1024*1024*1024) { + /* more than a Gb */ + size_t front = (size_t)(x / (uint64_t)1000000); + size_t back = (size_t)(x % (uint64_t)1000000); + return ssl_printf(ssl, "%s%u%6.6u\n", desc, + (unsigned)front, (unsigned)back); + } else { + return ssl_printf(ssl, "%s%u\n", desc, (unsigned)x); + } +} + +static void +print_stats(SSL* ssl, xfrd_state_t* xfrd, struct timeval* now) +{ + const char* rcstr[] = {"NOERROR", "FORMERR", "SERVFAIL", "NXDOMAIN", + "NOTIMP", "REFUSED", "YXDOMAIN", "YXRRSET", "NXRRSET", "NOTAUTH", + "NOTZONE", "RCODE11", "RCODE12", "RCODE13", "RCODE14", "RCODE15", + "BADVERS" + }; + size_t i; + stc_t total = 0; + struct timeval elapsed, uptime; + + /* per CPU and total */ + for(i=0; i<xfrd->nsd->child_count; i++) { + if(!ssl_printf(ssl, "server%d.queries=%u\n", (int)i, + (unsigned)xfrd->nsd->children[i].query_count)) + return; + total += xfrd->nsd->children[i].query_count; + } + if(!ssl_printf(ssl, "num.queries=%u\n", (unsigned)total)) + return; + + /* time elapsed and uptime (in seconds) */ + 
timeval_subtract(&uptime, now, &xfrd->nsd->rc->boot_time); + timeval_subtract(&elapsed, now, &xfrd->nsd->rc->stats_time); + if(!ssl_printf(ssl, "time.boot=%u.%6.6u\n", + (unsigned)uptime.tv_sec, (unsigned)uptime.tv_usec)) + return; + if(!ssl_printf(ssl, "time.elapsed=%u.%6.6u\n", + (unsigned)elapsed.tv_sec, (unsigned)elapsed.tv_usec)) + return; + + /* mem info, database on disksize */ + if(!print_longnum(ssl, "size.db.disk=", xfrd->nsd->st.db_disk)) + return; + if(!print_longnum(ssl, "size.db.mem=", xfrd->nsd->st.db_mem)) + return; + if(!print_longnum(ssl, "size.xfrd.mem=", region_get_mem(xfrd->region))) + return; + if(!print_longnum(ssl, "size.config.disk=", + xfrd->nsd->options->zonelist_off)) + return; + if(!print_longnum(ssl, "size.config.mem=", region_get_mem( + xfrd->nsd->options->region))) + return; + + for(i=0; i<= 255; i++) { + if(inhibit_zero && xfrd->nsd->st.qtype[i] == 0 && + strncmp(rrtype_to_string(i), "TYPE", 4) == 0) + continue; + if(!ssl_printf(ssl, "num.type.%s=%u\n", + rrtype_to_string(i), (unsigned)xfrd->nsd->st.qtype[i])) + return; + } + + /* opcode */ + for(i=0; i<6; i++) { + if(inhibit_zero && xfrd->nsd->st.opcode[i] == 0 && + i != OPCODE_QUERY) + continue; + if(!ssl_printf(ssl, "num.opcode.%s=%u\n", opcode2str(i), + (unsigned)xfrd->nsd->st.opcode[i])) + return; + } + + /* qclass */ + for(i=0; i<4; i++) { + if(inhibit_zero && xfrd->nsd->st.qclass[i] == 0 && + i != CLASS_IN) + continue; + if(!ssl_printf(ssl, "num.class.%s=%u\n", rrclass_to_string(i), + (unsigned)xfrd->nsd->st.qclass[i])) + return; + } + + /* rcode */ + for(i=0; i<17; i++) { + if(inhibit_zero && xfrd->nsd->st.rcode[i] == 0 && + i > RCODE_YXDOMAIN) /* NSD does not use larger */ + continue; + if(!ssl_printf(ssl, "num.rcode.%s=%u\n", rcstr[i], + (unsigned)xfrd->nsd->st.rcode[i])) + return; + } + + /* edns */ + if(!ssl_printf(ssl, "num.edns=%u\n", (unsigned)xfrd->nsd->st.edns)) + return; + + /* ednserr */ + if(!ssl_printf(ssl, "num.ednserr=%u\n", + (unsigned)xfrd->nsd->st.ednserr)) + return; + + /* qudp */ + if(!ssl_printf(ssl, "num.udp=%u\n", (unsigned)xfrd->nsd->st.qudp)) + return; + /* qudp6 */ + if(!ssl_printf(ssl, "num.udp6=%u\n", (unsigned)xfrd->nsd->st.qudp6)) + return; + /* ctcp */ + if(!ssl_printf(ssl, "num.tcp=%u\n", (unsigned)xfrd->nsd->st.ctcp)) + return; + /* ctcp6 */ + if(!ssl_printf(ssl, "num.tcp6=%u\n", (unsigned)xfrd->nsd->st.ctcp6)) + return; + + /* nona */ + if(!ssl_printf(ssl, "num.answer_wo_aa=%u\n", + (unsigned)xfrd->nsd->st.nona)) + return; + + /* rxerr */ + if(!ssl_printf(ssl, "num.rxerr=%u\n", (unsigned)xfrd->nsd->st.rxerr)) + return; + + /* txerr */ + if(!ssl_printf(ssl, "num.txerr=%u\n", (unsigned)xfrd->nsd->st.txerr)) + return; + + /* number of requested-axfr, number of times axfr served to clients */ + if(!ssl_printf(ssl, "num.raxfr=%u\n", (unsigned)xfrd->nsd->st.raxfr)) + return; + + /* truncated */ + if(!ssl_printf(ssl, "num.truncated=%u\n", + (unsigned)xfrd->nsd->st.truncated)) + return; + + /* dropped */ + if(!ssl_printf(ssl, "num.dropped=%u\n", + (unsigned)xfrd->nsd->st.dropped)) + return; + + /* zone statistics */ + if(!ssl_printf(ssl, "zone.master=%u\n", + (unsigned)(xfrd->notify_zones->count - xfrd->zones->count))) + return; + if(!ssl_printf(ssl, "zone.slave=%u\n", (unsigned)xfrd->zones->count)) + return; +} + +static void +clear_stats(xfrd_state_t* xfrd) +{ + size_t i; + uint64_t dbd = xfrd->nsd->st.db_disk; + uint64_t dbm = xfrd->nsd->st.db_mem; + for(i=0; i<xfrd->nsd->child_count; i++) { + xfrd->nsd->children[i].query_count = 0; + } + memset(&xfrd->nsd->st, 0, 
sizeof(struct nsdst)); + xfrd->nsd->st.db_disk = dbd; + xfrd->nsd->st.db_mem = dbm; +} + +void +daemon_remote_process_stats(struct daemon_remote* rc) +{ + struct rc_state* s; + struct timeval now; + if(!rc) return; + if(gettimeofday(&now, NULL) == -1) + log_msg(LOG_ERR, "gettimeofday: %s", strerror(errno)); + /* pop one and give it stats */ + while((s = rc->stats_list)) { + assert(s->in_stats_list); + print_stats(s->ssl, rc->xfrd, &now); + if(s->in_stats_list == 1) { + clear_stats(rc->xfrd); + rc->stats_time = now; + } + VERBOSITY(3, (LOG_INFO, "remote control stats printed")); + rc->stats_list = s->next; + s->in_stats_list = 0; + clean_point(rc, s); + } +} +#endif /* BIND8_STATS */ + +#endif /* HAVE_SSL */ diff --git a/usr.sbin/nsd/remote.h b/usr.sbin/nsd/remote.h new file mode 100644 index 00000000000..4317e1fec65 --- /dev/null +++ b/usr.sbin/nsd/remote.h @@ -0,0 +1,102 @@ +/* + * remote.h - remote control for the NSD daemon. + * + * Copyright (c) 2008, NLnet Labs. All rights reserved. + * + * This software is open source. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * Neither the name of the NLNET LABS nor the names of its contributors may + * be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file + * + * This file contains the remote control functionality for the daemon. + * The remote control can be performed using either the commandline + * nsd-control tool, or a SSLv3/TLS capable web browser. + * The channel is secured using SSLv3 or TLSv1, and certificates. + * Both the server and the client(control tool) have their own keys. + */ + +#ifndef DAEMON_REMOTE_H +#define DAEMON_REMOTE_H +struct xfrd_state; +struct nsd_options; + +/* private, defined in remote.c to keep ssl.h out of this header */ +struct daemon_remote; +struct rc_state; + +/* the remote control needs less backlog than the tcp53 service */ +#define TCP_BACKLOG_REMOTE 16 /* listen() tcp backlog */ + +/** + * Create new remote control state for the daemon. + * Also setups the control port. + * @param cfg: config file with key file settings. + * @return new state, or NULL on failure. + */ +struct daemon_remote* daemon_remote_create(struct nsd_options* cfg); + +/** + * remote control state to delete. 
+ * @param rc: state to delete. + */ +void daemon_remote_delete(struct daemon_remote* rc); + +/** + * Close remote control ports. Clears up busy connections. + * Does not delete the rc itself, or the ssl context (with its keys). + * @param rc: state to close. + */ +void daemon_remote_close(struct daemon_remote* rc); + +/** + * Open and create listening ports for remote control. + * @param rc: rc state that contains list of accept port sockets. + * @param cfg: config options. + * @return false on failure. + */ +int daemon_remote_open_ports(struct daemon_remote* rc, + struct nsd_options* cfg); + +/** + * Setup comm points for accepting remote control connections. + * @param rc: state + * @param xfrd: the process that hosts the control connection. + * The rc is attached to its event base. + */ +void daemon_remote_attach(struct daemon_remote* rc, struct xfrd_state* xfrd); + +/** + * Process statistic results and send them + * @param rc: state. + */ +void daemon_remote_process_stats(struct daemon_remote* rc); + +#endif /* DAEMON_REMOTE_H */ diff --git a/usr.sbin/nsd/rrl.h b/usr.sbin/nsd/rrl.h index 48dbb53b8cb..1ffd841664b 100644 --- a/usr.sbin/nsd/rrl.h +++ b/usr.sbin/nsd/rrl.h @@ -72,5 +72,7 @@ enum rrl_type rrlstr2type(const char* s); /** for unit test, update rrl bucket; return rate */ uint32_t rrl_update(query_type* query, uint32_t hash, uint64_t source, uint16_t flags, int32_t now, uint32_t lm); +/** set the rate limit counters, pass variables in qps */ +void rrl_set_limit(size_t lm, size_t wlm, size_t sm); #endif /* RRL_H */ diff --git a/usr.sbin/nsd/tsig-openssl.c b/usr.sbin/nsd/tsig-openssl.c index 797f7fbf2ab..6795e750f1f 100644 --- a/usr.sbin/nsd/tsig-openssl.c +++ b/usr.sbin/nsd/tsig-openssl.c @@ -1,7 +1,7 @@ /* * tsig-openssl.h -- Interface to OpenSSL for TSIG support. * - * Copyright (c) 2001-2011, NLnet Labs. All rights reserved. + * Copyright (c) 2001-2006, NLnet Labs. All rights reserved. * * See LICENSE for the license. * diff --git a/usr.sbin/nsd/tsig-openssl.h b/usr.sbin/nsd/tsig-openssl.h index 263c715b113..859c280c4c0 100644 --- a/usr.sbin/nsd/tsig-openssl.h +++ b/usr.sbin/nsd/tsig-openssl.h @@ -1,7 +1,7 @@ /* * tsig-openssl.h -- Interface to OpenSSL for TSIG support. * - * Copyright (c) 2001-2011, NLnet Labs. All rights reserved. + * Copyright (c) 2001-2006, NLnet Labs. All rights reserved. * * See LICENSE for the license. * diff --git a/usr.sbin/nsd/tsig.c b/usr.sbin/nsd/tsig.c index cf2872b563e..1844e98d9e1 100644 --- a/usr.sbin/nsd/tsig.c +++ b/usr.sbin/nsd/tsig.c @@ -1,7 +1,7 @@ /* - * tsig.h -- TSIG definitions (RFC 2845). + * tsig.c -- TSIG implementation (RFC 2845). * - * Copyright (c) 2001-2011, NLnet Labs. All rights reserved. + * Copyright (c) 2001-2006, NLnet Labs. All rights reserved. * * See LICENSE for the license. 
* @@ -16,16 +16,18 @@ #include "tsig-openssl.h" #include "dns.h" #include "packet.h" +#include "query.h" +#include "rbtree.h" static region_type *tsig_region; struct tsig_key_table { - struct tsig_key_table *next; + rbnode_t node; /* by dname */ tsig_key_type *key; }; typedef struct tsig_key_table tsig_key_table_type; -static tsig_key_table_type *tsig_key_table; +static rbtree_t *tsig_key_table; struct tsig_algorithm_table { @@ -83,11 +85,17 @@ tsig_digest_variables(tsig_record_type *tsig, int tsig_timers_only) } } +static int +tree_dname_compare(const void* a, const void* b) +{ + return dname_compare((const dname_type*)a, (const dname_type*)b); +} + int tsig_init(region_type *region) { tsig_region = region; - tsig_key_table = NULL; + tsig_key_table = rbtree_create(region, &tree_dname_compare); tsig_algorithm_table = NULL; #if defined(HAVE_SSL) @@ -99,11 +107,31 @@ tsig_init(region_type *region) void tsig_add_key(tsig_key_type *key) { - tsig_key_table_type *entry = (tsig_key_table_type *) region_alloc( + tsig_key_table_type *entry = (tsig_key_table_type *) region_alloc_zero( tsig_region, sizeof(tsig_key_table_type)); entry->key = key; - entry->next = tsig_key_table; - tsig_key_table = entry; + entry->node.key = entry->key->name; + (void)rbtree_insert(tsig_key_table, &entry->node); +} + +void +tsig_del_key(tsig_key_type *key) +{ + tsig_key_table_type *entry; + if(!key) return; + entry = (tsig_key_table_type*)rbtree_delete(tsig_key_table, key->name); + if(!entry) return; + region_recycle(tsig_region, entry, sizeof(tsig_key_table_type)); +} + +tsig_key_type* +tsig_find_key(const dname_type* name) +{ + tsig_key_table_type* entry; + entry = (tsig_key_table_type*)rbtree_search(tsig_key_table, name); + if(entry) + return entry->key; + return NULL; } void @@ -222,11 +250,21 @@ tsig_create_record_custom(tsig_record_type *tsig, region_type *region, large_object_size, initial_cleanup_size, 0); tsig->context_region = region_create_custom(xalloc, free, chunk_size, large_object_size, initial_cleanup_size, 0); - region_add_cleanup(region, tsig_cleanup, tsig); + if(region) + region_add_cleanup(region, tsig_cleanup, tsig); tsig_init_record(tsig, NULL, NULL); } void +tsig_delete_record(tsig_record_type* tsig, region_type* region) +{ + if(region) + region_remove_cleanup(region, tsig_cleanup, tsig); + region_destroy(tsig->rr_region); + region_destroy(tsig->context_region); +} + +void tsig_init_record(tsig_record_type *tsig, tsig_algorithm_type *algorithm, tsig_key_type *key) @@ -246,7 +284,6 @@ tsig_init_record(tsig_record_type *tsig, int tsig_from_query(tsig_record_type *tsig) { - tsig_key_table_type *key_entry; tsig_key_type *key = NULL; tsig_algorithm_table_type *algorithm_entry; tsig_algorithm_type *algorithm = NULL; @@ -257,16 +294,7 @@ tsig_from_query(tsig_record_type *tsig) assert(!tsig->algorithm); assert(!tsig->key); - /* XXX: TODO: slow linear check for keyname */ - for (key_entry = tsig_key_table; - key_entry; - key_entry = key_entry->next) - { - if (dname_compare(tsig->key_name, key_entry->key->name) == 0) { - key = key_entry->key; - break; - } - } + key = (tsig_key_type*)tsig_find_key(tsig->key_name); for (algorithm_entry = tsig_algorithm_table; algorithm_entry; diff --git a/usr.sbin/nsd/tsig.h b/usr.sbin/nsd/tsig.h index f09a07e5aba..71cad7740c7 100644 --- a/usr.sbin/nsd/tsig.h +++ b/usr.sbin/nsd/tsig.h @@ -1,7 +1,7 @@ /* * tsig.h -- TSIG definitions (RFC 2845). * - * Copyright (c) 2001-2011, NLnet Labs. All rights reserved. + * Copyright (c) 2001-2006, NLnet Labs. All rights reserved. 
* * See LICENSE for the license. * @@ -103,7 +103,7 @@ struct tsig_key { const dname_type *name; size_t size; - const uint8_t *data; + uint8_t *data; }; struct tsig_record @@ -144,6 +144,7 @@ int tsig_init(region_type *region); * Add the specified key to the TSIG key table. */ void tsig_add_key(tsig_key_type *key); +void tsig_del_key(tsig_key_type *key); /* * Add the specified algorithm to the TSIG algorithm table. @@ -172,6 +173,7 @@ void tsig_create_record(tsig_record_type* tsig, /* * Like tsig_create_record, with custom region settings. * The size params are used to customise the rr_region and context_region. + * If region is NULL, no cleanup is attached to it. */ void tsig_create_record_custom(tsig_record_type* tsig, region_type* region, @@ -180,6 +182,12 @@ void tsig_create_record_custom(tsig_record_type* tsig, size_t initial_cleanup_size); /* + * Destroy tsig record internals (the main ptr is user alloced). + * if region is nonNULL, removes cleanup. + */ +void tsig_delete_record(tsig_record_type* tsig, region_type* region); + +/* * Call this before starting to analyze or signing a sequence of * packets. * diff --git a/usr.sbin/nsd/udb.c b/usr.sbin/nsd/udb.c new file mode 100644 index 00000000000..6c0ffe7d0c0 --- /dev/null +++ b/usr.sbin/nsd/udb.c @@ -0,0 +1,2018 @@ +/* udb.c - u(micro) data base. + * By W.C.A. Wijngaards + * Copyright 2010, NLnet Labs. + * BSD, see LICENSE. + */ +#include "config.h" +#include "udb.h" +#include <string.h> +#include <errno.h> +#include <stdio.h> +#include <unistd.h> +#include <assert.h> +#include "lookup3.h" +#include "util.h" + +/* mmap and friends */ +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <sys/mman.h> + +/* for systems without, portable definition, failed-1 and async is a flag */ +#ifndef MAP_FAILED +#define MAP_FAILED ((void*)-1) +#endif +#ifndef MS_SYNC +#define MS_SYNC 0 +#endif + +/** move and fixup xl segment */ +static void move_xl_segment(void* base, udb_base* udb, udb_void xl, + udb_void n, uint64_t sz, uint64_t startseg); +/** attempt to compact the data and move free space to the end */ +static int udb_alloc_compact(void* base, udb_alloc* alloc); + +/** convert pointer to the data part to a pointer to the base of the chunk */ +static udb_void +chunk_from_dataptr(udb_void data) +{ + /* we use that sizeof(udb_chunk_d) != sizeof(udb_xl_chunk_d) and + * that xl_chunk_d is aligned on x**1024 boundaries. 
*/ + udb_void xl = data - sizeof(udb_xl_chunk_d); + if( (xl & (UDB_ALLOC_CHUNK_SIZE-1)) == 0) + return xl; + return data - sizeof(udb_chunk_d); +} + +udb_void chunk_from_dataptr_ext(udb_void data) { + return chunk_from_dataptr(data); +} + +#ifndef NDEBUG +/** read last octet from a chunk */ +static uint8_t +chunk_get_last(void* base, udb_void chunk, int exp) +{ + return *((uint8_t*)UDB_REL(base, chunk+(1<<exp)-1)); +} +#endif + +/** write last octet of a chunk */ +static void +chunk_set_last(void* base, udb_void chunk, int exp, uint8_t value) +{ + *((uint8_t*)UDB_REL(base, chunk+(1<<exp)-1)) = value; +} + +/** create udb_base from a file descriptor (must be at start of file) */ +udb_base* +udb_base_create_fd(const char* fname, int fd, udb_walk_relptr_func walkfunc, + void* arg) +{ + uint64_t m; + udb_glob_d g; + ssize_t r; + udb_base* udb = (udb_base*)xalloc_zero(sizeof(*udb)); + if(!udb) { + log_msg(LOG_ERR, "out of memory"); + close(fd); + return NULL; + } + udb->fname = strdup(fname); + if(!udb->fname) { + log_msg(LOG_ERR, "out of memory"); + free(udb); + close(fd); + return NULL; + } + udb->walkfunc = walkfunc; + udb->walkarg = arg; + udb->fd = fd; + udb->ram_size = 1024; + udb->ram_mask = (int)udb->ram_size - 1; + udb->ram_hash = (udb_ptr**)xalloc_zero(sizeof(udb_ptr*)*udb->ram_size); + if(!udb->ram_hash) { + free(udb->fname); + free(udb); + log_msg(LOG_ERR, "out of memory"); + close(fd); + return NULL; + } + + /* read magic */ + if((r=read(fd, &m, sizeof(m))) == -1) { + log_msg(LOG_ERR, "%s: %s", fname, strerror(errno)); + goto fail; + } else if(r != (ssize_t)sizeof(m)) { + log_msg(LOG_ERR, "%s: file too short", fname); + goto fail; + } + /* TODO : what if bigendian and littleendian file, see magic */ + if(m != UDB_MAGIC) { + log_msg(LOG_ERR, "%s: wrong type of file", fname); + goto fail; + } + /* read header */ + if((r=read(fd, &g, sizeof(g))) == -1) { + log_msg(LOG_ERR, "%s: %s\n", fname, strerror(errno)); + goto fail; + } else if(r != (ssize_t)sizeof(g)) { + log_msg(LOG_ERR, "%s: file too short", fname); + goto fail; + } + if(g.version != 0) { + log_msg(LOG_ERR, "%s: unknown file version %d", fname, + (int)g.version); + goto fail; + } + if(g.hsize < UDB_HEADER_SIZE) { + log_msg(LOG_ERR, "%s: header size too small %d", fname, + (int)g.hsize); + goto fail; + } + if(g.hsize > UDB_HEADER_SIZE) { + log_msg(LOG_WARNING, "%s: header size too large %d", fname, + (int)g.hsize); + log_msg(LOG_WARNING, "attempting to continue..."); + } + if(g.clean_close != 0) { + log_msg(LOG_WARNING, "%s: not cleanly closed %d", fname, + (int)g.clean_close); + log_msg(LOG_WARNING, "attempting to continue..."); + } + /* TODO check if too large (>4g on 32bit); mmap-usage would fail */ + + /* mmap it */ + if(g.fsize < UDB_HEADER_SIZE || g.fsize < g.hsize) { + log_msg(LOG_ERR, "%s: file too short", fname); + goto fail; + } + udb->base_size = (size_t)g.fsize; + /* note the size_t casts must be there for portability, on some + * systems the layout of memory is otherwise broken. 
*/ + udb->base = mmap(NULL, (size_t)udb->base_size, + (int)PROT_READ|PROT_WRITE, (int)MAP_SHARED, + (int)udb->fd, (off_t)0); + if(udb->base == MAP_FAILED) { + udb->base = NULL; + log_msg(LOG_ERR, "mmap(size %u) error: %s", + (unsigned)udb->base_size, strerror(errno)); + fail: + close(fd); + free(udb->fname); + free(udb->ram_hash); + free(udb); + return NULL; + } + + /* init completion */ + udb->glob_data = (udb_glob_d*)(udb->base+sizeof(uint64_t)); + r = 0; + if(udb->glob_data->dirty_alloc != udb_dirty_clean) + r = 1; + udb->alloc = udb_alloc_create(udb, (udb_alloc_d*)( + (void*)udb->glob_data+sizeof(*udb->glob_data))); + if(!udb->alloc) { + log_msg(LOG_ERR, "out of memory"); + udb_base_free(udb); + return NULL; + } + if(r) { + /* and compact now, or resume compacting */ + udb_alloc_compact(udb, udb->alloc); + udb_base_sync(udb, 1); + } + + return udb; +} + +udb_base* udb_base_create_read(const char* fname, udb_walk_relptr_func walkfunc, + void* arg) +{ + int fd = open(fname, O_RDWR); + if(fd == -1) { + log_msg(LOG_ERR, "%s: %s", fname, strerror(errno)); + return NULL; + } + return udb_base_create_fd(fname, fd, walkfunc, arg); +} + +/** init new udb_global structure */ +static void udb_glob_init_new(udb_glob_d* g) +{ + memset(g, 0, sizeof(*g)); + g->hsize = UDB_HEADER_SIZE; + g->fsize = UDB_HEADER_SIZE; +} + +/** write data to file and check result */ +static int +write_fdata(const char* fname, int fd, void* data, size_t len) +{ + ssize_t w; + if((w=write(fd, data, len)) == -1) { + log_msg(LOG_ERR, "%s: %s", fname, strerror(errno)); + close(fd); + return 0; + } else if(w != (ssize_t)len) { + log_msg(LOG_ERR, "%s: short write (disk full?)", fname); + close(fd); + return 0; + } + return 1; +} + +udb_base* udb_base_create_new(const char* fname, udb_walk_relptr_func walkfunc, + void* arg) +{ + uint64_t m; + udb_glob_d g; + udb_alloc_d a; + uint64_t endsize = UDB_HEADER_SIZE; + uint64_t endexp = 0; + int fd = open(fname, O_CREAT|O_RDWR, 0600); + if(fd == -1) { + log_msg(LOG_ERR, "%s: %s", fname, strerror(errno)); + return NULL; + } + m = UDB_MAGIC; + udb_glob_init_new(&g); + udb_alloc_init_new(&a); + + /* write new data to file (closes fd on error) */ + if(!write_fdata(fname, fd, &m, sizeof(m))) + return NULL; + if(!write_fdata(fname, fd, &g, sizeof(g))) + return NULL; + if(!write_fdata(fname, fd, &a, sizeof(a))) + return NULL; + if(!write_fdata(fname, fd, &endsize, sizeof(endsize))) + return NULL; + if(!write_fdata(fname, fd, &endexp, sizeof(endexp))) + return NULL; + /* rewind to start */ + if(lseek(fd, (off_t)0, SEEK_SET) == (off_t)-1) { + log_msg(LOG_ERR, "%s: lseek %s", fname, strerror(errno)); + close(fd); + return NULL; + } + return udb_base_create_fd(fname, fd, walkfunc, arg); +} + +/** shrink the udb base if it has unused space at the end */ +static void +udb_base_shrink(udb_base* udb, uint64_t nsize) +{ + udb->glob_data->dirty_alloc = udb_dirty_fsize; + udb->glob_data->fsize = nsize; + /* sync, does not *seem* to be required on Linux, but it is + certainly required on OpenBSD. Otherwise changed data is lost. 
*/ + msync(udb->base, udb->base_size, MS_ASYNC); + if(ftruncate(udb->fd, (off_t)nsize) != 0) { + log_msg(LOG_ERR, "%s: ftruncate(%u) %s", udb->fname, + (unsigned)nsize, strerror(errno)); + } + udb->glob_data->dirty_alloc = udb_dirty_clean; +} + +void udb_base_close(udb_base* udb) +{ + if(!udb) + return; + if(udb->fd != -1 && udb->base && udb->alloc) { + uint64_t nsize = udb->alloc->disk->nextgrow; + if(nsize < udb->base_size) + udb_base_shrink(udb, nsize); + } + if(udb->fd != -1) { + close(udb->fd); + udb->fd = -1; + } + if(udb->base) { + if(munmap(udb->base, udb->base_size) == -1) { + log_msg(LOG_ERR, "munmap: %s", strerror(errno)); + } + udb->base = NULL; + } +} + +void udb_base_free(udb_base* udb) +{ + if(!udb) + return; + udb_base_close(udb); + udb_alloc_delete(udb->alloc); + free(udb->ram_hash); + free(udb->fname); + free(udb); +} + +void udb_base_free_keep_mmap(udb_base* udb) +{ + if(!udb) return; + if(udb->fd != -1) { + close(udb->fd); + udb->fd = -1; + } + udb->base = NULL; + udb_alloc_delete(udb->alloc); + free(udb->ram_hash); + free(udb->fname); + free(udb); +} + +void udb_base_sync(udb_base* udb, int wait) +{ + if(msync(udb->base, udb->base_size, wait?MS_SYNC:MS_ASYNC) != 0) { + log_msg(LOG_ERR, "msync(%s) error %s", + udb->fname, strerror(errno)); + } +} + +/** hash a chunk pointer */ +static uint32_t +chunk_hash_ptr(udb_void p) +{ + /* put p into an array of uint32 */ + uint32_t h[sizeof(p)/sizeof(uint32_t)]; + memcpy(&h, &p, sizeof(h)); + return hashword(h, sizeof(p)/sizeof(uint32_t), 0x8763); +} + +/** check that the given pointer is on the bucket for the given offset */ +int udb_ptr_is_on_bucket(udb_base* udb, udb_ptr* ptr, udb_void to) +{ + uint32_t i = chunk_hash_ptr(to) & udb->ram_mask; + udb_ptr* p; + assert((size_t)i < udb->ram_size); + for(p = udb->ram_hash[i]; p; p=p->next) { + if(p == ptr) + return 1; + } + return 0; +} + +/** grow the ram array */ +static void +grow_ram_hash(udb_base* udb, udb_ptr** newhash) +{ + size_t i; + size_t osize= udb->ram_size; + udb_ptr* p, *np; + udb_ptr** oldhash = udb->ram_hash; + udb->ram_size *= 2; + udb->ram_mask <<= 1; + udb->ram_mask |= 1; + udb->ram_hash = newhash; + /* have to link in every element in the old list into the new list*/ + for(i=0; i<osize; i++) { + p = oldhash[i]; + while(p) { + np = p->next; + /* link into newhash */ + p->prev=NULL; + p->next=newhash[chunk_hash_ptr(p->data)&udb->ram_mask]; + if(p->next) p->next->prev = p; + /* go to next element of oldhash */ + p = np; + } + } + free(oldhash); +} + +void udb_base_link_ptr(udb_base* udb, udb_ptr* ptr) +{ + uint32_t i = chunk_hash_ptr(ptr->data) & udb->ram_mask; + assert((size_t)i < udb->ram_size); +#ifdef UDB_CHECK + assert(udb_valid_dataptr(udb, ptr->data)); /* must be to whole chunk*/ +#endif + udb->ram_num++; + if(udb->ram_num == udb->ram_size && udb->ram_size<(size_t)0xefffffff) { + /* grow the array, if allocation succeeds */ + udb_ptr** newram = (udb_ptr**)xalloc_zero(sizeof(udb_ptr*)* + udb->ram_size*2); + if(newram) { + grow_ram_hash(udb, newram); + } + } + ptr->prev = NULL; + ptr->next = udb->ram_hash[i]; + udb->ram_hash[i] = ptr; + if(ptr->next) + ptr->next->prev = ptr; +} + +void udb_base_unlink_ptr(udb_base* udb, udb_ptr* ptr) +{ + assert(ptr->data); +#ifdef UDB_CHECK + assert(udb_valid_dataptr(udb, ptr->data)); /* ptr must be inited */ + assert(udb_ptr_is_on_bucket(udb, ptr, ptr->data)); +#endif + udb->ram_num--; + if(ptr->next) + ptr->next->prev = ptr->prev; + if(ptr->prev) + ptr->prev->next = ptr->next; + else { + uint32_t i = 
chunk_hash_ptr(ptr->data) & udb->ram_mask; + assert((size_t)i < udb->ram_size); + udb->ram_hash[i] = ptr->next; + } +} + +/** change a set of ram ptrs to a new value */ +static void +udb_base_ram_ptr_edit(udb_base* udb, udb_void old, udb_void newd) +{ + uint32_t io = chunk_hash_ptr(old) & udb->ram_mask; + udb_ptr* p, *np; + /* edit them and move them into the new position */ + p = udb->ram_hash[io]; + while(p) { + np = p->next; + if(p->data == old) { + udb_base_unlink_ptr(udb, p); + p->data = newd; + udb_base_link_ptr(udb, p); + } + p = np; + } +} + +udb_rel_ptr* udb_base_get_userdata(udb_base* udb) +{ + return &udb->glob_data->user_global; +} + +void udb_base_set_userdata(udb_base* udb, udb_void user) +{ +#ifdef UDB_CHECK + if(user) { assert(udb_valid_dataptr(udb, user)); } +#endif + udb_rel_ptr_set(udb->base, &udb->glob_data->user_global, user); +} + +void udb_base_set_userflags(udb_base* udb, uint8_t v) +{ + udb->glob_data->userflags = v; +} + +uint8_t udb_base_get_userflags(udb_base* udb) +{ + return udb->glob_data->userflags; +} + +/** re-mmap the udb to specified size */ +static void* +udb_base_remap(udb_base* udb, udb_alloc* alloc, uint64_t nsize) +{ + void* nb; + /* for use with valgrind, do not use mremap, but the other version */ +#ifdef MREMAP_MAYMOVE + nb = mremap(udb->base, udb->base_size, nsize, MREMAP_MAYMOVE); + if(nb == MAP_FAILED) { + log_msg(LOG_ERR, "mremap(%s, size %u) error %s", + udb->fname, (unsigned)nsize, strerror(errno)); + return 0; + } +#else /* !HAVE MREMAP */ + /* use munmap-mmap to simulate mremap */ + if(munmap(udb->base, udb->base_size) != 0) { + log_msg(LOG_ERR, "munmap(%s) error %s", + udb->fname, strerror(errno)); + } + /* provide hint for new location */ + /* note the size_t casts must be there for portability, on some + * systems the layout of memory is otherwise broken. */ + nb = mmap(udb->base, (size_t)nsize, (int)PROT_READ|PROT_WRITE, + (int)MAP_SHARED, (int)udb->fd, (off_t)0); + /* retry the mmap without basept in case of ENOMEM (FreeBSD8), + * the kernel can then try to mmap it at a different location + * where more memory is available */ + if(nb == MAP_FAILED && errno == ENOMEM) { + nb = mmap(NULL, (size_t)nsize, (int)PROT_READ|PROT_WRITE, + (int)MAP_SHARED, (int)udb->fd, (off_t)0); + } + if(nb == MAP_FAILED) { + log_msg(LOG_ERR, "mmap(%s, size %u) error %s", + udb->fname, (unsigned)nsize, strerror(errno)); + udb->base = NULL; + return 0; + } +#endif /* HAVE MREMAP */ + if(nb != udb->base) { + /* fix up realpointers in udb and alloc */ + /* but mremap may have been nice and not move the base */ + udb->base = nb; + udb->glob_data = (udb_glob_d*)(nb+sizeof(uint64_t)); + /* use passed alloc pointer because the udb->alloc may not + * be initialized yet */ + alloc->disk = (udb_alloc_d*)((void*)udb->glob_data + +sizeof(*udb->glob_data)); + } + udb->base_size = nsize; + return nb; +} + +void +udb_base_remap_process(udb_base* udb) +{ + /* assume that fsize is still accessible */ + udb_base_remap(udb, udb->alloc, udb->glob_data->fsize); +} + +/** grow file to specified size and re-mmap, return new base */ +static void* +udb_base_grow_and_remap(udb_base* udb, uint64_t nsize) +{ + /* grow file by writing a single zero at that spot, the + * rest is filled in with zeroes. 
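The remap logic above prefers mremap() where the platform provides it and otherwise simulates it with munmap() plus mmap(), retrying without an address hint on failure. A reduced sketch of that pattern, assuming only POSIX mmap plus optional Linux mremap; the helper name is invented and error logging is left out:

#include <stddef.h>
#include <sys/mman.h>

/* grow a file-backed mapping; returns the new base or NULL on failure */
void*
remap_grow(void* oldbase, size_t oldsize, size_t newsize, int fd)
{
	void* nb;
#ifdef MREMAP_MAYMOVE
	(void)fd;	/* not needed when the kernel can move the mapping */
	nb = mremap(oldbase, oldsize, newsize, MREMAP_MAYMOVE);
	if(nb == MAP_FAILED)
		return NULL;
#else
	/* no mremap: drop the old mapping and map the larger size, hinting
	 * at the old address so the base (and all derived pointers) may
	 * stay put; retry without the hint if that fails */
	if(munmap(oldbase, oldsize) != 0)
		return NULL;
	nb = mmap(oldbase, newsize, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
	if(nb == MAP_FAILED)
		nb = mmap(NULL, newsize, PROT_READ|PROT_WRITE, MAP_SHARED,
			fd, 0);
	if(nb == MAP_FAILED)
		return NULL;
#endif
	return nb;
}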
*/ + uint8_t z = 0; + ssize_t w; + + assert(nsize > 0); + udb->glob_data->dirty_alloc = udb_dirty_fsize; +#ifdef HAVE_PWRITE + if((w=pwrite(udb->fd, &z, sizeof(z), (off_t)(nsize-1))) == -1) { +#else + if(lseek(udb->fd, (off_t)(nsize-1), SEEK_SET) == -1) { + log_msg(LOG_ERR, "fseek %s: %s", udb->fname, strerror(errno)); + return 0; + } + if((w=write(udb->fd, &z, sizeof(z))) == -1) { +#endif + log_msg(LOG_ERR, "grow(%s, size %u) error %s", + udb->fname, (unsigned)nsize, strerror(errno)); + return 0; + } else if(w != (ssize_t)sizeof(z)) { + log_msg(LOG_ERR, "grow(%s, size %u) failed (disk full?)", + udb->fname, (unsigned)nsize); + return 0; + } + udb->glob_data->fsize = nsize; + udb->glob_data->dirty_alloc = udb_dirty_clean; + return udb_base_remap(udb, udb->alloc, nsize); +} + +int udb_exp_size(uint64_t a) +{ + /* find enclosing value such that 2**x >= a */ + int x = 0; + uint64_t i = a; + assert(a != 0); + + i --; + /* could optimise this with uint8* access, depends on endianness */ + /* first whole bytes */ + while( (i&(~(uint64_t)0xff)) ) { + i >>= 8; + x += 8; + } + /* now details */ + while(i) { + i >>= 1; + x ++; + } + assert( ((uint64_t)1<<x) >= a); + assert( x==0 || ((uint64_t)1<<(x-1)) < a); + return x; +} + +int udb_exp_offset(uint64_t o) +{ + /* this means measuring the number of 0 bits on the right */ + /* so, if exp zero bits then (o&(2**x-1))==0 */ + int x = 0; + uint64_t i = o; + assert(o != 0); + /* first whole bytes */ + while( (i&(uint64_t)0xff) == 0) { + i >>= 8; + x += 8; + } + /* now details */ + while( (i&(uint64_t)0x1) == 0) { + i >>= 1; + x ++; + } + assert( o % ((uint64_t)1<<x) == 0); + assert( o % ((uint64_t)1<<(x+1)) != 0); + return x; +} + +void udb_alloc_init_new(udb_alloc_d* a) +{ + assert(UDB_HEADER_SIZE % UDB_ALLOC_CHUNK_MINSIZE == 0); + memset(a, 0, sizeof(*a)); + /* set new allocations after header, as if allocated in a sequence + * of minsize allocations */ + a->nextgrow = UDB_HEADER_SIZE; +} + +/** fsck the file size, false if failed and file is useless */ +static int +fsck_fsize(udb_base* udb, udb_alloc* alloc) +{ + off_t realsize; + log_msg(LOG_WARNING, "udb-fsck %s: file size wrong", udb->fname); + realsize = lseek(udb->fd, (off_t)0, SEEK_END); + if(realsize == (off_t)-1) { + log_msg(LOG_ERR, "lseek(%s): %s", udb->fname, strerror(errno)); + return 0; + } + udb->glob_data->fsize = (uint64_t)realsize; + if(!udb_base_remap(udb, alloc, (uint64_t)realsize)) + return 0; + udb->glob_data->dirty_alloc = udb_dirty_clean; + log_msg(LOG_WARNING, "udb-fsck %s: file size fixed (sync)", udb->fname); + udb_base_sync(udb, 1); + return 1; +} + +/** regenerate freelist add a new free chunk, return next todo */ +static udb_void +regen_free(void* base, udb_void c, int exp, udb_alloc_d* regen) +{ + udb_free_chunk_d* cp = UDB_FREE_CHUNK(c); + uint64_t esz = (uint64_t)1<<exp; + if(exp < UDB_ALLOC_CHUNK_MINEXP || exp > UDB_ALLOC_CHUNKS_MAX) { + return 0; + } + cp->type = udb_chunk_type_free; + cp->flags = 0; + chunk_set_last(base, c, exp, (uint8_t)exp); + cp->prev = 0; + cp->next = regen->free[exp-UDB_ALLOC_CHUNK_MINEXP]; + if(cp->next) + UDB_FREE_CHUNK(cp->next)->prev = c; + regen->stat_free += esz; + return c + esz; +} + +/** regenerate xl chunk, return next todo */ +static udb_void +regen_xl(void* base, udb_void c, udb_alloc_d* regen) +{ + udb_xl_chunk_d* cp = UDB_XL_CHUNK(c); + uint64_t xlsz = cp->size; + if( (xlsz&(UDB_ALLOC_CHUNK_SIZE-1)) != 0) { + return 0; + } + if( (c&(UDB_ALLOC_CHUNK_SIZE-1)) != 0) { + return 0; + } + /* fixup end-size and end-expmarker */ + 
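udb_exp_size() and udb_exp_offset() above are the two pieces of exponent arithmetic the buddy allocator relies on: the smallest power of two that covers a size, and the largest power-of-two alignment of an offset. A simplified standalone version (bit-by-bit instead of byte-stepping), combined with the size-class rule used later by the allocator (payload plus the 16-byte chunk header plus the trailing exp octet); note the real code additionally clamps small requests to the 32-byte minimum and sends anything over 1MB to the XL path:

#include <stdint.h>
#include <stdio.h>

/* smallest x with 2**x >= a (a != 0) */
static int exp_size(uint64_t a)
{
	int x = 0;
	uint64_t i = a - 1;
	while(i) { i >>= 1; x++; }
	return x;
}

/* number of trailing zero bits of o (o != 0): largest alignment of o */
static int exp_offset(uint64_t o)
{
	int x = 0;
	while((o & 1) == 0) { o >>= 1; x++; }
	return x;
}

int main(void)
{
	uint64_t payload;
	for(payload = 1; payload <= 4096; payload *= 5)
		printf("payload %4llu -> chunk 2**%d\n",
			(unsigned long long)payload,
			exp_size(payload + 16 + 1));
	printf("offset 0x300000 is 2**%d aligned\n", exp_offset(0x300000));
	return 0;
}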
regen->stat_alloc += xlsz; + return c + xlsz; +} + +/** regenerate data chunk, return next todo */ +static udb_void +regen_data(void* base, udb_void c, int exp, udb_alloc_d* regen) +{ + uint64_t esz = (uint64_t)1<<exp; + if(exp < UDB_ALLOC_CHUNK_MINEXP || exp > UDB_ALLOC_CHUNKS_MAX) { + return 0; + } + chunk_set_last(base, c, exp, (uint8_t)exp); + regen->stat_alloc += esz; + return c + esz; +} + +/** regenerate a relptr structure inside a data segment */ +static void +regen_relptr_func(void* base, udb_rel_ptr* rp, void* arg) +{ + udb_void* a = (udb_void*)arg; + /* ignore 0 pointers */ + if(!rp->data) + return; + + /* edit relptrs that point to oldmoved to point to newmoved. */ + if(rp->data == a[0]) + rp->data = a[1]; + + /* regenerate relptr lists, add this item to the relptr list for + * the data that it points to */ + udb_rel_ptr_link(base, rp, rp->data); +} + +/** regenerate the relptrs store in this data segment */ +static void +regen_its_ptrs(void* base, udb_base* udb, udb_chunk_d* atp, + void* data, uint64_t dsz, udb_void rb_old, udb_void rb_new) +{ + udb_void arg[2]; + arg[0] = rb_old; arg[1] = rb_new; + /* walk through the structs here and put them on their respective + * relptr lists */ + (*udb->walkfunc)(base, udb->walkarg, atp->type, data, dsz, + &regen_relptr_func, arg); + +} + +/** regenerate relptrlists in the file */ +static void +regen_ptrlist(void* base, udb_base* udb, udb_alloc* alloc, + udb_void rb_old, udb_void rb_new) +{ + udb_void at = alloc->udb->glob_data->hsize; + /* clear all ptrlist start pointers in the file. */ + while(at < alloc->disk->nextgrow) { + int exp = (int)UDB_CHUNK(at)->exp; + udb_chunk_type tp = (udb_chunk_type)UDB_CHUNK(at)->type; + if(exp == UDB_EXP_XL) { + UDB_XL_CHUNK(at)->ptrlist = 0; + at += UDB_XL_CHUNK(at)->size; + } else if(tp == udb_chunk_type_free) { + at += (uint64_t)1<<exp; + } else { /* data chunk */ + UDB_CHUNK(at)->ptrlist = 0; + at += (uint64_t)1<<exp; + } + } + /* walk through all relptr structs and put on the right list.
*/ + at = alloc->udb->glob_data->hsize; + while(at < alloc->disk->nextgrow) { + udb_chunk_d* atp = UDB_CHUNK(at); + int exp = (int)atp->exp; + udb_chunk_type tp = (udb_chunk_type)atp->type; + uint64_t sz = ((exp == UDB_EXP_XL)?UDB_XL_CHUNK(at)->size: + (uint64_t)1<<exp); + if(exp == UDB_EXP_XL) { + assert(at != rb_old); /* should have been freed */ + regen_its_ptrs(base, udb, atp, + ((void*)atp)+sizeof(udb_xl_chunk_d), + sz-sizeof(udb_xl_chunk_d) - sizeof(uint64_t)*2, + rb_old, rb_new); + at += sz; + } else if(tp == udb_chunk_type_free) { + at += sz; + } else { /* data chunk */ + assert(at != rb_old); /* should have been freed */ + regen_its_ptrs(base, udb, atp, + ((void*)atp)+sizeof(udb_chunk_d), + sz-sizeof(udb_chunk_d)-1, rb_old, rb_new); + at += sz; + } + } +} + + +/** mark free elements from ex XL chunk space and later fixups pick that up */ +static void +rb_mark_free_segs(void* base, udb_void s, uint64_t m) +{ + udb_void q = s + m - UDB_ALLOC_CHUNK_SIZE; + /* because of header and alignment we know s >= UDB_ALLOC_CHUNK_SIZE*/ + assert(s >= UDB_ALLOC_CHUNK_SIZE); + while(q >= s) { + UDB_CHUNK(q)->exp = UDB_ALLOC_CHUNKS_MAX; + UDB_CHUNK(q)->type = udb_chunk_type_free; + q -= UDB_ALLOC_CHUNK_SIZE; + } +} + + +/** fsck rollback or rollforward XL move results */ +static int +fsck_rb_xl(void* base, udb_base* udb, udb_void rb_old, udb_void rb_new, + uint64_t rb_size, uint64_t rb_seg) +{ + + if(rb_old <= rb_new) + return 0; /* XL move one way */ + if( (rb_size&(UDB_ALLOC_CHUNK_SIZE-1)) != 0) + return 0; /* not aligned */ + if( (rb_old&(UDB_ALLOC_CHUNK_SIZE-1)) != 0) + return 0; /* not aligned */ + if( (rb_new&(UDB_ALLOC_CHUNK_SIZE-1)) != 0) + return 0; /* not aligned */ + if(rb_new + rb_size <= rb_old) { + /* not overlapping: resume copy */ + memcpy(UDB_CHUNK(rb_new), UDB_CHUNK(rb_old), rb_size); + /* and free up old piece(s) */ + rb_mark_free_segs(base, rb_old, rb_size); + } else { + /* overlapping, see what segment we stopped at + * and continue there. 
*/ + move_xl_segment(base, udb, rb_old, rb_new, rb_size, rb_seg); + /* free up old piece(s); from the end of the moved segment, + * until the end of the old segment */ + rb_mark_free_segs(base, rb_new+rb_size, (rb_old+rb_size)- + (rb_new+rb_size)); + } + /* do not call fix_ptrs, regenptrs does the job */ + return 1; +} + +/** fsck rollback or rollforward move results */ +static int +fsck_rb(void* base, udb_void rb_old, udb_void rb_new, uint64_t rb_size, + udb_void* make_free) +{ + if( (rb_size&(rb_size-1)) != 0) + return 0; /* not powerof2 */ + if( (rb_old&(rb_size-1)) != 0) + return 0; /* not aligned */ + if( (rb_new&(rb_size-1)) != 0) + return 0; /* not aligned */ + /* resume copy */ + memcpy(UDB_CHUNK(rb_new), UDB_CHUNK(rb_old), rb_size); + /* do not call fix_ptrs, regenptrs does the job */ + /* make sure udb_old is freed */ + *make_free = rb_old; + return 1; +} + +/** fsck the file and salvage, false if failed and file is useless */ +static int +fsck_file(udb_base* udb, udb_alloc* alloc, int moved) +{ + void* base = udb->base; + udb_alloc_d regen; + udb_void at = udb->glob_data->hsize; + udb_void rb_old = udb->glob_data->rb_old; + udb_void rb_new = udb->glob_data->rb_new; + udb_void rb_seg = udb->glob_data->rb_seg; + udb_void make_free = 0; + uint64_t rb_size = udb->glob_data->rb_size; + log_msg(LOG_WARNING, "udb-fsck %s: salvaging", udb->fname); + /* walk through the file, use the exp values to see what can be + * salvaged */ + if(moved && rb_old && rb_new && rb_size) { + if(rb_old+rb_size <= alloc->disk->nextgrow + && rb_new+rb_size <= alloc->disk->nextgrow) { + /* we can use the move information to fix up the + * duplicate element (or partially moved element) */ + if(rb_size > 1024*1024) { + /* XL chunk */ + if(!fsck_rb_xl(base, udb, rb_old, rb_new, + rb_size, rb_seg)) + return 0; + } else { + if(!fsck_rb(base, rb_old, rb_new, rb_size, + &make_free)) + return 0; + } + } + } + + /* rebuild freelists */ + /* recalculate stats in alloc (except 'stat_data') */ + /* possibly new end 'nextgrow' value */ + memset(&regen, 0, sizeof(regen)); + regen.nextgrow = alloc->disk->nextgrow; + while(at < regen.nextgrow) { + /* figure out this chunk */ + int exp = (int)UDB_CHUNK(at)->exp; + udb_chunk_type tp = (udb_chunk_type)UDB_CHUNK(at)->type; + /* consistency check possible here with end-exp */ + if(tp == udb_chunk_type_free || at == make_free) { + at = regen_free(base, at, exp, &regen); + if(!at) return 0; + } else if(exp == UDB_EXP_XL) { + /* allocated data of XL size */ + at = regen_xl(base, at, &regen); + if(!at) return 0; + } else if(exp >= UDB_ALLOC_CHUNK_MINEXP + && exp <= UDB_ALLOC_CHUNKS_MAX) { + /* allocated data */ + at = regen_data(base, at, exp, &regen); + if(!at) return 0; + } else { + /* garbage; this must be EOF then */ + regen.nextgrow = at; + break; + } + } + *alloc->disk = regen; + + /* rebuild relptr lists */ + regen_ptrlist(base, udb, alloc, rb_old, rb_new); + + log_msg(LOG_WARNING, "udb-fsck %s: salvaged successfully (sync)", + udb->fname); + udb->glob_data->rb_old = 0; + udb->glob_data->rb_new = 0; + udb->glob_data->rb_size = 0; + udb->glob_data->dirty_alloc = udb_dirty_clean; + udb_base_sync(udb, 1); + return 1; +} + + +udb_alloc* udb_alloc_create(udb_base* udb, udb_alloc_d* disk) +{ + udb_alloc* alloc = (udb_alloc*)xalloc_zero(sizeof(*alloc)); + if(!alloc) + return NULL; + alloc->udb = udb; + alloc->disk = disk; + /* see if committed but uncompleted actions need to be done */ + /* preserves the alloc state */ + if(udb->glob_data->dirty_alloc != udb_dirty_clean) { +
if(udb->glob_data->dirty_alloc == udb_dirty_fsize) { + if(fsck_fsize(udb, alloc)) + return alloc; + } else if(udb->glob_data->dirty_alloc == udb_dirty_fl) { + if(fsck_file(udb, alloc, 0)) + return alloc; + } else if(udb->glob_data->dirty_alloc == udb_dirty_compact) { + if(fsck_file(udb, alloc, 1)) + return alloc; + } + log_msg(LOG_ERR, "error: file allocation dirty (%d)", + (int)udb->glob_data->dirty_alloc); + free(alloc); + return NULL; + } + return alloc; +} + +void udb_alloc_delete(udb_alloc* alloc) +{ + if(!alloc) return; + free(alloc); +} + +/** unlink this element from its freelist */ +static void +udb_alloc_unlink_fl(void* base, udb_alloc* alloc, udb_void chunk, int exp) +{ + udb_free_chunk_d* fp = UDB_FREE_CHUNK(chunk); + assert(chunk); + /* chunk is a free chunk */ + assert(fp->exp == (uint8_t)exp); + assert(fp->type == udb_chunk_type_free); + assert(chunk_get_last(base, chunk, exp) == (uint8_t)exp); + /* and thus freelist not empty */ + assert(alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP]); + /* unlink */ + if(fp->prev) + UDB_FREE_CHUNK(fp->prev)->next = fp->next; + else alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP] = fp->next; + if(fp->next) + UDB_FREE_CHUNK(fp->next)->prev = fp->prev; +} + +/** pop first element off freelist, list may not be empty */ +static udb_void +udb_alloc_pop_fl(void* base, udb_alloc* alloc, int exp) +{ + udb_void f = alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP]; + udb_free_chunk_d* fp = UDB_FREE_CHUNK(f); + assert(f); + assert(fp->exp == (uint8_t)exp); + assert(fp->type == udb_chunk_type_free); + assert(chunk_get_last(base, f, exp) == (uint8_t)exp); + alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP] = fp->next; + if(fp->next) { + UDB_FREE_CHUNK(fp->next)->prev = 0; + } + return f; +} + +/** push new element onto freelist */ +static void +udb_alloc_push_fl(void* base, udb_alloc* alloc, udb_void f, int exp) +{ + udb_free_chunk_d* fp = UDB_FREE_CHUNK(f); + assert(f); + fp->exp = (uint8_t)exp; + fp->type = udb_chunk_type_free; + fp->flags = 0; + fp->prev = 0; + fp->next = alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP]; + if(fp->next) + UDB_FREE_CHUNK(fp->next)->prev = f; + chunk_set_last(base, f, exp, (uint8_t)exp); + alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP] = f; +} + +/** push new element onto freelist - do not initialize the elt */ +static void +udb_alloc_push_fl_noinit(void* base, udb_alloc* alloc, udb_void f, int exp) +{ + udb_free_chunk_d* fp = UDB_FREE_CHUNK(f); + assert(f); + assert(fp->exp == (uint8_t)exp); + assert(fp->type == udb_chunk_type_free); + assert(chunk_get_last(base, f, exp) == (uint8_t)exp); + fp->prev = 0; + fp->next = alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP]; + if(fp->next) + UDB_FREE_CHUNK(fp->next)->prev = f; + alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP] = f; +} + +/** add free chunks at end until specified alignment occurs */ +static void +grow_align(void* base, udb_alloc* alloc, uint64_t esz) +{ + while( (alloc->disk->nextgrow & (esz-1)) != 0) { + /* the nextgrow is not a whole multiple of esz. 
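The freelist helpers above keep one doubly linked list per chunk exponent, indexed as free[exp - UDB_ALLOC_CHUNK_MINEXP], with push and pop operating on the list head. An in-memory analogue using ordinary pointers instead of file offsets, purely to illustrate the indexing and the LIFO behaviour:

#include <assert.h>
#include <stdio.h>

#define MINEXP 5	/* 32-byte minimum chunk, as above */
#define MAXEXP 20	/* 1MB maximum chunk, as above */

struct fchunk { struct fchunk* prev; struct fchunk* next; int exp; };
static struct fchunk* freelist[MAXEXP - MINEXP + 1];

/* push a free chunk on the head of the list for its exponent */
static void push_fl(struct fchunk* f, int exp)
{
	f->exp = exp;
	f->prev = NULL;
	f->next = freelist[exp - MINEXP];
	if(f->next) f->next->prev = f;
	freelist[exp - MINEXP] = f;
}

/* pop the head of the list for this exponent; list must not be empty */
static struct fchunk* pop_fl(int exp)
{
	struct fchunk* f = freelist[exp - MINEXP];
	assert(f != NULL);
	freelist[exp - MINEXP] = f->next;
	if(f->next) f->next->prev = NULL;
	return f;
}

int main(void)
{
	struct fchunk a, b;
	push_fl(&a, MINEXP);
	push_fl(&b, MINEXP);
	printf("%s\n", pop_fl(MINEXP) == &b ?
		"b popped first (LIFO at the head)" : "unexpected order");
	printf("second pop is a: %d\n", pop_fl(MINEXP) == &a);
	return 0;
}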
*/ + /* grow a free chunk of max allowed size */ + int fexp = udb_exp_offset(alloc->disk->nextgrow); + uint64_t fsz = (uint64_t)1<<fexp; + udb_void f = alloc->disk->nextgrow; + udb_void fn = alloc->disk->nextgrow+fsz; + assert(fn <= alloc->udb->base_size); + alloc->disk->stat_free += fsz; + udb_alloc_push_fl(base, alloc, f, fexp); + /* now increase nextgrow to commit that free chunk */ + alloc->disk->nextgrow = fn; + } +} + +/** append chunks at end of memory space to get size exp, return dataptr */ +static udb_void +grow_chunks(void* base, udb_alloc* alloc, size_t sz, int exp) +{ + uint64_t esz = (uint64_t)1<<exp; + udb_void ret; + alloc->udb->glob_data->dirty_alloc = udb_dirty_fl; + grow_align(base, alloc, esz); + /* free chunks are grown, grow the one we want to use */ + ret = alloc->disk->nextgrow; + /* take a new alloced chunk into use */ + UDB_CHUNK(ret)->exp = (uint8_t)exp; + UDB_CHUNK(ret)->flags = 0; + UDB_CHUNK(ret)->ptrlist = 0; + UDB_CHUNK(ret)->type = udb_chunk_type_data; + /* store last octet */ + chunk_set_last(base, ret, exp, (uint8_t)exp); + /* update stats */ + alloc->disk->stat_alloc += esz; + alloc->disk->stat_data += sz; + /* now increase nextgrow to commit this newly allocated chunk */ + alloc->disk->nextgrow += esz; + assert(alloc->disk->nextgrow <= alloc->udb->base_size); + alloc->udb->glob_data->dirty_alloc = udb_dirty_clean; + return ret + sizeof(udb_chunk_d); /* ptr to data */ +} + +/** calculate how much space is necessary to grow for this exp */ +static uint64_t +grow_end_calc(udb_alloc* alloc, int exp) +{ + uint64_t sz = (uint64_t)1<<exp; + uint64_t ng = alloc->disk->nextgrow; + uint64_t res; + /* if nextgrow is 2**expness, no extra growth needed, only size */ + if( (ng & (sz-1)) == 0) { + /* sz-1 is like 0xfff, and checks if ng is whole 2**exp */ + return ng+sz; /* must grow exactly 2**exp */ + } + /* grow until 2**expness and then we need 2**exp as well */ + /* so, round ng down to whole sz (basically ng-ng%sz, or ng/sz*sz) + * and then add the sz twice (go up to whole sz, and to allocate) */ + res = (ng & ~(sz-1)) + 2*sz; + return res; +} + +/** see if we need to grow more than specified to enable sustained growth */ +static uint64_t +grow_extra_check(udb_alloc* alloc, uint64_t ge) +{ + const uint64_t mb = 1024*1024; + uint64_t bsz = alloc->udb->base_size; + if(bsz <= mb) { + /* below 1 Mb, double sizes for exponential growth */ + /* takes about 15 times to grow to 1Mb */ + if(ge < bsz*2) + return bsz*2; + } else { + uint64_t gnow = ge - bsz; + /* above 1Mb, grow at least 1 Mb, or 12.5% of current size, + * in whole megabytes rounded up. 
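grow_end_calc() and grow_extra_check() above decide how far the file must grow: round the current end up to the next 2**exp boundary and add one more chunk, then pad the growth so small files double and larger files gain at least 1MB or roughly 12.5% in whole megabytes. A standalone sketch of that arithmetic with example values (the function names here are shortened):

#include <stdint.h>
#include <stdio.h>

/* round the current end up to a 2**exp boundary and add one 2**exp chunk */
static uint64_t grow_end(uint64_t nextgrow, int exp)
{
	uint64_t sz = (uint64_t)1 << exp;
	if((nextgrow & (sz-1)) == 0)
		return nextgrow + sz;
	return (nextgrow & ~(sz-1)) + 2*sz;
}

/* pad the target so small files double and big files gain >= ~12.5% */
static uint64_t grow_extra(uint64_t base_size, uint64_t ge)
{
	const uint64_t mb = 1024*1024;
	if(base_size <= mb)
		return ge < base_size*2 ? base_size*2 : ge;
	else {
		uint64_t want = ((base_size / 8) & ~(mb-1)) + mb;
		return ge - base_size < want ? base_size + want : ge;
	}
}

int main(void)
{
	/* end at 4128 bytes, need one more 2**12 chunk: grows to 12288 */
	printf("grow_end = %llu\n",
		(unsigned long long)grow_end(4128, 12));
	/* a 16MB file that needs a little more space grows to 19MB */
	printf("grow_extra = %lluMB\n", (unsigned long long)
		(grow_extra((uint64_t)16*1024*1024,
			(uint64_t)16*1024*1024 + 4096) / (1024*1024)));
	return 0;
}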
*/ + uint64_t want = ((bsz / 8) & ~(mb-1)) + mb; + if(gnow < want) + return bsz + want; + } + return ge; +} + +/** see if free space is enogh to warrant shrink (while file is open) */ +static int +enough_free(udb_alloc* alloc) +{ + if(alloc->udb->base_size <= 2*1024*1024) { + /* below 1 Mb, grown by double size, (so up to 2 mb), + * do not shrink unless we can 1/3 in size */ + if(((size_t)alloc->disk->nextgrow)*3 <= alloc->udb->base_size) + return 1; + } else { + /* grown 12.5%, shrink 25% if possible, at least one mb */ + /* between 1mb and 4mb size, it shrinks by 1mb if possible */ + uint64_t space = alloc->udb->base_size - alloc->disk->nextgrow; + if(space >= 1024*1024 && (space*4 >= alloc->udb->base_size + || alloc->udb->base_size < 4*1024*1024)) + return 1; + } + return 0; +} + +/** grow space for a chunk of 2**exp and return dataptr */ +static udb_void +udb_alloc_grow_space(void* base, udb_alloc* alloc, size_t sz, int exp) +{ + /* commit the grow action + * - the file grow only changes filesize, but not the nextgrow. + * - taking space after nextgrow into use (as free space), + * is like free-ing a chunk (one at a time). + * - and the last chunk taken into use is like alloc. + */ + /* predict how much free space is needed for this */ + uint64_t grow_end = grow_end_calc(alloc, exp); + assert(alloc->udb->base_size >= alloc->disk->nextgrow); + if(grow_end <= alloc->udb->base_size) { + /* we can do this with the available space */ + return grow_chunks(base, alloc, sz, exp); + } + /* we have to grow the file, re-mmap */ + /* see if we need to grow a little more, to avoid endless grow + * efforts on adding data */ + grow_end = grow_extra_check(alloc, grow_end); + if(!(base=udb_base_grow_and_remap(alloc->udb, grow_end))) { + return 0; /* mmap or write failed (disk or mem full) */ + } + /* we have enough space now */ + assert(grow_end <= alloc->udb->base_size); + assert(alloc->udb->glob_data->fsize == alloc->udb->base_size); + return grow_chunks(base, alloc, sz, exp); +} + +/** take XL allocation into use at end of file, return dataptr */ +static udb_void +grow_xl(void* base, udb_alloc* alloc, uint64_t xlsz, uint64_t sz) +{ + udb_void ret; + udb_xl_chunk_d* p; + alloc->udb->glob_data->dirty_alloc = udb_dirty_fl; + + /* align growth to whole mbs */ + grow_align(base, alloc, UDB_ALLOC_CHUNK_SIZE); + + /* grow XL segment */ + ret = alloc->disk->nextgrow; + p = UDB_XL_CHUNK(ret); + p->exp = UDB_EXP_XL; + p->size = xlsz; + p->flags = 0; + p->ptrlist = 0; + p->type = udb_chunk_type_data; + + /* also put size and marker at end for compaction */ + *((uint64_t*)(UDB_REL(base, ret+xlsz-sizeof(uint64_t)*2))) = xlsz; + *((uint8_t*)(UDB_REL(base, ret+xlsz-1))) = UDB_EXP_XL; + + /* stats */ + alloc->disk->stat_data += sz; + alloc->disk->stat_alloc += xlsz; + /* now increase the nextgrow to commit this xl chunk */ + alloc->disk->nextgrow += xlsz; + alloc->udb->glob_data->dirty_alloc = udb_dirty_clean; + return ret + sizeof(udb_xl_chunk_d); /* data ptr */ +} + +/** make space for XL allocation */ +static udb_void +udb_alloc_xl_space(void* base, udb_alloc* alloc, size_t sz) +{ + /* allocate whole mbs of space, at end of space */ + uint64_t asz = sz + sizeof(udb_xl_chunk_d) + sizeof(uint64_t)*2; + uint64_t need=(asz+UDB_ALLOC_CHUNK_SIZE-1)&(~(UDB_ALLOC_CHUNK_SIZE-1)); + uint64_t grow_end = grow_end_calc(alloc, UDB_ALLOC_CHUNKS_MAX) + need; + assert(need >= asz); + if(grow_end <= alloc->udb->base_size) { + /* can do this in available space */ + return grow_xl(base, alloc, need, sz); + } + /* have to grow file 
and re-mmap */ + grow_end = grow_extra_check(alloc, grow_end); + if(!(base=udb_base_grow_and_remap(alloc->udb, grow_end))) { + return 0; /* mmap or write failed (disk or mem full) */ + } + /* we have enough space now */ + assert(grow_end <= alloc->udb->base_size); + assert(alloc->udb->glob_data->fsize == alloc->udb->base_size); + return grow_xl(base, alloc, need, sz); +} + +/** divide big(2**e2) into pieces so 2**exp fits */ +static udb_void +udb_alloc_subdivide(void* base, udb_alloc* alloc, udb_void big, int e2, + int exp) +{ + int e = e2; + uint64_t sz = (uint64_t)1<<e2; + assert(big && e2 > exp); + /* so the returned piece to use is the first piece, + * offload the later half until it fits */ + do { + sz >>= 1; /* divide size of big by two */ + e--; /* that means its exp is one smaller */ + udb_alloc_push_fl(base, alloc, big+sz, e); + } while(e != exp); + /* exit loop when last pushed is same size as what we want */ + return big; +} + +/** returns the exponent size of the chunk needed for data sz */ +static int +udb_alloc_exp_needed(size_t sz) +{ + uint64_t asz = sz + sizeof(udb_chunk_d) + 1; + if(asz > UDB_ALLOC_CHUNK_SIZE) { + return UDB_EXP_XL; + } else if(asz <= UDB_ALLOC_CHUNK_MINSIZE) { + return UDB_ALLOC_CHUNK_MINEXP; + } + return udb_exp_size(asz); +} + +udb_void udb_alloc_space(udb_alloc* alloc, size_t sz) +{ + void* base = alloc->udb->base; + /* calculate actual allocation size */ + int e2, exp = udb_alloc_exp_needed(sz); + if(exp == UDB_EXP_XL) + return udb_alloc_xl_space(base, alloc, sz); + /* see if there is a free chunk of that size exactly */ + if(alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP]) { + /* snip from freelist, udb_chunk_d */ + udb_void ret; + alloc->udb->glob_data->dirty_alloc = udb_dirty_fl; + ret = udb_alloc_pop_fl(base, alloc, exp); + /* use it - size octets already OK */ + UDB_CHUNK(ret)->flags = 0; + UDB_CHUNK(ret)->ptrlist = 0; + UDB_CHUNK(ret)->type = udb_chunk_type_data; + /* update stats */ + alloc->disk->stat_data += sz; + alloc->disk->stat_alloc += (1<<exp); + assert(alloc->disk->stat_free >= (1u<<exp)); + alloc->disk->stat_free -= (1<<exp); + alloc->udb->glob_data->dirty_alloc = udb_dirty_clean; + return ret + sizeof(udb_chunk_d); /* ptr to data */ + } + /* see if we can subdivide a larger chunk */ + for(e2 = exp+1; e2 < UDB_ALLOC_CHUNKS_MAX; e2++) + if(alloc->disk->free[e2-UDB_ALLOC_CHUNK_MINEXP]) { + udb_void big, ret; /* udb_chunk_d */ + alloc->udb->glob_data->dirty_alloc = udb_dirty_fl; + big = udb_alloc_pop_fl(base, alloc, e2); + /* push other parts onto freelists (needs inited) */ + ret = udb_alloc_subdivide(base, alloc, big, e2, exp); + /* use final part (needs inited) */ + UDB_CHUNK(ret)->exp = (uint8_t)exp; + /* if stop here; the new exp makes smaller free chunk*/ + UDB_CHUNK(ret)->flags = 0; + UDB_CHUNK(ret)->ptrlist = 0; + /* set type to commit data chunk */ + UDB_CHUNK(ret)->type = udb_chunk_type_data; + /* store last octet */ + chunk_set_last(base, ret, exp, (uint8_t)exp); + /* update stats */ + alloc->disk->stat_data += sz; + alloc->disk->stat_alloc += (1<<exp); + assert(alloc->disk->stat_free >= (1u<<exp)); + alloc->disk->stat_free -= (1<<exp); + alloc->udb->glob_data->dirty_alloc = udb_dirty_clean; + return ret + sizeof(udb_chunk_d); /* ptr to data */ + } + /* we need to grow an extra chunk */ + return udb_alloc_grow_space(base, alloc, sz, exp); +} + +/** see if there is free space to allocate a chunk into */ +static int +have_free_for(udb_alloc* alloc, int exp) +{ + int e2; + if(alloc->disk->free[exp-UDB_ALLOC_CHUNK_MINEXP]) + return 
exp; + for(e2 = exp+1; e2 < UDB_ALLOC_CHUNKS_MAX; e2++) + if(alloc->disk->free[e2-UDB_ALLOC_CHUNK_MINEXP]) { + return e2; + } + return 0; +} + +/** fix relptr prev and next for moved relptr structures */ +static void +chunk_fix_ptr_each(void* base, udb_rel_ptr* rp, void* arg) +{ + udb_void* data = (udb_void*)arg; + udb_void r; + if(!rp->data) + return; + r = UDB_SYSTOREL(base, rp); + if(rp->next) + UDB_REL_PTR(rp->next)->prev = r; + if(rp->prev) + UDB_REL_PTR(rp->prev)->next = r; + else { + /* if this is a pointer to its own chunk, fix it up; + * the data ptr gets set by relptr_edit later. */ + if(rp->data == data[0]) + UDB_CHUNK(data[1])->ptrlist = r; + else UDB_CHUNK(chunk_from_dataptr(rp->data))->ptrlist = r; + } +} + +/** fix pointers from and to a moved chunk */ +static void +chunk_fix_ptrs(void* base, udb_base* udb, udb_chunk_d* cp, udb_void data, + uint64_t dsz, udb_void olddata) +{ + udb_void d[2]; + d[0] = olddata; + d[1] = data; + (*udb->walkfunc)(base, udb->walkarg, cp->type, UDB_REL(base, data), + dsz, &chunk_fix_ptr_each, d); + udb_rel_ptr_edit(base, cp->ptrlist, data); + udb_base_ram_ptr_edit(udb, olddata, data); +} + +/** move an allocated chunk to use a free chunk */ +static void +move_chunk(void* base, udb_alloc* alloc, udb_void f, int exp, uint64_t esz, + int e2) +{ + udb_void res = udb_alloc_pop_fl(base, alloc, e2); + udb_chunk_d* rp; + udb_chunk_d* fp; + if(exp != e2) { + /* it is bigger, subdivide it */ + res = udb_alloc_subdivide(base, alloc, res, e2, exp); + } + assert(res != f); + /* setup rollback information */ + alloc->udb->glob_data->rb_old = f; + alloc->udb->glob_data->rb_new = res; + alloc->udb->glob_data->rb_size = esz; + /* take the res, exp into use */ + rp = UDB_CHUNK(res); + fp = UDB_CHUNK(f); + /* copy over the data */ + memcpy(rp, fp, esz); + /* adjust rel ptrs */ + chunk_fix_ptrs(base, alloc->udb, rp, res+sizeof(udb_chunk_d), + esz-sizeof(udb_chunk_d)-1, f+sizeof(udb_chunk_d)); + + /* do not freeup the fp; caller does that */ +} + +/** unlink several free elements to overwrite with xl chunk */ +static void +free_xl_space(void* base, udb_alloc* alloc, udb_void s, uint64_t m) +{ + udb_void q = s + m - UDB_ALLOC_CHUNK_SIZE; + /* because of header and alignment we know s >= UDB_ALLOC_CHUNK_SIZE*/ + assert(s >= UDB_ALLOC_CHUNK_SIZE); + while(q >= s) { + assert(UDB_CHUNK(q)->exp == UDB_ALLOC_CHUNKS_MAX); + assert(UDB_CHUNK(q)->type == udb_chunk_type_free); + udb_alloc_unlink_fl(base, alloc, q, UDB_ALLOC_CHUNKS_MAX); + q -= UDB_ALLOC_CHUNK_SIZE; + } +} + +/** move an XL chunk, and keep track of segments for rollback */ +static void +move_xl_segment(void* base, udb_base* udb, udb_void xl, udb_void n, + uint64_t sz, uint64_t startseg) +{ + udb_xl_chunk_d* xlp = UDB_XL_CHUNK(xl); + udb_xl_chunk_d* np = UDB_XL_CHUNK(n); + uint64_t amount = xl - n; + assert(n < xl); /* move to compact */ + + /* setup move rollback */ + udb->glob_data->rb_old = xl; + udb->glob_data->rb_new = n; + udb->glob_data->rb_size = sz; + + /* is it overlapping? 
*/ + if(sz <= amount) { + memcpy(np, xlp, sz); + } else { + /* move and commit per 1M segment to avoid data loss */ + uint64_t seg, maxseg = amount/UDB_ALLOC_CHUNK_SIZE; + for(seg = startseg; seg<maxseg; seg++) { + udb->glob_data->rb_seg = seg; + memcpy(np+seg*UDB_ALLOC_CHUNK_SIZE, + xlp+seg*UDB_ALLOC_CHUNK_SIZE, + UDB_ALLOC_CHUNK_SIZE); + } + + } +} + +/** move list of XL chunks to the front by the shift amount */ +static void +move_xl_list(void* base, udb_alloc* alloc, udb_void xl_start, uint64_t xl_sz, + uint64_t amount) +{ + udb_void xl = xl_start; + assert( (xl_start&(UDB_ALLOC_CHUNK_SIZE-1)) == 0 ); /* aligned */ + assert( (amount&(UDB_ALLOC_CHUNK_SIZE-1)) == 0 ); /* multiples */ + assert( (xl_sz&(UDB_ALLOC_CHUNK_SIZE-1)) == 0 ); /* multiples */ + while(xl < xl_start+xl_sz) { + udb_xl_chunk_d* xlp = UDB_XL_CHUNK(xl); + udb_void n = xl-amount; + uint64_t sz = xlp->size; + assert(xlp->exp == UDB_EXP_XL); + move_xl_segment(base, alloc->udb, xl, n, sz, 0); + chunk_fix_ptrs(base, alloc->udb, UDB_CHUNK(n), + n+sizeof(udb_xl_chunk_d), + sz-sizeof(udb_xl_chunk_d)-sizeof(uint64_t)*2, + xl+sizeof(udb_xl_chunk_d)); + } + alloc->disk->stat_free -= amount; + alloc->disk->nextgrow -= amount; + alloc->udb->glob_data->rb_old = 0; + alloc->udb->glob_data->rb_new = 0; + alloc->udb->glob_data->rb_size = 0; +} + +/** see if free chunk can coagulate with another chunk, return other chunk */ +static udb_void +coagulate_possible(void* base, udb_alloc* alloc, udb_void f, int exp, + uint64_t esz) +{ + udb_void other = f^esz; + if(exp == UDB_ALLOC_CHUNKS_MAX) + return 0; /* no further merges */ + if(other >= alloc->udb->base_size) + return 0; /* not allocated */ + if(other >= alloc->disk->nextgrow) + return 0; /* not in use */ + if(other < alloc->udb->glob_data->hsize) + return 0; /* cannot merge with header */ + /* the header is also protected by the special exp marker */ + /* see if the other chunk is a free chunk */ + + /* check closest marker to avoid large memory churn */ + /* and also it makes XL allocations and header special markers work */ + if(f > other) { + assert(f > 1); /* this is certain because of header */ + if(*((uint8_t*)UDB_REL(base, f-1)) == (uint8_t)exp) { + /* can do it if the other part is a free chunk */ + assert(UDB_FREE_CHUNK(other)->exp == (uint8_t)exp); + if(UDB_CHUNK(other)->type == udb_chunk_type_free) + return other; + } + } else { + if(UDB_CHUNK(other)->exp == (uint8_t)exp) { + /* can do it if the other part is a free chunk */ + assert(chunk_get_last(base, other, exp)==(uint8_t)exp); + if(UDB_CHUNK(other)->type == udb_chunk_type_free) + return other; + } + } + return 0; +} + +/** coagulate and then add new free segment, return final free segment */ +static udb_void +coagulate_and_push(void* base, udb_alloc* alloc, udb_void last, int exp, + uint64_t esz) +{ + /* new free chunk here, attempt coagulate */ + udb_void other; + while( (other=coagulate_possible(base, alloc, last, exp, esz)) ) { + /* unlink that other chunk */ + udb_alloc_unlink_fl(base, alloc, other, exp); + /* merge up */ + if(other < last) + last = other; + exp++; + esz <<= 1; + } + /* free the final segment */ + udb_alloc_push_fl(base, alloc, last, exp); + return last; +} + +/** attempt to compact the data and move free space to the end */ +static int +udb_alloc_compact(void* base, udb_alloc* alloc) +{ + udb_void last; + int exp, e2; + uint64_t esz; + uint64_t at = alloc->disk->nextgrow; + udb_void xl_start = 0; + uint64_t xl_sz = 0; + while(at > alloc->udb->glob_data->hsize) { + /* grab last entry */ + exp = 
(int)*((uint8_t*)UDB_REL(base, at-1)); + if(exp == UDB_EXP_XL) { + /* for XL chunks: + * - inspect the size of the XLchunklist at end + * - attempt to compact in front of of XLchunklist + */ + uint64_t xlsz = *((uint64_t*)UDB_REL(base, + at-sizeof(uint64_t)*2)); + udb_void xl = at-xlsz; +#ifndef NDEBUG + udb_xl_chunk_d* xlp = UDB_XL_CHUNK(xl); + assert(xlp->exp == UDB_EXP_XL); + assert(xlp->type != udb_chunk_type_free); +#endif + /* got thesegment add to the xl chunk list */ + if(xl_start != 0 && xl+xlsz != xl_start) { + /* nonadjoining XL part, but they are aligned, + * so the space in between is whole Mbs, + * shift the later part(s) and continue */ + uint64_t m = xl_start - (xl+xlsz); + assert(xl_start > xl+xlsz); + alloc->udb->glob_data->dirty_alloc = udb_dirty_compact; + free_xl_space(base, alloc, xl+xlsz, m); + move_xl_list(base, alloc, xl_start, xl_sz, m); + alloc->udb->glob_data->dirty_alloc = udb_dirty_clean; + } + xl_start = xl; + xl_sz += xlsz; + at = xl; + continue; + /* end of XL if */ + } else if(exp < UDB_ALLOC_CHUNK_MINEXP + || exp > UDB_ALLOC_CHUNKS_MAX) + break; /* special chunk or garbage */ + esz = (uint64_t)1<<exp; + last = at - esz; + assert(UDB_CHUNK(last)->exp == (uint8_t)exp); + if(UDB_CHUNK(last)->type == udb_chunk_type_free) { + /* if xlstart continue looking to move stuff, but do + * not unlink this free segment */ + if(!xl_start) { + /* it is a free chunk, remove it */ + alloc->udb->glob_data->dirty_alloc = udb_dirty_fl; + udb_alloc_unlink_fl(base, alloc, last, exp); + alloc->disk->stat_free -= esz; + alloc->disk->nextgrow = last; + alloc->udb->glob_data->dirty_alloc = udb_dirty_clean; + /* and continue at this point */ + } + at = last; + } else if( (e2=have_free_for(alloc, exp)) ) { + /* last entry can be allocated in free chunks + * move it to its new position, adjust rel_ptrs */ + alloc->udb->glob_data->dirty_alloc = udb_dirty_compact; + move_chunk(base, alloc, last, exp, esz, e2); + if(xl_start) { + last = coagulate_and_push(base, alloc, + last, exp, esz); + } else { + /* shorten usage */ + alloc->disk->stat_free -= esz; + alloc->disk->nextgrow = last; + } + alloc->udb->glob_data->rb_old = 0; + alloc->udb->glob_data->rb_new = 0; + alloc->udb->glob_data->rb_size = 0; + alloc->udb->glob_data->dirty_alloc = udb_dirty_clean; + /* and continue in front of it */ + at = last; + } else { + /* cannot compact this block, stop compacting */ + break; + } + /* if that worked, repeat it */ + } + /* if we passed xl chunks, see if XL-chunklist can move */ + if(xl_start) { + /* calculate free space in front of the XLchunklist. */ + /* has to be whole mbs of free space */ + /* if so, we can move the XL chunks. Move them all back + * by the new free space. */ + /* this compacts very well, but the XL chunks can be moved + * multiple times; worst case for every mb freed a huge sized + * xlchunklist gets moved. 
*/ + /* free space must be, since aligned and coagulated, in + * chunks of a whole MB */ + udb_void at = xl_start; + uint64_t m = 0; + while(*((uint8_t*)UDB_REL(base, at-1))==UDB_ALLOC_CHUNKS_MAX){ + udb_void chunk = at - UDB_ALLOC_CHUNK_SIZE; + if(UDB_CHUNK(chunk)->type != udb_chunk_type_free) + break; + assert(UDB_CHUNK(chunk)->exp==UDB_ALLOC_CHUNKS_MAX); + m += UDB_ALLOC_CHUNK_SIZE; + at = chunk; + } + if(m != 0) { + assert(at+m == xl_start); + alloc->udb->glob_data->dirty_alloc = udb_dirty_compact; + free_xl_space(base, alloc, at, m); + move_xl_list(base, alloc, xl_start, xl_sz, m); + alloc->udb->glob_data->dirty_alloc = udb_dirty_clean; + } + } + + /* if enough free, shrink the file; re-mmap */ + if(enough_free(alloc)) { + uint64_t nsize = alloc->disk->nextgrow; + udb_base_shrink(alloc->udb, nsize); + if(!udb_base_remap(alloc->udb, alloc, nsize)) + return 0; + } + return 1; +} + +#ifdef UDB_CHECK +/** check that rptrs are really zero before free */ +void udb_check_rptr_zero(void* base, udb_rel_ptr* p, void* arg) +{ + (void)base; + (void)arg; + assert(p->data == 0); +} +#endif /* UDB_CHECK */ + +/** free XL chunk as multiples of CHUNK_SIZE free segments */ +static void +udb_free_xl(void* base, udb_alloc* alloc, udb_void f, udb_xl_chunk_d* fp, + size_t sz) +{ + uint64_t xlsz = fp->size; + uint64_t c; + /* lightweight check for buffer overflow in xl data */ + assert(*((uint64_t*)(UDB_REL(base, f+xlsz-sizeof(uint64_t)*2)))==xlsz); + assert(*((uint8_t*)(UDB_REL(base, f+xlsz-1))) == UDB_EXP_XL); + assert( (xlsz & (UDB_ALLOC_CHUNK_SIZE-1)) == 0 ); /* whole mbs */ + assert( (f & (UDB_ALLOC_CHUNK_SIZE-1)) == 0 ); /* aligned */ +#ifdef UDB_CHECK + /* check that relptrs in this chunk have been zeroed */ + (*alloc->udb->walkfunc)(base, alloc->udb->walkarg, fp->type, + UDB_REL(base, f+sizeof(udb_xl_chunk_d)), xlsz, + &udb_check_rptr_zero, NULL); +#endif + alloc->udb->glob_data->dirty_alloc = udb_dirty_fl; + /* update stats */ + alloc->disk->stat_data -= sz; + alloc->disk->stat_alloc -= xlsz; + alloc->disk->stat_free += xlsz; + /* walk in reverse, so the front blocks go first on the list */ + c = f + xlsz - UDB_ALLOC_CHUNK_SIZE; + /* because of header and alignment we know f >= UDB_ALLOC_CHUNK_SIZE*/ + assert(f >= UDB_ALLOC_CHUNK_SIZE); + while(c >= f) { + /* free a block of CHUNK_SIZE (1 Mb) */ + udb_alloc_push_fl(base, alloc, c, UDB_ALLOC_CHUNKS_MAX); + c -= UDB_ALLOC_CHUNK_SIZE; + } + alloc->udb->glob_data->dirty_alloc = udb_dirty_clean; +} + +int udb_alloc_free(udb_alloc* alloc, udb_void r, size_t sz) +{ + void* base; + /* lookup chunk ptr */ + udb_void f; + udb_chunk_d* fp; + uint64_t esz; + int exp; + udb_void other; + int coagulated = 0; + if(!r) + return 1; /* free(NULL) does nothing */ + + /* lookup size of chunk */ + base = alloc->udb->base; + /* fails for XL blocks */ + f = chunk_from_dataptr(r); + fp = UDB_CHUNK(f); + assert(fp->type != udb_chunk_type_free); + + /* see if it has a ptrlist, if so: trouble, the list is not properly + * cleaned up. (although you can imagine a wholesale delete where + * it does not matter) */ + assert(fp->ptrlist == 0); + + /* set ptrlist to 0 to stop relptr from using it, robustness. */ + fp->ptrlist = 0; + + if(fp->exp == UDB_EXP_XL) { + udb_free_xl(base, alloc, f, (udb_xl_chunk_d*)fp, sz); + /* compact */ + return udb_alloc_compact(base, alloc); + } + /* it is a regular chunk of 2**exp size */ + exp = (int)fp->exp; + esz = (uint64_t)1<<exp; + /* light check for e.g. 
buffer overflow of the data */ + assert(sz < esz); + assert(chunk_get_last(base, f, exp) == (uint8_t)exp); +#ifdef UDB_CHECK + /* check that relptrs in this chunk have been zeroed */ + (*alloc->udb->walkfunc)(base, alloc->udb->walkarg, fp->type, + UDB_REL(base, r), esz, &udb_check_rptr_zero, NULL); +#endif + + /* update the stats */ + alloc->udb->glob_data->dirty_alloc = udb_dirty_fl; + alloc->disk->stat_data -= sz; + alloc->disk->stat_free += esz; + alloc->disk->stat_alloc -= esz; + + /* if it can be merged with other free chunks, do so */ + while( (other=coagulate_possible(base, alloc, f, exp, esz)) ) { + coagulated = 1; + /* unlink that other chunk and expand it (it has same size) */ + udb_alloc_unlink_fl(base, alloc, other, exp); + /* merge up */ + if(other < f) + f = other; + exp++; + esz <<= 1; + } + if(coagulated) { + /* put big free chunk into freelist, and init it */ + udb_alloc_push_fl(base, alloc, f, exp); + } else { + /* we do not need to touch the last-exp-byte, which may save + * a reference to that page of memory */ + fp->type = udb_chunk_type_free; + fp->flags = 0; + udb_alloc_push_fl_noinit(base, alloc, f, exp); + } + alloc->udb->glob_data->dirty_alloc = udb_dirty_clean; + /* compact */ + return udb_alloc_compact(base, alloc); +} + +udb_void udb_alloc_init(udb_alloc* alloc, void* d, size_t sz) +{ + /* could be faster maybe, if grown? */ + udb_void r = udb_alloc_space(alloc, sz); + if(!r) return r; + memcpy(UDB_REL(alloc->udb->base, r), d, sz); + return r; +} + +udb_void udb_alloc_realloc(udb_alloc* alloc, udb_void r, size_t osz, size_t sz) +{ + void* base = alloc->udb->base; + udb_void c, n, newd; + udb_chunk_d* cp, *np; + uint64_t avail; + uint8_t cp_type; + /* emulate some posix realloc stuff */ + if(r == 0) + return udb_alloc_space(alloc, sz); + if(sz == 0) { + if(!udb_alloc_free(alloc, r, osz)) + log_msg(LOG_ERR, "udb_alloc_realloc: free failed"); + return 0; + } + c = chunk_from_dataptr(r); + cp = UDB_CHUNK(c); + cp_type = cp->type; + if(cp->exp == UDB_EXP_XL) { + avail = UDB_XL_CHUNK(c)->size - sizeof(udb_xl_chunk_d) + - sizeof(uint64_t)*2; + } else { + avail = ((uint64_t)1<<cp->exp) - sizeof(udb_chunk_d) - 1; + } + if(sz <= avail) + return r; + /* reallocate it, and copy */ + newd = udb_alloc_space(alloc, sz); + if(!newd) return 0; + /* re-base after alloc, since re-mmap may have happened */ + base = alloc->udb->base; + cp = NULL; /* may be invalid now, robustness */ + n = chunk_from_dataptr(newd); + np = UDB_CHUNK(n); + np->type = cp_type; + memcpy(UDB_REL(base, newd), UDB_REL(base, r), osz); + /* fixup ptrs */ + chunk_fix_ptrs(base, alloc->udb, np, newd, osz, r); + + if(!udb_alloc_free(alloc, r, osz)) + log_msg(LOG_ERR, "udb_alloc_realloc: free failed"); + return newd; +} + +int udb_alloc_grow(udb_alloc* alloc, size_t sz, size_t num) +{ + const uint64_t mb = 1024*1024; + int exp = udb_alloc_exp_needed(sz); + uint64_t esz; + uint64_t want; + if(exp == UDB_EXP_XL) + esz = (sz&(mb-1))+mb; + else esz = (uint64_t)1<<exp; + /* we need grow_end_calc to take into account alignment */ + want = grow_end_calc(alloc, exp) + esz*(num-1); + assert(want >= alloc->udb->base_size); + if(!udb_base_grow_and_remap(alloc->udb, want)) { + log_msg(LOG_ERR, "failed to grow the specified amount"); + return 0; + } + return 1; +} + +void udb_alloc_set_type(udb_alloc* alloc, udb_void r, udb_chunk_type tp) +{ + void* base = alloc->udb->base; + udb_void f = chunk_from_dataptr(r); + udb_chunk_d* fp = UDB_CHUNK(f); + /* not the 'free' type, that must be set by allocation routines */ + 
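The coagulation loop above merges a freed chunk with its buddy as long as one exists; for a chunk of size 2**exp at offset f the candidate buddy sits at f ^ (1<<exp), and the merged chunk starts at the lower of the two offsets. A tiny illustration with made-up offsets:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t esz = (uint64_t)1 << 7;	/* a 128-byte chunk */
	uint64_t f = 0x1380;			/* offset being freed */
	uint64_t other = f ^ esz;		/* its buddy: 0x1300 */
	uint64_t merged = f < other ? f : other;
	printf("free 0x%llx + buddy 0x%llx -> chunk 0x%llx of %llu bytes\n",
		(unsigned long long)f, (unsigned long long)other,
		(unsigned long long)merged, (unsigned long long)(esz*2));
	return 0;
}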
assert(fp->type != udb_chunk_type_free); + assert(tp != udb_chunk_type_free); + fp->type = tp; +} + +int udb_valid_offset(udb_base* udb, udb_void to, size_t destsize) +{ + /* pointers are not valid before the header-size or after the + * used-region of the mmap */ + return ( (to+destsize) <= udb->base_size && + to >= (udb->glob_data->hsize-2*sizeof(udb_rel_ptr)) && + (to+destsize) <= udb->alloc->disk->nextgrow); +} + +int udb_valid_dataptr(udb_base* udb, udb_void to) +{ + void* base = udb->base; + udb_void ch; + int exp; + uint64_t esz; + /* our data chunks are aligned and at least 8 bytes */ + if(!udb_valid_offset(udb, to, sizeof(uint64_t))) + return 0; + /* get the chunk pointer */ + ch = chunk_from_dataptr(to); + if(!udb_valid_offset(udb, ch, sizeof(udb_chunk_d))) + return 0; + /* check its size */ + exp = UDB_CHUNK(ch)->exp; + if(exp == UDB_EXP_XL) { + /* check XL chunk */ + uint64_t xlsz; + if(!udb_valid_offset(udb, ch, sizeof(udb_xl_chunk_d))) + return 0; + xlsz = UDB_XL_CHUNK(ch)->size; + if(!udb_valid_offset(udb, ch+xlsz-1, 1)) + return 0; + if(*((uint8_t*)UDB_REL(base, ch+xlsz-1)) != UDB_EXP_XL) + return 0; + if(*((uint64_t*)UDB_REL(base, ch+xlsz-sizeof(uint64_t)*2)) + != xlsz) + return 0; + return 1; + } + /* check if regular chunk has matching end byte */ + if(exp < UDB_ALLOC_CHUNK_MINEXP || exp > UDB_ALLOC_CHUNKS_MAX) + return 0; /* cannot be a valid chunk */ + esz = 1<<exp; + if(!udb_valid_offset(udb, ch+esz-1, 1)) + return 0; + if(*((uint8_t*)UDB_REL(base, ch+esz-1)) != exp) + return 0; + return 1; +} + +int udb_valid_rptr(udb_base* udb, udb_void rptr, udb_void to) +{ + void* base = udb->base; + udb_void p; + if(!udb_valid_offset(udb, rptr, sizeof(udb_rel_ptr))) + return 0; + if(!udb_valid_dataptr(udb, to)) + return 0; + p = UDB_CHUNK(chunk_from_dataptr(to))->ptrlist; + while(p) { + if(!udb_valid_offset(udb, p, sizeof(udb_rel_ptr))) + return 0; + if(p == rptr) + return 1; + p = UDB_REL_PTR(p)->next; + } + return 0; +} + +void udb_rel_ptr_init(udb_rel_ptr* ptr) +{ + memset(ptr, 0, sizeof(*ptr)); +} + +void udb_rel_ptr_unlink(void* base, udb_rel_ptr* ptr) +{ + if(!ptr->data) + return; + if(ptr->prev) { + UDB_REL_PTR(ptr->prev)->next = ptr->next; + } else { + UDB_CHUNK(chunk_from_dataptr(ptr->data))->ptrlist = ptr->next; + } + if(ptr->next) { + UDB_REL_PTR(ptr->next)->prev = ptr->prev; + } +} + +void udb_rel_ptr_link(void* base, udb_rel_ptr* ptr, udb_void to) +{ + udb_chunk_d* chunk = UDB_CHUNK(chunk_from_dataptr(to)); + ptr->prev = 0; + ptr->next = chunk->ptrlist; + if(ptr->next) + UDB_REL_PTR(ptr->next)->prev = UDB_SYSTOREL(base, ptr); + chunk->ptrlist = UDB_SYSTOREL(base, ptr); + ptr->data = to; +} + +void udb_rel_ptr_set(void* base, udb_rel_ptr* ptr, udb_void to) +{ + assert(to == 0 || to > 64); + udb_rel_ptr_unlink(base, ptr); + if(to) + udb_rel_ptr_link(base, ptr, to); + else ptr->data = to; +} + +void udb_rel_ptr_edit(void* base, udb_void list, udb_void to) +{ + udb_void p = list; + while(p) { + UDB_REL_PTR(p)->data = to; + p = UDB_REL_PTR(p)->next; + } +} + +#ifdef UDB_CHECK +/** check that all pointers are validly chained */ +static void +udb_check_ptrs_valid(udb_base* udb) +{ + size_t i; + udb_ptr* p, *prev; + for(i=0; i<udb->ram_size; i++) { + prev = NULL; + for(p=udb->ram_hash[i]; p; p=p->next) { + assert(p->prev == prev); + assert((size_t)(chunk_hash_ptr(p->data)&udb->ram_mask) + == i); + assert(p->base == &udb->base); + prev = p; + } + } +} +#endif /* UDB_CHECK */ + +void udb_ptr_init(udb_ptr* ptr, udb_base* udb) +{ +#ifdef UDB_CHECK + udb_check_ptrs_valid(udb); /* 
previous ptrs have been unlinked */ +#endif + memset(ptr, 0, sizeof(*ptr)); + ptr->base = &udb->base; +} + +void udb_ptr_set(udb_ptr* ptr, udb_base* udb, udb_void newval) +{ + assert(newval == 0 || newval > 64); + if(ptr->data) + udb_base_unlink_ptr(udb, ptr); + ptr->data = newval; + if(newval) + udb_base_link_ptr(udb, ptr); +} + +int udb_ptr_alloc_space(udb_ptr* ptr, udb_base* udb, udb_chunk_type type, + size_t sz) +{ + udb_void r; + r = udb_alloc_space(udb->alloc, sz); + if(!r) return 0; + udb_alloc_set_type(udb->alloc, r, type); + udb_ptr_init(ptr, udb); + udb_ptr_set(ptr, udb, r); + return 1; +} + +void udb_ptr_free_space(udb_ptr* ptr, udb_base* udb, size_t sz) +{ + if(ptr->data) { + udb_void d = ptr->data; + udb_ptr_set(ptr, udb, 0); + udb_alloc_free(udb->alloc, d, sz); + } +} + +udb_chunk_type udb_ptr_get_type(udb_ptr* ptr) +{ + udb_void f; + if(!ptr || ptr->data == 0) return udb_chunk_type_internal; /* something bad*/ + f = chunk_from_dataptr(ptr->data); + return ((udb_chunk_d*)UDB_REL(*ptr->base, f))->type; +} diff --git a/usr.sbin/nsd/udb.h b/usr.sbin/nsd/udb.h new file mode 100644 index 00000000000..de7985275c2 --- /dev/null +++ b/usr.sbin/nsd/udb.h @@ -0,0 +1,784 @@ +/* udb.h - u(micro) data base, stores data and index information in mmap file. + * By W.C.A. Wijngaards + * Copyright 2010, NLnet Labs. + * BSD, see LICENSE. + */ +#ifndef UDB_H +#define UDB_H +#include <assert.h> + +/** + * The micro data base UDB. + * + * File data.udb is mmapped and used to lookup and edit. + * it contains a header with space-allocation-info, and a reference to the + * base information, an object that is the entry point for the file. + * Then it contains a lot of data and index objects. + * + * The space allocator is 'buddy system', 1megareas, larger get own area. + * So worst case is 2xdata filesize (+header). Growth semi-linear. + * Chunks have size and type (for recovery). Call to reserve space. + * Call to 'realloc-in-place', if space permits. + * + * Usually you want a record-type and its indexes (sorted) to be stored in + * the file. This is a table (named by string). The record is opaque + * data. + * + * To be able to use pointers in the mmapped file, there is conversion of + * relative-pointers(to file base) to system-pointers. + * + * If an item is moved its internal pointers need to be recalculated. + * Thus a recordtype (that has internal pointers) must provide a routine. + * Structures that are 'on-disk', are denoted with _d. Except rel_ptr which + * is also on-disk. + * + * About 64-bit trouble. The pointer-size which which the application is + * compiled determines the file layout, because this makes it perform well + * in a mmap. It could in theory be converted if you really wanted to. + * Nonpointer data is best stored as a fixed bitsize (uint8, 16, 32, 64). + */ +typedef struct udb_base udb_base; +typedef struct udb_alloc udb_alloc; + +/** perform extra checks (when --enable-checking is used) */ +#ifndef NDEBUG +#define UDB_CHECK 1 +#endif + +/** pointers are stored like this */ +typedef uint64_t udb_void; + +/** convert relptr to usable pointer */ +#define UDB_REL(base, relptr) ((base) + (relptr)) +/** from system pointer to relative pointer */ +#define UDB_SYSTOREL(base, ptr) ((udb_void)((void*)(ptr) - (base))) + +/** MAX 2**x exponent of alloced chunks, for 1Mbytes. The smallest + * chunk is 16bytes (8preamble+8data), so 0-3 is unused. 
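Everything stored in the file uses relative pointers (offsets from the mmap base) exactly so the data stays valid when the mapping moves, which is what UDB_REL and UDB_SYSTOREL above express. A standalone illustration of the idea, using a heap buffer in place of the mmapped file and local macro names:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef uint64_t rel_ptr;			/* like udb_void */
#define REL(base, r)   ((void*)((char*)(base) + (r)))
#define TOREL(base, p) ((rel_ptr)((char*)(p) - (char*)(base)))

int main(void)
{
	char* region = calloc(1, 4096);
	char* moved;
	rel_ptr r;
	strcpy(region + 128, "record");
	r = TOREL(region, region + 128);	/* store 128, not an address */
	/* simulate the mapping moving, e.g. after a grow-and-remap */
	moved = malloc(4096);
	memcpy(moved, region, 4096);
	free(region);
	printf("offset %llu still finds: %s\n",
		(unsigned long long)r, (char*)REL(moved, r));
	free(moved);
	return 0;
}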
*/ +#define UDB_ALLOC_CHUNKS_MAX 20 +/** size of areas that are subdivided */ +#define UDB_ALLOC_CHUNK_SIZE ((uint64_t)1<<UDB_ALLOC_CHUNKS_MAX) +/** the minimum alloc in exp, 2**x. 32bytes because of chunk_free_d size (8aligned) */ +#define UDB_ALLOC_CHUNK_MINEXP 5 +/** size of minimum alloc */ +#define UDB_ALLOC_CHUNK_MINSIZE ((uint64_t)1<<UDB_ALLOC_CHUNK_MINEXP) +/** exp size used to mark the header (cannot be reallocated) */ +#define UDB_EXP_HEADER 0 +/** exp size used to mark XL(extralarge) allocations (in whole mbs) */ +#define UDB_EXP_XL 1 + +typedef struct udb_ptr udb_ptr; +/** + * This structure is there for when you want to have a pointer into + * the mmap-ed file. It is kept track of. Set it to NULL to unlink it. + * For pointers to the mmap-ed file from within the mmap-ed file, use the + * rel_pre construct below. + */ +struct udb_ptr { + /** the data segment it points to (relative file offset) */ + uint64_t data; + /** pointer to the base pointer (for convenience) */ + void** base; + /** prev in udb_ptr list for this data segment */ + udb_ptr* prev; + /** next in udb_ptr list for this data segment */ + udb_ptr* next; +}; + +typedef struct udb_rel_ptr udb_rel_ptr; +/** + * A relative pointer that keeps track of the list of pointers, + * so that it can be reallocated. + */ +struct udb_rel_ptr { + /** the relative pointer to the data itself (subtract chunk_d size + * to get the chunk_d type, this is for usage speed in dereferencing + * to the userdata). */ + udb_void data; + /** udb_rel_ptr* prev in relptr list */ + udb_void prev; + /** udb_rel_ptr* next in relptr list */ + udb_void next; +}; + +/** + * This is the routine that is called for every relptr + * @param base: the baseptr for REL. + * @param p: the relptr, a real pointer to it. + * @param arg: user argument. + */ +typedef void udb_walk_relptr_cb(void*, udb_rel_ptr*, void*); + +/** + * This routine calls the callback for every relptr in a datablock + * params in order: + * base: the baseptr for REL macro. + * warg: the walkfunc user argument. + * t: the type of the chunk. + * d: pointer to the data part of the chunk (real pointer). + * s: max size of the data part. + * cb: the callback to call for every element. + * arg: user argument to pass to the callback. + */ +typedef void udb_walk_relptr_func(void*, void*, uint8_t, void*, uint64_t, + udb_walk_relptr_cb*, void*); + +/** What sort of salvage should be performed by alloc */ +enum udb_dirty_alloc { + udb_dirty_clean = 0, /* all clean */ + udb_dirty_fl, /* allocs, freelists are messed up */ + udb_dirty_fsize, /* file size and fsize are messed up */ + udb_dirty_compact /* allocs, freelists and relptrs are messed up */ +}; + +typedef struct udb_glob_d udb_glob_d; +/** + * The UDB global data for a file. This structure is mmapped. + * Make sure it has no structure-padding problems. + */ +struct udb_glob_d { + /** size of header in the file (offset to the first alloced chunk) */ + uint64_t hsize; + /** version number of this file */ + uint8_t version; + /** was the file not cleanly closed, 0 is ok */ + uint8_t clean_close; + /** an allocation operation was in progress, file needs to be salvaged + * type enum udb_dirty_alloc */ + uint8_t dirty_alloc; + /** user flags */ + uint8_t userflags; + /** padding to 8-bytes alignment */ + uint8_t pad1[4]; + /** size to mmap */ + uint64_t fsize; + /** chunk move rollback info: oldchunk (0 is nothing). + * volatile because these values prevent dataloss, they need to be + * written immediately. 
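The walk typedefs above define the contract an application must implement so the library can find every relative pointer inside its otherwise opaque records. A sketch of such a walk function for a hypothetical record type with a single embedded udb_rel_ptr; only the signatures and the chunk-type value come from udb.h, the rest is invented:

#include <stdint.h>
#include "udb.h"	/* assumed to be on the include path */

/* hypothetical application record: one relative pointer plus plain data */
struct myrecord {
	udb_rel_ptr next;	/* offset of another record, or 0 */
	uint32_t value;		/* non-pointer payload, nothing to report */
};

/* report every udb_rel_ptr inside a chunk holding our record type */
void
myrecord_walk(void* base, void* warg, uint8_t t, void* d, uint64_t s,
	udb_walk_relptr_cb* cb, void* arg)
{
	(void)warg;
	if(t == udb_chunk_type_data && s >= sizeof(struct myrecord)) {
		struct myrecord* r = (struct myrecord*)d;
		(*cb)(base, &r->next, arg);
	}
	/* chunks of other types hold no relative pointers here */
}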
*/ + volatile udb_void rb_old; + /** chunk move rollback info: newchunk (0 is nothing) */ + volatile udb_void rb_new; + /** size of move rollback chunks */ + volatile uint64_t rb_size; + /** segment of move rollback, for an XL chunk that overlaps. */ + volatile uint64_t rb_seg; + /** linked list for content-listing, 0 if empty */ + udb_rel_ptr content_list; + /** user global data pointer */ + udb_rel_ptr user_global; +}; + +/** + * The UDB database file. Contains all the data + */ +struct udb_base { + /** name of the file, alloced */ + char* fname; + + /** mmap base pointer (or NULL) */ + void* base; + /** size of mmap */ + size_t base_size; + /** fd of mmap (if -1, closed). */ + int fd; + + /** space allocator that is used for this base */ + udb_alloc* alloc; + /** real pointer to the global data in the file */ + udb_glob_d* glob_data; + + /** store all linked udb_ptrs in this table, by hash(offset). + * then a linked list of ptrs (all that match the hash). + * this avoids buckets, and thus memory allocation. */ + udb_ptr** ram_hash; + /** size of the current udb_ptr hashtable array */ + size_t ram_size; + /** mask for the curren udb_ptr hashtable lookups */ + int ram_mask; + /** number of ptrs in ram, used to decide when to grow */ + size_t ram_num; + /** for relocation, this walks through all relptrs in chunk */ + udb_walk_relptr_func* walkfunc; + /** user data for walkfunc */ + void* walkarg; +}; + +typedef enum udb_chunk_type udb_chunk_type; +/** chunk type enum, setting these types help recovery and debug */ +enum udb_chunk_type { + udb_chunk_type_free = 0, + udb_chunk_type_data, /* alloced data */ + udb_chunk_type_index, + udb_chunk_type_radtree, + udb_chunk_type_radnode, + udb_chunk_type_radarray, + udb_chunk_type_zone, + udb_chunk_type_domain, + udb_chunk_type_rrset, + udb_chunk_type_rr, + udb_chunk_type_task, + udb_chunk_type_internal +}; + +typedef struct udb_chunk_d udb_chunk_d; +/** + * UDB chunk info (prepended for every allocated chunk). + * The chunks are in doublelinkedlists per size. + * At the end of the chunk another exp uint8 is stored (to walk backwards). + * 17 bytes overhead, datasize for 32byte chunk is 15. + */ +struct udb_chunk_d { + /** the size of this chunk (i.e. 2**x) */ + uint8_t exp; + /** type for this chunk (enum chunktype; free, data or index) */ + uint8_t type; + /** flags for this chunk */ + uint8_t flags; + /** padding onto 8-alignment */ + uint8_t pad[5]; + /** udb_rel_ptr* first in list of rel-ptrs that point back here + * In the free chunk this is the previous pointer. */ + udb_void ptrlist; + /* user data space starts here, 64-bit aligned */ + uint8_t data[0]; + /* last octet: exp of chunk */ +}; + +typedef struct udb_free_chunk_d udb_free_chunk_d; +/** + * A free chunk. Same start as the udb_chunk_d. minsize is 32 bytes. + */ +struct udb_free_chunk_d { + /** the size of this chunk (i.e. 2**x) */ + uint8_t exp; + /** type for this chunk (enum chunktype; free, data or index) */ + uint8_t type; + /** flags for this chunk */ + uint8_t flags; + /** padding onto 8-alignment */ + uint8_t pad[5]; + /** udb_chunk_d* prev of free list for this size */ + udb_void prev; + /** udb_chunk_d* next of free list for this size */ + udb_void next; + /* empty stuff */ + /* last octet: exp of chunk */ +}; + +typedef struct udb_xl_chunk_d udb_xl_chunk_d; +/** + * an Extra Large (XL) chunk. Same start as the udb_chunk_d. Allocated in whole + * MAX_CHUNK_SIZE parts, whole megabytes. overhead is 5x8=40 bytes. 
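Given the chunk layout above (a 16-byte udb_chunk_d at the front and one exp octet at the very end), the usable payload of a 2**exp chunk is 2**exp - 17, which matches the remark that a 32-byte chunk carries 15 data bytes. A quick table:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int exp;
	for(exp = 5; exp <= 10; exp++) {
		uint64_t chunk = (uint64_t)1 << exp;
		printf("2**%2d chunk: %4llu bytes, %4llu for data\n", exp,
			(unsigned long long)chunk,
			(unsigned long long)(chunk - 16 - 1));
	}
	return 0;
}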
+ */ +struct udb_xl_chunk_d { + /** the size of this chunk (i.e. 2**x): special XL value */ + uint8_t exp; + /** type for this chunk (enum chunktype; free, data or index) */ + uint8_t type; + /** flags for this chunk */ + uint8_t flags; + /** padding onto 8-alignment */ + uint8_t pad[5]; + /** udb_rel_ptr* first in list of rel-ptrs that point back here + * In the free chunk this is the previous pointer. */ + udb_void ptrlist; + /** size of this chunk in bytes */ + uint64_t size; + /** data of the XL chunk */ + uint8_t data[0]; + /* uint64_t endsize: before last octet the size again. */ + /* uint8_t pad[7]: padding to make last octet last. */ + /* last octet: exp of chunk: special XL value */ +}; + +typedef struct udb_alloc_d udb_alloc_d; +/** + * UDB alloc info on disk. + */ +struct udb_alloc_d { + /** stats: number of data bytes allocated, sum of sizes passed to alloc */ + uint64_t stat_data; + /** stats: number of bytes in free chunks, sum of their 2**x size */ + uint64_t stat_free; + /** stats: number of bytes in alloced chunks, sum of their 2**x size */ + uint64_t stat_alloc; + /** offset to create next chunk at. can be before file-end, or be + * fsize, volatile because it is used as a 'commit', and thus we want + * this to be written to memory (and thus disk) immediately. */ + volatile uint64_t nextgrow; + /** fixed size array the points to the 2**x size chunks in the file, + * This is the start of the doublelinked list, ptr to udb_free_chunk_d. + * array starts at UDB_ALLOC_CHUNK_MINEXP entry as [0]. */ + udb_void free[UDB_ALLOC_CHUNKS_MAX-UDB_ALLOC_CHUNK_MINEXP+1]; +}; + +/** + * The UDB space allocator. Assigns space in the file. + */ +struct udb_alloc { + /** the base this is part of */ + udb_base* udb; + /** real pointer to space allocation info on disk; fixedsize struct */ + udb_alloc_d* disk; +}; + +/** + * file header length, the file start with + * 64bit: magic number to identify file (and prevent stupid mistakes) + * globdata: global data. Fixed size segment. (starts with size uint64) + * allocdata: alloc global data. Fixed size segment. + * size and 0 byte: end marker for reverse search. + */ +#define UDB_HEADER_SIZE (sizeof(uint64_t)+sizeof(udb_glob_d)+ \ + sizeof(udb_alloc_d)+sizeof(uint64_t)*2) +/** magic string that starts an UDB file, uint64_t, note first byte=0, to mark + * header start as a chunk. */ +#define UDB_MAGIC (((uint64_t)'u'<<48)|((uint64_t)'d'<<40)|((uint64_t)'b' \ + <<32)|((uint64_t)'v'<<24)|((uint64_t)'0'<<16)|((uint64_t)'a'<<8)) + +/* UDB BASE */ +/** + * Create udb base structure and attempt to read the file. + * @param fname: file name. + * @param walkfunc: function to walk through relptrs in chunk. + * @param arg: user argument to pass to walkfunc + * @return base structure or NULL on failure. + */ +udb_base* udb_base_create_read(const char* fname, udb_walk_relptr_func walkfunc, + void* arg); + +/** + * Create udb base structure and create a new file. + * @param fname: file name. + * @param walkfunc: function to walk through relptrs in chunk. + * @param arg: user argument to pass to walkfunc + * @return base structure or NULL on failure. + */ +udb_base* udb_base_create_new(const char* fname, udb_walk_relptr_func walkfunc, + void* arg); + +/** + * Create udb from (O_RDWR) fd. + * @param fname: file name. + * @param fd: file descriptor. + * @param walkfunc: function to walk through relptrs in chunk. + * @param arg: user argument to pass to walkfunc + * @return base structure or NULL on failure. 
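+ * Illustrative call (the file name and walk function are the caller's own,
+ * error handling omitted):
+ *   int fd = open(fname, O_RDWR);
+ *   udb_base* udb = udb_base_create_fd(fname, fd, my_walkfunc, NULL);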
+ */ +udb_base* udb_base_create_fd(const char* fname, int fd, + udb_walk_relptr_func walkfunc, void* arg); + +/** + * Properly close the UDB base file. Separate from delete so the + * most important bits (write to disk, sockets) can be done first. + * @param udb: the udb. + */ +void udb_base_close(udb_base* udb); + +/** + * Free the data structure (and close if not already) the udb. + * @param udb: the udb. + */ +void udb_base_free(udb_base* udb); + +/** + * Free the udb, but keep mmap mapped for others. + * @param udb: the udb. + */ +void udb_base_free_keep_mmap(udb_base* udb); + +/** + * Sync the mmap. + * @param udb: the udb. + * @param wait: if true, the call blocks until synced. + */ +void udb_base_sync(udb_base* udb, int wait); + +/** + * The mmap size is updated to reflect changes by another process. + * @param udb: the udb. + */ +void udb_base_remap_process(udb_base* udb); + +/** + * get the user data (relative) pointer. + * @param udb: the udb. + * @return the userdata relative pointer, 0 means nothing. + */ +udb_rel_ptr* udb_base_get_userdata(udb_base* udb); + +/** + * Set the user data (relative) pointer. + * @param udb: the udb. + * @param user: user data. offset-pointer (or 0). + */ +void udb_base_set_userdata(udb_base* udb, udb_void user); + +/** + * Set the user flags (to any value, uint8). + * @param udb: the udb. + * @param v: new value. + */ +void udb_base_set_userflags(udb_base* udb, uint8_t v); + +/** + * Get the user flags. + * @param udb: the udb. + * @param v: new value. + */ +uint8_t udb_base_get_userflags(udb_base* udb); + +/** + * Not for users of udb_base, but for udb_ptr. + * Link in a new ptr that references a data segment. + * @param udb: the udb. + * @param ptr: to link in. + */ +void udb_base_link_ptr(udb_base* udb, udb_ptr* ptr); + +/** + * Not for users of udb_base, but for udb_ptr. + * Unlink a ptr that references a data segment. + * @param udb: the udb. + * @param ptr: to unlink. + */ +void udb_base_unlink_ptr(udb_base* udb, udb_ptr* ptr); + +/* UDB ALLOC */ +/** + * Utility for alloc, find 2**x size that is bigger than the given size. + * Does not work for amount==0. + * @param amount: amount of memory. + * @return x; the exponent where 2**x >= amount. + */ +int udb_exp_size(uint64_t amount); + +/** + * Utility for alloc, what is the size that the current offset supports + * as a maximum 2**x chunk. + * Does not work for offset = 0 (result is infinite). + * @param offset: the offset into the memory region. + * @return maximum exponent where 2**x is fits the offset, thus + * offset % (2**x) == 0 and x cannot be larger. + */ +int udb_exp_offset(uint64_t offset); + +/** + * Convert pointer to the data part to a pointer to the base of the chunk. + * @param data: data part. + * @return pointer to the base of the chunk. + */ +udb_void chunk_from_dataptr_ext(udb_void data); + +/** + * Create empty UDB allocate structure to write to disk to initialize file. + * @param a: allocation structure to initialize. system pointer. + */ +void udb_alloc_init_new(udb_alloc_d* a); + +/** + * Create new udb allocator, with specific data on disk + * @param udb: the udb. + * @param disk: disk data. + * @return udb allocator or NULL on (malloc) failure. + */ +udb_alloc* udb_alloc_create(udb_base* udb, udb_alloc_d* disk); + +/** + * Free the udb allocator from memory. + * @param alloc: the udb space allocator. + */ +void udb_alloc_delete(udb_alloc* alloc); + +/** + * Allocate space on the disk. + * This may involve closing and reopening the mmap. 
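+ * Raw system pointers into the mmap may therefore become stale; keep
+ * relative offsets (udb_void) or linked udb_ptr structures across the call.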
+ * @param alloc: the udb space allocator. + * @param sz: size you want to use. + * @return relative pointer (or 0 on alloc failure). + */ +udb_void udb_alloc_space(udb_alloc* alloc, size_t sz); + +/** + * Allocate space on disk, give already the data you want there. + * This may involve closing and reopening the mmap. + * @param alloc: the udb space allocator. + * @param d: data you want there (system pointer). + * @param sz: size you want to use. + * @return relative pointer (or 0 on alloc failure). + */ +udb_void udb_alloc_init(udb_alloc* alloc, void* d, size_t sz); + +/** + * free allocated space. It may shrink the file. + * This may involve closing and reopening the mmap. + * @param alloc: the udb space allocator. + * @param r: relative pointer to data you want to free. + * @param sz: the size of the data you stop using. + * @return false if the free failed, it failed the close and mmap. + */ +int udb_alloc_free(udb_alloc* alloc, udb_void r, size_t sz); + +/** + * realloc an existing allocated space. It may grow the file. + * This may involve closing and reopening the mmap. + * It could also use the existing space where it is now. + * @param alloc: the udb space allocator. + * @param r: relative pointer to data you want to realloc. + * if 0 then this is alloc_space(), and osz is ignored. + * @param osz: the old size of the data. + * @param sz: the size of the data you want to get. + * if this is 0 then a free() is done, but please do it directly, + * as you then get a returnvalue (file errors). + * @return relative pointer (0 on alloc failure, same if not moved). + */ +udb_void udb_alloc_realloc(udb_alloc* alloc, udb_void r, size_t osz, + size_t sz); + +/** + * Prepare for a lot of new entries. Grow space for that. + * This can involve closing and reopening the mmap. + * This space (if large) is going to be released on next free() or close(). + * @param alloc: the udb space allocator. + * @param sz: size of the entries. + * @param num: number of entries. + * @return false on failure to grow or re-mmap. + */ +int udb_alloc_grow(udb_alloc* alloc, size_t sz, size_t num); + +/** + * Set the alloc type for a newly alloced piece of data + * @param alloc: the udb space allocator. + * @param r: relativeptr to the data. + * @param tp: the type of that block. + */ +void udb_alloc_set_type(udb_alloc* alloc, udb_void r, udb_chunk_type tp); + +/** + * See if a pointer could be valid (it points within valid space), + * for the given type side. For debug checks. + * @param udb: the udb + * @param to: the ptr (offset). + * @param destsize: the size_of of the destination of the pointer. + * @return true if it points to a valid region. + */ +int udb_valid_offset(udb_base* udb, udb_void to, size_t destsize); + +/** + * See if a pointer is valid (it points to a chunk). For debug checks. + * @param udb: the udb. + * @param to: the ptr (offset). + * @return true if it points to the start of a chunks data region. + */ +int udb_valid_dataptr(udb_base* udb, udb_void to); + +/** + * See if a pointer is on the relptrlist for dataptr. For debug checks. + * @param udb: the udb. + * @param rptr: the rel_ptr (offset). + * @param to: dataptr of the chunk on which ptrlist the rptr is searched. + * @return true if rptr is valid and on the ptrlist. + */ +int udb_valid_rptr(udb_base* udb, udb_void rptr, udb_void to); + +/*** UDB_REL_PTR ***/ +/** + * Init a new UDB rel ptr at NULL. + * @param ptr: sysptr, becomes inited. + */ +void udb_rel_ptr_init(udb_rel_ptr* ptr); + +/** + * Unlink a UDB rel ptr. 
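+ * It is taken out of the relptr list of the chunk it points to; no space
+ * is freed by this.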
+ * @param base: the udb base + * @param ptr: sysptr, unlinked + */ +void udb_rel_ptr_unlink(void* base, udb_rel_ptr* ptr); + +/** + * Link a UDB rel ptr to a new chunk + * @param base: the udb base + * @param ptr: sysptr, linked to new value. + * @param to: the data to point to (relative ptr). + */ +void udb_rel_ptr_link(void* base, udb_rel_ptr* ptr, udb_void to); + +/** + * Change rel ptr to a new value (point to another record) + * @param base: the udb base + * @param ptr: sysptr, points to new value. + * @param to: the data to point to (relative ptr). + */ +void udb_rel_ptr_set(void* base, udb_rel_ptr* ptr, udb_void to); + +/** + * A chunk has moved and now edit all the relptrs in list to fix them up + * @param base: the udb base + * @param list: start of the ptr list + * @param to: where the chunk has moved to relptr to its userdata. + */ +void udb_rel_ptr_edit(void* base, udb_void list, udb_void to); + +/** + * Get system pointer. Assumes there is a variable named 'base' + * that points to the udb base. + * @param ptr: the relative pointer (a sysptr to it). + * @return void* to the data. + */ +#define UDB_SYSPTR(ptr) UDB_REL(base, (ptr)->data) + +/** get sys ptr for char* string */ +#define UDB_CHAR(ptr) ((char*)UDB_REL(base, ptr)) +/** get sys ptr for udb_rel_ptr */ +#define UDB_REL_PTR(ptr) ((udb_rel_ptr*)UDB_REL(base, ptr)) +/** get sys ptr for udb_glob_d */ +#define UDB_GLOB(ptr) ((udb_glob_d*)UDB_REL(base, ptr)) +/** get sys ptr for udb_chunk_d */ +#define UDB_CHUNK(ptr) ((udb_chunk_d*)UDB_REL(base, ptr)) +/** get sys ptr for udb_free_chunk_d */ +#define UDB_FREE_CHUNK(ptr) ((udb_free_chunk_d*)UDB_REL(base, ptr)) +/** get sys ptr for udb_xl_chunk_d */ +#define UDB_XL_CHUNK(ptr) ((udb_xl_chunk_d*)UDB_REL(base, ptr)) + +/* udb_ptr */ +/** + * Initialize an udb ptr. Set to NULL. (and thus not linked can be deleted). + * You MUST set it to 0 before you stop using the ptr. + * @param ptr: the ptr to initialise (caller has allocated it). + * @param udb: the udb base to link it to. + */ +void udb_ptr_init(udb_ptr* ptr, udb_base* udb); + +/** + * Set udp ptr to a new value. If set to NULL you can delete it. + * @param ptr: the ptr. + * @param udb: the udb base to link up with that data segment's administration. + * @param newval: new value to point to (udb_void relative file offset to data). + */ +void udb_ptr_set(udb_ptr* ptr, udb_base* udb, udb_void newval); + +/** dereference udb_ptr */ +#define UDB_PTR(ptr) (UDB_REL(*((ptr)->base), (ptr)->data)) + +/** + * Ease of use udb ptr, allocate space and return ptr to it + * You MUST udb_ptr_set it to 0 before you stop using the ptr. + * @param base: udb base to use. + * @param ptr: ptr is overwritten, can be uninitialised. + * @param type: type of the allocation. + * You need a special type if the block contains udb_rel_ptr's. + * You can use udb_type_data for plain data. + * @param sz: amount to allocate. + * @return 0 on alloc failure. + */ +int udb_ptr_alloc_space(udb_ptr* ptr, udb_base* udb, udb_chunk_type type, + size_t sz); + +/** + * Ease of use udb ptr, free space and set ptr to NULL (to it can be deleted). + * The space is freed on disk. + * @param ptr: the ptr. + * @param udb: udb base. + * @param sz: the size of the data you stop using. + */ +void udb_ptr_free_space(udb_ptr* ptr, udb_base* udb, size_t sz); + +/** + * Get pointer to the data of the ptr. 
or use a macro to cast UDB_PTR to + * the type of your structure(.._d) + */ +static inline uint8_t* udb_ptr_data(udb_ptr* ptr) { + return (uint8_t*)UDB_PTR(ptr); +} + +/** + * See if udb ptr is null + */ +static inline int udb_ptr_is_null(udb_ptr* ptr) { + return (ptr->data == 0); +} + +/** + * Get the type of a udb_ptr chunk. + * @param ptr: udb pointer + * @return type of chunk */ +udb_chunk_type udb_ptr_get_type(udb_ptr* ptr); + +/** Ease of use, create new pointer to destination relptr + * You MUST udb_ptr_set it to 0 before you stop using the ptr. */ +static inline void udb_ptr_new(udb_ptr* ptr, udb_base* udb, udb_rel_ptr* d) { + udb_ptr_init(ptr, udb); + udb_ptr_set(ptr, udb, d->data); +} + +/** Ease of use. Stop using this ptr */ +static inline void udb_ptr_unlink(udb_ptr* ptr, udb_base* udb) { + if(ptr->data) + udb_base_unlink_ptr(udb, ptr); +} + +/* Ease of use. Assign rptr from rptr */ +static inline void udb_rptr_set_rptr(udb_rel_ptr* dest, udb_base* udb, + udb_rel_ptr* p) { +#ifdef UDB_CHECK + if(dest->data) { assert(udb_valid_rptr(udb, + UDB_SYSTOREL(udb->base, dest), dest->data)); } + if(p->data) { assert(udb_valid_rptr(udb, + UDB_SYSTOREL(udb->base, p), p->data)); } +#endif + udb_rel_ptr_set(udb->base, dest, p->data); +} + +/* Ease of use. Assign rptr from ptr */ +static inline void udb_rptr_set_ptr(udb_rel_ptr* dest, udb_base* udb, + udb_ptr* p) { +#ifdef UDB_CHECK + if(dest->data) { assert(udb_valid_rptr(udb, + UDB_SYSTOREL(udb->base, dest), dest->data)); } + if(p->data) { assert(udb_valid_dataptr(udb, p->data)); } +#endif + udb_rel_ptr_set(udb->base, dest, p->data); +} + +/* Ease of use. Assign ptr from rptr */ +static inline void udb_ptr_set_rptr(udb_ptr* dest, udb_base* udb, + udb_rel_ptr* p) { +#ifdef UDB_CHECK + if(p->data) { assert(udb_valid_rptr(udb, + UDB_SYSTOREL(udb->base, p), p->data)); } +#endif + udb_ptr_set(dest, udb, p->data); +} + +/* Ease of use. Assign ptr from ptr */ +static inline void udb_ptr_set_ptr(udb_ptr* dest, udb_base* udb, udb_ptr* p) { + udb_ptr_set(dest, udb, p->data); +} + +/* Ease of use, zero rptr. You use this to zero an existing pointer. + * A new rptr should be rel_ptr_init-ed before it is taken into use. */ +static inline void udb_rptr_zero(udb_rel_ptr* dest, udb_base* udb) { +#ifdef UDB_CHECK + if(dest->data) { assert(udb_valid_rptr(udb, + UDB_SYSTOREL(udb->base, dest), dest->data)); } +#endif + udb_rel_ptr_set(udb->base, dest, 0); +} + +/* Ease of use, zero ptr */ +static inline void udb_ptr_zero(udb_ptr* dest, udb_base* udb) { + udb_ptr_set(dest, udb, 0); +} + +/** ease of use, delete memory pointed at by relptr */ +static inline void udb_rel_ptr_free_space(udb_rel_ptr* ptr, udb_base* udb, + size_t sz) { + udb_void d = ptr->data; +#ifdef UDB_CHECK + if(d) { assert(udb_valid_rptr(udb, UDB_SYSTOREL(udb->base, ptr), d)); } +#endif + udb_rel_ptr_set(udb->base, ptr, 0); + udb_alloc_free(udb->alloc, d, sz); +} + +#endif /* UDB_H */ diff --git a/usr.sbin/nsd/udbradtree.c b/usr.sbin/nsd/udbradtree.c new file mode 100644 index 00000000000..d9be6b9c255 --- /dev/null +++ b/usr.sbin/nsd/udbradtree.c @@ -0,0 +1,1463 @@ +/* + * udbradtree -- radix tree for binary strings for in udb file. + * + * Copyright (c) 2011, NLnet Labs. See LICENSE for license. 
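+ *
+ * This is the on-disk counterpart of the in-memory radix tree in radtree.c.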
+ */ +#include "config.h" +#include <string.h> +#include <assert.h> +#include <stdio.h> +#include "udbradtree.h" +#include "radtree.h" +#define RADARRAY(ptr) ((struct udb_radarray_d*)UDB_PTR(ptr)) + +/** see if radarray can be reduced (by a factor of two) */ +static int udb_radarray_reduce_if_needed(udb_base* udb, udb_ptr* n); + +int udb_radix_tree_create(udb_base* udb, udb_ptr* ptr) +{ + if(!udb_ptr_alloc_space(ptr, udb, udb_chunk_type_radtree, + sizeof(struct udb_radtree_d))) + return 0; + udb_rel_ptr_init(&RADTREE(ptr)->root); + RADTREE(ptr)->count = 0; + return 1; +} + +/** size of radarray */ +static size_t size_of_radarray(struct udb_radarray_d* a) +{ + return sizeof(struct udb_radarray_d)+((size_t)a->capacity)*( + sizeof(struct udb_radsel_d)+(size_t)a->str_cap); +} + +/** size in bytes of data in the array lookup structure */ +static size_t size_of_lookup(udb_ptr* node) +{ + assert(udb_ptr_get_type(node) == udb_chunk_type_radnode); + return size_of_radarray((struct udb_radarray_d*)UDB_REL(*node->base, + RADNODE(node)->lookup.data)); +} + +/** external variant, size in bytes of data in the array lookup structure */ +size_t size_of_lookup_ext(udb_ptr* lookup) +{ + return size_of_lookup(lookup); +} + +/** size needed for a lookup array like this */ +static size_t size_of_lookup_needed(uint16_t capacity, udb_radstrlen_t str_cap) +{ + return sizeof(struct udb_radarray_d)+ ((size_t)capacity)*( + sizeof(struct udb_radsel_d)+(size_t)str_cap); +} + +/** get the lookup array for a node */ +static struct udb_radarray_d* lookup(udb_ptr* n) +{ + assert(udb_ptr_get_type(n) == udb_chunk_type_radnode); + return (struct udb_radarray_d*)UDB_REL(*n->base, + RADNODE(n)->lookup.data); +} + +/** get a length in the lookup array */ +static udb_radstrlen_t lookup_len(udb_ptr* n, unsigned i) +{ + return lookup(n)->array[i].len; +} + +/** get a string in the lookup array */ +static uint8_t* lookup_string(udb_ptr* n, unsigned i) +{ + return ((uint8_t*)&(lookup(n)->array[lookup(n)->capacity]))+ + i*lookup(n)->str_cap; +} + +/** get a node in the lookup array */ +static struct udb_radnode_d* lookup_node(udb_ptr* n, unsigned i) +{ + return (struct udb_radnode_d*)UDB_REL(*n->base, + lookup(n)->array[i].node.data); +} + +/** zero the relptrs in radarray */ +static void udb_radarray_zero_ptrs(udb_base* udb, udb_ptr* n) +{ + unsigned i; + for(i=0; i<lookup(n)->len; i++) { + udb_rptr_zero(&lookup(n)->array[i].node, udb); + } +} + +/** delete a radnode */ +static void udb_radnode_delete(udb_base* udb, udb_ptr* n) +{ + if(udb_ptr_is_null(n)) + return; + if(RADNODE(n)->lookup.data) { + udb_radarray_zero_ptrs(udb, n); + udb_rel_ptr_free_space(&RADNODE(n)->lookup, udb, + size_of_lookup(n)); + } + udb_rptr_zero(&RADNODE(n)->lookup, udb); + udb_rptr_zero(&RADNODE(n)->parent, udb); + udb_rptr_zero(&RADNODE(n)->elem, udb); + udb_ptr_free_space(n, udb, sizeof(struct udb_radnode_d)); +} + +/** delete radnodes in postorder recursion, n is ptr to node */ +static void udb_radnode_del_postorder(udb_base* udb, udb_ptr* n) +{ + unsigned i; + udb_ptr sub; + if(udb_ptr_is_null(n)) + return; + /* clear subnodes */ + udb_ptr_init(&sub, udb); + for(i=0; i<lookup(n)->len; i++) { + udb_ptr_set_rptr(&sub, udb, &lookup(n)->array[i].node); + udb_rptr_zero(&lookup(n)->array[i].node, udb); + udb_radnode_del_postorder(udb, &sub); + } + udb_ptr_unlink(&sub, udb); + /* clear lookup */ + udb_rel_ptr_free_space(&RADNODE(n)->lookup, udb, size_of_lookup(n)); + udb_rptr_zero(&RADNODE(n)->parent, udb); + udb_rptr_zero(&RADNODE(n)->elem, udb); + 
udb_ptr_free_space(n, udb, sizeof(struct udb_radnode_d)); +} + +void udb_radix_tree_clear(udb_base* udb, udb_ptr* rt) +{ + udb_ptr root; + udb_ptr_new(&root, udb, &RADTREE(rt)->root); + udb_rptr_zero(&RADTREE(rt)->root, udb); + /* free the root node (and its descendants, if any) */ + udb_radnode_del_postorder(udb, &root); + udb_ptr_unlink(&root, udb); + + RADTREE(rt)->count = 0; +} + +void udb_radix_tree_delete(udb_base* udb, udb_ptr* rt) +{ + if(rt->data == 0) return; + assert(udb_ptr_get_type(rt) == udb_chunk_type_radtree); + udb_radix_tree_clear(udb, rt); + udb_ptr_free_space(rt, udb, sizeof(struct udb_radtree_d)); +} + +/** + * Find a prefix of the key, in whole-nodes. + * Finds the longest prefix that corresponds to a whole radnode entry. + * There may be a slightly longer prefix in one of the array elements. + * @param result: the longest prefix, the entry itself if *respos==len, + * otherwise an array entry, residx. Output. + * @param respos: pos in string where next unmatched byte is, if == len an + * exact match has been found. If == 0 then a "" match was found. + * @return false if no prefix found, not even the root "" prefix. + */ +static int udb_radix_find_prefix_node(udb_base* udb, udb_ptr* rt, uint8_t* k, + udb_radstrlen_t len, udb_ptr* result, udb_radstrlen_t* respos) +{ + udb_radstrlen_t pos = 0; + uint8_t byte; + udb_ptr n; + udb_ptr_new(&n, udb, &RADTREE(rt)->root); + + *respos = 0; + udb_ptr_set_ptr(result, udb, &n); + if(udb_ptr_is_null(&n)) { + udb_ptr_unlink(&n, udb); + return 0; + } + while(!udb_ptr_is_null(&n)) { + if(pos == len) { + break; + } + byte = k[pos]; + if(byte < RADNODE(&n)->offset) { + break; + } + byte -= RADNODE(&n)->offset; + if(byte >= lookup(&n)->len) { + break; + } + pos++; + if(lookup(&n)->array[byte].len != 0) { + /* must match additional string */ + if(pos+lookup(&n)->array[byte].len > len) { + break; + } + if(memcmp(&k[pos], lookup_string(&n, byte), + lookup(&n)->array[byte].len) != 0) { + break; + } + pos += lookup(&n)->array[byte].len; + } + udb_ptr_set_rptr(&n, udb, &lookup(&n)->array[byte].node); + if(udb_ptr_is_null(&n)) { + break; + } + *respos = pos; + udb_ptr_set_ptr(result, udb, &n); + } + udb_ptr_unlink(&n, udb); + return 1; +} + +/** grow the radnode stringcapacity, copy existing elements */ +static int udb_radnode_str_grow(udb_base* udb, udb_ptr* n, udb_radstrlen_t want) +{ + unsigned ns = ((unsigned)lookup(n)->str_cap)*2; + unsigned i; + udb_ptr a; + if(want > ns) + ns = want; + if(ns > 65535) ns = 65535; /* MAX of udb_radstrlen_t range */ + /* if this fails, the tree is still usable */ + if(!udb_ptr_alloc_space(&a, udb, udb_chunk_type_radarray, + size_of_lookup_needed(lookup(n)->capacity, ns))) + return 0; + /* make sure to zero the newly allocated relptrs to init them */ + memcpy(RADARRAY(&a), lookup(n), sizeof(struct udb_radarray_d)); + RADARRAY(&a)->str_cap = ns; + for(i = 0; i < lookup(n)->len; i++) { + udb_rel_ptr_init(&RADARRAY(&a)->array[i].node); + udb_rptr_set_rptr(&RADARRAY(&a)->array[i].node, udb, + &lookup(n)->array[i].node); + RADARRAY(&a)->array[i].len = lookup_len(n, i); + memmove(((uint8_t*)(&RADARRAY(&a)->array[ + lookup(n)->capacity]))+i*ns, + lookup_string(n, i), lookup(n)->str_cap); + } + udb_radarray_zero_ptrs(udb, n); + udb_rel_ptr_free_space(&RADNODE(n)->lookup, udb, size_of_lookup(n)); + udb_rptr_set_ptr(&RADNODE(n)->lookup, udb, &a); + udb_ptr_unlink(&a, udb); + return 1; +} + +/** grow the radnode array, copy existing elements to start of new array */ +static int udb_radnode_array_grow(udb_base* udb, 
udb_ptr* n, size_t want) +{ + unsigned i; + unsigned ns = ((unsigned)lookup(n)->capacity)*2; + udb_ptr a; + assert(want <= 256); /* cannot be more, range of uint8 */ + if(want > ns) + ns = want; + if(ns > 256) ns = 256; + /* if this fails, the tree is still usable */ + if(!udb_ptr_alloc_space(&a, udb, udb_chunk_type_radarray, + size_of_lookup_needed(ns, lookup(n)->str_cap))) + return 0; + /* zero the newly allocated rel ptrs to init them */ + memset(UDB_PTR(&a), 0, size_of_lookup_needed(ns, lookup(n)->str_cap)); + assert(lookup(n)->len <= lookup(n)->capacity); + assert(lookup(n)->capacity < ns); + memcpy(RADARRAY(&a), lookup(n), sizeof(struct udb_radarray_d)); + RADARRAY(&a)->capacity = ns; + for(i=0; i<lookup(n)->len; i++) { + udb_rptr_set_rptr(&RADARRAY(&a)->array[i].node, udb, + &lookup(n)->array[i].node); + RADARRAY(&a)->array[i].len = lookup_len(n, i); + } + memmove(&RADARRAY(&a)->array[ns], lookup_string(n, 0), + lookup(n)->len * lookup(n)->str_cap); + udb_radarray_zero_ptrs(udb, n); + udb_rel_ptr_free_space(&RADNODE(n)->lookup, udb, size_of_lookup(n)); + udb_rptr_set_ptr(&RADNODE(n)->lookup, udb, &a); + udb_ptr_unlink(&a, udb); + return 1; +} + +/** make empty array in radnode */ +static int udb_radnode_array_create(udb_base* udb, udb_ptr* n) +{ + /* is there an array? */ + if(RADNODE(n)->lookup.data == 0) { + /* create array */ + udb_ptr a; + uint16_t cap = 0; + udb_radstrlen_t len = 0; + if(!udb_ptr_alloc_space(&a, udb, udb_chunk_type_radarray, + size_of_lookup_needed(cap, len))) + return 0; + memset(UDB_PTR(&a), 0, size_of_lookup_needed(cap, len)); + udb_rptr_set_ptr(&RADNODE(n)->lookup, udb, &a); + RADARRAY(&a)->len = cap; + RADARRAY(&a)->capacity = cap; + RADARRAY(&a)->str_cap = len; + RADNODE(n)->offset = 0; + udb_ptr_unlink(&a, udb); + } + return 1; +} + +/** make space in radnode for another byte, or longer strings */ +static int udb_radnode_array_space(udb_base* udb, udb_ptr* n, uint8_t byte, + udb_radstrlen_t len) +{ + /* is there an array? */ + if(RADNODE(n)->lookup.data == 0) { + /* create array */ + udb_ptr a; + uint16_t cap = 1; + if(!udb_ptr_alloc_space(&a, udb, udb_chunk_type_radarray, + size_of_lookup_needed(cap, len))) + return 0; + /* this memset inits the relptr that is allocated */ + memset(UDB_PTR(&a), 0, size_of_lookup_needed(cap, len)); + udb_rptr_set_ptr(&RADNODE(n)->lookup, udb, &a); + RADARRAY(&a)->len = cap; + RADARRAY(&a)->capacity = cap; + RADARRAY(&a)->str_cap = len; + RADNODE(n)->offset = byte; + udb_ptr_unlink(&a, udb); + return 1; + } + if(lookup(n)->capacity == 0) { + if(!udb_radnode_array_grow(udb, n, 1)) + return 0; + } + + /* make space for this stringsize */ + if(lookup(n)->str_cap < len) { + /* must resize for stringsize */ + if(!udb_radnode_str_grow(udb, n, len)) + return 0; + } + + /* other cases */ + /* is the array unused? */ + if(lookup(n)->len == 0 && lookup(n)->capacity != 0) { + lookup(n)->len = 1; + RADNODE(n)->offset = byte; + memset(&lookup(n)->array[0], 0, sizeof(struct udb_radsel_d)); + /* is it below the offset? */ + } else if(byte < RADNODE(n)->offset) { + /* is capacity enough? 
*/ + int i; + unsigned need = RADNODE(n)->offset-byte; + if(lookup(n)->len+need > lookup(n)->capacity) { + /* grow array */ + if(!udb_radnode_array_grow(udb, n, lookup(n)->len+need)) + return 0; + } + /* take a piece of capacity into use, init the relptrs */ + for(i = lookup(n)->len; i< (int)(lookup(n)->len + need); i++) { + udb_rel_ptr_init(&lookup(n)->array[i].node); + } + /* reshuffle items to end */ + for(i = lookup(n)->len-1; i >= 0; i--) { + udb_rptr_set_rptr(&lookup(n)->array[need+i].node, + udb, &lookup(n)->array[i].node); + lookup(n)->array[need+i].len = lookup_len(n, i); + /* fixup pidx */ + if(lookup(n)->array[i+need].node.data) + lookup_node(n, i+need)->pidx = i+need; + } + memmove(lookup_string(n, need), lookup_string(n, 0), + lookup(n)->len*lookup(n)->str_cap); + /* zero the first */ + for(i = 0; i < (int)need; i++) { + udb_rptr_zero(&lookup(n)->array[i].node, udb); + lookup(n)->array[i].len = 0; + } + lookup(n)->len += need; + RADNODE(n)->offset = byte; + /* is it above the max? */ + } else if(byte - RADNODE(n)->offset >= lookup(n)->len) { + /* is capacity enough? */ + int i; + unsigned need = (byte-RADNODE(n)->offset) - lookup(n)->len + 1; + /* grow array */ + if(lookup(n)->len + need > lookup(n)->capacity) { + if(!udb_radnode_array_grow(udb, n, lookup(n)->len+need)) + return 0; + } + /* take new entries into use, init relptrs */ + for(i = lookup(n)->len; i< (int)(lookup(n)->len + need); i++) { + udb_rel_ptr_init(&lookup(n)->array[i].node); + lookup(n)->array[i].len = 0; + } + /* grow length */ + lookup(n)->len += need; + } + return 1; +} + +/** make space for string size */ +static int udb_radnode_str_space(udb_base* udb, udb_ptr* n, udb_radstrlen_t len) +{ + if(RADNODE(n)->lookup.data == 0) { + return udb_radnode_array_space(udb, n, 0, len); + } + if(lookup(n)->str_cap < len) { + /* must resize for stringsize */ + if(!udb_radnode_str_grow(udb, n, len)) + return 0; + } + return 1; +} + +/** copy remainder from prefixes for a split: + * plen: len prefix, l: longer bstring, llen: length of l. */ +static void udb_radsel_prefix_remainder(udb_radstrlen_t plen, + uint8_t* l, udb_radstrlen_t llen, + uint8_t* s, udb_radstrlen_t* slen) +{ + *slen = llen - plen; + /* assert(*slen <= lookup(n)->str_cap); */ + memmove(s, l+plen, llen-plen); +} + +/** create a prefix in the array strs */ +static void udb_radsel_str_create(uint8_t* s, udb_radstrlen_t* slen, + uint8_t* k, udb_radstrlen_t pos, udb_radstrlen_t len) +{ + *slen = len-pos; + /* assert(*slen <= lookup(n)->str_cap); */ + memmove(s, k+pos, len-pos); +} + +static udb_radstrlen_t +udb_bstr_common(uint8_t* x, udb_radstrlen_t xlen, + uint8_t* y, udb_radstrlen_t ylen) +{ + assert(sizeof(radstrlen_t) == sizeof(udb_radstrlen_t)); + return bstr_common_ext(x, xlen, y, ylen); +} + +static int +udb_bstr_is_prefix(uint8_t* p, udb_radstrlen_t plen, + uint8_t* x, udb_radstrlen_t xlen) +{ + assert(sizeof(radstrlen_t) == sizeof(udb_radstrlen_t)); + return bstr_is_prefix_ext(p, plen, x, xlen); +} + +/** grow array space for byte N after a string, (but if string shorter) */ +static int +udb_radnode_array_space_strremain(udb_base* udb, udb_ptr* n, + uint8_t* str, udb_radstrlen_t len, udb_radstrlen_t pos) +{ + assert(pos < len); + /* shift by one char because it goes in lookup array */ + return udb_radnode_array_space(udb, n, str[pos], len-(pos+1)); +} + + +/** radsel create a split when two nodes have shared prefix. + * @param udb: udb + * @param n: node with the radsel that gets changed, it contains a node. 
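+ *    (the lookup array entry at idx already holds a node whose edge string
+ *    shares a prefix with the remainder of k)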
+ * @param idx: the index of the radsel that gets changed. + * @param k: key byte string + * @param pos: position where the string enters the radsel (e.g. r.str) + * @param len: length of k. + * @param add: additional node for the string k. + * removed by called on failure. + * @return false on alloc failure, no changes made. + */ +static int udb_radsel_split(udb_base* udb, udb_ptr* n, uint8_t idx, uint8_t* k, + udb_radstrlen_t pos, udb_radstrlen_t len, udb_ptr* add) +{ + uint8_t* addstr = k+pos; + udb_radstrlen_t addlen = len-pos; + if(udb_bstr_is_prefix(addstr, addlen, lookup_string(n, idx), + lookup_len(n, idx))) { + udb_radstrlen_t split_len = 0; + /* 'add' is a prefix of r.node */ + /* also for empty addstr */ + /* set it up so that the 'add' node has r.node as child */ + /* so, r.node gets moved below the 'add' node, but we do + * this so that the r.node stays the same pointer for its + * key name */ + assert(addlen != lookup_len(n, idx)); + assert(addlen < lookup_len(n, idx)); + /* make space for new string sizes */ + if(!udb_radnode_str_space(udb, n, addlen)) + return 0; + if(lookup_len(n, idx) - addlen > 1) + /* shift one because a char is in the lookup array */ + split_len = lookup_len(n, idx) - (addlen+1); + if(!udb_radnode_array_space(udb, add, + lookup_string(n, idx)[addlen], split_len)) + return 0; + /* alloc succeeded, now link it in */ + udb_rptr_set_rptr(&RADNODE(add)->parent, udb, + &lookup_node(n, idx)->parent); + RADNODE(add)->pidx = lookup_node(n, idx)->pidx; + udb_rptr_set_rptr(&lookup(add)->array[0].node, udb, + &lookup(n)->array[idx].node); + if(lookup_len(n, idx) - addlen > 1) { + udb_radsel_prefix_remainder(addlen+1, + lookup_string(n, idx), lookup_len(n, idx), + lookup_string(add, 0), + &lookup(add)->array[0].len); + } else { + lookup(add)->array[0].len = 0; + } + udb_rptr_set_ptr(&lookup_node(n, idx)->parent, udb, add); + lookup_node(n, idx)->pidx = 0; + + udb_rptr_set_ptr(&lookup(n)->array[idx].node, udb, add); + memmove(lookup_string(n, idx), addstr, addlen); + lookup(n)->array[idx].len = addlen; + /* n's string may have become shorter */ + if(!udb_radarray_reduce_if_needed(udb, n)) { + /* ignore this, our tree has become inefficient */ + } + } else if(udb_bstr_is_prefix(lookup_string(n, idx), lookup_len(n, idx), + addstr, addlen)) { + udb_radstrlen_t split_len = 0; + udb_ptr rnode; + /* r.node is a prefix of 'add' */ + /* set it up so that the 'r.node' has 'add' as child */ + /* and basically, r.node is already completely fine, + * we only need to create a node as its child */ + assert(addlen != lookup_len(n, idx)); + assert(lookup_len(n, idx) < addlen); + udb_ptr_new(&rnode, udb, &lookup(n)->array[idx].node); + /* make space for string length */ + if(addlen-lookup_len(n, idx) > 1) { + /* shift one because a character goes into array */ + split_len = addlen - (lookup_len(n, idx)+1); + } + if(!udb_radnode_array_space(udb, &rnode, + addstr[lookup_len(n, idx)], split_len)) { + udb_ptr_unlink(&rnode, udb); + return 0; + } + /* alloc succeeded, now link it in */ + udb_rptr_set_ptr(&RADNODE(add)->parent, udb, &rnode); + RADNODE(add)->pidx = addstr[lookup_len(n, idx)] - + RADNODE(&rnode)->offset; + udb_rptr_set_ptr(&lookup(&rnode)->array[ RADNODE(add)->pidx ] + .node, udb, add); + if(addlen-lookup_len(n, idx) > 1) { + udb_radsel_prefix_remainder(lookup_len(n, idx)+1, + addstr, addlen, + lookup_string(&rnode, RADNODE(add)->pidx), + &lookup(&rnode)->array[ RADNODE(add)->pidx] + .len); + } else { + lookup(&rnode)->array[ RADNODE(add)->pidx].len = 0; + } + /* rnode's 
string has become shorter */ + if(!udb_radarray_reduce_if_needed(udb, &rnode)) { + /* ignore this, our tree has become inefficient */ + } + udb_ptr_unlink(&rnode, udb); + } else { + /* okay we need to create a new node that chooses between + * the nodes 'add' and r.node + * We do this so that r.node stays the same pointer for its + * key name. */ + udb_ptr com, rnode; + udb_radstrlen_t common_len = udb_bstr_common( + lookup_string(n, idx), lookup_len(n, idx), + addstr, addlen); + assert(common_len < lookup_len(n, idx)); + assert(common_len < addlen); + udb_ptr_new(&rnode, udb, &lookup(n)->array[idx].node); + + /* create the new node for choice */ + if(!udb_ptr_alloc_space(&com, udb, udb_chunk_type_radnode, + sizeof(struct udb_radnode_d))) { + udb_ptr_unlink(&rnode, udb); + return 0; /* out of space */ + } + memset(UDB_PTR(&com), 0, sizeof(struct udb_radnode_d)); + /* make stringspace for the two substring choices */ + /* this allocates the com->lookup array */ + if(!udb_radnode_array_space_strremain(udb, &com, + lookup_string(n, idx), lookup_len(n, idx), common_len) + || !udb_radnode_array_space_strremain(udb, &com, + addstr, addlen, common_len)) { + udb_ptr_unlink(&rnode, udb); + udb_radnode_delete(udb, &com); + return 0; + } + /* create stringspace for the shared prefix */ + if(common_len > 0) { + if(!udb_radnode_str_space(udb, n, common_len-1)) { + udb_ptr_unlink(&rnode, udb); + udb_radnode_delete(udb, &com); + return 0; + } + } + /* allocs succeeded, proceed to link it all up */ + udb_rptr_set_rptr(&RADNODE(&com)->parent, udb, + &RADNODE(&rnode)->parent); + RADNODE(&com)->pidx = RADNODE(&rnode)->pidx; + udb_rptr_set_ptr(&RADNODE(&rnode)->parent, udb, &com); + RADNODE(&rnode)->pidx = lookup_string(n, idx)[common_len] - + RADNODE(&com)->offset; + udb_rptr_set_ptr(&RADNODE(add)->parent, udb, &com); + RADNODE(add)->pidx = addstr[common_len] - + RADNODE(&com)->offset; + udb_rptr_set_ptr(&lookup(&com)->array[RADNODE(&rnode)->pidx] + .node, udb, &rnode); + if(lookup_len(n, idx)-common_len > 1) { + udb_radsel_prefix_remainder(common_len+1, + lookup_string(n, idx), lookup_len(n, idx), + lookup_string(&com, RADNODE(&rnode)->pidx), + &lookup(&com)->array[RADNODE(&rnode)->pidx].len); + } else { + lookup(&com)->array[RADNODE(&rnode)->pidx].len= 0; + } + udb_rptr_set_ptr(&lookup(&com)->array[RADNODE(add)->pidx] + .node, udb, add); + if(addlen-common_len > 1) { + udb_radsel_prefix_remainder(common_len+1, + addstr, addlen, + lookup_string(&com, RADNODE(add)->pidx), + &lookup(&com)->array[RADNODE(add)->pidx].len); + } else { + lookup(&com)->array[RADNODE(add)->pidx].len = 0; + } + memmove(lookup_string(n, idx), addstr, common_len); + lookup(n)->array[idx].len = common_len; + udb_rptr_set_ptr(&lookup(n)->array[idx].node, udb, &com); + udb_ptr_unlink(&rnode, udb); + udb_ptr_unlink(&com, udb); + /* n's string has become shorter */ + if(!udb_radarray_reduce_if_needed(udb, n)) { + /* ignore this, our tree has become inefficient */ + } + } + return 1; +} + +uint64_t* result_data = NULL; +udb_void udb_radix_insert(udb_base* udb, udb_ptr* rt, uint8_t* k, + udb_radstrlen_t len, udb_ptr* elem, udb_ptr* result) +{ + udb_void ret; + udb_ptr add, n; /* type udb_radnode_d */ + udb_radstrlen_t pos = 0; + /* create new element to add */ + if(!udb_ptr_alloc_space(&add, udb, udb_chunk_type_radnode, + sizeof(struct udb_radnode_d))) { + return 0; /* alloc failure */ + } + memset(UDB_PTR(&add), 0, sizeof(struct udb_radnode_d)); + udb_rptr_set_ptr(&RADNODE(&add)->elem, udb, elem); + if(!udb_radnode_array_create(udb, &add)) 
{ + udb_ptr_free_space(&add, udb, sizeof(struct udb_radnode_d)); + return 0; /* alloc failure */ + } + udb_ptr_init(&n, udb); + result_data = &n.data; + + /* find out where to add it */ + if(!udb_radix_find_prefix_node(udb, rt, k, len, &n, &pos)) { + /* new root */ + assert(RADTREE(rt)->root.data == 0); + if(len == 0) { + udb_rptr_set_ptr(&RADTREE(rt)->root, udb, &add); + } else { + /* add a root to point to new node */ + udb_ptr_zero(&n, udb); + if(!udb_ptr_alloc_space(&n, udb, + udb_chunk_type_radnode, + sizeof(struct udb_radnode_d))) { + udb_radnode_delete(udb, &add); + udb_ptr_unlink(&n, udb); + return 0; /* alloc failure */ + } + memset(RADNODE(&n), 0, sizeof(struct udb_radnode_d)); + /* this creates the array lookup structure for n */ + if(!udb_radnode_array_space(udb, &n, k[0], len-1)) { + udb_radnode_delete(udb, &add); + udb_ptr_free_space(&n, udb, + sizeof(struct udb_radnode_d)); + return 0; /* alloc failure */ + } + udb_rptr_set_ptr(&RADNODE(&add)->parent, udb, &n); + RADNODE(&add)->pidx = 0; + udb_rptr_set_ptr(&lookup(&n)->array[0].node, udb, &add); + if(len > 1) { + udb_radsel_prefix_remainder(1, k, len, + lookup_string(&n, 0), + &lookup(&n)->array[0].len); + } + udb_rptr_set_ptr(&RADTREE(rt)->root, udb, &n); + } + } else if(pos == len) { + /* found an exact match */ + if(RADNODE(&n)->elem.data) { + /* already exists, failure */ + udb_radnode_delete(udb, &add); + udb_ptr_unlink(&n, udb); + return 0; + } + udb_rptr_set_ptr(&RADNODE(&n)->elem, udb, elem); + udb_radnode_delete(udb, &add); + udb_ptr_set_ptr(&add, udb, &n); + } else { + /* n is a node which can accomodate */ + uint8_t byte; + assert(pos < len); + byte = k[pos]; + + /* see if it falls outside of array */ + if(byte < RADNODE(&n)->offset || byte-RADNODE(&n)->offset >= + lookup(&n)->len) { + /* make space in the array for it; adjusts offset */ + if(!udb_radnode_array_space(udb, &n, byte, + len-(pos+1))) { + udb_radnode_delete(udb, &add); + udb_ptr_unlink(&n, udb); + return 0; + } + assert(byte>=RADNODE(&n)->offset && byte-RADNODE(&n)-> + offset<lookup(&n)->len); + byte -= RADNODE(&n)->offset; + /* see if more prefix needs to be split off */ + if(pos+1 < len) { + udb_radsel_str_create(lookup_string(&n, byte), + &lookup(&n)->array[byte].len, + k, pos+1, len); + } + /* insert the new node in the new bucket */ + udb_rptr_set_ptr(&RADNODE(&add)->parent, udb, &n); + RADNODE(&add)->pidx = byte; + udb_rptr_set_ptr(&lookup(&n)->array[byte].node, udb, + &add); + /* so a bucket exists and byte falls in it */ + } else if(lookup(&n)->array[byte - RADNODE(&n)->offset] + .node.data == 0) { + /* use existing bucket */ + byte -= RADNODE(&n)->offset; + if(pos+1 < len) { + /* make space and split off more prefix */ + if(!udb_radnode_str_space(udb, &n, + len-(pos+1))) { + udb_radnode_delete(udb, &add); + udb_ptr_unlink(&n, udb); + return 0; + } + udb_radsel_str_create(lookup_string(&n, byte), + &lookup(&n)->array[byte].len, + k, pos+1, len); + } + /* insert the new node in the new bucket */ + udb_rptr_set_ptr(&RADNODE(&add)->parent, udb, &n); + RADNODE(&add)->pidx = byte; + udb_rptr_set_ptr(&lookup(&n)->array[byte].node, udb, + &add); + } else { + /* use bucket but it has a shared prefix, + * split that out and create a new intermediate + * node to split out between the two. 
+ * One of the two might exactmatch the new + * intermediate node */ + if(!udb_radsel_split(udb, &n, byte-RADNODE(&n)->offset, + k, pos+1, len, &add)) { + udb_radnode_delete(udb, &add); + udb_ptr_unlink(&n, udb); + return 0; + } + } + } + RADTREE(rt)->count ++; + ret = add.data; + udb_ptr_init(result, udb); + udb_ptr_set_ptr(result, udb, &add); + udb_ptr_unlink(&add, udb); + udb_ptr_unlink(&n, udb); + return ret; +} + +/** Cleanup node with one child, it is removed and joined into parent[x] str */ +static int +udb_radnode_cleanup_onechild(udb_base* udb, udb_ptr* n) +{ + udb_ptr par, child; + uint8_t pidx = RADNODE(n)->pidx; + radstrlen_t joinlen; + udb_ptr_new(&par, udb, &RADNODE(n)->parent); + udb_ptr_new(&child, udb, &lookup(n)->array[0].node); + + /* node had one child, merge them into the parent. */ + /* keep the child node, so its pointers stay valid. */ + + /* at parent, append child->str to array str */ + assert(pidx < lookup(&par)->len); + joinlen = lookup_len(&par, pidx) + lookup_len(n, 0) + 1; + /* make stringspace for the joined string */ + if(!udb_radnode_str_space(udb, &par, joinlen)) { + /* cleanup failed due to out of memory */ + /* the tree is inefficient, with node n still existing */ + udb_ptr_unlink(&par, udb); + udb_ptr_unlink(&child, udb); + udb_ptr_zero(n, udb); + return 0; + } + /* the string(par, pidx) is already there */ + /* the array lookup is gone, put its character in the lookup string*/ + lookup_string(&par, pidx)[lookup_len(&par, pidx)] = + RADNODE(&child)->pidx + RADNODE(n)->offset; + memmove(lookup_string(&par, pidx)+lookup_len(&par, pidx)+1, + lookup_string(n, 0), lookup_len(n, 0)); + lookup(&par)->array[pidx].len = joinlen; + /* and set the node to our child. */ + udb_rptr_set_ptr(&lookup(&par)->array[pidx].node, udb, &child); + udb_rptr_set_ptr(&RADNODE(&child)->parent, udb, &par); + RADNODE(&child)->pidx = pidx; + /* we are unlinked, delete our node */ + udb_radnode_delete(udb, n); + udb_ptr_unlink(&par, udb); + udb_ptr_unlink(&child, udb); + udb_ptr_zero(n, udb); + return 1; +} + +/** reduce the size of radarray, does a malloc */ +static int +udb_radarray_reduce(udb_base* udb, udb_ptr* n, uint16_t cap, + udb_radstrlen_t strcap) +{ + udb_ptr a; + unsigned i; + assert(lookup(n)->len <= cap); + assert(cap <= lookup(n)->capacity); + assert(strcap <= lookup(n)->str_cap); + if(!udb_ptr_alloc_space(&a, udb, udb_chunk_type_radarray, + size_of_lookup_needed(cap, strcap))) + return 0; + memset(RADARRAY(&a), 0, size_of_lookup_needed(cap, strcap)); + memcpy(RADARRAY(&a), lookup(n), sizeof(struct udb_radarray_d)); + RADARRAY(&a)->capacity = cap; + RADARRAY(&a)->str_cap = strcap; + for(i=0; i<lookup(n)->len; i++) { + udb_rel_ptr_init(&RADARRAY(&a)->array[i].node); + udb_rptr_set_rptr(&RADARRAY(&a)->array[i].node, udb, + &lookup(n)->array[i].node); + RADARRAY(&a)->array[i].len = lookup_len(n, i); + memmove(((uint8_t*)(&RADARRAY(&a)->array[cap]))+i*strcap, + lookup_string(n, i), lookup_len(n, i)); + } + udb_radarray_zero_ptrs(udb, n); + udb_rel_ptr_free_space(&RADNODE(n)->lookup, udb, size_of_lookup(n)); + udb_rptr_set_ptr(&RADNODE(n)->lookup, udb, &a); + udb_ptr_unlink(&a, udb); + return 1; +} + +/** find the max stringlength in the array */ +static udb_radstrlen_t udb_radarray_max_len(udb_ptr* n) +{ + unsigned i; + udb_radstrlen_t maxlen = 0; + for(i=0; i<lookup(n)->len; i++) { + if(lookup(n)->array[i].node.data && + lookup(n)->array[i].len > maxlen) + maxlen = lookup(n)->array[i].len; + } + return maxlen; +} + +/** see if radarray can be reduced (by a factor of 
two) */ +static int +udb_radarray_reduce_if_needed(udb_base* udb, udb_ptr* n) +{ + udb_radstrlen_t maxlen = udb_radarray_max_len(n); + if((lookup(n)->len <= lookup(n)->capacity/2 || lookup(n)->len == 0 + || maxlen <= lookup(n)->str_cap/2 || maxlen == 0) && + (lookup(n)->len != lookup(n)->capacity || + lookup(n)->str_cap != maxlen)) + return udb_radarray_reduce(udb, n, lookup(n)->len, maxlen); + return 1; +} + +static int +udb_radnode_array_clean_all(udb_base* udb, udb_ptr* n) +{ + RADNODE(n)->offset = 0; + lookup(n)->len = 0; + /* reallocate lookup to a smaller capacity structure */ + return udb_radarray_reduce(udb, n, 0, 0); +} + +/** remove NULL nodes from front of array */ +static int +udb_radnode_array_clean_front(udb_base* udb, udb_ptr* n) +{ + /* move them up and adjust offset */ + unsigned idx, shuf = 0; + /* remove until a nonNULL entry */ + while(shuf < lookup(n)->len && lookup(n)->array[shuf].node.data == 0) + shuf++; + if(shuf == 0) + return 1; + if(shuf == lookup(n)->len) { + /* the array is empty, the tree is inefficient */ + return udb_radnode_array_clean_all(udb, n); + } + assert(shuf < lookup(n)->len); + assert((int)shuf <= 255-(int)RADNODE(n)->offset); + /* move them */ + for(idx=0; idx<lookup(n)->len-shuf; idx++) { + udb_rptr_set_rptr(&lookup(n)->array[idx].node, udb, + &lookup(n)->array[shuf+idx].node); + lookup(n)->array[idx].len = lookup_len(n, shuf+idx); + memmove(lookup_string(n, idx), lookup_string(n, shuf+idx), + lookup(n)->array[idx].len); + } + /* zero the to-be-unused entries */ + for(idx=lookup(n)->len-shuf; idx<lookup(n)->len; idx++) { + udb_rptr_zero(&lookup(n)->array[idx].node, udb); + memset(lookup_string(n, idx), 0, lookup(n)->array[idx].len); + lookup(n)->array[idx].len = 0; + } + RADNODE(n)->offset += shuf; + lookup(n)->len -= shuf; + for(idx=0; idx<lookup(n)->len; idx++) + if(lookup(n)->array[idx].node.data) + lookup_node(n, idx)->pidx = idx; + + /* see if capacity has to shrink */ + return udb_radarray_reduce_if_needed(udb, n); +} + +/** remove NULL nodes from end of array */ +static int +udb_radnode_array_clean_end(udb_base* udb, udb_ptr* n) +{ + /* shorten it */ + unsigned shuf = 0; + /* remove until a nonNULL entry */ + /* remove until a nonNULL entry */ + while(shuf < lookup(n)->len && lookup(n)->array[lookup(n)->len-1-shuf] + .node.data == 0) + shuf++; + if(shuf == 0) + return 1; + if(shuf == lookup(n)->len) { + /* the array is empty, the tree is inefficient */ + return udb_radnode_array_clean_all(udb, n); + } + assert(shuf < lookup(n)->len); + lookup(n)->len -= shuf; + /* array elements can stay where they are */ + /* see if capacity has to shrink */ + return udb_radarray_reduce_if_needed(udb, n); +} + +/** clean up radnode leaf, where we know it has a parent */ +static int +udb_radnode_cleanup_leaf(udb_base* udb, udb_ptr* n, udb_ptr* par) +{ + uint8_t pidx; + /* node was a leaf */ + + /* delete leaf node, but store parent+idx */ + pidx = RADNODE(n)->pidx; + assert(pidx < lookup(par)->len); + + /** set parent ptr to this node to NULL before deleting the node, + * because otherwise ptrlinks fail */ + udb_rptr_zero(&lookup(par)->array[pidx].node, udb); + + udb_radnode_delete(udb, n); + + /* set parent+idx entry to NULL str and node.*/ + lookup(par)->array[pidx].len = 0; + + /* see if par offset or len must be adjusted */ + if(lookup(par)->len == 1) { + /* removed final element from array */ + if(!udb_radnode_array_clean_all(udb, par)) + return 0; + } else if(pidx == 0) { + /* removed first element from array */ + 
if(!udb_radnode_array_clean_front(udb, par)) + return 0; + } else if(pidx == lookup(par)->len-1) { + /* removed last element from array */ + if(!udb_radnode_array_clean_end(udb, par)) + return 0; + } + return 1; +} + +/** + * Cleanup a radix node that was made smaller, see if it can + * be merged with others. + * @param udb: the udb + * @param rt: tree to remove root if needed. + * @param n: node to cleanup + * @return false on alloc failure. + */ +static int +udb_radnode_cleanup(udb_base* udb, udb_ptr* rt, udb_ptr* n) +{ + while(!udb_ptr_is_null(n)) { + if(RADNODE(n)->elem.data) { + /* see if if needs to be reduced in stringsize */ + if(!udb_radarray_reduce_if_needed(udb, n)) { + udb_ptr_zero(n, udb); + return 0; + } + /* cannot delete node with a data element */ + udb_ptr_zero(n, udb); + return 1; + } else if(lookup(n)->len == 1 && RADNODE(n)->parent.data) { + return udb_radnode_cleanup_onechild(udb, n); + } else if(lookup(n)->len == 0) { + udb_ptr par; + if(!RADNODE(n)->parent.data) { + /* root deleted */ + udb_rptr_zero(&RADTREE(rt)->root, udb); + udb_radnode_delete(udb, n); + return 1; + } + udb_ptr_new(&par, udb, &RADNODE(n)->parent); + /* remove and delete the leaf node */ + if(!udb_radnode_cleanup_leaf(udb, n, &par)) { + udb_ptr_unlink(&par, udb); + udb_ptr_zero(n, udb); + return 0; + } + /* see if parent can now be cleaned up */ + udb_ptr_set_ptr(n, udb, &par); + udb_ptr_unlink(&par, udb); + } else { + /* see if if needs to be reduced in stringsize */ + if(!udb_radarray_reduce_if_needed(udb, n)) { + udb_ptr_zero(n, udb); + return 0; + } + /* node cannot be cleaned up */ + udb_ptr_zero(n, udb); + return 1; + } + } + /* ENOTREACH */ + return 1; +} + +void udb_radix_delete(udb_base* udb, udb_ptr* rt, udb_ptr* n) +{ + if(udb_ptr_is_null(n)) + return; + udb_rptr_zero(&RADNODE(n)->elem, udb); + RADTREE(rt)->count --; + if(!udb_radnode_cleanup(udb, rt, n)) { + /* out of memory in cleanup. the elem ptr is NULL, but + * the radix tree could be inefficient. */ + } +} + +udb_void udb_radix_search(udb_ptr* rt, uint8_t* k, udb_radstrlen_t len) +{ + /* since we only perform reads, and no udb_mallocs or udb_frees + * we know the pointers stay the same */ + struct udb_radnode_d* n; + udb_radstrlen_t pos = 0; + uint8_t byte; + void* base = *rt->base; + + n = (struct udb_radnode_d*)UDB_REL(base, RADTREE(rt)->root.data); +#define NARRAY(n) ((struct udb_radarray_d*)UDB_REL(base, n->lookup.data)) +#define NSTR(n, byte) (((uint8_t*)(&NARRAY(n)->array[NARRAY(n)->capacity]))+byte*NARRAY(n)->str_cap) + while(n != *rt->base) { + if(pos == len) + return UDB_SYSTOREL(*rt->base, n); + byte = k[pos]; + if(byte < n->offset) + return 0; + byte -= n->offset; + if(byte >= NARRAY(n)->len) + return 0; + pos++; + if(NARRAY(n)->array[byte].len != 0) { + /* must match additional string */ + if(pos+NARRAY(n)->array[byte].len > len) + return 0; /* no match */ + if(memcmp(&k[pos], NSTR(n, byte), + NARRAY(n)->array[byte].len) != 0) + return 0; /* no match */ + pos += NARRAY(n)->array[byte].len; + } + n = (struct udb_radnode_d*)UDB_REL(base, + NARRAY(n)->array[byte].node.data); + } + return 0; +} + +/** go to last elem-containing node in this subtree (excl self) */ +static void +udb_radnode_last_in_subtree(udb_base* udb, udb_ptr* n) +{ + int idx; + /* try last entry in array first */ + for(idx=((int)lookup(n)->len)-1; idx >= 0; idx--) { + if(lookup(n)->array[idx].node.data) { + udb_ptr s; + udb_ptr_init(&s, udb); + udb_ptr_set_rptr(&s, udb, &lookup(n)->array[idx].node); + /* does it have entries in its subtrees? 
*/ + if(lookup(&s)->len > 0) { + udb_radnode_last_in_subtree(udb, &s); + if(!udb_ptr_is_null(&s)) { + udb_ptr_set_ptr(n, udb, &s); + udb_ptr_unlink(&s, udb); + return; + } + } + udb_ptr_set_rptr(&s, udb, &lookup(n)->array[idx].node); + /* no, does it have an entry itself? */ + if(RADNODE(&s)->elem.data) { + udb_ptr_set_ptr(n, udb, &s); + udb_ptr_unlink(&s, udb); + return; + } + udb_ptr_unlink(&s, udb); + } + } + udb_ptr_zero(n, udb); +} + +/** last in subtree, incl self */ +static void +udb_radnode_last_in_subtree_incl_self(udb_base* udb, udb_ptr* n) +{ + udb_ptr self; + udb_ptr_init(&self, udb); + udb_ptr_set_ptr(&self, udb, n); + udb_radnode_last_in_subtree(udb, n); + if(!udb_ptr_is_null(n)) { + udb_ptr_unlink(&self, udb); + return; + } + if(RADNODE(&self)->elem.data) { + udb_ptr_set_ptr(n, udb, &self); + udb_ptr_unlink(&self, udb); + return; + } + udb_ptr_zero(n, udb); + udb_ptr_unlink(&self, udb); +} + +/** return first elem-containing node in this subtree (excl self) */ +static void +udb_radnode_first_in_subtree(udb_base* udb, udb_ptr* n) +{ + unsigned idx; + /* try every subnode */ + for(idx=0; idx<lookup(n)->len; idx++) { + if(lookup(n)->array[idx].node.data) { + udb_ptr s; + udb_ptr_init(&s, udb); + udb_ptr_set_rptr(&s, udb, &lookup(n)->array[idx].node); + /* does it have elem itself? */ + if(RADNODE(&s)->elem.data) { + udb_ptr_set_ptr(n, udb, &s); + udb_ptr_unlink(&s, udb); + return; + } + /* try its subtrees */ + udb_radnode_first_in_subtree(udb, &s); + if(!udb_ptr_is_null(&s)) { + udb_ptr_set_ptr(n, udb, &s); + udb_ptr_unlink(&s, udb); + return; + } + + } + } + udb_ptr_zero(n, udb); +} + +/** Find an entry in arrays from idx-1 to 0 */ +static void +udb_radnode_find_prev_from_idx(udb_base* udb, udb_ptr* n, unsigned from) +{ + unsigned idx = from; + while(idx > 0) { + idx --; + if(lookup(n)->array[idx].node.data) { + udb_ptr_set_rptr(n, udb, &lookup(n)->array[idx].node); + udb_radnode_last_in_subtree_incl_self(udb, n); + if(!udb_ptr_is_null(n)) + return; + } + } + udb_ptr_zero(n, udb); +} + +/** return self or a previous element */ +static int udb_ret_self_or_prev(udb_base* udb, udb_ptr* n, udb_ptr* result) +{ + if(RADNODE(n)->elem.data) { + udb_ptr_set_ptr(result, udb, n); + } else { + udb_ptr_set_ptr(result, udb, n); + udb_radix_prev(udb, result); + } + udb_ptr_unlink(n, udb); + return 0; +} + + +int udb_radix_find_less_equal(udb_base* udb, udb_ptr* rt, uint8_t* k, + udb_radstrlen_t len, udb_ptr* result) +{ + udb_ptr n; + udb_radstrlen_t pos = 0; + uint8_t byte; + int r; + /* set result to NULL */ + udb_ptr_init(result, udb); + if(RADTREE(rt)->count == 0) { + /* empty tree */ + return 0; + } + udb_ptr_new(&n, udb, &RADTREE(rt)->root); + while(pos < len) { + byte = k[pos]; + if(byte < RADNODE(&n)->offset) { + /* so the previous is the element itself */ + /* or something before this element */ + return udb_ret_self_or_prev(udb, &n, result); + } + byte -= RADNODE(&n)->offset; + if(byte >= lookup(&n)->len) { + /* so, the previous is the last of array, or itself */ + /* or something before this element */ + udb_ptr_set_ptr(result, udb, &n); + udb_radnode_last_in_subtree_incl_self(udb, result); + if(udb_ptr_is_null(result)) { + udb_ptr_set_ptr(result, udb, &n); + udb_radix_prev(udb, result); + } + goto done_fail; + } + pos++; + if(!lookup(&n)->array[byte].node.data) { + /* no match */ + /* Find an entry in arrays from byte-1 to 0 */ + udb_ptr_set_ptr(result, udb, &n); + udb_radnode_find_prev_from_idx(udb, result, byte); + if(!udb_ptr_is_null(result)) + goto done_fail; + /* this entry 
or something before it */ + udb_ptr_zero(result, udb); + return udb_ret_self_or_prev(udb, &n, result); + } + if(lookup_len(&n, byte) != 0) { + /* must match additional string */ + if(pos+lookup_len(&n, byte) > len) { + /* the additional string is longer than key*/ + if( (r=memcmp(&k[pos], lookup_string(&n, byte), + len-pos)) <= 0) { + /* and the key is before this node */ + udb_ptr_set_rptr(result, udb, + &lookup(&n)->array[byte].node); + udb_radix_prev(udb, result); + } else { + /* the key is after the additional + * string, thus everything in that + * subtree is smaller. */ + udb_ptr_set_rptr(result, udb, + &lookup(&n)->array[byte].node); + udb_radnode_last_in_subtree_incl_self(udb, result); + /* if somehow that is NULL, + * then we have an inefficient tree: + * byte+1 is larger than us, so find + * something in byte-1 and before */ + if(udb_ptr_is_null(result)) { + udb_ptr_set_rptr(result, udb, + &lookup(&n)->array[byte].node); + udb_radix_prev(udb, result); + } + } + goto done_fail; /* no match */ + } + if( (r=memcmp(&k[pos], lookup_string(&n, byte), + lookup_len(&n, byte))) < 0) { + udb_ptr_set_rptr(result, udb, + &lookup(&n)->array[byte].node); + udb_radix_prev(udb, result); + goto done_fail; /* no match */ + } else if(r > 0) { + /* the key is larger than the additional + * string, thus everything in that subtree + * is smaller */ + udb_ptr_set_rptr(result, udb, + &lookup(&n)->array[byte].node); + udb_radnode_last_in_subtree_incl_self(udb, result); + /* if we have an inefficient tree */ + if(udb_ptr_is_null(result)) { + udb_ptr_set_rptr(result, udb, + &lookup(&n)->array[byte].node); + udb_radix_prev(udb, result); + } + goto done_fail; /* no match */ + } + pos += lookup_len(&n, byte); + } + udb_ptr_set_rptr(&n, udb, &lookup(&n)->array[byte].node); + } + if(RADNODE(&n)->elem.data) { + /* exact match */ + udb_ptr_set_ptr(result, udb, &n); + udb_ptr_unlink(&n, udb); + return 1; + } + /* there is a node which is an exact match, but it has no element */ + udb_ptr_set_ptr(result, udb, &n); + udb_radix_prev(udb, result); +done_fail: + udb_ptr_unlink(&n, udb); + return 0; +} + +void udb_radix_first(udb_base* udb, udb_ptr* rt, udb_ptr* p) +{ + udb_ptr_init(p, udb); + if(!rt || udb_ptr_is_null(rt) || RADTREE(rt)->count == 0) + return; + udb_ptr_set_rptr(p, udb, &RADTREE(rt)->root); + if(RADNODE(p)->elem.data) + return; + udb_radix_next(udb, p); +} + +void udb_radix_last(udb_base* udb, udb_ptr* rt, udb_ptr* p) +{ + udb_ptr_init(p, udb); + if(!rt || udb_ptr_is_null(rt) || RADTREE(rt)->count == 0) + return; + udb_ptr_set_rptr(p, udb, &RADTREE(rt)->root); + udb_radnode_last_in_subtree_incl_self(udb, p); +} + +void udb_radix_next(udb_base* udb, udb_ptr* n) +{ + udb_ptr s; + udb_ptr_init(&s, udb); + if(lookup(n)->len) { + /* go down */ + udb_ptr_set_ptr(&s, udb, n); + udb_radnode_first_in_subtree(udb, &s); + if(!udb_ptr_is_null(&s)) { + udb_ptr_set_ptr(n, udb, &s); + udb_ptr_unlink(&s, udb); + return; + } + } + /* go up - the parent->elem is not useful, because it is before us */ + while(RADNODE(n)->parent.data) { + unsigned idx = RADNODE(n)->pidx; + udb_ptr_set_rptr(n, udb, &RADNODE(n)->parent); + idx++; + for(; idx < lookup(n)->len; idx++) { + /* go down the next branch */ + if(lookup(n)->array[idx].node.data) { + udb_ptr_set_rptr(&s, udb, + &lookup(n)->array[idx].node); + /* node itself */ + if(RADNODE(&s)->elem.data) { + udb_ptr_set_ptr(n, udb, &s); + udb_ptr_unlink(&s, udb); + return; + } + /* or subtree */ + udb_radnode_first_in_subtree(udb, &s); + if(!udb_ptr_is_null(&s)) { + 
udb_ptr_set_ptr(n, udb, &s); + udb_ptr_unlink(&s, udb); + return; + } + } + } + } + udb_ptr_unlink(&s, udb); + udb_ptr_zero(n, udb); +} + +void udb_radix_prev(udb_base* udb, udb_ptr* n) +{ + /* must go up, since all array nodes are after this node */ + while(RADNODE(n)->parent.data) { + uint8_t idx = RADNODE(n)->pidx; + udb_ptr s; + udb_ptr_set_rptr(n, udb, &RADNODE(n)->parent); + assert(lookup(n)->len > 0); /* since we are a child */ + /* see if there are elements in previous branches there */ + udb_ptr_init(&s, udb); + udb_ptr_set_ptr(&s, udb, n); + udb_radnode_find_prev_from_idx(udb, &s, idx); + if(!udb_ptr_is_null(&s)) { + udb_ptr_set_ptr(n, udb, &s); + udb_ptr_unlink(&s, udb); + return; + } + udb_ptr_unlink(&s, udb); + /* the current node is before the array */ + if(RADNODE(n)->elem.data) + return; + } + udb_ptr_zero(n, udb); +} + +udb_void udb_radname_insert(udb_base* udb, udb_ptr* rt, const uint8_t* dname, + size_t dlen, udb_ptr* elem, udb_ptr* result) +{ + uint8_t k[300]; + radstrlen_t klen = (radstrlen_t)sizeof(k); + radname_d2r(k, &klen, dname, dlen); + return udb_radix_insert(udb, rt, k, klen, elem, result); +} + +int udb_radname_search(udb_base* udb, udb_ptr* rt, const uint8_t* dname, + size_t dlen, udb_ptr* result) +{ + udb_void r; + uint8_t k[300]; + radstrlen_t klen = (radstrlen_t)sizeof(k); + radname_d2r(k, &klen, dname, dlen); + r = udb_radix_search(rt, k, klen); + udb_ptr_init(result, udb); + udb_ptr_set(result, udb, r); + return (r != 0); +} + +void udb_radix_tree_walk_chunk(void* base, void* d, uint64_t s, + udb_walk_relptr_cb* cb, void* arg) +{ + struct udb_radtree_d* p = (struct udb_radtree_d*)d; + assert(s >= sizeof(struct udb_radtree_d)); + (void)s; + (*cb)(base, &p->root, arg); +} + +void udb_radix_node_walk_chunk(void* base, void* d, uint64_t s, + udb_walk_relptr_cb* cb, void* arg) +{ + struct udb_radnode_d* p = (struct udb_radnode_d*)d; + assert(s >= sizeof(struct udb_radnode_d)); + (void)s; + (*cb)(base, &p->elem, arg); + (*cb)(base, &p->parent, arg); + (*cb)(base, &p->lookup, arg); +} + +void udb_radix_array_walk_chunk(void* base, void* d, uint64_t s, + udb_walk_relptr_cb* cb, void* arg) +{ + struct udb_radarray_d* p = (struct udb_radarray_d*)d; + unsigned i; + assert(s >= sizeof(struct udb_radarray_d)+ + p->capacity*(sizeof(struct udb_radsel_d)+p->str_cap)); + (void)s; + for(i=0; i<p->len; i++) { + (*cb)(base, &p->array[i].node, arg); + } +} diff --git a/usr.sbin/nsd/udbradtree.h b/usr.sbin/nsd/udbradtree.h new file mode 100644 index 00000000000..6f4bc735819 --- /dev/null +++ b/usr.sbin/nsd/udbradtree.h @@ -0,0 +1,245 @@ +/* + * udbradtree -- radix tree for binary strings for in udb file. + * + * Copyright (c) 2011, NLnet Labs. See LICENSE for license. + */ +#ifndef UDB_RADTREE_H +#define UDB_RADTREE_H +#include "udb.h" +struct udb_radnode; + +/** length of the binary string */ +typedef uint16_t udb_radstrlen_t; + +/** + * The radix tree + * + * The elements are stored based on binary strings(0-255) of a given length. + * They are sorted, a prefix is sorted before its suffixes. + * If you want to know the key string, you should store it yourself, the + * tree stores it in the parts necessary for lookup. + * For binary strings for domain names see the radname routines. + * + * This is the tree on disk representation. It has _d suffix in the name + * to help delineate disk structures from normal structures. 
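/*
 * Illustrative sketch (not part of the imported NSD sources): iterating over
 * every element of the on-disk radix tree described above, using the
 * first/next routines and the RADNODE accessor. Assumes a valid udb_base*
 * and a udb_ptr to a udb_radtree_d chunk; mirrors the walk loop suggested in
 * the header and the loop used by udb_dns_deinit_file below.
 */
#include "udbradtree.h"

static void
example_walk_radtree(udb_base* udb, udb_ptr* rt)
{
	udb_ptr n;
	for(udb_radix_first(udb, rt, &n); n.data; udb_radix_next(udb, &n)) {
		/* RADNODE(&n)->elem refers to the element stored for this key */
	}
	udb_ptr_unlink(&n, udb); /* drop the reference left by the iteration */
}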
+ */ +struct udb_radtree_d { + /** root node in tree, to udb_radnode_d */ + struct udb_rel_ptr root; + /** count of number of elements */ + uint64_t count; +}; + +/** + * A radix tree lookup node. It is stored on disk, and the lookup array + * is allocated. + */ +struct udb_radnode_d { + /** data element associated with the binary string up to this node */ + struct udb_rel_ptr elem; + /** parent node (NULL for the root), to udb_radnode_d */ + struct udb_rel_ptr parent; + /** the array structure, for lookup by [byte-offset]. udb_radarray_d */ + struct udb_rel_ptr lookup; + /** index in the parent lookup array */ + uint8_t pidx; + /** offset of the lookup array, add to [i] for lookups */ + uint8_t offset; +}; + +/** + * radix select edge in array + * The string for this element is the Nth string in the stringarray. + */ +struct udb_radsel_d { + /** length of the additional string for this edge, + * additional string after the selection-byte for this edge.*/ + udb_radstrlen_t len; + /** padding for non64bit compilers to 64bit boundaries, to make + * the udb file more portable, without this the file would work + * on the system it is created on (which is what we promise), but + * with this, you have a chance of it working on other platforms */ + uint16_t padding16; + uint32_t padding32; + /** node that deals with byte+str, to udb_radnode_d */ + struct udb_rel_ptr node; +}; + +/** + * Array of radsel elements. + * This is the header, the array is allocated contiguously behind it. + * The strings (often very short) are allocated behind the array. + * All strings are given the same amount of space (str_cap), + * so there is capacity*str_cap bytes at the end. + */ +struct udb_radarray_d { + /** length of the lookup array */ + uint16_t len; + /** capacity of the lookup array (can be larger than length) */ + uint16_t capacity; + /** space capacity of for every string */ + udb_radstrlen_t str_cap; + /** padding to 64bit alignment, just in case compiler goes mad */ + uint16_t padding; + /** the elements (allocated contiguously after this structure) */ + struct udb_radsel_d array[0]; +}; + +/** + * Create new radix tree on udb storage + * @param udb: the udb to allocate space on. + * @param ptr: ptr to the udbradtree is returned here. pass uninitialised. + * type is udb_radtree_d. + * @return 0 on alloc failure. + */ +int udb_radix_tree_create(udb_base* udb, udb_ptr* ptr); + +/** + * Delete intermediate nodes from radix tree + * @param udb: the udb. + * @param rt: radix tree to be cleared. type udb_radtree_d. + */ +void udb_radix_tree_clear(udb_base* udb, udb_ptr* rt); + +/** + * Delete radix tree. + * You must have deleted the elements, this deletes the nodes. + * @param udb: the udb. + * @param rt: radix tree to be deleted. type udb_radtree_d. + */ +void udb_radix_tree_delete(udb_base* udb, udb_ptr* rt); + +/** + * Insert element into radix tree. + * @param udb: the udb. + * @param rt: the radix tree, type udb_radtree_d. + * @param key: key string. + * @param len: length of key. + * @param elem: pointer to element data, on the udb store. + * @param result: the inserted node is set to this value. Pass uninited. + Not set if the routine fails. + * @return NULL on failure - out of memory. + * NULL on failure - duplicate entry. + * On success the new radix node for this element (udb_radnode_d). + */ +udb_void udb_radix_insert(udb_base* udb, udb_ptr* rt, uint8_t* k, + udb_radstrlen_t len, udb_ptr* elem, udb_ptr* result); + +/** + * Delete element from radix tree. + * @param udb: the udb. 
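/*
 * Illustrative sketch (not part of the imported NSD sources): the chunk size
 * implied by the udb_radarray_d layout above -- the radsel elements and the
 * per-edge strings (str_cap bytes each) are stored contiguously behind the
 * header, as also asserted in udb_radix_array_walk_chunk.
 */
#include "udbradtree.h"

static size_t
example_radarray_chunk_size(uint16_t capacity, udb_radstrlen_t str_cap)
{
	/* header + capacity elements + capacity strings of str_cap bytes */
	return sizeof(struct udb_radarray_d)
		+ (size_t)capacity * (sizeof(struct udb_radsel_d) + str_cap);
}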
+ * @param rt: the radix tree. type udb_radtree_d + * @param n: radix node for that element. type udb_radnode_d + * if NULL, nothing is deleted. + */ +void udb_radix_delete(udb_base* udb, udb_ptr* rt, udb_ptr* n); + +/** + * Find radix element in tree. + * @param rt: the radix tree, type udb_radtree_d. + * @param key: key string. + * @param len: length of key. + * @return the radix node or NULL if not found. type udb_radnode_d + */ +udb_void udb_radix_search(udb_ptr* rt, uint8_t* k, + udb_radstrlen_t len); + +/** + * Find radix element in tree, and if not found, find the closest smaller or + * equal element in the tree. + * @param udb: the udb. + * @param rt: the radix tree, type udb_radtree_d. + * @param key: key string. + * @param len: length of key. + * @param result: returns the radix node or closest match (NULL if key is + * smaller than the smallest key in the tree). type udb_radnode_d. + * you can pass an uninitialized ptr, an unlinked or a zeroed one. + * @return true if exact match, false if no match. + */ +int udb_radix_find_less_equal(udb_base* udb, udb_ptr* rt, uint8_t* k, + udb_radstrlen_t len, udb_ptr* result); + +/** + * Return the first (smallest) element in the tree. + * @param udb: the udb. + * @param rt: the radix tree, type udb_radtree_d. + * @param p: set to the first node in the tree, or NULL if none. + * type udb_radnode_d. + * pass uninited, zero or unlinked udb_ptr. + */ +void udb_radix_first(udb_base* udb, udb_ptr* rt, udb_ptr* p); + +/** + * Return the last (largest) element in the tree. + * @param udb: the udb. + * @param rt: the radix tree, type udb_radtree_d. + * @param p: last node or NULL if none, type udb_radnode_d. + * pass uninited, zero or unlinked udb_ptr. + */ +void udb_radix_last(udb_base* udb, udb_ptr* rt, udb_ptr* p); + +/** + * Return the next element. + * @param udb: the udb. + * @param n: adjusted to the next element, or NULL if none. type udb_radnode_d. + */ +void udb_radix_next(udb_base* udb, udb_ptr* n); + +/** + * Return the previous element. + * @param udb: the udb. + * @param n: adjusted to the prev node or NULL if none. type udb_radnode_d. + */ +void udb_radix_prev(udb_base* udb, udb_ptr* n); + +/* + * Perform a walk through all elements of the tree. + * node: variable of type struct radnode*. + * tree: pointer to the tree. + * for(udb_radix_first(tree, node); node->data; udb_radix_next(node)) +*/ + +/** for use in udb-walkfunc, walks relptrs in udb_chunk_type_radtree */ +void udb_radix_tree_walk_chunk(void* base, void* d, uint64_t s, + udb_walk_relptr_cb* cb, void* arg); + +/** for use in udb-walkfunc, walks relptrs in udb_chunk_type_radnode */ +void udb_radix_node_walk_chunk(void* base, void* d, uint64_t s, + udb_walk_relptr_cb* cb, void* arg); + +/** for use in udb-walkfunc, walks relptrs in udb_chunk_type_radarray */ +void udb_radix_array_walk_chunk(void* base, void* d, uint64_t s, + udb_walk_relptr_cb* cb, void* arg); + +/** get the memory used by the lookup structure for a radnode */ +size_t size_of_lookup_ext(udb_ptr* node); + +/** insert radtree element, key is a domain name + * @param udb: udb. + * @param rt: the tree. + * @param dname: domain name in uncompressed wireformat. + * @param dlen: length of k. + * @param elem: element to store + * @param result: the inserted node is set to this value. Pass uninited. + Not set if the routine fails. 
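/*
 * Illustrative sketch (hypothetical caller, not part of the import):
 * distinguishing the three documented outcomes of udb_radix_find_less_equal.
 * udb, rt and the key are assumed to be prepared by the caller.
 */
#include "udbradtree.h"

static void
example_find_less_equal(udb_base* udb, udb_ptr* rt, uint8_t* key,
	udb_radstrlen_t keylen)
{
	udb_ptr res;
	if(udb_radix_find_less_equal(udb, rt, key, keylen, &res)) {
		/* exact match: RADNODE(&res)->elem is the element */
	} else if(!udb_ptr_is_null(&res)) {
		/* no exact match: res is the closest smaller element */
	} else {
		/* the key sorts before everything in the tree */
	}
	udb_ptr_unlink(&res, udb);
}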
+ * @return 0 on failure + */ +udb_void udb_radname_insert(udb_base* udb, udb_ptr* rt, const uint8_t* dname, + size_t dlen, udb_ptr* elem, udb_ptr* result); + +/** search for a radname element, key is a domain name. + * @param udb: udb + * @param rt: the tree + * @param dname: domain name in uncompressed wireformat. + * @param dlen: length of k. + * @param result: result ptr to store the node into. + * may be uninitialized. + * @return 0 if not found. + */ +int udb_radname_search(udb_base* udb, udb_ptr* rt, const uint8_t* dname, + size_t dlen, udb_ptr* result); + +#define RADNODE(ptr) ((struct udb_radnode_d*)UDB_PTR(ptr)) +#define RADTREE(ptr) ((struct udb_radtree_d*)UDB_PTR(ptr)) + +#endif /* UDB_RADTREE_H */ diff --git a/usr.sbin/nsd/udbzone.c b/usr.sbin/nsd/udbzone.c new file mode 100644 index 00000000000..bd5929b3929 --- /dev/null +++ b/usr.sbin/nsd/udbzone.c @@ -0,0 +1,786 @@ +/* + * udbzone -- store zone and rrset information in udb file. + * + * Copyright (c) 2011, NLnet Labs. See LICENSE for license. + */ +#include "config.h" +#include "udbzone.h" +#include "util.h" +#include "iterated_hash.h" +#include "dns.h" +#include "dname.h" +#include "difffile.h" +#include <string.h> + +/** delete the zone plain its own data */ +static void +udb_zone_delete_plain(udb_base* udb, udb_ptr* zone) +{ + udb_ptr dtree; + assert(udb_ptr_get_type(zone) == udb_chunk_type_zone); + udb_zone_clear(udb, zone); + udb_rptr_zero(&ZONE(zone)->node, udb); + udb_rptr_zero(&ZONE(zone)->nsec3param, udb); + udb_rptr_zero(&ZONE(zone)->log_str, udb); + udb_ptr_new(&dtree, udb, &ZONE(zone)->domains); + udb_rptr_zero(&ZONE(zone)->domains, udb); + udb_radix_tree_delete(udb, &dtree); + udb_ptr_free_space(zone, udb, + sizeof(struct zone_d)+ZONE(zone)->namelen); +} + +int +udb_dns_init_file(udb_base* udb) +{ + udb_ptr ztree; + if(!udb_radix_tree_create(udb, &ztree)) { + return 0; + } + udb_base_set_userdata(udb, ztree.data); + udb_ptr_unlink(&ztree, udb); + return 1; +} + +void +udb_dns_deinit_file(udb_base* udb) +{ + udb_ptr ztree; + udb_ptr z; + udb_ptr_new(&ztree, udb, udb_base_get_userdata(udb)); + if(udb_ptr_is_null(&ztree)) { + return; + } + assert(udb_ptr_get_type(&ztree) == udb_chunk_type_radtree); + /* delete all zones */ + for(udb_radix_first(udb, &ztree, &z); z.data; udb_radix_next(udb, &z)){ + udb_ptr zone; + udb_ptr_new(&zone, udb, &RADNODE(&z)->elem); + udb_rptr_zero(&RADNODE(&z)->elem, udb); + udb_zone_delete_plain(udb, &zone); + } + udb_ptr_unlink(&z, udb); + + udb_base_set_userdata(udb, 0); + udb_radix_tree_delete(udb, &ztree); +} + +int +udb_zone_create(udb_base* udb, udb_ptr* result, const uint8_t* dname, + size_t dlen) +{ + udb_ptr ztree, z, node, dtree; + udb_ptr_new(&ztree, udb, udb_base_get_userdata(udb)); + assert(udb_ptr_get_type(&ztree) == udb_chunk_type_radtree); + udb_ptr_init(result, udb); + if(udb_zone_search(udb, &z, dname, dlen)) { + udb_ptr_unlink(&ztree, udb); + udb_ptr_unlink(&z, udb); + /* duplicate */ + return 0; + } + if(!udb_ptr_alloc_space(&z, udb, udb_chunk_type_zone, + sizeof(struct zone_d)+dlen)) { + udb_ptr_unlink(&ztree, udb); + /* failed alloc */ + return 0; + } + /* init the zone object */ + udb_rel_ptr_init(&ZONE(&z)->node); + udb_rel_ptr_init(&ZONE(&z)->domains); + udb_rel_ptr_init(&ZONE(&z)->nsec3param); + udb_rel_ptr_init(&ZONE(&z)->log_str); + ZONE(&z)->rrset_count = 0; + ZONE(&z)->rr_count = 0; + ZONE(&z)->expired = 0; + ZONE(&z)->mtime = 0; + ZONE(&z)->namelen = dlen; + memmove(ZONE(&z)->name, dname, dlen); + if(!udb_radix_tree_create(udb, &dtree)) { + 
udb_ptr_free_space(&z, udb, sizeof(struct zone_d)+dlen); + udb_ptr_unlink(&ztree, udb); + /* failed alloc */ + return 0; + } + udb_rptr_set_ptr(&ZONE(&z)->domains, udb, &dtree); + + /* insert it */ + if(!udb_radname_insert(udb, &ztree, dname, dlen, &z, &node)) { + udb_ptr_free_space(&z, udb, sizeof(struct zone_d)+dlen); + udb_ptr_unlink(&ztree, udb); + udb_radix_tree_delete(udb, &dtree); + udb_ptr_unlink(&dtree, udb); + /* failed alloc */ + return 0; + } + udb_rptr_set_ptr(&ZONE(&z)->node, udb, &node); + udb_ptr_set_ptr(result, udb, &z); + udb_ptr_unlink(&z, udb); + udb_ptr_unlink(&dtree, udb); + udb_ptr_unlink(&ztree, udb); + udb_ptr_unlink(&node, udb); + return 1; +} + +/** delete an RR */ +static void +rr_delete(udb_base* udb, udb_ptr* rr) +{ + assert(udb_ptr_get_type(rr) == udb_chunk_type_rr); + udb_rptr_zero(&RR(rr)->next, udb); + udb_ptr_free_space(rr, udb, sizeof(struct rr_d)+RR(rr)->len); +} + +/** delete an rrset */ +static void +rrset_delete(udb_base* udb, udb_ptr* rrset) +{ + udb_ptr rr, n; + assert(udb_ptr_get_type(rrset) == udb_chunk_type_rrset); + + /* free RRs */ + udb_ptr_new(&rr, udb, &RRSET(rrset)->rrs); + udb_ptr_init(&n, udb); + udb_rptr_zero(&RRSET(rrset)->rrs, udb); + while(!udb_ptr_is_null(&rr)) { + udb_ptr_set_rptr(&n, udb, &RR(&rr)->next); + rr_delete(udb, &rr); + udb_ptr_set_ptr(&rr, udb, &n); + udb_ptr_zero(&n, udb); + } + udb_ptr_unlink(&n, udb); + udb_ptr_unlink(&rr, udb); + + udb_rptr_zero(&RRSET(rrset)->next, udb); + udb_ptr_free_space(rrset, udb, sizeof(struct rrset_d)); +} + +/** clear a domain of its rrsets, rrs */ +static void +domain_clear(udb_base* udb, udb_ptr* d) +{ + udb_ptr rrset, n; + assert(udb_ptr_get_type(d) == udb_chunk_type_domain); + udb_ptr_new(&rrset, udb, &DOMAIN(d)->rrsets); + udb_ptr_init(&n, udb); + udb_rptr_zero(&DOMAIN(d)->rrsets, udb); + while(!udb_ptr_is_null(&rrset)) { + udb_ptr_set_rptr(&n, udb, &RRSET(&rrset)->next); + rrset_delete(udb, &rrset); + udb_ptr_set_ptr(&rrset, udb, &n); + udb_ptr_zero(&n, udb); + } + udb_ptr_unlink(&n, udb); + udb_ptr_unlink(&rrset, udb); +} + +/** delete a domain and all its rrsets, rrs */ +static void +domain_delete(udb_base* udb, udb_ptr* d) +{ + domain_clear(udb, d); + udb_rptr_zero(&DOMAIN(d)->node, udb); + udb_ptr_free_space(d, udb, + sizeof(struct domain_d)+DOMAIN(d)->namelen); +} + +/** delete domain but also unlink from tree at zone */ +static void +domain_delete_unlink(udb_base* udb, udb_ptr* z, udb_ptr* d) +{ + udb_ptr dtree, n; + udb_ptr_new(&dtree, udb, &ZONE(z)->domains); + udb_ptr_new(&n, udb, &DOMAIN(d)->node); + udb_rptr_zero(&DOMAIN(d)->node, udb); + udb_radix_delete(udb, &dtree, &n); + udb_ptr_unlink(&dtree, udb); + udb_ptr_unlink(&n, udb); + domain_delete(udb, d); +} + +void +udb_zone_clear(udb_base* udb, udb_ptr* zone) +{ + udb_ptr dtree, d; + assert(udb_ptr_get_type(zone) == udb_chunk_type_zone); + udb_ptr_new(&dtree, udb, &ZONE(zone)->domains); + udb_rptr_zero(&ZONE(zone)->nsec3param, udb); + udb_zone_set_log_str(udb, zone, NULL); + + /* walk and delete all domains, rrsets, rrs, but keep tree */ + for(udb_radix_first(udb, &dtree, &d); d.data; udb_radix_next(udb, &d)){ + udb_ptr domain; + udb_ptr_new(&domain, udb, &RADNODE(&d)->elem); + udb_rptr_zero(&RADNODE(&d)->elem, udb); + domain_delete(udb, &domain); + } + udb_ptr_unlink(&d, udb); + udb_radix_tree_clear(udb, &dtree); + ZONE(zone)->rrset_count = 0; + ZONE(zone)->rr_count = 0; + ZONE(zone)->expired = 0; + ZONE(zone)->mtime = 0; + udb_ptr_unlink(&dtree, udb); +} + +void +udb_zone_delete(udb_base* udb, udb_ptr* zone) +{ + 
udb_ptr ztree, n; + udb_ptr_new(&ztree, udb, udb_base_get_userdata(udb)); + udb_ptr_new(&n, udb, &ZONE(zone)->node); + udb_rptr_zero(&ZONE(zone)->node, udb); + udb_radix_delete(udb, &ztree, &n); + udb_ptr_unlink(&ztree, udb); + udb_ptr_unlink(&n, udb); + udb_zone_delete_plain(udb, zone); +} + +int +udb_zone_search(udb_base* udb, udb_ptr* result, const uint8_t* dname, + size_t dname_len) +{ + udb_ptr ztree; + udb_ptr_new(&ztree, udb, udb_base_get_userdata(udb)); + assert(udb_ptr_get_type(&ztree) == udb_chunk_type_radtree); + if(udb_radname_search(udb, &ztree, dname, dname_len, result)) { + if(result->data) + udb_ptr_set_rptr(result, udb, &RADNODE(result)->elem); + udb_ptr_unlink(&ztree, udb); + return (result->data != 0); + } + udb_ptr_unlink(&ztree, udb); + return 0; +} + +uint64_t udb_zone_get_mtime(udb_base* udb, const uint8_t* dname, size_t dlen) +{ + udb_ptr z; + if(udb_zone_search(udb, &z, dname, dlen)) { + uint64_t t = ZONE(&z)->mtime; + udb_ptr_unlink(&z, udb); + return t; + } + return 0; +} + +void udb_zone_set_log_str(udb_base* udb, udb_ptr* zone, const char* str) +{ + /* delete original log str (if any) */ + if(ZONE(zone)->log_str.data) { + udb_ptr s; + size_t sz; + udb_ptr_new(&s, udb, &ZONE(zone)->log_str); + udb_rptr_zero(&ZONE(zone)->log_str, udb); + sz = strlen((char*)udb_ptr_data(&s))+1; + udb_ptr_free_space(&s, udb, sz); + } + + /* set new log str */ + if(str) { + udb_ptr s; + size_t sz = strlen(str)+1; + if(!udb_ptr_alloc_space(&s, udb, udb_chunk_type_data, sz)) { + return; /* failed to allocate log string */ + } + memmove(udb_ptr_data(&s), str, sz); + udb_rptr_set_ptr(&ZONE(zone)->log_str, udb, &s); + udb_ptr_unlink(&s, udb); + } +} + +#ifdef NSEC3 +/** select the nsec3param for nsec3 usage */ +static void +select_nsec3_param(udb_base* udb, udb_ptr* zone, udb_ptr* rrset) +{ + udb_ptr rr; + udb_ptr_new(&rr, udb, &RRSET(rrset)->rrs); + while(rr.data) { + if(RR(&rr)->len >= 5 && RR(&rr)->wire[0] == NSEC3_SHA1_HASH && + RR(&rr)->wire[1] == 0) { + udb_rptr_set_ptr(&ZONE(zone)->nsec3param, udb, &rr); + udb_ptr_unlink(&rr, udb); + return; + } + udb_ptr_set_rptr(&rr, udb, &RR(&rr)->next); + } + udb_ptr_unlink(&rr, udb); +} + +const char* +udb_nsec3param_string(udb_ptr* rr) +{ + /* max saltlenth plus first couple of numbers (3+1+5+1+3+1) */ + static char params[MAX_RDLENGTH*2+16]; + char* p; + assert(RR(rr)->len >= 5); + p = params + snprintf(params, sizeof(params), "%u %u %u ", + (unsigned)RR(rr)->wire[0], (unsigned)RR(rr)->wire[1], + (unsigned)read_uint16(&RR(rr)->wire[2])); + if(RR(rr)->wire[4] == 0) { + *p++ = '-'; + } else { + assert(RR(rr)->len >= 5+RR(rr)->wire[4]); + p += hex_ntop(&RR(rr)->wire[5], RR(rr)->wire[4], p, + sizeof(params)-strlen(params)-1); + } + *p = 0; + return params; +} + +/** look in zone for new selected nsec3param record from rrset */ +static void +zone_hash_nsec3param(udb_base* udb, udb_ptr* zone, udb_ptr* rrset) +{ + select_nsec3_param(udb, zone, rrset); + if(ZONE(zone)->nsec3param.data == 0) + return; + /* prettyprint the nsec3 parameters we are using */ + if(2 <= verbosity) { + udb_ptr par; + udb_ptr_new(&par, udb, &ZONE(zone)->nsec3param); + VERBOSITY(1, (LOG_INFO, "rehash of zone %s with parameters %s", + wiredname2str(ZONE(zone)->name), + udb_nsec3param_string(&par))); + udb_ptr_unlink(&par, udb); + } +} +#endif /* NSEC3 */ + +/** create a new domain name */ +static int +domain_create(udb_base* udb, udb_ptr* zone, const uint8_t* nm, size_t nmlen, + udb_ptr* result) +{ + udb_ptr dtree, node; + /* create domain chunk */ + 
if(!udb_ptr_alloc_space(result, udb, udb_chunk_type_domain, + sizeof(struct domain_d)+nmlen)) + return 0; + udb_rel_ptr_init(&DOMAIN(result)->node); + udb_rel_ptr_init(&DOMAIN(result)->rrsets); + DOMAIN(result)->namelen = nmlen; + memmove(DOMAIN(result)->name, nm, nmlen); + + /* insert into domain tree */ + udb_ptr_new(&dtree, udb, &ZONE(zone)->domains); + if(!udb_radname_insert(udb, &dtree, nm, nmlen, result, &node)) { + udb_ptr_free_space(result, udb, sizeof(struct domain_d)+nmlen); + udb_ptr_unlink(&dtree, udb); + return 0; + } + udb_rptr_set_ptr(&DOMAIN(result)->node, udb, &node); + udb_ptr_unlink(&dtree, udb); + udb_ptr_unlink(&node, udb); + return 1; +} + +int +udb_domain_find(udb_base* udb, udb_ptr* zone, const uint8_t* nm, size_t nmlen, + udb_ptr* result) +{ + int r; + udb_ptr dtree; + assert(udb_ptr_get_type(zone) == udb_chunk_type_zone); + udb_ptr_new(&dtree, udb, &ZONE(zone)->domains); + r = udb_radname_search(udb, &dtree, nm, nmlen, result); + if(result->data) + udb_ptr_set_rptr(result, udb, &RADNODE(result)->elem); + udb_ptr_unlink(&dtree, udb); + return r && result->data; +} + +/** find or create a domain name in the zone domain tree */ +static int +domain_find_or_create(udb_base* udb, udb_ptr* zone, const uint8_t* nm, + size_t nmlen, udb_ptr* result) +{ + assert(udb_ptr_get_type(zone) == udb_chunk_type_zone); + if(udb_domain_find(udb, zone, nm, nmlen, result)) + return 1; + return domain_create(udb, zone, nm, nmlen, result); +} + +/** remove rrset from the domain name rrset-list */ +static void +domain_remove_rrset(udb_base* udb, udb_ptr* domain, uint16_t t) +{ + udb_ptr p, prev; + assert(udb_ptr_get_type(domain) == udb_chunk_type_domain); + udb_ptr_new(&p, udb, &DOMAIN(domain)->rrsets); + udb_ptr_init(&prev, udb); + while(p.data) { + if(RRSET(&p)->type == t) { + /* remove it */ + if(prev.data == 0) { + /* first rrset */ + udb_rptr_set_rptr(&DOMAIN(domain)->rrsets, + udb, &RRSET(&p)->next); + } else { + udb_rptr_set_rptr(&RRSET(&prev)->next, + udb, &RRSET(&p)->next); + } + udb_ptr_unlink(&prev, udb); + rrset_delete(udb, &p); + return; + } + udb_ptr_set_ptr(&prev, udb, &p); + udb_ptr_set_rptr(&p, udb, &RRSET(&p)->next); + } + /* rrset does not exist */ + udb_ptr_unlink(&prev, udb); + udb_ptr_unlink(&p, udb); +} + +/** create rrset in the domain rrset list */ +static int +rrset_create(udb_base* udb, udb_ptr* domain, uint16_t t, udb_ptr* res) +{ + /* create it */ + if(!udb_ptr_alloc_space(res, udb, udb_chunk_type_rrset, + sizeof(struct rrset_d))) + return 0; + udb_rel_ptr_init(&RRSET(res)->next); + udb_rel_ptr_init(&RRSET(res)->rrs); + RRSET(res)->type = t; + +#if 0 + /* link it in, at the front */ + udb_rptr_set_rptr(&RRSET(res)->next, udb, &DOMAIN(domain)->rrsets); + udb_rptr_set_ptr(&DOMAIN(domain)->rrsets, udb, res); +#else + /* preserve RRset order, link at end */ + if(DOMAIN(domain)->rrsets.data == 0) { + udb_rptr_set_ptr(&DOMAIN(domain)->rrsets, udb, res); + } else { + udb_ptr p; + udb_ptr_new(&p, udb, &DOMAIN(domain)->rrsets); + while(RRSET(&p)->next.data) + udb_ptr_set_rptr(&p, udb, &RRSET(&p)->next); + udb_rptr_set_ptr(&RRSET(&p)->next, udb, res); + udb_ptr_unlink(&p, udb); + } +#endif + return 1; +} + +int +udb_rrset_find(udb_base* udb, udb_ptr* domain, uint16_t t, udb_ptr* res) +{ + assert(udb_ptr_get_type(domain) == udb_chunk_type_domain); + udb_ptr_init(res, udb); + udb_ptr_set_rptr(res, udb, &DOMAIN(domain)->rrsets); + while(res->data) { + if(RRSET(res)->type == t) + return 1; + udb_ptr_set_rptr(res, udb, &RRSET(res)->next); + } + /* rrset does not exist and 
res->data is conveniently zero */ + return 0; +} + +/** find or create rrset in the domain rrset list */ +static int +rrset_find_or_create(udb_base* udb, udb_ptr* domain, uint16_t t, udb_ptr* res) +{ + if(udb_rrset_find(udb, domain, t, res)) + return 1; + return rrset_create(udb, domain, t, res); +} + +/** see if RR matches type, class and rdata */ +static int +rr_match(udb_ptr* rr, uint16_t t, uint16_t k, uint8_t* rdata, size_t rdatalen) +{ + return RR(rr)->type == t && RR(rr)->klass == k && + RR(rr)->len == rdatalen && + memcmp(RR(rr)->wire, rdata, rdatalen) == 0; +} + +/** see if RR exists in the RR list that matches the rdata, and return it */ +static int +rr_search(udb_base* udb, udb_ptr* rrset, uint16_t t, uint16_t k, + uint8_t* rdata, size_t rdatalen, udb_ptr* result) +{ + assert(udb_ptr_get_type(rrset) == udb_chunk_type_rrset); + udb_ptr_init(result, udb); + udb_ptr_set_rptr(result, udb, &RRSET(rrset)->rrs); + while(result->data) { + if(rr_match(result, t, k, rdata, rdatalen)) + return 1; /* found */ + udb_ptr_set_rptr(result, udb, &RR(result)->next); + } + /* not found and result->data is conveniently zero */ + return 0; +} + +/** create RR chunk */ +static int +rr_create(udb_base* udb, uint16_t t, uint16_t k, uint32_t ttl, + uint8_t* rdata, size_t rdatalen, udb_ptr* rr) +{ + if(!udb_ptr_alloc_space(rr, udb, udb_chunk_type_rr, + sizeof(struct rr_d)+rdatalen)) + return 0; + udb_rel_ptr_init(&RR(rr)->next); + RR(rr)->type = t; + RR(rr)->klass = k; + RR(rr)->ttl = ttl; + RR(rr)->len = rdatalen; + memmove(RR(rr)->wire, rdata, rdatalen); + return 1; +} + +/** add an RR to an RRset. */ +static int +rrset_add_rr(udb_base* udb, udb_ptr* rrset, uint16_t t, uint16_t k, + uint32_t ttl, uint8_t* rdata, size_t rdatalen) +{ + udb_ptr rr; + assert(udb_ptr_get_type(rrset) == udb_chunk_type_rrset); + /* create it */ + if(!rr_create(udb, t, k, ttl, rdata, rdatalen, &rr)) + return 0; + + /* add at end, to preserve order of RRs */ + if(RRSET(rrset)->rrs.data == 0) { + udb_rptr_set_ptr(&RRSET(rrset)->rrs, udb, &rr); + } else { + udb_ptr lastrr; + udb_ptr_new(&lastrr, udb, &RRSET(rrset)->rrs); + while(RR(&lastrr)->next.data) + udb_ptr_set_rptr(&lastrr, udb, &RR(&lastrr)->next); + udb_rptr_set_ptr(&RR(&lastrr)->next, udb, &rr); + udb_ptr_unlink(&lastrr, udb); + } + udb_ptr_unlink(&rr, udb); + return 1; +} + +/** remove an RR from an RRset. return 0 if RR did not exist. 
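/*
 * Illustrative sketch (not part of the imported sources): walking the singly
 * linked rr list of an rrset, the same way rr_search above does, here to
 * count the RRs. Assumes the includes of udbzone.c (udbzone.h for the
 * RRSET/RR accessors).
 */
static size_t
example_count_rrs(udb_base* udb, udb_ptr* rrset)
{
	size_t n = 0;
	udb_ptr rr;
	udb_ptr_init(&rr, udb);
	udb_ptr_set_rptr(&rr, udb, &RRSET(rrset)->rrs);
	while(rr.data) {
		n++;
		udb_ptr_set_rptr(&rr, udb, &RR(&rr)->next);
	}
	udb_ptr_unlink(&rr, udb);
	return n;
}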
*/ +static int +rrset_del_rr(udb_base* udb, udb_ptr* rrset, uint16_t t, uint16_t k, + uint8_t* rdata, size_t rdatalen) +{ + udb_ptr p, prev; + assert(udb_ptr_get_type(rrset) == udb_chunk_type_rrset); + udb_ptr_new(&p, udb, &RRSET(rrset)->rrs); + udb_ptr_init(&prev, udb); + while(p.data) { + if(rr_match(&p, t, k, rdata, rdatalen)) { + /* remove it */ + if(prev.data == 0) { + /* first in list */ + udb_rptr_set_rptr(&RRSET(rrset)->rrs, udb, + &RR(&p)->next); + } else { + udb_rptr_set_rptr(&RR(&prev)->next, udb, + &RR(&p)->next); + } + udb_ptr_unlink(&prev, udb); + rr_delete(udb, &p); + return 1; + } + udb_ptr_set_ptr(&prev, udb, &p); + udb_ptr_set_rptr(&p, udb, &RR(&p)->next); + } + /* not found */ + udb_ptr_unlink(&prev, udb); + udb_ptr_unlink(&p, udb); + return 0; +} + +int +udb_zone_add_rr(udb_base* udb, udb_ptr* zone, const uint8_t* nm, size_t nmlen, + uint16_t t, uint16_t k, uint32_t ttl, uint8_t* rdata, size_t rdatalen) +{ + udb_ptr domain, rrset, rr; + int created_rrset = 0; + assert(udb_ptr_get_type(zone) == udb_chunk_type_zone); + + /* find or create domain */ + if(!domain_find_or_create(udb, zone, nm, nmlen, &domain)) { + return 0; + } + /* find or create rrset(type) */ + if(!rrset_find_or_create(udb, &domain, t, &rrset)) { + goto exit_clean_domain; + } + if(RRSET(&rrset)->rrs.data == 0) + created_rrset = 1; + /* test for duplicate RRs */ + if(rr_search(udb, &rrset, t, k, rdata, rdatalen, &rr)) { + udb_ptr_unlink(&rr, udb); + goto exit_clean_domain_rrset; + } + /* add RR to rrset */ + if(!rrset_add_rr(udb, &rrset, t, k, ttl, rdata, rdatalen)) { + exit_clean_domain_rrset: + /* if rrset was created, remove it */ + if(RRSET(&rrset)->rrs.data == 0) { + udb_ptr_zero(&rrset, udb); + domain_remove_rrset(udb, &domain, t); + } + udb_ptr_unlink(&rrset, udb); + exit_clean_domain: + /* if domain created, delete it */ + if(DOMAIN(&domain)->rrsets.data == 0) + domain_delete_unlink(udb, zone, &domain); + udb_ptr_unlink(&domain, udb); + return 0; + } + /* success, account changes */ + if(created_rrset) + ZONE(zone)->rrset_count ++; + ZONE(zone)->rr_count ++; +#ifdef NSEC3 + if(t == TYPE_NSEC3PARAM && ZONE(zone)->nsec3param.data == 0) + zone_hash_nsec3param(udb, zone, &rrset); +#endif /* NSEC3 */ + udb_ptr_unlink(&domain, udb); + udb_ptr_unlink(&rrset, udb); + return 1; +} + +void +udb_zone_del_rr(udb_base* udb, udb_ptr* zone, const uint8_t* nm, size_t nmlen, + uint16_t t, uint16_t k, uint8_t* rdata, size_t rdatalen) +{ + udb_ptr domain, rrset; + assert(udb_ptr_get_type(zone) == udb_chunk_type_zone); + /* find the domain */ + if(!udb_domain_find(udb, zone, nm, nmlen, &domain)) + return; + /* find the rrset */ + if(!udb_rrset_find(udb, &domain, t, &rrset)) { + udb_ptr_unlink(&domain, udb); + return; + } + /* remove the RR */ +#ifdef NSEC3 + if(t == TYPE_NSEC3PARAM) { + udb_ptr rr; + if(rr_search(udb, &rrset, t, k, rdata, rdatalen, &rr)) { + if(rr.data == ZONE(zone)->nsec3param.data) { + udb_rptr_zero(&ZONE(zone)->nsec3param, udb); + } + udb_ptr_unlink(&rr, udb); + } + } +#endif /* NSEC3 */ + if(!rrset_del_rr(udb, &rrset, t, k, rdata, rdatalen)) { + /* rr did not exist */ + udb_ptr_unlink(&domain, udb); + udb_ptr_unlink(&rrset, udb); + return; + } + ZONE(zone)->rr_count --; +#ifdef NSEC3 + if(t == TYPE_NSEC3PARAM && ZONE(zone)->nsec3param.data == 0 && + RRSET(&rrset)->rrs.data != 0) { + zone_hash_nsec3param(udb, zone, &rrset); + } +#endif /* NSEC3 */ + /* see we we can remove the rrset too */ + if(RRSET(&rrset)->rrs.data == 0) { + udb_ptr_zero(&rrset, udb); + domain_remove_rrset(udb, &domain, t); + 
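/*
 * Illustrative sketch (hypothetical values, not part of the import): adding
 * one A record to a zone with udb_zone_add_rr above. The owner name is in
 * uncompressed wireformat, the rdata is the raw 4-byte address; TYPE_A and
 * CLASS_IN come from dns.h, log_msg from util.h.
 */
static void
example_add_a_record(udb_base* udb, udb_ptr* zone)
{
	uint8_t owner[] = { 3,'w','w','w', 7,'e','x','a','m','p','l','e',
		3,'c','o','m', 0 };
	uint8_t rdata[] = { 192, 0, 2, 1 };	/* 192.0.2.1 */
	if(!udb_zone_add_rr(udb, zone, owner, sizeof(owner), TYPE_A, CLASS_IN,
		3600, rdata, sizeof(rdata)))
		log_msg(LOG_ERR, "example: could not add RR (out of space?)");
}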
ZONE(zone)->rrset_count --; + } + /* see if we can remove the domain name too */ + if(DOMAIN(&domain)->rrsets.data == 0) { + domain_delete_unlink(udb, zone, &domain); + } + udb_ptr_unlink(&rrset, udb); + udb_ptr_unlink(&domain, udb); +} + +void +udb_zone_walk_chunk(void* base, void* d, uint64_t s, udb_walk_relptr_cb* cb, + void* arg) +{ + struct zone_d* p = (struct zone_d*)d; + assert(s >= sizeof(struct zone_d)+p->namelen); + (void)s; + (*cb)(base, &p->node, arg); + (*cb)(base, &p->domains, arg); + (*cb)(base, &p->nsec3param, arg); + (*cb)(base, &p->log_str, arg); +} + +void +udb_domain_walk_chunk(void* base, void* d, uint64_t s, udb_walk_relptr_cb* cb, + void* arg) +{ + struct domain_d* p = (struct domain_d*)d; + assert(s >= sizeof(struct domain_d)+p->namelen); + (void)s; + (*cb)(base, &p->node, arg); + (*cb)(base, &p->rrsets, arg); +} + +void +udb_rrset_walk_chunk(void* base, void* d, uint64_t s, udb_walk_relptr_cb* cb, + void* arg) +{ + struct rrset_d* p = (struct rrset_d*)d; + assert(s >= sizeof(struct rrset_d)); + (void)s; + (*cb)(base, &p->next, arg); + (*cb)(base, &p->rrs, arg); +} + +void +udb_rr_walk_chunk(void* base, void* d, uint64_t s, udb_walk_relptr_cb* cb, + void* arg) +{ + struct rr_d* p = (struct rr_d*)d; + assert(s >= sizeof(struct rr_d)+p->len); + (void)s; + (*cb)(base, &p->next, arg); +} + +void +udb_task_walk_chunk(void* base, void* d, uint64_t s, udb_walk_relptr_cb* cb, + void* arg) +{ + struct task_list_d* p = (struct task_list_d*)d; + assert(s >= p->size); + (void)s; + (*cb)(base, &p->next, arg); +} + +void namedb_walkfunc(void* base, void* warg, uint8_t t, void* d, uint64_t s, + udb_walk_relptr_cb* cb, void* arg) +{ + (void)warg; + switch(t) { + case udb_chunk_type_radtree: + udb_radix_tree_walk_chunk(base, d, s, cb, arg); + break; + case udb_chunk_type_radnode: + udb_radix_node_walk_chunk(base, d, s, cb, arg); + break; + case udb_chunk_type_radarray: + udb_radix_array_walk_chunk(base, d, s, cb, arg); + break; + case udb_chunk_type_zone: + udb_zone_walk_chunk(base, d, s, cb, arg); + break; + case udb_chunk_type_domain: + udb_domain_walk_chunk(base, d, s, cb, arg); + break; + case udb_chunk_type_rrset: + udb_rrset_walk_chunk(base, d, s, cb, arg); + break; + case udb_chunk_type_rr: + udb_rr_walk_chunk(base, d, s, cb, arg); + break; + case udb_chunk_type_task: + udb_task_walk_chunk(base, d, s, cb, arg); + break; + default: + /* no rel ptrs */ + break; + } +} diff --git a/usr.sbin/nsd/udbzone.h b/usr.sbin/nsd/udbzone.h new file mode 100644 index 00000000000..f1163e4dfe5 --- /dev/null +++ b/usr.sbin/nsd/udbzone.h @@ -0,0 +1,147 @@ +/* + * udbzone -- store zone and rrset information in udb file. + * + * Copyright (c) 2011, NLnet Labs. See LICENSE for license. + */ +#ifndef UDB_ZONE_H +#define UDB_ZONE_H +#include "udb.h" +#include "dns.h" +#include "udbradtree.h" + +/** + * Store the DNS information in udb file on disk. + * udb_global + * | + * v + * zonetree -> zone -- zone_name + * radtree | + * v + * domain --> rrset -> rr + * radtree list list + * |-- name + */ + +/** zone information in the nsd.udb. Name allocated after it. 
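/*
 * Illustrative sketch (not part of the imported sources): the lookup chain
 * implied by the storage layout above -- zone tree, then the zone's domain
 * tree, then the domain's rrset list. apex/apexlen and owner/ownerlen are
 * assumed uncompressed wireformat dnames supplied by the caller; on success
 * the caller must udb_ptr_unlink() the returned rrset.
 */
static int
example_find_soa_rrset(udb_base* udb, const uint8_t* apex, size_t apexlen,
	const uint8_t* owner, size_t ownerlen, udb_ptr* rrset)
{
	udb_ptr zone, domain;
	int found = 0;
	if(!udb_zone_search(udb, &zone, apex, apexlen))
		return 0;
	if(udb_domain_find(udb, &zone, owner, ownerlen, &domain)) {
		found = udb_rrset_find(udb, &domain, TYPE_SOA, rrset);
		udb_ptr_unlink(&domain, udb);
	}
	udb_ptr_unlink(&zone, udb);
	return found;
}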
*/ +struct zone_d { + /** radtree node in the zonetree for this zone */ + udb_rel_ptr node; + /** the radtree for the domain names in the zone */ + udb_rel_ptr domains; + /** the NSEC3PARAM rr used for hashing (or 0), rr_d pointer */ + udb_rel_ptr nsec3param; + /** the log_str for the AXFR change, or 0 */ + udb_rel_ptr log_str; + /** modification time, time when the zone data was changed */ + uint64_t mtime; + /** number of RRsets in the zone */ + uint64_t rrset_count; + /** number of RRs in the zone */ + uint64_t rr_count; + /** the length of the zone name */ + udb_radstrlen_t namelen; + /** if the zone is expired */ + uint8_t expired; + /** if the zone has been changed by AXFR */ + uint8_t is_changed; + /** the zone (wire uncompressed) name in DNS format */ + uint8_t name[0]; +}; + +/** domain name in the nametree. name allocated after it */ +struct domain_d { + /** radtree node in the nametree for this domain */ + udb_rel_ptr node; + /** the list of rrsets for this name, single linked */ + udb_rel_ptr rrsets; + /** length of the domain name */ + udb_radstrlen_t namelen; + /** the domain (wire uncompressed) name in DNS format */ + uint8_t name[0]; +}; + +/** rrset information. */ +struct rrset_d { + /** next in rrset list */ + udb_rel_ptr next; + /** the singly linked list of rrs for this rrset */ + udb_rel_ptr rrs; + /** type of the RRs in this rrset (host order) */ + uint16_t type; +}; + +/** rr information; wireformat data allocated after it */ +struct rr_d { + /** next in rr list */ + udb_rel_ptr next; + /** type (host order) */ + uint16_t type; + /** class (host order) */ + uint16_t klass; + /** ttl (host order) */ + uint32_t ttl; + /** length of wireformat */ + uint16_t len; + /** wireformat of rdata (without rdatalen) */ + uint8_t wire[0]; +}; + +/** init an udb for use as DNS store */ +int udb_dns_init_file(udb_base* udb); +/** de-init an udb for use as DNS store */ +void udb_dns_deinit_file(udb_base* udb); + +/** create a zone */ +int udb_zone_create(udb_base* udb, udb_ptr* result, const uint8_t* dname, + size_t dlen); +/** clear all RRsets from a zone */ +void udb_zone_clear(udb_base* udb, udb_ptr* zone); +/** delete a zone */ +void udb_zone_delete(udb_base* udb, udb_ptr* zone); +/** find a zone by name (exact match) */ +int udb_zone_search(udb_base* udb, udb_ptr* result, const uint8_t* dname, + size_t dlen); +/** get modification time for zone or 0 */ +uint64_t udb_zone_get_mtime(udb_base* udb, const uint8_t* dname, size_t dlen); +/** set log str in udb, or remove it */ +void udb_zone_set_log_str(udb_base* udb, udb_ptr* zone, const char* str); +/** find a domain name in the zone domain tree */ +int udb_domain_find(udb_base* udb, udb_ptr* zone, const uint8_t* nm, + size_t nmlen, udb_ptr* result); +/** find rrset in domain */ +int udb_rrset_find(udb_base* udb, udb_ptr* domain, uint16_t t, udb_ptr* res); + +/** add an RR to a zone */ +int udb_zone_add_rr(udb_base* udb, udb_ptr* zone, const uint8_t* nm, + size_t nmlen, uint16_t t, uint16_t k, uint32_t ttl, uint8_t* rdata, + size_t rdatalen); +/** del an RR from a zone */ +void udb_zone_del_rr(udb_base* udb, udb_ptr* zone, const uint8_t* nm, + size_t nmlen, uint16_t t, uint16_t k, uint8_t* rdata, size_t rdatalen); + +/** get pretty string for nsec3parameters (static buffer returned) */ +const char* udb_nsec3param_string(udb_ptr* rr); + +/** for use in udb-walkfunc, walks relptrs in udb_chunk_type_zone */ +void udb_zone_walk_chunk(void* base, void* d, uint64_t s, + udb_walk_relptr_cb* cb, void* arg); +/** for use in udb-walkfunc, 
walks relptrs in udb_chunk_type_domain */ +void udb_domain_walk_chunk(void* base, void* d, uint64_t s, + udb_walk_relptr_cb* cb, void* arg); +/** for use in udb-walkfunc, walks relptrs in udb_chunk_type_rrset */ +void udb_rrset_walk_chunk(void* base, void* d, uint64_t s, + udb_walk_relptr_cb* cb, void* arg); +/** for use in udb-walkfunc, walks relptrs in udb_chunk_type_rr */ +void udb_rr_walk_chunk(void* base, void* d, uint64_t s, + udb_walk_relptr_cb* cb, void* arg); + +/** walk through relptrs in registered types */ +void namedb_walkfunc(void* base, void* warg, uint8_t t, void* d, uint64_t s, + udb_walk_relptr_cb* cb, void* arg); + +#define ZONE(ptr) ((struct zone_d*)UDB_PTR(ptr)) +#define DOMAIN(ptr) ((struct domain_d*)UDB_PTR(ptr)) +#define RRSET(ptr) ((struct rrset_d*)UDB_PTR(ptr)) +#define RR(ptr) ((struct rr_d*)UDB_PTR(ptr)) + +#endif /* UDB_ZONE_H */ diff --git a/usr.sbin/nsd/xfrd-disk.h b/usr.sbin/nsd/xfrd-disk.h index 42db1993180..2c8e23fc752 100644 --- a/usr.sbin/nsd/xfrd-disk.h +++ b/usr.sbin/nsd/xfrd-disk.h @@ -1,7 +1,7 @@ /* * xfrd-disk.h - XFR (transfer) Daemon TCP system header file. Save/Load state to disk. * - * Copyright (c) 2001-2011, NLnet Labs. All rights reserved. + * Copyright (c) 2001-2006, NLnet Labs. All rights reserved. * * See LICENSE for the license. * @@ -10,8 +10,8 @@ #ifndef XFRD_DISK_H #define XFRD_DISK_H -#include "config.h" struct xfrd_state; +struct nsd; /* magic string to identify xfrd state file */ #define XFRD_FILE_MAGIC "NSDXFRD1" @@ -21,4 +21,13 @@ void xfrd_read_state(struct xfrd_state* xfrd); /* write xfrd zone state if possible */ void xfrd_write_state(struct xfrd_state* xfrd); +/* create temp directory */ +void xfrd_make_tempdir(struct nsd* nsd); +/* rmdir temp directory */ +void xfrd_del_tempdir(struct nsd* nsd); +/* open temp file, makes directory if needed */ +FILE* xfrd_open_xfrfile(struct nsd* nsd, uint64_t number, char* mode); +/* unlink temp file */ +void xfrd_unlink_xfrfile(struct nsd* nsd, uint64_t number); + #endif /* XFRD_DISK_H */ diff --git a/usr.sbin/nsd/xfrd-notify.c b/usr.sbin/nsd/xfrd-notify.c index 0aa5c2c6cd7..6fb8e00e1e8 100644 --- a/usr.sbin/nsd/xfrd-notify.c +++ b/usr.sbin/nsd/xfrd-notify.c @@ -1,7 +1,7 @@ /* * xfrd-notify.c - notify sending routines * - * Copyright (c) 2001-2011, NLnet Labs. All rights reserved. + * Copyright (c) 2006, NLnet Labs. All rights reserved. * * See LICENSE for the license. 
* @@ -11,7 +11,7 @@ #include <assert.h> #include <string.h> #include <unistd.h> - +#include <errno.h> #include "xfrd-notify.h" #include "xfrd.h" #include "xfrd-tcp.h" @@ -22,8 +22,6 @@ /* start sending notifies */ static void notify_enable(struct notify_zone_t* zone, struct xfrd_soa* new_soa); -/* stop sending notifies */ -static void notify_disable(struct notify_zone_t* zone); /* setup the notify active state */ static void setup_notify_active(struct notify_zone_t* zone); @@ -31,21 +29,29 @@ static void setup_notify_active(struct notify_zone_t* zone); static int xfrd_handle_notify_reply(struct notify_zone_t* zone, buffer_type* packet); /* handle zone notify send */ -static void xfrd_handle_notify_send(netio_type *netio, - netio_handler_type *handler, netio_event_types_type event_types); +static void xfrd_handle_notify_send(int fd, short event, void* arg); static void xfrd_notify_next(struct notify_zone_t* zone); static void xfrd_notify_send_udp(struct notify_zone_t* zone, buffer_type* packet); static void +notify_send_disable(struct notify_zone_t* zone) +{ + zone->notify_send_enable = 0; + event_del(&zone->notify_send_handler); + if(zone->notify_send_handler.ev_fd != -1) { + close(zone->notify_send_handler.ev_fd); + } +} + +void notify_disable(struct notify_zone_t* zone) { zone->notify_current = 0; - zone->notify_send_handler.timeout = NULL; - if(zone->notify_send_handler.fd != -1) { - close(zone->notify_send_handler.fd); - zone->notify_send_handler.fd = -1; + /* if added, then remove */ + if(zone->notify_send_enable) { + notify_send_disable(zone); } if(xfrd->notify_udp_num == XFRD_MAX_UDP_NOTIFY) { @@ -56,6 +62,8 @@ notify_disable(struct notify_zone_t* zone) assert(wz->is_waiting); wz->is_waiting = 0; xfrd->notify_waiting_first = wz->waiting_next; + if(wz->waiting_next) + wz->waiting_next->waiting_prev = NULL; if(xfrd->notify_waiting_last == wz) xfrd->notify_waiting_last = NULL; /* see if this zone needs notify sending */ @@ -72,13 +80,12 @@ notify_disable(struct notify_zone_t* zone) } void -init_notify_send(rbtree_t* tree, netio_type* netio, region_type* region, - const dname_type* apex, zone_options_t* options, zone_type* dbzone) +init_notify_send(rbtree_t* tree, region_type* region, zone_options_t* options) { struct notify_zone_t* not = (struct notify_zone_t*) region_alloc(region, sizeof(struct notify_zone_t)); memset(not, 0, sizeof(struct notify_zone_t)); - not->apex = apex; + not->apex = options->node.key; not->apex_str = options->name; not->node.key = not->apex; not->options = options; @@ -87,23 +94,49 @@ init_notify_send(rbtree_t* tree, netio_type* netio, region_type* region, not->current_soa = (struct xfrd_soa*)region_alloc(region, sizeof(struct xfrd_soa)); memset(not->current_soa, 0, sizeof(struct xfrd_soa)); - if(dbzone && dbzone->soa_rrset && dbzone->soa_rrset->rrs) { - xfrd_copy_soa(not->current_soa, dbzone->soa_rrset->rrs); - } not->is_waiting = 0; - not->notify_send_handler.fd = -1; - not->notify_send_handler.timeout = 0; - not->notify_send_handler.user_data = not; - not->notify_send_handler.event_types = - NETIO_EVENT_READ|NETIO_EVENT_TIMEOUT; - not->notify_send_handler.event_handler = xfrd_handle_notify_send; - netio_add_handler(netio, ¬->notify_send_handler); - tsig_create_record_custom(¬->notify_tsig, region, 0, 0, 4); + + not->notify_send_enable = 0; + tsig_create_record_custom(¬->notify_tsig, NULL, 0, 0, 4); not->notify_current = 0; rbtree_insert(tree, (rbnode_t*)not); } +void +xfrd_del_notify(xfrd_state_t* xfrd, const dname_type* dname) +{ + /* find it */ + struct 
notify_zone_t* not = (struct notify_zone_t*)rbtree_delete( + xfrd->notify_zones, dname); + if(!not) + return; + + /* waiting list */ + if(not->is_waiting) { + if(not->waiting_prev) + not->waiting_prev->waiting_next = not->waiting_next; + else xfrd->notify_waiting_first = not->waiting_next; + if(not->waiting_next) + not->waiting_next->waiting_prev = not->waiting_prev; + else xfrd->notify_waiting_last = not->waiting_prev; + not->is_waiting = 0; + } + + /* event */ + if(not->notify_send_enable) { + notify_disable(not); + } + + /* del tsig */ + tsig_delete_record(¬->notify_tsig, NULL); + + /* free it */ + region_recycle(xfrd->region, not->current_soa, sizeof(xfrd_soa_t)); + /* the apex is recycled when the zone_options.node.key is removed */ + region_recycle(xfrd->region, not, sizeof(*not)); +} + static int xfrd_handle_notify_reply(struct notify_zone_t* zone, buffer_type* packet) { @@ -151,13 +184,15 @@ xfrd_notify_next(struct notify_zone_t* zone) static void xfrd_notify_send_udp(struct notify_zone_t* zone, buffer_type* packet) { - if(zone->notify_send_handler.fd != -1) - close(zone->notify_send_handler.fd); - zone->notify_send_handler.fd = -1; + int fd; + if(zone->notify_send_enable) { + notify_send_disable(zone); + } /* Set timeout for next reply */ - zone->notify_timeout.tv_sec = xfrd_time() + XFRD_NOTIFY_RETRY_TIMOUT; + zone->notify_timeout.tv_sec = XFRD_NOTIFY_RETRY_TIMOUT; /* send NOTIFY to secondary. */ - xfrd_setup_packet(packet, TYPE_SOA, CLASS_IN, zone->apex); + xfrd_setup_packet(packet, TYPE_SOA, CLASS_IN, zone->apex, + qid_generate()); zone->notify_query_id = ID(packet); OPCODE_SET(packet, OPCODE_NOTIFY); AA_SET(packet); @@ -170,41 +205,53 @@ xfrd_notify_send_udp(struct notify_zone_t* zone, buffer_type* packet) xfrd_tsig_sign_request(packet, &zone->notify_tsig, zone->notify_current); } buffer_flip(packet); - zone->notify_send_handler.fd = xfrd_send_udp(zone->notify_current, - packet, zone->options->outgoing_interface); - if(zone->notify_send_handler.fd == -1) { + fd = xfrd_send_udp(zone->notify_current, packet, + zone->options->pattern->outgoing_interface); + if(fd == -1) { log_msg(LOG_ERR, "xfrd: zone %s: could not send notify #%d to %s", zone->apex_str, zone->notify_retry, zone->notify_current->ip_address_spec); + event_set(&zone->notify_send_handler, -1, EV_TIMEOUT, + xfrd_handle_notify_send, zone); + if(event_base_set(xfrd->event_base, &zone->notify_send_handler) != 0) + log_msg(LOG_ERR, "notify_send: event_base_set failed"); + if(evtimer_add(&zone->notify_send_handler, &zone->notify_timeout) != 0) + log_msg(LOG_ERR, "notify_send: evtimer_add failed"); + zone->notify_send_enable = 1; return; } + event_set(&zone->notify_send_handler, fd, EV_READ | EV_TIMEOUT, + xfrd_handle_notify_send, zone); + if(event_base_set(xfrd->event_base, &zone->notify_send_handler) != 0) + log_msg(LOG_ERR, "notify_send: event_base_set failed"); + if(event_add(&zone->notify_send_handler, &zone->notify_timeout) != 0) + log_msg(LOG_ERR, "notify_send: evtimer_add failed"); + zone->notify_send_enable = 1; DEBUG(DEBUG_XFRD,1, (LOG_INFO, "xfrd: zone %s: sent notify #%d to %s", zone->apex_str, zone->notify_retry, zone->notify_current->ip_address_spec)); } static void -xfrd_handle_notify_send(netio_type* ATTR_UNUSED(netio), - netio_handler_type *handler, netio_event_types_type event_types) +xfrd_handle_notify_send(int fd, short event, void* arg) { - struct notify_zone_t* zone = (struct notify_zone_t*)handler->user_data; + struct notify_zone_t* zone = (struct notify_zone_t*)arg; buffer_type* packet = 
xfrd_get_temp_buffer(); assert(zone->notify_current); if(zone->is_waiting) { DEBUG(DEBUG_XFRD,1, (LOG_INFO, "xfrd: notify waiting, skipped, %s", zone->apex_str)); - assert(zone->notify_send_handler.fd == -1); return; } - if(event_types & NETIO_EVENT_READ) { + if((event & EV_READ)) { DEBUG(DEBUG_XFRD,1, (LOG_INFO, "xfrd: zone %s: read notify ACK", zone->apex_str)); - assert(handler->fd != -1); - if(xfrd_udp_read_packet(packet, zone->notify_send_handler.fd)) { + assert(fd != -1); + if(xfrd_udp_read_packet(packet, fd)) { if(xfrd_handle_notify_reply(zone, packet)) xfrd_notify_next(zone); } - } else if(event_types & NETIO_EVENT_TIMEOUT) { + } else if((event & EV_TIMEOUT)) { DEBUG(DEBUG_XFRD,1, (LOG_INFO, "xfrd: zone %s: notify timeout", zone->apex_str)); /* timeout, try again */ @@ -212,7 +259,7 @@ xfrd_handle_notify_send(netio_type* ATTR_UNUSED(netio), /* see if notify is still enabled */ if(zone->notify_current) { zone->notify_retry++; - if(zone->notify_retry > zone->options->notify_retry) { + if(zone->notify_retry > zone->options->pattern->notify_retry) { log_msg(LOG_ERR, "xfrd: zone %s: max notify send count reached, %s unreachable", zone->apex_str, zone->notify_current->ip_address_spec); xfrd_notify_next(zone); @@ -228,16 +275,25 @@ static void setup_notify_active(struct notify_zone_t* zone) { zone->notify_retry = 0; - zone->notify_current = zone->options->notify; - zone->notify_send_handler.timeout = &zone->notify_timeout; - zone->notify_timeout.tv_sec = xfrd_time(); - zone->notify_timeout.tv_nsec = 0; + zone->notify_current = zone->options->pattern->notify; + zone->notify_timeout.tv_sec = 0; + zone->notify_timeout.tv_usec = 0; + + if(zone->notify_send_enable) + notify_send_disable(zone); + event_set(&zone->notify_send_handler, -1, EV_TIMEOUT, + xfrd_handle_notify_send, zone); + if(event_base_set(xfrd->event_base, &zone->notify_send_handler) != 0) + log_msg(LOG_ERR, "notifysend: event_base_set failed"); + if(evtimer_add(&zone->notify_send_handler, &zone->notify_timeout) != 0) + log_msg(LOG_ERR, "notifysend: evtimer_add failed"); + zone->notify_send_enable = 1; } static void notify_enable(struct notify_zone_t* zone, struct xfrd_soa* new_soa) { - if(!zone->options->notify) { + if(!zone->options->pattern->notify) { return; /* no notify acl, nothing to do */ } @@ -254,27 +310,37 @@ notify_enable(struct notify_zone_t* zone, struct xfrd_soa* new_soa) return; } /* put it in waiting list */ - zone->notify_current = zone->options->notify; + zone->notify_current = zone->options->pattern->notify; zone->is_waiting = 1; zone->waiting_next = NULL; + zone->waiting_prev = xfrd->notify_waiting_last; if(xfrd->notify_waiting_last) { xfrd->notify_waiting_last->waiting_next = zone; } else { xfrd->notify_waiting_first = zone; } xfrd->notify_waiting_last = zone; - zone->notify_send_handler.timeout = NULL; DEBUG(DEBUG_XFRD,1, (LOG_INFO, "xfrd: zone %s: notify on waiting list.", zone->apex_str)); } void +xfrd_notify_start(struct notify_zone_t* zone) +{ + if(zone->is_waiting || zone->notify_send_enable) + return; + notify_enable(zone, NULL); +} + +void xfrd_send_notify(rbtree_t* tree, const dname_type* apex, struct xfrd_soa* new_soa) { /* lookup the zone */ struct notify_zone_t* zone = (struct notify_zone_t*) rbtree_search(tree, apex); assert(zone); + if(zone->notify_send_enable) + notify_disable(zone); notify_enable(zone, new_soa); } @@ -286,24 +352,24 @@ notify_handle_master_zone_soainfo(rbtree_t* tree, /* lookup the zone */ struct notify_zone_t* zone = (struct notify_zone_t*) rbtree_search(tree, apex); - 
assert(zone); + if(!zone) return; /* got SOAINFO but zone was deleted meanwhile */ /* check if SOA changed */ if( (new_soa == NULL && zone->current_soa->serial == 0) || (new_soa && new_soa->serial == zone->current_soa->serial)) return; - + if(zone->notify_send_enable) + notify_disable(zone); notify_enable(zone, new_soa); } -void close_notify_fds(rbtree_t* tree) +void +close_notify_fds(rbtree_t* tree) { struct notify_zone_t* zone; RBTREE_FOR(zone, struct notify_zone_t*, tree) { - if(zone->notify_send_handler.fd != -1) { - close(zone->notify_send_handler.fd); - zone->notify_send_handler.fd = -1; - } + if(zone->notify_send_enable) + notify_send_disable(zone); } } diff --git a/usr.sbin/nsd/xfrd-notify.h b/usr.sbin/nsd/xfrd-notify.h index 242c7e763ca..4f084d302e7 100644 --- a/usr.sbin/nsd/xfrd-notify.h +++ b/usr.sbin/nsd/xfrd-notify.h @@ -1,7 +1,7 @@ /* * xfrd-notify.h - notify sending routines. * - * Copyright (c) 2001-2011, NLnet Labs. All rights reserved. + * Copyright (c) 2006, NLnet Labs. All rights reserved. * * See LICENSE for the license. * @@ -10,9 +10,12 @@ #ifndef XFRD_NOTIFY_H #define XFRD_NOTIFY_H -#include "config.h" +#ifndef USE_MINI_EVENT +#include <event.h> +#else +#include "mini_event.h" +#endif #include "tsig.h" -#include "netio.h" #include "rbtree.h" struct nsd; @@ -22,6 +25,7 @@ struct zone_options; struct zone; struct xfrd_soa; struct acl_options; +struct xfrd_state; /** * This struct keeps track of outbound notifies for a zone. @@ -38,26 +42,32 @@ struct notify_zone_t { /* notify sending handler */ /* Not saved on disk (i.e. kill of daemon stops notifies) */ - netio_handler_type notify_send_handler; - struct timespec notify_timeout; + int notify_send_enable; + struct event notify_send_handler; + struct timeval notify_timeout; struct acl_options* notify_current; /* current slave to notify */ + uint8_t notify_restart; /* restart notify after repattern */ uint8_t notify_retry; /* how manieth retry in sending to current */ uint16_t notify_query_id; /* is this notify waiting for a socket? */ uint8_t is_waiting; - /* next in the waiting list for the udp sockets */ + /* the double linked waiting list for the udp sockets */ struct notify_zone_t* waiting_next; + struct notify_zone_t* waiting_prev; }; /* initialise outgoing notifies */ -void init_notify_send(rbtree_t* tree, netio_type* netio, region_type* region, - const dname_type* apex, struct zone_options* options, - struct zone* dbzone); +void init_notify_send(rbtree_t* tree, region_type* region, + struct zone_options* options); +/* delete notify zone */ +void xfrd_del_notify(struct xfrd_state* xfrd, const dname_type* dname); /* send notifications to all in the notify list */ void xfrd_send_notify(rbtree_t* tree, const struct dname* apex, struct xfrd_soa* new_soa); +/* start notifications, if not started already (does not clobber SOA) */ +void xfrd_notify_start(struct notify_zone_t* zone); /* handle soa update notify for a master zone. newsoa can be NULL. Makes sure that the soa (serial) has changed. Or drops notify. */ @@ -66,5 +76,7 @@ void notify_handle_master_zone_soainfo(rbtree_t* tree, /* close fds in use for notification sending */ void close_notify_fds(rbtree_t* tree); +/* stop send of notify */ +void notify_disable(struct notify_zone_t* zone); #endif /* XFRD_NOTIFY_H */ diff --git a/usr.sbin/nsd/xfrd-tcp.h b/usr.sbin/nsd/xfrd-tcp.h index e42e1a4456a..ac3f9dc5aab 100644 --- a/usr.sbin/nsd/xfrd-tcp.h +++ b/usr.sbin/nsd/xfrd-tcp.h @@ -1,7 +1,7 @@ /* * xfrd-tcp.h - XFR (transfer) Daemon TCP system header file. 
Manages tcp conn. * - * Copyright (c) 2001-2011, NLnet Labs. All rights reserved. + * Copyright (c) 2001-2006, NLnet Labs. All rights reserved. * * See LICENSE for the license. * @@ -10,7 +10,6 @@ #ifndef XFRD_TCP_H #define XFRD_TCP_H -#include "config.h" #include "xfrd.h" struct buffer; @@ -21,6 +20,7 @@ struct region; struct dname; struct acl_options; +struct xfrd_tcp_pipeline; typedef struct xfrd_tcp xfrd_tcp_t; typedef struct xfrd_tcp_set xfrd_tcp_set_t; /* @@ -28,12 +28,14 @@ typedef struct xfrd_tcp_set xfrd_tcp_set_t; */ struct xfrd_tcp_set { /* tcp connections, each has packet and read/wr state */ - struct xfrd_tcp *tcp_state[XFRD_MAX_TCP]; + struct xfrd_tcp_pipeline *tcp_state[XFRD_MAX_TCP]; /* number of TCP connections in use. */ int tcp_count; /* TCP timeout. */ int tcp_timeout; - /* linked list of zones waiting for a TCP connection */ + /* rbtree with pipelines sorted by master */ + rbtree_t* pipetree; + /* double linked list of zones waiting for a TCP connection */ struct xfrd_zone *tcp_waiting_first, *tcp_waiting_last; }; @@ -61,27 +63,91 @@ struct xfrd_tcp { struct buffer* packet; }; +/* use illegal pointer value to denote skipped ID number. + * if this does not work, we can allocate with malloc */ +#define TCP_NULL_SKIP ((struct xfrd_zone*)-1) +/* the number of ID values (16 bits) for a pipeline */ +#define ID_PIPE_NUM 65536 + +/** + * Structure to keep track of a pipelined set of queries on + * an open tcp connection. The queries may be answered with + * interleaved answer packets, the ID number disambiguates. + * Sorted by the master IP address so you can use lookup with + * smaller-or-equal to find the tcp connection most suitable. + */ +struct xfrd_tcp_pipeline { + /* the rbtree node, sorted by IP and nr of unused queries */ + rbnode_t node; + /* destination IP address */ +#ifdef INET6 + struct sockaddr_storage ip; +#else + struct sockaddr_in ip; +#endif /* INET6 */ + socklen_t ip_len; + /* number of unused IDs. used IDs are waiting to send their query, + * or have been sent but not not all answer packets have been received. + * Sorted by num_unused, so a lookup smaller-equal for 65536 finds the + * connection to that master that has the most free IDs. */ + int num_unused; + /* number of skip-set IDs (these are 'in-use') */ + int num_skip; + + int handler_added; + /* the event handler for this pipe (it'll disambiguate by ID) */ + struct event handler; + + /* the tcp connection to use for reading */ + xfrd_tcp_t* tcp_r; + /* the tcp connection to use for writing, if it is done successfully, + * then the first zone from the sendlist can be removed. */ + xfrd_tcp_t* tcp_w; + /* once a byte has been written, handshake complete */ + int connection_established; + + /* list of queries that want to send, first to get write event, + * if NULL, no write event interest */ + struct xfrd_zone* tcp_send_first, *tcp_send_last; + /* the unused and id arrays must be last in the structure */ + /* per-ID number the queries that have this ID number, every + * query owns one ID numbers (until it is done). NULL: unused + * When a query is done but not all answer-packets have been + * consumed for that ID number, the rest is skipped, this + * is denoted with the pointer-value TCP_NULL_SKIP, the ids that + * are skipped are not on the unused list. They may be + * removed once the last answer packet is skipped. 
+ * ID_PIPE_NUM-num_unused values in the id array are nonNULL (either + * a zone pointer or SKIP) */ + struct xfrd_zone* id[ID_PIPE_NUM]; + /* unused ID numbers; the first part of the array contains the IDs */ + uint16_t unused[ID_PIPE_NUM]; +}; + /* create set of tcp connections */ xfrd_tcp_set_t* xfrd_tcp_set_create(struct region* region); /* init tcp state */ -xfrd_tcp_t* xfrd_tcp_create(struct region* region); +xfrd_tcp_t* xfrd_tcp_create(struct region* region, size_t bufsize); /* obtain tcp connection for a zone (or wait) */ void xfrd_tcp_obtain(xfrd_tcp_set_t* set, struct xfrd_zone* zone); /* release tcp connection for a zone (starts waiting) */ void xfrd_tcp_release(xfrd_tcp_set_t* set, struct xfrd_zone* zone); +/* release tcp pipe entirely (does not stop the zones inside it) */ +void xfrd_tcp_pipe_release(xfrd_tcp_set_t* set, struct xfrd_tcp_pipeline* tp, + int conn); /* use tcp connection to start xfr */ -void xfrd_tcp_xfr(xfrd_tcp_set_t* set, struct xfrd_zone* zone); +void xfrd_tcp_setup_write_packet(struct xfrd_tcp_pipeline* tp, + struct xfrd_zone* zone); /* initialize tcp_state for a zone. Opens the connection. true on success.*/ -int xfrd_tcp_open(xfrd_tcp_set_t* set, struct xfrd_zone* zone); +int xfrd_tcp_open(xfrd_tcp_set_t* set, struct xfrd_tcp_pipeline* tp, struct xfrd_zone* zone); /* read data from tcp, maybe partial read */ -void xfrd_tcp_read(xfrd_tcp_set_t* set, struct xfrd_zone* zone); +void xfrd_tcp_read(struct xfrd_tcp_pipeline* tp); /* write data to tcp, maybe a partial write */ -void xfrd_tcp_write(xfrd_tcp_set_t* set, struct xfrd_zone* zone); +void xfrd_tcp_write(struct xfrd_tcp_pipeline* tp, struct xfrd_zone* zone); +/* handle tcp pipe events */ +void xfrd_handle_tcp_pipe(int fd, short event, void* arg); -/* see if the tcp connection is in the reading stage (else writin) */ -static inline int xfrd_tcp_is_reading(xfrd_tcp_set_t* set, int conn) -{return set->tcp_state[conn]->is_reading;} /* * Read from a stream connection (size16)+packet into buffer. * returns value is @@ -103,7 +169,7 @@ int conn_write(xfrd_tcp_t* conn); /* setup DNS packet for a query of this type */ void xfrd_setup_packet(struct buffer* packet, - uint16_t type, uint16_t klass, const struct dname* dname); + uint16_t type, uint16_t klass, const struct dname* dname, uint16_t qid); /* write soa in network format to the packet buffer */ void xfrd_write_soa_buffer(struct buffer* packet, const struct dname* apex, struct xfrd_soa* soa); @@ -122,4 +188,7 @@ socklen_t xfrd_acl_sockaddr_frm(struct acl_options* acl, struct sockaddr_in *frm); #endif /* INET6 */ +/* create pipeline tcp structure */ +struct xfrd_tcp_pipeline* xfrd_tcp_pipeline_create(region_type* region); + #endif /* XFRD_TCP_H */ diff --git a/usr.sbin/nsd/xfrd.h b/usr.sbin/nsd/xfrd.h index e4d6a278259..b71f8c54f5e 100644 --- a/usr.sbin/nsd/xfrd.h +++ b/usr.sbin/nsd/xfrd.h @@ -1,7 +1,7 @@ /* * xfrd.h - XFR (transfer) Daemon header file. Coordinates SOA updates. * - * Copyright (c) 2001-2011, NLnet Labs. All rights reserved. + * Copyright (c) 2001-2006, NLnet Labs. All rights reserved. * * See LICENSE for the license. 
* @@ -10,8 +10,11 @@ #ifndef XFRD_H #define XFRD_H -#include "config.h" -#include "netio.h" +#ifndef USE_MINI_EVENT +#include <event.h> +#else +#include "mini_event.h" +#endif #include "rbtree.h" #include "namedb.h" #include "options.h" @@ -24,6 +27,7 @@ struct buffer; struct xfrd_tcp; struct xfrd_tcp_set; struct notify_zone_t; +struct udb_ptr; typedef struct xfrd_state xfrd_state_t; typedef struct xfrd_zone xfrd_zone_t; typedef struct xfrd_soa xfrd_soa_t; @@ -35,41 +39,47 @@ struct xfrd_state { /* time when daemon was last started */ time_t xfrd_start_time; struct region* region; - netio_type* netio; + struct event_base* event_base; struct nsd* nsd; struct xfrd_tcp_set* tcp_set; /* packet buffer for udp packets */ struct buffer* packet; - /* udp waiting list */ + /* udp waiting list, double linked list */ struct xfrd_zone *udp_waiting_first, *udp_waiting_last; /* number of udp sockets (for sending queries) in use */ size_t udp_use_num; + /* activated waiting list, double linked list */ + struct xfrd_zone *activated_first; /* current time is cached */ uint8_t got_time; time_t current_time; + /* counter for xfr file numbers */ + uint64_t xfrfilenumber; + /* timer for NSD reload */ - struct timespec reload_timeout; - netio_handler_type reload_handler; + struct timeval reload_timeout; + struct event reload_handler; + int reload_added; /* last reload must have caught all zone updates before this time */ time_t reload_cmd_last_sent; uint8_t can_send_reload; + pid_t reload_pid; /* communication channel with server_main */ - netio_handler_type ipc_handler; - uint8_t ipc_is_soa; - uint8_t parent_soa_info_pass; + struct event ipc_handler; + int ipc_handler_flags; struct xfrd_tcp *ipc_conn; struct buffer* ipc_pass; /* sending ipc to server_main */ - struct xfrd_tcp *ipc_conn_write; + uint8_t need_to_send_shutdown; uint8_t need_to_send_reload; + uint8_t need_to_send_stats; uint8_t need_to_send_quit; - uint8_t sending_zone_state; uint8_t ipc_send_blocked; - stack_type* dirty_zones; /* stack of xfrd_zone* */ + struct udb_ptr* last_task; /* xfrd shutdown flag */ uint8_t shutdown; @@ -138,10 +148,6 @@ struct xfrd_zone { xfrd_zone_expired } state; - /* if state is dirty it needs to be sent to server_main. - * it is also on the dirty_stack. Not saved on disk. 
*/ - uint8_t dirty; - /* master to try to transfer from, number for persistence */ acl_options_t* master; int master_num; @@ -152,8 +158,10 @@ struct xfrd_zone { int fresh_xfr_timeout; /* handler for timeouts */ - struct timespec timeout; - netio_handler_type zone_handler; + struct timeval timeout; + struct event zone_handler; + int zone_handler_flags; + int event_added; /* tcp connection zone is using, or -1 */ int tcp_conn; @@ -161,10 +169,22 @@ struct xfrd_zone { uint8_t tcp_waiting; /* next zone in waiting list */ xfrd_zone_t* tcp_waiting_next; + xfrd_zone_t* tcp_waiting_prev; + /* zone is in its tcp send queue */ + uint8_t in_tcp_send; + /* next zone in tcp send queue */ + xfrd_zone_t* tcp_send_next; + xfrd_zone_t* tcp_send_prev; /* zone is waiting for a udp connection (tcp is preferred) */ uint8_t udp_waiting; /* next zone in waiting list for UDP */ xfrd_zone_t* udp_waiting_next; + xfrd_zone_t* udp_waiting_prev; + /* zone has been activated to run now (after the other events + * but before blocking in select again) */ + uint8_t is_activated; + xfrd_zone_t* activated_next; + xfrd_zone_t* activated_prev; /* xfr message handling data */ /* query id */ @@ -174,6 +194,8 @@ struct xfrd_zone { size_t msg_rr_count; uint8_t msg_is_ixfr; /* 1:IXFR detected. 2:middle IXFR SOA seen. */ tsig_record_type tsig; /* tsig state for IXFR/AXFR */ + uint64_t xfrfilenumber; /* identifier for file to store xfr into, + valid if msg_seq_nr nonzero */ }; enum xfrd_packet_result { @@ -194,18 +216,24 @@ enum xfrd_packet_result { Note that also some sockets are used for writing the ixfr.db, xfrd.state files and for the pipes to the main parent process. */ -#define XFRD_MAX_TCP 50 /* max number of TCP AXFR/IXFR concurrent connections.*/ +#define XFRD_MAX_TCP 32 /* max number of TCP AXFR/IXFR concurrent connections.*/ /* Each entry has 64Kb buffer preallocated.*/ -#define XFRD_MAX_UDP 100 /* max number of UDP sockets at a time for IXFR */ -#define XFRD_MAX_UDP_NOTIFY 50 /* max concurrent UDP sockets for NOTIFY */ +#define XFRD_MAX_UDP 64 /* max number of UDP sockets at a time for IXFR */ +#define XFRD_MAX_UDP_NOTIFY 64 /* max concurrent UDP sockets for NOTIFY */ extern xfrd_state_t* xfrd; /* start xfrd, new start. Pass socket to server_main. */ -void xfrd_init(int socket, struct nsd* nsd); +void xfrd_init(int socket, struct nsd* nsd, int shortsoa, int reload_active); + +/* add new slave zone, dname(from zone_opt) and given options */ +void xfrd_init_slave_zone(xfrd_state_t* xfrd, zone_options_t* zone_opt); + +/* delete slave zone */ +void xfrd_del_slave_zone(xfrd_state_t* xfrd, const dname_type* dname); /* get the current time epoch. Cached for speed. */ -time_t xfrd_time(); +time_t xfrd_time(void); /* * Handle final received packet from network. @@ -220,6 +248,8 @@ void xfrd_set_timer(xfrd_zone_t* zone, time_t t); void xfrd_set_refresh_now(xfrd_zone_t* zone); /* unset the timer - no more timeouts, for when zone is queued */ void xfrd_unset_timer(xfrd_zone_t* zone); +/* remove the 'refresh now', remove it from the activated list */ +void xfrd_deactivate_zone(xfrd_zone_t* z); /* * Make a new request to next master server. @@ -249,7 +279,7 @@ void xfrd_udp_release(xfrd_zone_t* zone); /* * Get a static buffer for temporary use (to build a packet). */ -struct buffer* xfrd_get_temp_buffer(); +struct buffer* xfrd_get_temp_buffer(void); /* * TSIG sign outgoing request. Call if acl has a key. 
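
The xfrd.h changes above move the transfer daemon off NSD's own netio handlers (netio_handler_type, struct timespec) and onto libevent (struct event, struct timeval, an event_base in xfrd_state), with per-zone bookkeeping such as zone_handler, zone_handler_flags and event_added, and a callback of the shape xfrd_handle_zone(int fd, short event, void* arg). Below is a minimal sketch of how such a zone timer can be armed with the <event.h> API that the header now includes; the demo_* names and the exact rearm logic are illustrative only and are not taken from the NSD sources.

#include <sys/time.h>
#include <event.h>

/* Illustrative stand-in for struct xfrd_zone; the real struct also carries
 * masters, waiting-list links and transfer state. */
struct demo_zone {
	struct event zone_handler;	/* timeout event, as in struct xfrd_zone */
	int event_added;		/* mirrors the event_added flag above */
};

/* Callback with the (int, short, void*) shape used by xfrd_handle_zone(). */
static void
demo_zone_timeout(int fd, short event, void* arg)
{
	struct demo_zone* zone = (struct demo_zone*)arg;
	(void)fd; (void)event;
	zone->event_added = 0;
	/* a real handler would decide to refresh, retry or expire the zone */
}

/* (Re)arm the zone timer on the given event_base for 'secs' seconds. */
static void
demo_set_timer(struct event_base* base, struct demo_zone* zone, time_t secs)
{
	struct timeval tv;
	tv.tv_sec = secs;
	tv.tv_usec = 0;
	if(zone->event_added)
		event_del(&zone->zone_handler);
	event_set(&zone->zone_handler, -1, EV_TIMEOUT, demo_zone_timeout, zone);
	event_base_set(base, &zone->zone_handler);
	event_add(&zone->zone_handler, &tv);
	zone->event_added = 1;
}

The old netio_handler_type carried its timeout inside the handler itself; with libevent the timeout is handed to event_add(), which is presumably why struct xfrd_zone now keeps a struct timeval plus an event_added flag instead of a timespec.
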
@@ -268,27 +298,41 @@ void xfrd_handle_incoming_soa(xfrd_zone_t* zone, xfrd_soa_t* soa, void xfrd_handle_passed_packet(buffer_type* packet, int acl_num, int acl_xfr); -/* send expiry notify for all zones to nsd (sets all dirty). */ -void xfrd_send_expy_all_zones(); - /* try to reopen the logfile. */ -void xfrd_reopen_logfile(); +void xfrd_reopen_logfile(void); + +/* free namedb for xfrd usage */ +void xfrd_free_namedb(struct nsd* nsd); /* copy SOA info from rr to soa struct. */ void xfrd_copy_soa(xfrd_soa_t* soa, rr_type* rr); /* check for failed updates - it is assumed that now the reload has finished, and all zone SOAs have been sent. */ -void xfrd_check_failed_updates(); +void xfrd_check_failed_updates(void); /* * Prepare zones for a reload, this sets the times on the zones to be * before the current time, so the reload happens after. */ -void xfrd_prepare_zones_for_reload(); +void xfrd_prepare_zones_for_reload(void); /* Bind a local interface to a socket descriptor, return 1 on success */ int xfrd_bind_local_interface(int sockd, acl_options_t* ifc, acl_options_t* acl, int tcp); +/* process results and soa info from reload */ +void xfrd_process_task_result(xfrd_state_t* xfrd, struct udb_base* taskudb); + +/* set to reload right away (for user controlled reload events) */ +void xfrd_set_reload_now(xfrd_state_t* xfrd); + +/* handle incoming notify (soa or NULL) and start zone xfr if necessary */ +void xfrd_handle_notify_and_start_xfr(xfrd_zone_t* zone, xfrd_soa_t* soa); + +/* handle zone timeout, event */ +void xfrd_handle_zone(int fd, short event, void* arg); + +const char* xfrd_pretty_time(time_t v); + #endif /* XFRD_H */ diff --git a/usr.sbin/nsd/zlexer.lex b/usr.sbin/nsd/zlexer.lex index 666a4ba3135..bcb5661ab80 100644 --- a/usr.sbin/nsd/zlexer.lex +++ b/usr.sbin/nsd/zlexer.lex @@ -2,7 +2,7 @@ /* * zlexer.lex - lexical analyzer for (DNS) zone files * - * Copyright (c) 2001-2011, NLnet Labs. All rights reserved. + * Copyright (c) 2001-2006, NLnet Labs. All rights reserved * * See LICENSE for the license. * @@ -19,8 +19,6 @@ #include "dname.h" #include "zparser.h" -#define YY_NO_UNPUT - #if 0 #define LEXOUT(s) printf s /* used ONLY when debugging */ #else @@ -68,6 +66,23 @@ pop_parser_state(void) yy_switch_to_buffer(include_stack[include_stack_ptr]); } +static YY_BUFFER_STATE oldstate; +/* Start string scan */ +void +parser_push_stringbuf(char* str) +{ + oldstate = YY_CURRENT_BUFFER; + yy_switch_to_buffer(yy_scan_string(str)); +} + +void +parser_pop_stringbuf(void) +{ + yy_delete_buffer(YY_CURRENT_BUFFER); + yy_switch_to_buffer(oldstate); + oldstate = NULL; +} + #ifndef yy_set_bol /* compat definition, for flex 2.4.6 */ #define yy_set_bol(at_bol) \ { \ @@ -78,6 +93,16 @@ pop_parser_state(void) #endif %} +%option noinput +%option nounput +%{ +#ifndef YY_NO_UNPUT +#define YY_NO_UNPUT 1 +#endif +#ifndef YY_NO_INPUT +#define YY_NO_INPUT 1 +#endif +%} SPACE [ \t] LETTER [a-zA-Z] @@ -244,6 +269,8 @@ ANY [^\"\n\\]|\\. <bitlabel><<EOF>> { zc_error("EOF inside bitlabel"); BEGIN(INITIAL); + yyrestart(yyin); /* this is so that lex does not give an internal err */ + yyterminate(); } <bitlabel>{BIT}* { yymore(); } <bitlabel>\n { ++parser->line; yymore(); } @@ -258,6 +285,8 @@ ANY [^\"\n\\]|\\. <quotedstring><<EOF>> { zc_error("EOF inside quoted string"); BEGIN(INITIAL); + yyrestart(yyin); /* this is so that lex does not give an internal err */ + yyterminate(); } <quotedstring>{ANY}* { LEXOUT(("STR ")); yymore(); } <quotedstring>\n { ++parser->line; yymore(); } |
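
The xfrd_tcp_pipeline structure added in xfrd-tcp.h earlier in this import deserves a restatement: each open TCP connection to a master owns the full 16-bit query ID space (ID_PIPE_NUM), free IDs sit in the unused array, an in-flight ID points at its zone in id[], and an ID whose zone is finished but whose remaining answer packets have not yet been drained is parked as TCP_NULL_SKIP so stray packets can be discarded by ID. The following is a minimal sketch of that bookkeeping under those assumptions; the demo_* and pipe_id_* names are invented for illustration, and the real code lives in xfrd-tcp.c.

#include <stddef.h>
#include <stdint.h>

#define ID_PIPE_NUM	65536
#define TCP_NULL_SKIP	((struct demo_zone*)-1)

struct demo_zone;	/* stands in for struct xfrd_zone */

struct demo_pipeline {
	int num_unused;			/* free IDs left on this connection */
	int num_skip;			/* IDs parked as TCP_NULL_SKIP */
	struct demo_zone* id[ID_PIPE_NUM];	/* owner per ID, NULL = free */
	uint16_t unused[ID_PIPE_NUM];	/* pool of free ID values */
};

/* Start with every ID free and unowned. */
static void
pipe_init(struct demo_pipeline* tp)
{
	int i;
	tp->num_unused = ID_PIPE_NUM;
	tp->num_skip = 0;
	for(i = 0; i < ID_PIPE_NUM; i++) {
		tp->id[i] = NULL;
		tp->unused[i] = (uint16_t)i;
	}
}

/* Hand out a fresh query ID to a zone; returns -1 when the pipe is full. */
static int
pipe_id_new(struct demo_pipeline* tp, struct demo_zone* zone)
{
	uint16_t qid;
	if(tp->num_unused == 0)
		return -1;
	qid = tp->unused[--tp->num_unused];	/* take from the free pool */
	tp->id[qid] = zone;
	return (int)qid;
}

/* Release an ID: back to the free pool if its answers are fully drained,
 * otherwise mark it skipped so late answer packets with this ID are ignored. */
static void
pipe_id_free(struct demo_pipeline* tp, uint16_t qid, int drained)
{
	if(drained) {
		tp->id[qid] = NULL;
		tp->unused[tp->num_unused++] = qid;
	} else {
		tp->id[qid] = TCP_NULL_SKIP;
		tp->num_skip++;
	}
}

Keeping a per-connection count of free IDs is also what makes the pipetree lookup described in the header work: for a given master address, a smaller-or-equal search finds the pipeline with the most spare IDs, so new transfers pile onto the least loaded connection.
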