/* * server.c -- nsd(8) network input/output * * Copyright (c) 2001-2006, NLnet Labs. All rights reserved. * * See LICENSE for the license. * */ #include "config.h" #include #include #include #include #include #include #include #ifdef USE_TCP_FASTOPEN #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HAVE_SYS_RANDOM_H #include #endif #ifndef SHUT_WR #define SHUT_WR 1 #endif #ifdef HAVE_MMAP #include #endif /* HAVE_MMAP */ #ifdef HAVE_OPENSSL_RAND_H #include #endif #ifdef HAVE_OPENSSL_SSL_H #include #endif #ifdef HAVE_OPENSSL_ERR_H #include #endif #ifdef HAVE_OPENSSL_OCSP_H #include #endif #ifndef USE_MINI_EVENT # ifdef HAVE_EVENT_H # include # else # include # include "event2/event_struct.h" # include "event2/event_compat.h" # endif #else # include "mini_event.h" #endif #include "axfr.h" #include "namedb.h" #include "netio.h" #include "xfrd.h" #include "xfrd-tcp.h" #include "xfrd-disk.h" #include "difffile.h" #include "nsec3.h" #include "ipc.h" #include "udb.h" #include "remote.h" #include "lookup3.h" #include "rrl.h" #ifdef USE_DNSTAP #include "dnstap/dnstap_collector.h" #endif #define RELOAD_SYNC_TIMEOUT 25 /* seconds */ #ifdef USE_TCP_FASTOPEN #define TCP_FASTOPEN_FILE "/proc/sys/net/ipv4/tcp_fastopen" #define TCP_FASTOPEN_SERVER_BIT_MASK 0x2 #endif /* * Data for the UDP handlers. */ struct udp_handler_data { struct nsd *nsd; struct nsd_socket *socket; struct event event; }; struct tcp_accept_handler_data { struct nsd *nsd; struct nsd_socket *socket; int event_added; struct event event; #ifdef HAVE_SSL /* handler accepts TLS connections on the dedicated port */ int tls_accept; #endif }; /* * These globals are used to enable the TCP accept handlers * when the number of TCP connection drops below the maximum * number of TCP connections. 
*/ static size_t tcp_accept_handler_count; static struct tcp_accept_handler_data *tcp_accept_handlers; static struct event slowaccept_event; static int slowaccept; #ifdef HAVE_SSL static unsigned char *ocspdata = NULL; static long ocspdata_len = 0; #endif #ifdef NONBLOCKING_IS_BROKEN /* Define NUM_RECV_PER_SELECT to 1 (one) to avoid opportunistically trying to read multiple times from a socket when reported ready by select. */ # define NUM_RECV_PER_SELECT (1) #else /* !NONBLOCKING_IS_BROKEN */ # define NUM_RECV_PER_SELECT (100) #endif /* NONBLOCKING_IS_BROKEN */ #ifndef HAVE_MMSGHDR struct mmsghdr { struct msghdr msg_hdr; unsigned int msg_len; }; #endif static struct mmsghdr msgs[NUM_RECV_PER_SELECT]; static struct iovec iovecs[NUM_RECV_PER_SELECT]; static struct query *queries[NUM_RECV_PER_SELECT]; /* * Data for the TCP connection handlers. * * The TCP handlers use non-blocking I/O. This is necessary to avoid * blocking the entire server on a slow TCP connection, but does make * reading from and writing to the socket more complicated. * * Basically, whenever a read/write would block (indicated by the * EAGAIN errno variable) we remember the position we were reading * from/writing to and return from the TCP reading/writing event * handler. When the socket becomes readable/writable again we * continue from the same position. */ struct tcp_handler_data { /* * The region used to allocate all TCP connection related * data, including this structure. This region is destroyed * when the connection is closed. */ region_type* region; /* * The global nsd structure. */ struct nsd* nsd; /* * The current query data for this TCP connection. */ query_type* query; /* * The query_state is used to remember if we are performing an * AXFR, if we're done processing, or if we should discard the * query and connection. 
*/ query_state_type query_state; /* * The event for the file descriptor and tcp timeout */ struct event event; /* * The bytes_transmitted field is used to remember the number * of bytes transmitted when receiving or sending a DNS * packet. The count includes the two additional bytes used * to specify the packet length on a TCP connection. */ size_t bytes_transmitted; /* * The number of queries handled by this specific TCP connection. */ int query_count; /* * The timeout in msec for this tcp connection */ int tcp_timeout; #ifdef HAVE_SSL /* * TLS object. */ SSL* tls; /* * TLS handshake state. */ enum { tls_hs_none, tls_hs_read, tls_hs_write, tls_hs_read_event, tls_hs_write_event } shake_state; #endif /* list of connections, for service of remaining tcp channels */ struct tcp_handler_data *prev, *next; }; /* global that is the list of active tcp channels */ static struct tcp_handler_data *tcp_active_list = NULL; /* * Handle incoming queries on the UDP server sockets. */ static void handle_udp(int fd, short event, void* arg); /* * Handle incoming connections on the TCP sockets. These handlers * usually wait for the NETIO_EVENT_READ event (indicating an incoming * connection) but are disabled when the number of current TCP * connections is equal to the maximum number of TCP connections. * Disabling is done by changing the handler to wait for the * NETIO_EVENT_NONE type. This is done using the function * configure_tcp_accept_handlers. */ static void handle_tcp_accept(int fd, short event, void* arg); /* * Handle incoming queries on a TCP connection. The TCP connections * are configured to be non-blocking and the handler may be called * multiple times before a complete query is received. */ static void handle_tcp_reading(int fd, short event, void* arg); /* * Handle outgoing responses on a TCP connection. The TCP connections * are configured to be non-blocking and the handler may be called * multiple times before a complete response is sent. 
*/ static void handle_tcp_writing(int fd, short event, void* arg); #ifdef HAVE_SSL /* Create SSL object and associate fd */ static SSL* incoming_ssl_fd(SSL_CTX* ctx, int fd); /* * Handle TLS handshake. May be called multiple times if incomplete. */ static int tls_handshake(struct tcp_handler_data* data, int fd, int writing); /* * Handle incoming queries on a TLS over TCP connection. The TLS * connections are configured to be non-blocking and the handler may * be called multiple times before a complete query is received. */ static void handle_tls_reading(int fd, short event, void* arg); /* * Handle outgoing responses on a TLS over TCP connection. The TLS * connections are configured to be non-blocking and the handler may * be called multiple times before a complete response is sent. */ static void handle_tls_writing(int fd, short event, void* arg); #endif /* * Send all children the quit nonblocking, then close pipe. */ static void send_children_quit(struct nsd* nsd); /* same, for shutdown time, waits for child to exit to avoid restart issues */ static void send_children_quit_and_wait(struct nsd* nsd); /* set childrens flags to send NSD_STATS to them */ #ifdef BIND8_STATS static void set_children_stats(struct nsd* nsd); #endif /* BIND8_STATS */ /* * Change the event types the HANDLERS are interested in to EVENT_TYPES. 
*/ static void configure_handler_event_types(short event_types); static uint16_t *compressed_dname_offsets = 0; static uint32_t compression_table_capacity = 0; static uint32_t compression_table_size = 0; static domain_type* compressed_dnames[MAXRRSPP]; #ifdef USE_TCP_FASTOPEN /* Checks to see if the kernel value must be manually changed in order for TCP Fast Open to support server mode */ static void report_tcp_fastopen_config() { int tcp_fastopen_fp; uint8_t tcp_fastopen_value; if ( (tcp_fastopen_fp = open(TCP_FASTOPEN_FILE, O_RDONLY)) == -1 ) { log_msg(LOG_INFO,"Error opening " TCP_FASTOPEN_FILE ": %s\n", strerror(errno)); } if (read(tcp_fastopen_fp, &tcp_fastopen_value, 1) == -1 ) { log_msg(LOG_INFO,"Error reading " TCP_FASTOPEN_FILE ": %s\n", strerror(errno)); close(tcp_fastopen_fp); } if (!(tcp_fastopen_value & TCP_FASTOPEN_SERVER_BIT_MASK)) { log_msg(LOG_WARNING, "Error: TCP Fast Open support is available and configured in NSD by default.\n"); log_msg(LOG_WARNING, "However the kernel paramenters are not configured to support TCP_FASTOPEN in server mode.\n"); log_msg(LOG_WARNING, "To enable TFO use the command:"); log_msg(LOG_WARNING, " 'sudo sysctl -w net.ipv4.tcp_fastopen=2' for pure server mode or\n"); log_msg(LOG_WARNING, " 'sudo sysctl -w net.ipv4.tcp_fastopen=3' for both client and server mode\n"); log_msg(LOG_WARNING, "NSD will not have TCP Fast Open available until this change is made.\n"); close(tcp_fastopen_fp); } close(tcp_fastopen_fp); } #endif /* * Remove the specified pid from the list of child pids. Returns -1 if * the pid is not in the list, child_num otherwise. The field is set to 0. 
 */
static int
delete_child_pid(struct nsd *nsd, pid_t pid)
{
	size_t i;
	/* linear search over the child table for the matching pid */
	for (i = 0; i < nsd->child_count; ++i) {
		if (nsd->children[i].pid == pid) {
			nsd->children[i].pid = 0;
			/* Unless the child is flagged need_to_exit, close
			 * the IPC descriptor held for it here and invalidate
			 * the fd stored in its netio handler. */
			if(!nsd->children[i].need_to_exit) {
				if(nsd->children[i].child_fd != -1)
					close(nsd->children[i].child_fd);
				nsd->children[i].child_fd = -1;
				if(nsd->children[i].handler)
					nsd->children[i].handler->fd = -1;
			}
			/* index of the child in nsd->children */
			return i;
		}
	}
	return -1;
}

/*
 * Restart child servers if necessary.
 *
 * Every slot in nsd->children whose pid is <= 0 gets a fresh socketpair
 * and a newly forked process.  Returns 0 when all slots were handled and
 * -1 as soon as socketpair() or fork() fails (the error is logged).
 */
static int
restart_child_servers(struct nsd *nsd, region_type* region, netio_type* netio,
	int* xfrd_sock_p)
{
	struct main_ipc_handler_data *ipc_data;
	size_t i;
	int sv[2];

	/* Fork the child processes... */
	for (i = 0; i < nsd->child_count; ++i) {
		if (nsd->children[i].pid <= 0) {
			/* drop the descriptor left over from the previous
			 * incarnation of this child, if any */
			if (nsd->children[i].child_fd != -1)
				close(nsd->children[i].child_fd);
			if (socketpair(AF_UNIX,
				SOCK_STREAM, 0, sv) == -1) {
				log_msg(LOG_ERR, "socketpair: %s",
					strerror(errno));
				return -1;
			}
			/* sv[0] is kept by this process, sv[1] by the
			 * forked child (see the switch below) */
			nsd->children[i].child_fd = sv[0];
			nsd->children[i].parent_fd = sv[1];
			nsd->children[i].pid = fork();
			switch (nsd->children[i].pid) {
			default: /* SERVER MAIN */
				close(nsd->children[i].parent_fd);
				nsd->children[i].parent_fd = -1;
				if (fcntl(nsd->children[i].child_fd, F_SETFL, O_NONBLOCK) == -1) {
					log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno));
				}
				/* first start of this slot: allocate the IPC
				 * state and the netio handler from region */
				if(!nsd->children[i].handler)
				{
					ipc_data = (struct main_ipc_handler_data*) region_alloc(
						region, sizeof(struct main_ipc_handler_data));
					ipc_data->nsd = nsd;
					ipc_data->child = &nsd->children[i];
					ipc_data->child_num = i;
					ipc_data->xfrd_sock = xfrd_sock_p;
					ipc_data->packet = buffer_create(region, QIOBUFSZ);
					ipc_data->forward_mode = 0;
					ipc_data->got_bytes = 0;
					ipc_data->total_bytes = 0;
					ipc_data->acl_num = 0;
					nsd->children[i].handler = (struct netio_handler*) region_alloc(
						region, sizeof(struct netio_handler));
					nsd->children[i].handler->fd = nsd->children[i].child_fd;
					nsd->children[i].handler->timeout = NULL;
					nsd->children[i].handler->user_data = ipc_data;
					nsd->children[i].handler->event_types = NETIO_EVENT_READ;
nsd->children[i].handler->event_handler = parent_handle_child_command; netio_add_handler(netio, nsd->children[i].handler); } /* clear any ongoing ipc */ ipc_data = (struct main_ipc_handler_data*) nsd->children[i].handler->user_data; ipc_data->forward_mode = 0; /* restart - update fd */ nsd->children[i].handler->fd = nsd->children[i].child_fd; break; case 0: /* CHILD */ /* the child need not be able to access the * nsd.db file */ namedb_close_udb(nsd->db); #ifdef MEMCLEAN /* OS collects memory pages */ region_destroy(region); #endif if (pledge("stdio rpath inet", NULL) == -1) { log_msg(LOG_ERR, "pledge"); exit(1); } nsd->pid = 0; nsd->child_count = 0; nsd->server_kind = nsd->children[i].kind; nsd->this_child = &nsd->children[i]; nsd->this_child->child_num = i; /* remove signal flags inherited from parent the parent will handle them. */ nsd->signal_hint_reload_hup = 0; nsd->signal_hint_reload = 0; nsd->signal_hint_child = 0; nsd->signal_hint_quit = 0; nsd->signal_hint_shutdown = 0; nsd->signal_hint_stats = 0; nsd->signal_hint_statsusr = 0; close(*xfrd_sock_p); close(nsd->this_child->child_fd); nsd->this_child->child_fd = -1; if (fcntl(nsd->this_child->parent_fd, F_SETFL, O_NONBLOCK) == -1) { log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno)); } server_child(nsd); /* NOTREACH */ exit(0); case -1: log_msg(LOG_ERR, "fork failed: %s", strerror(errno)); return -1; } } } return 0; } #ifdef BIND8_STATS static void set_bind8_alarm(struct nsd* nsd) { /* resync so that the next alarm is on the next whole minute */ if(nsd->st.period > 0) /* % by 0 gives divbyzero error */ alarm(nsd->st.period - (time(NULL) % nsd->st.period)); } #endif /* set zone stat ids for zones initially read in */ static void zonestatid_tree_set(struct nsd* nsd) { struct radnode* n; for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) { zone_type* zone = (zone_type*)n->elem; zone->zonestatid = getzonestatid(nsd->options, zone->opts); } } #ifdef USE_ZONE_STATS void server_zonestat_alloc(struct 
nsd* nsd) { size_t num = (nsd->options->zonestatnames->count==0?1: nsd->options->zonestatnames->count); size_t sz = sizeof(struct nsdst)*num; char tmpfile[256]; uint8_t z = 0; /* file names */ nsd->zonestatfname[0] = 0; nsd->zonestatfname[1] = 0; snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.0", nsd->options->xfrdir, (int)getpid(), (unsigned)getpid()); nsd->zonestatfname[0] = region_strdup(nsd->region, tmpfile); snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.zstat.1", nsd->options->xfrdir, (int)getpid(), (unsigned)getpid()); nsd->zonestatfname[1] = region_strdup(nsd->region, tmpfile); /* file descriptors */ nsd->zonestatfd[0] = open(nsd->zonestatfname[0], O_CREAT|O_RDWR, 0600); if(nsd->zonestatfd[0] == -1) { log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[0], strerror(errno)); exit(1); } nsd->zonestatfd[1] = open(nsd->zonestatfname[1], O_CREAT|O_RDWR, 0600); if(nsd->zonestatfd[0] == -1) { log_msg(LOG_ERR, "cannot create %s: %s", nsd->zonestatfname[1], strerror(errno)); close(nsd->zonestatfd[0]); unlink(nsd->zonestatfname[0]); exit(1); } #ifdef HAVE_MMAP if(lseek(nsd->zonestatfd[0], (off_t)sz-1, SEEK_SET) == -1) { log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[0], strerror(errno)); exit(1); } if(write(nsd->zonestatfd[0], &z, 1) == -1) { log_msg(LOG_ERR, "cannot extend stat file %s (%s)", nsd->zonestatfname[0], strerror(errno)); exit(1); } if(lseek(nsd->zonestatfd[1], (off_t)sz-1, SEEK_SET) == -1) { log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[1], strerror(errno)); exit(1); } if(write(nsd->zonestatfd[1], &z, 1) == -1) { log_msg(LOG_ERR, "cannot extend stat file %s (%s)", nsd->zonestatfname[1], strerror(errno)); exit(1); } nsd->zonestat[0] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE, MAP_SHARED, nsd->zonestatfd[0], 0); if(nsd->zonestat[0] == MAP_FAILED) { log_msg(LOG_ERR, "mmap failed: %s", strerror(errno)); unlink(nsd->zonestatfname[0]); unlink(nsd->zonestatfname[1]); exit(1); } nsd->zonestat[1] = (struct 
nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE, MAP_SHARED, nsd->zonestatfd[1], 0); if(nsd->zonestat[1] == MAP_FAILED) { log_msg(LOG_ERR, "mmap failed: %s", strerror(errno)); unlink(nsd->zonestatfname[0]); unlink(nsd->zonestatfname[1]); exit(1); } memset(nsd->zonestat[0], 0, sz); memset(nsd->zonestat[1], 0, sz); nsd->zonestatsize[0] = num; nsd->zonestatsize[1] = num; nsd->zonestatdesired = num; nsd->zonestatsizenow = num; nsd->zonestatnow = nsd->zonestat[0]; #endif /* HAVE_MMAP */ } void zonestat_remap(struct nsd* nsd, int idx, size_t sz) { #ifdef HAVE_MMAP #ifdef MREMAP_MAYMOVE nsd->zonestat[idx] = (struct nsdst*)mremap(nsd->zonestat[idx], sizeof(struct nsdst)*nsd->zonestatsize[idx], sz, MREMAP_MAYMOVE); if(nsd->zonestat[idx] == MAP_FAILED) { log_msg(LOG_ERR, "mremap failed: %s", strerror(errno)); exit(1); } #else /* !HAVE MREMAP */ if(msync(nsd->zonestat[idx], sizeof(struct nsdst)*nsd->zonestatsize[idx], MS_ASYNC) != 0) log_msg(LOG_ERR, "msync failed: %s", strerror(errno)); if(munmap(nsd->zonestat[idx], sizeof(struct nsdst)*nsd->zonestatsize[idx]) != 0) log_msg(LOG_ERR, "munmap failed: %s", strerror(errno)); nsd->zonestat[idx] = (struct nsdst*)mmap(NULL, sz, PROT_READ|PROT_WRITE, MAP_SHARED, nsd->zonestatfd[idx], 0); if(nsd->zonestat[idx] == MAP_FAILED) { log_msg(LOG_ERR, "mmap failed: %s", strerror(errno)); exit(1); } #endif /* MREMAP */ #endif /* HAVE_MMAP */ } /* realloc the zonestat array for the one that is not currently in use, * to match the desired new size of the array (if applicable) */ void server_zonestat_realloc(struct nsd* nsd) { #ifdef HAVE_MMAP uint8_t z = 0; size_t sz; int idx = 0; /* index of the zonestat array that is not in use */ if(nsd->zonestatnow == nsd->zonestat[0]) idx = 1; if(nsd->zonestatsize[idx] == nsd->zonestatdesired) return; sz = sizeof(struct nsdst)*nsd->zonestatdesired; if(lseek(nsd->zonestatfd[idx], (off_t)sz-1, SEEK_SET) == -1) { log_msg(LOG_ERR, "lseek %s: %s", nsd->zonestatfname[idx], strerror(errno)); exit(1); } 
if(write(nsd->zonestatfd[idx], &z, 1) == -1) { log_msg(LOG_ERR, "cannot extend stat file %s (%s)", nsd->zonestatfname[idx], strerror(errno)); exit(1); } zonestat_remap(nsd, idx, sz); /* zero the newly allocated region */ if(nsd->zonestatdesired > nsd->zonestatsize[idx]) { memset(((char*)nsd->zonestat[idx])+sizeof(struct nsdst) * nsd->zonestatsize[idx], 0, sizeof(struct nsdst) * (nsd->zonestatdesired - nsd->zonestatsize[idx])); } nsd->zonestatsize[idx] = nsd->zonestatdesired; #endif /* HAVE_MMAP */ } /* switchover to use the other array for the new children, that * briefly coexist with the old children. And we want to avoid them * both writing to the same statistics arrays. */ void server_zonestat_switch(struct nsd* nsd) { if(nsd->zonestatnow == nsd->zonestat[0]) { nsd->zonestatnow = nsd->zonestat[1]; nsd->zonestatsizenow = nsd->zonestatsize[1]; } else { nsd->zonestatnow = nsd->zonestat[0]; nsd->zonestatsizenow = nsd->zonestatsize[0]; } } #endif /* USE_ZONE_STATS */ static void cleanup_dname_compression_tables(void *ptr) { free(ptr); compressed_dname_offsets = NULL; compression_table_capacity = 0; } static void initialize_dname_compression_tables(struct nsd *nsd) { size_t needed = domain_table_count(nsd->db->domains) + 1; needed += EXTRA_DOMAIN_NUMBERS; if(compression_table_capacity < needed) { if(compressed_dname_offsets) { region_remove_cleanup(nsd->db->region, cleanup_dname_compression_tables, compressed_dname_offsets); free(compressed_dname_offsets); } compressed_dname_offsets = (uint16_t *) xmallocarray( needed, sizeof(uint16_t)); region_add_cleanup(nsd->db->region, cleanup_dname_compression_tables, compressed_dname_offsets); compression_table_capacity = needed; compression_table_size=domain_table_count(nsd->db->domains)+1; } memset(compressed_dname_offsets, 0, needed * sizeof(uint16_t)); compressed_dname_offsets[0] = QHEADERSZ; /* The original query name */ } static int set_cloexec(struct nsd_socket *sock) { assert(sock != NULL); if(fcntl(sock->s, F_SETFD, 
FD_CLOEXEC) == -1) { const char *socktype = sock->addr.ai_family == SOCK_DGRAM ? "udp" : "tcp"; log_msg(LOG_ERR, "fcntl(..., O_CLOEXEC) failed for %s: %s", socktype, strerror(errno)); return -1; } return 1; } static int set_reuseport(struct nsd_socket *sock) { #ifdef SO_REUSEPORT int on = 1; #ifdef SO_REUSEPORT_LB /* FreeBSD 12 has SO_REUSEPORT_LB that does load balancing like * SO_REUSEPORT on Linux. This is what the users want with the config * option in nsd.conf; if we actually need local address and port reuse * they'll also need to have SO_REUSEPORT set for them, assume it was * _LB they want. */ int opt = SO_REUSEPORT_LB; static const char optname[] = "SO_REUSEPORT_LB"; #else /* !SO_REUSEPORT_LB */ int opt = SO_REUSEPORT; static const char optname[] = "SO_REUSEPORT"; #endif /* SO_REUSEPORT_LB */ if (0 == setsockopt(sock->s, SOL_SOCKET, opt, &on, sizeof(on))) { return 1; } else if(verbosity >= 3 || errno != ENOPROTOOPT) { log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed: %s", optname, strerror(errno)); } return -1; #else (void)sock; #endif /* SO_REUSEPORT */ return 0; } static int set_reuseaddr(struct nsd_socket *sock) { #ifdef SO_REUSEADDR int on = 1; if(setsockopt(sock->s, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == 0) { return 1; } log_msg(LOG_ERR, "setsockopt(..., SO_REUSEADDR, ...) failed: %s", strerror(errno)); return -1; #endif /* SO_REUSEADDR */ return 0; } static int set_rcvbuf(struct nsd_socket *sock, int rcv) { #ifdef SO_RCVBUF #ifdef SO_RCVBUFFORCE if(0 == setsockopt( sock->s, SOL_SOCKET, SO_RCVBUFFORCE, &rcv, sizeof(rcv))) { return 1; } if(errno == EPERM || errno == ENOBUFS) { return 0; } log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUFFORCE, ...) failed: %s", strerror(errno)); return -1; #else /* !SO_RCVBUFFORCE */ if (0 == setsockopt( sock->s, SOL_SOCKET, SO_RCVBUF, &rcv, sizeof(rcv))) { return 1; } if(errno == ENOSYS || errno == ENOBUFS) { return 0; } log_msg(LOG_ERR, "setsockopt(..., SO_RCVBUF, ...) 
failed: %s", strerror(errno)); return -1; #endif /* SO_RCVBUFFORCE */ #endif /* SO_RCVBUF */ return 0; } static int set_sndbuf(struct nsd_socket *sock, int snd) { #ifdef SO_SNDBUF #ifdef SO_SNDBUFFORCE if(0 == setsockopt( sock->s, SOL_SOCKET, SO_SNDBUFFORCE, &snd, sizeof(snd))) { return 1; } if(errno == EPERM || errno == ENOBUFS) { return 0; } log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUFFORCE, ...) failed: %s", strerror(errno)); return -1; #else /* !SO_SNDBUFFORCE */ if(0 == setsockopt( sock->s, SOL_SOCKET, SO_SNDBUF, &snd, sizeof(snd))) { return 1; } if(errno == ENOSYS || errno == ENOBUFS) { return 0; } log_msg(LOG_ERR, "setsockopt(..., SO_SNDBUF, ...) failed: %s", strerror(errno)); return -1; #endif /* SO_SNDBUFFORCE */ #endif /* SO_SNDBUF */ return 0; } static int set_nonblock(struct nsd_socket *sock) { const char *socktype = sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp"; if(fcntl(sock->s, F_SETFL, O_NONBLOCK) == -1) { log_msg(LOG_ERR, "fctnl(..., O_NONBLOCK) failed for %s: %s", socktype, strerror(errno)); return -1; } return 1; } static int set_ipv6_v6only(struct nsd_socket *sock) { #ifdef INET6 #ifdef IPV6_V6ONLY int on = 1; const char *socktype = sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp"; if(0 == setsockopt( sock->s, IPPROTO_IPV6, IPV6_V6ONLY, &on, sizeof(on))) { return 1; } log_msg(LOG_ERR, "setsockopt(..., IPV6_V6ONLY, ...) failed for %s: %s", socktype, strerror(errno)); return -1; #endif /* IPV6_V6ONLY */ #endif /* INET6 */ return 0; } static int set_ipv6_use_min_mtu(struct nsd_socket *sock) { #if defined(INET6) && (defined(IPV6_USE_MIN_MTU) || defined(IPV6_MTU)) #if defined(IPV6_USE_MIN_MTU) /* There is no fragmentation of IPv6 datagrams during forwarding in the * network. Therefore we do not send UDP datagrams larger than the * minimum IPv6 MTU of 1280 octets. The EDNS0 message length can be * larger if the network stack supports IPV6_USE_MIN_MTU. 
 */
	int opt = IPV6_USE_MIN_MTU;
	int optval = 1;
	static const char optname[] = "IPV6_USE_MIN_MTU";
#elif defined(IPV6_MTU)
	/* On Linux, PMTUD is disabled by default for datagrams so set the MTU
	 * to the MIN MTU to get the same.
	 */
	int opt = IPV6_MTU;
	int optval = IPV6_MIN_MTU;
	static const char optname[] = "IPV6_MTU";
#endif
	/* returns 1 on success, -1 (logged) on failure */
	if(0 == setsockopt(
		sock->s, IPPROTO_IPV6, opt, &optval, sizeof(optval)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed: %s",
		optname, strerror(errno));
	return -1;
#else
	(void)sock;
#endif /* INET6 */

	/* neither option is available at compile time */
	return 0;
}

/*
 * Configure path MTU discovery behaviour for an IPv4 socket, trying the
 * available options in order of preference.  Returns 1 as soon as one
 * setsockopt() call succeeds, -1 when every attempted option failed (each
 * failure is logged) and 0 when no suitable option exists at compile time.
 */
static int
set_ipv4_no_pmtu_disc(struct nsd_socket *sock)
{
	int ret = 0;

#if defined(IP_MTU_DISCOVER)
	int opt = IP_MTU_DISCOVER;
	int optval;
# if defined(IP_PMTUDISC_OMIT)
	/* Linux 3.15 has IP_PMTUDISC_OMIT which makes sockets ignore PMTU
	 * information and send packets with DF=0. Fragmentation is allowed if
	 * and only if the packet size exceeds the outgoing interface MTU or
	 * the packet encounters smaller MTU link in network. This mitigates
	 * DNS fragmentation attacks by preventing forged PMTU information.
	 * FreeBSD already has same semantics without setting the option.
	 */
	optval = IP_PMTUDISC_OMIT;
	if(0 == setsockopt(
		sock->s, IPPROTO_IP, opt, &optval, sizeof(optval)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
		"IP_MTU_DISCOVER", "IP_PMTUDISC_OMIT", strerror(errno));
# endif /* IP_PMTUDISC_OMIT */
# if defined(IP_PMTUDISC_DONT)
	/* Use IP_PMTUDISC_DONT if IP_PMTUDISC_OMIT failed / undefined. */
	optval = IP_PMTUDISC_DONT;
	if(0 == setsockopt(
		sock->s, IPPROTO_IP, opt, &optval, sizeof(optval)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) failed: %s",
		"IP_MTU_DISCOVER", "IP_PMTUDISC_DONT", strerror(errno));
# endif
	/* reached only when no setsockopt() call above succeeded */
	ret = -1;
#elif defined(IP_DONTFRAG)
	int off = 0;
	if (0 == setsockopt(
		sock->s, IPPROTO_IP, IP_DONTFRAG, &off, sizeof(off)))
	{
		return 1;
	}

	log_msg(LOG_ERR, "setsockopt(..., IP_DONTFRAG, ...) failed: %s",
		strerror(errno));
	ret = -1;
#else
	(void)sock;
#endif

	return ret;
}

/*
 * Enable IP_FREEBIND on the socket.  Returns 1 on success, -1 (logged) on
 * failure and 0 when IP_FREEBIND is not defined.
 */
static int
set_ip_freebind(struct nsd_socket *sock)
{
#ifdef IP_FREEBIND
	int on = 1;
	const char *socktype =
		sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp";
	if(setsockopt(sock->s, IPPROTO_IP, IP_FREEBIND, &on, sizeof(on)) == 0)
	{
		return 1;
	}
	log_msg(LOG_ERR, "setsockopt(..., IP_FREEBIND, ...) failed for %s: %s",
		socktype, strerror(errno));
	return -1;
#else
	(void)sock;
#endif /* IP_FREEBIND */

	return 0;
}

static int
set_ip_transparent(struct nsd_socket *sock)
{
	/*
	The scandalous preprocessor blob here calls for some explanation :)
	POSIX does not specify an option to bind non-local IPs, so
	platforms developed several implementation-specific options,
	all set in the same way, but with different names.
	For additional complexity, some platforms manage this setting
	differently for different address families (IPv4 vs IPv6).
	This scandalous preprocessor blob below abstracts such variability
	in the way which leaves the C code as lean and clear as possible.
*/ #if defined(IP_TRANSPARENT) # define NSD_SOCKET_OPTION_TRANSPARENT IP_TRANSPARENT # define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL IPPROTO_IP # define NSD_SOCKET_OPTION_TRANSPARENT_NAME "IP_TRANSPARENT" // as of 2020-01, Linux does not support this on IPv6 programmatically #elif defined(SO_BINDANY) # define NSD_SOCKET_OPTION_TRANSPARENT SO_BINDANY # define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL SOL_SOCKET # define NSD_SOCKET_OPTION_TRANSPARENT_NAME "SO_BINDANY" #elif defined(IP_BINDANY) # define NSD_SOCKET_OPTION_TRANSPARENT IP_BINDANY # define NSD_SOCKET_OPTION_TRANSPARENT6 IPV6_BINDANY # define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL IPPROTO_IP # define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 IPPROTO_IPV6 # define NSD_SOCKET_OPTION_TRANSPARENT_NAME "IP_BINDANY" #endif #ifndef NSD_SOCKET_OPTION_TRANSPARENT (void)sock; #else # ifndef NSD_SOCKET_OPTION_TRANSPARENT6 # define NSD_SOCKET_OPTION_TRANSPARENT6 NSD_SOCKET_OPTION_TRANSPARENT # endif # ifndef NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 # define NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL # endif # ifndef NSD_SOCKET_OPTION_TRANSPARENT_NAME6 # define NSD_SOCKET_OPTION_TRANSPARENT_NAME6 NSD_SOCKET_OPTION_TRANSPARENT_NAME # endif int on = 1; const char *socktype = sock->addr.ai_socktype == SOCK_DGRAM ? "udp" : "tcp"; const int is_ip6 = (sock->addr.ai_family == AF_INET6); if(0 == setsockopt( sock->s, is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL6 : NSD_SOCKET_OPTION_TRANSPARENT_OPTLEVEL, is_ip6 ? NSD_SOCKET_OPTION_TRANSPARENT6 : NSD_SOCKET_OPTION_TRANSPARENT, &on, sizeof(on))) { return 1; } log_msg(LOG_ERR, "setsockopt(..., %s, ...) failed for %s: %s", is_ip6 ? 
NSD_SOCKET_OPTION_TRANSPARENT_NAME6 : NSD_SOCKET_OPTION_TRANSPARENT_NAME, socktype, strerror(errno)); return -1; #endif return 0; } static int set_tcp_maxseg(struct nsd_socket *sock, int mss) { #if defined(IPPROTO_TCP) && defined(TCP_MAXSEG) if(setsockopt(sock->s, IPPROTO_TCP, TCP_MAXSEG, &mss, sizeof(mss)) == 0) { return 1; } log_msg(LOG_ERR, "setsockopt(..., TCP_MAXSEG, ...) failed for tcp: %s", strerror(errno)); return -1; #else log_msg(LOG_ERR, "setsockopt(TCP_MAXSEG) unsupported"); #endif return 0; } #ifdef USE_TCP_FASTOPEN static int set_tcp_fastopen(struct nsd_socket *sock) { /* qlen specifies how many outstanding TFO requests to allow. Limit is * a defense against IP spoofing attacks as suggested in RFC7413. */ int qlen; #ifdef __APPLE__ /* macOS X implementation only supports qlen of 1 via this call. The * actual value is configured by the net.inet.tcp.fastopen_backlog * kernel parameter. */ qlen = 1; #else /* 5 is recommended on Linux. */ qlen = 5; #endif if (0 == setsockopt( sock->s, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen))) { return 1; } if (errno == EPERM) { log_msg(LOG_ERR, "Setting TCP Fast Open as server failed: %s " "; this could likely be because sysctl " "net.inet.tcp.fastopen.enabled, " "net.inet.tcp.fastopen.server_enable, or " "net.ipv4.tcp_fastopen is disabled", strerror(errno)); /* Squelch ENOPROTOOPT: FreeBSD server mode with kernel support * disabled, except when verbosity enabled for debugging */ } else if(errno != ENOPROTOOPT || verbosity >= 3) { log_msg(LOG_ERR, "Setting TCP Fast Open as server failed: %s", strerror(errno)); } return (errno == ENOPROTOOPT ? 0 : -1); } #endif /* USE_TCP_FASTOPEN */ static int set_bindtodevice(struct nsd_socket *sock) { #if defined(SO_BINDTODEVICE) if(setsockopt(sock->s, SOL_SOCKET, SO_BINDTODEVICE, sock->device, strlen(sock->device)) == -1) { log_msg(LOG_ERR, "setsockopt(..., %s, %s, ...) 
failed: %s", "SO_BINDTODEVICE", sock->device, strerror(errno)); return -1; } return 1; #else (void)sock; return 0; #endif } static int set_setfib(struct nsd_socket *sock) { #if defined(SO_SETFIB) if(setsockopt(sock->s, SOL_SOCKET, SO_SETFIB, (const void *)&sock->fib, sizeof(sock->fib)) == -1) { log_msg(LOG_ERR, "setsockopt(..., %s, %d, ...) failed: %s", "SO_SETFIB", sock->fib, strerror(errno)); return -1; } return 1; #else (void)sock; return 0; #endif } static int open_udp_socket(struct nsd *nsd, struct nsd_socket *sock, int *reuseport_works) { int rcv = 1*1024*1024, snd = 1*1024*1024; if(-1 == (sock->s = socket( sock->addr.ai_family, sock->addr.ai_socktype, 0))) { #ifdef INET6 if((sock->flags & NSD_SOCKET_IS_OPTIONAL) && (sock->addr.ai_family == AF_INET6) && (errno == EAFNOSUPPORT)) { log_msg(LOG_WARNING, "fallback to UDP4, no IPv6: " "not supported"); return 0; } #endif log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno)); return -1; } set_cloexec(sock); if(nsd->reuseport && reuseport_works && *reuseport_works) *reuseport_works = (set_reuseport(sock) == 1); if(nsd->options->receive_buffer_size > 0) rcv = nsd->options->receive_buffer_size; if(set_rcvbuf(sock, rcv) == -1) return -1; if(nsd->options->send_buffer_size > 0) snd = nsd->options->send_buffer_size; if(set_sndbuf(sock, snd) == -1) return -1; #ifdef INET6 if(sock->addr.ai_family == AF_INET6) { if(set_ipv6_v6only(sock) == -1 || set_ipv6_use_min_mtu(sock) == -1) return -1; } else #endif /* INET6 */ if(sock->addr.ai_family == AF_INET) { if(set_ipv4_no_pmtu_disc(sock) == -1) return -1; } /* Set socket to non-blocking. Otherwise, on operating systems * with thundering herd problems, the UDP recv could block * after select returns readable. 
 */
	set_nonblock(sock);

	/* best effort: failures of these two options are not fatal here */
	if(nsd->options->ip_freebind)
		(void)set_ip_freebind(sock);
	if(nsd->options->ip_transparent)
		(void)set_ip_transparent(sock);
	if((sock->flags & NSD_BIND_DEVICE) && set_bindtodevice(sock) == -1)
		return -1;
	if(sock->fib != -1 && set_setfib(sock) == -1)
		return -1;

	if(bind(sock->s, (struct sockaddr *)&sock->addr.ai_addr, sock->addr.ai_addrlen) == -1) {
		char buf[256];
		addrport2str((void*)&sock->addr.ai_addr, buf, sizeof(buf));
		log_msg(LOG_ERR, "can't bind udp socket %s: %s",
			buf, strerror(errno));
		return -1;
	}

	return 1;
}

/*
 * Create, configure, bind and listen on the TCP socket described by sock.
 *
 * Returns 1 when the socket is listening, 0 when an optional IPv6 socket
 * could not be created because the address family is unsupported, and -1
 * (after logging) on any other fatal error.  When nsd->reuseport is set
 * and *reuseport_works is non-zero, *reuseport_works is updated to tell
 * whether set_reuseport() succeeded on this socket.
 */
static int
open_tcp_socket(struct nsd *nsd, struct nsd_socket *sock, int *reuseport_works)
{
#ifdef USE_TCP_FASTOPEN
	report_tcp_fastopen_config();
#endif

	(void)reuseport_works;

	if(-1 == (sock->s = socket(
		sock->addr.ai_family, sock->addr.ai_socktype, 0)))
	{
#ifdef INET6
		if((sock->flags & NSD_SOCKET_IS_OPTIONAL) &&
		   (sock->addr.ai_family == AF_INET6) &&
		   (errno == EAFNOSUPPORT))
		{
			log_msg(LOG_WARNING, "fallback to TCP4, no IPv6: "
			                     "not supported");
			return 0;
		}
#endif /* INET6 */
		log_msg(LOG_ERR, "can't create a socket: %s", strerror(errno));
		return -1;
	}

	set_cloexec(sock);

	if(nsd->reuseport && reuseport_works && *reuseport_works)
		*reuseport_works = (set_reuseport(sock) == 1);

	(void)set_reuseaddr(sock);

#ifdef INET6
	if(sock->addr.ai_family == AF_INET6) {
		if (set_ipv6_v6only(sock) == -1 ||
		    set_ipv6_use_min_mtu(sock) == -1)
		{
			return -1;
		}
	}
#endif

	/* a configured MSS is applied, but a failure is not fatal */
	if(nsd->tcp_mss > 0)
		set_tcp_maxseg(sock, nsd->tcp_mss);
	/* (StevensUNP p463), if TCP listening socket is blocking, then
	   it may block in accept, even if select() says readable.
	 */
	(void)set_nonblock(sock);
	/* best effort: failures of these two options are not fatal here */
	if(nsd->options->ip_freebind)
		(void)set_ip_freebind(sock);
	if(nsd->options->ip_transparent)
		(void)set_ip_transparent(sock);
	if((sock->flags & NSD_BIND_DEVICE) && set_bindtodevice(sock) == -1)
		return -1;
	if(sock->fib != -1 && set_setfib(sock) == -1)
		return -1;

	if(bind(sock->s, (struct sockaddr *)&sock->addr.ai_addr, sock->addr.ai_addrlen) == -1) {
		char buf[256];
		addrport2str((void*)&sock->addr.ai_addr, buf, sizeof(buf));
		log_msg(LOG_ERR, "can't bind tcp socket %s: %s",
			buf, strerror(errno));
		return -1;
	}

#ifdef USE_TCP_FASTOPEN
	/* Fast Open is optional; its result does not affect the outcome */
	(void)set_tcp_fastopen(sock);
#endif

	if(listen(sock->s, TCP_BACKLOG) == -1) {
		log_msg(LOG_ERR, "can't listen: %s", strerror(errno));
		return -1;
	}

	return 1;
}

/*
 * Initialize the server, reuseport, create and bind the sockets.
 */
int
server_init(struct nsd *nsd)
{
	size_t i;
	int reuseport = 1; /* Determine if REUSEPORT works. */

	/* open server interface ports */
	for(i = 0; i < nsd->ifs; i++) {
		if(open_udp_socket(nsd, &nsd->udp[i], &reuseport) == -1 ||
		   open_tcp_socket(nsd, &nsd->tcp[i], &reuseport) == -1)
		{
			return -1;
		}
	}

	if(nsd->reuseport && reuseport) {
		size_t ifs = nsd->ifs * nsd->reuseport;

		/* increase the size of the interface arrays, there are going
		 * to be separate interface file descriptors for every server
		 * instance */
		region_remove_cleanup(nsd->region, free, nsd->udp);
		region_remove_cleanup(nsd->region, free, nsd->tcp);
		nsd->udp = xrealloc(nsd->udp, ifs * sizeof(*nsd->udp));
		nsd->tcp = xrealloc(nsd->tcp, ifs * sizeof(*nsd->tcp));
		region_add_cleanup(nsd->region, free, nsd->udp);
		region_add_cleanup(nsd->region, free, nsd->tcp);
		if(ifs > nsd->ifs) {
			memset(&nsd->udp[nsd->ifs], 0, (ifs-nsd->ifs)*sizeof(*nsd->udp));
			memset(&nsd->tcp[nsd->ifs], 0, (ifs-nsd->ifs)*sizeof(*nsd->tcp));
		}

		for(i = nsd->ifs; i < ifs; i++) {
			nsd->udp[i] = nsd->udp[i%nsd->ifs];
			nsd->udp[i].s = -1;
			if(open_udp_socket(nsd, &nsd->udp[i], &reuseport) == -1) {
				return -1;
			}
			/* Turn off REUSEPORT for TCP by copying the socket
			 * file descriptor.
* This means we should not close TCP used by * other servers in reuseport enabled mode, in * server_child(). */ nsd->tcp[i] = nsd->tcp[i%nsd->ifs]; } nsd->ifs = ifs; } else { nsd->reuseport = 0; } return 0; } /* * Prepare the server for take off. * */ int server_prepare(struct nsd *nsd) { #ifdef RATELIMIT /* set secret modifier for hashing (udb ptr buckets and rate limits) */ #ifdef HAVE_GETRANDOM uint32_t v; if(getrandom(&v, sizeof(v), 0) == -1) { log_msg(LOG_ERR, "getrandom failed: %s", strerror(errno)); exit(1); } hash_set_raninit(v); #elif defined(HAVE_ARC4RANDOM) hash_set_raninit(arc4random()); #else uint32_t v = getpid() ^ time(NULL); srandom((unsigned long)v); # ifdef HAVE_SSL if(RAND_status() && RAND_bytes((unsigned char*)&v, sizeof(v)) > 0) hash_set_raninit(v); else # endif hash_set_raninit(random()); #endif rrl_mmap_init(nsd->child_count, nsd->options->rrl_size, nsd->options->rrl_ratelimit, nsd->options->rrl_whitelist_ratelimit, nsd->options->rrl_slip, nsd->options->rrl_ipv4_prefix_length, nsd->options->rrl_ipv6_prefix_length); #endif /* RATELIMIT */ /* Open the database... */ if ((nsd->db = namedb_open(nsd->dbfile, nsd->options)) == NULL) { log_msg(LOG_ERR, "unable to open the database %s: %s", nsd->dbfile, strerror(errno)); unlink(nsd->task[0]->fname); unlink(nsd->task[1]->fname); #ifdef USE_ZONE_STATS unlink(nsd->zonestatfname[0]); unlink(nsd->zonestatfname[1]); #endif xfrd_del_tempdir(nsd); return -1; } /* check if zone files have been modified */ /* NULL for taskudb because we send soainfo in a moment, batched up, * for all zones */ if(nsd->options->zonefiles_check || (nsd->options->database == NULL || nsd->options->database[0] == 0)) namedb_check_zonefiles(nsd, nsd->options, NULL, NULL); zonestatid_tree_set(nsd); compression_table_capacity = 0; initialize_dname_compression_tables(nsd); #ifdef BIND8_STATS /* Initialize times... */ time(&nsd->st.boot); set_bind8_alarm(nsd); #endif /* BIND8_STATS */ return 0; } /* * Fork the required number of servers. 
*/ static int server_start_children(struct nsd *nsd, region_type* region, netio_type* netio, int* xfrd_sock_p) { size_t i; /* Start all child servers initially. */ for (i = 0; i < nsd->child_count; ++i) { nsd->children[i].pid = 0; } return restart_child_servers(nsd, region, netio, xfrd_sock_p); } static void server_close_socket(struct nsd_socket *sock) { if(sock->s != -1) { close(sock->s); sock->s = -1; } } void server_close_all_sockets(struct nsd_socket sockets[], size_t n) { size_t i; /* Close all the sockets... */ for (i = 0; i < n; ++i) { server_close_socket(&sockets[i]); } } /* * Close the sockets, shutdown the server and exit. * Does not return. */ void server_shutdown(struct nsd *nsd) { size_t i; server_close_all_sockets(nsd->udp, nsd->ifs); server_close_all_sockets(nsd->tcp, nsd->ifs); /* CHILD: close command channel to parent */ if(nsd->this_child && nsd->this_child->parent_fd != -1) { close(nsd->this_child->parent_fd); nsd->this_child->parent_fd = -1; } /* SERVER: close command channels to children */ if(!nsd->this_child) { for(i=0; i < nsd->child_count; ++i) if(nsd->children[i].child_fd != -1) { close(nsd->children[i].child_fd); nsd->children[i].child_fd = -1; } } tsig_finalize(); #ifdef HAVE_SSL daemon_remote_delete(nsd->rc); /* ssl-delete secret keys */ if (nsd->tls_ctx) SSL_CTX_free(nsd->tls_ctx); #endif #ifdef MEMCLEAN /* OS collects memory pages */ #ifdef RATELIMIT rrl_mmap_deinit_keep_mmap(); #endif #ifdef USE_DNSTAP dt_collector_destroy(nsd->dt_collector, nsd); #endif udb_base_free_keep_mmap(nsd->task[0]); udb_base_free_keep_mmap(nsd->task[1]); namedb_close_udb(nsd->db); /* keeps mmap */ namedb_close(nsd->db); nsd_options_destroy(nsd->options); region_destroy(nsd->region); #endif log_finalize(); exit(0); } void server_prepare_xfrd(struct nsd* nsd) { char tmpfile[256]; /* create task mmaps */ nsd->mytask = 0; snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.0", nsd->options->xfrdir, (int)getpid(), (unsigned)getpid()); nsd->task[0] = 
task_file_create(tmpfile); if(!nsd->task[0]) { #ifdef USE_ZONE_STATS unlink(nsd->zonestatfname[0]); unlink(nsd->zonestatfname[1]); #endif xfrd_del_tempdir(nsd); exit(1); } snprintf(tmpfile, sizeof(tmpfile), "%snsd-xfr-%d/nsd.%u.task.1", nsd->options->xfrdir, (int)getpid(), (unsigned)getpid()); nsd->task[1] = task_file_create(tmpfile); if(!nsd->task[1]) { unlink(nsd->task[0]->fname); #ifdef USE_ZONE_STATS unlink(nsd->zonestatfname[0]); unlink(nsd->zonestatfname[1]); #endif xfrd_del_tempdir(nsd); exit(1); } assert(udb_base_get_userdata(nsd->task[0])->data == 0); assert(udb_base_get_userdata(nsd->task[1])->data == 0); /* create xfrd listener structure */ nsd->xfrd_listener = region_alloc(nsd->region, sizeof(netio_handler_type)); nsd->xfrd_listener->user_data = (struct ipc_handler_conn_data*) region_alloc(nsd->region, sizeof(struct ipc_handler_conn_data)); nsd->xfrd_listener->fd = -1; ((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->nsd = nsd; ((struct ipc_handler_conn_data*)nsd->xfrd_listener->user_data)->conn = xfrd_tcp_create(nsd->region, QIOBUFSZ); } void server_start_xfrd(struct nsd *nsd, int del_db, int reload_active) { pid_t pid; int sockets[2] = {0,0}; struct ipc_handler_conn_data *data; if(nsd->xfrd_listener->fd != -1) close(nsd->xfrd_listener->fd); if(del_db) { /* recreate taskdb that xfrd was using, it may be corrupt */ /* we (or reload) use nsd->mytask, and xfrd uses the other */ char* tmpfile = nsd->task[1-nsd->mytask]->fname; nsd->task[1-nsd->mytask]->fname = NULL; /* free alloc already, so udb does not shrink itself */ udb_alloc_delete(nsd->task[1-nsd->mytask]->alloc); nsd->task[1-nsd->mytask]->alloc = NULL; udb_base_free(nsd->task[1-nsd->mytask]); /* create new file, overwrite the old one */ nsd->task[1-nsd->mytask] = task_file_create(tmpfile); free(tmpfile); } if (socketpair(AF_UNIX, SOCK_STREAM, 0, sockets) == -1) { log_msg(LOG_ERR, "startxfrd failed on socketpair: %s", strerror(errno)); return; } pid = fork(); switch (pid) { case -1: 
log_msg(LOG_ERR, "fork xfrd failed: %s", strerror(errno)); break; default: /* PARENT: close first socket, use second one */ close(sockets[0]); if (fcntl(sockets[1], F_SETFL, O_NONBLOCK) == -1) { log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno)); } if(del_db) xfrd_free_namedb(nsd); /* use other task than I am using, since if xfrd died and is * restarted, the reload is using nsd->mytask */ nsd->mytask = 1 - nsd->mytask; #ifdef HAVE_SETPROCTITLE setproctitle("xfrd"); #endif #ifdef HAVE_CPUSET_T if(nsd->use_cpu_affinity) { set_cpu_affinity(nsd->xfrd_cpuset); } #endif xfrd_init(sockets[1], nsd, del_db, reload_active, pid); /* ENOTREACH */ break; case 0: /* CHILD: close second socket, use first one */ close(sockets[1]); if (fcntl(sockets[0], F_SETFL, O_NONBLOCK) == -1) { log_msg(LOG_ERR, "cannot fcntl pipe: %s", strerror(errno)); } nsd->xfrd_listener->fd = sockets[0]; break; } /* server-parent only */ nsd->xfrd_listener->timeout = NULL; nsd->xfrd_listener->event_types = NETIO_EVENT_READ; nsd->xfrd_listener->event_handler = parent_handle_xfrd_command; /* clear ongoing ipc reads */ data = (struct ipc_handler_conn_data *) nsd->xfrd_listener->user_data; data->conn->is_reading = 0; } /** add all soainfo to taskdb */ static void add_all_soa_to_task(struct nsd* nsd, struct udb_base* taskudb) { struct radnode* n; udb_ptr task_last; /* last task, mytask is empty so NULL */ /* add all SOA INFO to mytask */ udb_ptr_init(&task_last, taskudb); for(n=radix_first(nsd->db->zonetree); n; n=radix_next(n)) { task_new_soainfo(taskudb, &task_last, (zone_type*)n->elem, 0); } udb_ptr_unlink(&task_last, taskudb); } void server_send_soa_xfrd(struct nsd* nsd, int shortsoa) { /* normally this exchanges the SOA from nsd->xfrd and the expire back. * parent fills one taskdb with soas, xfrd fills other with expires. * then they exchange and process. * shortsoa: xfrd crashes and needs to be restarted and one taskdb * may be in use by reload. Fill SOA in taskdb and give to xfrd. 
* expire notifications can be sent back via a normal reload later * (xfrd will wait for current running reload to finish if any). */ sig_atomic_t cmd = 0; pid_t mypid; int xfrd_sock = nsd->xfrd_listener->fd; struct udb_base* taskudb = nsd->task[nsd->mytask]; udb_ptr t; if(!shortsoa) { if(nsd->signal_hint_shutdown) { shutdown: log_msg(LOG_WARNING, "signal received, shutting down..."); server_close_all_sockets(nsd->udp, nsd->ifs); server_close_all_sockets(nsd->tcp, nsd->ifs); #ifdef HAVE_SSL daemon_remote_close(nsd->rc); #endif /* Unlink it if possible... */ unlinkpid(nsd->pidfile); unlink(nsd->task[0]->fname); unlink(nsd->task[1]->fname); #ifdef USE_ZONE_STATS unlink(nsd->zonestatfname[0]); unlink(nsd->zonestatfname[1]); #endif /* write the nsd.db to disk, wait for it to complete */ udb_base_sync(nsd->db->udb, 1); udb_base_close(nsd->db->udb); server_shutdown(nsd); exit(0); } } if(shortsoa) { /* put SOA in xfrd task because mytask may be in use */ taskudb = nsd->task[1-nsd->mytask]; } add_all_soa_to_task(nsd, taskudb); if(!shortsoa) { /* wait for xfrd to signal task is ready, RELOAD signal */ if(block_read(nsd, xfrd_sock, &cmd, sizeof(cmd), -1) != sizeof(cmd) || cmd != NSD_RELOAD) { log_msg(LOG_ERR, "did not get start signal from xfrd"); exit(1); } if(nsd->signal_hint_shutdown) { goto shutdown; } } /* give xfrd our task, signal it with RELOAD_DONE */ task_process_sync(taskudb); cmd = NSD_RELOAD_DONE; if(!write_socket(xfrd_sock, &cmd, sizeof(cmd))) { log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s", (int)nsd->pid, strerror(errno)); } mypid = getpid(); if(!write_socket(nsd->xfrd_listener->fd, &mypid, sizeof(mypid))) { log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s", strerror(errno)); } if(!shortsoa) { /* process the xfrd task works (expiry data) */ nsd->mytask = 1 - nsd->mytask; taskudb = nsd->task[nsd->mytask]; task_remap(taskudb); udb_ptr_new(&t, taskudb, udb_base_get_userdata(taskudb)); while(!udb_ptr_is_null(&t)) { 
task_process_expire(nsd->db, TASKLIST(&t)); udb_ptr_set_rptr(&t, taskudb, &TASKLIST(&t)->next); } udb_ptr_unlink(&t, taskudb); task_clear(taskudb); /* tell xfrd that the task is emptied, signal with RELOAD_DONE */ cmd = NSD_RELOAD_DONE; if(!write_socket(xfrd_sock, &cmd, sizeof(cmd))) { log_msg(LOG_ERR, "problems sending soa end from reload %d to xfrd: %s", (int)nsd->pid, strerror(errno)); } } } #ifdef HAVE_SSL static void log_crypto_from_err(const char* str, unsigned long err) { /* error:[error code]:[library name]:[function name]:[reason string] */ char buf[128]; unsigned long e; ERR_error_string_n(err, buf, sizeof(buf)); log_msg(LOG_ERR, "%s crypto %s", str, buf); while( (e=ERR_get_error()) ) { ERR_error_string_n(e, buf, sizeof(buf)); log_msg(LOG_ERR, "and additionally crypto %s", buf); } } void log_crypto_err(const char* str) { log_crypto_from_err(str, ERR_get_error()); } /** true if the ssl handshake error has to be squelched from the logs */ static int squelch_err_ssl_handshake(unsigned long err) { if(verbosity >= 3) return 0; /* only squelch on low verbosity */ /* this is very specific, we could filter on ERR_GET_REASON() * (the third element in ERR_PACK) */ if(err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_HTTPS_PROXY_REQUEST) || err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_HTTP_REQUEST) || err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_WRONG_VERSION_NUMBER) || err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_READ_BYTES, SSL_R_SSLV3_ALERT_BAD_CERTIFICATE) #ifdef SSL_F_TLS_POST_PROCESS_CLIENT_HELLO || err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_POST_PROCESS_CLIENT_HELLO, SSL_R_NO_SHARED_CIPHER) #endif #ifdef SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO || err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_UNKNOWN_PROTOCOL) || err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_UNSUPPORTED_PROTOCOL) # ifdef SSL_R_VERSION_TOO_LOW || err == ERR_PACK(ERR_LIB_SSL, 
SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_VERSION_TOO_LOW) # endif #endif ) return 1; return 0; } void perform_openssl_init(void) { /* init SSL library */ #ifdef HAVE_ERR_LOAD_CRYPTO_STRINGS ERR_load_crypto_strings(); #endif ERR_load_SSL_strings(); #if OPENSSL_VERSION_NUMBER < 0x10100000 || !defined(HAVE_OPENSSL_INIT_CRYPTO) OpenSSL_add_all_algorithms(); #else OPENSSL_init_crypto(OPENSSL_INIT_ADD_ALL_CIPHERS | OPENSSL_INIT_ADD_ALL_DIGESTS | OPENSSL_INIT_LOAD_CRYPTO_STRINGS, NULL); #endif #if OPENSSL_VERSION_NUMBER < 0x10100000 || !defined(HAVE_OPENSSL_INIT_SSL) (void)SSL_library_init(); #else OPENSSL_init_ssl(0, NULL); #endif if(!RAND_status()) { /* try to seed it */ unsigned char buf[256]; unsigned int v, seed=(unsigned)time(NULL) ^ (unsigned)getpid(); size_t i; v = seed; for(i=0; i<256/sizeof(v); i++) { memmove(buf+i*sizeof(v), &v, sizeof(v)); v = v*seed + (unsigned int)i; } RAND_seed(buf, 256); log_msg(LOG_WARNING, "warning: no entropy, seeding openssl PRNG with time"); } } static int get_ocsp(char *filename, unsigned char **ocsp) { BIO *bio; OCSP_RESPONSE *response; int len = -1; unsigned char *p, *buf; assert(filename); if ((bio = BIO_new_file(filename, "r")) == NULL) { log_crypto_err("get_ocsp: BIO_new_file failed"); return -1; } if ((response = d2i_OCSP_RESPONSE_bio(bio, NULL)) == NULL) { log_crypto_err("get_ocsp: d2i_OCSP_RESPONSE_bio failed"); BIO_free(bio); return -1; } if ((len = i2d_OCSP_RESPONSE(response, NULL)) <= 0) { log_crypto_err("get_ocsp: i2d_OCSP_RESPONSE #1 failed"); OCSP_RESPONSE_free(response); BIO_free(bio); return -1; } if ((buf = malloc((size_t) len)) == NULL) { log_msg(LOG_ERR, "get_ocsp: malloc failed"); OCSP_RESPONSE_free(response); BIO_free(bio); return -1; } p = buf; if ((len = i2d_OCSP_RESPONSE(response, &p)) <= 0) { log_crypto_err("get_ocsp: i2d_OCSP_RESPONSE #2 failed"); free(buf); OCSP_RESPONSE_free(response); BIO_free(bio); return -1; } OCSP_RESPONSE_free(response); BIO_free(bio); *ocsp = buf; return len; } /* further 
setup ssl ctx after the keys are loaded */ static void listen_sslctx_setup_2(void* ctxt) { SSL_CTX* ctx = (SSL_CTX*)ctxt; (void)ctx; #if HAVE_DECL_SSL_CTX_SET_ECDH_AUTO if(!SSL_CTX_set_ecdh_auto(ctx,1)) { log_crypto_err("Error in SSL_CTX_ecdh_auto, not enabling ECDHE"); } #elif defined(HAVE_DECL_SSL_CTX_SET_TMP_ECDH) && defined(NID_X9_62_prime256v1) && defined(HAVE_EC_KEY_NEW_BY_CURVE_NAME) if(1) { EC_KEY *ecdh = EC_KEY_new_by_curve_name (NID_X9_62_prime256v1); if (!ecdh) { log_crypto_err("could not find p256, not enabling ECDHE"); } else { if (1 != SSL_CTX_set_tmp_ecdh (ctx, ecdh)) { log_crypto_err("Error in SSL_CTX_set_tmp_ecdh, not enabling ECDHE"); } EC_KEY_free (ecdh); } } #endif } static int add_ocsp_data_cb(SSL *s, void* ATTR_UNUSED(arg)) { if(ocspdata) { unsigned char *p; if ((p=malloc(ocspdata_len)) == NULL) { log_msg(LOG_ERR, "add_ocsp_data_cb: malloc failure"); return SSL_TLSEXT_ERR_NOACK; } memcpy(p, ocspdata, ocspdata_len); if ((SSL_set_tlsext_status_ocsp_resp(s, p, ocspdata_len)) != 1) { log_crypto_err("Error in SSL_set_tlsext_status_ocsp_resp"); free(p); return SSL_TLSEXT_ERR_NOACK; } return SSL_TLSEXT_ERR_OK; } else { return SSL_TLSEXT_ERR_NOACK; } } SSL_CTX* server_tls_ctx_setup(char* key, char* pem, char* verifypem) { SSL_CTX *ctx = SSL_CTX_new(SSLv23_server_method()); if(!ctx) { log_crypto_err("could not SSL_CTX_new"); return NULL; } /* no SSLv2, SSLv3 because has defects */ #if SSL_OP_NO_SSLv2 != 0 if((SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv2) & SSL_OP_NO_SSLv2) != SSL_OP_NO_SSLv2){ log_crypto_err("could not set SSL_OP_NO_SSLv2"); SSL_CTX_free(ctx); return NULL; } #endif if((SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv3) & SSL_OP_NO_SSLv3) != SSL_OP_NO_SSLv3){ log_crypto_err("could not set SSL_OP_NO_SSLv3"); SSL_CTX_free(ctx); return 0; } #if defined(SSL_OP_NO_TLSv1) && defined(SSL_OP_NO_TLSv1_1) /* if we have tls 1.1 disable 1.0 */ if((SSL_CTX_set_options(ctx, SSL_OP_NO_TLSv1) & SSL_OP_NO_TLSv1) != SSL_OP_NO_TLSv1){ log_crypto_err("could not set 
SSL_OP_NO_TLSv1"); SSL_CTX_free(ctx); return 0; } #endif #if defined(SSL_OP_NO_TLSv1_1) && defined(SSL_OP_NO_TLSv1_2) /* if we have tls 1.2 disable 1.1 */ if((SSL_CTX_set_options(ctx, SSL_OP_NO_TLSv1_1) & SSL_OP_NO_TLSv1_1) != SSL_OP_NO_TLSv1_1){ log_crypto_err("could not set SSL_OP_NO_TLSv1_1"); SSL_CTX_free(ctx); return 0; } #endif #if defined(SSL_OP_NO_RENEGOTIATION) /* disable client renegotiation */ if((SSL_CTX_set_options(ctx, SSL_OP_NO_RENEGOTIATION) & SSL_OP_NO_RENEGOTIATION) != SSL_OP_NO_RENEGOTIATION) { log_crypto_err("could not set SSL_OP_NO_RENEGOTIATION"); SSL_CTX_free(ctx); return 0; } #endif #if defined(SHA256_DIGEST_LENGTH) && defined(SSL_TXT_CHACHA20) /* if we have sha256, set the cipher list to have no known vulns */ if(!SSL_CTX_set_cipher_list(ctx, "ECDHE+AESGCM:ECDHE+CHACHA20")) log_crypto_err("could not set cipher list with SSL_CTX_set_cipher_list"); #endif if((SSL_CTX_set_options(ctx, SSL_OP_CIPHER_SERVER_PREFERENCE) & SSL_OP_CIPHER_SERVER_PREFERENCE) != SSL_OP_CIPHER_SERVER_PREFERENCE) { log_crypto_err("could not set SSL_OP_CIPHER_SERVER_PREFERENCE"); SSL_CTX_free(ctx); return 0; } #ifdef HAVE_SSL_CTX_SET_SECURITY_LEVEL SSL_CTX_set_security_level(ctx, 0); #endif if(!SSL_CTX_use_certificate_chain_file(ctx, pem)) { log_msg(LOG_ERR, "error for cert file: %s", pem); log_crypto_err("error in SSL_CTX use_certificate_chain_file"); SSL_CTX_free(ctx); return NULL; } if(!SSL_CTX_use_PrivateKey_file(ctx, key, SSL_FILETYPE_PEM)) { log_msg(LOG_ERR, "error for private key file: %s", key); log_crypto_err("Error in SSL_CTX use_PrivateKey_file"); SSL_CTX_free(ctx); return NULL; } if(!SSL_CTX_check_private_key(ctx)) { log_msg(LOG_ERR, "error for key file: %s", key); log_crypto_err("Error in SSL_CTX check_private_key"); SSL_CTX_free(ctx); return NULL; } listen_sslctx_setup_2(ctx); if(verifypem && verifypem[0]) { if(!SSL_CTX_load_verify_locations(ctx, verifypem, NULL)) { log_crypto_err("Error in SSL_CTX verify locations"); SSL_CTX_free(ctx); return NULL; } 
SSL_CTX_set_client_CA_list(ctx, SSL_load_client_CA_file(verifypem)); SSL_CTX_set_verify(ctx, SSL_VERIFY_PEER, NULL); } return ctx; } SSL_CTX* server_tls_ctx_create(struct nsd* nsd, char* verifypem, char* ocspfile) { char *key, *pem; SSL_CTX *ctx; key = nsd->options->tls_service_key; pem = nsd->options->tls_service_pem; if(!key || key[0] == 0) { log_msg(LOG_ERR, "error: no tls-service-key file specified"); return NULL; } if(!pem || pem[0] == 0) { log_msg(LOG_ERR, "error: no tls-service-pem file specified"); return NULL; } /* NOTE:This mimics the existing code in Unbound 1.5.1 by supporting SSL but * raft-ietf-uta-tls-bcp-08 recommends only using TLSv1.2*/ ctx = server_tls_ctx_setup(key, pem, verifypem); if(!ctx) { log_msg(LOG_ERR, "could not setup server TLS context"); return NULL; } if(ocspfile && ocspfile[0]) { if ((ocspdata_len = get_ocsp(ocspfile, &ocspdata)) < 0) { log_crypto_err("Error reading OCSPfile"); SSL_CTX_free(ctx); return NULL; } else { VERBOSITY(2, (LOG_INFO, "ocspfile %s loaded", ocspfile)); if(!SSL_CTX_set_tlsext_status_cb(ctx, add_ocsp_data_cb)) { log_crypto_err("Error in SSL_CTX_set_tlsext_status_cb"); SSL_CTX_free(ctx); return NULL; } } } return ctx; } /* check if tcp_handler_accept_data created for TLS dedicated port */ int using_tls_port(struct sockaddr* addr, const char* tls_port) { in_port_t port = 0; if (addr->sa_family == AF_INET) port = ((struct sockaddr_in*)addr)->sin_port; #ifndef HAVE_STRUCT_SOCKADDR_IN6 else port = ((struct sockaddr_in6*)addr)->sin6_port; #endif /* HAVE_STRUCT_SOCKADDR_IN6 */ if (atoi(tls_port) == ntohs(port)) return 1; return 0; } #endif /* pass timeout=-1 for blocking. 
Returns size, 0, -1(err), or -2(timeout) */ ssize_t block_read(struct nsd* nsd, int s, void* p, ssize_t sz, int timeout) { uint8_t* buf = (uint8_t*) p; ssize_t total = 0; struct pollfd fd; memset(&fd, 0, sizeof(fd)); fd.fd = s; fd.events = POLLIN; while( total < sz) { ssize_t ret; ret = poll(&fd, 1, (timeout==-1)?-1:timeout*1000); if(ret == -1) { if(errno == EAGAIN) /* blocking read */ continue; if(errno == EINTR) { if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown)) return -1; /* other signals can be handled later */ continue; } /* some error */ return -1; } if(ret == 0) { /* operation timed out */ return -2; } ret = read(s, buf+total, sz-total); if(ret == -1) { if(errno == EAGAIN) /* blocking read */ continue; if(errno == EINTR) { if(nsd && (nsd->signal_hint_quit || nsd->signal_hint_shutdown)) return -1; /* other signals can be handled later */ continue; } /* some error */ return -1; } if(ret == 0) { /* closed connection! */ return 0; } total += ret; } return total; } static void reload_process_tasks(struct nsd* nsd, udb_ptr* last_task, int cmdsocket) { sig_atomic_t cmd = NSD_QUIT_SYNC; udb_ptr t, next; udb_base* u = nsd->task[nsd->mytask]; udb_ptr_init(&next, u); udb_ptr_new(&t, u, udb_base_get_userdata(u)); udb_base_set_userdata(u, 0); while(!udb_ptr_is_null(&t)) { /* store next in list so this one can be deleted or reused */ udb_ptr_set_rptr(&next, u, &TASKLIST(&t)->next); udb_rptr_zero(&TASKLIST(&t)->next, u); /* process task t */ /* append results for task t and update last_task */ task_process_in_reload(nsd, u, last_task, &t); /* go to next */ udb_ptr_set_ptr(&t, u, &next); /* if the parent has quit, we must quit too, poll the fd for cmds */ if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) { DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd)); if(cmd == NSD_QUIT) { DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd")); /* sync to disk (if needed) */ udb_base_sync(nsd->db->udb, 0); /* unlink 
files of remainder of tasks */ while(!udb_ptr_is_null(&t)) { if(TASKLIST(&t)->task_type == task_apply_xfr) { xfrd_unlink_xfrfile(nsd, TASKLIST(&t)->yesno); } udb_ptr_set_rptr(&t, u, &TASKLIST(&t)->next); } udb_ptr_unlink(&t, u); udb_ptr_unlink(&next, u); exit(0); } } } udb_ptr_unlink(&t, u); udb_ptr_unlink(&next, u); } #ifdef BIND8_STATS static void parent_send_stats(struct nsd* nsd, int cmdfd) { size_t i; if(!write_socket(cmdfd, &nsd->st, sizeof(nsd->st))) { log_msg(LOG_ERR, "could not write stats to reload"); return; } for(i=0; ichild_count; i++) if(!write_socket(cmdfd, &nsd->children[i].query_count, sizeof(stc_type))) { log_msg(LOG_ERR, "could not write stats to reload"); return; } } static void reload_do_stats(int cmdfd, struct nsd* nsd, udb_ptr* last) { struct nsdst s; stc_type* p; size_t i; if(block_read(nsd, cmdfd, &s, sizeof(s), RELOAD_SYNC_TIMEOUT) != sizeof(s)) { log_msg(LOG_ERR, "could not read stats from oldpar"); return; } s.db_disk = (nsd->db->udb?nsd->db->udb->base_size:0); s.db_mem = region_get_mem(nsd->db->region); p = (stc_type*)task_new_stat_info(nsd->task[nsd->mytask], last, &s, nsd->child_count); if(!p) return; for(i=0; ichild_count; i++) { if(block_read(nsd, cmdfd, p++, sizeof(stc_type), 1)!= sizeof(stc_type)) return; } } #endif /* BIND8_STATS */ /* * Reload the database, stop parent, re-fork children and continue. * as server_main. 
*/
/*
 * nsd           server state.
 * server_region, netio
 *               passed on to server_start_children().
 * cmdsocket     socket connected to the old main process.
 *
 * Sequence: process the tasks received from xfrd, compact and sync the
 * database, start new children, then send NSD_QUIT_SYNC over cmdsocket and
 * wait for the reply (retrying after every RELOAD_SYNC_TIMEOUT timeout).
 * Afterwards the statistics are collected (BIND8_STATS), and xfrd is told
 * NSD_RELOAD_DONE followed by this process's pid.
 *
 * Exits instead of returning when the children cannot be started
 * (status 1), when NSD_QUIT is read before the sync (status 0), or when
 * NSD_QUIT is what cmd holds after the sync (status 1); the new children
 * are told to quit first in each case.
 */
static void
server_reload(struct nsd *nsd, region_type* server_region, netio_type* netio,
	int cmdsocket)
{
	pid_t mypid;
	sig_atomic_t cmd = NSD_QUIT_SYNC;
	int ret;
	udb_ptr last_task;
	struct sigaction old_sigchld, ign_sigchld;
	/* ignore SIGCHLD from the previous server_main that used this pid */
	memset(&ign_sigchld, 0, sizeof(ign_sigchld));
	ign_sigchld.sa_handler = SIG_IGN;
	sigaction(SIGCHLD, &ign_sigchld, &old_sigchld);

#ifdef HAVE_SETPROCTITLE
	setproctitle("main");
#endif
#ifdef HAVE_CPUSET_T
	if(nsd->use_cpu_affinity) {
		set_cpu_affinity(nsd->cpuset);
	}
#endif

	/* see what tasks we got from xfrd */
	task_remap(nsd->task[nsd->mytask]);
	udb_ptr_init(&last_task, nsd->task[nsd->mytask]);
	/* compaction is inhibited while the tasks are applied and performed
	 * once afterwards */
	udb_compact_inhibited(nsd->db->udb, 1);
	reload_process_tasks(nsd, &last_task, cmdsocket);
	udb_compact_inhibited(nsd->db->udb, 0);
	udb_compact(nsd->db->udb);

#ifndef NDEBUG
	if(nsd_debug_level >= 1)
		region_log_stats(nsd->db->region);
#endif /* NDEBUG */
	/* sync to disk (if needed) */
	udb_base_sync(nsd->db->udb, 0);

	initialize_dname_compression_tables(nsd);

#ifdef BIND8_STATS
	/* Restart dumping stats if required.  */
	time(&nsd->st.boot);
	set_bind8_alarm(nsd);
#endif
#ifdef USE_ZONE_STATS
	server_zonestat_realloc(nsd); /* realloc for new children */
	server_zonestat_switch(nsd);
#endif

	/* listen for the signals of failed children again */
	sigaction(SIGCHLD, &old_sigchld, NULL);
	/* Start new child processes */
	if (server_start_children(nsd, server_region, netio, &nsd->
		xfrd_listener->fd) != 0) {
		send_children_quit(nsd);
		exit(1);
	}

	/* if the parent has quit, we must quit too, poll the fd for cmds */
	if(block_read(nsd, cmdsocket, &cmd, sizeof(cmd), 0) == sizeof(cmd)) {
		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc command from main %d", (int)cmd));
		if(cmd == NSD_QUIT) {
			DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: quit to follow nsd"));
			send_children_quit(nsd);
			exit(0);
		}
	}

	/* Send quit command to parent: blocking, wait for receipt. */
	/* the command is sent again after every timed out wait */
	do {
		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc send quit to main"));
		if (!write_socket(cmdsocket, &cmd, sizeof(cmd)))
		{
			log_msg(LOG_ERR, "problems sending command from reload to oldnsd: %s",
				strerror(errno));
		}
		/* blocking: wait for parent to really quit. (it sends RELOAD as ack) */
		DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc wait for ack main"));
		ret = block_read(nsd, cmdsocket, &cmd, sizeof(cmd),
			RELOAD_SYNC_TIMEOUT);
		if(ret == -2) {
			DEBUG(DEBUG_IPC, 1, (LOG_ERR, "reload timeout QUITSYNC. retry"));
		}
	} while (ret == -2);
	/* a read error is logged, after which the reload carries on */
	if(ret == -1) {
		log_msg(LOG_ERR, "reload: could not wait for parent to quit: %s",
			strerror(errno));
	}
	DEBUG(DEBUG_IPC,1, (LOG_INFO, "reload: ipc reply main %d %d", ret, (int)cmd));
	if(cmd == NSD_QUIT) {
		/* small race condition possible here, parent got quit cmd. */
		send_children_quit(nsd);
		exit(1);
	}
	assert(ret==-1 || ret == 0 || cmd == NSD_RELOAD);
#ifdef BIND8_STATS
	reload_do_stats(cmdsocket, nsd, &last_task);
#endif
	udb_ptr_unlink(&last_task, nsd->task[nsd->mytask]);
	task_process_sync(nsd->task[nsd->mytask]);
#ifdef USE_ZONE_STATS
	server_zonestat_realloc(nsd); /* realloc for next children */
#endif

	/* send soainfo to the xfrd process, signal it that reload is done,
	 * it picks up the taskudb */
	cmd = NSD_RELOAD_DONE;
	if(!write_socket(nsd->xfrd_listener->fd, &cmd,  sizeof(cmd))) {
		log_msg(LOG_ERR, "problems sending reload_done xfrd: %s",
			strerror(errno));
	}
	mypid = getpid();
	if(!write_socket(nsd->xfrd_listener->fd, &mypid,  sizeof(mypid))) {
		log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s",
			strerror(errno));
	}

	/* try to reopen file */
	if (nsd->file_rotation_ok)
		log_reopen(nsd->log_filename, 1);
	/* exit reload, continue as new server_main */
}

/*
 * Get the mode depending on the signal hints that have been received.
 * Multiple signal hints can be received and will be handled in turn.
*/
/*
 * The hints are tested in a fixed order: quit, shutdown, child, reload,
 * reload_hup, stats, statsusr.  Only the first hint found set is cleared
 * and translated, so every call consumes at most one hint.
 *
 * Returns the mode for that hint, or NSD_RUN when no hint is set.  Both
 * statistics hints yield NSD_STATS; the stats hint also re-arms the
 * statistics alarm when BIND8_STATS is defined.
 */
static sig_atomic_t
server_signal_mode(struct nsd *nsd)
{
	if(nsd->signal_hint_quit) {
		nsd->signal_hint_quit = 0;
		return NSD_QUIT;
	}
	else if(nsd->signal_hint_shutdown) {
		nsd->signal_hint_shutdown = 0;
		return NSD_SHUTDOWN;
	}
	else if(nsd->signal_hint_child) {
		nsd->signal_hint_child = 0;
		return NSD_REAP_CHILDREN;
	}
	else if(nsd->signal_hint_reload) {
		nsd->signal_hint_reload = 0;
		return NSD_RELOAD;
	}
	else if(nsd->signal_hint_reload_hup) {
		nsd->signal_hint_reload_hup = 0;
		return NSD_RELOAD_REQ;
	}
	else if(nsd->signal_hint_stats) {
		nsd->signal_hint_stats = 0;
#ifdef BIND8_STATS
		set_bind8_alarm(nsd);
#endif
		return NSD_STATS;
	}
	else if(nsd->signal_hint_statsusr) {
		nsd->signal_hint_statsusr = 0;
		return NSD_STATS;
	}
	return NSD_RUN;
}

/*
 * The main server simply waits for signals and child processes to
 * terminate.  Child processes are restarted as necessary.
 */
void
server_main(struct nsd *nsd)
{
	region_type *server_region = region_create(xalloc, free);
	netio_type *netio = netio_create(server_region);
	netio_handler_type reload_listener;
	int reload_sockets[2] = {-1, -1};
	struct timespec timeout_spec;
	int status;
	pid_t child_pid;
	pid_t reload_pid = -1;
	sig_atomic_t mode;

	/* Ensure we are the main process */
	assert(nsd->server_kind == NSD_SERVER_MAIN);

	/* Add listener for the XFRD process */
	netio_add_handler(netio, nsd->xfrd_listener);

	/* Start the child processes that handle incoming queries */
	if (server_start_children(nsd, server_region, netio,
		&nsd->xfrd_listener->fd) != 0) {
		send_children_quit(nsd);
		exit(1);
	}
	reload_listener.fd = -1;

	/* This_child MUST be 0, because this is the parent process */
	assert(nsd->this_child == 0);

	/* Run the server until we get a shutdown signal */
	while ((mode = nsd->mode) != NSD_SHUTDOWN) {
		/* Did we receive a signal that changes our mode?
*/ if(mode == NSD_RUN) { nsd->mode = mode = server_signal_mode(nsd); } switch (mode) { case NSD_RUN: /* see if any child processes terminated */ while((child_pid = waitpid(-1, &status, WNOHANG)) != -1 && child_pid != 0) { int is_child = delete_child_pid(nsd, child_pid); if (is_child != -1 && nsd->children[is_child].need_to_exit) { if(nsd->children[is_child].child_fd == -1) nsd->children[is_child].has_exited = 1; parent_check_all_children_exited(nsd); } else if(is_child != -1) { log_msg(LOG_WARNING, "server %d died unexpectedly with status %d, restarting", (int) child_pid, status); restart_child_servers(nsd, server_region, netio, &nsd->xfrd_listener->fd); } else if (child_pid == reload_pid) { sig_atomic_t cmd = NSD_RELOAD_DONE; pid_t mypid; log_msg(LOG_WARNING, "Reload process %d failed with status %d, continuing with old database", (int) child_pid, status); reload_pid = -1; if(reload_listener.fd != -1) close(reload_listener.fd); reload_listener.fd = -1; reload_listener.event_types = NETIO_EVENT_NONE; task_process_sync(nsd->task[nsd->mytask]); /* inform xfrd reload attempt ended */ if(!write_socket(nsd->xfrd_listener->fd, &cmd, sizeof(cmd))) { log_msg(LOG_ERR, "problems " "sending SOAEND to xfrd: %s", strerror(errno)); } mypid = getpid(); if(!write_socket(nsd->xfrd_listener->fd, &mypid, sizeof(mypid))) { log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s", strerror(errno)); } } else if(status != 0) { /* check for status, because we get * the old-servermain because reload * is the process-parent of old-main, * and we get older server-processes * that are exiting after a reload */ log_msg(LOG_WARNING, "process %d terminated with status %d", (int) child_pid, status); } } if (child_pid == -1) { if (errno == EINTR) { continue; } if (errno != ECHILD) log_msg(LOG_WARNING, "wait failed: %s", strerror(errno)); } if (nsd->mode != NSD_RUN) break; /* timeout to collect processes. In case no sigchild happens. 
*/ timeout_spec.tv_sec = 60; timeout_spec.tv_nsec = 0; /* listen on ports, timeout for collecting terminated children */ if(netio_dispatch(netio, &timeout_spec, 0) == -1) { if (errno != EINTR) { log_msg(LOG_ERR, "netio_dispatch failed: %s", strerror(errno)); } } if(nsd->restart_children) { restart_child_servers(nsd, server_region, netio, &nsd->xfrd_listener->fd); nsd->restart_children = 0; } if(nsd->reload_failed) { sig_atomic_t cmd = NSD_RELOAD_DONE; pid_t mypid; nsd->reload_failed = 0; log_msg(LOG_WARNING, "Reload process %d failed, continuing with old database", (int) reload_pid); reload_pid = -1; if(reload_listener.fd != -1) close(reload_listener.fd); reload_listener.fd = -1; reload_listener.event_types = NETIO_EVENT_NONE; task_process_sync(nsd->task[nsd->mytask]); /* inform xfrd reload attempt ended */ if(!write_socket(nsd->xfrd_listener->fd, &cmd, sizeof(cmd))) { log_msg(LOG_ERR, "problems " "sending SOAEND to xfrd: %s", strerror(errno)); } mypid = getpid(); if(!write_socket(nsd->xfrd_listener->fd, &mypid, sizeof(mypid))) { log_msg(LOG_ERR, "problems sending reloadpid to xfrd: %s", strerror(errno)); } } break; case NSD_RELOAD_REQ: { sig_atomic_t cmd = NSD_RELOAD_REQ; log_msg(LOG_WARNING, "SIGHUP received, reloading..."); DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: ipc send reload_req to xfrd")); if(!write_socket(nsd->xfrd_listener->fd, &cmd, sizeof(cmd))) { log_msg(LOG_ERR, "server_main: could not send " "reload_req to xfrd: %s", strerror(errno)); } nsd->mode = NSD_RUN; } break; case NSD_RELOAD: /* Continue to run nsd after reload */ nsd->mode = NSD_RUN; DEBUG(DEBUG_IPC,1, (LOG_INFO, "reloading...")); if (reload_pid != -1) { log_msg(LOG_WARNING, "Reload already in progress (pid = %d)", (int) reload_pid); break; } /* switch the mytask to keep track of who owns task*/ nsd->mytask = 1 - nsd->mytask; if (socketpair(AF_UNIX, SOCK_STREAM, 0, reload_sockets) == -1) { log_msg(LOG_ERR, "reload failed on socketpair: %s", strerror(errno)); reload_pid = -1; break; } /* Do 
actual reload */ reload_pid = fork(); switch (reload_pid) { case -1: log_msg(LOG_ERR, "fork failed: %s", strerror(errno)); break; default: /* PARENT */ close(reload_sockets[0]); server_reload(nsd, server_region, netio, reload_sockets[1]); DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload exited to become new main")); close(reload_sockets[1]); DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload closed")); /* drop stale xfrd ipc data */ ((struct ipc_handler_conn_data*)nsd-> xfrd_listener->user_data) ->conn->is_reading = 0; reload_pid = -1; reload_listener.fd = -1; reload_listener.event_types = NETIO_EVENT_NONE; DEBUG(DEBUG_IPC,2, (LOG_INFO, "Reload resetup; run")); break; case 0: /* CHILD */ /* server_main keep running until NSD_QUIT_SYNC * received from reload. */ close(reload_sockets[1]); reload_listener.fd = reload_sockets[0]; reload_listener.timeout = NULL; reload_listener.user_data = nsd; reload_listener.event_types = NETIO_EVENT_READ; reload_listener.event_handler = parent_handle_reload_command; /* listens to Quit */ netio_add_handler(netio, &reload_listener); reload_pid = getppid(); break; } break; case NSD_QUIT_SYNC: /* synchronisation of xfrd, parent and reload */ if(!nsd->quit_sync_done && reload_listener.fd != -1) { sig_atomic_t cmd = NSD_RELOAD; /* stop xfrd ipc writes in progress */ DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: ipc send indication reload")); if(!write_socket(nsd->xfrd_listener->fd, &cmd, sizeof(cmd))) { log_msg(LOG_ERR, "server_main: could not send reload " "indication to xfrd: %s", strerror(errno)); } /* wait for ACK from xfrd */ DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: wait ipc reply xfrd")); nsd->quit_sync_done = 1; } nsd->mode = NSD_RUN; break; case NSD_QUIT: /* silent shutdown during reload */ if(reload_listener.fd != -1) { /* acknowledge the quit, to sync reload that we will really quit now */ sig_atomic_t cmd = NSD_RELOAD; DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: ipc ack reload")); if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) { log_msg(LOG_ERR, "server_main: " 
"could not ack quit: %s", strerror(errno)); } #ifdef BIND8_STATS parent_send_stats(nsd, reload_listener.fd); #endif /* BIND8_STATS */ close(reload_listener.fd); } DEBUG(DEBUG_IPC,1, (LOG_INFO, "server_main: shutdown sequence")); /* only quit children after xfrd has acked */ send_children_quit(nsd); #ifdef MEMCLEAN /* OS collects memory pages */ region_destroy(server_region); #endif server_shutdown(nsd); /* ENOTREACH */ break; case NSD_SHUTDOWN: break; case NSD_REAP_CHILDREN: /* continue; wait for child in run loop */ nsd->mode = NSD_RUN; break; case NSD_STATS: #ifdef BIND8_STATS set_children_stats(nsd); #endif nsd->mode = NSD_RUN; break; default: log_msg(LOG_WARNING, "NSD main server mode invalid: %d", (int)nsd->mode); nsd->mode = NSD_RUN; break; } } log_msg(LOG_WARNING, "signal received, shutting down..."); /* close opened ports to avoid race with restart of nsd */ server_close_all_sockets(nsd->udp, nsd->ifs); server_close_all_sockets(nsd->tcp, nsd->ifs); #ifdef HAVE_SSL daemon_remote_close(nsd->rc); #endif send_children_quit_and_wait(nsd); /* Unlink it if possible... 
*/ unlinkpid(nsd->pidfile); unlink(nsd->task[0]->fname); unlink(nsd->task[1]->fname); #ifdef USE_ZONE_STATS unlink(nsd->zonestatfname[0]); unlink(nsd->zonestatfname[1]); #endif #ifdef USE_DNSTAP dt_collector_close(nsd->dt_collector, nsd); #endif if(reload_listener.fd != -1) { sig_atomic_t cmd = NSD_QUIT; DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: ipc send quit to reload-process")); if(!write_socket(reload_listener.fd, &cmd, sizeof(cmd))) { log_msg(LOG_ERR, "server_main: could not send quit to reload: %s", strerror(errno)); } fsync(reload_listener.fd); close(reload_listener.fd); /* wait for reload to finish processing */ while(1) { if(waitpid(reload_pid, NULL, 0) == -1) { if(errno == EINTR) continue; if(errno == ECHILD) break; log_msg(LOG_ERR, "waitpid(reload %d): %s", (int)reload_pid, strerror(errno)); } break; } } if(nsd->xfrd_listener->fd != -1) { /* complete quit, stop xfrd */ sig_atomic_t cmd = NSD_QUIT; DEBUG(DEBUG_IPC,1, (LOG_INFO, "main: ipc send quit to xfrd")); if(!write_socket(nsd->xfrd_listener->fd, &cmd, sizeof(cmd))) { log_msg(LOG_ERR, "server_main: could not send quit to xfrd: %s", strerror(errno)); } fsync(nsd->xfrd_listener->fd); close(nsd->xfrd_listener->fd); (void)kill(nsd->pid, SIGTERM); } #ifdef MEMCLEAN /* OS collects memory pages */ region_destroy(server_region); #endif /* write the nsd.db to disk, wait for it to complete */ udb_base_sync(nsd->db->udb, 1); udb_base_close(nsd->db->udb); server_shutdown(nsd); } static query_state_type server_process_query(struct nsd *nsd, struct query *query) { return query_process(query, nsd); } static query_state_type server_process_query_udp(struct nsd *nsd, struct query *query) { #ifdef RATELIMIT if(query_process(query, nsd) != QUERY_DISCARDED) { if(rrl_process_query(query)) return rrl_slip(query); else return QUERY_PROCESSED; } return QUERY_DISCARDED; #else return query_process(query, nsd); #endif } const char* nsd_event_vs(void) { #ifdef USE_MINI_EVENT return ""; #else return event_get_version(); #endif } #if 
!defined(USE_MINI_EVENT) && defined(EV_FEATURE_BACKENDS) static const char* ub_ev_backend2str(int b) { switch(b) { case EVBACKEND_SELECT: return "select"; case EVBACKEND_POLL: return "poll"; case EVBACKEND_EPOLL: return "epoll"; case EVBACKEND_KQUEUE: return "kqueue"; case EVBACKEND_DEVPOLL: return "devpoll"; case EVBACKEND_PORT: return "evport"; } return "unknown"; } #endif const char* nsd_event_method(void) { #ifdef USE_MINI_EVENT return "select"; #else struct event_base* b = nsd_child_event_base(); const char* m = "?"; # ifdef EV_FEATURE_BACKENDS m = ub_ev_backend2str(ev_backend((struct ev_loop*)b)); # elif defined(HAVE_EVENT_BASE_GET_METHOD) m = event_base_get_method(b); # endif # ifdef MEMCLEAN event_base_free(b); # endif return m; #endif } struct event_base* nsd_child_event_base(void) { struct event_base* base; #ifdef USE_MINI_EVENT static time_t secs; static struct timeval now; base = event_init(&secs, &now); #else # if defined(HAVE_EV_LOOP) || defined(HAVE_EV_DEFAULT_LOOP) /* libev */ base = (struct event_base *)ev_default_loop(EVFLAG_AUTO); # else /* libevent */ # ifdef HAVE_EVENT_BASE_NEW base = event_base_new(); # else base = event_init(); # endif # endif #endif return base; } static void add_udp_handler( struct nsd *nsd, struct nsd_socket *sock, struct udp_handler_data *data) { struct event *handler = &data->event; data->nsd = nsd; data->socket = sock; memset(handler, 0, sizeof(*handler)); event_set(handler, sock->s, EV_PERSIST|EV_READ, handle_udp, data); if(event_base_set(nsd->event_base, handler) != 0) log_msg(LOG_ERR, "nsd udp: event_base_set failed"); if(event_add(handler, NULL) != 0) log_msg(LOG_ERR, "nsd udp: event_add failed"); } void add_tcp_handler( struct nsd *nsd, struct nsd_socket *sock, struct tcp_accept_handler_data *data) { struct event *handler = &data->event; data->nsd = nsd; data->socket = sock; #ifdef HAVE_SSL if (nsd->tls_ctx && nsd->options->tls_port && using_tls_port((struct sockaddr *)&sock->addr.ai_addr, nsd->options->tls_port)) 
{ data->tls_accept = 1; if(verbosity >= 2) { char buf[48]; addrport2str((struct sockaddr_storage*)&sock->addr.ai_addr, buf, sizeof(buf)); VERBOSITY(2, (LOG_NOTICE, "setup TCP for TLS service on interface %s", buf)); } } else { data->tls_accept = 0; } #endif memset(handler, 0, sizeof(*handler)); event_set(handler, sock->s, EV_PERSIST|EV_READ, handle_tcp_accept, data); if(event_base_set(nsd->event_base, handler) != 0) log_msg(LOG_ERR, "nsd tcp: event_base_set failed"); if(event_add(handler, NULL) != 0) log_msg(LOG_ERR, "nsd tcp: event_add failed"); data->event_added = 1; } /* * Serve DNS requests. */ void server_child(struct nsd *nsd) { size_t i, from, numifs; region_type *server_region = region_create(xalloc, free); struct event_base* event_base = nsd_child_event_base(); sig_atomic_t mode; if(!event_base) { log_msg(LOG_ERR, "nsd server could not create event base"); exit(1); } nsd->event_base = event_base; nsd->server_region = server_region; #ifdef RATELIMIT rrl_init(nsd->this_child->child_num); #endif assert(nsd->server_kind != NSD_SERVER_MAIN); DEBUG(DEBUG_IPC, 2, (LOG_INFO, "child process started")); #ifdef HAVE_SETPROCTITLE setproctitle("server %d", nsd->this_child->child_num + 1); #endif #ifdef HAVE_CPUSET_T if(nsd->use_cpu_affinity) { set_cpu_affinity(nsd->this_child->cpuset); } #endif if (!(nsd->server_kind & NSD_SERVER_TCP)) { server_close_all_sockets(nsd->tcp, nsd->ifs); } if (!(nsd->server_kind & NSD_SERVER_UDP)) { server_close_all_sockets(nsd->udp, nsd->ifs); } if (nsd->this_child->parent_fd != -1) { struct event *handler; struct ipc_handler_conn_data* user_data = (struct ipc_handler_conn_data*)region_alloc( server_region, sizeof(struct ipc_handler_conn_data)); user_data->nsd = nsd; user_data->conn = xfrd_tcp_create(server_region, QIOBUFSZ); handler = (struct event*) region_alloc( server_region, sizeof(*handler)); memset(handler, 0, sizeof(*handler)); event_set(handler, nsd->this_child->parent_fd, EV_PERSIST| EV_READ, child_handle_parent_command, 
user_data); if(event_base_set(event_base, handler) != 0) log_msg(LOG_ERR, "nsd ipcchild: event_base_set failed"); if(event_add(handler, NULL) != 0) log_msg(LOG_ERR, "nsd ipcchild: event_add failed"); } if(nsd->reuseport) { numifs = nsd->ifs / nsd->reuseport; from = numifs * nsd->this_child->child_num; if(from+numifs > nsd->ifs) { /* should not happen */ from = 0; numifs = nsd->ifs; } } else { from = 0; numifs = nsd->ifs; } if (nsd->server_kind & NSD_SERVER_UDP) { int child = nsd->this_child->child_num; memset(msgs, 0, sizeof(msgs)); for (i = 0; i < NUM_RECV_PER_SELECT; i++) { queries[i] = query_create(server_region, compressed_dname_offsets, compression_table_size, compressed_dnames); query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0); iovecs[i].iov_base = buffer_begin(queries[i]->packet); iovecs[i].iov_len = buffer_remaining(queries[i]->packet); msgs[i].msg_hdr.msg_iov = &iovecs[i]; msgs[i].msg_hdr.msg_iovlen = 1; msgs[i].msg_hdr.msg_name = &queries[i]->addr; msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen; } for (i = 0; i < nsd->ifs; i++) { int listen; struct udp_handler_data *data; listen = nsd_bitset_isset(nsd->udp[i].servers, child); if(i >= from && i < (from + numifs) && listen) { data = region_alloc_zero( nsd->server_region, sizeof(*data)); add_udp_handler(nsd, &nsd->udp[i], data); } else { /* close sockets intended for other servers */ server_close_socket(&nsd->udp[i]); } } } /* * Keep track of all the TCP accept handlers so we can enable * and disable them based on the current number of active TCP * connections. 
*/ if (nsd->server_kind & NSD_SERVER_TCP) { int child = nsd->this_child->child_num; tcp_accept_handler_count = numifs; tcp_accept_handlers = region_alloc_array(server_region, numifs, sizeof(*tcp_accept_handlers)); for (i = 0; i < nsd->ifs; i++) { int listen; struct tcp_accept_handler_data *data; listen = nsd_bitset_isset(nsd->tcp[i].servers, child); if(i >= from && i < (from + numifs) && listen) { data = &tcp_accept_handlers[i-from]; memset(data, 0, sizeof(*data)); add_tcp_handler(nsd, &nsd->tcp[i], data); } else { /* close sockets intended for other servers */ /* * uncomment this once tcp servers are no * longer copied in the tcp fd copy line * in server_init(). server_close_socket(&nsd->tcp[i]); */ /* close sockets not meant for this server*/ if(!listen) server_close_socket(&nsd->tcp[i]); } } } else { tcp_accept_handler_count = 0; } /* The main loop... */ while ((mode = nsd->mode) != NSD_QUIT) { if(mode == NSD_RUN) nsd->mode = mode = server_signal_mode(nsd); /* Do we need to do the statistics... */ if (mode == NSD_STATS) { #ifdef BIND8_STATS int p = nsd->st.period; nsd->st.period = 1; /* force stats printout */ /* Dump the statistics */ bind8_stats(nsd); nsd->st.period = p; #else /* !BIND8_STATS */ log_msg(LOG_NOTICE, "Statistics support not enabled at compile time."); #endif /* BIND8_STATS */ nsd->mode = NSD_RUN; } else if (mode == NSD_REAP_CHILDREN) { /* got signal, notify parent. parent reaps terminated children. */ if (nsd->this_child->parent_fd != -1) { sig_atomic_t parent_notify = NSD_REAP_CHILDREN; if (write(nsd->this_child->parent_fd, &parent_notify, sizeof(parent_notify)) == -1) { log_msg(LOG_ERR, "problems sending command from %d to parent: %s", (int) nsd->this_child->pid, strerror(errno)); } } else /* no parent, so reap 'em */ while (waitpid(-1, NULL, WNOHANG) > 0) ; nsd->mode = NSD_RUN; } else if(mode == NSD_RUN) { /* Wait for a query... 
*/ if(event_base_loop(event_base, EVLOOP_ONCE) == -1) { if (errno != EINTR) { log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno)); break; } } } else if(mode == NSD_QUIT) { /* ignore here, quit */ } else { log_msg(LOG_ERR, "mode bad value %d, back to service.", (int)mode); nsd->mode = NSD_RUN; } } service_remaining_tcp(nsd); #ifdef BIND8_STATS bind8_stats(nsd); #endif /* BIND8_STATS */ #ifdef MEMCLEAN /* OS collects memory pages */ #ifdef RATELIMIT rrl_deinit(nsd->this_child->child_num); #endif event_base_free(event_base); region_destroy(server_region); #endif server_shutdown(nsd); } static void remaining_tcp_timeout(int ATTR_UNUSED(fd), short event, void* arg) { int* timed_out = (int*)arg; assert(event & EV_TIMEOUT); (void)event; /* wake up the service tcp thread, note event is no longer * registered */ *timed_out = 1; } void service_remaining_tcp(struct nsd* nsd) { struct tcp_handler_data* p; struct event_base* event_base; /* check if it is needed */ if(nsd->current_tcp_count == 0 || tcp_active_list == NULL) return; VERBOSITY(4, (LOG_INFO, "service remaining TCP connections")); /* setup event base */ event_base = nsd_child_event_base(); if(!event_base) { log_msg(LOG_ERR, "nsd remain tcp could not create event base"); return; } /* register tcp connections */ for(p = tcp_active_list; p != NULL; p = p->next) { struct timeval timeout; int fd = p->event.ev_fd; #ifdef USE_MINI_EVENT short event = p->event.ev_flags & (EV_READ|EV_WRITE); #else short event = p->event.ev_events & (EV_READ|EV_WRITE); #endif void (*fn)(int, short, void*); #ifdef HAVE_SSL if(p->tls) { if((event&EV_READ)) fn = handle_tls_reading; else fn = handle_tls_writing; } else { #endif if((event&EV_READ)) fn = handle_tcp_reading; else fn = handle_tcp_writing; #ifdef HAVE_SSL } #endif /* set timeout to 1/10 second */ if(p->tcp_timeout > 100) p->tcp_timeout = 100; timeout.tv_sec = p->tcp_timeout / 1000; timeout.tv_usec = (p->tcp_timeout % 1000)*1000; event_del(&p->event); memset(&p->event, 0, 
sizeof(p->event)); event_set(&p->event, fd, EV_PERSIST | event | EV_TIMEOUT, fn, p); if(event_base_set(event_base, &p->event) != 0) log_msg(LOG_ERR, "event base set failed"); if(event_add(&p->event, &timeout) != 0) log_msg(LOG_ERR, "event add failed"); } /* handle it */ while(nsd->current_tcp_count > 0) { mode_t m = server_signal_mode(nsd); struct event timeout; struct timeval tv; int timed_out = 0; if(m == NSD_QUIT || m == NSD_SHUTDOWN || m == NSD_REAP_CHILDREN) { /* quit */ break; } /* timer */ /* have to do something every second */ tv.tv_sec = 1; tv.tv_usec = 0; memset(&timeout, 0, sizeof(timeout)); event_set(&timeout, -1, EV_TIMEOUT, remaining_tcp_timeout, &timed_out); if(event_base_set(event_base, &timeout) != 0) log_msg(LOG_ERR, "remaintcp timer: event_base_set failed"); if(event_add(&timeout, &tv) != 0) log_msg(LOG_ERR, "remaintcp timer: event_add failed"); /* service loop */ if(event_base_loop(event_base, EVLOOP_ONCE) == -1) { if (errno != EINTR) { log_msg(LOG_ERR, "dispatch failed: %s", strerror(errno)); break; } } if(!timed_out) { event_del(&timeout); } else { /* timed out, quit */ VERBOSITY(4, (LOG_INFO, "service remaining TCP connections: timed out, quit")); break; } } #ifdef MEMCLEAN event_base_free(event_base); #endif /* continue to quit after return */ } /* Implement recvmmsg and sendmmsg if the platform does not. These functions * are always used, even if nonblocking operations are broken, in which case * NUM_RECV_PER_SELECT is defined to 1 (one). 
*/
#if defined(HAVE_RECVMMSG)
#define nsd_recvmmsg recvmmsg
#else /* !HAVE_RECVMMSG */

/*
 * Fallback for recvmmsg(): receive up to vlen datagrams with one
 * recvfrom() call each, using only the first iovec of every message.
 * Returns the number of datagrams received; 0 when nothing was received
 * and errno is 0 or EAGAIN; -1 otherwise.  timeout must be NULL.
 */
static int
nsd_recvmmsg(int sockfd, struct mmsghdr *msgvec, unsigned int vlen,
             int flags, struct timespec *timeout)
{
	unsigned int vpos = 0;
	ssize_t rcvd;

	/* timeout is ignored, ensure caller does not expect it to work */
	assert(timeout == NULL); (void)timeout;

	while(vpos < vlen) {
		rcvd = recvfrom(sockfd,
		                msgvec[vpos].msg_hdr.msg_iov->iov_base,
		                msgvec[vpos].msg_hdr.msg_iov->iov_len,
		                flags,
		                msgvec[vpos].msg_hdr.msg_name,
		               &msgvec[vpos].msg_hdr.msg_namelen);
		if(rcvd < 0) {
			break;
		} else {
			assert((unsigned long long)rcvd <= (unsigned long long)UINT_MAX);
			msgvec[vpos].msg_len = (unsigned int)rcvd;
			vpos++;
		}
	}

	if(vpos) {
		/* error will be picked up next time */
		return (int)vpos;
	} else if(errno == 0) {
		return 0;
	} else if(errno == EAGAIN) {
		return 0;
	}

	return -1;
}
#endif /* HAVE_RECVMMSG */

#ifdef HAVE_SENDMMSG
#define nsd_sendmmsg(...) sendmmsg(__VA_ARGS__)
#else /* !HAVE_SENDMMSG */

/*
 * Fallback for sendmmsg(): send up to vlen datagrams with one sendto()
 * call each; every message must carry exactly one iovec.  Stops at the
 * first failure.  Returns the number of datagrams sent; 0 when none were
 * sent and errno is 0; -1 otherwise.
 */
static int
nsd_sendmmsg(int sockfd, struct mmsghdr *msgvec, unsigned int vlen, int flags)
{
	unsigned int vpos = 0;
	ssize_t snd;

	while(vpos < vlen) {
		assert(msgvec[vpos].msg_hdr.msg_iovlen == 1);
		snd = sendto(sockfd,
		             msgvec[vpos].msg_hdr.msg_iov->iov_base,
		             msgvec[vpos].msg_hdr.msg_iov->iov_len,
		             flags,
		             msgvec[vpos].msg_hdr.msg_name,
		             msgvec[vpos].msg_hdr.msg_namelen);
		if(snd < 0) {
			break;
		} else {
			msgvec[vpos].msg_len = (unsigned int)snd;
			vpos++;
		}
	}

	if(vpos) {
		return (int)vpos;
	} else if(errno == 0) {
		return 0;
	}

	return -1;
}
#endif /* HAVE_SENDMMSG */

/*
 * Event callback for a UDP socket: receive a batch of datagrams into the
 * file-level msgs/iovecs/queries arrays, process each query, move dropped
 * ones to the tail of the batch, and send the remaining answers.
 * arg is the struct udp_handler_data registered by add_udp_handler.
 */
static void
handle_udp(int fd, short event, void* arg)
{
	struct udp_handler_data *data = (struct udp_handler_data *) arg;
	int received, sent, recvcount, i;
	struct query *q;

	if (!(event & EV_READ)) {
		return;
	}
	recvcount = nsd_recvmmsg(fd, msgs, NUM_RECV_PER_SELECT, 0, NULL);
	/* this printf strangely gave a performance increase on Linux */
	/* printf("recvcount %d \n", recvcount); */
	if (recvcount == -1) {
		if (errno != EAGAIN && errno != EINTR) {
			log_msg(LOG_ERR, "recvmmsg failed: %s", strerror(errno));
			STATUP(data->nsd, rxerr);
			/* No zone statup */
		}
		/* Simply no data available */
		return;
	}
	for (i = 0; i < recvcount; i++) {
	/* re-entered without incrementing i after a dropped slot was
	 * swapped with the last one */
	loopstart:
		received = msgs[i].msg_len;
		queries[i]->addrlen = msgs[i].msg_hdr.msg_namelen;
		q = queries[i];
		if (received == -1) {
			log_msg(LOG_ERR, "recvmmsg %d failed %s", i, strerror(
#if defined(HAVE_RECVMMSG)
				msgs[i].msg_hdr.msg_flags
#else
				errno
#endif
				));
			STATUP(data->nsd, rxerr);
			/* No zone statup */
			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
			iovecs[i].iov_len = buffer_remaining(q->packet);
			msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
			goto swap_drop;
		}

		/* Account... */
#ifdef BIND8_STATS
		if (data->socket->addr.ai_family == AF_INET) {
			STATUP(data->nsd, qudp);
		} else if (data->socket->addr.ai_family == AF_INET6) {
			STATUP(data->nsd, qudp6);
		}
#endif

		buffer_skip(q->packet, received);
		buffer_flip(q->packet);
#ifdef USE_DNSTAP
		dt_collector_submit_auth_query(data->nsd, &q->addr, q->addrlen,
			q->tcp, q->packet);
#endif /* USE_DNSTAP */

		/* Process and answer the query... */
		if (server_process_query_udp(data->nsd, q) != QUERY_DISCARDED) {
			if (RCODE(q->packet) == RCODE_OK && !AA(q->packet)) {
				STATUP(data->nsd, nona);
				ZTATUP(data->nsd, q->zone, nona);
			}

#ifdef USE_ZONE_STATS
			if (data->socket->addr.ai_family == AF_INET) {
				ZTATUP(data->nsd, q->zone, qudp);
			} else if (data->socket->addr.ai_family == AF_INET6) {
				ZTATUP(data->nsd, q->zone, qudp6);
			}
#endif

			/* Add EDNS0 and TSIG info if necessary.  */
			query_add_optional(q, data->nsd);

			buffer_flip(q->packet);
			iovecs[i].iov_len = buffer_remaining(q->packet);
#ifdef BIND8_STATS
			/* Account the rcode & TC... */
			STATUP2(data->nsd, rcode, RCODE(q->packet));
			ZTATUP2(data->nsd, q->zone, rcode, RCODE(q->packet));
			if (TC(q->packet)) {
				STATUP(data->nsd, truncated);
				ZTATUP(data->nsd, q->zone, truncated);
			}
#endif /* BIND8_STATS */
#ifdef USE_DNSTAP
			dt_collector_submit_auth_response(data->nsd,
				&q->addr, q->addrlen, q->tcp, q->packet,
				q->zone);
#endif /* USE_DNSTAP */
		} else {
			query_reset(queries[i], UDP_MAX_MESSAGE_LEN, 0);
			iovecs[i].iov_len = buffer_remaining(q->packet);
			msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
		swap_drop:
			STATUP(data->nsd, dropped);
			ZTATUP(data->nsd, q->zone, dropped);
			if(i != recvcount-1) {
				/* swap with last and decrease recvcount */
				struct mmsghdr mtmp = msgs[i];
				struct iovec iotmp = iovecs[i];
				recvcount--;
				msgs[i] = msgs[recvcount];
				iovecs[i] = iovecs[recvcount];
				queries[i] = queries[recvcount];
				msgs[recvcount] = mtmp;
				iovecs[recvcount] = iotmp;
				queries[recvcount] = q;
				/* the copied headers still point at the other
				 * slot's iovec; re-aim them */
				msgs[i].msg_hdr.msg_iov = &iovecs[i];
				msgs[recvcount].msg_hdr.msg_iov = &iovecs[recvcount];
				goto loopstart;
			} else { recvcount --; }
		}
	}

	/* send until all are sent */
	i = 0;
	/* NOTE(review): the next statement is damaged in this copy of the
	 * file -- the loop condition, the send call and the start of its
	 * error handling appear to be missing.  Restore them from the
	 * upstream source before building. */
	while(iaddr, a, sizeof(a));
				log_msg(LOG_ERR, "sendmmsg [0]=%s count=%d failed: %s", a, (int)(recvcount-i), es);
			}
#ifdef BIND8_STATS
			data->nsd->st.txerr += recvcount-i;
#endif /* BIND8_STATS */
			break;
		}
		i += sent;
	}
	/* NOTE(review): this loop header is damaged in the same way; the
	 * surviving statement restores msg_namelen for each slot. */
	for(i=0; ipacket);
		msgs[i].msg_hdr.msg_namelen = queries[i]->addrlen;
	}
}

#ifdef HAVE_SSL
/*
 * Setup an event for the tcp handler.
*/ static void tcp_handler_setup_event(struct tcp_handler_data* data, void (*fn)(int, short, void *), int fd, short event) { struct timeval timeout; struct event_base* ev_base; timeout.tv_sec = data->nsd->tcp_timeout; timeout.tv_usec = 0L; ev_base = data->event.ev_base; event_del(&data->event); memset(&data->event, 0, sizeof(data->event)); event_set(&data->event, fd, event, fn, data); if(event_base_set(ev_base, &data->event) != 0) log_msg(LOG_ERR, "event base set failed"); if(event_add(&data->event, &timeout) != 0) log_msg(LOG_ERR, "event add failed"); } #endif /* HAVE_SSL */ static void cleanup_tcp_handler(struct tcp_handler_data* data) { event_del(&data->event); #ifdef HAVE_SSL if(data->tls) { SSL_shutdown(data->tls); SSL_free(data->tls); data->tls = NULL; } #endif close(data->event.ev_fd); if(data->prev) data->prev->next = data->next; else tcp_active_list = data->next; if(data->next) data->next->prev = data->prev; /* * Enable the TCP accept handlers when the current number of * TCP connections is about to drop below the maximum number * of TCP connections. */ if (slowaccept || data->nsd->current_tcp_count == data->nsd->maximum_tcp_count) { configure_handler_event_types(EV_READ|EV_PERSIST); if(slowaccept) { event_del(&slowaccept_event); slowaccept = 0; } } --data->nsd->current_tcp_count; assert(data->nsd->current_tcp_count >= 0); region_destroy(data->region); } static void handle_tcp_reading(int fd, short event, void* arg) { struct tcp_handler_data *data = (struct tcp_handler_data *) arg; ssize_t received; struct event_base* ev_base; struct timeval timeout; if ((event & EV_TIMEOUT)) { /* Connection timed out. */ cleanup_tcp_handler(data); return; } if (data->nsd->tcp_query_count > 0 && data->query_count >= data->nsd->tcp_query_count) { /* No more queries allowed on this tcp connection. 
*/ cleanup_tcp_handler(data); return; } assert((event & EV_READ)); if (data->bytes_transmitted == 0) { query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1); } /* * Check if we received the leading packet length bytes yet. */ if (data->bytes_transmitted < sizeof(uint16_t)) { received = read(fd, (char *) &data->query->tcplen + data->bytes_transmitted, sizeof(uint16_t) - data->bytes_transmitted); if (received == -1) { if (errno == EAGAIN || errno == EINTR) { /* * Read would block, wait until more * data is available. */ return; } else { char buf[48]; addr2str(&data->query->addr, buf, sizeof(buf)); #ifdef ECONNRESET if (verbosity >= 2 || errno != ECONNRESET) #endif /* ECONNRESET */ log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno)); cleanup_tcp_handler(data); return; } } else if (received == 0) { /* EOF */ cleanup_tcp_handler(data); return; } data->bytes_transmitted += received; if (data->bytes_transmitted < sizeof(uint16_t)) { /* * Not done with the tcplen yet, wait for more * data to become available. */ return; } assert(data->bytes_transmitted == sizeof(uint16_t)); data->query->tcplen = ntohs(data->query->tcplen); /* * Minimum query size is: * * Size of the header (12) * + Root domain name (1) * + Query class (2) * + Query type (2) */ if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) { VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection")); cleanup_tcp_handler(data); return; } if (data->query->tcplen > data->query->maxlen) { VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection")); cleanup_tcp_handler(data); return; } buffer_set_limit(data->query->packet, data->query->tcplen); } assert(buffer_remaining(data->query->packet) > 0); /* Read the (remaining) query data. */ received = read(fd, buffer_current(data->query->packet), buffer_remaining(data->query->packet)); if (received == -1) { if (errno == EAGAIN || errno == EINTR) { /* * Read would block, wait until more data is * available. 
*/ return; } else { char buf[48]; addr2str(&data->query->addr, buf, sizeof(buf)); #ifdef ECONNRESET if (verbosity >= 2 || errno != ECONNRESET) #endif /* ECONNRESET */ log_msg(LOG_ERR, "failed reading from %s tcp: %s", buf, strerror(errno)); cleanup_tcp_handler(data); return; } } else if (received == 0) { /* EOF */ cleanup_tcp_handler(data); return; } data->bytes_transmitted += received; buffer_skip(data->query->packet, received); if (buffer_remaining(data->query->packet) > 0) { /* * Message not yet complete, wait for more data to * become available. */ return; } assert(buffer_position(data->query->packet) == data->query->tcplen); /* Account... */ #ifdef BIND8_STATS #ifndef INET6 STATUP(data->nsd, ctcp); #else if (data->query->addr.ss_family == AF_INET) { STATUP(data->nsd, ctcp); } else if (data->query->addr.ss_family == AF_INET6) { STATUP(data->nsd, ctcp6); } #endif #endif /* BIND8_STATS */ /* We have a complete query, process it. */ /* tcp-query-count: handle query counter ++ */ data->query_count++; buffer_flip(data->query->packet); #ifdef USE_DNSTAP dt_collector_submit_auth_query(data->nsd, &data->query->addr, data->query->addrlen, data->query->tcp, data->query->packet); #endif /* USE_DNSTAP */ data->query_state = server_process_query(data->nsd, data->query); if (data->query_state == QUERY_DISCARDED) { /* Drop the packet and the entire connection... 
*/ STATUP(data->nsd, dropped); ZTATUP(data->nsd, data->query->zone, dropped); cleanup_tcp_handler(data); return; } #ifdef BIND8_STATS if (RCODE(data->query->packet) == RCODE_OK && !AA(data->query->packet)) { STATUP(data->nsd, nona); ZTATUP(data->nsd, data->query->zone, nona); } #endif /* BIND8_STATS */ #ifdef USE_ZONE_STATS #ifndef INET6 ZTATUP(data->nsd, data->query->zone, ctcp); #else if (data->query->addr.ss_family == AF_INET) { ZTATUP(data->nsd, data->query->zone, ctcp); } else if (data->query->addr.ss_family == AF_INET6) { ZTATUP(data->nsd, data->query->zone, ctcp6); } #endif #endif /* USE_ZONE_STATS */ query_add_optional(data->query, data->nsd); /* Switch to the tcp write handler. */ buffer_flip(data->query->packet); data->query->tcplen = buffer_remaining(data->query->packet); #ifdef BIND8_STATS /* Account the rcode & TC... */ STATUP2(data->nsd, rcode, RCODE(data->query->packet)); ZTATUP2(data->nsd, data->query->zone, rcode, RCODE(data->query->packet)); if (TC(data->query->packet)) { STATUP(data->nsd, truncated); ZTATUP(data->nsd, data->query->zone, truncated); } #endif /* BIND8_STATS */ #ifdef USE_DNSTAP dt_collector_submit_auth_response(data->nsd, &data->query->addr, data->query->addrlen, data->query->tcp, data->query->packet, data->query->zone); #endif /* USE_DNSTAP */ data->bytes_transmitted = 0; timeout.tv_sec = data->tcp_timeout / 1000; timeout.tv_usec = (data->tcp_timeout % 1000)*1000; ev_base = data->event.ev_base; event_del(&data->event); memset(&data->event, 0, sizeof(data->event)); event_set(&data->event, fd, EV_PERSIST | EV_READ | EV_TIMEOUT, handle_tcp_reading, data); if(event_base_set(ev_base, &data->event) != 0) log_msg(LOG_ERR, "event base set tcpr failed"); if(event_add(&data->event, &timeout) != 0) log_msg(LOG_ERR, "event add tcpr failed"); /* see if we can write the answer right away(usually so,EAGAIN ifnot)*/ handle_tcp_writing(fd, EV_WRITE, data); } static void handle_tcp_writing(int fd, short event, void* arg) { struct tcp_handler_data 
*data = (struct tcp_handler_data *) arg; ssize_t sent; struct query *q = data->query; struct timeval timeout; struct event_base* ev_base; if ((event & EV_TIMEOUT)) { /* Connection timed out. */ cleanup_tcp_handler(data); return; } assert((event & EV_WRITE)); if (data->bytes_transmitted < sizeof(q->tcplen)) { /* Writing the response packet length. */ uint16_t n_tcplen = htons(q->tcplen); #ifdef HAVE_WRITEV struct iovec iov[2]; iov[0].iov_base = (uint8_t*)&n_tcplen + data->bytes_transmitted; iov[0].iov_len = sizeof(n_tcplen) - data->bytes_transmitted; iov[1].iov_base = buffer_begin(q->packet); iov[1].iov_len = buffer_limit(q->packet); sent = writev(fd, iov, 2); #else /* HAVE_WRITEV */ sent = write(fd, (const char *) &n_tcplen + data->bytes_transmitted, sizeof(n_tcplen) - data->bytes_transmitted); #endif /* HAVE_WRITEV */ if (sent == -1) { if (errno == EAGAIN || errno == EINTR) { /* * Write would block, wait until * socket becomes writable again. */ return; } else { #ifdef ECONNRESET if(verbosity >= 2 || errno != ECONNRESET) #endif /* ECONNRESET */ #ifdef EPIPE if(verbosity >= 2 || errno != EPIPE) #endif /* EPIPE 'broken pipe' */ log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno)); cleanup_tcp_handler(data); return; } } data->bytes_transmitted += sent; if (data->bytes_transmitted < sizeof(q->tcplen)) { /* * Writing not complete, wait until socket * becomes writable again. */ return; } #ifdef HAVE_WRITEV sent -= sizeof(n_tcplen); /* handle potential 'packet done' code */ goto packet_could_be_done; #endif } sent = write(fd, buffer_current(q->packet), buffer_remaining(q->packet)); if (sent == -1) { if (errno == EAGAIN || errno == EINTR) { /* * Write would block, wait until * socket becomes writable again. 
*/ return; } else { #ifdef ECONNRESET if(verbosity >= 2 || errno != ECONNRESET) #endif /* ECONNRESET */ #ifdef EPIPE if(verbosity >= 2 || errno != EPIPE) #endif /* EPIPE 'broken pipe' */ log_msg(LOG_ERR, "failed writing to tcp: %s", strerror(errno)); cleanup_tcp_handler(data); return; } } data->bytes_transmitted += sent; #ifdef HAVE_WRITEV packet_could_be_done: #endif buffer_skip(q->packet, sent); if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) { /* * Still more data to write when socket becomes * writable again. */ return; } assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen)); if (data->query_state == QUERY_IN_AXFR) { /* Continue processing AXFR and writing back results. */ buffer_clear(q->packet); data->query_state = query_axfr(data->nsd, q); if (data->query_state != QUERY_PROCESSED) { query_add_optional(data->query, data->nsd); /* Reset data. */ buffer_flip(q->packet); q->tcplen = buffer_remaining(q->packet); data->bytes_transmitted = 0; /* Reset timeout. */ timeout.tv_sec = data->tcp_timeout / 1000; timeout.tv_usec = (data->tcp_timeout % 1000)*1000; ev_base = data->event.ev_base; event_del(&data->event); memset(&data->event, 0, sizeof(data->event)); event_set(&data->event, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT, handle_tcp_writing, data); if(event_base_set(ev_base, &data->event) != 0) log_msg(LOG_ERR, "event base set tcpw failed"); if(event_add(&data->event, &timeout) != 0) log_msg(LOG_ERR, "event add tcpw failed"); /* * Write data if/when the socket is writable * again. */ return; } } /* * Done sending, wait for the next request to arrive on the * TCP socket by installing the TCP read handler. 
*/
	/*
	 * (Tail of handle_tcp_writing; the start of this function lies
	 * above this span and is reproduced unchanged.)
	 * Once the per-connection query limit is reached, half-close the
	 * socket for writing before re-arming the read handler.
	 */
	if (data->nsd->tcp_query_count > 0 &&
		data->query_count >= data->nsd->tcp_query_count) {

		(void) shutdown(fd, SHUT_WR);
	}

	data->bytes_transmitted = 0;

	/* tcp_timeout is split into seconds + microseconds (value / 1000,
	 * remainder * 1000), i.e. it is handled as milliseconds here. */
	timeout.tv_sec = data->tcp_timeout / 1000;
	timeout.tv_usec = (data->tcp_timeout % 1000)*1000;
	/* Remember the event base before the event struct is wiped. */
	ev_base = data->event.ev_base;
	event_del(&data->event);
	memset(&data->event, 0, sizeof(data->event));
	event_set(&data->event, fd, EV_PERSIST | EV_READ | EV_TIMEOUT,
		handle_tcp_reading, data);
	if(event_base_set(ev_base, &data->event) != 0)
		log_msg(LOG_ERR, "event base set tcpw failed");
	if(event_add(&data->event, &timeout) != 0)
		log_msg(LOG_ERR, "event add tcpw failed");
}

#ifdef HAVE_SSL
/**
 * Create an SSL object for an accepted connection and associate fd.
 *
 * The object is put in accept (server) state with SSL_MODE_AUTO_RETRY
 * set.
 *
 * @param ctx: SSL context to create the SSL object from.
 * @param fd: file descriptor to attach with SSL_set_fd.
 * @return the new SSL object, or NULL when SSL_new or SSL_set_fd fails
 *	(the failure is logged; a partially set up object is freed).
 */
static SSL*
incoming_ssl_fd(SSL_CTX* ctx, int fd)
{
	SSL* ssl = SSL_new((SSL_CTX*)ctx);
	if(!ssl) {
		log_crypto_err("could not SSL_new");
		return NULL;
	}
	SSL_set_accept_state(ssl);
	(void)SSL_set_mode(ssl, SSL_MODE_AUTO_RETRY);
	if(!SSL_set_fd(ssl, fd)) {
		log_crypto_err("could not SSL_set_fd");
		SSL_free(ssl);
		return NULL;
	}
	return ssl;
}

/**
 * TLS handshake to upgrade TCP connection.
 *
 * Driven by data->shake_state. The two *_event states only flip the
 * installed handler back and clear the state; otherwise
 * SSL_do_handshake is (re)tried and the event is switched to whatever
 * direction OpenSSL asks for.
 *
 * @param data: connection state; data->tls is the SSL object used.
 * @param fd: file descriptor passed on to tcp_handler_setup_event.
 * @param writing: nonzero when called from the write handler, zero when
 *	called from the read handler (see the two call sites below).
 * @return 0 when the handshake failed and cleanup_tcp_handler(data) has
 *	been called; 1 otherwise. On return 1 the caller must still check
 *	data->shake_state: it is tls_hs_none only once the handshake is
 *	complete (or an event flip was performed).
 */
static int
tls_handshake(struct tcp_handler_data* data, int fd, int writing)
{
	int r;
	if(data->shake_state == tls_hs_read_event) {
		/* read condition satisfied back to writing */
		tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
		data->shake_state = tls_hs_none;
		return 1;
	}
	if(data->shake_state == tls_hs_write_event) {
		/* write condition satisfied back to reading */
		tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
		data->shake_state = tls_hs_none;
		return 1;
	}

	/* (continue to) setup the TLS connection */
	ERR_clear_error();
	r = SSL_do_handshake(data->tls);

	if(r != 1) {
		int want = SSL_get_error(data->tls, r);
		if(want == SSL_ERROR_WANT_READ) {
			if(data->shake_state == tls_hs_read) {
				/* try again later */
				return 1;
			}
			data->shake_state = tls_hs_read;
			/* switch back to reading mode */
			tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
			return 1;
		} else if(want == SSL_ERROR_WANT_WRITE) {
			if(data->shake_state == tls_hs_write) {
				/* try again later */
				return 1;
			}
			data->shake_state = tls_hs_write;
			/* switch back to writing mode */
			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
			return 1;
		} else {
			/* Any other result is fatal for this connection. */
			if(r == 0)
				VERBOSITY(3, (LOG_ERR, "TLS handshake: connection closed prematurely"));
			else {
				unsigned long err = ERR_get_error();
				/* Only log errors the squelch filter lets through. */
				if(!squelch_err_ssl_handshake(err)) {
					char a[64], s[256];
					addr2str(&data->query->addr, a, sizeof(a));
					snprintf(s, sizeof(s), "TLS handshake failed from %s", a);
					log_crypto_from_err(s, err);
				}
			}
			cleanup_tcp_handler(data);
			return 0;
		}
	}

	/* Use to log successful upgrade for testing - could be removed*/
	VERBOSITY(3, (LOG_INFO, "TLS handshake succeeded."));
	/* set back to the event we need to have when reading (or writing) */
	if(data->shake_state == tls_hs_read && writing) {
		tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST|EV_TIMEOUT|EV_WRITE);
	} else if(data->shake_state == tls_hs_write && !writing) {
		tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST|EV_TIMEOUT|EV_READ);
	}
	data->shake_state = tls_hs_none;
	return 1;
}

/**
 * Handle TLS reading of incoming query.
 *
 * Event callback. Finishes a pending handshake if needed, reads the
 * two-byte length prefix and then the query itself via SSL_read,
 * processes the complete query and hands over to handle_tls_writing.
 * data->bytes_transmitted tracks how many bytes (prefix included) have
 * been read so far, so the function can resume after a partial read.
 *
 * @param fd: the connection's file descriptor.
 * @param event: event flags; EV_TIMEOUT closes the connection.
 * @param arg: the struct tcp_handler_data of the connection.
 */
static void
handle_tls_reading(int fd, short event, void* arg)
{
	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
	ssize_t received;

	if ((event & EV_TIMEOUT)) {
		/* Connection timed out.  */
		cleanup_tcp_handler(data);
		return;
	}

	if (data->nsd->tcp_query_count > 0 &&
		data->query_count >= data->nsd->tcp_query_count) {
		/* No more queries allowed on this tcp connection. */
		cleanup_tcp_handler(data);
		return;
	}

	assert((event & EV_READ));

	/* Nothing read yet for this message: start from a fresh query. */
	if (data->bytes_transmitted == 0) {
		query_reset(data->query, TCP_MAX_MESSAGE_LEN, 1);
	}

	if(data->shake_state != tls_hs_none) {
		/* 0 means the connection was cleaned up; data is gone. */
		if(!tls_handshake(data, fd, 0))
			return;
		/* Handshake still in progress, wait for the next event. */
		if(data->shake_state != tls_hs_none)
			return;
	}

	/*
	 * Check if we received the leading packet length bytes yet.
	 */
	if(data->bytes_transmitted < sizeof(uint16_t)) {
		ERR_clear_error();
		/* Read straight into tcplen, at the offset already filled. */
		if((received=SSL_read(data->tls, (char *) &data->query->tcplen
		    + data->bytes_transmitted,
		    sizeof(uint16_t) - data->bytes_transmitted)) <= 0) {
			int want = SSL_get_error(data->tls, received);
			if(want == SSL_ERROR_ZERO_RETURN) {
				cleanup_tcp_handler(data);
				return; /* shutdown, closed */
			} else if(want == SSL_ERROR_WANT_READ) {
				/* wants to be called again */
				return;
			}
			else if(want == SSL_ERROR_WANT_WRITE) {
				/* switch to writing */
				data->shake_state = tls_hs_write_event;
				tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
				return;
			}
			/* NOTE(review): the error is logged after cleanup;
			 * if cleanup_tcp_handler touches the OpenSSL error
			 * queue the logged reason may be lost - confirm. */
			cleanup_tcp_handler(data);
			log_crypto_err("could not SSL_read");
			return;
		}

		data->bytes_transmitted += received;
		if (data->bytes_transmitted < sizeof(uint16_t)) {
			/*
			 * Not done with the tcplen yet, wait for more
			 * data to become available.
			 */
			return;
		}

		assert(data->bytes_transmitted == sizeof(uint16_t));

		/* The prefix was read in network byte order. */
		data->query->tcplen = ntohs(data->query->tcplen);

		/*
		 * Minimum query size is:
		 *
		 *     Size of the header (12)
		 *   + Root domain name   (1)
		 *   + Query class        (2)
		 *   + Query type         (2)
		 */
		if (data->query->tcplen < QHEADERSZ + 1 + sizeof(uint16_t) + sizeof(uint16_t)) {
			VERBOSITY(2, (LOG_WARNING, "packet too small, dropping tcp connection"));
			cleanup_tcp_handler(data);
			return;
		}

		if (data->query->tcplen > data->query->maxlen) {
			VERBOSITY(2, (LOG_WARNING, "insufficient tcp buffer, dropping connection"));
			cleanup_tcp_handler(data);
			return;
		}

		/* Limit the packet buffer to exactly one message. */
		buffer_set_limit(data->query->packet, data->query->tcplen);
	}

	assert(buffer_remaining(data->query->packet) > 0);

	/* Read the (remaining) query data.  */
	ERR_clear_error();
	received = SSL_read(data->tls, (void*)buffer_current(data->query->packet),
			    (int)buffer_remaining(data->query->packet));
	if(received <= 0) {
		int want = SSL_get_error(data->tls, received);
		if(want == SSL_ERROR_ZERO_RETURN) {
			cleanup_tcp_handler(data);
			return; /* shutdown, closed */
		} else if(want == SSL_ERROR_WANT_READ) {
			/* wants to be called again */
			return;
		}
		else if(want == SSL_ERROR_WANT_WRITE) {
			/* switch back writing */
			data->shake_state = tls_hs_write_event;
			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);
			return;
		}
		/* NOTE(review): logged after cleanup, see the note on the
		 * length-prefix read above. */
		cleanup_tcp_handler(data);
		log_crypto_err("could not SSL_read");
		return;
	}

	data->bytes_transmitted += received;
	buffer_skip(data->query->packet, received);
	if (buffer_remaining(data->query->packet) > 0) {
		/*
		 * Message not yet complete, wait for more data to
		 * become available.
		 */
		return;
	}

	assert(buffer_position(data->query->packet) == data->query->tcplen);

	/* Account... */
#ifndef INET6
	STATUP(data->nsd, ctls);
#else
	if (data->query->addr.ss_family == AF_INET) {
		STATUP(data->nsd, ctls);
	} else if (data->query->addr.ss_family == AF_INET6) {
		STATUP(data->nsd, ctls6);
	}
#endif

	/* We have a complete query, process it.  */

	/* tcp-query-count: handle query counter ++ */
	data->query_count++;

	buffer_flip(data->query->packet);
#ifdef USE_DNSTAP
	dt_collector_submit_auth_query(data->nsd, &data->query->addr,
		data->query->addrlen, data->query->tcp, data->query->packet);
#endif /* USE_DNSTAP */
	data->query_state = server_process_query(data->nsd, data->query);
	if (data->query_state == QUERY_DISCARDED) {
		/* Drop the packet and the entire connection... */
		STATUP(data->nsd, dropped);
		ZTATUP(data->nsd, data->query->zone, dropped);
		cleanup_tcp_handler(data);
		return;
	}

#ifdef BIND8_STATS
	if (RCODE(data->query->packet) == RCODE_OK
	    && !AA(data->query->packet))
	{
		STATUP(data->nsd, nona);
		ZTATUP(data->nsd, data->query->zone, nona);
	}
#endif /* BIND8_STATS */

#ifdef USE_ZONE_STATS
#ifndef INET6
	ZTATUP(data->nsd, data->query->zone, ctls);
#else
	if (data->query->addr.ss_family == AF_INET) {
		ZTATUP(data->nsd, data->query->zone, ctls);
	} else if (data->query->addr.ss_family == AF_INET6) {
		ZTATUP(data->nsd, data->query->zone, ctls6);
	}
#endif
#endif /* USE_ZONE_STATS */

	query_add_optional(data->query, data->nsd);

	/* Switch to the tcp write handler.  */
	buffer_flip(data->query->packet);
	/* The response length is what remains in the flipped buffer. */
	data->query->tcplen = buffer_remaining(data->query->packet);
#ifdef BIND8_STATS
	/* Account the rcode & TC... */
	STATUP2(data->nsd, rcode, RCODE(data->query->packet));
	ZTATUP2(data->nsd, data->query->zone, rcode, RCODE(data->query->packet));
	if (TC(data->query->packet)) {
		STATUP(data->nsd, truncated);
		ZTATUP(data->nsd, data->query->zone, truncated);
	}
#endif /* BIND8_STATS */
#ifdef USE_DNSTAP
	dt_collector_submit_auth_response(data->nsd, &data->query->addr,
		data->query->addrlen, data->query->tcp, data->query->packet,
		data->query->zone);
#endif /* USE_DNSTAP */
	data->bytes_transmitted = 0;

	tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);

	/* see if we can write the answer right away(usually so,EAGAIN ifnot)*/
	handle_tls_writing(fd, EV_WRITE, data);
}

/**
 * Handle TLS writing of outgoing response.
 *
 * Event callback. Finishes a pending handshake if needed, then writes
 * the two-byte length prefix followed by the response with SSL_write.
 * data->bytes_transmitted counts the bytes written so far (prefix
 * included). When a message is complete, either the next AXFR packet is
 * prepared or the read handler is installed again.
 *
 * @param fd: the connection's file descriptor.
 * @param event: event flags; EV_TIMEOUT closes the connection.
 * @param arg: the struct tcp_handler_data of the connection.
 */
static void
handle_tls_writing(int fd, short event, void* arg)
{
	struct tcp_handler_data *data = (struct tcp_handler_data *) arg;
	ssize_t sent;
	struct query *q = data->query;
	/* static variable that holds reassembly buffer used to put the
	 * TCP length in front of the packet, like writev. */
	static buffer_type* global_tls_temp_buffer = NULL;
	buffer_type* write_buffer;

	if ((event & EV_TIMEOUT)) {
		/* Connection timed out.  */
		cleanup_tcp_handler(data);
		return;
	}

	assert((event & EV_WRITE));

	if(data->shake_state != tls_hs_none) {
		/* 0 means the connection was cleaned up; data is gone. */
		if(!tls_handshake(data, fd, 1))
			return;
		/* Handshake still in progress, wait for the next event. */
		if(data->shake_state != tls_hs_none)
			return;
	}

	/* Partial writes are allowed; the code below handles short counts. */
	(void)SSL_set_mode(data->tls, SSL_MODE_ENABLE_PARTIAL_WRITE);

	/* If we are writing the start of a message, we must include the length
	 * this is done with a copy into write_buffer. */
	write_buffer = NULL;
	if (data->bytes_transmitted == 0) {
		if(!global_tls_temp_buffer) {
			/* gets deallocated when nsd shuts down from
			 * nsd.region */
			global_tls_temp_buffer = buffer_create(nsd.region,
				QIOBUFSZ + sizeof(q->tcplen));
			if (!global_tls_temp_buffer) {
				return;
			}
		}
		write_buffer = global_tls_temp_buffer;
		buffer_clear(write_buffer);
		buffer_write_u16(write_buffer, q->tcplen);
		buffer_write(write_buffer, buffer_current(q->packet),
			(int)buffer_remaining(q->packet));
		buffer_flip(write_buffer);
	} else {
		/* Continuing a message: the prefix has already gone out,
		 * write the rest directly from the packet buffer. */
		write_buffer = q->packet;
	}

	/* Write the response */
	ERR_clear_error();
	sent = SSL_write(data->tls, buffer_current(write_buffer), buffer_remaining(write_buffer));
	if(sent <= 0) {
		int want = SSL_get_error(data->tls, sent);
		if(want == SSL_ERROR_ZERO_RETURN) {
			cleanup_tcp_handler(data);
			/* closed */
		} else if(want == SSL_ERROR_WANT_READ) {
			/* switch back to reading */
			data->shake_state = tls_hs_read_event;
			tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST | EV_READ | EV_TIMEOUT);
		} else if(want != SSL_ERROR_WANT_WRITE) {
			/* NOTE(review): logged after cleanup; if
			 * cleanup_tcp_handler touches the OpenSSL error
			 * queue the logged reason may be lost - confirm. */
			cleanup_tcp_handler(data);
			log_crypto_err("could not SSL_write");
		}
		/* SSL_ERROR_WANT_WRITE: keep the write event, retry later. */
		return;
	}

	buffer_skip(write_buffer, sent);
	if(buffer_remaining(write_buffer) != 0) {
		/* If not all sent, sync up the real buffer if it wasn't used.*/
		/* NOTE(review): if the first SSL_write ever reports exactly
		 * one byte, bytes_transmitted becomes 1 and the next call
		 * writes from q->packet, so the second length byte would
		 * never be sent - confirm whether that can happen. */
		if (data->bytes_transmitted == 0 && (ssize_t)sent > (ssize_t)sizeof(q->tcplen)) {
			buffer_skip(q->packet, (ssize_t)sent - (ssize_t)sizeof(q->tcplen));
		}
	}

	data->bytes_transmitted += sent;
	if (data->bytes_transmitted < q->tcplen + sizeof(q->tcplen)) {
		/*
		 * Still more data to write when socket becomes
		 * writable again.
		 */
		return;
	}

	assert(data->bytes_transmitted == q->tcplen + sizeof(q->tcplen));

	if (data->query_state == QUERY_IN_AXFR) {
		/* Continue processing AXFR and writing back results.  */
		buffer_clear(q->packet);
		data->query_state = query_axfr(data->nsd, q);
		if (data->query_state != QUERY_PROCESSED) {
			query_add_optional(data->query, data->nsd);

			/* Reset data. */
			buffer_flip(q->packet);
			q->tcplen = buffer_remaining(q->packet);
			data->bytes_transmitted = 0;
			/* Reset to writing mode.  */
			tcp_handler_setup_event(data, handle_tls_writing, fd, EV_PERSIST | EV_WRITE | EV_TIMEOUT);

			/*
			 * Write data if/when the socket is writable
			 * again.
			 */
			return;
		}
	}

	/*
	 * Done sending, wait for the next request to arrive on the
	 * TCP socket by installing the TCP read handler.
	 */
	if (data->nsd->tcp_query_count > 0 &&
		data->query_count >= data->nsd->tcp_query_count) {

		(void) shutdown(fd, SHUT_WR);
	}

	data->bytes_transmitted = 0;

	tcp_handler_setup_event(data, handle_tls_reading, fd, EV_PERSIST | EV_READ | EV_TIMEOUT);
}
#endif

/*
 * Timer callback installed by handle_tcp_accept when accept() ran out
 * of file descriptors. If accepting is still paused (slowaccept set),
 * re-enable the accept handlers for reading and clear the flag.
 * All three arguments are unused.
 */
static void
handle_slowaccept_timeout(int ATTR_UNUSED(fd), short ATTR_UNUSED(event),
	void* ATTR_UNUSED(arg))
{
	if(slowaccept) {
		configure_handler_event_types(EV_PERSIST | EV_READ);
		slowaccept = 0;
	}
}

/*
 * Accept a connection on fd and return it as a non-blocking socket.
 *
 * Uses accept4 with SOCK_NONBLOCK when available; otherwise accept
 * followed by fcntl(F_SETFL, O_NONBLOCK).
 *
 * Returns the new socket, or -1 with errno set. When the fcntl fails
 * the socket is closed and errno is forced to EINTR (see below).
 */
static int perform_accept(int fd, struct sockaddr *addr, socklen_t *addrlen)
{
#ifndef HAVE_ACCEPT4
	int s = accept(fd, addr, addrlen);
	if (s != -1) {
		if (fcntl(s, F_SETFL, O_NONBLOCK) == -1) {
			log_msg(LOG_ERR, "fcntl failed: %s", strerror(errno));
			close(s);
			s = -1;
			errno=EINTR; /* stop error printout as error in accept4
				by setting this errno, it omits printout, in
				later code that calls nsd_accept4 */
		}
	}
	return s;
#else
	return accept4(fd, addr, addrlen, SOCK_NONBLOCK);
#endif /* HAVE_ACCEPT4 */
}

/*
 * Handle an incoming TCP connection.  The connection is accepted and
 * a new TCP reader event handler is added.  The TCP handler
 * is responsible for cleanup when the connection is closed.
*/ static void handle_tcp_accept(int fd, short event, void* arg) { struct tcp_accept_handler_data *data = (struct tcp_accept_handler_data *) arg; int s; int reject = 0; struct tcp_handler_data *tcp_data; region_type *tcp_region; #ifdef INET6 struct sockaddr_storage addr; #else struct sockaddr_in addr; #endif socklen_t addrlen; struct timeval timeout; if (!(event & EV_READ)) { return; } if (data->nsd->current_tcp_count >= data->nsd->maximum_tcp_count) { reject = data->nsd->options->tcp_reject_overflow; if (!reject) { return; } } /* Accept it... */ addrlen = sizeof(addr); s = perform_accept(fd, (struct sockaddr *) &addr, &addrlen); if (s == -1) { /** * EMFILE and ENFILE is a signal that the limit of open * file descriptors has been reached. Pause accept(). * EINTR is a signal interrupt. The others are various OS ways * of saying that the client has closed the connection. */ if (errno == EMFILE || errno == ENFILE) { if (!slowaccept) { /* disable accept events */ struct timeval tv; configure_handler_event_types(0); tv.tv_sec = SLOW_ACCEPT_TIMEOUT; tv.tv_usec = 0L; memset(&slowaccept_event, 0, sizeof(slowaccept_event)); event_set(&slowaccept_event, -1, EV_TIMEOUT, handle_slowaccept_timeout, NULL); (void)event_base_set(data->event.ev_base, &slowaccept_event); (void)event_add(&slowaccept_event, &tv); slowaccept = 1; /* We don't want to spam the logs here */ } } else if (errno != EINTR && errno != EWOULDBLOCK #ifdef ECONNABORTED && errno != ECONNABORTED #endif /* ECONNABORTED */ #ifdef EPROTO && errno != EPROTO #endif /* EPROTO */ ) { log_msg(LOG_ERR, "accept failed: %s", strerror(errno)); } return; } if (reject) { shutdown(s, SHUT_RDWR); close(s); return; } /* * This region is deallocated when the TCP connection is * closed by the TCP handler. 
*/ tcp_region = region_create(xalloc, free); tcp_data = (struct tcp_handler_data *) region_alloc( tcp_region, sizeof(struct tcp_handler_data)); tcp_data->region = tcp_region; tcp_data->query = query_create(tcp_region, compressed_dname_offsets, compression_table_size, compressed_dnames); tcp_data->nsd = data->nsd; tcp_data->query_count = 0; #ifdef HAVE_SSL tcp_data->shake_state = tls_hs_none; tcp_data->tls = NULL; #endif tcp_data->prev = NULL; tcp_data->next = NULL; tcp_data->query_state = QUERY_PROCESSED; tcp_data->bytes_transmitted = 0; memcpy(&tcp_data->query->addr, &addr, addrlen); tcp_data->query->addrlen = addrlen; tcp_data->tcp_timeout = data->nsd->tcp_timeout * 1000; if (data->nsd->current_tcp_count > data->nsd->maximum_tcp_count/2) { /* very busy, give smaller timeout */ tcp_data->tcp_timeout = 200; } memset(&tcp_data->event, 0, sizeof(tcp_data->event)); timeout.tv_sec = tcp_data->tcp_timeout / 1000; timeout.tv_usec = (tcp_data->tcp_timeout % 1000)*1000; #ifdef HAVE_SSL if (data->tls_accept) { tcp_data->tls = incoming_ssl_fd(tcp_data->nsd->tls_ctx, s); if(!tcp_data->tls) { close(s); return; } tcp_data->shake_state = tls_hs_read; memset(&tcp_data->event, 0, sizeof(tcp_data->event)); event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT, handle_tls_reading, tcp_data); } else { #endif memset(&tcp_data->event, 0, sizeof(tcp_data->event)); event_set(&tcp_data->event, s, EV_PERSIST | EV_READ | EV_TIMEOUT, handle_tcp_reading, tcp_data); #ifdef HAVE_SSL } #endif if(event_base_set(data->event.ev_base, &tcp_data->event) != 0) { log_msg(LOG_ERR, "cannot set tcp event base"); close(s); region_destroy(tcp_region); return; } if(event_add(&tcp_data->event, &timeout) != 0) { log_msg(LOG_ERR, "cannot add tcp to event base"); close(s); region_destroy(tcp_region); return; } if(tcp_active_list) { tcp_active_list->prev = tcp_data; tcp_data->next = tcp_active_list; } tcp_active_list = tcp_data; /* * Keep track of the total number of TCP handlers installed so * we can 
stop accepting connections when the maximum number * of simultaneous TCP connections is reached. * * If tcp-reject-overflow is enabled, however, then we do not * change the handler event type; we keep it as-is and accept * overflow TCP connections only so that we can forcibly kill * them off. */ ++data->nsd->current_tcp_count; if (!data->nsd->options->tcp_reject_overflow && data->nsd->current_tcp_count == data->nsd->maximum_tcp_count) { configure_handler_event_types(0); } } static void send_children_command(struct nsd* nsd, sig_atomic_t command, int timeout) { size_t i; assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0); for (i = 0; i < nsd->child_count; ++i) { if (nsd->children[i].pid > 0 && nsd->children[i].child_fd != -1) { if (write(nsd->children[i].child_fd, &command, sizeof(command)) == -1) { if(errno != EAGAIN && errno != EINTR) log_msg(LOG_ERR, "problems sending command %d to server %d: %s", (int) command, (int) nsd->children[i].pid, strerror(errno)); } else if (timeout > 0) { (void)block_read(NULL, nsd->children[i].child_fd, &command, sizeof(command), timeout); } fsync(nsd->children[i].child_fd); close(nsd->children[i].child_fd); nsd->children[i].child_fd = -1; } } } static void send_children_quit(struct nsd* nsd) { DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit")); send_children_command(nsd, NSD_QUIT, 0); } static void send_children_quit_and_wait(struct nsd* nsd) { DEBUG(DEBUG_IPC, 1, (LOG_INFO, "send children quit and wait")); send_children_command(nsd, NSD_QUIT_CHILD, 3); } #ifdef BIND8_STATS static void set_children_stats(struct nsd* nsd) { size_t i; assert(nsd->server_kind == NSD_SERVER_MAIN && nsd->this_child == 0); DEBUG(DEBUG_IPC, 1, (LOG_INFO, "parent set stats to send to children")); for (i = 0; i < nsd->child_count; ++i) { nsd->children[i].need_to_send_STATS = 1; nsd->children[i].handler->event_types |= NETIO_EVENT_WRITE; } } #endif /* BIND8_STATS */ static void configure_handler_event_types(short event_types) { size_t i; 
for (i = 0; i < tcp_accept_handler_count; ++i) { struct event* handler = &tcp_accept_handlers[i].event; if(event_types) { /* reassign */ int fd = handler->ev_fd; struct event_base* base = handler->ev_base; if(tcp_accept_handlers[i].event_added) event_del(handler); memset(handler, 0, sizeof(*handler)); event_set(handler, fd, event_types, handle_tcp_accept, &tcp_accept_handlers[i]); if(event_base_set(base, handler) != 0) log_msg(LOG_ERR, "conhand: cannot event_base"); if(event_add(handler, NULL) != 0) log_msg(LOG_ERR, "conhand: cannot event_add"); tcp_accept_handlers[i].event_added = 1; } else { /* remove */ if(tcp_accept_handlers[i].event_added) { event_del(handler); tcp_accept_handlers[i].event_added = 0; } } } }