/*	$OpenBSD: session.c,v 1.299 2009/10/26 09:27:58 claudio Exp $ */

/*
 * Copyright (c) 2003, 2004, 2005 Henning Brauer <henning@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/param.h>
#include <sys/types.h>

#include <sys/mman.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <net/if_types.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/tcp.h>
#include <arpa/inet.h>

#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <poll.h>
#include <pwd.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "bgpd.h"
#include "mrt.h"
#include "session.h"

#define PFD_PIPE_MAIN		0
#define PFD_PIPE_ROUTE		1
#define PFD_PIPE_ROUTE_CTL	2
#define PFD_SOCK_CTL		3
#define PFD_SOCK_RCTL		4
#define PFD_LISTENERS_START	5

void	session_sighdlr(int);
int	setup_listeners(u_int *);
void	init_conf(struct bgpd_config *);
void	init_peer(struct peer *);
void	start_timer_holdtime(struct peer *);
void	start_timer_keepalive(struct peer *);
void	session_close_connection(struct peer *);
void	change_state(struct peer *, enum session_state, enum session_events);
int	session_setup_socket(struct peer *);
void	session_accept(int);
int	session_connect(struct peer *);
void	session_tcp_established(struct peer *);
void	session_capa_ann_none(struct peer *);
int	session_capa_add(struct buf *, u_int8_t, u_int8_t);
int	session_capa_add_mp(struct buf *, u_int16_t, u_int8_t);
struct bgp_msg	*session_newmsg(enum msg_type, u_int16_t);
int	session_sendmsg(struct bgp_msg *, struct peer *);
void	session_open(struct peer *);
void	session_keepalive(struct peer *);
void	session_update(u_int32_t, void *, size_t);
void	session_notification(struct peer *, u_int8_t, u_int8_t, void *,
	    ssize_t);
void	session_rrefresh(struct peer *, u_int16_t, u_int8_t);
int	session_dispatch_msg(struct pollfd *, struct peer *);
int	parse_header(struct peer *, u_char *, u_int16_t *, u_int8_t *);
int	parse_open(struct peer *);
int	parse_update(struct peer *);
int	parse_refresh(struct peer *);
int	parse_notification(struct peer *);
int	parse_capabilities(struct peer *, u_char *, u_int16_t, u_int32_t *);
void	session_dispatch_imsg(struct imsgbuf *, int, u_int *);
void	session_up(struct peer *);
void	session_down(struct peer *);
void	session_demote(struct peer *, int);

int			 la_cmp(struct listen_addr *, struct listen_addr *);
struct peer		*getpeerbyip(struct sockaddr *);
int			 session_match_mask(struct peer *, struct sockaddr *);
struct peer		*getpeerbyid(u_int32_t);
static struct sockaddr	*addr2sa(struct bgpd_addr *, u_int16_t);

struct bgpd_config	*conf, *nconf = NULL;
struct bgpd_sysdep	 sysdep;
struct peer		*npeers;
volatile sig_atomic_t	 session_quit = 0;
int			 pending_reconf = 0;
int			 csock = -1, rcsock = -1;
u_int			 peer_cnt;
struct imsgbuf		*ibuf_rde;
struct imsgbuf		*ibuf_rde_ctl;
struct imsgbuf		*ibuf_main;

struct mrt_head		 mrthead;

void
session_sighdlr(int sig)
{
	switch (sig) {
	case SIGINT:
	case SIGTERM:
		session_quit = 1;
		break;
	}
}

int
setup_listeners(u_int *la_cnt)
{
	int			 ttl = 255;
	int			 opt;
	struct listen_addr	*la;
	u_int			 cnt = 0;

	TAILQ_FOREACH(la, conf->listen_addrs, entry) {
		la->reconf = RECONF_NONE;
		cnt++;

		if (la->flags & LISTENER_LISTENING)
			continue;

		if (la->fd == -1) {
			log_warn("cannot establish listener on %s: invalid fd",
			    log_sockaddr((struct sockaddr *)&la->sa));
			continue;
		}

		opt = 1;
		if (setsockopt(la->fd, IPPROTO_TCP, TCP_MD5SIG,
		    &opt, sizeof(opt)) == -1) {
			if (errno == ENOPROTOOPT) {	/* system w/o md5sig */
				log_warnx("md5sig not available, disabling");
				sysdep.no_md5sig = 1;
			} else
				fatal("setsockopt TCP_MD5SIG");
		}

		/* set ttl to 255 so that ttl-security works */
		if (la->sa.ss_family == AF_INET && setsockopt(la->fd,
		    IPPROTO_IP, IP_TTL, &ttl, sizeof(ttl)) == -1) {
			log_warn("setup_listeners setsockopt TTL");
			continue;
		}

		session_socket_blockmode(la->fd, BM_NONBLOCK);

		if (listen(la->fd, MAX_BACKLOG)) {
			close(la->fd);
			fatal("listen");
		}

		la->flags |= LISTENER_LISTENING;

		log_info("listening on %s",
		    log_sockaddr((struct sockaddr *)&la->sa));
	}

	*la_cnt = cnt;

	return (0);
}

pid_t
session_main(struct bgpd_config *config, struct peer *cpeers,
    struct network_head *net_l, struct filter_head *rules,
    struct mrt_head *m_l, struct rib_names *rib_l, int pipe_m2s[2],
    int pipe_s2r[2], int pipe_m2r[2], int pipe_s2rctl[2])
{
	int			 nfds, timeout;
	unsigned int		 i, j, idx_peers, idx_listeners, idx_mrts;
	pid_t			 pid;
	u_int			 pfd_elms = 0, peer_l_elms = 0, mrt_l_elms = 0;
	u_int			 listener_cnt, ctl_cnt, mrt_cnt;
	u_int			 new_cnt;
	u_int32_t		 ctl_queued;
	struct passwd		*pw;
	struct peer		*p, **peer_l = NULL, *last, *next;
	struct network		*net;
	struct mrt		*m, *xm, **mrt_l = NULL;
	struct filter_rule	*r;
	struct pollfd		*pfd = NULL;
	struct ctl_conn		*ctl_conn;
	struct listen_addr	*la;
	struct rde_rib		*rr;
	void			*newp;
	short			 events;

	conf = config;
	peers = cpeers;

	switch (pid = fork()) {
	case -1:
		fatal("cannot fork");
	case 0:
		break;
	default:
		return (pid);
	}

	/* control socket is outside chroot */
	if ((csock = control_init(0, conf->csock)) == -1)
		fatalx("control socket setup failed");
	if (conf->rcsock != NULL &&
	    (rcsock = control_init(1, conf->rcsock)) == -1)
		fatalx("control socket setup failed");

	if ((pw = getpwnam(BGPD_USER)) == NULL)
		fatal(NULL);

	if (chroot(pw->pw_dir) == -1)
		fatal("chroot");
	if (chdir("/") == -1)
		fatal("chdir(\"/\")");

	setproctitle("session engine");
	bgpd_process = PROC_SE;

	if (pfkey_init(&sysdep) == -1)
		fatalx("pfkey setup failed");

	if (setgroups(1, &pw->pw_gid) ||
	    setresgid(pw->pw_gid, pw->pw_gid, pw->pw_gid) ||
	    setresuid(pw->pw_uid, pw->pw_uid, pw->pw_uid))
		fatal("can't drop privileges");

	listener_cnt = 0;
	setup_listeners(&listener_cnt);

	signal(SIGTERM, session_sighdlr);
	signal(SIGINT, session_sighdlr);
	signal(SIGPIPE, SIG_IGN);
	signal(SIGHUP, SIG_IGN);
	log_info("session engine ready");
	close(pipe_m2s[0]);
	close(pipe_s2r[1]);
	close(pipe_s2rctl[1]);
	close(pipe_m2r[0]);
	close(pipe_m2r[1]);
	init_conf(conf);
	if ((ibuf_rde = malloc(sizeof(struct imsgbuf))) == NULL ||
	    (ibuf_rde_ctl = malloc(sizeof(struct imsgbuf))) == NULL ||
	    (ibuf_main = malloc(sizeof(struct imsgbuf))) == NULL)
		fatal(NULL);
	imsg_init(ibuf_rde, pipe_s2r[0]);
	imsg_init(ibuf_rde_ctl, pipe_s2rctl[0]);
	imsg_init(ibuf_main, pipe_m2s[1]);
	TAILQ_INIT(&ctl_conns);
	control_listen(csock);
	control_listen(rcsock);
	LIST_INIT(&mrthead);
	peer_cnt = 0;
	ctl_cnt = 0;

	/* filter rules are not used in the SE */
	while ((r = TAILQ_FIRST(rules)) != NULL) {
		TAILQ_REMOVE(rules, r, entry);
		free(r);
	}
	free(rules);

	/* network list is not used in the SE */
	while ((net = TAILQ_FIRST(net_l)) != NULL) {
		TAILQ_REMOVE(net_l, net, entry);
		filterset_free(&net->net.attrset);
		free(net);
	}

	/* main mrt list is not used in the SE */
	while ((m = LIST_FIRST(m_l)) != NULL) {
		LIST_REMOVE(m, entry);
		free(m);
	}
	/* rib names not used in the SE */
	while ((rr = SIMPLEQ_FIRST(&ribnames))) {
		SIMPLEQ_REMOVE_HEAD(&ribnames, entry);
		free(rr);
	}

	while (session_quit == 0) {
		/* check for peers to be initialized or deleted */
		last = NULL;
		for (p = peers; p != NULL; p = next) {
			next = p->next;
			if (!pending_reconf) {
				/* cloned peer that idled out? */
				if (p->state == STATE_IDLE && p->conf.cloned &&
				    time(NULL) - p->stats.last_updown >=
				    INTERVAL_HOLD_CLONED)
					p->conf.reconf_action = RECONF_DELETE;

				/* new peer that needs init? */
				if (p->state == STATE_NONE)
					init_peer(p);

				/* reinit due? */
				if (p->conf.reconf_action == RECONF_REINIT) {
					session_stop(p, ERR_CEASE_ADMIN_RESET);
					timer_set(p, Timer_IdleHold, 0);
				}

				/* deletion due? */
				if (p->conf.reconf_action == RECONF_DELETE) {
					if (p->demoted)
						session_demote(p, -1);
					p->conf.demote_group[0] = 0;
					session_stop(p, ERR_CEASE_PEER_UNCONF);
					log_peer_warnx(&p->conf, "removed");
					if (last != NULL)
						last->next = next;
					else
						peers = next;
					timer_remove_all(p);
					free(p);
					peer_cnt--;
					continue;
				}
				p->conf.reconf_action = RECONF_NONE;
			}
			last = p;
		}

		if (peer_cnt > peer_l_elms) {
			if ((newp = realloc(peer_l, sizeof(struct peer *) *
			    peer_cnt)) == NULL) {
				/* panic for now  */
				log_warn("could not resize peer_l from %u -> %u"
				    " entries", peer_l_elms, peer_cnt);
				fatalx("exiting");
			}
			peer_l = newp;
			peer_l_elms = peer_cnt;
		}

		mrt_cnt = 0;
		for (m = LIST_FIRST(&mrthead); m != NULL; m = xm) {
			xm = LIST_NEXT(m, entry);
			if (m->state == MRT_STATE_REMOVE) {
				mrt_clean(m);
				LIST_REMOVE(m, entry);
				free(m);
				continue;
			}
			if (m->wbuf.queued)
				mrt_cnt++;
		}

		if (mrt_cnt > mrt_l_elms) {
			if ((newp = realloc(mrt_l, sizeof(struct mrt *) *
			    mrt_cnt)) == NULL) {
				/* panic for now  */
				log_warn("could not resize mrt_l from %u -> %u"
				    " entries", mrt_l_elms, mrt_cnt);
				fatalx("exiting");
			}
			mrt_l = newp;
			mrt_l_elms = mrt_cnt;
		}

		new_cnt = PFD_LISTENERS_START + listener_cnt + peer_cnt +
		    ctl_cnt + mrt_cnt;
		if (new_cnt > pfd_elms) {
			if ((newp = realloc(pfd, sizeof(struct pollfd) *
			    new_cnt)) == NULL) {
				/* panic for now  */
				log_warn("could not resize pfd from %u -> %u"
				    " entries", pfd_elms, new_cnt);
				fatalx("exiting");
			}
			pfd = newp;
			pfd_elms = new_cnt;
		}

		bzero(pfd, sizeof(struct pollfd) * pfd_elms);
		pfd[PFD_PIPE_MAIN].fd = ibuf_main->fd;
		pfd[PFD_PIPE_MAIN].events = POLLIN;
		if (ibuf_main->w.queued > 0)
			pfd[PFD_PIPE_MAIN].events |= POLLOUT;
		pfd[PFD_PIPE_ROUTE].fd = ibuf_rde->fd;
		pfd[PFD_PIPE_ROUTE].events = POLLIN;
		if (ibuf_rde->w.queued > 0)
			pfd[PFD_PIPE_ROUTE].events |= POLLOUT;

		ctl_queued = 0;
		TAILQ_FOREACH(ctl_conn, &ctl_conns, entry)
			ctl_queued += ctl_conn->ibuf.w.queued;

		pfd[PFD_PIPE_ROUTE_CTL].fd = ibuf_rde_ctl->fd;
		if (ctl_queued < SESSION_CTL_QUEUE_MAX)
			/*
			 * Do not act as unlimited buffer. Don't read in more
			 * messages if the ctl sockets are getting full. 
			 */
			pfd[PFD_PIPE_ROUTE_CTL].events = POLLIN;
		pfd[PFD_SOCK_CTL].fd = csock;
		pfd[PFD_SOCK_CTL].events = POLLIN;
		pfd[PFD_SOCK_RCTL].fd = rcsock;
		pfd[PFD_SOCK_RCTL].events = POLLIN;

		i = PFD_LISTENERS_START;
		TAILQ_FOREACH(la, conf->listen_addrs, entry) {
			pfd[i].fd = la->fd;
			pfd[i].events = POLLIN;
			i++;
		}
		idx_listeners = i;
		timeout = 240;	/* loop every 240s at least */

		for (p = peers; p != NULL; p = p->next) {
			time_t	nextaction;
			struct peer_timer *pt;

			/* check timers */
			if ((pt = timer_nextisdue(p)) != NULL) {
				switch (pt->type) {
				case Timer_Hold:
					bgp_fsm(p, EVNT_TIMER_HOLDTIME);
					break;
				case Timer_ConnectRetry:
					bgp_fsm(p, EVNT_TIMER_CONNRETRY);
					break;
				case Timer_Keepalive:
					bgp_fsm(p, EVNT_TIMER_KEEPALIVE);
					break;
				case Timer_IdleHold:
					bgp_fsm(p, EVNT_START);
					break;
				case Timer_IdleHoldReset:
					p->IdleHoldTime /= 2;
					if (p->IdleHoldTime <=
					    INTERVAL_IDLE_HOLD_INITIAL) {
						p->IdleHoldTime =
						    INTERVAL_IDLE_HOLD_INITIAL;
						timer_stop(p,
						    Timer_IdleHoldReset);
						p->errcnt = 0;
					} else
						timer_set(p,
						    Timer_IdleHoldReset,
						    p->IdleHoldTime);
					break;
				case Timer_CarpUndemote:
					timer_stop(p, Timer_CarpUndemote);
					if (p->demoted &&
					    p->state == STATE_ESTABLISHED)
						session_demote(p, -1);
					break;
				default:
					fatalx("King Bula lost in time");
				}
			}
			if ((nextaction = timer_nextduein(p)) != -1 &&
			    nextaction < timeout)
				timeout = nextaction;

			/* are we waiting for a write? */
			events = POLLIN;
			if (p->wbuf.queued > 0 || p->state == STATE_CONNECT)
				events |= POLLOUT;

			/* poll events */
			if (p->fd != -1 && events != 0) {
				pfd[i].fd = p->fd;
				pfd[i].events = events;
				peer_l[i - idx_listeners] = p;
				i++;
			}
		}

		idx_peers = i;

		LIST_FOREACH(m, &mrthead, entry)
			if (m->wbuf.queued) {
				pfd[i].fd = m->wbuf.fd;
				pfd[i].events = POLLOUT;
				mrt_l[i - idx_peers] = m;
				i++;
			}

		idx_mrts = i;

		TAILQ_FOREACH(ctl_conn, &ctl_conns, entry) {
			pfd[i].fd = ctl_conn->ibuf.fd;
			pfd[i].events = POLLIN;
			if (ctl_conn->ibuf.w.queued > 0)
				pfd[i].events |= POLLOUT;
			i++;
		}

		if (timeout < 0)
			timeout = 0;
		if ((nfds = poll(pfd, i, timeout * 1000)) == -1)
			if (errno != EINTR)
				fatal("poll error");

		if (nfds > 0 && pfd[PFD_PIPE_MAIN].revents & POLLOUT)
			if (msgbuf_write(&ibuf_main->w) < 0)
				fatal("pipe write error");

		if (nfds > 0 && pfd[PFD_PIPE_MAIN].revents & POLLIN) {
			nfds--;
			session_dispatch_imsg(ibuf_main, PFD_PIPE_MAIN,
			    &listener_cnt);
		}

		if (nfds > 0 && pfd[PFD_PIPE_ROUTE].revents & POLLOUT)
			if (msgbuf_write(&ibuf_rde->w) < 0)
				fatal("pipe write error");

		if (nfds > 0 && pfd[PFD_PIPE_ROUTE].revents & POLLIN) {
			nfds--;
			session_dispatch_imsg(ibuf_rde, PFD_PIPE_ROUTE,
			    &listener_cnt);
		}

		if (nfds > 0 && pfd[PFD_PIPE_ROUTE_CTL].revents & POLLIN) {
			nfds--;
			session_dispatch_imsg(ibuf_rde_ctl, PFD_PIPE_ROUTE_CTL,
			    &listener_cnt);
		}

		if (nfds > 0 && pfd[PFD_SOCK_CTL].revents & POLLIN) {
			nfds--;
			ctl_cnt += control_accept(csock, 0);
		}

		if (nfds > 0 && pfd[PFD_SOCK_RCTL].revents & POLLIN) {
			nfds--;
			ctl_cnt += control_accept(rcsock, 1);
		}

		for (j = PFD_LISTENERS_START; nfds > 0 && j < idx_listeners;
		    j++)
			if (pfd[j].revents & POLLIN) {
				nfds--;
				session_accept(pfd[j].fd);
			}

		for (; nfds > 0 && j < idx_peers; j++)
			nfds -= session_dispatch_msg(&pfd[j],
			    peer_l[j - idx_listeners]);

		for (; nfds > 0 && j < idx_mrts; j++)
			if (pfd[j].revents & POLLOUT) {
				nfds--;
				mrt_write(mrt_l[j - idx_peers]);
			}

		for (; nfds > 0 && j < i; j++)
			nfds -= control_dispatch_msg(&pfd[j], &ctl_cnt);
	}

	while ((p = peers) != NULL) {
		peers = p->next;
		session_stop(p, ERR_CEASE_ADMIN_DOWN);
		pfkey_remove(p);
		free(p);
	}

	while ((m = LIST_FIRST(&mrthead)) != NULL) {
		mrt_clean(m);
		LIST_REMOVE(m, entry);
		free(m);
	}

	while ((la = TAILQ_FIRST(conf->listen_addrs)) != NULL) {
		TAILQ_REMOVE(conf->listen_addrs, la, entry);
		free(la);
	}
	free(conf->listen_addrs);
	free(peer_l);
	free(mrt_l);
	free(pfd);

	msgbuf_write(&ibuf_rde->w);
	msgbuf_clear(&ibuf_rde->w);
	free(ibuf_rde);
	msgbuf_write(&ibuf_main->w);
	msgbuf_clear(&ibuf_main->w);
	free(ibuf_main);

	control_shutdown(csock);
	control_shutdown(rcsock);
	log_info("session engine exiting");
	_exit(0);
}

void
init_conf(struct bgpd_config *c)
{
	if (!c->holdtime)
		c->holdtime = INTERVAL_HOLD;
	if (!c->connectretry)
		c->connectretry = INTERVAL_CONNECTRETRY;
}

void
init_peer(struct peer *p)
{
	TAILQ_INIT(&p->timers);
	p->fd = p->wbuf.fd = -1;

	if (p->conf.if_depend[0])
		imsg_compose(ibuf_main, IMSG_IFINFO, 0, 0, -1,
		    p->conf.if_depend, sizeof(p->conf.if_depend));
	else
		p->depend_ok = 1;

	peer_cnt++;

	change_state(p, STATE_IDLE, EVNT_NONE);
	if (p->conf.down)
		timer_stop(p, Timer_IdleHold);		/* no autostart */
	else
		timer_set(p, Timer_IdleHold, 0);	/* start ASAP */

	/*
	 * on startup, demote if requested.
	 * do not handle new peers. they must reach ESTABLISHED beforehands.
	 * peers added at runtime have reconf_action set to RECONF_REINIT.
	 */
	if (p->conf.reconf_action != RECONF_REINIT && p->conf.demote_group[0])
		session_demote(p, +1);
}

void
bgp_fsm(struct peer *peer, enum session_events event)
{
	switch (peer->state) {
	case STATE_NONE:
		/* nothing */
		break;
	case STATE_IDLE:
		switch (event) {
		case EVNT_START:
			timer_stop(peer, Timer_Hold);
			timer_stop(peer, Timer_Keepalive);
			timer_stop(peer, Timer_IdleHold);

			/* allocate read buffer */
			peer->rbuf = calloc(1, sizeof(struct buf_read));
			if (peer->rbuf == NULL)
				fatal(NULL);
			peer->rbuf->wpos = 0;

			/* init write buffer */
			msgbuf_init(&peer->wbuf);

			/* init pfkey - remove old if any, load new ones */
			pfkey_remove(peer);
			if (pfkey_establish(peer) == -1) {
				log_peer_warnx(&peer->conf,
				    "pfkey setup failed");
				return;
			}

			peer->stats.last_sent_errcode = 0;
			peer->stats.last_sent_suberr = 0;

			if (!peer->depend_ok)
				timer_stop(peer, Timer_ConnectRetry);
			else if (peer->passive || peer->conf.passive ||
			    peer->conf.template) {
				change_state(peer, STATE_ACTIVE, event);
				timer_stop(peer, Timer_ConnectRetry);
			} else {
				change_state(peer, STATE_CONNECT, event);
				timer_set(peer, Timer_ConnectRetry,
				    conf->connectretry);
				session_connect(peer);
			}
			peer->passive = 0;
			break;
		default:
			/* ignore */
			break;
		}
		break;
	case STATE_CONNECT:
		switch (event) {
		case EVNT_START:
			/* ignore */
			break;
		case EVNT_CON_OPEN:
			session_tcp_established(peer);
			session_open(peer);
			timer_stop(peer, Timer_ConnectRetry);
			peer->holdtime = INTERVAL_HOLD_INITIAL;
			start_timer_holdtime(peer);
			change_state(peer, STATE_OPENSENT, event);
			break;
		case EVNT_CON_OPENFAIL:
			timer_set(peer, Timer_ConnectRetry,
			    conf->connectretry);
			session_close_connection(peer);
			change_state(peer, STATE_ACTIVE, event);
			break;
		case EVNT_TIMER_CONNRETRY:
			timer_set(peer, Timer_ConnectRetry,
			    conf->connectretry);
			session_connect(peer);
			break;
		default:
			change_state(peer, STATE_IDLE, event);
			break;
		}
		break;
	case STATE_ACTIVE:
		switch (event) {
		case EVNT_START:
			/* ignore */
			break;
		case EVNT_CON_OPEN:
			session_tcp_established(peer);
			session_open(peer);
			timer_stop(peer, Timer_ConnectRetry);
			peer->holdtime = INTERVAL_HOLD_INITIAL;
			start_timer_holdtime(peer);
			change_state(peer, STATE_OPENSENT, event);
			break;
		case EVNT_CON_OPENFAIL:
			timer_set(peer, Timer_ConnectRetry,
			    conf->connectretry);
			session_close_connection(peer);
			change_state(peer, STATE_ACTIVE, event);
			break;
		case EVNT_TIMER_CONNRETRY:
			timer_set(peer, Timer_ConnectRetry,
			    peer->holdtime);
			change_state(peer, STATE_CONNECT, event);
			session_connect(peer);
			break;
		default:
			change_state(peer, STATE_IDLE, event);
			break;
		}
		break;
	case STATE_OPENSENT:
		switch (event) {
		case EVNT_START:
			/* ignore */
			break;
		case EVNT_STOP:
			change_state(peer, STATE_IDLE, event);
			break;
		case EVNT_CON_CLOSED:
			session_close_connection(peer);
			timer_set(peer, Timer_ConnectRetry,
			    conf->connectretry);
			change_state(peer, STATE_ACTIVE, event);
			break;
		case EVNT_CON_FATAL:
			change_state(peer, STATE_IDLE, event);
			break;
		case EVNT_TIMER_HOLDTIME:
			session_notification(peer, ERR_HOLDTIMEREXPIRED,
			    0, NULL, 0);
			change_state(peer, STATE_IDLE, event);
			break;
		case EVNT_RCVD_OPEN:
			/* parse_open calls change_state itself on failure */
			if (parse_open(peer))
				break;
			session_keepalive(peer);
			change_state(peer, STATE_OPENCONFIRM, event);
			break;
		case EVNT_RCVD_NOTIFICATION:
			if (parse_notification(peer)) {
				change_state(peer, STATE_IDLE, event);
				/* don't punish, capa negotiation */
				timer_set(peer, Timer_IdleHold, 0);
				peer->IdleHoldTime /= 2;
			} else
				change_state(peer, STATE_IDLE, event);
			break;
		default:
			session_notification(peer, ERR_FSM, 0, NULL, 0);
			change_state(peer, STATE_IDLE, event);
			break;
		}
		break;
	case STATE_OPENCONFIRM:
		switch (event) {
		case EVNT_START:
			/* ignore */
			break;
		case EVNT_STOP:
			change_state(peer, STATE_IDLE, event);
			break;
		case EVNT_CON_CLOSED:
		case EVNT_CON_FATAL:
			change_state(peer, STATE_IDLE, event);
			break;
		case EVNT_TIMER_HOLDTIME:
			session_notification(peer, ERR_HOLDTIMEREXPIRED,
			    0, NULL, 0);
			change_state(peer, STATE_IDLE, event);
			break;
		case EVNT_TIMER_KEEPALIVE:
			session_keepalive(peer);
			break;
		case EVNT_RCVD_KEEPALIVE:
			start_timer_holdtime(peer);
			change_state(peer, STATE_ESTABLISHED, event);
			break;
		case EVNT_RCVD_NOTIFICATION:
			parse_notification(peer);
			change_state(peer, STATE_IDLE, event);
			break;
		default:
			session_notification(peer, ERR_FSM, 0, NULL, 0);
			change_state(peer, STATE_IDLE, event);
			break;
		}
		break;
	case STATE_ESTABLISHED:
		switch (event) {
		case EVNT_START:
			/* ignore */
			break;
		case EVNT_STOP:
			change_state(peer, STATE_IDLE, event);
			break;
		case EVNT_CON_CLOSED:
		case EVNT_CON_FATAL:
			change_state(peer, STATE_IDLE, event);
			break;
		case EVNT_TIMER_HOLDTIME:
			session_notification(peer, ERR_HOLDTIMEREXPIRED,
			    0, NULL, 0);
			change_state(peer, STATE_IDLE, event);
			break;
		case EVNT_TIMER_KEEPALIVE:
			session_keepalive(peer);
			break;
		case EVNT_RCVD_KEEPALIVE:
			start_timer_holdtime(peer);
			break;
		case EVNT_RCVD_UPDATE:
			start_timer_holdtime(peer);
			if (parse_update(peer))
				change_state(peer, STATE_IDLE, event);
			else
				start_timer_holdtime(peer);
			break;
		case EVNT_RCVD_NOTIFICATION:
			parse_notification(peer);
			change_state(peer, STATE_IDLE, event);
			break;
		default:
			session_notification(peer, ERR_FSM, 0, NULL, 0);
			change_state(peer, STATE_IDLE, event);
			break;
		}
		break;
	}
}

void
start_timer_holdtime(struct peer *peer)
{
	if (peer->holdtime > 0)
		timer_set(peer, Timer_Hold, peer->holdtime);
	else
		timer_stop(peer, Timer_Hold);
}

void
start_timer_keepalive(struct peer *peer)
{
	if (peer->holdtime > 0)
		timer_set(peer, Timer_Keepalive, peer->holdtime / 3);
	else
		timer_stop(peer, Timer_Keepalive);
}

void
session_close_connection(struct peer *peer)
{
	if (peer->fd != -1)
		close(peer->fd);

	peer->fd = peer->wbuf.fd = -1;
}

void
change_state(struct peer *peer, enum session_state state,
    enum session_events event)
{
	struct mrt	*mrt;

	switch (state) {
	case STATE_IDLE:
		/* carp demotion first. new peers handled in init_peer */
		if (peer->state == STATE_ESTABLISHED &&
		    peer->conf.demote_group[0] && !peer->demoted)
			session_demote(peer, +1);

		/*
		 * try to write out what's buffered (maybe a notification),
		 * don't bother if it fails
		 */
		if (peer->state >= STATE_OPENSENT && peer->wbuf.queued)
			msgbuf_write(&peer->wbuf);

		/*
		 * we must start the timer for the next EVNT_START
		 * if we are coming here due to an error and the
		 * session was not established successfully before, the
		 * starttimerinterval needs to be exponentially increased
		 */
		if (peer->IdleHoldTime == 0)
			peer->IdleHoldTime = INTERVAL_IDLE_HOLD_INITIAL;
		peer->holdtime = INTERVAL_HOLD_INITIAL;
		timer_stop(peer, Timer_ConnectRetry);
		timer_stop(peer, Timer_Keepalive);
		timer_stop(peer, Timer_Hold);
		timer_stop(peer, Timer_IdleHoldReset);
		session_close_connection(peer);
		msgbuf_clear(&peer->wbuf);
		free(peer->rbuf);
		peer->rbuf = NULL;
		bzero(&peer->capa.peer, sizeof(peer->capa.peer));
		if (peer->state == STATE_ESTABLISHED)
			session_down(peer);
		if (event != EVNT_STOP) {
			timer_set(peer, Timer_IdleHold, peer->IdleHoldTime);
			if (event != EVNT_NONE &&
			    peer->IdleHoldTime < MAX_IDLE_HOLD/2)
				peer->IdleHoldTime *= 2;
		}
		if (peer->state == STATE_NONE ||
		    peer->state == STATE_ESTABLISHED) {
			/* initialize capability negotiation structures */
			memcpy(&peer->capa.ann, &peer->conf.capabilities,
			    sizeof(peer->capa.ann));
			if (!peer->conf.announce_capa)
				session_capa_ann_none(peer);
		}
		break;
	case STATE_CONNECT:
		break;
	case STATE_ACTIVE:
		break;
	case STATE_OPENSENT:
		break;
	case STATE_OPENCONFIRM:
		break;
	case STATE_ESTABLISHED:
		timer_set(peer, Timer_IdleHoldReset, peer->IdleHoldTime);
		if (peer->demoted)
			timer_set(peer, Timer_CarpUndemote,
			    INTERVAL_HOLD_DEMOTED);
		session_up(peer);
		break;
	default:		/* something seriously fucked */
		break;
	}

	log_statechange(peer, state, event);
	LIST_FOREACH(mrt, &mrthead, entry) {
		if (!(mrt->type == MRT_ALL_IN || mrt->type == MRT_ALL_OUT))
			continue;
		if ((mrt->peer_id == 0 && mrt->group_id == 0) ||
		    mrt->peer_id == peer->conf.id || (mrt->group_id != 0 &&
		    mrt->group_id == peer->conf.groupid))
			mrt_dump_state(mrt, peer->state, state, peer);
	}
	peer->prev_state = peer->state;
	peer->state = state;
}

void
session_accept(int listenfd)
{
	int			 connfd;
	int			 opt;
	socklen_t		 len;
	struct sockaddr_storage	 cliaddr;
	struct peer		*p = NULL;

	len = sizeof(cliaddr);
	if ((connfd = accept(listenfd,
	    (struct sockaddr *)&cliaddr, &len)) == -1) {
		if (errno == EWOULDBLOCK || errno == EINTR)
			return;
		else
			log_warn("accept");
	}

	p = getpeerbyip((struct sockaddr *)&cliaddr);

	if (p != NULL && p->state == STATE_IDLE && p->errcnt < 2) {
		if (timer_running(p, Timer_IdleHold, NULL)) {
			/* fast reconnect after clear */
			p->passive = 1;
			bgp_fsm(p, EVNT_START);
		}
	}

	if (p != NULL &&
	    (p->state == STATE_CONNECT || p->state == STATE_ACTIVE)) {
		if (p->fd != -1) {
			if (p->state == STATE_CONNECT)
				session_close_connection(p);
			else {
				close(connfd);
				return;
			}
		}

		if (p->conf.auth.method != AUTH_NONE && sysdep.no_pfkey) {
			log_peer_warnx(&p->conf,
			    "ipsec or md5sig configured but not available");
			close(connfd);
			return;
		}

		if (p->conf.auth.method == AUTH_MD5SIG) {
			if (sysdep.no_md5sig) {
				log_peer_warnx(&p->conf,
				    "md5sig configured but not available");
				close(connfd);
				return;
			}
			len = sizeof(opt);
			if (getsockopt(connfd, IPPROTO_TCP, TCP_MD5SIG,
			    &opt, &len) == -1)
				fatal("getsockopt TCP_MD5SIG");
			if (!opt) {	/* non-md5'd connection! */
				log_peer_warnx(&p->conf,
				    "connection attempt without md5 signature");
				close(connfd);
				return;
			}
		}
		p->fd = p->wbuf.fd = connfd;
		if (session_setup_socket(p)) {
			close(connfd);
			return;
		}
		session_socket_blockmode(connfd, BM_NONBLOCK);
		bgp_fsm(p, EVNT_CON_OPEN);
	} else {
		log_conn_attempt(p, (struct sockaddr *)&cliaddr);
		close(connfd);
	}
}

int
session_connect(struct peer *peer)
{
	int			 opt = 1;
	struct sockaddr		*sa;

	/*
	 * we do not need the overcomplicated collision detection RFC 1771
	 * describes; we simply make sure there is only ever one concurrent
	 * tcp connection per peer.
	 */
	if (peer->fd != -1)
		return (-1);

	if ((peer->fd = socket(peer->conf.remote_addr.af, SOCK_STREAM,
	    IPPROTO_TCP)) == -1) {
		log_peer_warn(&peer->conf, "session_connect socket");
		bgp_fsm(peer, EVNT_CON_OPENFAIL);
		return (-1);
	}

	if (peer->conf.auth.method != AUTH_NONE && sysdep.no_pfkey) {
		log_peer_warnx(&peer->conf,
		    "ipsec or md5sig configured but not available");
		bgp_fsm(peer, EVNT_CON_OPENFAIL);
		return (-1);
	}

	if (peer->conf.auth.method == AUTH_MD5SIG) {
		if (sysdep.no_md5sig) {
			log_peer_warnx(&peer->conf,
			    "md5sig configured but not available");
			bgp_fsm(peer, EVNT_CON_OPENFAIL);
			return (-1);
		}
		if (setsockopt(peer->fd, IPPROTO_TCP, TCP_MD5SIG,
		    &opt, sizeof(opt)) == -1) {
			log_peer_warn(&peer->conf, "setsockopt md5sig");
			bgp_fsm(peer, EVNT_CON_OPENFAIL);
			return (-1);
		}
	}
	peer->wbuf.fd = peer->fd;

	/* if update source is set we need to bind() */
	if (peer->conf.local_addr.af) {
		sa = addr2sa(&peer->conf.local_addr, 0);
		if (bind(peer->fd, sa, sa->sa_len) == -1) {
			log_peer_warn(&peer->conf, "session_connect bind");
			bgp_fsm(peer, EVNT_CON_OPENFAIL);
			return (-1);
		}
	}

	if (session_setup_socket(peer)) {
		bgp_fsm(peer, EVNT_CON_OPENFAIL);
		return (-1);
	}

	session_socket_blockmode(peer->fd, BM_NONBLOCK);

	sa = addr2sa(&peer->conf.remote_addr, BGP_PORT);
	if (connect(peer->fd, sa, sa->sa_len) == -1) {
		if (errno != EINPROGRESS) {
			if (errno != peer->lasterr)
				log_peer_warn(&peer->conf, "connect");
			peer->lasterr = errno;
			bgp_fsm(peer, EVNT_CON_OPENFAIL);
			return (-1);
		}
	} else
		bgp_fsm(peer, EVNT_CON_OPEN);

	return (0);
}

int
session_setup_socket(struct peer *p)
{
	int	ttl = p->conf.distance;
	int	pre = IPTOS_PREC_INTERNETCONTROL;
	int	nodelay = 1;
	int	bsize;

	if (p->conf.ebgp && p->conf.remote_addr.af == AF_INET) {
		/* set TTL to foreign router's distance - 1=direct n=multihop
		   with ttlsec, we always use 255 */
		if (p->conf.ttlsec) {
			ttl = 256 - p->conf.distance;
			if (setsockopt(p->fd, IPPROTO_IP, IP_MINTTL, &ttl,
			    sizeof(ttl)) == -1) {
				log_peer_warn(&p->conf,
				    "session_setup_socket setsockopt MINTTL");
				return (-1);
			}
			ttl = 255;
		}

		if (setsockopt(p->fd, IPPROTO_IP, IP_TTL, &ttl,
		    sizeof(ttl)) == -1) {
			log_peer_warn(&p->conf,
			    "session_setup_socket setsockopt TTL");
			return (-1);
		}
	}

	if (p->conf.ebgp && p->conf.remote_addr.af == AF_INET6)
		/* set hoplimit to foreign router's distance */
		if (setsockopt(p->fd, IPPROTO_IPV6, IPV6_UNICAST_HOPS, &ttl,
		    sizeof(ttl)) == -1) {
			log_peer_warn(&p->conf,
			    "session_setup_socket setsockopt hoplimit");
			return (-1);
		}

	/* if ttlsec is in use, set minttl */
	if (p->conf.ttlsec) {
		ttl = 256 - p->conf.distance;
		setsockopt(p->fd, IPPROTO_IP, IP_MINTTL, &ttl, sizeof(ttl));

	}

	/* set TCP_NODELAY */
	if (setsockopt(p->fd, IPPROTO_TCP, TCP_NODELAY, &nodelay,
	    sizeof(nodelay)) == -1) {
		log_peer_warn(&p->conf,
		    "session_setup_socket setsockopt TCP_NODELAY");
		return (-1);
	}

	/* set precedence, see RFC 1771 appendix 5 */
	if (p->conf.remote_addr.af == AF_INET &&
	    setsockopt(p->fd, IPPROTO_IP, IP_TOS, &pre, sizeof(pre)) == -1) {
		log_peer_warn(&p->conf,
		    "session_setup_socket setsockopt TOS");
		return (-1);
	}

	/* only increase bufsize (and thus window) if md5 or ipsec is in use */
	if (p->conf.auth.method != AUTH_NONE) {
		/* try to increase bufsize. no biggie if it fails */
		bsize = 65535;
		while (setsockopt(p->fd, SOL_SOCKET, SO_RCVBUF, &bsize,
		    sizeof(bsize)) == -1)
			bsize /= 2;
		bsize = 65535;
		while (setsockopt(p->fd, SOL_SOCKET, SO_SNDBUF, &bsize,
		    sizeof(bsize)) == -1)
			bsize /= 2;
	}

	return (0);
}

void
session_socket_blockmode(int fd, enum blockmodes bm)
{
	int	flags;

	if ((flags = fcntl(fd, F_GETFL, 0)) == -1)
		fatal("fcntl F_GETFL");

	if (bm == BM_NONBLOCK)
		flags |= O_NONBLOCK;
	else
		flags &= ~O_NONBLOCK;

	if ((flags = fcntl(fd, F_SETFL, flags)) == -1)
		fatal("fcntl F_SETFL");
}

void
session_tcp_established(struct peer *peer)
{
	socklen_t	len;

	len = sizeof(peer->sa_local);
	if (getsockname(peer->fd, (struct sockaddr *)&peer->sa_local,
	    &len) == -1)
		log_warn("getsockname");
	len = sizeof(peer->sa_remote);
	if (getpeername(peer->fd, (struct sockaddr *)&peer->sa_remote,
	    &len) == -1)
		log_warn("getpeername");
}

void
session_capa_ann_none(struct peer *peer)
{
	peer->capa.ann.mp_v4 = SAFI_NONE;
	peer->capa.ann.mp_v4 = SAFI_NONE;
	peer->capa.ann.refresh = 0;
	peer->capa.ann.restart = 0;
	peer->capa.ann.as4byte = 0;
}

int
session_capa_add(struct buf *opb, u_int8_t capa_code, u_int8_t capa_len)
{
	int errs = 0;

	errs += buf_add(opb, &capa_code, sizeof(capa_code));
	errs += buf_add(opb, &capa_len, sizeof(capa_len));
	return (errs);
}

int
session_capa_add_mp(struct buf *buf, u_int16_t afi, u_int8_t safi)
{
	u_int8_t		 pad = 0;
	int			 errs = 0;

	afi = htons(afi);
	errs += buf_add(buf, &afi, sizeof(afi));
	errs += buf_add(buf, &pad, sizeof(pad));
	errs += buf_add(buf, &safi, sizeof(safi));

	return (errs);
}

struct bgp_msg *
session_newmsg(enum msg_type msgtype, u_int16_t len)
{
	struct bgp_msg		*msg;
	struct msg_header	 hdr;
	struct buf		*buf;
	int			 errs = 0;

	memset(&hdr.marker, 0xff, sizeof(hdr.marker));
	hdr.len = htons(len);
	hdr.type = msgtype;

	if ((buf = buf_open(len)) == NULL)
		return (NULL);

	errs += buf_add(buf, &hdr.marker, sizeof(hdr.marker));
	errs += buf_add(buf, &hdr.len, sizeof(hdr.len));
	errs += buf_add(buf, &hdr.type, sizeof(hdr.type));

	if (errs > 0 ||
	    (msg = calloc(1, sizeof(*msg))) == NULL) {
		buf_free(buf);
		return (NULL);
	}

	msg->buf = buf;
	msg->type = msgtype;
	msg->len = len;

	return (msg);
}

int
session_sendmsg(struct bgp_msg *msg, struct peer *p)
{
	struct mrt		*mrt;

	LIST_FOREACH(mrt, &mrthead, entry) {
		if (!(mrt->type == MRT_ALL_OUT || (msg->type == UPDATE &&
		    mrt->type == MRT_UPDATE_OUT)))
			continue;
		if ((mrt->peer_id == 0 && mrt->group_id == 0) ||
		    mrt->peer_id == p->conf.id || (mrt->group_id == 0 &&
		    mrt->group_id == p->conf.groupid))
			mrt_dump_bgp_msg(mrt, msg->buf->buf, msg->len, p);
	}

	buf_close(&p->wbuf, msg->buf);
	free(msg);
	return (0);
}

void
session_open(struct peer *p)
{
	struct bgp_msg		*buf;
	struct buf		*opb;
	struct msg_open		 msg;
	u_int16_t		 len;
	u_int8_t		 op_type, optparamlen = 0;
	u_int			 errs = 0;


	if ((opb = buf_dynamic(0, UCHAR_MAX - sizeof(op_type) -
	    sizeof(optparamlen))) == NULL) {
		bgp_fsm(p, EVNT_CON_FATAL);
		return;
	}

	/* multiprotocol extensions, RFC 4760 */
	if (p->capa.ann.mp_v4) {	/* 4 bytes data */
		errs += session_capa_add(opb, CAPA_MP, 4);
		errs += session_capa_add_mp(opb, AFI_IPv4, p->capa.ann.mp_v4);
	}
	if (p->capa.ann.mp_v6) {	/* 4 bytes data */
		errs += session_capa_add(opb, CAPA_MP, 4);
		errs += session_capa_add_mp(opb, AFI_IPv6, p->capa.ann.mp_v6);
	}

	/* route refresh, RFC 2918 */
	if (p->capa.ann.refresh)	/* no data */
		errs += session_capa_add(opb, CAPA_REFRESH, 0);

	/* End-of-RIB marker, RFC 4724 */
	if (p->capa.ann.restart) {	/* 2 bytes data */
		u_char		c[2];

		c[0] = 0x80; /* we're always restarting */
		c[1] = 0;
		errs += session_capa_add(opb, CAPA_RESTART, 2);
		errs += buf_add(opb, &c, 2);
	}

	/* 4-bytes AS numbers, draft-ietf-idr-as4bytes-13 */
	if (p->capa.ann.as4byte) {	/* 4 bytes data */
		u_int32_t	nas;

		nas = htonl(conf->as);
		errs += session_capa_add(opb, CAPA_AS4BYTE, sizeof(nas));
		errs += buf_add(opb, &nas, sizeof(nas));
	}

	if (buf_size(opb))
		optparamlen = buf_size(opb) + sizeof(op_type) +
		    sizeof(optparamlen);

	len = MSGSIZE_OPEN_MIN + optparamlen;
	if (errs || (buf = session_newmsg(OPEN, len)) == NULL) {
		buf_free(opb);
		bgp_fsm(p, EVNT_CON_FATAL);
		return;
	}

	msg.version = 4;
	msg.myas = htons(conf->short_as);
	if (p->conf.holdtime)
		msg.holdtime = htons(p->conf.holdtime);
	else
		msg.holdtime = htons(conf->holdtime);
	msg.bgpid = conf->bgpid;	/* is already in network byte order */
	msg.optparamlen = optparamlen;

	errs += buf_add(buf->buf, &msg.version, sizeof(msg.version));
	errs += buf_add(buf->buf, &msg.myas, sizeof(msg.myas));
	errs += buf_add(buf->buf, &msg.holdtime, sizeof(msg.holdtime));
	errs += buf_add(buf->buf, &msg.bgpid, sizeof(msg.bgpid));
	errs += buf_add(buf->buf, &msg.optparamlen, sizeof(msg.optparamlen));

	if (optparamlen) {
		op_type = OPT_PARAM_CAPABILITIES;
		optparamlen = buf_size(opb);
		errs += buf_add(buf->buf, &op_type, sizeof(op_type));
		errs += buf_add(buf->buf, &optparamlen, sizeof(optparamlen));
		errs += buf_add(buf->buf, opb->buf, buf_size(opb));
	}

	buf_free(opb);

	if (errs > 0) {
		buf_free(buf->buf);
		free(buf);
		bgp_fsm(p, EVNT_CON_FATAL);
		return;
	}

	if (session_sendmsg(buf, p) == -1) {
		bgp_fsm(p, EVNT_CON_FATAL);
		return;
	}

	p->stats.msg_sent_open++;
}

void
session_keepalive(struct peer *p)
{
	struct bgp_msg		*buf;

	if ((buf = session_newmsg(KEEPALIVE, MSGSIZE_KEEPALIVE)) == NULL ||
	    session_sendmsg(buf, p) == -1) {
		bgp_fsm(p, EVNT_CON_FATAL);
		return;
	}

	start_timer_keepalive(p);
	p->stats.msg_sent_keepalive++;
}

void
session_update(u_int32_t peerid, void *data, size_t datalen)
{
	struct peer		*p;
	struct bgp_msg		*buf;

	if ((p = getpeerbyid(peerid)) == NULL) {
		log_warnx("no such peer: id=%u", peerid);
		return;
	}

	if (p->state != STATE_ESTABLISHED)
		return;

	if ((buf = session_newmsg(UPDATE, MSGSIZE_HEADER + datalen)) == NULL) {
		bgp_fsm(p, EVNT_CON_FATAL);
		return;
	}

	if (buf_add(buf->buf, data, datalen)) {
		buf_free(buf->buf);
		free(buf);
		bgp_fsm(p, EVNT_CON_FATAL);
		return;
	}

	if (session_sendmsg(buf, p) == -1) {
		bgp_fsm(p, EVNT_CON_FATAL);
		return;
	}

	start_timer_keepalive(p);
	p->stats.msg_sent_update++;
}

void
session_notification(struct peer *p, u_int8_t errcode, u_int8_t subcode,
    void *data, ssize_t datalen)
{
	struct bgp_msg		*buf;
	u_int			 errs = 0;

	if (p->stats.last_sent_errcode)	/* some notification already sent */
		return;

	if ((buf = session_newmsg(NOTIFICATION,
	    MSGSIZE_NOTIFICATION_MIN + datalen)) == NULL) {
		bgp_fsm(p, EVNT_CON_FATAL);
		return;
	}

	errs += buf_add(buf->buf, &errcode, sizeof(errcode));
	errs += buf_add(buf->buf, &subcode, sizeof(subcode));

	if (datalen > 0)
		errs += buf_add(buf->buf, data, datalen);

	if (errs > 0) {
		buf_free(buf->buf);
		free(buf);
		bgp_fsm(p, EVNT_CON_FATAL);
		return;
	}

	if (session_sendmsg(buf, p) == -1) {
		bgp_fsm(p, EVNT_CON_FATAL);
		return;
	}

	p->stats.msg_sent_notification++;
	p->stats.last_sent_errcode = errcode;
	p->stats.last_sent_suberr = subcode;
}

int
session_neighbor_rrefresh(struct peer *p)
{
	if (!p->capa.peer.refresh)
		return (-1);

	if (p->capa.peer.mp_v4 != SAFI_NONE)
		session_rrefresh(p, AFI_IPv4, p->capa.peer.mp_v4);
	if (p->capa.peer.mp_v6 != SAFI_NONE)
		session_rrefresh(p, AFI_IPv6, p->capa.peer.mp_v6);

	return (0);
}

void
session_rrefresh(struct peer *p, u_int16_t afi, u_int8_t safi)
{
	struct bgp_msg		*buf;
	int			 errs = 0;
	u_int8_t		 null8 = 0;

	if ((buf = session_newmsg(RREFRESH, MSGSIZE_RREFRESH)) == NULL) {
		bgp_fsm(p, EVNT_CON_FATAL);
		return;
	}

	afi = htons(afi);
	errs += buf_add(buf->buf, &afi, sizeof(afi));
	errs += buf_add(buf->buf, &null8, sizeof(null8));
	errs += buf_add(buf->buf, &safi, sizeof(safi));

	if (errs > 0) {
		buf_free(buf->buf);
		free(buf);
		bgp_fsm(p, EVNT_CON_FATAL);
		return;
	}

	if (session_sendmsg(buf, p) == -1) {
		bgp_fsm(p, EVNT_CON_FATAL);
		return;
	}

	p->stats.msg_sent_rrefresh++;
}

int
session_dispatch_msg(struct pollfd *pfd, struct peer *p)
{
	ssize_t		n, rpos, av, left;
	socklen_t	len;
	int		error, processed = 0;
	u_int16_t	msglen;
	u_int8_t	msgtype;

	if (p->state == STATE_CONNECT) {
		if (pfd->revents & POLLOUT) {
			if (pfd->revents & POLLIN) {
				/* error occurred */
				len = sizeof(error);
				if (getsockopt(pfd->fd, SOL_SOCKET, SO_ERROR,
				    &error, &len) == -1 || error) {
					if (error)
						errno = error;
					if (errno != p->lasterr) {
						log_peer_warn(&p->conf,
						    "socket error");
						p->lasterr = errno;
					}
					bgp_fsm(p, EVNT_CON_OPENFAIL);
					return (1);
				}
			}
			bgp_fsm(p, EVNT_CON_OPEN);
			return (1);
		}
		if (pfd->revents & POLLHUP) {
			bgp_fsm(p, EVNT_CON_OPENFAIL);
			return (1);
		}
		if (pfd->revents & (POLLERR|POLLNVAL)) {
			bgp_fsm(p, EVNT_CON_FATAL);
			return (1);
		}
		return (0);
	}

	if (pfd->revents & POLLHUP) {
		bgp_fsm(p, EVNT_CON_CLOSED);
		return (1);
	}
	if (pfd->revents & (POLLERR|POLLNVAL)) {
		bgp_fsm(p, EVNT_CON_FATAL);
		return (1);
	}

	if (pfd->revents & POLLOUT && p->wbuf.queued) {
		if ((error = msgbuf_write(&p->wbuf)) < 0) {
			if (error == -2)
				log_peer_warnx(&p->conf, "Connection closed");
			else
				log_peer_warn(&p->conf, "write error");
			bgp_fsm(p, EVNT_CON_FATAL);
			return (1);
		}
		if (!(pfd->revents & POLLIN))
			return (1);
	}

	if (p->rbuf && pfd->revents & POLLIN) {
		if ((n = read(p->fd, p->rbuf->buf + p->rbuf->wpos,
		    sizeof(p->rbuf->buf) - p->rbuf->wpos)) == -1) {
			if (errno != EINTR && errno != EAGAIN) {
				log_peer_warn(&p->conf, "read error");
				bgp_fsm(p, EVNT_CON_FATAL);
			}
			return (1);
		}
		if (n == 0) {	/* connection closed */
			bgp_fsm(p, EVNT_CON_CLOSED);
			return (1);
		}

		rpos = 0;
		av = p->rbuf->wpos + n;
		p->stats.last_read = time(NULL);

		/*
		 * session might drop to IDLE -> buffers deallocated
		 * we MUST check rbuf != NULL before use
		 */
		for (;;) {
			if (rpos + MSGSIZE_HEADER > av)
				break;
			if (p->rbuf == NULL)
				break;
			if (parse_header(p, p->rbuf->buf + rpos, &msglen,
			    &msgtype) == -1)
				return (0);
			if (rpos + msglen > av)
				break;
			p->rbuf->rptr = p->rbuf->buf + rpos;

			switch (msgtype) {
			case OPEN:
				bgp_fsm(p, EVNT_RCVD_OPEN);
				p->stats.msg_rcvd_open++;
				break;
			case UPDATE:
				bgp_fsm(p, EVNT_RCVD_UPDATE);
				p->stats.msg_rcvd_update++;
				break;
			case NOTIFICATION:
				bgp_fsm(p, EVNT_RCVD_NOTIFICATION);
				p->stats.msg_rcvd_notification++;
				break;
			case KEEPALIVE:
				bgp_fsm(p, EVNT_RCVD_KEEPALIVE);
				p->stats.msg_rcvd_keepalive++;
				break;
			case RREFRESH:
				parse_refresh(p);
				p->stats.msg_rcvd_rrefresh++;
				break;
			default:	/* cannot happen */
				session_notification(p, ERR_HEADER,
				    ERR_HDR_TYPE, &msgtype, 1);
				log_warnx("received message with "
				    "unknown type %u", msgtype);
				bgp_fsm(p, EVNT_CON_FATAL);
			}
			rpos += msglen;
			if (++processed > MSG_PROCESS_LIMIT)
				break;
		}
		if (p->rbuf == NULL)
			return (1);

		if (rpos < av) {
			left = av - rpos;
			memcpy(&p->rbuf->buf, p->rbuf->buf + rpos, left);
			p->rbuf->wpos = left;
		} else
			p->rbuf->wpos = 0;

		return (1);
	}
	return (0);
}

int
parse_header(struct peer *peer, u_char *data, u_int16_t *len, u_int8_t *type)
{
	struct mrt		*mrt;
	u_char			*p;
	u_int16_t		 olen;
	static const u_int8_t	 marker[MSGSIZE_HEADER_MARKER] = { 0xff, 0xff,
				    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
				    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };

	/* caller MUST make sure we are getting 19 bytes! */
	p = data;
	if (memcmp(p, marker, sizeof(marker))) {
		log_peer_warnx(&peer->conf, "sync error");
		session_notification(peer, ERR_HEADER, ERR_HDR_SYNC, NULL, 0);
		bgp_fsm(peer, EVNT_CON_FATAL);
		return (-1);
	}
	p += MSGSIZE_HEADER_MARKER;

	memcpy(&olen, p, 2);
	*len = ntohs(olen);
	p += 2;
	memcpy(type, p, 1);

	if (*len < MSGSIZE_HEADER || *len > MAX_PKTSIZE) {
		log_peer_warnx(&peer->conf,
		    "received message: illegal length: %u byte", *len);
		session_notification(peer, ERR_HEADER, ERR_HDR_LEN,
		    &olen, sizeof(olen));
		bgp_fsm(peer, EVNT_CON_FATAL);
		return (-1);
	}

	switch (*type) {
	case OPEN:
		if (*len < MSGSIZE_OPEN_MIN) {
			log_peer_warnx(&peer->conf,
			    "received OPEN: illegal len: %u byte", *len);
			session_notification(peer, ERR_HEADER, ERR_HDR_LEN,
			    &olen, sizeof(olen));
			bgp_fsm(peer, EVNT_CON_FATAL);
			return (-1);
		}
		break;
	case NOTIFICATION:
		if (*len < MSGSIZE_NOTIFICATION_MIN) {
			log_peer_warnx(&peer->conf,
			    "received NOTIFICATION: illegal len: %u byte",
			    *len);
			session_notification(peer, ERR_HEADER, ERR_HDR_LEN,
			    &olen, sizeof(olen));
			bgp_fsm(peer, EVNT_CON_FATAL);
			return (-1);
		}
		break;
	case UPDATE:
		if (*len < MSGSIZE_UPDATE_MIN) {
			log_peer_warnx(&peer->conf,
			    "received UPDATE: illegal len: %u byte", *len);
			session_notification(peer, ERR_HEADER, ERR_HDR_LEN,
			    &olen, sizeof(olen));
			bgp_fsm(peer, EVNT_CON_FATAL);
			return (-1);
		}
		break;
	case KEEPALIVE:
		if (*len != MSGSIZE_KEEPALIVE) {
			log_peer_warnx(&peer->conf,
			    "received KEEPALIVE: illegal len: %u byte", *len);
			session_notification(peer, ERR_HEADER, ERR_HDR_LEN,
			    &olen, sizeof(olen));
			bgp_fsm(peer, EVNT_CON_FATAL);
			return (-1);
		}
		break;
	case RREFRESH:
		if (*len != MSGSIZE_RREFRESH) {
			log_peer_warnx(&peer->conf,
			    "received RREFRESH: illegal len: %u byte", *len);
			session_notification(peer, ERR_HEADER, ERR_HDR_LEN,
			    &olen, sizeof(olen));
			bgp_fsm(peer, EVNT_CON_FATAL);
			return (-1);
		}
		break;
	default:
		log_peer_warnx(&peer->conf,
		    "received msg with unknown type %u", *type);
		session_notification(peer, ERR_HEADER, ERR_HDR_TYPE,
		    type, 1);
		bgp_fsm(peer, EVNT_CON_FATAL);
		return (-1);
	}
	LIST_FOREACH(mrt, &mrthead, entry) {
		if (!(mrt->type == MRT_ALL_IN || (*type == UPDATE &&
		    mrt->type == MRT_UPDATE_IN)))
			continue;
		if ((mrt->peer_id == 0 && mrt->group_id == 0) ||
		    mrt->peer_id == peer->conf.id || (mrt->group_id != 0 &&
		    mrt->group_id == peer->conf.groupid))
			mrt_dump_bgp_msg(mrt, data, *len, peer);
	}
	return (0);
}

int
parse_open(struct peer *peer)
{
	u_char		*p, *op_val;
	u_int8_t	 version, rversion;
	u_int16_t	 short_as, msglen;
	u_int16_t	 holdtime, oholdtime, myholdtime;
	u_int32_t	 as, bgpid;
	u_int8_t	 optparamlen, plen;
	u_int8_t	 op_type, op_len;

	p = peer->rbuf->rptr;
	p += MSGSIZE_HEADER_MARKER;
	memcpy(&msglen, p, sizeof(msglen));
	msglen = ntohs(msglen);

	p = peer->rbuf->rptr;
	p += MSGSIZE_HEADER;	/* header is already checked */

	memcpy(&version, p, sizeof(version));
	p += sizeof(version);

	if (version != BGP_VERSION) {
		log_peer_warnx(&peer->conf,
		    "peer wants unrecognized version %u", version);
		if (version > BGP_VERSION)
			rversion = version - BGP_VERSION;
		else
			rversion = BGP_VERSION;
		session_notification(peer, ERR_OPEN, ERR_OPEN_VERSION,
		    &rversion, sizeof(rversion));
		change_state(peer, STATE_IDLE, EVNT_RCVD_OPEN);
		return (-1);
	}

	memcpy(&short_as, p, sizeof(short_as));
	p += sizeof(short_as);
	as = peer->short_as = ntohs(short_as);

	memcpy(&oholdtime, p, sizeof(oholdtime));
	p += sizeof(oholdtime);

	holdtime = ntohs(oholdtime);
	if (holdtime && holdtime < peer->conf.min_holdtime) {
		log_peer_warnx(&peer->conf,
		    "peer requests unacceptable holdtime %u", holdtime);
		session_notification(peer, ERR_OPEN, ERR_OPEN_HOLDTIME,
		    NULL, 0);
		change_state(peer, STATE_IDLE, EVNT_RCVD_OPEN);
		return (-1);
	}

	myholdtime = peer->conf.holdtime;
	if (!myholdtime)
		myholdtime = conf->holdtime;
	if (holdtime < myholdtime)
		peer->holdtime = holdtime;
	else
		peer->holdtime = myholdtime;

	memcpy(&bgpid, p, sizeof(bgpid));
	p += sizeof(bgpid);

	/* check bgpid for validity - just disallow 0 */
	if (ntohl(bgpid) == 0) {
		log_peer_warnx(&peer->conf, "peer BGPID %lu unacceptable",
		    ntohl(bgpid));
		session_notification(peer, ERR_OPEN, ERR_OPEN_BGPID,
		    NULL, 0);
		change_state(peer, STATE_IDLE, EVNT_RCVD_OPEN);
		return (-1);
	}
	peer->remote_bgpid = bgpid;

	memcpy(&optparamlen, p, sizeof(optparamlen));
	p += sizeof(optparamlen);

	if (optparamlen != msglen - MSGSIZE_OPEN_MIN) {
			log_peer_warnx(&peer->conf,
			    "corrupt OPEN message received: length mismatch");
			session_notification(peer, ERR_OPEN, 0, NULL, 0);
			change_state(peer, STATE_IDLE, EVNT_RCVD_OPEN);
			return (-1);
	}

	plen = optparamlen;
	while (plen > 0) {
		if (plen < 2) {
			log_peer_warnx(&peer->conf,
			    "corrupt OPEN message received, len wrong");
			session_notification(peer, ERR_OPEN, 0, NULL, 0);
			change_state(peer, STATE_IDLE, EVNT_RCVD_OPEN);
			return (-1);
		}
		memcpy(&op_type, p, sizeof(op_type));
		p += sizeof(op_type);
		plen -= sizeof(op_type);
		memcpy(&op_len, p, sizeof(op_len));
		p += sizeof(op_len);
		plen -= sizeof(op_len);
		if (op_len > 0) {
			if (plen < op_len) {
				log_peer_warnx(&peer->conf,
				    "corrupt OPEN message received, len wrong");
				session_notification(peer, ERR_OPEN, 0,
				    NULL, 0);
				change_state(peer, STATE_IDLE, EVNT_RCVD_OPEN);
				return (-1);
			}
			op_val = p;
			p += op_len;
			plen -= op_len;
		} else
			op_val = NULL;

		switch (op_type) {
		case OPT_PARAM_CAPABILITIES:		/* RFC 3392 */
			if (parse_capabilities(peer, op_val, op_len,
			    &as) == -1) {
				session_notification(peer, ERR_OPEN, 0,
				    NULL, 0);
				change_state(peer, STATE_IDLE, EVNT_RCVD_OPEN);
				return (-1);
			}
			break;
		case OPT_PARAM_AUTH:			/* deprecated */
		default:
			/*
			 * unsupported type
			 * the RFCs tell us to leave the data section empty
			 * and notify the peer with ERR_OPEN, ERR_OPEN_OPT.
			 * How the peer should know _which_ optional parameter
			 * we don't support is beyond me.
			 */
			log_peer_warnx(&peer->conf,
			    "received OPEN message with unsupported optional "
			    "parameter: type %u", op_type);
			session_notification(peer, ERR_OPEN, ERR_OPEN_OPT,
				NULL, 0);
			change_state(peer, STATE_IDLE, EVNT_RCVD_OPEN);
			timer_set(peer, Timer_IdleHold, 0);	/* no punish */
			peer->IdleHoldTime /= 2;
			return (-1);
		}
	}

	/* if remote-as is zero and it's a cloned neighbor, accept any */
	if (peer->conf.cloned && !peer->conf.remote_as && as != AS_TRANS) {
		peer->conf.remote_as = as;
		peer->conf.ebgp = (peer->conf.remote_as != conf->as);
		if (!peer->conf.ebgp)
			/* force enforce_as off for iBGP sessions */
			peer->conf.enforce_as = ENFORCE_AS_OFF;
	}

	if (peer->conf.remote_as != as) {
		log_peer_warnx(&peer->conf, "peer sent wrong AS %s",
		    log_as(as));
		session_notification(peer, ERR_OPEN, ERR_OPEN_AS, NULL, 0);
		change_state(peer, STATE_IDLE, EVNT_RCVD_OPEN);
		return (-1);
	}

	return (0);
}

int
parse_update(struct peer *peer)
{
	u_char		*p;
	u_int16_t	 datalen;

	/*
	 * we pass the message verbatim to the rde.
	 * in case of errors the whole session is reset with a
	 * notification anyway, we only need to know the peer
	 */
	p = peer->rbuf->rptr;
	p += MSGSIZE_HEADER_MARKER;
	memcpy(&datalen, p, sizeof(datalen));
	datalen = ntohs(datalen);

	p = peer->rbuf->rptr;
	p += MSGSIZE_HEADER;	/* header is already checked */
	datalen -= MSGSIZE_HEADER;

	if (imsg_compose(ibuf_rde, IMSG_UPDATE, peer->conf.id, 0, -1, p,
	    datalen) == -1)
		return (-1);

	return (0);
}

int
parse_refresh(struct peer *peer)
{
	u_char		*p;
	struct rrefresh	 r;

	p = peer->rbuf->rptr;
	p += MSGSIZE_HEADER;	/* header is already checked */

	/* afi, 2 byte */
	memcpy(&r.afi, p, sizeof(r.afi));
	r.afi = ntohs(r.afi);
	p += 2;
	/* reserved, 1 byte */
	p += 1;
	/* safi, 1 byte */
	memcpy(&r.safi, p, sizeof(r.safi));

	/* afi/safi unchecked -	unrecognized values will be ignored anyway */

	if (imsg_compose(ibuf_rde, IMSG_REFRESH, peer->conf.id, 0, -1, &r,
	    sizeof(r)) == -1)
		return (-1);

	return (0);
}

int
parse_notification(struct peer *peer)
{
	u_char		*p;
	u_int8_t	 errcode;
	u_int8_t	 subcode;
	u_int16_t	 datalen;
	u_int8_t	 capa_code;
	u_int8_t	 capa_len;

	/* just log */
	p = peer->rbuf->rptr;
	p += MSGSIZE_HEADER_MARKER;
	memcpy(&datalen, p, sizeof(datalen));
	datalen = ntohs(datalen);

	p = peer->rbuf->rptr;
	p += MSGSIZE_HEADER;	/* header is already checked */
	datalen -= MSGSIZE_HEADER;

	memcpy(&errcode, p, sizeof(errcode));
	p += sizeof(errcode);
	datalen -= sizeof(errcode);

	memcpy(&subcode, p, sizeof(subcode));
	p += sizeof(subcode);
	datalen -= sizeof(subcode);

	log_notification(peer, errcode, subcode, p, datalen);
	peer->errcnt++;

	if (errcode == ERR_OPEN && subcode == ERR_OPEN_CAPA) {
		if (datalen == 0) {	/* zebra likes to send those.. humbug */
			log_peer_warnx(&peer->conf, "received \"unsupported "
			    "capability\" notification without data part, "
			    "disabling capability announcements altogether");
			session_capa_ann_none(peer);
		}

		while (datalen > 0) {
			if (datalen < 2) {
				log_peer_warnx(&peer->conf,
				    "parse_notification: "
				    "expect len >= 2, len is %u", datalen);
				return (-1);
			}
			memcpy(&capa_code, p, sizeof(capa_code));
			p += sizeof(capa_code);
			datalen -= sizeof(capa_code);
			memcpy(&capa_len, p, sizeof(capa_len));
			p += sizeof(capa_len);
			datalen -= sizeof(capa_len);
			if (datalen < capa_len) {
				log_peer_warnx(&peer->conf,
				    "parse_notification: capa_len %u exceeds "
				    "remaining msg length %u", capa_len,
				    datalen);
				return (-1);
			}
			p += capa_len;
			datalen -= capa_len;
			switch (capa_code) {
			case CAPA_MP:
				peer->capa.ann.mp_v4 = SAFI_NONE;
				peer->capa.ann.mp_v6 = SAFI_NONE;
				log_peer_warnx(&peer->conf,
				    "disabling multiprotocol capability");
				break;
			case CAPA_REFRESH:
				peer->capa.ann.refresh = 0;
				log_peer_warnx(&peer->conf,
				    "disabling route refresh capability");
				break;
			case CAPA_RESTART:
				peer->capa.ann.restart = 0;
				log_peer_warnx(&peer->conf,
				    "disabling restart capability");
				break;
			case CAPA_AS4BYTE:
				peer->capa.ann.as4byte = 0;
				log_peer_warnx(&peer->conf,
				    "disabling 4-byte AS num capability");
				break;
			default:	/* should not happen... */
				log_peer_warnx(&peer->conf, "received "
				    "\"unsupported capability\" notification "
				    "for unknown capability %u, disabling "
				    "capability announcements altogether",
				    capa_code);
				session_capa_ann_none(peer);
				break;
			}
		}

		return (1);
	}

	if (errcode == ERR_OPEN && subcode == ERR_OPEN_OPT) {
		session_capa_ann_none(peer);
		return (1);
	}

	return (0);
}

int
parse_capabilities(struct peer *peer, u_char *d, u_int16_t dlen, u_int32_t *as)
{
	u_int16_t	 len;
	u_int8_t	 capa_code;
	u_int8_t	 capa_len;
	u_char		*capa_val;
	u_int16_t	 mp_afi;
	u_int8_t	 mp_safi;
	u_int32_t	 remote_as;

	len = dlen;
	while (len > 0) {
		if (len < 2) {
			log_peer_warnx(&peer->conf, "parse_capabilities: "
			    "expect len >= 2, len is %u", len);
			return (-1);
		}
		memcpy(&capa_code, d, sizeof(capa_code));
		d += sizeof(capa_code);
		len -= sizeof(capa_code);
		memcpy(&capa_len, d, sizeof(capa_len));
		d += sizeof(capa_len);
		len -= sizeof(capa_len);
		if (capa_len > 0) {
			if (len < capa_len) {
				log_peer_warnx(&peer->conf,
				    "parse_capabilities: "
				    "len %u smaller than capa_len %u",
				    len, capa_len);
				return (-1);
			}
			capa_val = d;
			d += capa_len;
			len -= capa_len;
		} else
			capa_val = NULL;

		switch (capa_code) {
		case CAPA_MP:			/* RFC 4760 */
			if (capa_len != 4) {
				log_peer_warnx(&peer->conf,
				    "parse_capabilities: "
				    "expect len 4, len is %u", capa_len);
				return (-1);
			}
			memcpy(&mp_afi, capa_val, sizeof(mp_afi));
			mp_afi = ntohs(mp_afi);
			memcpy(&mp_safi, capa_val + 3, sizeof(mp_safi));
			switch (mp_afi) {
			case AFI_IPv4:
				if (mp_safi < 1 || mp_safi > 3)
					log_peer_warnx(&peer->conf,
					    "parse_capabilities: AFI IPv4, "
					    "mp_safi %u unknown", mp_safi);
				else
					peer->capa.peer.mp_v4 = mp_safi;
				break;
			case AFI_IPv6:
				if (mp_safi < 1 || mp_safi > 3)
					log_peer_warnx(&peer->conf,
					    "parse_capabilities: AFI IPv6, "
					    "mp_safi %u unknown", mp_safi);
				else
					peer->capa.peer.mp_v6 = mp_safi;
				break;
			default:			/* ignore */
				break;
			}
			break;
		case CAPA_REFRESH:
			peer->capa.peer.refresh = 1;
			break;
		case CAPA_RESTART:
			peer->capa.peer.restart = 1;
			/* we don't care about the further restart capas yet */
			break;
		case CAPA_AS4BYTE:
			if (capa_len != 4) {
				log_peer_warnx(&peer->conf,
				    "parse_capabilities: "
				    "expect len 4, len is %u", capa_len);
				return (-1);
			}
			memcpy(&remote_as, capa_val, sizeof(remote_as));
			*as = ntohl(remote_as);
			peer->capa.peer.as4byte = 1;
			break;
		default:
			break;
		}
	}

	return (0);
}

void
session_dispatch_imsg(struct imsgbuf *ibuf, int idx, u_int *listener_cnt)
{
	struct imsg		 imsg;
	struct mrt		 xmrt;
	struct mrt		*mrt;
	struct peer_config	*pconf;
	struct peer		*p, *next;
	struct listen_addr	*la, *nla;
	struct kif		*kif;
	u_char			*data;
	enum reconf_action	 reconf;
	int			 n, depend_ok;
	u_int8_t		 errcode, subcode;

	if ((n = imsg_read(ibuf)) == -1)
		fatal("session_dispatch_imsg: imsg_read error");

	if (n == 0)	/* connection closed */
		fatalx("session_dispatch_imsg: pipe closed");

	for (;;) {
		if ((n = imsg_get(ibuf, &imsg)) == -1)
			fatal("session_dispatch_imsg: imsg_get error");

		if (n == 0)
			break;

		switch (imsg.hdr.type) {
		case IMSG_RECONF_CONF:
			if (idx != PFD_PIPE_MAIN)
				fatalx("reconf request not from parent");
			if ((nconf = malloc(sizeof(struct bgpd_config))) ==
			    NULL)
				fatal(NULL);
			memcpy(nconf, imsg.data, sizeof(struct bgpd_config));
			if ((nconf->listen_addrs = calloc(1,
			    sizeof(struct listen_addrs))) == NULL)
				fatal(NULL);
			TAILQ_INIT(nconf->listen_addrs);
			npeers = NULL;
			init_conf(nconf);
			pending_reconf = 1;
			break;
		case IMSG_RECONF_PEER:
			if (idx != PFD_PIPE_MAIN)
				fatalx("reconf request not from parent");
			pconf = imsg.data;
			p = getpeerbyaddr(&pconf->remote_addr);
			if (p == NULL) {
				if ((p = calloc(1, sizeof(struct peer))) ==
				    NULL)
					fatal("new_peer");
				p->state = p->prev_state = STATE_NONE;
				p->next = npeers;
				npeers = p;
				reconf = RECONF_REINIT;
			} else
				reconf = RECONF_KEEP;

			memcpy(&p->conf, pconf, sizeof(struct peer_config));
			p->conf.reconf_action = reconf;
			break;
		case IMSG_RECONF_LISTENER:
			if (idx != PFD_PIPE_MAIN)
				fatalx("reconf request not from parent");
			if (nconf == NULL)
				fatalx("IMSG_RECONF_LISTENER but no config");
			nla = imsg.data;
			TAILQ_FOREACH(la, conf->listen_addrs, entry)
				if (!la_cmp(la, nla))
					break;

			if (la == NULL) {
				if (nla->reconf != RECONF_REINIT)
					fatalx("king bula sez: "
					    "expected REINIT");

				if ((nla->fd = imsg.fd) == -1)
					log_warnx("expected to receive fd for "
					    "%s but didn't receive any",
					    log_sockaddr((struct sockaddr *)
					    &nla->sa));

				la = calloc(1, sizeof(struct listen_addr));
				if (la == NULL)
					fatal(NULL);
				memcpy(&la->sa, &nla->sa, sizeof(la->sa));
				la->flags = nla->flags;
				la->fd = nla->fd;
				la->reconf = RECONF_REINIT;
				TAILQ_INSERT_TAIL(nconf->listen_addrs, la,
				    entry);
			} else {
				if (nla->reconf != RECONF_KEEP)
					fatalx("king bula sez: expected KEEP");
				la->reconf = RECONF_KEEP;
			}

			break;
		case IMSG_RECONF_DONE:
			if (idx != PFD_PIPE_MAIN)
				fatalx("reconf request not from parent");
			if (nconf == NULL)
				fatalx("got IMSG_RECONF_DONE but no config");
			conf->as = nconf->as;
			conf->holdtime = nconf->holdtime;
			conf->bgpid = nconf->bgpid;
			conf->min_holdtime = nconf->min_holdtime;

			/* add new peers */
			for (p = npeers; p != NULL; p = next) {
				next = p->next;
				p->next = peers;
				peers = p;
			}
			/* find ones that need attention */
			for (p = peers; p != NULL; p = p->next) {
				/* needs to be deleted? */
				if (p->conf.reconf_action == RECONF_NONE &&
				    !p->conf.cloned)
					p->conf.reconf_action = RECONF_DELETE;
				/* had demotion, is demoted, demote removed? */
				if (p->demoted && !p->conf.demote_group[0])
						session_demote(p, -1);
			}

			/* delete old listeners */
			for (la = TAILQ_FIRST(conf->listen_addrs); la != NULL;
			    la = nla) {
				nla = TAILQ_NEXT(la, entry);
				if (la->reconf == RECONF_NONE) {
					log_info("not listening on %s any more",
					    log_sockaddr(
					    (struct sockaddr *)&la->sa));
					TAILQ_REMOVE(conf->listen_addrs, la,
					    entry);
					close(la->fd);
					free(la);
				}
			}

			/* add new listeners */
			while ((la = TAILQ_FIRST(nconf->listen_addrs)) !=
			    NULL) {
				TAILQ_REMOVE(nconf->listen_addrs, la, entry);
				TAILQ_INSERT_TAIL(conf->listen_addrs, la,
				    entry);
			}

			setup_listeners(listener_cnt);
			free(nconf->listen_addrs);
			free(nconf);
			nconf = NULL;
			pending_reconf = 0;
			log_info("SE reconfigured");
			break;
		case IMSG_IFINFO:
			if (idx != PFD_PIPE_MAIN)
				fatalx("IFINFO message not from parent");
			if (imsg.hdr.len != IMSG_HEADER_SIZE +
			    sizeof(struct kif))
				fatalx("IFINFO imsg with wrong len");
			kif = imsg.data;
			depend_ok = (kif->flags & IFF_UP) &&
			    (LINK_STATE_IS_UP(kif->link_state) ||
			    (kif->link_state == LINK_STATE_UNKNOWN &&
			    kif->media_type != IFT_CARP));

			for (p = peers; p != NULL; p = p->next)
				if (!strcmp(p->conf.if_depend, kif->ifname)) {
					if (depend_ok && !p->depend_ok) {
						p->depend_ok = depend_ok;
						bgp_fsm(p, EVNT_START);
					} else if (!depend_ok && p->depend_ok) {
						p->depend_ok = depend_ok;
						session_stop(p,
						    ERR_CEASE_OTHER_CHANGE);
					}
				}
			break;
		case IMSG_MRT_OPEN:
		case IMSG_MRT_REOPEN:
			if (imsg.hdr.len > IMSG_HEADER_SIZE +
			    sizeof(struct mrt)) {
				log_warnx("wrong imsg len");
				break;
			}

			memcpy(&xmrt, imsg.data, sizeof(struct mrt));
			if ((xmrt.wbuf.fd = imsg.fd) == -1)
				log_warnx("expected to receive fd for mrt dump "
				    "but didn't receive any");

			mrt = mrt_get(&mrthead, &xmrt);
			if (mrt == NULL) {
				/* new dump */
				mrt = calloc(1, sizeof(struct mrt));
				if (mrt == NULL)
					fatal("session_dispatch_imsg");
				memcpy(mrt, &xmrt, sizeof(struct mrt));
				TAILQ_INIT(&mrt->wbuf.bufs);
				LIST_INSERT_HEAD(&mrthead, mrt, entry);
			} else {
				/* old dump reopened */
				close(mrt->wbuf.fd);
				mrt->wbuf.fd = xmrt.wbuf.fd;
			}
			break;
		case IMSG_MRT_CLOSE:
			if (imsg.hdr.len > IMSG_HEADER_SIZE +
			    sizeof(struct mrt)) {
				log_warnx("wrong imsg len");
				break;
			}

			memcpy(&xmrt, imsg.data, sizeof(struct mrt));
			mrt = mrt_get(&mrthead, &xmrt);
			if (mrt != NULL) {
				mrt_clean(mrt);
				LIST_REMOVE(mrt, entry);
				free(mrt);
			}
			break;
		case IMSG_CTL_KROUTE:
		case IMSG_CTL_KROUTE6:
		case IMSG_CTL_KROUTE_ADDR:
		case IMSG_CTL_SHOW_NEXTHOP:
		case IMSG_CTL_SHOW_INTERFACE:
			if (idx != PFD_PIPE_MAIN)
				fatalx("ctl kroute request not from parent");
			control_imsg_relay(&imsg);
			break;
		case IMSG_CTL_SHOW_RIB:
		case IMSG_CTL_SHOW_RIB_PREFIX:
		case IMSG_CTL_SHOW_RIB_ATTR:
		case IMSG_CTL_SHOW_RIB_MEM:
		case IMSG_CTL_SHOW_NETWORK:
		case IMSG_CTL_SHOW_NETWORK6:
		case IMSG_CTL_SHOW_NEIGHBOR:
			if (idx != PFD_PIPE_ROUTE_CTL)
				fatalx("ctl rib request not from RDE");
			control_imsg_relay(&imsg);
			break;
		case IMSG_CTL_END:
		case IMSG_CTL_RESULT:
			control_imsg_relay(&imsg);
			break;
		case IMSG_UPDATE:
			if (idx != PFD_PIPE_ROUTE)
				fatalx("update request not from RDE");
			if (imsg.hdr.len > IMSG_HEADER_SIZE +
			    MAX_PKTSIZE - MSGSIZE_HEADER ||
			    imsg.hdr.len < IMSG_HEADER_SIZE +
			    MSGSIZE_UPDATE_MIN - MSGSIZE_HEADER)
				log_warnx("RDE sent invalid update");
			else
				session_update(imsg.hdr.peerid, imsg.data,
				    imsg.hdr.len - IMSG_HEADER_SIZE);
			break;
		case IMSG_UPDATE_ERR:
			if (idx != PFD_PIPE_ROUTE)
				fatalx("update request not from RDE");
			if (imsg.hdr.len < IMSG_HEADER_SIZE + 2) {
				log_warnx("RDE sent invalid notification");
				break;
			}
			if ((p = getpeerbyid(imsg.hdr.peerid)) == NULL) {
				log_warnx("no such peer: id=%u",
				    imsg.hdr.peerid);
				break;
			}
			data = imsg.data;
			errcode = *data++;
			subcode = *data++;

			if (imsg.hdr.len == IMSG_HEADER_SIZE + 2)
				data = NULL;

			session_notification(p, errcode, subcode,
			    data, imsg.hdr.len - IMSG_HEADER_SIZE - 2);
			switch (errcode) {
			case ERR_CEASE:
				switch (subcode) {
				case ERR_CEASE_MAX_PREFIX:
					bgp_fsm(p, EVNT_STOP);
					if (p->conf.max_prefix_restart)
						timer_set(p, Timer_IdleHold, 60 *
						    p->conf.max_prefix_restart);
					break;
				default:
					bgp_fsm(p, EVNT_CON_FATAL);
					break;
				}
				break;
			default:
				bgp_fsm(p, EVNT_CON_FATAL);
				break;
			}
			break;
		default:
			break;
		}
		imsg_free(&imsg);
	}
}

int
la_cmp(struct listen_addr *a, struct listen_addr *b)
{
	struct sockaddr_in	*in_a, *in_b;
	struct sockaddr_in6	*in6_a, *in6_b;

	if (a->sa.ss_family != b->sa.ss_family)
		return (1);

	switch (a->sa.ss_family) {
	case AF_INET:
		in_a = (struct sockaddr_in *)&a->sa;
		in_b = (struct sockaddr_in *)&b->sa;
		if (in_a->sin_addr.s_addr != in_b->sin_addr.s_addr)
			return (1);
		if (in_a->sin_port != in_b->sin_port)
			return (1);
		break;
	case AF_INET6:
		in6_a = (struct sockaddr_in6 *)&a->sa;
		in6_b = (struct sockaddr_in6 *)&b->sa;
		if (bcmp(&in6_a->sin6_addr, &in6_b->sin6_addr,
		    sizeof(struct in6_addr)))
			return (1);
		if (in6_a->sin6_port != in6_b->sin6_port)
			return (1);
		break;
	default:
		fatal("king bula sez: unknown address family");
		/* NOTREACHED */
	}

	return (0);
}

struct peer *
getpeerbyaddr(struct bgpd_addr *addr)
{
	struct peer *p;

	/* we might want a more effective way to find peers by IP */
	for (p = peers; p != NULL &&
	    memcmp(&p->conf.remote_addr, addr, sizeof(p->conf.remote_addr));
	    p = p->next)
		;	/* nothing */

	return (p);
}

struct peer *
getpeerbydesc(const char *descr)
{
	struct peer	*p, *res = NULL;
	int		 match = 0;

	for (p = peers; p != NULL; p = p->next)
		if (!strcmp(p->conf.descr, descr)) {
			res = p;
			match++;
		}

	if (match > 1)
		log_info("neighbor description \"%s\" not unique, request "
		    "aborted", descr);

	if (match == 1)
		return (res);
	else
		return (NULL);
}

struct peer *
getpeerbyip(struct sockaddr *ip)
{
	struct peer	*p, *newpeer, *loose = NULL;
	u_int32_t	 id;

	/* we might want a more effective way to find peers by IP */
	for (p = peers; p != NULL; p = p->next)
		if (!p->conf.template &&
		    p->conf.remote_addr.af == ip->sa_family) {
			if (p->conf.remote_addr.af == AF_INET &&
			    p->conf.remote_addr.v4.s_addr ==
			    ((struct sockaddr_in *)ip)->sin_addr.s_addr)
				return (p);
			if (p->conf.remote_addr.af == AF_INET6 &&
			    !bcmp(&p->conf.remote_addr.v6,
			    &((struct sockaddr_in6 *)ip)->sin6_addr,
			    sizeof(p->conf.remote_addr.v6)))
				return (p);
		}

	/* try template matching */
	for (p = peers; p != NULL; p = p->next)
		if (p->conf.template &&
		    p->conf.remote_addr.af == ip->sa_family &&
		    session_match_mask(p, ip))
			if (loose == NULL || loose->conf.remote_masklen <
			    p->conf.remote_masklen)
				loose = p;

	if (loose != NULL) {
		/* clone */
		if ((newpeer = malloc(sizeof(struct peer))) == NULL)
			fatal(NULL);
		memcpy(newpeer, loose, sizeof(struct peer));
		for (id = UINT_MAX; id > UINT_MAX / 2; id--) {
			for (p = peers; p != NULL && p->conf.id != id;
			    p = p->next)
				;	/* nothing */
			if (p == NULL) {	/* we found a free id */
				newpeer->conf.id = id;
				break;
			}
		}
		if (newpeer->conf.remote_addr.af == AF_INET) {
			newpeer->conf.remote_addr.v4.s_addr =
			    ((struct sockaddr_in *)ip)->sin_addr.s_addr;
			newpeer->conf.remote_masklen = 32;
		}
		if (newpeer->conf.remote_addr.af == AF_INET6) {
			memcpy(&newpeer->conf.remote_addr.v6,
			    &((struct sockaddr_in6 *)ip)->sin6_addr,
			    sizeof(newpeer->conf.remote_addr.v6));
			newpeer->conf.remote_masklen = 128;
		}
		newpeer->conf.template = 0;
		newpeer->conf.cloned = 1;
		newpeer->state = newpeer->prev_state = STATE_NONE;
		newpeer->conf.reconf_action = RECONF_KEEP;
		newpeer->rbuf = NULL;
		init_peer(newpeer);
		bgp_fsm(newpeer, EVNT_START);
		newpeer->next = peers;
		peers = newpeer;
		return (newpeer);
	}

	return (NULL);
}

int
session_match_mask(struct peer *p, struct sockaddr *ip)
{
	int		 i;
	in_addr_t	 v4mask;
	struct in6_addr	*in;
	struct in6_addr	 mask;

	if (p->conf.remote_addr.af == AF_INET) {
		v4mask = htonl(prefixlen2mask(p->conf.remote_masklen));
		if (p->conf.remote_addr.v4.s_addr ==
		    ((((struct sockaddr_in *)ip)->sin_addr.s_addr) & v4mask))
			return (1);
		else
			return (0);
	}

	if (p->conf.remote_addr.af == AF_INET6) {
		bzero(&mask, sizeof(mask));
		for (i = 0; i < p->conf.remote_masklen / 8; i++)
			mask.s6_addr[i] = 0xff;
		i = p->conf.remote_masklen % 8;
		if (i)
			mask.s6_addr[p->conf.remote_masklen / 8] = 0xff00 >> i;

		in = &((struct sockaddr_in6 *)ip)->sin6_addr;

		for (i = 0; i < 16; i++)
			if ((in->s6_addr[i] & mask.s6_addr[i]) !=
			    p->conf.remote_addr.addr8[i])
				return (0);

		return (1);
	}

	return (0);
}

struct peer *
getpeerbyid(u_int32_t peerid)
{
	struct peer *p;

	/* we might want a more effective way to find peers by IP */
	for (p = peers; p != NULL &&
	    p->conf.id != peerid; p = p->next)
		;	/* nothing */

	return (p);
}

void
session_down(struct peer *peer)
{
	peer->stats.last_updown = time(NULL);
	if (imsg_compose(ibuf_rde, IMSG_SESSION_DOWN, peer->conf.id, 0, -1,
	    NULL, 0) == -1)
		fatalx("imsg_compose error");
}

void
session_up(struct peer *p)
{
	struct session_up	 sup;

	if (imsg_compose(ibuf_rde, IMSG_SESSION_ADD, p->conf.id, 0, -1,
	    &p->conf, sizeof(p->conf)) == -1)
		fatalx("imsg_compose error");

	switch (p->sa_local.ss_family) {
	case AF_INET:
		sup.local_addr.af = AF_INET;
		memcpy(&sup.local_addr.v4,
		    &((struct sockaddr_in *)&p->sa_local)->sin_addr,
		    sizeof(sup.local_addr.v4));
		sup.remote_addr.af = AF_INET;
		memcpy(&sup.remote_addr.v4,
		    &((struct sockaddr_in *)&p->sa_remote)->sin_addr,
		    sizeof(sup.remote_addr.v4));
		break;
	case AF_INET6:
		sup.local_addr.af = AF_INET6;
		memcpy(&sup.local_addr.v6,
		    &((struct sockaddr_in6 *)&p->sa_local)->sin6_addr,
		    sizeof(sup.local_addr.v6));
		sup.remote_addr.af = AF_INET6;
		memcpy(&sup.remote_addr.v6,
		    &((struct sockaddr_in6 *)&p->sa_remote)->sin6_addr,
		    sizeof(sup.remote_addr.v6));
		break;
	default:
		fatalx("session_up: unsupported address family");
	}

	sup.remote_bgpid = p->remote_bgpid;
	sup.short_as = p->short_as;
	memcpy(&sup.capa_announced, &p->capa.ann, sizeof(sup.capa_announced));
	memcpy(&sup.capa_received, &p->capa.peer, sizeof(sup.capa_received));
	p->stats.last_updown = time(NULL);
	if (imsg_compose(ibuf_rde, IMSG_SESSION_UP, p->conf.id, 0, -1,
	    &sup, sizeof(sup)) == -1)
		fatalx("imsg_compose error");
}

int
imsg_compose_parent(int type, pid_t pid, void *data, u_int16_t datalen)
{
	return (imsg_compose(ibuf_main, type, 0, pid, -1, data, datalen));
}

int
imsg_compose_rde(int type, pid_t pid, void *data, u_int16_t datalen)
{
	return (imsg_compose(ibuf_rde, type, 0, pid, -1, data, datalen));
}

static struct sockaddr *
addr2sa(struct bgpd_addr *addr, u_int16_t port)
{
	static struct sockaddr_storage	 ss;
	struct sockaddr_in		*sa_in = (struct sockaddr_in *)&ss;
	struct sockaddr_in6		*sa_in6 = (struct sockaddr_in6 *)&ss;

	bzero(&ss, sizeof(ss));
	switch (addr->af) {
	case AF_INET:
		sa_in->sin_family = AF_INET;
		sa_in->sin_len = sizeof(struct sockaddr_in);
		sa_in->sin_addr.s_addr = addr->v4.s_addr;
		sa_in->sin_port = htons(port);
		break;
	case AF_INET6:
		sa_in6->sin6_family = AF_INET6;
		sa_in6->sin6_len = sizeof(struct sockaddr_in6);
		memcpy(&sa_in6->sin6_addr, &addr->v6,
		    sizeof(sa_in6->sin6_addr));
		sa_in6->sin6_port = htons(port);
		sa_in6->sin6_scope_id = addr->scope_id;
		break;
	}

	return ((struct sockaddr *)&ss);
}

void
session_demote(struct peer *p, int level)
{
	struct demote_msg	msg;

	strlcpy(msg.demote_group, p->conf.demote_group,
	    sizeof(msg.demote_group));
	msg.level = level;
	if (imsg_compose(ibuf_main, IMSG_DEMOTE, p->conf.id, 0, -1,
	    &msg, sizeof(msg)) == -1)
		fatalx("imsg_compose error");

	p->demoted += level;
}

void
session_stop(struct peer *peer, u_int8_t subcode)
{
	switch (peer->state) {
	case STATE_OPENSENT:
	case STATE_OPENCONFIRM:
	case STATE_ESTABLISHED:
		session_notification(peer, ERR_CEASE, subcode, NULL, 0);
		break;
	default:
		/* session not open, no need to send notification */
		break;
	}
	bgp_fsm(peer, EVNT_STOP);
}