/*	$OpenBSD: vioqcow2.c,v 1.22 2023/04/28 18:52:22 dv Exp $	*/

/*
 * Copyright (c) 2018 Ori Bernstein <ori@eigenstate.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/types.h>
#include <sys/stat.h>

#include <dev/pci/pcireg.h>

#include <assert.h>
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <libgen.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "virtio.h"

/*
 * Flag bits in the top of the 64-bit L1/L2 table entries.
 * xlate() refuses L2 entries with QCOW2_COMPRESSED set.  QCOW2_INPLACE is
 * set by mkcluster() on every table and data cluster it allocates, and is
 * what xlate() reports through its "inplace" out-parameter.
 */
#define QCOW2_COMPRESSED	0x4000000000000000ull
#define QCOW2_INPLACE		0x8000000000000000ull

/*
 * Bits of the header's incompatfeatures field.  These are the only two
 * bits qc2_open() tolerates; any other bit set there is fatal.
 */
#define QCOW2_DIRTY		(1 << 0)
#define QCOW2_CORRUPT		(1 << 1)

/*
 * Incompatible-feature bits, with the same values as QCOW2_DIRTY and
 * QCOW2_CORRUPT.  Nothing in the code visible here references these names.
 */
enum {
	ICFEATURE_DIRTY		= 1 << 0,
	ICFEATURE_CORRUPT	= 1 << 1,
};

/*
 * Autoclear-feature bit.  NOTE(review): presumably the qcow2 "bitmaps
 * extension" bit; it is not referenced in the code visible here.
 */
enum {
	ACFEATURE_BITEXT	= 1 << 0,
};

/*
 * On-disk qcow2 header, read from and written to offset 0 of the image as
 * a single packed blob.  Multi-byte fields are big-endian on disk: readers
 * convert with be32toh()/be64toh(), virtio_qcow2_create() with htobe32()/
 * htobe64().
 */
struct qcheader {
	char magic[4];		/* compared against VM_MAGIC_QCOW */
	uint32_t version;	/* qc2_open() accepts 2 and 3 */
	uint64_t backingoff;	/* file offset of the base image path */
	uint32_t backingsz;	/* length of that path; 0 means no base */
	uint32_t clustershift;	/* cluster size is 1 << clustershift */
	uint64_t disksz;	/* virtual disk size in bytes */
	uint32_t cryptmethod;
	uint32_t l1sz;		/* number of 8-byte L1 table entries */
	uint64_t l1off;		/* file offset of the L1 table */
	uint64_t refoff;	/* file offset of the refcount table */
	uint32_t refsz;		/* refcount table size, in clusters */
	uint32_t snapcount;
	uint64_t snapsz;	/* NOTE(review): stored as qcdisk.snapoff */
	/* v3 additions */
	uint64_t incompatfeatures;
	uint64_t compatfeatures;
	uint64_t autoclearfeatures;
	uint32_t reforder;	/* Bits = 1 << reforder */
	uint32_t headersz;
} __packed;

/*
 * In-memory state for one open qcow2 image.  Images with a backing file
 * form a singly linked chain through 'base'; qc2_close() frees the chain.
 * All header-derived values are stored in host byte order.
 */
struct qcdisk {
	/* Read-locked by xlate(), write-locked by mkcluster(). */
	pthread_rwlock_t lock;
	struct qcdisk *base;	/* backing image, or NULL */
	/*
	 * NOTE(review): qc2_open() parses a local header copy and does not
	 * store it here.
	 */
	struct qcheader header;

	int       fd;
	uint64_t *l1;		/* L1 table, l1sz entries, host byte order */
	off_t     end;		/* file offset where the next cluster goes */
	off_t	  clustersz;
	off_t	  disksz; /* In bytes */
	uint32_t  cryptmethod;

	uint32_t l1sz;		/* number of L1 entries */
	off_t	 l1off;		/* file offset of the on-disk L1 table */

	off_t	 refoff;	/* file offset of the refcount table */
	off_t	 refsz;

	uint32_t nsnap;
	off_t	 snapoff;

	/* v3 features */
	uint64_t incompatfeatures;
	uint64_t autoclearfeatures;
	uint32_t refssz;
	uint32_t headersz;
};

extern char *__progname;

/*
 * File-local helpers.  qc2_pread, qc2_pwrite and qc2_close are installed
 * as the struct virtio_backing callbacks by virtio_qcow2_init().
 */
static off_t xlate(struct qcdisk *, off_t, int *);
static void copy_cluster(struct qcdisk *, struct qcdisk *, off_t, off_t);
static void inc_refs(struct qcdisk *, off_t, int);
static off_t mkcluster(struct qcdisk *, struct qcdisk *, off_t, off_t);
static int qc2_open(struct qcdisk *, int *, size_t);
static ssize_t qc2_pread(void *, char *, size_t, off_t);
static ssize_t qc2_pwrite(void *, char *, size_t, off_t);
static void qc2_close(void *, int);

/*
 * Initializes a qcow2 disk image backing file from an array of fds.
 * fd[0] is the image itself; the remaining nfd - 1 entries are handed to
 * qc2_open() for the chain of base images.
 *
 * On success fills in the read/write/close callbacks of *file and stores
 * the virtual disk size, in bytes, in *szp.
 *
 * Returns -1 for error, 0 for success.
 */
int
virtio_qcow2_init(struct virtio_backing *file, off_t *szp, int *fd, size_t nfd)
{
	struct qcdisk *diskp;

	/* qc2_open() reads fd[0] unconditionally. */
	if (fd == NULL || nfd == 0)
		return -1;

	diskp = malloc(sizeof(*diskp));
	if (diskp == NULL)
		return -1;
	if (qc2_open(diskp, fd, nfd) == -1) {
		log_warnx("could not open qcow2 disk");
		/* Do not leak the descriptor on the failure path. */
		free(diskp);
		return -1;
	}
	file->p = diskp;
	file->pread = qc2_pread;
	file->pwrite = qc2_pwrite;
	file->close = qc2_close;
	/* The contract is bytes, not 512-byte sectors. */
	*szp = diskp->disksz;
	return 0;
}

/*
 * Return the path to the base image given a disk image.
 * Called from vmctl.
 *
 * fd    : open descriptor of the qcow2 image
 * path  : output buffer receiving the resolved base image path
 * npath : size of the output buffer; a relative base path additionally
 *         requires npath >= PATH_MAX because realpath() writes into it
 * dpath : path of the image itself, used to resolve a relative base path
 *
 * Returns the length of the resolved path, 0 if the image has no base,
 * or -1 on error.
 */
ssize_t
virtio_qcow2_get_base(int fd, char *path, size_t npath, const char *dpath)
{
	char dpathbuf[PATH_MAX];
	char expanded[PATH_MAX];
	struct qcheader header;
	uint64_t backingoff;
	uint32_t backingsz;
	char *s = NULL;

	if (pread(fd, &header, sizeof(header), 0) != sizeof(header)) {
		log_warnx("short read on header");
		return -1;
	}
	if (strncmp(header.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW)) != 0) {
		log_warnx("invalid magic numbers");
		return -1;
	}
	backingoff = be64toh(header.backingoff);
	backingsz = be32toh(header.backingsz);
	if (backingsz == 0)
		return 0;

	/*
	 * Test npath == 0 first: npath - 1 would otherwise wrap around
	 * and let any length through.
	 */
	if (npath == 0 || backingsz >= npath - 1) {
		log_warnx("snapshot path too long");
		return -1;
	}
	if (pread(fd, path, backingsz, backingoff) != (ssize_t)backingsz) {
		log_warnx("could not read snapshot base name");
		return -1;
	}
	path[backingsz] = '\0';

	/*
	 * Relative paths should be interpreted relative to the disk image,
	 * rather than relative to the directory vmd happens to be running in,
	 * since this is the only useful interpretation.
	 */
	if (path[0] == '/') {
		if (realpath(path, expanded) == NULL ||
		    strlcpy(path, expanded, npath) >= npath) {
			log_warnx("unable to resolve %s", path);
			return -1;
		}
	} else {
		if (strlcpy(dpathbuf, dpath, sizeof(dpathbuf)) >=
		    sizeof(dpathbuf)) {
			log_warnx("path too long: %s", dpath);
			return -1;
		}
		s = dirname(dpathbuf);
		if (snprintf(expanded, sizeof(expanded),
		    "%s/%s", s, path) >= (int)sizeof(expanded)) {
			log_warnx("path too long: %s/%s", s, path);
			return -1;
		}
		/*
		 * realpath() may scribble on its output buffer even when it
		 * fails, so report the input we asked it to resolve.
		 */
		if (npath < PATH_MAX ||
		    realpath(expanded, path) == NULL) {
			log_warnx("unable to resolve %s", expanded);
			return -1;
		}
	}

	return strlen(path);
}

/*
 * Parse the qcow2 header behind fds[0] into *disk, load its L1 table and,
 * when the header names a backing file, recursively open the base image
 * from the remaining descriptors (fds + 1, nfd - 1).
 *
 * Every failure in here is fatal to the process; the function returns 0.
 */
static int
qc2_open(struct qcdisk *disk, int *fds, size_t nfd)
{
	char basepath[PATH_MAX];
	struct stat st;
	struct qcheader header;
	uint64_t backingoff;
	uint32_t backingsz, clustershift;
	size_t l1bytes;
	off_t i;
	int version, fd;

	pthread_rwlock_init(&disk->lock, NULL);
	fd = fds[0];
	disk->fd = fd;
	disk->base = NULL;
	disk->l1 = NULL;

	if (pread(fd, &header, sizeof(header), 0) != sizeof(header))
		fatalx("short read on header");
	if (strncmp(header.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW)) != 0)
		fatalx("invalid magic numbers");

	/*
	 * The version decides how much of the header is meaningful, so
	 * validate it before touching any version 3 field.
	 */
	version = be32toh(header.version);
	if (version != 2 && version != 3)
		fatalx("%s: unknown qcow2 version %d", __func__, version);

	/*
	 * A version 2 header ends after 72 bytes; whatever follows on disk
	 * is unrelated data.  Substitute the values the format defines for
	 * version 2: no feature bits, 16-bit refcounts, 72 byte header.
	 */
	if (version == 2) {
		header.incompatfeatures = 0;
		header.compatfeatures = 0;
		header.autoclearfeatures = 0;
		header.reforder = htobe32(4);
		header.headersz = htobe32(72);
	}

	/*
	 * The format requires at least 512 byte clusters, and 2 MB is the
	 * largest size QEMU will create or open.  Rejecting anything else
	 * also keeps the shift below well defined.
	 */
	clustershift = be32toh(header.clustershift);
	if (clustershift < 9 || clustershift > 21)
		fatalx("%s: unsupported cluster shift %u", __func__,
		    clustershift);

	disk->clustersz		= (1ull << clustershift);
	disk->disksz		= be64toh(header.disksz);
	disk->cryptmethod	= be32toh(header.cryptmethod);
	disk->l1sz		= be32toh(header.l1sz);
	disk->l1off		= be64toh(header.l1off);
	disk->refsz		= be32toh(header.refsz);
	disk->refoff		= be64toh(header.refoff);
	disk->nsnap		= be32toh(header.snapcount);
	disk->snapoff		= be64toh(header.snapsz);

	/*
	 * The additional features here are defined as 0 in the v2 format,
	 * and were cleared above for v2 images, so no further version
	 * checks are needed here.
	 */
	disk->incompatfeatures = be64toh(header.incompatfeatures);
	disk->autoclearfeatures = be64toh(header.autoclearfeatures);
	/* NOTE(review): refssz mirrors refsz; confirm that is intended. */
	disk->refssz = be32toh(header.refsz);
	disk->headersz = be32toh(header.headersz);

	/*
	 * We only know about the dirty or corrupt bits here.
	 */
	if (disk->incompatfeatures & ~(QCOW2_DIRTY|QCOW2_CORRUPT))
		fatalx("unsupported features %llx",
		    disk->incompatfeatures & ~(QCOW2_DIRTY|QCOW2_CORRUPT));
	if (be32toh(header.reforder) != 4)
		fatalx("unsupported refcount size\n");

	disk->l1 = calloc(disk->l1sz, sizeof(*disk->l1));
	if (!disk->l1)
		fatal("%s: could not allocate l1 table", __func__);
	/* Size the read in size_t so a large l1sz cannot wrap in 32 bits. */
	l1bytes = (size_t)disk->l1sz * sizeof(*disk->l1);
	if (pread(disk->fd, disk->l1, l1bytes, disk->l1off)
	    != (ssize_t)l1bytes)
		fatalx("%s: unable to read qcow2 L1 table", __func__);
	/* Keep the in-memory L1 table in host byte order. */
	for (i = 0; i < disk->l1sz; i++)
		disk->l1[i] = be64toh(disk->l1[i]);

	backingoff = be64toh(header.backingoff);
	backingsz = be32toh(header.backingsz);
	if (backingsz != 0) {
		if (backingsz >= sizeof(basepath) - 1) {
			fatalx("%s: snapshot path too long", __func__);
		}
		if (pread(fd, basepath, backingsz, backingoff) !=
		    (ssize_t)backingsz) {
			fatalx("%s: could not read snapshot base name",
			    __func__);
		}
		basepath[backingsz] = 0;
		/* The base must have been opened for us by the caller. */
		if (nfd <= 1) {
			fatalx("%s: missing base image %s", __func__,
			    basepath);
		}

		disk->base = calloc(1, sizeof(struct qcdisk));
		if (!disk->base)
			fatal("%s: could not open %s", __func__, basepath);
		if (qc2_open(disk->base, fds + 1, nfd - 1) == -1)
			fatalx("%s: could not open %s", __func__, basepath);
		if (disk->base->clustersz != disk->clustersz)
			fatalx("%s: all disk parts must share clustersize",
			    __func__);
	}
	if (fstat(fd, &st) == -1)
		fatal("%s: unable to stat disk", __func__);

	/* New clusters are appended at the current end of the file. */
	disk->end = st.st_size;

	log_debug("%s: qcow2 disk version %d size %lld end %lld snap %d",
	    __func__, version, disk->disksz, disk->end, disk->nsnap);

	return 0;
}

/*
 * Read len bytes starting at virtual disk offset off into buf.
 *
 * Each cluster is looked up in the image and then in its chain of base
 * images; clusters that no image maps read back as zeros.
 *
 * Returns len on success, -1 if the range lies outside the disk or a
 * read from the backing file fails.
 */
static ssize_t
qc2_pread(void *p, char *buf, size_t len, off_t off)
{
	struct qcdisk *disk, *d;
	off_t phys_off, end, cluster_off;
	ssize_t sz, rem;

	disk = p;
	/*
	 * Check the range without forming off + len first: that sum can
	 * wrap for a huge len and slip past a plain "end > disksz" test.
	 */
	if (off < 0 || off > disk->disksz ||
	    (uint64_t)len > (uint64_t)(disk->disksz - off))
		return -1;
	end = off + len;

	rem = len;
	phys_off = 0;
	while (off != end) {
		/* The first image in the chain that maps this offset wins. */
		for (d = disk; d; d = d->base)
			if ((phys_off = xlate(d, off, NULL)) > 0)
				break;
		/* Break out into chunks. This handles
		 * three cases:
		 *
		 *    |----+====|========|====+-----|
		 *
		 * Either we are at the start of the read,
		 * and the cluster has some leading bytes.
		 * This means that we are reading the tail
		 * of the cluster, and our size is:
		 *
		 * 	clustersz - (off % clustersz).
		 *
		 * Otherwise, we're reading the middle section.
		 * We're already aligned here, so we can just
		 * read the whole cluster size. Or we're at the
		 * tail, at which point we just want to read the
		 * remaining bytes.
		 */
		cluster_off = off % disk->clustersz;
		sz = disk->clustersz - cluster_off;
		if (sz > rem)
			sz = rem;
		/*
		 * If we're within the disk, but don't have backing bytes,
		 * just read back zeros.
		 */
		if (!d)
			bzero(buf, sz);
		else if (pread(d->fd, buf, sz, phys_off) != sz)
			return -1;
		off += sz;
		buf += sz;
		rem -= sz;
	}
	return len;
}

/*
 * Write len bytes from buf at virtual disk offset off.
 *
 * Only the top image is ever written.  A cluster that is missing from
 * the top image, or present but not flagged as writable in place, is
 * first given a fresh cluster by mkcluster(), which also copies the old
 * contents from whichever image currently holds them.
 *
 * Returns len on success, -1 on a range or I/O error.
 */
static ssize_t
qc2_pwrite(void *p, char *buf, size_t len, off_t off)
{
	struct qcdisk *disk, *d;
	off_t phys_off, cluster_off, end;
	ssize_t sz, rem;
	int inplace;

	disk = p;
	inplace = 1;
	/*
	 * Check the range without forming off + len first: that sum can
	 * wrap for a huge len and slip past a plain "end > disksz" test.
	 */
	if (off < 0 || off > disk->disksz ||
	    (uint64_t)len > (uint64_t)(disk->disksz - off))
		return -1;
	end = off + len;
	rem = len;
	while (off != end) {
		/* See the read code for a summary of the computation */
		cluster_off = off % disk->clustersz;
		sz = disk->clustersz - cluster_off;
		if (sz > rem)
			sz = rem;

		/*
		 * Start every cluster with the top image as the copy
		 * source.  A previous iteration may have left 'd' pointing
		 * at a base image, or at NULL, and mkcluster() reads the
		 * old cluster contents through it.
		 */
		d = disk;
		phys_off = xlate(disk, off, &inplace);
		if (phys_off == -1)
			return -1;
		/*
		 * If we couldn't find the cluster in the writable disk,
		 * see if it exists in the base image. If it does, we
		 * need to copy it before the write. The copy happens
		 * in the '!inplace' if clause below the search.
		 */
		if (phys_off == 0) {
			for (d = disk->base; d; d = d->base)
				if ((phys_off = xlate(d, off, NULL)) > 0)
					break;
			/* Nothing to copy from: allocate an empty cluster. */
			if (d == NULL)
				phys_off = 0;
		}
		if (!inplace || phys_off == 0)
			phys_off = mkcluster(disk, d, off, phys_off);
		if (phys_off == -1)
			return -1;
		/* The first cluster holds the header and is never data. */
		if (phys_off < disk->clustersz)
			fatalx("%s: writing reserved cluster", __func__);
		if (pwrite(disk->fd, buf, sz, phys_off) != sz)
			return -1;
		off += sz;
		buf += sz;
		rem -= sz;
	}
	return len;
}

/*
 * Release a qcow2 disk and, recursively, its chain of base images.
 * The file descriptors are closed unless stayopen is non-zero; the
 * in-memory state is freed either way.
 */
static void
qc2_close(void *p, int stayopen)
{
	struct qcdisk *disk;

	disk = p;
	if (disk->base)
		qc2_close(disk->base, stayopen);
	if (!stayopen)
		close(disk->fd);
	/* Pairs with the pthread_rwlock_init() in qc2_open(). */
	pthread_rwlock_destroy(&disk->lock);
	free(disk->l1);
	free(disk);
}

/*
 * Translates a virtual offset into an on-disk offset.
 *
 * If inplace is non-NULL it is set to 1 when the L2 entry carries
 * QCOW2_INPLACE and to 0 otherwise, including on every miss and error.
 *
 * Returns:
 * 	-1 on error (negative or out of range offset, failed L2 read)
 * 	 0 on 'not found'
 * 	>0 on found
 */
static off_t
xlate(struct qcdisk *disk, off_t off, int *inplace)
{
	off_t l2sz, l1off, l2tab, l2off, cluster, clusteroff;
	uint64_t buf;

	/*
	 * Clear out inplace flag -- xlate misses should not
	 * be flagged as updatable in place. We will still
	 * return 0 from them, but this leaves less surprises
	 * in the API.
	 */
	if (inplace)
		*inplace = 0;
	pthread_rwlock_rdlock(&disk->lock);
	if (off < 0)
		goto err;

	/* An L2 table fills one cluster with 8-byte entries. */
	l2sz = disk->clustersz / 8;
	l1off = (off / disk->clustersz) / l2sz;
	if (l1off >= disk->l1sz)
		goto err;

	l2tab = disk->l1[l1off];
	l2tab &= ~QCOW2_INPLACE;
	if (l2tab == 0) {
		pthread_rwlock_unlock(&disk->lock);
		return 0;
	}
	l2off = (off / disk->clustersz) % l2sz;
	/*
	 * A failed or short read would leave buf holding stack garbage
	 * that we would then hand out as a disk offset.
	 */
	if (pread(disk->fd, &buf, sizeof(buf), l2tab + l2off * 8) !=
	    (ssize_t)sizeof(buf))
		goto err;
	cluster = be64toh(buf);
	/*
	 * cluster may be 0, but all future operations don't affect
	 * the return value.
	 */
	if (inplace)
		*inplace = !!(cluster & QCOW2_INPLACE);
	if (cluster & QCOW2_COMPRESSED)
		fatalx("%s: compressed clusters unsupported", __func__);
	pthread_rwlock_unlock(&disk->lock);
	clusteroff = 0;
	cluster &= ~QCOW2_INPLACE;
	if (cluster)
		clusteroff = off % disk->clustersz;
	return cluster + clusteroff;
err:
	pthread_rwlock_unlock(&disk->lock);
	return -1;
}

/*
 * Allocates a new cluster at the end of the image for the virtual offset
 * 'off', creating or cloning the L2 table first if the L1 entry is empty
 * or lacks QCOW2_INPLACE. The new cluster gets a refcount of one and its
 * L2 entry is written with QCOW2_INPLACE set.
 *
 * If src_phys is greater than zero, the cluster containing src_phys is
 * copied from 'base' into the new cluster before it is mapped.
 *
 * Returns the physical address, within the new cluster, that corresponds
 * to 'off'. Errors are not returned: every failure in here is fatal.
 */
static off_t
mkcluster(struct qcdisk *disk, struct qcdisk *base, off_t off, off_t src_phys)
{
	off_t l2sz, l1off, l2tab, l2off, cluster, clusteroff, orig;
	uint64_t buf;

	pthread_rwlock_wrlock(&disk->lock);

	cluster = -1;
	/* L1 entries always exist */
	l2sz = disk->clustersz / 8;
	l1off = off / (disk->clustersz * l2sz);
	if (l1off >= disk->l1sz)
		fatalx("l1 offset outside disk");

	/* Round the allocation point up to a cluster boundary. */
	disk->end = (disk->end + disk->clustersz - 1) & ~(disk->clustersz - 1);

	l2tab = disk->l1[l1off];
	l2off = (off / disk->clustersz) % l2sz;
	/* We may need to create or clone an L2 entry to map the block */
	if (l2tab == 0 || (l2tab & QCOW2_INPLACE) == 0) {
		orig = l2tab & ~QCOW2_INPLACE;
		l2tab = disk->end;
		disk->end += disk->clustersz;
		if (ftruncate(disk->fd, disk->end) == -1)
			fatal("%s: ftruncate failed", __func__);

		/*
		 * If we translated, found a L2 entry, but it needed to
		 * be copied, copy it.
		 */
		if (orig != 0)
			copy_cluster(disk, disk, l2tab, orig);
		/* Update l1 -- we flush it later */
		disk->l1[l1off] = l2tab | QCOW2_INPLACE;
		/* inc_refs() may itself advance disk->end for a ref block. */
		inc_refs(disk, l2tab, 1);
	}
	l2tab &= ~QCOW2_INPLACE;

	/* Grow the disk */
	if (ftruncate(disk->fd, disk->end + disk->clustersz) < 0)
		fatal("%s: could not grow disk", __func__);
	/* Seed the new cluster with the old contents, if there are any. */
	if (src_phys > 0)
		copy_cluster(disk, base, disk->end, src_phys);
	cluster = disk->end;
	disk->end += disk->clustersz;
	/* Point the L2 entry at the new cluster. */
	buf = htobe64(cluster | QCOW2_INPLACE);
	if (pwrite(disk->fd, &buf, sizeof(buf), l2tab + l2off * 8) != 8)
		fatalx("%s: could not write cluster", __func__);

	/* TODO: lazily sync: currently VMD doesn't close things */
	buf = htobe64(disk->l1[l1off]);
	if (pwrite(disk->fd, &buf, sizeof(buf), disk->l1off + 8 * l1off) != 8)
		fatalx("%s: could not write l1", __func__);
	inc_refs(disk, cluster, 1);

	pthread_rwlock_unlock(&disk->lock);
	clusteroff = off % disk->clustersz;
	/* The first cluster of the file holds the header. */
	if (cluster + clusteroff < disk->clustersz)
		fatalx("write would clobber header");
	return cluster + clusteroff;
}

/*
 * Copies the cluster containing src in 'base' to the cluster containing
 * dst in 'disk'. Src and dst need not be aligned; both are rounded down
 * to a cluster boundary. 'base' and 'disk' may be the same image.
 *
 * Any failure is fatal.
 */
static void
copy_cluster(struct qcdisk *disk, struct qcdisk *base, off_t dst, off_t src)
{
	char *scratch;
	ssize_t n;

	/*
	 * Zero-filled on purpose: if the source cluster runs past the end
	 * of the source file the read comes back short, and the remainder
	 * must not carry stale heap contents into the image.
	 */
	scratch = calloc(1, disk->clustersz);
	if (!scratch)
		fatal("out of memory");
	src &= ~(disk->clustersz - 1);
	dst &= ~(disk->clustersz - 1);
	if (pread(base->fd, scratch, disk->clustersz, src) == -1)
		fatal("%s: could not read cluster", __func__);
	n = pwrite(disk->fd, scratch, disk->clustersz, dst);
	if (n == -1)
		fatal("%s: could not write cluster", __func__);
	/* A partial cluster on disk would silently corrupt the image. */
	if (n != disk->clustersz)
		fatalx("%s: short write of cluster", __func__);
	free(scratch);
}

/*
 * Bump the 16-bit reference count of the cluster containing 'off'.
 *
 * The refcount table entry for that cluster is read from disk; when it
 * is zero a refcount block is allocated at the end of the image and the
 * table entry is written back. With 'newcluster' set the count is stored
 * as 1 without reading the old value, otherwise the stored count is
 * incremented. Any I/O failure is fatal.
 */
static void
inc_refs(struct qcdisk *disk, off_t off, int newcluster)
{
	off_t tabpos, tabidx, blkidx, blk, slot;
	size_t perblk;
	uint16_t count;
	uint64_t raw;

	off &= ~QCOW2_INPLACE;

	/* One refcount block holds clustersz / 2 two-byte counters. */
	perblk = disk->clustersz / 2;
	tabidx = (off / disk->clustersz) / perblk;
	blkidx = (off / disk->clustersz) % perblk;

	tabpos = disk->refoff + 8 * tabidx;
	if (pread(disk->fd, &raw, sizeof(raw), tabpos) != 8)
		fatal("could not read refs");
	blk = be64toh(raw);

	/* No refcount block covers this cluster yet: append one. */
	if (blk == 0) {
		blk = disk->end;
		disk->end += disk->clustersz;
		if (ftruncate(disk->fd, disk->end) < 0)
			fatal("%s: failed to allocate ref block", __func__);
		raw = htobe64(blk);
		if (pwrite(disk->fd, &raw, sizeof(raw), tabpos) != 8)
			fatal("%s: failed to write ref block", __func__);
	}

	slot = blk + 2 * blkidx;
	if (newcluster) {
		count = 1;
	} else {
		if (pread(disk->fd, &count, sizeof(count), slot) != 2)
			fatal("could not read ref cluster");
		count = be16toh(count) + 1;
	}

	count = htobe16(count);
	if (pwrite(disk->fd, &count, sizeof(count), slot) != 2)
		fatal("%s: could not write ref block", __func__);
}

/*
 * virtio_qcow2_create
 *
 * Create an empty qcow2 imagefile with the specified path and size,
 * optionally layered on top of an existing qcow2 base image.
 *
 * Parameters:
 *  imgfile_path: path to the image file to create
 *  base_path   : path of the base image, or NULL for a standalone image
 *  disksz      : size of the virtual disk (in bytes); may be 0 when a
 *                base image is given, in which case the base size is used
 *
 * Return:
 *  EEXIST: The requested image file already exists
 *  0     : Image file successfully created
 *  Exxxx : Various other Exxxx errno codes due to other I/O errors
 *
 * Problems with the base image or a missing size terminate the process
 * via err()/errx() instead of returning.
 */
int
virtio_qcow2_create(const char *imgfile_path,
    const char *base_path, uint64_t disksz)
{
	struct qcheader hdr, basehdr;
	int fd, ret;
	ssize_t base_len;
	uint64_t l1sz, refsz, initsz, clustersz;
	uint64_t l1off, refoff, v, i, l1entrysz, refentrysz;
	uint16_t refs;

	if (base_path) {
		fd = open(base_path, O_RDONLY);
		if (fd == -1)
			err(1, "unable to open base image %s", base_path);
		if (read(fd, &basehdr, sizeof(basehdr)) != sizeof(basehdr))
			errx(1, "failure to read base image header");
		close(fd);
		if (strncmp(basehdr.magic,
		    VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW)) != 0)
			errx(1, "base image is not a qcow2 file");
		if (!disksz)
			disksz = betoh64(basehdr.disksz);
		else if (disksz != betoh64(basehdr.disksz))
			errx(1, "base size does not match requested size");
	}
	if (!base_path && !disksz)
		errx(1, "missing disk size");

	/* Layout: header cluster, L1 table, refcount table, refcount block. */
	clustersz = (1<<16);
	l1off = ALIGNSZ(sizeof(hdr), clustersz);

	/* Bytes of virtual disk covered by a single L1 entry. */
	l1entrysz = clustersz * clustersz / 8;
	l1sz = (disksz + l1entrysz - 1) / l1entrysz;

	refoff = ALIGNSZ(l1off + 8*l1sz, clustersz);
	/*
	 * NOTE(review): a refcount table cluster holds clustersz / 8
	 * entries, each covering clustersz / 2 clusters, which suggests a
	 * divisor of 16 rather than 2 here. Only matters for very large
	 * disks; confirm before changing.
	 */
	refentrysz = clustersz * clustersz * clustersz / 2;
	refsz = (disksz + refentrysz - 1) / refentrysz;

	initsz = ALIGNSZ(refoff + refsz*clustersz, clustersz);
	base_len = base_path ? strlen(base_path) : 0;

	memcpy(hdr.magic, VM_MAGIC_QCOW, strlen(VM_MAGIC_QCOW));
	hdr.version		= htobe32(3);
	/* The base path, if any, is stored right behind the header. */
	hdr.backingoff		= htobe64(base_path ? sizeof(hdr) : 0);
	hdr.backingsz		= htobe32(base_len);
	hdr.clustershift	= htobe32(16);
	hdr.disksz		= htobe64(disksz);
	hdr.cryptmethod		= htobe32(0);
	hdr.l1sz		= htobe32(l1sz);
	hdr.l1off		= htobe64(l1off);
	hdr.refoff		= htobe64(refoff);
	hdr.refsz		= htobe32(refsz);
	hdr.snapcount		= htobe32(0);
	hdr.snapsz		= htobe64(0);
	hdr.incompatfeatures	= htobe64(0);
	hdr.compatfeatures	= htobe64(0);
	hdr.autoclearfeatures	= htobe64(0);
	hdr.reforder		= htobe32(4);
	hdr.headersz		= htobe32(sizeof(hdr));

	/* Refuse to overwrite an existing image */
	fd = open(imgfile_path, O_RDWR | O_CREAT | O_TRUNC | O_EXCL,
	    S_IRUSR | S_IWUSR);
	if (fd == -1)
		return (errno);

	/* Write out the header */
	if (write(fd, &hdr, sizeof(hdr)) != sizeof(hdr))
		goto error;

	/* Add the base image */
	if (base_path && write(fd, base_path, base_len) != base_len)
		goto error;

	/* Extend to desired size, and add one refcount cluster */
	if (ftruncate(fd, (off_t)initsz + clustersz) == -1)
		goto error;

	/*
	 * Paranoia: if our disk image takes more than one cluster
	 * to refcount the initial image, fail.
	 */
	if (initsz/clustersz > clustersz/2) {
		errno = ERANGE;
		goto error;
	}

	/* Add a refcount block, and refcount ourselves. */
	v = htobe64(initsz);
	if (pwrite(fd, &v, 8, refoff) != 8)
		goto error;
	/* One reference for every metadata cluster plus the block itself. */
	refs = htobe16(1);
	for (i = 0; i < initsz/clustersz + 1; i++) {
		if (pwrite(fd, &refs, 2, initsz + 2*i) != 2)
			goto error;
	}

	/* Report a failed close as an errno value like every other error. */
	if (close(fd) == -1)
		return (errno);
	return (0);
error:
	/* Save errno first: close() and unlink() may overwrite it. */
	ret = errno;
	close(fd);
	unlink(imgfile_path);
	/* A short write leaves errno unset; never report success here. */
	return (ret != 0 ? ret : EIO);
}