/*	$OpenBSD: pctr.c,v 1.10 2003/06/10 22:20:49 deraadt Exp $	*/

/*
 * Pentium performance counter control program for OpenBSD.
 * Copyright 1996 David Mazieres <dm@lcs.mit.edu>.
 *
 * Modification and redistribution in source and binary forms is
 * permitted provided that due credit is given to the author and the
 * OpenBSD project by leaving this copyright notice intact.
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/param.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/ioctl.h>
#include <err.h>
#include <fcntl.h>
#include <machine/cpu.h>
#include <machine/pctr.h>
#include <machine/specialreg.h>

/*
 * Capability flags stored in the `flags' member of struct ctrfn.
 * The first two select which unit-mask suffix a counter function
 * accepts on the command line; the last two mark functions that can
 * only be counted on one specific counter.
 */
#define CFL_MESI 0x1   /* Unit mask accepts MESI encoding */
#define CFL_SA   0x2   /* Unit mask accepts Self/Any bit */
#define CFL_C0   0x4   /* Counter 0 only */
#define CFL_C1   0x8   /* Counter 1 only */

/* Kernel cpuid values. */
int cpu_id, cpu_feature;
char cpu_vendor[16];

int pctr_isintel;

#define usetsc		(cpu_feature & CPUID_TSC)
#define usep5ctr	(pctr_isintel && (((cpu_id >> 8) & 15) == 5) && \
				(((cpu_id >> 4) & 15) > 0))
#define usep6ctr	(pctr_isintel && ((cpu_id >> 8) & 15) == 6)
#define cpufamily	((cpu_id >> 8) & 15)

extern char *__progname;

/*
 * Description of one hardware counter function (event).
 * The p5fn and p6fn tables are arrays of these, terminated by an entry
 * whose `name' is NULL.
 */
struct ctrfn {
	u_int fn;		/* function (event) number matched by fn2cfnp() */
	int flags;		/* bitwise OR of CFL_* capability flags */
	char *name;		/* short name printed by list() and fn2str() */
	char *desc;		/* longer description for list(), or NULL */
};

/*
 * Counter functions for CPU family 5 (labelled "Pentium" by list()).
 * Function numbers are 6 bits wide (fn2cfnp() masks with 0x3f).  No
 * entry carries flags or a long description.  The table is terminated
 * by the entry with a NULL name.
 */
struct ctrfn p5fn[] = {
	{0x00, 0, "Data read", NULL},
	{0x01, 0, "Data write", NULL},
	{0x02, 0, "Data TLB miss", NULL},
	{0x03, 0, "Data read miss", NULL},
	{0x04, 0, "Data write miss", NULL},
	{0x05, 0, "Write (hit) to M or E state lines", NULL},
	{0x06, 0, "Data cache lines written back", NULL},
	{0x07, 0, "Data cache snoops", NULL},
	{0x08, 0, "Data cache snoop hits", NULL},
	{0x09, 0, "Memory accesses in both pipes", NULL},
	{0x0a, 0, "Bank conflicts", NULL},
	{0x0b, 0, "Misaligned data memory references", NULL},
	{0x0c, 0, "Code read", NULL},
	{0x0d, 0, "Code TLB miss", NULL},
	{0x0e, 0, "Code cache miss", NULL},
	{0x0f, 0, "Any segment register load", NULL},
	{0x12, 0, "Branches", NULL},
	{0x13, 0, "BTB hits", NULL},
	{0x14, 0, "Taken branch or BTB hit", NULL},
	{0x15, 0, "Pipeline flushes", NULL},
	{0x16, 0, "Instructions executed", NULL},
	{0x17, 0, "Instructions executed in the V-pipe", NULL},
	{0x18, 0, "Bus utilization (clocks)", NULL},
	{0x19, 0, "Pipeline stalled by write backup", NULL},
	{0x1a, 0, "Pipeline stalled by data memory read", NULL},
	{0x1b, 0, "Pipeline stalled by write to E or M line", NULL},
	{0x1c, 0, "Locked bus cycle", NULL},
	{0x1d, 0, "I/O read or write cycle", NULL},
	{0x1e, 0, "Noncacheable memory references", NULL},
	{0x1f, 0, "AGI (Address Generation Interlock)", NULL},
	{0x22, 0, "Floating-point operations", NULL},
	{0x23, 0, "Breakpoint 0 match", NULL},
	{0x24, 0, "Breakpoint 1 match", NULL},
	{0x25, 0, "Breakpoint 2 match", NULL},
	{0x26, 0, "Breakpoint 3 match", NULL},
	{0x27, 0, "Hardware interrupts", NULL},
	{0x28, 0, "Data read or data write", NULL},
	{0x29, 0, "Data read miss or data write miss", NULL},
	{0x0, 0, NULL, NULL},
};

/*
 * Counter functions for CPU family 6 (labelled "Pentium Pro" by list()).
 * Function numbers are 8 bits wide (fn2cfnp() masks with 0xff).  The
 * flags say whether the function takes a MESI or self/any unit mask and
 * whether it is restricted to counter 0 or counter 1.  The table is
 * terminated by the entry with a NULL name.
 */
struct ctrfn p6fn[] = {
	{0x03, 0, "LD_BLOCKS",
	 "Number of store buffer blocks."},
	{0x04, 0, "SB_DRAINS",
	 "Number of store buffer drain cycles."},
	{0x05, 0, "MISALIGN_MEM_REF",
	 "Number of misaligned data memory references."},
	{0x06, 0, "SEGMENT_REG_LOADS",
	 "Number of segment register loads."},
	{0x10, CFL_C0, "FP_COMP_OPS_EXE",
	 "Number of computational floating-point operations executed."},
	{0x11, CFL_C1, "FP_ASSIST",
	 "Number of floating-point exception cases handled by microcode."},
	{0x12, CFL_C1, "MUL",
	 "Number of multiplies."},
	{0x13, CFL_C1, "DIV",
	 "Number of divides."},
	{0x14, CFL_C0, "CYCLES_DIV_BUSY",
	 "Number of cycles during which the divider is busy."},
	{0x21, 0, "L2_ADS",
	 "Number of L2 address strobes."},
	{0x22, 0, "L2_DBUS_BUSY",
	 "Number of cycles during which the data bus was busy."},
	{0x23, 0, "L2_DBUS_BUSY_RD",
	 "Number of cycles during which the data bus was busy transferring "
	 "data from L2 to the processor."},
	{0x24, 0, "L2_LINES_IN",
	 "Number of lines allocated in the L2."},
	{0x25, 0, "L2_M_LINES_INM",
	 "Number of modified lines allocated in the L2."},
	{0x26, 0, "L2_LINES_OUT",
	 "Number of lines removed from the L2 for any reason."},
	{0x27, 0, "L2_M_LINES_OUTM",
	 "Number of modified lines removed from the L2 for any reason."},
	{0x28, CFL_MESI, "L2_IFETCH",
	 "Number of L2 instruction fetches."},
	{0x29, CFL_MESI, "L2_LD",
	 "Number of L2 data loads."},
	{0x2a, CFL_MESI, "L2_ST",
	 "Number of L2 data stores."},
	{0x2e, CFL_MESI, "L2_RQSTS",
	 "Number of L2 requests."},
	{0x43, 0, "DATA_MEM_REFS",
	 "All memory references, both cacheable and non-cacheable."},
	{0x45, 0, "DCU_LINES_IN",
	 "Total lines allocated in the DCU."},
	{0x46, 0, "DCU_M_LINES_IN",
	 "Number of M state lines allocated in the DCU."},
	{0x47, 0, "DCU_M_LINES_OUT",
	 "Number of M state lines evicted from the DCU.  "
	 "This includes evictions via snoop HITM, intervention or replacement"},
	{0x48, 0, "DCU_MISS_OUTSTANDING",
	 "Weighted number of cycles while a DCU miss is outstanding."},
	{0x60, 0, "BUS_REQ_OUTSTANDING",
	 "Number of bus requests outstanding."},
	{0x61, 0, "BUS_BNR_DRV",
	 "Number of bus clock cycles during which the processor is "
	 "driving the BNR pin."},
	{0x62, CFL_SA, "BUS_DRDY_CLOCKS",
	 "Number of clocks during which DRDY is asserted."},
	{0x63, CFL_SA, "BUS_LOCK_CLOCKS",
	 "Number of clocks during which LOCK is asserted."},
	{0x64, 0, "BUS_DATA_RCV",
	 "Number of bus clock cycles during which the processor is "
	 "receiving data."},
	{0x65, CFL_SA, "BUS_TRAN_BRD",
	 "Number of burst read transactions."},
	{0x66, CFL_SA, "BUS_TRAN_RFO",
	 "Number of read for ownership transactions."},
	{0x67, CFL_SA, "BUS_TRANS_WB",
	 "Number of write back transactions."},
	{0x68, CFL_SA, "BUS_TRAN_IFETCH",
	 "Number of instruction fetch transactions."},
	{0x69, CFL_SA, "BUS_TRAN_INVAL",
	 "Number of invalidate transactions."},
	{0x6a, CFL_SA, "BUS_TRAN_PWR",
	 "Number of partial write transactions."},
	{0x6b, CFL_SA, "BUS_TRANS_P",
	 "Number of partial transactions."},
	{0x6c, CFL_SA, "BUS_TRANS_IO",
	 "Number of I/O transactions."},
	{0x6d, CFL_SA, "BUS_TRAN_DEF",
	 "Number of deferred transactions."},
	{0x6e, CFL_SA, "BUS_TRAN_BURST",
	 "Number of burst transactions."},
	{0x6f, CFL_SA, "BUS_TRAN_MEM",
	 "Number of memory transactions."},
	{0x70, CFL_SA, "BUS_TRAN_ANY",
	 "Number of all transactions."},
	{0x79, 0, "CPU_CLK_UNHALTED",
	 "Number of cycles during which the processor is not halted."},
	{0x7a, 0, "BUS_HIT_DRV",
	 "Number of bus clock cycles during which the processor is "
	 "driving the HIT pin."},
	{0x7b, 0, "BUS_HITM_DRV",
	 "Number of bus clock cycles during which the processor is "
	 "driving the HITM pin."},
	{0x7e, 0, "BUS_SNOOP_STALL",
	 "Number of clock cycles during which the bus is snoop stalled."},
	{0x80, 0, "IFU_IFETCH",
	 "Number of instruction fetches, both cacheable and non-cacheable."},
	{0x81, 0, "IFU_IFETCH_MISS",
	 "Number of instruction fetch misses."},
	{0x85, 0, "ITLB_MISS",
	 "Number of ITLB misses."},
	{0x86, 0, "IFU_MEM_STALL",
	 "Number of cycles that the instruction fetch pipe stage is stalled, "
	 "including cache misses, ITLB misses, ITLB faults, "
	 "and victim cache evictions"},
	{0x87, 0, "ILD_STALL",
	 "Number of cycles that the instruction length decoder is stalled"},
	{0xa2, 0, "RESOURCE_STALLS",
	 "Number of cycles during which there are resource-related stalls."},
	{0xc0, 0, "INST_RETIRED",
	 "Number of instructions retired."},
	{0xc1, CFL_C0, "FLOPS",
	 "Number of computational floating-point operations retired."},
	{0xc2, 0, "UOPS_RETIRED",
	 "Number of UOPs retired."},
	{0xc4, 0, "BR_INST_RETIRED",
	 "Number of branch instructions retired."},
	{0xc5, 0, "BR_MISS_PRED_RETIRED",
	 "Number of mispredicted branches retired."},
	{0xc6, 0, "CYCLES_INT_MASKED",
	 "Number of processor cycles for which interrupts are disabled."},
	{0xc7, 0, "CYCLES_INT_PENDING_AND_MASKED",
	 "Number of processor cycles for which interrupts are disabled "
	 "and interrupts are pending."},
	{0xc8, 0, "HW_INT_RX",
	 "Number of hardware interrupts received."},
	{0xc9, 0, "BR_TAKEN_RETIRED",
	 "Number of taken branches retired."},
	{0xca, 0, "BR_MISS_PRED_TAKEN_RET",
	 "Number of taken mispredicted branches retired."},
	{0xd0, 0, "INST_DECODER",
	 "Number of instructions decoded."},
	{0xd2, 0, "PARTIAL_RAT_STALLS",
	 "Number of cycles or events for partial stalls."},
	{0xe0, 0, "BR_INST_DECODED",
	 "Number of branch instructions decoded."},
	{0xe2, 0, "BTB_MISSES",
	 "Number of branches that miss the BTB."},
	{0xe4, 0, "BR_BOGUS",
	 "Number of bogus branches."},
	{0xe6, 0, "BACLEARS",
	 "Number of times BACLEAR is asserted."},
	{0x0, 0, NULL, NULL},
};

/*
 * Print a description string to stdout, indented by six spaces and
 * wrapped at word boundaries.
 *
 * A remainder shorter than 70 characters is printed as one line.
 * Otherwise the line is broken at the last space at or before
 * character index 71, with trailing spaces dropped; leading spaces of
 * each continuation line are skipped.  If there is no space to break
 * at, the line is cut hard at the scan start so that the scan never
 * leaves the string.
 *
 * desc: NUL-terminated text to print; it is only read.
 */
static void
printdesc(char *desc)
{
	size_t len, start, brk;

	for (;;) {
		while (*desc == ' ')
			desc++;
		len = strlen(desc);
		if (len < 70) {
			if (len > 0)
				printf("      %s\n", desc);
			return;
		}

		/*
		 * Begin the backwards scan at index 71, or at the
		 * terminating NUL when the text is shorter than that, so
		 * nothing past the end of the string is ever read.
		 */
		start = len < 71 ? len : 71;
		brk = start;
		while (brk > 0 && desc[brk] != ' ')
			brk--;

		if (brk == 0) {
			/* desc[0] is never a space here: no break point. */
			brk = start;
		} else {
			/* Drop the run of spaces in front of the break. */
			while (brk > 0 && desc[brk - 1] == ' ')
				brk--;
		}

		/* The precision argument of %.*s must be an int. */
		printf("      %.*s\n", (int)brk, desc);
		desc += brk;
	}
}

/*
 * Print every known counter function for CPU family `fam' (5 or 6) to
 * stdout: function number, name, accepted unit-mask suffix, counter
 * restriction, and the wrapped long description when there is one.
 * Any other family is reported on stderr and the program exits with
 * status 1.
 */
static void
list(int fam)
{
	struct ctrfn *entry;
	const char *cpuname;

	switch (fam) {
	case 5:
		entry = p5fn;
		cpuname = "Pentium";
		break;
	case 6:
		entry = p6fn;
		cpuname = "Pentium Pro";
		break;
	default:
		fprintf(stderr, "Unknown CPU family %d\n", fam);
		exit(1);
	}

	printf("Hardware counter functions for the %s:\n\n", cpuname);

	while (entry->name != NULL) {
		int flags = entry->flags;

		printf("%02x  %s", entry->fn, entry->name);

		/* A function takes at most one kind of unit-mask suffix. */
		if (flags & CFL_MESI)
			printf("/mesi");
		else if (flags & CFL_SA)
			printf("/a");

		if (flags & CFL_C0)
			printf("  (ctr0 only)");
		if (flags & CFL_C1)
			printf("  (ctr1 only)");
		printf("\n");

		if (entry->desc != NULL)
			printdesc(entry->desc);
		entry++;
	}
}

/*
 * Look up the table entry for a counter function.
 *
 * family: 6 selects the p6fn table and an 8-bit function number; any
 *         other value selects p5fn and a 6-bit function number.
 * sel:    counter selector; only its low function-number bits are used.
 *
 * Returns a pointer to the matching entry, or NULL if the function
 * number is not in the table.
 */
struct ctrfn *
fn2cfnp(u_int family, u_int sel)
{
	struct ctrfn *entry = (family == 6) ? p6fn : p5fn;
	u_int code = sel & ((family == 6) ? 0xff : 0x3f);

	while (entry->name != NULL) {
		if (entry->fn == code)
			return (entry);
		entry++;
	}
	return (NULL);
}

/*
 * Format a counter selector as a human-readable string.
 *
 * family: CPU family; 5 and 6 are understood.
 * sel:    raw counter selector value.
 *
 * For family 5 the result is the c/u/k flag letters, the function
 * number in hex, and the function name.  For family 6 it is the
 * i/e/k/u flag letters, the function number, an optional "+cm" counter
 * mask (bits 24-31 of sel), an optional "/mesi" or "/a" unit mask for
 * functions that accept one, padding, and the function name.
 *
 * Returns a pointer to a static buffer that is overwritten by the next
 * call, or NULL for any other family.
 */
static char *
fn2str(int family, u_int sel)
{
	static char buf[128];
	char um[9] = "";
	char cm[6] = "";
	struct ctrfn *cfnp;
	u_int fn;
	int pad;

	if (family == 5) {
		fn = sel & 0x3f;
		cfnp = fn2cfnp (family, fn);
		snprintf(buf, sizeof buf, "%c%c%c %02x %s",
		    sel & P5CTR_C ? 'c' : '-',
		    sel & P5CTR_U ? 'u' : '-',
		    sel & P5CTR_K ? 'k' : '-',
		    fn, cfnp ? cfnp->name : "unknown function");
	} else if (family == 6) {
		fn = sel & 0xff;
		cfnp = fn2cfnp (family, fn);
		if (cfnp && cfnp->flags & CFL_MESI)
			snprintf(um, sizeof um, "/%c%c%c%c",
			    sel & P6CTR_UM_M ? 'm' : '-',
			    sel & P6CTR_UM_E ? 'e' : '-',
			    sel & P6CTR_UM_S ? 's' : '-',
			    sel & P6CTR_UM_I ? 'i' : '-');
		else if (cfnp && cfnp->flags & CFL_SA)
			snprintf(um, sizeof um, "/%c",
			    sel & P6CTR_UM_A ? 'a' : '-');
		if (sel >> 24)
			snprintf(cm, sizeof cm, "+%u", sel >> 24);
		/*
		 * Pad the cm/um columns to a combined width of 7.  The
		 * `*' field width must be an int, and the two strings can
		 * add up to more than 7 characters (e.g. "+255" plus
		 * "/mesi"), so compute the padding as a signed value and
		 * clamp it at zero.
		 */
		pad = 7 - (int)(strlen (cm) + strlen (um));
		if (pad < 0)
			pad = 0;
		snprintf(buf, sizeof buf, "%c%c%c%c %02x%s%s%*s %s",
		    sel & P6CTR_I ? 'i' : '-',
		    sel & P6CTR_E ? 'e' : '-',
		    sel & P6CTR_K ? 'k' : '-',
		    sel & P6CTR_U ? 'u' : '-',
		    fn, cm, um, pad, "",
		    cfnp ? cfnp->name : "unknown function");
	} else
		return (NULL);
	return (buf);
}

/*
 * Read the counter state from the pctr device with the PCIOCRD ioctl
 * and print it to stdout.  The per-counter lines (value plus decoded
 * function) are printed only on CPUs with P5 or P6 style counters; the
 * tsc and idl values are always printed.  Exits through err(3) if the
 * device cannot be opened or the ioctl fails.
 */
static void
readst(void)
{
	struct pctrst st;
	int dev, n;

	if ((dev = open (_PATH_PCTR, O_RDONLY)) < 0)
		err (1, _PATH_PCTR);
	if (ioctl (dev, PCIOCRD, &st) < 0)
		err (1, "PCIOCRD");
	close (dev);

	if (usep5ctr || usep6ctr) {
		for (n = 0; n < PCTR_NUM; n++) {
			char *what = fn2str (cpufamily, st.pctr_fn[n]);

			printf(" ctr%d = %16qd  [%s]\n", n, st.pctr_hwc[n],
			    what);
		}
	}
	printf("  tsc = %16qd\n  idl = %16qd\n", st.pctr_tsc, st.pctr_idl);
}

/*
 * Program one hardware counter.
 *
 * ctr: counter index; it is added to PCIOCS0 to form the ioctl request.
 * val: selector value handed to the driver.
 *
 * Exits through err(3) if the pctr device cannot be opened for writing
 * or the ioctl fails.
 */
static void
setctr(int ctr, u_int val)
{
	int dev = open (_PATH_PCTR, O_WRONLY);

	if (dev < 0)
		err (1, _PATH_PCTR);

	if (ioctl (dev, PCIOCS0 + ctr, &val) < 0)
		err (1, "PCIOCSn");

	close (dev);
}

/*
 * Print the usage message to stderr and exit with status 1.
 * The common part is always printed; the description of the -s option
 * is added only for the counter flavour (P5 or P6) of the running CPU.
 */
static void
usage(void)
{
	const char *prog = __progname;

	fprintf(stderr,
	    "usage:\n"
	    "  %s\n"
	    "    Read the counters.\n"
	    "  %s -l [5|6]\n"
	    "    List all possible counter functions for P5/P6.\n",
	    prog, prog);

	if (usep5ctr) {
		fprintf(stderr,
		    "  %s -s {0|1} [-[c][u][k]] function\n"
		    "    Configure counter.\n"
		    "      0/1 - counter to configure\n"
		    "        c - count cycles not events\n"
		    "        u - count events in user mode (ring 3)\n"
		    "        k - count events in kernel mode (rings 0-2)\n",
		    prog);
	} else if (usep6ctr) {
		fprintf(stderr,
		    "  %s -s {0|1} [-[i][e][k][u]] "
		    "function[+cm][/{[m][e][s][i]|[a]}]\n"
		    "    Configure counter.\n"
		    "       0/1 - counter number to configure\n"
		    "         i - invert cm\n"
		    "         e - edge detect\n"
		    "         k - count events in kernel mode (rings 0-2)\n"
		    "         u - count events in user mode (ring 3)\n"
		    "        cm - # events/cycle required to bump ctr\n"
		    "      mesi - Modified/Exclusive/Shared/Invalid in cache\n"
		    "       s/a - self generated/all events\n",
		    prog);
	}

	exit (1);
}

/*
 * Program entry point.
 *
 * Fetches the CPU vendor string, cpuid value and feature word from the
 * kernel via machdep sysctls, then dispatches on the command line:
 *
 *   (no arguments)           print the counters (readst)
 *   -l [family]              list counter functions for the running
 *                            CPU family, or for the given one
 *   -s ctr [-flags] function configure counter `ctr' (setctr)
 *
 * Anything else prints the usage message and exits with status 1.
 * Returns 0 on success.
 */
int
main(int argc, char **argv)
{
	u_int ctr;
	char *cp;
	u_int fn, fl = 0;
	char **ap;
	int ac;
	struct ctrfn *cfnp;
	int mib[2];
	size_t len;

	/* Get the kernel cpuid return values. */
	mib[0] = CTL_MACHDEP;
	mib[1] = CPU_CPUVENDOR;
	if (sysctl(mib, 2, NULL, &len, NULL, 0) == -1)
		err(1, "sysctl CPU_CPUVENDOR");
	/* Not a system call failure, so errno must not be reported. */
	if (len > sizeof(cpu_vendor))		/* Shouldn't ever happen. */
		errx(1, "sysctl CPU_CPUVENDOR too big");
	if (sysctl(mib, 2, cpu_vendor, &len, NULL, 0) == -1)
		err(1, "sysctl CPU_CPUVENDOR");
	/* Guarantee termination before the strcmp() below. */
	cpu_vendor[sizeof(cpu_vendor) - 1] = '\0';

	mib[1] = CPU_CPUID;
	len = sizeof(cpu_id);
	if (sysctl(mib, 2, &cpu_id, &len, NULL, 0) == -1)
		err(1, "sysctl CPU_CPUID");

	mib[1] = CPU_CPUFEATURE;
	len = sizeof(cpu_feature);
	if (sysctl(mib, 2, &cpu_feature, &len, NULL, 0) == -1)
		err(1, "sysctl CPU_CPUFEATURE");

	pctr_isintel = (strcmp(cpu_vendor, "GenuineIntel") == 0);

	if (argc <= 1)
		readst ();
	else if (argc == 2 && !strcmp (argv[1], "-l"))
		list (cpufamily);
	else if (argc == 3 && !strcmp (argv[1], "-l"))
		list (atoi (argv[2]));
	else if (!strcmp (argv[1], "-s") && argc >= 4) {
		ctr = atoi (argv[2]);
		if (ctr >= PCTR_NUM)
			usage ();
		ap = &argv[3];
		ac = argc - 3;

		if (usep6ctr)
			fl |= P6CTR_EN;

		/* Optional "-flags" word; without it count user and kernel. */
		if (**ap == '-') {
			cp = *ap;
			if (usep6ctr) {
				while (*++cp)
					switch (*cp) {
					case 'i':
						fl |= P6CTR_I;
						break;
					case 'e':
						fl |= P6CTR_E;
						break;
					case 'k':
						fl |= P6CTR_K;
						break;
					case 'u':
						fl |= P6CTR_U;
						break;
					default:
						usage ();
					}
			} else if(usep5ctr) {
				while (*++cp)
					switch (*cp) {
					case 'c':
						fl |= P5CTR_C;
						break;
					case 'k':
						fl |= P5CTR_K;
						break;
					case 'u':
						fl |= P5CTR_U;
						break;
					default:
						usage ();
					}
			}
			ap++;
			ac--;
		} else {
			if (usep6ctr)
				fl |= P6CTR_U|P6CTR_K;
			else if (usep5ctr)
				fl |= P5CTR_U|P5CTR_K;
		}

		if (!ac)
			usage ();

		/* Function number: hex, 8 bits on P6, 6 bits otherwise. */
		fn = strtoul (*ap, NULL, 16);
		if ((usep6ctr && (fn & ~0xff)) || (!usep6ctr && (fn & ~0x3f)))
			usage ();
		fl |= fn;

		/* Optional "+cm" counter mask, stored in bits 24-31 (P6). */
		if (usep6ctr && (cp = strchr (*ap, '+'))) {
			cp++;
			fn = strtol (cp, NULL, 0);
			if (fn & ~0xff)
				usage ();
			fl |= (fn << 24);
		}

		/*
		 * Unit masks are handled only for P6 counters.  Look the
		 * function up in the P6 table only on such a CPU, so that
		 * neither an explicit "/..." suffix nor the default MESI
		 * mask can OR P6 unit-mask bits into a selector meant for
		 * another CPU type.
		 */
		cfnp = usep6ctr ? fn2cfnp (6, fl) : NULL;
		if (cfnp && (cp = strchr (*ap, '/'))) {
			if (cfnp->flags & CFL_MESI) {
				while (*++cp)
					switch (*cp) {
					case 'm':
						fl |= P6CTR_UM_M;
						break;
					case 'e':
						fl |= P6CTR_UM_E;
						break;
					case 's':
						fl |= P6CTR_UM_S;
						break;
					case 'i':
						fl |= P6CTR_UM_I;
						break;
					default:
						usage ();
					}
			} else if (cfnp->flags & CFL_SA) {
				while (*++cp)
					switch (*cp) {
					case 'a':
						fl |= P6CTR_UM_A;
						break;
					default:
						usage ();
					}
			} else
				usage ();
		} else if (cfnp && (cfnp->flags & CFL_MESI))
			fl |= P6CTR_UM_MESI;	/* no suffix: count all states */
		ap++;
		ac--;

		/* Trailing arguments are an error. */
		if (ac)
			usage ();

		/* On P6, function number 0 writes an all-zero selector. */
		if (usep6ctr && ! (fl & 0xff))
			fl = 0;
		setctr (ctr, fl);
	} else
		usage ();

	return 0;
}