Diffstat (limited to 'sys/arch/amd64')
-rw-r--r--  sys/arch/amd64/amd64/bus_space.c     14
-rw-r--r--  sys/arch/amd64/amd64/genassym.cf      4
-rw-r--r--  sys/arch/amd64/amd64/ipi.c           11
-rw-r--r--  sys/arch/amd64/amd64/ipifuncs.c       4
-rw-r--r--  sys/arch/amd64/amd64/lapic.c          8
-rw-r--r--  sys/arch/amd64/amd64/pmap.c         657
-rw-r--r--  sys/arch/amd64/amd64/vector.S        50
-rw-r--r--  sys/arch/amd64/amd64/vm_machdep.c    20
-rw-r--r--  sys/arch/amd64/include/atomic.h      12
-rw-r--r--  sys/arch/amd64/include/i82489var.h   12
-rw-r--r--  sys/arch/amd64/include/intr.h         3
-rw-r--r--  sys/arch/amd64/include/pmap.h        14
12 files changed, 363 insertions, 446 deletions
diff --git a/sys/arch/amd64/amd64/bus_space.c b/sys/arch/amd64/amd64/bus_space.c
index 84489296f7a..569806ff8c2 100644
--- a/sys/arch/amd64/amd64/bus_space.c
+++ b/sys/arch/amd64/amd64/bus_space.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: bus_space.c,v 1.4 2007/01/15 23:19:05 jsg Exp $ */
+/* $OpenBSD: bus_space.c,v 1.5 2007/05/25 16:22:11 art Exp $ */
/* $NetBSD: bus_space.c,v 1.2 2003/03/14 18:47:53 christos Exp $ */
/*-
@@ -245,9 +245,8 @@ x86_mem_add_mapping(bus_addr_t bpa, bus_size_t size, int cacheable,
bus_space_handle_t *bshp)
{
u_long pa, endpa;
- vaddr_t va;
+ vaddr_t va, sva;
pt_entry_t *pte;
- int32_t cpumask = 0;
pa = trunc_page(bpa);
endpa = round_page(bpa + size);
@@ -261,6 +260,8 @@ x86_mem_add_mapping(bus_addr_t bpa, bus_size_t size, int cacheable,
if (va == 0)
return (ENOMEM);
+ sva = va;
+
*bshp = (bus_space_handle_t)(va + (bpa & PGOFSET));
for (; pa < endpa; pa += PAGE_SIZE, va += PAGE_SIZE) {
@@ -286,12 +287,13 @@ x86_mem_add_mapping(bus_addr_t bpa, bus_size_t size, int cacheable,
*pte &= ~PG_N;
else
*pte |= PG_N;
- pmap_tlb_shootdown(pmap_kernel(), va, *pte,
- &cpumask);
}
}
+ if (!cacheable) {
+ pmap_tlb_shootrange(pmap_kernel(), sva, sva + size);
+ pmap_tlb_shootwait();
+ }
- pmap_tlb_shootnow(cpumask);
pmap_update(pmap_kernel());
return 0;
diff --git a/sys/arch/amd64/amd64/genassym.cf b/sys/arch/amd64/amd64/genassym.cf
index 638ea1e5ffc..06905160182 100644
--- a/sys/arch/amd64/amd64/genassym.cf
+++ b/sys/arch/amd64/amd64/genassym.cf
@@ -1,4 +1,4 @@
-# $OpenBSD: genassym.cf,v 1.11 2007/05/10 17:59:23 deraadt Exp $
+# $OpenBSD: genassym.cf,v 1.12 2007/05/25 16:22:11 art Exp $
# Written by Artur Grabowski <art@openbsd.org>, Public Domain
include <sys/param.h>
@@ -149,3 +149,5 @@ export NKL3_KIMG_ENTRIES
export NKL2_KIMG_ENTRIES
export CR4_DEFAULT
+
+export PAGE_SIZE
diff --git a/sys/arch/amd64/amd64/ipi.c b/sys/arch/amd64/amd64/ipi.c
index 30054c80fee..88d642cf7e8 100644
--- a/sys/arch/amd64/amd64/ipi.c
+++ b/sys/arch/amd64/amd64/ipi.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: ipi.c,v 1.6 2007/05/10 17:59:23 deraadt Exp $ */
+/* $OpenBSD: ipi.c,v 1.7 2007/05/25 16:22:11 art Exp $ */
/* $NetBSD: ipi.c,v 1.2 2003/03/01 13:05:37 fvdl Exp $ */
/*-
@@ -74,6 +74,15 @@ x86_send_ipi(struct cpu_info *ci, int ipimask)
return ret;
}
+int
+x86_fast_ipi(struct cpu_info *ci, int ipi)
+{
+ if (!(ci->ci_flags & CPUF_RUNNING))
+ return (ENOENT);
+
+ return (x86_ipi(ipi, ci->ci_apicid, LAPIC_DLMODE_FIXED));
+}
+
void
x86_broadcast_ipi(int ipimask)
{
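
The new x86_fast_ipi() bypasses the ipimask dispatch that x86_send_ipi() goes through: the vector number itself selects the IDT handler on the target CPU (see the idt_vec_set() calls in the lapic.c hunk below), so no per-IPI handler table is consulted. Callers are expected to check the return value, since a CPU that is not yet running is reported as ENOENT; the new pmap code treats any other failure as fatal, for example:

	/* Usage as it appears in the pmap.c shootdown path below. */
	if (x86_fast_ipi(ci, LAPIC_IPI_INVLPG) != 0)
		panic("pmap_tlb_shootpage: ipi failed");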
diff --git a/sys/arch/amd64/amd64/ipifuncs.c b/sys/arch/amd64/amd64/ipifuncs.c
index 22248e3ff5b..11434d9eee1 100644
--- a/sys/arch/amd64/amd64/ipifuncs.c
+++ b/sys/arch/amd64/amd64/ipifuncs.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: ipifuncs.c,v 1.6 2007/05/06 03:37:08 gwk Exp $ */
+/* $OpenBSD: ipifuncs.c,v 1.7 2007/05/25 16:22:11 art Exp $ */
/* $NetBSD: ipifuncs.c,v 1.1 2003/04/26 18:39:28 fvdl Exp $ */
/*-
@@ -83,7 +83,7 @@ void (*ipifunc[X86_NIPI])(struct cpu_info *) =
x86_64_ipi_nop,
x86_64_ipi_flush_fpu,
x86_64_ipi_synch_fpu,
- pmap_do_tlb_shootdown,
+ NULL,
x86_64_reload_mtrr,
gdt_reload_cpu,
#ifdef DDB
diff --git a/sys/arch/amd64/amd64/lapic.c b/sys/arch/amd64/amd64/lapic.c
index 23832b90467..c07be9e57b1 100644
--- a/sys/arch/amd64/amd64/lapic.c
+++ b/sys/arch/amd64/amd64/lapic.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: lapic.c,v 1.9 2007/02/19 11:59:00 tom Exp $ */
+/* $OpenBSD: lapic.c,v 1.10 2007/05/25 16:22:11 art Exp $ */
/* $NetBSD: lapic.c,v 1.2 2003/05/08 01:04:35 fvdl Exp $ */
/*-
@@ -210,6 +210,12 @@ lapic_boot_init(paddr_t lapic_base)
#ifdef MULTIPROCESSOR
idt_allocmap[LAPIC_IPI_VECTOR] = 1;
idt_vec_set(LAPIC_IPI_VECTOR, Xintr_lapic_ipi);
+ idt_allocmap[LAPIC_IPI_INVLTLB] = 1;
+ idt_vec_set(LAPIC_IPI_INVLTLB, Xipi_invltlb);
+ idt_allocmap[LAPIC_IPI_INVLPG] = 1;
+ idt_vec_set(LAPIC_IPI_INVLPG, Xipi_invlpg);
+ idt_allocmap[LAPIC_IPI_INVLRANGE] = 1;
+ idt_vec_set(LAPIC_IPI_INVLRANGE, Xipi_invlrange);
#endif
idt_allocmap[LAPIC_SPURIOUS_VECTOR] = 1;
idt_vec_set(LAPIC_SPURIOUS_VECTOR, Xintrspurious);
diff --git a/sys/arch/amd64/amd64/pmap.c b/sys/arch/amd64/amd64/pmap.c
index a6fb4ceb6b4..aac10a10054 100644
--- a/sys/arch/amd64/amd64/pmap.c
+++ b/sys/arch/amd64/amd64/pmap.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: pmap.c,v 1.22 2007/05/18 14:41:55 art Exp $ */
+/* $OpenBSD: pmap.c,v 1.23 2007/05/25 16:22:11 art Exp $ */
/* $NetBSD: pmap.c,v 1.3 2003/05/08 18:13:13 thorpej Exp $ */
/*
@@ -255,55 +255,6 @@ struct simplelock pmaps_lock;
#define COUNT(x) /* nothing */
/*
- * TLB Shootdown:
- *
- * When a mapping is changed in a pmap, the TLB entry corresponding to
- * the virtual address must be invalidated on all processors. In order
- * to accomplish this on systems with multiple processors, messages are
- * sent from the processor which performs the mapping change to all
- * processors on which the pmap is active. For other processors, the
- * ASN generation numbers for that processor is invalidated, so that
- * the next time the pmap is activated on that processor, a new ASN
- * will be allocated (which implicitly invalidates all TLB entries).
- *
- * Shootdown job queue entries are allocated using a simple special-
- * purpose allocator for speed.
- */
-struct pmap_tlb_shootdown_job {
- TAILQ_ENTRY(pmap_tlb_shootdown_job) pj_list;
- vaddr_t pj_va; /* virtual address */
- pmap_t pj_pmap; /* the pmap which maps the address */
- pt_entry_t pj_pte; /* the PTE bits */
- struct pmap_tlb_shootdown_job *pj_nextfree;
-};
-
-#define PMAP_TLB_SHOOTDOWN_JOB_ALIGN 64
-union pmap_tlb_shootdown_job_al {
- struct pmap_tlb_shootdown_job pja_job;
- char pja_align[PMAP_TLB_SHOOTDOWN_JOB_ALIGN];
-};
-
-struct pmap_tlb_shootdown_q {
- TAILQ_HEAD(, pmap_tlb_shootdown_job) pq_head;
- int pq_pte; /* aggregate PTE bits */
- int pq_count; /* number of pending requests */
- struct mutex pq_mutex; /* spin lock on queue */
- int pq_flushg; /* pending flush global */
- int pq_flushu; /* pending flush user */
-} pmap_tlb_shootdown_q[X86_MAXPROCS];
-
-#define PMAP_TLB_MAXJOBS 16
-
-void pmap_tlb_shootdown_q_drain(struct pmap_tlb_shootdown_q *);
-struct pmap_tlb_shootdown_job *pmap_tlb_shootdown_job_get
- (struct pmap_tlb_shootdown_q *);
-void pmap_tlb_shootdown_job_put(struct pmap_tlb_shootdown_q *,
- struct pmap_tlb_shootdown_job *);
-
-struct mutex pmap_tlb_shootdown_job_mutex = MUTEX_INITIALIZER(IPL_NONE);
-union pmap_tlb_shootdown_job_al *pj_page, *pj_free;
-
-/*
* global data structures
*/
@@ -349,6 +300,11 @@ struct pmap_head pmaps;
struct pool pmap_pmap_pool;
+/*
+ * When we're freeing a ptp, we need to delay the freeing until all
+ * tlb shootdown has been done. This is the list of the to-be-freed pages.
+ */
+TAILQ_HEAD(pg_to_free, vm_page);
/*
* pool and cache that PDPs are allocated from
@@ -383,16 +339,16 @@ void pmap_enter_pv(struct vm_page *, struct pv_entry *, struct pmap *,
struct vm_page *pmap_get_ptp(struct pmap *, vaddr_t, pd_entry_t **);
struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, paddr_t, int);
void pmap_free_ptp(struct pmap *, struct vm_page *,
- vaddr_t, pt_entry_t *, pd_entry_t **, int32_t *);
-void pmap_freepage(struct pmap *, struct vm_page *, int);
+ vaddr_t, pt_entry_t *, pd_entry_t **, struct pg_to_free *);
+void pmap_freepage(struct pmap *, struct vm_page *, int, struct pg_to_free *);
static boolean_t pmap_is_active(struct pmap *, int);
void pmap_map_ptes(struct pmap *, pt_entry_t **, pd_entry_t ***);
struct pv_entry *pmap_remove_pv(struct vm_page *, struct pmap *, vaddr_t);
void pmap_do_remove(struct pmap *, vaddr_t, vaddr_t, int);
boolean_t pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *,
- vaddr_t, int32_t *, int);
+ vaddr_t, int);
void pmap_remove_ptes(struct pmap *, struct vm_page *, vaddr_t,
- vaddr_t, vaddr_t, int32_t *, int);
+ vaddr_t, vaddr_t, int);
#define PMAP_REMOVE_ALL 0 /* remove all mappings */
#define PMAP_REMOVE_SKIPWIRED 1 /* skip wired mappings */
@@ -449,33 +405,8 @@ pmap_sync_flags_pte(struct vm_page *pg, u_long pte)
void
pmap_apte_flush(struct pmap *pmap)
{
-#if defined(MULTIPROCESSOR)
- struct pmap_tlb_shootdown_q *pq;
- struct cpu_info *ci, *self = curcpu();
- CPU_INFO_ITERATOR cii;
-#endif
-
- tlbflush(); /* flush TLB on current processor */
-#if defined(MULTIPROCESSOR)
- /*
- * Flush the APTE mapping from all other CPUs that
- * are using the pmap we are using (who's APTE space
- * is the one we've just modified).
- *
- * XXXthorpej -- find a way to defer the IPI.
- */
- CPU_INFO_FOREACH(cii, ci) {
- if (ci == self)
- continue;
- if (pmap_is_active(pmap, ci->ci_cpuid)) {
- pq = &pmap_tlb_shootdown_q[ci->ci_cpuid];
- mtx_enter(&pq->pq_mutex);
- pq->pq_flushu++;
- mtx_leave(&pq->pq_mutex);
- x86_send_ipi(ci, X86_IPI_TLB);
- }
- }
-#endif
+ pmap_tlb_shoottlb();
+ pmap_tlb_shootwait();
}
/*
@@ -569,31 +500,22 @@ pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot)
{
pt_entry_t *pte, opte, npte;
- if (va < VM_MIN_KERNEL_ADDRESS)
- pte = vtopte(va);
- else
- pte = kvtopte(va);
+ pte = kvtopte(va);
npte = pa | ((prot & VM_PROT_WRITE) ? PG_RW : PG_RO) |
PG_V | pmap_pg_g;
if ((cpu_feature & CPUID_NXE) && !(prot & VM_PROT_EXECUTE))
npte |= PG_NX;
- opte = pmap_pte_set(pte, npte); /* zap! */
+ opte = pmap_pte_set(pte, npte);
#ifdef LARGEPAGES
/* XXX For now... */
if (opte & PG_PS)
panic("pmap_kenter_pa: PG_PS");
#endif
if (pmap_valid_entry(opte)) {
-#if defined(MULTIPROCESSOR)
- int32_t cpumask = 0;
-
- pmap_tlb_shootdown(pmap_kernel(), va, opte, &cpumask);
- pmap_tlb_shootnow(cpumask);
-#else
- /* Don't bother deferring in the single CPU case. */
- pmap_update_pg(va);
-#endif
+ /* This shouldn't happen */
+ pmap_tlb_shootpage(pmap_kernel(), va);
+ pmap_tlb_shootwait();
}
}
@@ -609,31 +531,25 @@ pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot)
*/
void
-pmap_kremove(vaddr_t va, vsize_t len)
+pmap_kremove(vaddr_t sva, vsize_t len)
{
pt_entry_t *pte, opte;
- int32_t cpumask = 0;
+ vaddr_t va, eva;
- len >>= PAGE_SHIFT;
- for ( /* null */ ; len ; len--, va += PAGE_SIZE) {
- if (va < VM_MIN_KERNEL_ADDRESS)
- pte = vtopte(va);
- else
- pte = kvtopte(va);
- opte = pmap_pte_set(pte, 0); /* zap! */
+ eva = sva + len;
+
+ for (va = sva; va != eva; va += PAGE_SIZE) {
+ pte = kvtopte(va);
+
+ opte = pmap_pte_set(pte, 0);
#ifdef LARGEPAGES
- /* XXX For now... */
- if (opte & PG_PS)
- panic("pmap_kremove: PG_PS");
+ KASSERT((opte & PG_PS) == 0);
#endif
-#ifdef DIAGNOSTIC
- if (opte & PG_PVLIST)
- panic("pmap_kremove: PG_PVLIST mapping for 0x%lx",
- va);
-#endif
- pmap_tlb_shootdown(pmap_kernel(), va, opte, &cpumask);
+ KASSERT((opte & PG_PVLIST) == 0);
}
- pmap_tlb_shootnow(cpumask);
+
+ pmap_tlb_shootrange(pmap_kernel(), sva, eva);
+ pmap_tlb_shootwait();
}
/*
@@ -838,15 +754,6 @@ pmap_bootstrap(vaddr_t kva_start, paddr_t max_pa)
&pool_allocator_nointr);
/*
- * Initialize the TLB shootdown queues.
- */
-
- for (i = 0; i < X86_MAXPROCS; i++) {
- TAILQ_INIT(&pmap_tlb_shootdown_q[i].pq_head);
- mtx_init(&pmap_tlb_shootdown_q[i].pq_mutex, IPL_IPI);
- }
-
- /*
* initialize the PDE pool and cache.
*/
@@ -896,21 +803,6 @@ pmap_prealloc_lowmem_ptps(void)
void
pmap_init(void)
{
- struct vm_page *pg;
- int i;
-
- pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_USERESERVE);
- if (pg == NULL)
- panic("pmap_init: pj_page");
- pj_page = (void *)pmap_map_direct(pg);
-
- for (i = 0;
- i < (PAGE_SIZE / sizeof (union pmap_tlb_shootdown_job_al) - 1);
- i++)
- pj_page[i].pja_job.pj_nextfree = &pj_page[i + 1].pja_job;
- pj_page[i].pja_job.pj_nextfree = NULL;
- pj_free = &pj_page[0];
-
/*
* done: pmap module is up (and ready for business)
*/
@@ -998,7 +890,8 @@ pmap_find_ptp(struct pmap *pmap, vaddr_t va, paddr_t pa, int level)
}
void
-pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level)
+pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level,
+ struct pg_to_free *pagelist)
{
int lidx;
struct uvm_object *obj;
@@ -1007,19 +900,16 @@ pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level)
obj = &pmap->pm_obj[lidx];
pmap->pm_stats.resident_count--;
- if (lidx != 0)
- simple_lock(&obj->vmobjlock);
if (pmap->pm_ptphint[lidx] == ptp)
pmap->pm_ptphint[lidx] = TAILQ_FIRST(&obj->memq);
ptp->wire_count = 0;
- uvm_pagefree(ptp);
- if (lidx != 0)
- simple_unlock(&obj->vmobjlock);
+ uvm_pagerealloc(ptp, NULL, 0);
+ TAILQ_INSERT_TAIL(pagelist, ptp, listq);
}
void
pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va,
- pt_entry_t *ptes, pd_entry_t **pdes, int32_t *cpumaskp)
+ pt_entry_t *ptes, pd_entry_t **pdes, struct pg_to_free *pagelist)
{
unsigned long index;
int level;
@@ -1028,19 +918,17 @@ pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va,
level = 1;
do {
- pmap_freepage(pmap, ptp, level);
+ pmap_freepage(pmap, ptp, level, pagelist);
index = pl_i(va, level + 1);
opde = pmap_pte_set(&pdes[level - 1][index], 0);
invaladdr = level == 1 ? (vaddr_t)ptes :
(vaddr_t)pdes[level - 2];
- pmap_tlb_shootdown(curpcb->pcb_pmap,
- invaladdr + index * PAGE_SIZE,
- opde, cpumaskp);
+ pmap_tlb_shootpage(curpcb->pcb_pmap,
+ invaladdr + index * PAGE_SIZE);
#if defined(MULTIPROCESSOR)
invaladdr = level == 1 ? (vaddr_t)PTE_BASE :
(vaddr_t)normal_pdes[level - 2];
- pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE, opde,
- cpumaskp);
+ pmap_tlb_shootpage(pmap, invaladdr + index * PAGE_SIZE);
#endif
if (level < PTP_LEVELS - 1) {
ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1);
@@ -1623,7 +1511,7 @@ pmap_copy_page(struct vm_page *srcpg, struct vm_page *dstpg)
void
pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
- vaddr_t startva, vaddr_t endva, int32_t *cpumaskp, int flags)
+ vaddr_t startva, vaddr_t endva, int flags)
{
struct pv_entry *pve;
pt_entry_t *pte = (pt_entry_t *) ptpva;
@@ -1654,8 +1542,6 @@ pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
pmap->pm_stats.wired_count--;
pmap->pm_stats.resident_count--;
- pmap_tlb_shootdown(pmap, startva, opte, cpumaskp);
-
if (ptp)
ptp->wire_count--; /* dropping a PTE */
@@ -1706,7 +1592,7 @@ pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
boolean_t
pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
- vaddr_t va, int32_t *cpumaskp, int flags)
+ vaddr_t va, int flags)
{
struct pv_entry *pve;
struct vm_page *pg;
@@ -1728,8 +1614,6 @@ pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
if (ptp)
ptp->wire_count--; /* dropping a PTE */
- pmap_tlb_shootdown(pmap, va, opte, cpumaskp);
-
pg = PHYS_TO_VM_PAGE(opte & PG_FRAME);
/*
@@ -1786,7 +1670,11 @@ pmap_do_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva, int flags)
paddr_t ptppa;
vaddr_t blkendva;
struct vm_page *ptp;
- int32_t cpumask = 0;
+ vaddr_t va;
+ int shootall = 0;
+ struct pg_to_free empty_ptps;
+
+ TAILQ_INIT(&empty_ptps);
PMAP_MAP_TO_HEAD_LOCK();
pmap_map_ptes(pmap, &ptes, &pdes); /* locks pmap */
@@ -1817,7 +1705,7 @@ pmap_do_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva, int flags)
/* do it! */
result = pmap_remove_pte(pmap, ptp,
- &ptes[pl1_i(sva)], sva, &cpumask, flags);
+ &ptes[pl1_i(sva)], sva, flags);
/*
* if mapping removed and the PTP is no longer
@@ -1826,21 +1714,28 @@ pmap_do_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva, int flags)
if (result && ptp && ptp->wire_count <= 1)
pmap_free_ptp(pmap, ptp, sva, ptes, pdes,
- &cpumask);
+ &empty_ptps);
+ pmap_tlb_shootpage(pmap, sva);
}
- pmap_tlb_shootnow(cpumask);
+ pmap_tlb_shootwait();
pmap_unmap_ptes(pmap); /* unlock pmap */
PMAP_MAP_TO_HEAD_UNLOCK();
+
+ while ((ptp = TAILQ_FIRST(&empty_ptps)) != NULL) {
+ TAILQ_REMOVE(&empty_ptps, ptp, listq);
+ uvm_pagefree(ptp);
+ }
+
return;
}
- cpumask = 0;
-
- for (/* null */ ; sva < eva ; sva = blkendva) {
+ if ((eva - sva > 32 * PAGE_SIZE) && pmap != pmap_kernel())
+ shootall = 1;
+ for (va = sva; va < eva; va = blkendva) {
/* determine range of block */
- blkendva = x86_round_pdr(sva+1);
+ blkendva = x86_round_pdr(va + 1);
if (blkendva > eva)
blkendva = eva;
@@ -1858,11 +1753,11 @@ pmap_do_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva, int flags)
* be VM_MAX_ADDRESS.
*/
- if (pl_i(sva, PTP_LEVELS) == PDIR_SLOT_PTE)
+ if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE)
/* XXXCDC: ugly hack to avoid freeing PDP here */
continue;
- if (!pmap_pdes_valid(sva, pdes, &pde))
+ if (!pmap_pdes_valid(va, pdes, &pde))
continue;
/* PA of the PTP */
@@ -1873,7 +1768,7 @@ pmap_do_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva, int flags)
/* we never free kernel PTPs */
ptp = NULL;
} else {
- ptp = pmap_find_ptp(pmap, sva, ptppa, 1);
+ ptp = pmap_find_ptp(pmap, va, ptppa, 1);
#ifdef DIAGNOSTIC
if (ptp == NULL)
panic("pmap_remove: unmanaged PTP "
@@ -1881,18 +1776,28 @@ pmap_do_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva, int flags)
#endif
}
pmap_remove_ptes(pmap, ptp,
- (vaddr_t)&ptes[pl1_i(sva)], sva, blkendva, &cpumask, flags);
+ (vaddr_t)&ptes[pl1_i(va)], va, blkendva, flags);
/* if PTP is no longer being used, free it! */
if (ptp && ptp->wire_count <= 1) {
- pmap_free_ptp(pmap, ptp, sva, ptes,pdes,
- &cpumask);
+ pmap_free_ptp(pmap, ptp, va, ptes, pdes, &empty_ptps);
}
}
- pmap_tlb_shootnow(cpumask);
+ if (shootall)
+ pmap_tlb_shoottlb();
+ else
+ pmap_tlb_shootrange(pmap, sva, eva);
+
+ pmap_tlb_shootwait();
+
pmap_unmap_ptes(pmap);
PMAP_MAP_TO_HEAD_UNLOCK();
+
+ while ((ptp = TAILQ_FIRST(&empty_ptps)) != NULL) {
+ TAILQ_REMOVE(&empty_ptps, ptp, listq);
+ uvm_pagefree(ptp);
+ }
}
/*
@@ -1910,7 +1815,10 @@ pmap_page_remove(struct vm_page *pg)
#ifdef DIAGNOSTIC
pd_entry_t pde;
#endif
- int32_t cpumask = 0;
+ struct pg_to_free empty_ptps;
+ struct vm_page *ptp;
+
+ TAILQ_INIT(&empty_ptps);
PMAP_HEAD_TO_MAP_LOCK();
@@ -1940,7 +1848,7 @@ pmap_page_remove(struct vm_page *pg)
pve->pv_pmap->pm_stats.wired_count--;
pve->pv_pmap->pm_stats.resident_count--;
- pmap_tlb_shootdown(pve->pv_pmap, pve->pv_va, opte, &cpumask);
+ pmap_tlb_shootpage(pve->pv_pmap, pve->pv_va);
pmap_sync_flags_pte(pg, opte);
@@ -1949,7 +1857,7 @@ pmap_page_remove(struct vm_page *pg)
pve->pv_ptp->wire_count--;
if (pve->pv_ptp->wire_count <= 1) {
pmap_free_ptp(pve->pv_pmap, pve->pv_ptp,
- pve->pv_va, ptes, pdes, &cpumask);
+ pve->pv_va, ptes, pdes, &empty_ptps);
}
}
pmap_unmap_ptes(pve->pv_pmap); /* unlocks pmap */
@@ -1957,7 +1865,12 @@ pmap_page_remove(struct vm_page *pg)
}
PMAP_HEAD_TO_MAP_UNLOCK();
- pmap_tlb_shootnow(cpumask);
+ pmap_tlb_shootwait();
+
+ while ((ptp = TAILQ_FIRST(&empty_ptps)) != NULL) {
+ TAILQ_REMOVE(&empty_ptps, ptp, listq);
+ uvm_pagefree(ptp);
+ }
}
/*
@@ -2015,7 +1928,6 @@ pmap_clear_attrs(struct vm_page *pg, unsigned long clearbits)
struct pv_entry *pve;
pt_entry_t *ptes, opte;
pd_entry_t **pdes;
- int32_t cpumask = 0;
u_long clearflags;
int result;
@@ -2040,15 +1952,14 @@ pmap_clear_attrs(struct vm_page *pg, unsigned long clearbits)
result = 1;
pmap_pte_clearbits(&ptes[pl1_i(pve->pv_va)],
(opte & clearbits));
- pmap_tlb_shootdown(pve->pv_pmap, pve->pv_va, opte,
- &cpumask);
+ pmap_tlb_shootpage(pve->pv_pmap, pve->pv_va);
}
pmap_unmap_ptes(pve->pv_pmap); /* unlocks pmap */
}
PMAP_HEAD_TO_MAP_UNLOCK();
- pmap_tlb_shootnow(cpumask);
+ pmap_tlb_shootwait();
return (result != 0);
}
@@ -2084,7 +1995,8 @@ pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
pt_entry_t nx, opte, *ptes, *spte, *epte;
pd_entry_t **pdes;
vaddr_t blockend;
- int32_t cpumask = 0;
+ int shootall = 0;
+ vaddr_t va;
pmap_map_ptes(pmap, &ptes, &pdes); /* locks pmap */
@@ -2096,9 +2008,11 @@ pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
if ((cpu_feature & CPUID_NXE) && !(prot & VM_PROT_EXECUTE))
nx = PG_NX;
- for (/* null */ ; sva < eva ; sva = blockend) {
+ if ((eva - sva > 32 * PAGE_SIZE) && pmap != pmap_kernel())
+ shootall = 1;
- blockend = (sva & L2_FRAME) + NBPD_L2;
+ for (va = sva; va < eva ; va = blockend) {
+ blockend = (va & L2_FRAME) + NBPD_L2;
if (blockend > eva)
blockend = eva;
@@ -2112,19 +2026,19 @@ pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
*/
/* XXXCDC: ugly hack to avoid freeing PDP here */
- if (pl_i(sva, PTP_LEVELS) == PDIR_SLOT_PTE)
+ if (pl_i(va, PTP_LEVELS) == PDIR_SLOT_PTE)
continue;
/* empty block? */
- if (!pmap_pdes_valid(sva, pdes, NULL))
+ if (!pmap_pdes_valid(va, pdes, NULL))
continue;
#ifdef DIAGNOSTIC
- if (sva >= VM_MAXUSER_ADDRESS && sva < VM_MAX_ADDRESS)
+ if (va >= VM_MAXUSER_ADDRESS && va < VM_MAX_ADDRESS)
panic("pmap_write_protect: PTE space");
#endif
- spte = &ptes[pl1_i(sva)];
+ spte = &ptes[pl1_i(va)];
epte = &ptes[pl1_i(blockend)];
for (/*null */; spte < epte ; spte++) {
@@ -2133,13 +2047,16 @@ pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
opte = *spte;
pmap_pte_clearbits(spte, PG_RW);
pmap_pte_setbits(spte, nx);
- if (opte != *spte)
- pmap_tlb_shootdown(pmap, ptoa(spte - ptes),
- *spte, &cpumask);
}
}
- pmap_tlb_shootnow(cpumask);
+ if (shootall)
+ pmap_tlb_shoottlb();
+ else
+ pmap_tlb_shootrange(pmap, sva, eva);
+
+ pmap_tlb_shootwait();
+
pmap_unmap_ptes(pmap); /* unlocks pmap */
}
@@ -2413,17 +2330,9 @@ enter_now:
* If we changed anything other than modified/used bits,
* flush the TLB. (is this overkill?)
*/
- if ((opte & ~(PG_M|PG_U)) != npte) {
-#if defined(MULTIPROCESSOR)
- int32_t cpumask = 0;
-
- pmap_tlb_shootdown(pmap, va, opte, &cpumask);
- pmap_tlb_shootnow(cpumask);
-#else
- /* Don't bother deferring in the single CPU case. */
- if (pmap_is_curpmap(pmap))
- pmap_update_pg(va);
-#endif
+ if (opte & PG_V) {
+ pmap_tlb_shootpage(pmap, va);
+ pmap_tlb_shootwait();
}
error = 0;
@@ -2632,276 +2541,186 @@ pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
}
#endif
-/******************** TLB shootdown code ********************/
-
-
void
-pmap_tlb_shootnow(int32_t cpumask)
+pmap_virtual_space(vaddr_t *vstartp, vaddr_t *vendp)
{
+ *vstartp = virtual_avail;
+ *vendp = VM_MAX_KERNEL_ADDRESS;
+}
+
#ifdef MULTIPROCESSOR
- struct cpu_info *ci, *self;
- CPU_INFO_ITERATOR cii;
- int s;
-#ifdef DIAGNOSTIC
- int count = 0;
-#endif
-#endif
+/*
+ * Locking for tlb shootdown.
+ *
+ * We lock by setting tlb_shoot_wait to the number of cpus that will
+ * receive our tlb shootdown. After sending the IPIs, we don't need to
+ * worry about locking order or interrupts spinning for the lock because
+ * the call that grabs the "lock" isn't the one that releases it. And
+ * there is nothing that can block the IPI that releases the lock.
+ *
+ * The functions are organized so that we first count the number of
+ * cpus we need to send the IPI to, then we grab the counter, then
+ * we send the IPIs, then we finally do our own shootdown.
+ *
+ * Our shootdown is last to make it parallel with the other cpus
+ * to shorten the spin time.
+ *
+ * Notice that we depend on failures to send IPIs only being able to
+ * happen during boot. If they happen later, the above assumption
+ * doesn't hold since we can end up in situations where noone will
+ * release the lock if we get an interrupt in a bad moment.
+ */
- if (cpumask == 0)
- return;
+volatile long tlb_shoot_wait;
-#ifdef MULTIPROCESSOR
- self = curcpu();
- s = splipi();
- self->ci_tlb_ipi_mask = cpumask;
-#endif
+volatile vaddr_t tlb_shoot_addr1;
+volatile vaddr_t tlb_shoot_addr2;
- pmap_do_tlb_shootdown(0); /* do *our* work. */
+/* XXX */
+#define SPINLOCK_SPIN_HOOK __asm __volatile("pause": : :"memory")
-#ifdef MULTIPROCESSOR
- splx(s);
+void
+pmap_tlb_shootpage(struct pmap *pm, vaddr_t va)
+{
+ struct cpu_info *ci, *self = curcpu();
+ CPU_INFO_ITERATOR cii;
+ long wait = 0;
+ int mask = 0;
- /*
- * Send the TLB IPI to other CPUs pending shootdowns.
- */
CPU_INFO_FOREACH(cii, ci) {
- if (ci == self)
+ if (ci == self || !pmap_is_active(pm, ci->ci_cpuid) ||
+ !(ci->ci_flags & CPUF_RUNNING))
continue;
- if (cpumask & (1U << ci->ci_cpuid))
- if (x86_send_ipi(ci, X86_IPI_TLB) != 0)
- x86_atomic_clearbits_ul(&self->ci_tlb_ipi_mask,
- (1U << ci->ci_cpuid));
+ mask |= 1 << ci->ci_cpuid;
+ wait++;
}
- while (self->ci_tlb_ipi_mask != 0)
-#ifdef DIAGNOSTIC
- if (count++ > 1000000000)
- panic("TLB IPI rendezvous failed (mask %x)",
- self->ci_tlb_ipi_mask);
-#else
- /* XXX insert pause instruction */
- ;
-#endif
-#endif
+ if (wait > 0) {
+ int s = splvm();
+
+ while (x86_atomic_cas_ul(&tlb_shoot_wait, 0, wait) != 0) {
+ while (tlb_shoot_wait != 0)
+ SPINLOCK_SPIN_HOOK;
+ }
+ tlb_shoot_addr1 = va;
+ CPU_INFO_FOREACH(cii, ci) {
+ if ((mask & 1 << ci->ci_cpuid) == 0)
+ continue;
+ if (x86_fast_ipi(ci, LAPIC_IPI_INVLPG) != 0)
+ panic("pmap_tlb_shootpage: ipi failed");
+ }
+ splx(s);
+ }
+
+ if (pmap_is_curpmap(pm))
+ pmap_update_pg(va);
}
-/*
- * pmap_tlb_shootdown:
- *
- * Cause the TLB entry for pmap/va to be shot down.
- */
void
-pmap_tlb_shootdown(pmap_t pmap, vaddr_t va, pt_entry_t pte, int32_t *cpumaskp)
+pmap_tlb_shootrange(struct pmap *pm, vaddr_t sva, vaddr_t eva)
{
struct cpu_info *ci, *self = curcpu();
- struct pmap_tlb_shootdown_q *pq;
- struct pmap_tlb_shootdown_job *pj;
CPU_INFO_ITERATOR cii;
-
-#ifdef LARGEPAGES
- if (pte & PG_PS)
- va &= PG_LGFRAME;
-#endif
-
- if (pmap_initialized == FALSE || cpus_attached == 0) {
- pmap_update_pg(va);
- return;
- }
-
-#if 0
- printf("doshootdown %lx\n", va);
-#endif
+ long wait = 0;
+ int mask = 0;
+ vaddr_t va;
CPU_INFO_FOREACH(cii, ci) {
- /* Note: we queue shootdown events for ourselves here! */
- if (pmap_is_active(pmap, ci->ci_cpuid) == 0)
- continue;
- if (ci != self && !(ci->ci_flags & CPUF_RUNNING))
+ if (ci == self || !pmap_is_active(pm, ci->ci_cpuid) ||
+ !(ci->ci_flags & CPUF_RUNNING))
continue;
- pq = &pmap_tlb_shootdown_q[ci->ci_cpuid];
- mtx_enter(&pq->pq_mutex);
+ mask |= 1 << ci->ci_cpuid;
+ wait++;
+ }
- /*
- * If there's a global flush already queued, or a
- * non-global flush, and this pte doesn't have the G
- * bit set, don't bother.
- */
- if (pq->pq_flushg > 0 ||
- (pq->pq_flushu > 0 && (pte & pmap_pg_g) == 0)) {
- mtx_leave(&pq->pq_mutex);
- continue;
- }
+ if (wait > 0) {
+ int s = splvm();
- pj = pmap_tlb_shootdown_job_get(pq);
- pq->pq_pte |= pte;
- if (pj == NULL) {
- /*
- * Couldn't allocate a job entry.
- * Kill it now for this cpu, unless the failure
- * was due to too many pending flushes; otherwise,
- * tell other cpus to kill everything..
- */
- if (ci == self && pq->pq_count < PMAP_TLB_MAXJOBS) {
- pmap_update_pg(va);
- } else {
- if (pq->pq_pte & pmap_pg_g)
- pq->pq_flushg++;
- else
- pq->pq_flushu++;
- /*
- * Since we've nailed the whole thing,
- * drain the job entries pending for that
- * processor.
- */
- pmap_tlb_shootdown_q_drain(pq);
- *cpumaskp |= 1U << ci->ci_cpuid;
- }
- } else {
- pj->pj_pmap = pmap;
- pj->pj_va = va;
- pj->pj_pte = pte;
- TAILQ_INSERT_TAIL(&pq->pq_head, pj, pj_list);
- *cpumaskp |= 1U << ci->ci_cpuid;
+ while (x86_atomic_cas_ul(&tlb_shoot_wait, 0, wait) != 0) {
+ while (tlb_shoot_wait != 0)
+ SPINLOCK_SPIN_HOOK;
+ }
+ tlb_shoot_addr1 = sva;
+ tlb_shoot_addr2 = eva;
+ CPU_INFO_FOREACH(cii, ci) {
+ if ((mask & 1 << ci->ci_cpuid) == 0)
+ continue;
+ if (x86_fast_ipi(ci, LAPIC_IPI_INVLRANGE) != 0)
+ panic("pmap_tlb_shootrange: ipi failed");
}
- mtx_leave(&pq->pq_mutex);
+ splx(s);
}
+
+ if (pmap_is_curpmap(pm))
+ for (va = sva; va < eva; va += PAGE_SIZE)
+ pmap_update_pg(va);
}
-/*
- * pmap_do_tlb_shootdown:
- *
- * Process pending TLB shootdown operations for this processor.
- */
void
-pmap_do_tlb_shootdown(struct cpu_info *self)
+pmap_tlb_shoottlb(void)
{
- u_long cpu_id = cpu_number();
- struct pmap_tlb_shootdown_q *pq = &pmap_tlb_shootdown_q[cpu_id];
- struct pmap_tlb_shootdown_job *pj;
-#ifdef MULTIPROCESSOR
- struct cpu_info *ci;
+ struct cpu_info *ci, *self = curcpu();
CPU_INFO_ITERATOR cii;
-#endif
-
- mtx_enter(&pq->pq_mutex);
+ long wait = 0;
+ int mask = 0;
- if (pq->pq_flushg) {
- COUNT(flushg);
- tlbflushg();
- pq->pq_flushg = 0;
- pq->pq_flushu = 0;
- pmap_tlb_shootdown_q_drain(pq);
- } else {
- /*
- * TLB flushes for PTEs with PG_G set may be in the queue
- * after a flushu, they need to be dealt with.
- */
- if (pq->pq_flushu) {
- COUNT(flushu);
- tlbflush();
- }
- while ((pj = TAILQ_FIRST(&pq->pq_head)) != NULL) {
- TAILQ_REMOVE(&pq->pq_head, pj, pj_list);
+ CPU_INFO_FOREACH(cii, ci) {
+ if (ci == self || !(ci->ci_flags & CPUF_RUNNING))
+ continue;
+ mask |= 1 << ci->ci_cpuid;
+ wait++;
+ }
- if ((!pq->pq_flushu && pmap_is_curpmap(pj->pj_pmap)) ||
- (pj->pj_pte & pmap_pg_g))
- pmap_update_pg(pj->pj_va);
+ if (wait) {
+ int s = splvm();
- pmap_tlb_shootdown_job_put(pq, pj);
+ while (x86_atomic_cas_ul(&tlb_shoot_wait, 0, wait) != 0) {
+ while (tlb_shoot_wait != 0)
+ SPINLOCK_SPIN_HOOK;
}
- pq->pq_flushu = pq->pq_pte = 0;
+ CPU_INFO_FOREACH(cii, ci) {
+ if ((mask & 1 << ci->ci_cpuid) == 0)
+ continue;
+ if (x86_fast_ipi(ci, LAPIC_IPI_INVLTLB) != 0)
+ panic("pmap_tlb_shoottlb: ipi failed");
+ }
+ splx(s);
}
-#ifdef MULTIPROCESSOR
- CPU_INFO_FOREACH(cii, ci)
- x86_atomic_clearbits_ul(&ci->ci_tlb_ipi_mask,
- (1U << cpu_id));
-#endif
- mtx_leave(&pq->pq_mutex);
+ tlbflush();
}
-
-/*
- * pmap_tlb_shootdown_q_drain:
- *
- * Drain a processor's TLB shootdown queue. We do not perform
- * the shootdown operations. This is merely a convenience
- * function.
- *
- * Note: We expect the queue to be locked.
- */
void
-pmap_tlb_shootdown_q_drain(struct pmap_tlb_shootdown_q *pq)
+pmap_tlb_shootwait(void)
{
- struct pmap_tlb_shootdown_job *pj;
-
- while ((pj = TAILQ_FIRST(&pq->pq_head)) != NULL) {
- TAILQ_REMOVE(&pq->pq_head, pj, pj_list);
- pmap_tlb_shootdown_job_put(pq, pj);
- }
- pq->pq_pte = 0;
+ while (tlb_shoot_wait != 0)
+ SPINLOCK_SPIN_HOOK;
}
-/*
- * pmap_tlb_shootdown_job_get:
- *
- * Get a TLB shootdown job queue entry. This places a limit on
- * the number of outstanding jobs a processor may have.
- *
- * Note: We expect the queue to be locked.
- */
-struct pmap_tlb_shootdown_job *
-pmap_tlb_shootdown_job_get(struct pmap_tlb_shootdown_q *pq)
-{
- struct pmap_tlb_shootdown_job *pj;
-
- if (pq->pq_count >= PMAP_TLB_MAXJOBS)
- return (NULL);
-
- mtx_enter(&pmap_tlb_shootdown_job_mutex);
-
- if (pj_free == NULL) {
- mtx_leave(&pmap_tlb_shootdown_job_mutex);
- return NULL;
- }
- pj = &pj_free->pja_job;
- pj_free =
- (union pmap_tlb_shootdown_job_al *)pj_free->pja_job.pj_nextfree;
+#else
- mtx_leave(&pmap_tlb_shootdown_job_mutex);
+void
+pmap_tlb_shootpage(struct pmap *pm, vaddr_t va)
+{
+ if (pmap_is_curpmap(pm))
+ pmap_update_pg(va);
- pq->pq_count++;
- return (pj);
}
-/*
- * pmap_tlb_shootdown_job_put:
- *
- * Put a TLB shootdown job queue entry onto the free list.
- *
- * Note: We expect the queue to be locked.
- */
void
-pmap_tlb_shootdown_job_put(struct pmap_tlb_shootdown_q *pq,
- struct pmap_tlb_shootdown_job *pj)
+pmap_tlb_shootrange(struct pmap *pm, vaddr_t sva, vaddr_t eva)
{
+ vaddr_t va;
-#ifdef DIAGNOSTIC
- if (pq->pq_count == 0)
- panic("pmap_tlb_shootdown_job_put: queue length inconsistency");
-#endif
- mtx_enter(&pmap_tlb_shootdown_job_mutex);
- pj->pj_nextfree = &pj_free->pja_job;
- pj_free = (union pmap_tlb_shootdown_job_al *)pj;
- mtx_leave(&pmap_tlb_shootdown_job_mutex);
+ for (va = sva; va < eva; va += PAGE_SIZE)
+ pmap_update_pg(va);
- pq->pq_count--;
}
void
-pmap_virtual_space(vaddr_t *vstartp, vaddr_t *vendp)
+pmap_tlb_shoottlb(void)
{
- *vstartp = virtual_avail;
- *vendp = VM_MAX_KERNEL_ADDRESS;
+ tlbflush();
}
+#endif /* MULTIPROCESSOR */
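
The comment block in the hunk above describes the rendezvous informally; reduced to its essentials, the protocol behind pmap_tlb_shootpage() is the sketch below. This is a sketch only: cpu_count_targets(), send_fast_ipi() and cpu_pause() are illustrative stand-ins for the CPU_INFO_FOREACH loop, the x86_fast_ipi() calls and SPINLOCK_SPIN_HOOK, and the splvm()/splx() bracket of the real code is elided.

	volatile long tlb_shoot_wait;		/* CPUs yet to acknowledge */
	volatile vaddr_t tlb_shoot_addr1;	/* address being shot */

	void
	shootpage_sketch(struct pmap *pm, vaddr_t va)
	{
		int mask;
		long wait = cpu_count_targets(pm, &mask);	/* stand-in */

		if (wait > 0) {
			/*
			 * "Grab the lock": the CAS succeeds only once the
			 * previous shootdown has been fully acknowledged.
			 */
			while (x86_atomic_cas_ul(&tlb_shoot_wait, 0, wait) != 0)
				while (tlb_shoot_wait != 0)
					cpu_pause();	/* stand-in */
			tlb_shoot_addr1 = va;	/* publish before the IPIs */
			send_fast_ipi(mask, LAPIC_IPI_INVLPG);	/* stand-in */
		}
		if (pmap_is_curpmap(pm))
			pmap_update_pg(va);	/* our own flush runs last,
						 * in parallel with the rest */
	}

Note that nothing on the initiating side releases tlb_shoot_wait: each interrupt handler decrements it, which is why pmap_tlb_shootwait() can simply spin until the counter reaches zero.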
diff --git a/sys/arch/amd64/amd64/vector.S b/sys/arch/amd64/amd64/vector.S
index e77ef0a45b7..465c2c00775 100644
--- a/sys/arch/amd64/amd64/vector.S
+++ b/sys/arch/amd64/amd64/vector.S
@@ -1,4 +1,4 @@
-/* $OpenBSD: vector.S,v 1.8 2007/05/10 17:59:23 deraadt Exp $ */
+/* $OpenBSD: vector.S,v 1.9 2007/05/25 16:22:11 art Exp $ */
/* $NetBSD: vector.S,v 1.5 2004/06/28 09:13:11 fvdl Exp $ */
/*
@@ -318,6 +318,54 @@ IDTVEC(resume_lapic_ipi)
sti
INTRFASTEXIT
+IDTVEC(ipi_invltlb)
+ pushq %rax
+
+ ioapic_asm_ack()
+
+ movq %cr3, %rax
+ movq %rax, %cr3
+
+ lock
+ decq tlb_shoot_wait
+
+ popq %rax
+ iretq
+
+IDTVEC(ipi_invlpg)
+ pushq %rax
+
+ ioapic_asm_ack()
+
+ movq tlb_shoot_addr1, %rax
+ invlpg (%rax)
+
+ lock
+ decq tlb_shoot_wait
+
+ popq %rax
+ iretq
+
+IDTVEC(ipi_invlrange)
+ pushq %rax
+ pushq %rdx
+
+ ioapic_asm_ack()
+
+ movq tlb_shoot_addr1, %rax
+ movq tlb_shoot_addr2, %rdx
+1: invlpg (%rax)
+ addq $PAGE_SIZE, %rax
+ cmpq %rdx, %rax
+ jb 1b
+
+ lock
+ decq tlb_shoot_wait
+
+ popq %rdx
+ popq %rax
+ iretq
+
#endif /* MULTIPROCESSOR */
/*
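
The three handlers are deliberately lock-free: each acknowledges the interrupt, performs its invalidation, then releases the spinning initiator with a single lock decq of tlb_shoot_wait. In C terms, Xipi_invlrange above behaves roughly as follows (a sketch: interrupt entry/exit and register saves are the assembly's job, and lapic_ack(), invlpg_local() and atomic_dec_long() are illustrative names, not kernel interfaces):

	void
	ipi_invlrange_sketch(void)
	{
		vaddr_t va;

		lapic_ack();			/* ioapic_asm_ack() */
		for (va = tlb_shoot_addr1; va < tlb_shoot_addr2;
		    va += PAGE_SIZE)
			invlpg_local(va);	/* one invlpg per page */
		atomic_dec_long(&tlb_shoot_wait);	/* lock decq */
	}

This is also why genassym.cf now exports PAGE_SIZE: the loop increment in the assembly needs it as an immediate.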
diff --git a/sys/arch/amd64/amd64/vm_machdep.c b/sys/arch/amd64/amd64/vm_machdep.c
index 34932f46dbf..1d0932858b7 100644
--- a/sys/arch/amd64/amd64/vm_machdep.c
+++ b/sys/arch/amd64/amd64/vm_machdep.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: vm_machdep.c,v 1.8 2006/11/29 12:26:13 miod Exp $ */
+/* $OpenBSD: vm_machdep.c,v 1.9 2007/05/25 16:22:11 art Exp $ */
/* $NetBSD: vm_machdep.c,v 1.1 2003/04/26 18:39:33 fvdl Exp $ */
/*-
@@ -262,12 +262,18 @@ void
pagemove(caddr_t from, caddr_t to, size_t size)
{
pt_entry_t *fpte, *tpte, ofpte, otpte;
- int32_t cpumask = 0;
+ vaddr_t fsva, tsva, feva, teva;
#ifdef DIAGNOSTIC
if ((size & PAGE_MASK) != 0)
panic("pagemove");
#endif
+
+ fsva = (vaddr_t)from;
+ tsva = (vaddr_t)to;
+ feva = fsva + size;
+ teva = tsva + size;
+
fpte = kvtopte((vaddr_t)from);
tpte = kvtopte((vaddr_t)to);
#ifdef LARGEPAGES
@@ -282,17 +288,13 @@ pagemove(caddr_t from, caddr_t to, size_t size)
ofpte = *fpte;
*tpte++ = *fpte;
*fpte++ = 0;
- if (otpte & PG_V)
- pmap_tlb_shootdown(pmap_kernel(),
- (vaddr_t)to, otpte, &cpumask);
- if (ofpte & PG_V)
- pmap_tlb_shootdown(pmap_kernel(),
- (vaddr_t)from, ofpte, &cpumask);
from += PAGE_SIZE;
to += PAGE_SIZE;
size -= PAGE_SIZE;
}
- pmap_tlb_shootnow(cpumask);
+ pmap_tlb_shootrange(pmap_kernel(), fsva, feva);
+ pmap_tlb_shootrange(pmap_kernel(), tsva, teva);
+ pmap_tlb_shootwait();
}
/*
diff --git a/sys/arch/amd64/include/atomic.h b/sys/arch/amd64/include/atomic.h
index db184d78122..1df5a74ef2c 100644
--- a/sys/arch/amd64/include/atomic.h
+++ b/sys/arch/amd64/include/atomic.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: atomic.h,v 1.5 2007/02/19 17:18:42 deraadt Exp $ */
+/* $OpenBSD: atomic.h,v 1.6 2007/05/25 16:22:11 art Exp $ */
/* $NetBSD: atomic.h,v 1.1 2003/04/26 18:39:37 fvdl Exp $ */
/*
@@ -90,6 +90,16 @@ x86_atomic_clearbits_u32(volatile u_int32_t *ptr, u_int32_t bits)
__asm __volatile(LOCK " andl %1,%0" : "=m" (*ptr) : "ir" (~bits));
}
+static __inline u_long
+x86_atomic_cas_ul(volatile u_long *ptr, u_long expect, u_long set)
+{
+ u_long res;
+
+ __asm volatile(LOCK " cmpxchgq %2, %1" : "=a" (res), "=m" (*ptr)
+ : "r" (set), "a" (expect), "m" (*ptr) : "memory");
+
+ return (res);
+}
/*
* XXX XXX XXX
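
x86_atomic_cas_ul() follows the cmpxchg convention: the return value is whatever was in *ptr when the instruction executed, so the swap took place exactly when the return value equals expect. The shootdown code relies on this to use the function as a try-lock:

	/* Claim the idle (zero) counter for this shootdown. */
	while (x86_atomic_cas_ul(&tlb_shoot_wait, 0, wait) != 0)
		;	/* still owned by an unfinished shootdown */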
diff --git a/sys/arch/amd64/include/i82489var.h b/sys/arch/amd64/include/i82489var.h
index ebcf439f2df..1ae48071abb 100644
--- a/sys/arch/amd64/include/i82489var.h
+++ b/sys/arch/amd64/include/i82489var.h
@@ -87,6 +87,18 @@ extern void Xresume_lapic_ipi(void);
#define LAPIC_IPI_VECTOR 0xe0
/*
+ * We take 0xf0-0xfe for fast IPI handlers.
+ */
+#define LAPIC_IPI_OFFSET 0xf0
+#define LAPIC_IPI_INVLTLB (LAPIC_IPI_OFFSET + 0)
+#define LAPIC_IPI_INVLPG (LAPIC_IPI_OFFSET + 1)
+#define LAPIC_IPI_INVLRANGE (LAPIC_IPI_OFFSET + 2)
+
+extern void Xipi_invltlb(void);
+extern void Xipi_invlpg(void);
+extern void Xipi_invlrange(void);
+
+/*
* Vector used for local apic timer interrupts.
*/
diff --git a/sys/arch/amd64/include/intr.h b/sys/arch/amd64/include/intr.h
index a658b1c525d..17bbb6a115b 100644
--- a/sys/arch/amd64/include/intr.h
+++ b/sys/arch/amd64/include/intr.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: intr.h,v 1.10 2007/05/16 19:37:06 thib Exp $ */
+/* $OpenBSD: intr.h,v 1.11 2007/05/25 16:22:11 art Exp $ */
/* $NetBSD: intr.h,v 1.2 2003/05/04 22:01:56 fvdl Exp $ */
/*-
@@ -223,6 +223,7 @@ void intr_printconfig(void);
#ifdef MULTIPROCESSOR
int x86_send_ipi(struct cpu_info *, int);
+int x86_fast_ipi(struct cpu_info *, int);
void x86_broadcast_ipi(int);
void x86_multicast_ipi(int, int);
void x86_ipi_handler(void);
diff --git a/sys/arch/amd64/include/pmap.h b/sys/arch/amd64/include/pmap.h
index 6a53ba4384a..b06e2528580 100644
--- a/sys/arch/amd64/include/pmap.h
+++ b/sys/arch/amd64/include/pmap.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: pmap.h,v 1.11 2007/05/15 16:38:33 art Exp $ */
+/* $OpenBSD: pmap.h,v 1.12 2007/05/25 16:22:11 art Exp $ */
/* $NetBSD: pmap.h,v 1.1 2003/04/26 18:39:46 fvdl Exp $ */
/*
@@ -412,9 +412,15 @@ void pmap_write_protect(struct pmap *, vaddr_t,
vaddr_t reserve_dumppages(vaddr_t); /* XXX: not a pmap fn */
-void pmap_tlb_shootdown(pmap_t, vaddr_t, pt_entry_t, int32_t *);
-void pmap_tlb_shootnow(int32_t);
-void pmap_do_tlb_shootdown(struct cpu_info *);
+void pmap_tlb_shootpage(struct pmap *, vaddr_t);
+void pmap_tlb_shootrange(struct pmap *, vaddr_t, vaddr_t);
+void pmap_tlb_shoottlb(void);
+#ifdef MULTIPROCESSOR
+void pmap_tlb_shootwait(void);
+#else
+#define pmap_tlb_shootwait()
+#endif
+
void pmap_prealloc_lowmem_ptps(void);
void pagezero(vaddr_t);
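
The replacement interface thus boils down to three invalidation calls plus one wait, with the wait compiling away on uniprocessor kernels through the empty #define above. A typical caller, modeled on the new pmap_kremove() earlier in this commit, looks something like this sketch (kremove_sketch is an illustrative name, not part of the commit):

	void
	kremove_sketch(vaddr_t sva, vsize_t len)
	{
		vaddr_t va, eva = sva + len;

		for (va = sva; va < eva; va += PAGE_SIZE)
			pmap_pte_set(kvtopte(va), 0);	/* clear PTEs first */

		/* One ranged shootdown for the whole region, then wait. */
		pmap_tlb_shootrange(pmap_kernel(), sva, eva);
		pmap_tlb_shootwait();
	}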