diff options
-rw-r--r-- | sys/conf/files | 3 | ||||
-rw-r--r-- | sys/dev/softraid.c | 23 | ||||
-rw-r--r-- | sys/dev/softraid_raid6.c | 1038 | ||||
-rw-r--r-- | sys/dev/softraidvar.h | 13 |
4 files changed, 1074 insertions, 3 deletions
diff --git a/sys/conf/files b/sys/conf/files index 28e45437fd7..83c1e1d5028 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -1,4 +1,4 @@ -# $OpenBSD: files,v 1.467 2009/06/17 01:30:30 thib Exp $ +# $OpenBSD: files,v 1.468 2009/07/23 15:15:25 jordan Exp $ # $NetBSD: files,v 1.87 1996/05/19 17:17:50 jonathan Exp $ # @(#)files.newconf 7.5 (Berkeley) 5/10/93 @@ -458,6 +458,7 @@ file dev/softraid_raid1.c softraid file dev/softraid_raidp.c softraid file dev/softraid_crypto.c softraid & crypto file dev/softraid_aoe.c softraid & ether & aoe +file dev/softraid_raid6.c softraid # legitimate pseudo-devices pseudo-device vnd: disk diff --git a/sys/dev/softraid.c b/sys/dev/softraid.c index 6831c675b7d..1efe2c1a947 100644 --- a/sys/dev/softraid.c +++ b/sys/dev/softraid.c @@ -1,4 +1,4 @@ -/* $OpenBSD: softraid.c,v 1.167 2009/07/12 21:48:03 jsing Exp $ */ +/* $OpenBSD: softraid.c,v 1.168 2009/07/23 15:15:25 jordan Exp $ */ /* * Copyright (c) 2007, 2008, 2009 Marco Peereboom <marco@peereboom.us> * Copyright (c) 2008 Chris Kuethe <ckuethe@openbsd.org> @@ -2714,6 +2714,23 @@ sr_ioctl_createraid(struct sr_softc *sc, struct bioc_createraid *bc, int user) (ch_entry->src_meta.scmi.scm_coerced_size & ~((strip_size >> DEV_BSHIFT) - 1)) * (no_chunk - 1); break; +#ifdef not_yet + case 6: + if (no_chunk < 4) + goto unwind; + strlcpy(sd->sd_name, "RAID 6", + sizeof(sd->sd_name)); + /* + * XXX add variable strip size later even though + * MAXPHYS is really the clever value, users like + * to tinker with that type of stuff + */ + strip_size = MAXPHYS; + vol_size = + (ch_entry->src_meta.scmi.scm_coerced_size & + ~((strip_size >> DEV_BSHIFT) - 1)) * (no_chunk - 2); + break; +#endif /* not_yet */ #ifdef AOE #ifdef not_yet case 'A': @@ -3125,6 +3142,10 @@ sr_discipline_init(struct sr_discipline *sd, int level) sd->sd_type = SR_MD_RAID5; sr_raidp_discipline_init(sd); break; + case 6: + sd->sd_type = SR_MD_RAID6; + sr_raid6_discipline_init(sd); + break; #ifdef AOE /* AOE target. 
*/ case 'A': diff --git a/sys/dev/softraid_raid6.c b/sys/dev/softraid_raid6.c new file mode 100644 index 00000000000..1ac910993c9 --- /dev/null +++ b/sys/dev/softraid_raid6.c @@ -0,0 +1,1038 @@ +/* $OpenBSD: softraid_raid6.c,v 1.1 2009/07/23 15:15:26 jordan Exp $ */ +/* + * Copyright (c) 2009 Marco Peereboom <marco@peereboom.us> + * Copyright (c) 2009 Jordan Hargrave <jordan@openbsd.org> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include "bio.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/buf.h> +#include <sys/device.h> +#include <sys/ioctl.h> +#include <sys/proc.h> +#include <sys/malloc.h> +#include <sys/kernel.h> +#include <sys/disk.h> +#include <sys/rwlock.h> +#include <sys/queue.h> +#include <sys/fcntl.h> +#include <sys/disklabel.h> +#include <sys/mount.h> +#include <sys/sensors.h> +#include <sys/stat.h> +#include <sys/conf.h> +#include <sys/uio.h> + +#include <scsi/scsi_all.h> +#include <scsi/scsiconf.h> +#include <scsi/scsi_disk.h> + +#include <dev/softraidvar.h> +#include <dev/rndvar.h> + +uint8_t gf_pow[512], gf_log[256]; + +/* RAID 6 functions. 
*/ +int sr_raid6_alloc_resources(struct sr_discipline *); +int sr_raid6_free_resources(struct sr_discipline *); +int sr_raid6_rw(struct sr_workunit *); +int sr_raid6_openings(struct sr_discipline *); +void sr_raid6_intr(struct buf *); +void sr_raid6_recreate_wu(struct sr_workunit *); +void sr_raid6_set_chunk_state(struct sr_discipline *, int, int); +void sr_raid6_set_vol_state(struct sr_discipline *); + +void sr_raid6_xorp(void *, void *, int); +void sr_raid6_xorq(void *, void *, int, int); +int sr_raid6_addio(struct sr_workunit *wu, int, daddr64_t, daddr64_t, + void *, int, int, void *, void *, int); +void sr_dump(void *, int); +void sr_raid6_scrub(struct sr_discipline *); + +void *sr_get_block(struct sr_discipline *, int); +void sr_put_block(struct sr_discipline *, void *); + +void gf_init(void); +uint8_t gf_mul(uint8_t, uint8_t); +uint8_t gf_inv(uint8_t); + +#define SR_NOFAIL 0x00 +#define SR_FAILX (1L << 0) +#define SR_FAILY (1L << 1) +#define SR_FAILP (1L << 2) +#define SR_FAILQ (1L << 3) + +struct sr_raid6_opaque { + int gn; + void *pbuf; + void *qbuf; +}; + +/* discipline initialisation. */ +void +sr_raid6_discipline_init(struct sr_discipline *sd) +{ + /* Initialize GF256 tables */ + gf_init(); + + /* fill out discipline members. */ + sd->sd_max_ccb_per_wu = max(6, 2 * sd->sd_meta->ssdi.ssd_chunk_no); /* only if stripsize <= MAXPHYS */ + sd->sd_max_wu = SR_RAID6_NOWU; + sd->sd_rebuild = 0; + + /* setup discipline pointers. 
*/ + sd->sd_alloc_resources = sr_raid6_alloc_resources; + sd->sd_free_resources = sr_raid6_free_resources; + sd->sd_start_discipline = NULL; + sd->sd_scsi_inquiry = sr_raid_inquiry; + sd->sd_scsi_read_cap = sr_raid_read_cap; + sd->sd_scsi_tur = sr_raid_tur; + sd->sd_scsi_req_sense = sr_raid_request_sense; + sd->sd_scsi_start_stop = sr_raid_start_stop; + sd->sd_scsi_sync = sr_raid_sync; + sd->sd_scsi_rw = sr_raid6_rw; + sd->sd_set_chunk_state = sr_raid6_set_chunk_state; + sd->sd_set_vol_state = sr_raid6_set_vol_state; + sd->sd_openings = sr_raid6_openings; +} + +int +sr_raid6_openings(struct sr_discipline *sd) +{ + return (sd->sd_max_wu >> 1); /* 2 wu's per IO */ +} + +int +sr_raid6_alloc_resources(struct sr_discipline *sd) +{ + int rv = EINVAL; + + if (!sd) + return (rv); + + DNPRINTF(SR_D_DIS, "%s: sr_raid6_alloc_resources\n", + DEVNAME(sd->sd_sc)); + + if (sr_wu_alloc(sd)) + goto bad; + if (sr_ccb_alloc(sd)) + goto bad; + + /* setup runtime values */ + sd->mds.mdd_raid6.sr6_strip_bits = + sr_validate_stripsize(sd->sd_meta->ssdi.ssd_strip_size); + if (sd->mds.mdd_raid6.sr6_strip_bits == -1) + goto bad; + + rv = 0; +bad: + return (rv); +} + +int +sr_raid6_free_resources(struct sr_discipline *sd) +{ + int rv = EINVAL; + + if (!sd) + return (rv); + + DNPRINTF(SR_D_DIS, "%s: sr_raid6_free_resources\n", + DEVNAME(sd->sd_sc)); + + sr_wu_free(sd); + sr_ccb_free(sd); + + rv = 0; + return (rv); +} + +void +sr_raid6_set_chunk_state(struct sr_discipline *sd, int c, int new_state) +{ + int old_state, s; + + /* XXX this is for RAID 0 */ + DNPRINTF(SR_D_STATE, "%s: %s: %s: sr_raid_set_chunk_state %d -> %d\n", + DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, + sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname, c, new_state); + + /* ok to go to splbio since this only happens in error path */ + s = splbio(); + old_state = sd->sd_vol.sv_chunks[c]->src_meta.scm_status; + + /* multiple IOs to the same chunk that fail will come through here */ + if (old_state == new_state) + goto 
done; + + switch (old_state) { + case BIOC_SDONLINE: + switch (new_state) { + case BIOC_SDOFFLINE: + case BIOC_SDSCRUB: + break; + default: + goto die; + } + break; + + case BIOC_SDOFFLINE: + if (new_state == BIOC_SDREBUILD) { + ; + } else + goto die; + break; + + case BIOC_SDSCRUB: + switch (new_state) { + case BIOC_SDONLINE: + case BIOC_SDOFFLINE: + break; + default: + goto die; + } + break; + + case BIOC_SDREBUILD: + switch (new_state) { + case BIOC_SDONLINE: + case BIOC_SDOFFLINE: + break; + default: + goto die; + } + break; + + default: +die: + splx(s); /* XXX */ + panic("%s: %s: %s: invalid chunk state transition " + "%d -> %d\n", DEVNAME(sd->sd_sc), + sd->sd_meta->ssd_devname, + sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname, + old_state, new_state); + /* NOTREACHED */ + } + + sd->sd_vol.sv_chunks[c]->src_meta.scm_status = new_state; + sd->sd_set_vol_state(sd); + + sd->sd_must_flush = 1; + workq_add_task(NULL, 0, sr_meta_save_callback, sd, NULL); +done: + splx(s); +} + +void +sr_raid6_set_vol_state(struct sr_discipline *sd) +{ + int states[SR_MAX_STATES]; + int new_state, i, s, nd; + int old_state = sd->sd_vol_status; + + /* XXX this is for RAID 0 */ + + DNPRINTF(SR_D_STATE, "%s: %s: sr_raid_set_vol_state\n", + DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname); + + nd = sd->sd_meta->ssdi.ssd_chunk_no; + + for (i = 0; i < SR_MAX_STATES; i++) + states[i] = 0; + + for (i = 0; i < nd; i++) { + s = sd->sd_vol.sv_chunks[i]->src_meta.scm_status; + if (s >= SR_MAX_STATES) + panic("%s: %s: %s: invalid chunk state", + DEVNAME(sd->sd_sc), + sd->sd_meta->ssd_devname, + sd->sd_vol.sv_chunks[i]->src_meta.scmi.scm_devname); + states[s]++; + } + + if (states[BIOC_SDONLINE] == nd) + new_state = BIOC_SVONLINE; + else if (states[BIOC_SDONLINE] < nd - 2) + new_state = BIOC_SVOFFLINE; + else if (states[BIOC_SDOFFLINE] == nd - 2) + new_state = BIOC_SVDEGRADED; + else if (states[BIOC_SDSCRUB] != 0) + new_state = BIOC_SVSCRUB; + else if (states[BIOC_SDREBUILD] != 0) + new_state = 
BIOC_SVREBUILD; + else { + printf("old_state = %d, ", old_state); + for (i = 0; i < nd; i++) + printf("%d = %d, ", i, + sd->sd_vol.sv_chunks[i]->src_meta.scm_status); + panic("invalid new_state"); + } + + DNPRINTF(SR_D_STATE, "%s: %s: sr_raid_set_vol_state %d -> %d\n", + DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, + old_state, new_state); + + switch (old_state) { + case BIOC_SVONLINE: + switch (new_state) { + case BIOC_SVONLINE: /* can go to same state */ + case BIOC_SVOFFLINE: + case BIOC_SVDEGRADED: + case BIOC_SVREBUILD: /* happens on boot */ + break; + default: + goto die; + } + break; + + case BIOC_SVOFFLINE: + /* XXX this might be a little too much */ + goto die; + + case BIOC_SVSCRUB: + switch (new_state) { + case BIOC_SVONLINE: + case BIOC_SVOFFLINE: + case BIOC_SVDEGRADED: + case BIOC_SVSCRUB: /* can go to same state */ + break; + default: + goto die; + } + break; + + case BIOC_SVBUILDING: + switch (new_state) { + case BIOC_SVONLINE: + case BIOC_SVOFFLINE: + case BIOC_SVBUILDING: /* can go to the same state */ + break; + default: + goto die; + } + break; + + case BIOC_SVREBUILD: + switch (new_state) { + case BIOC_SVONLINE: + case BIOC_SVOFFLINE: + case BIOC_SVDEGRADED: + case BIOC_SVREBUILD: /* can go to the same state */ + break; + default: + goto die; + } + break; + + case BIOC_SVDEGRADED: + switch (new_state) { + case BIOC_SVOFFLINE: + case BIOC_SVREBUILD: + case BIOC_SVDEGRADED: /* can go to the same state */ + break; + default: + goto die; + } + break; + + default: +die: + panic("%s: %s: invalid volume state transition %d -> %d\n", + DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, + old_state, new_state); + /* NOTREACHED */ + } + + sd->sd_vol_status = new_state; +} + +/* modes: + * readq: sr_raid6_addio(i, lba, length, NULL, SCSI_DATA_IN, + * SR_CCBF_FREEBUF, qbuf, NULL, 0); + * readp: sr_raid6_addio(i, lba, length, NULL, SCSI_DATA_IN, + * SR_CCBF_FREEBUF, pbuf, NULL, 0); + * readx: sr_raid6_addio(i, lba, length, NULL, SCSI_DATA_IN, + * 
SR_CCBF_FREEBUF, pbuf, qbuf, gf_pow[i]); + */ + +int +sr_raid6_rw(struct sr_workunit *wu) +{ + struct sr_workunit *wu_w = NULL; + struct sr_discipline *sd = wu->swu_dis; + struct scsi_xfer *xs = wu->swu_xs; + struct sr_chunk *scp; + int s, fail, i; + daddr64_t blk, lbaoffs, strip_no, chunk, qchunk, pchunk, fchunk; + daddr64_t strip_size, no_chunk, lba, chunk_offs, phys_offs; + daddr64_t strip_bits, length, strip_offs, datalen; + void *pbuf, *data, *qbuf; + + /* blk and scsi error will be handled by sr_validate_io */ + if (sr_validate_io(wu, &blk, "sr_raid6_rw")) + goto bad; + + strip_size = sd->sd_meta->ssdi.ssd_strip_size; + strip_bits = sd->mds.mdd_raid6.sr6_strip_bits; + no_chunk = sd->sd_meta->ssdi.ssd_chunk_no - 2; + + data = xs->data; + datalen = xs->datalen; + lbaoffs = blk << DEV_BSHIFT; + + if (xs->flags & SCSI_DATA_OUT) + /* create write workunit */ + if ((wu_w = sr_wu_get(sd, 0)) == NULL) { + printf("%s: can't get wu_w", DEVNAME(sd->sd_sc)); + goto bad; + } + + wu->swu_blk_start = 0; + while (datalen != 0) { + strip_no = lbaoffs >> strip_bits; + strip_offs = lbaoffs & (strip_size - 1); + chunk_offs = (strip_no / no_chunk) << strip_bits; + phys_offs = chunk_offs + strip_offs + + ((SR_META_OFFSET + SR_META_SIZE) << DEV_BSHIFT); + + /* get size remaining in this stripe */ + length = MIN(strip_size - strip_offs, datalen); + + /* map disk offset to parity/data drive */ + chunk = strip_no % no_chunk; + + qchunk = (no_chunk + 1) - ((strip_no / no_chunk) % (no_chunk+2)); + if (qchunk == 0) + pchunk = no_chunk + 1; + else + pchunk = qchunk - 1; + if (chunk >= pchunk) + chunk++; + if (chunk >= qchunk) + chunk++; + + lba = phys_offs >> DEV_BSHIFT; + + /* XXX big hammer.. 
exclude I/O from entire stripe */ + if (wu->swu_blk_start == 0) + wu->swu_blk_start = chunk_offs >> DEV_BSHIFT; + wu->swu_blk_end = ((chunk_offs + (no_chunk << strip_bits)) >> DEV_BSHIFT) - 1; + + fail = 0; + + /* Get P-fail flag */ + scp = sd->sd_vol.sv_chunks[pchunk]; + switch (scp->src_meta.scm_status) { + case BIOC_SDOFFLINE: + case BIOC_SDREBUILD: + case BIOC_SDHOTSPARE: + fail |= SR_FAILP; + break; + } + + /* Get Q-fail flag */ + scp = sd->sd_vol.sv_chunks[qchunk]; + switch (scp->src_meta.scm_status) { + case BIOC_SDOFFLINE: + case BIOC_SDREBUILD: + case BIOC_SDHOTSPARE: + fail |= SR_FAILQ; + break; + } + + /* Get disk-fail flag */ + scp = sd->sd_vol.sv_chunks[chunk]; + switch (scp->src_meta.scm_status) { + case BIOC_SDOFFLINE: + case BIOC_SDREBUILD: + case BIOC_SDHOTSPARE: + fail |= SR_FAILX; + + /* Check for dual-drive failure */ + if (!(fail & (SR_FAILP|SR_FAILQ)) && + (sd->sd_vol_status == BIOC_SVDEGRADED)) + fail |= SR_FAILY; + break; + } + + if (xs->flags & SCSI_DATA_IN) { + switch (fail) { + case SR_NOFAIL: + /* drive is good. 
issue single read request */ + if (sr_raid6_addio(wu, chunk, lba, length, + data, xs->flags, 0, NULL, NULL, 0)) + goto bad; + break; + case SR_FAILX: + case SR_FAILX+SR_FAILQ: + /* Dx, (Q) failed: Dx = Dz ^ P (same as RAID5) */ + printf("Disk %llx offline, " + "regenerating Dx+Q\n", chunk); + + /* Calculate: Dx = P^Dz + * P: sr_raid6_xorp(data, ---, length); + * Dz: sr_raid6_xorp(data, ---, length); + */ + memset(data, 0, length); + for (i = 0; i < no_chunk+2; i++) { + if (i != chunk && i != qchunk) { + /* Read Dz */ + if (sr_raid6_addio(wu, i, lba, length, + NULL, SCSI_DATA_IN, SR_CCBF_FREEBUF, + data, NULL, 0)) + goto bad; + } + } + break; + case SR_FAILX+SR_FAILP: + /* Dx, P failed */ + printf("Disk %llx offline, " + "regenerating Dx+P\n", chunk); + + pbuf = sr_get_block(sd, length); + if (pbuf == NULL) + goto bad; + + /* Calculate: Dx*gx = Q^(Dz*gz) + * Q: sr_raid6_xorp(data, --, length); + * Dz: sr_raid6_xorq(data, --, length, gf_pow[i]); + */ + memset(data, 0, length); + for (i = 0; i < no_chunk+2; i++) { + if (i == qchunk) { + /* Read Q */ + if (sr_raid6_addio(wu, i, lba, length, + NULL, SCSI_DATA_IN, SR_CCBF_FREEBUF, + pbuf, NULL, 0)) + goto bad; + } else if (i != chunk && i != pchunk) { + /* Read Dz * gz */ + if (sr_raid6_addio(wu, i, lba, length, + NULL, SCSI_DATA_IN, SR_CCBF_FREEBUF, + NULL, pbuf, gf_pow[i])) + goto bad; + } + } + + /* XXX: bag of fail */ + wu->swu_flags |= SR_WUF_FAIL; + sr_raid_startwu(wu); + while ((wu->swu_flags & SR_WUF_FAILIOCOMP) == 0) { + tsleep(wu, PRIBIO, "sr_getdata", 0); + } + + /* On completion, pbuf = Dx*gx */ + sr_raid6_xorq(data, pbuf, length, gf_inv(gf_pow[chunk])); + sr_put_block(sd, pbuf); + + sr_wu_put(wu); + scsi_done(xs); + return(0); + + break; + case SR_FAILX+SR_FAILY: + /* Dx, Dy failed */ + + /* cheat.. 
get other failed drive */ + for (fchunk=0; fchunk<no_chunk+2; fchunk++) { + if (fchunk != chunk && fchunk != qchunk && fchunk != pchunk) + break; + } + + printf("Disk %llx & %llx offline, " + "regenerating Dx+Dy\n", chunk, fchunk); + qbuf = sr_get_block(sd, length); + if (qbuf == NULL) + goto bad; + pbuf = sr_get_block(sd, length); + if (pbuf == NULL) + goto bad; + + /* Calculate: Dx*gx^Dy*gy = Q^(Dz*gz) ; Dx^Dy = P^Dz + * Q: sr_raid6_xorp(qbuf, --, length); + * P: sr_raid6_xorp(pbuf, --, length); + * Dz: sr_raid6_xorp(pbuf, --, length); + * sr_raid6_xorq(qbuf, --, length, gf_pow[i]); + */ + memset(data, 0, length); + for (i = 0; i < no_chunk+2; i++) { + if (i == qchunk) { + /* read Q */ + if (sr_raid6_addio(wu, i, lba, length, + NULL, SCSI_DATA_IN, SR_CCBF_FREEBUF, + qbuf, NULL, 0)) + goto bad; + } else if (i == pchunk) { + /* read P */ + if (sr_raid6_addio(wu, i, lba, length, + NULL, SCSI_DATA_IN, SR_CCBF_FREEBUF, + pbuf, NULL, 0)) + goto bad; + } else if (i != chunk) { + /* read Dz * gz */ + if (sr_raid6_addio(wu, i, lba, length, + NULL, SCSI_DATA_IN, SR_CCBF_FREEBUF, + pbuf, qbuf, gf_pow[i])) + goto bad; + } + } + + + /* XXX: bag of fail */ + wu->swu_flags |= SR_WUF_FAIL; + sr_raid_startwu(wu); + while ((wu->swu_flags & SR_WUF_FAILIOCOMP) == 0) { + tsleep(wu, PRIBIO, "sr_getdata", 0); + } + + /* On completion, pbuf = Dx ^ Dy; qbuf = Dx*gx ^ Dy*gy */ + sr_raid6_xorq(data, qbuf, length, + gf_inv(gf_pow[chunk] ^ gf_pow[fchunk])); + sr_raid6_xorq(data, pbuf, length, + gf_inv(gf_pow[255+chunk-fchunk] ^ 1)); // Dx + + sr_put_block(sd, pbuf); + sr_put_block(sd, qbuf); + + sr_wu_put(wu); + scsi_done(xs); + return(0); + + break; + default: + printf("%s: is offline, can't read\n", + DEVNAME(sd->sd_sc)); + goto bad; + } + } else { + /* XXX handle writes to failed/offline disk? */ + if (scp->src_meta.scm_status == BIOC_SDOFFLINE) + goto bad; + + /* + * initialize pbuf with contents of new data to be + * written. 
This will be XORed with old data and old + * parity in the intr routine. The result in pbuf + * is the new parity data. + */ + qbuf = sr_get_block(sd, length); + if (qbuf == NULL) + goto bad; + + pbuf = sr_get_block(sd, length); + if (pbuf == NULL) + goto bad; + + /* Calulate P = Dn; Q = gn * Dn */ + sr_raid6_xorp(pbuf, data, length); + sr_raid6_xorq(qbuf, data, length, gf_pow[chunk]); + + /* Read old data: P ^= Dn' ; Q ^= (gn * Dn') */ + if (sr_raid6_addio(wu, chunk, lba, length, NULL, + SCSI_DATA_IN, SR_CCBF_FREEBUF, pbuf, qbuf, + gf_pow[chunk])) + goto bad; + + /* Read old xor-parity: P ^= P' */ + if (sr_raid6_addio(wu, pchunk, lba, length, NULL, + SCSI_DATA_IN, SR_CCBF_FREEBUF, pbuf, NULL, 0)) + goto bad; + + /* Read old q-parity: Q ^= Q' */ + if (sr_raid6_addio(wu, qchunk, lba, length, NULL, + SCSI_DATA_IN, SR_CCBF_FREEBUF, qbuf, NULL, 0)) + goto bad; + + /* write new data */ + if (sr_raid6_addio(wu_w, chunk, lba, length, data, + xs->flags, 0, NULL, NULL, 0)) + goto bad; + + /* write new xor-parity */ + if (sr_raid6_addio(wu_w, pchunk, lba, length, pbuf, + xs->flags, SR_CCBF_FREEBUF, NULL, NULL, 0)) + goto bad; + + /* write new q-parity */ + if (sr_raid6_addio(wu_w, qchunk, lba, length, qbuf, + xs->flags, SR_CCBF_FREEBUF, NULL, NULL, 0)) + goto bad; + } + + /* advance to next block */ + lbaoffs += length; + datalen -= length; + data += length; + } + + s = splbio(); + if (wu_w) { + /* collide write request with reads */ + wu_w->swu_blk_start = wu->swu_blk_start; + wu_w->swu_blk_end = wu->swu_blk_end; + + /* + * put xs block in write request (scsi_done not called till + * write completes) + */ + wu_w->swu_xs = wu->swu_xs; + wu->swu_xs = NULL; + + wu_w->swu_state = SR_WU_DEFERRED; + wu->swu_collider = wu_w; + TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu_w, swu_link); + } + + /* rebuild io, let rebuild routine deal with it */ + if (wu->swu_flags & SR_WUF_REBUILD) + goto queued; + + /* current io failed, restart */ + if (wu->swu_state == SR_WU_RESTART) + goto start; + + 
/* deferred io failed, don't restart */ + if (wu->swu_state == SR_WU_REQUEUE) + goto queued; + + if (sr_check_io_collision(wu)) + goto queued; + +start: + sr_raid_startwu(wu); +queued: + splx(s); + return (0); +bad: + /* wu is unwound by sr_wu_put */ + if (wu_w) + sr_wu_put(wu_w); + return (1); +} + +void +sr_raid6_intr(struct buf *bp) +{ + struct sr_ccb *ccb = (struct sr_ccb *)bp; + struct sr_workunit *wu = ccb->ccb_wu, *wup; + struct sr_discipline *sd = wu->swu_dis; + struct scsi_xfer *xs = wu->swu_xs; + struct sr_softc *sc = sd->sd_sc; + struct sr_raid6_opaque *pq = ccb->ccb_opaque; + int s, pend; + + DNPRINTF(SR_D_INTR, "%s: sr_intr bp %p xs %p\n", + DEVNAME(sc), bp, xs); + + DNPRINTF(SR_D_INTR, "%s: sr_intr: b_bcount: %d b_resid: %d" + " b_flags: 0x%0x block: %lld target: %d\n", DEVNAME(sc), + ccb->ccb_buf.b_bcount, ccb->ccb_buf.b_resid, ccb->ccb_buf.b_flags, + ccb->ccb_buf.b_blkno, ccb->ccb_target); + + s = splbio(); + + if (ccb->ccb_buf.b_flags & B_ERROR) { + DNPRINTF(SR_D_INTR, "%s: i/o error on block %lld target: %d\n", + DEVNAME(sc), ccb->ccb_buf.b_blkno, ccb->ccb_target); + printf("io error: disk %x\n", ccb->ccb_target); + wu->swu_ios_failed++; + ccb->ccb_state = SR_CCB_FAILED; + if (ccb->ccb_target != -1) + sd->sd_set_chunk_state(sd, ccb->ccb_target, + BIOC_SDOFFLINE); + else + panic("%s: invalid target on wu: %p", DEVNAME(sc), wu); + } else { + ccb->ccb_state = SR_CCB_OK; + wu->swu_ios_succeeded++; + + /* XOR data to result */ + if (pq) { + if (pq->pbuf) + /* Calculate xor-parity */ + sr_raid6_xorp(pq->pbuf, ccb->ccb_buf.b_data, + ccb->ccb_buf.b_bcount); + if (pq->qbuf) + /* Calculate q-parity */ + sr_raid6_xorq(pq->qbuf, ccb->ccb_buf.b_data, + ccb->ccb_buf.b_bcount, pq->gn); + free(pq, M_DEVBUF); + ccb->ccb_opaque = NULL; + } + } + + /* free allocated data buffer */ + if (ccb->ccb_flag & SR_CCBF_FREEBUF) { + sr_put_block(sd, ccb->ccb_buf.b_data); + ccb->ccb_buf.b_data = NULL; + } + wu->swu_ios_complete++; + + DNPRINTF(SR_D_INTR, "%s: sr_intr: comp: %d 
count: %d failed: %d\n", + DEVNAME(sc), wu->swu_ios_complete, wu->swu_io_count, + wu->swu_ios_failed); + + if (wu->swu_ios_complete >= wu->swu_io_count) { + + /* if all ios failed, retry reads and give up on writes */ + if (wu->swu_ios_failed == wu->swu_ios_complete) { + if (xs->flags & SCSI_DATA_IN) { + printf("%s: retrying read on block %lld\n", + DEVNAME(sc), ccb->ccb_buf.b_blkno); + sr_ccb_put(ccb); + TAILQ_INIT(&wu->swu_ccb); + wu->swu_state = SR_WU_RESTART; + if (sd->sd_scsi_rw(wu)) + goto bad; + else + goto retry; + } else { + printf("%s: permanently fail write on block " + "%lld\n", DEVNAME(sc), + ccb->ccb_buf.b_blkno); + xs->error = XS_DRIVER_STUFFUP; + goto bad; + } + } + + if (xs != NULL) { + xs->error = XS_NOERROR; + xs->resid = 0; + xs->flags |= ITSDONE; + } + + pend = 0; + TAILQ_FOREACH(wup, &sd->sd_wu_pendq, swu_link) { + if (wu == wup) { + /* wu on pendq, remove */ + TAILQ_REMOVE(&sd->sd_wu_pendq, wu, swu_link); + pend = 1; + + if (wu->swu_collider) { + if (wu->swu_ios_failed) + /* toss all ccbs and recreate */ + sr_raid6_recreate_wu(wu->swu_collider); + + /* restart deferred wu */ + wu->swu_collider->swu_state = + SR_WU_INPROGRESS; + TAILQ_REMOVE(&sd->sd_wu_defq, + wu->swu_collider, swu_link); + sr_raid_startwu(wu->swu_collider); + } + break; + } + } + + if (!pend) + printf("%s: wu: %p not on pending queue\n", + DEVNAME(sc), wu); + + if (wu->swu_flags & SR_WUF_FAIL) { + wu->swu_flags |= SR_WUF_FAILIOCOMP; + wakeup(wu); + } + else if (wu->swu_flags & SR_WUF_REBUILD) { + if (wu->swu_xs->flags & SCSI_DATA_OUT) { + wu->swu_flags |= SR_WUF_REBUILDIOCOMP; + wakeup(wu); + } + } else { + /* do not change the order of these 2 functions */ + sr_wu_put(wu); + if (xs != NULL) + scsi_done(xs); + } + + if (sd->sd_sync && sd->sd_wu_pending == 0) + wakeup(sd); + } + +retry: + splx(s); + return; +bad: + xs->error = XS_DRIVER_STUFFUP; + xs->flags |= ITSDONE; + if (wu->swu_flags & SR_WUF_REBUILD) { + wu->swu_flags |= SR_WUF_REBUILDIOCOMP; + wakeup(wu); + } else { + 
/* do not change the order of these 2 functions */ + sr_wu_put(wu); + scsi_done(xs); + } + + splx(s); +} + +void +sr_raid6_recreate_wu(struct sr_workunit *wu) +{ + struct sr_discipline *sd = wu->swu_dis; + struct sr_workunit *wup = wu; + struct sr_ccb *ccb; + + do { + DNPRINTF(SR_D_INTR, "%s: sr_raid6_recreate_wu: %p\n", wup); + + /* toss all ccbs */ + while ((ccb = TAILQ_FIRST(&wup->swu_ccb)) != NULL) { + TAILQ_REMOVE(&wup->swu_ccb, ccb, ccb_link); + sr_ccb_put(ccb); + } + TAILQ_INIT(&wup->swu_ccb); + + /* recreate ccbs */ + wup->swu_state = SR_WU_REQUEUE; + if (sd->sd_scsi_rw(wup)) + panic("could not requeue io"); + + wup = wup->swu_collider; + } while (wup); +} + +int +sr_raid6_addio(struct sr_workunit *wu, int dsk, daddr64_t blk, daddr64_t len, + void *data, int flag, int ccbflag, void *pbuf, void *qbuf, int gn) +{ + struct sr_discipline *sd = wu->swu_dis; + struct sr_ccb *ccb; + struct sr_raid6_opaque *pqbuf; + + ccb = sr_ccb_get(sd); + if (!ccb) + return (-1); + + /* allocate temporary buffer */ + if (data == NULL) { + data = sr_get_block(sd, len); + if (data == NULL) + return (-1); + } + + DNPRINTF(0, "%sio: %d.%llx %llx %p:%p\n", + flag & SCSI_DATA_IN ? 
"read" : "write", + dsk, blk, len, pbuf, qbuf); + + ccb->ccb_flag = ccbflag; + if (flag & SCSI_POLL) { + ccb->ccb_buf.b_flags = 0; + ccb->ccb_buf.b_iodone = NULL; + } else { + ccb->ccb_buf.b_flags = B_CALL; + ccb->ccb_buf.b_iodone = sr_raid6_intr; + } + if (flag & SCSI_DATA_IN) + ccb->ccb_buf.b_flags |= B_READ; + else + ccb->ccb_buf.b_flags |= B_WRITE; + + /* add offset for metadata */ + ccb->ccb_buf.b_flags |= B_PHYS; + ccb->ccb_buf.b_blkno = blk; + ccb->ccb_buf.b_bcount = len; + ccb->ccb_buf.b_bufsize = len; + ccb->ccb_buf.b_resid = len; + ccb->ccb_buf.b_data = data; + ccb->ccb_buf.b_error = 0; + ccb->ccb_buf.b_proc = curproc; + ccb->ccb_buf.b_dev = sd->sd_vol.sv_chunks[dsk]->src_dev_mm; + ccb->ccb_buf.b_vp = NULL; + + ccb->ccb_wu = wu; + ccb->ccb_target = dsk; + if (pbuf || qbuf) { + pqbuf = malloc(sizeof(struct sr_raid6_opaque), M_DEVBUF, M_CANFAIL); + if (pqbuf == NULL) { + sr_ccb_put(ccb); + return (-1); + } + pqbuf->pbuf = pbuf; + pqbuf->qbuf = qbuf; + pqbuf->gn = gn; + ccb->ccb_opaque = pqbuf; + } + + LIST_INIT(&ccb->ccb_buf.b_dep); + TAILQ_INSERT_TAIL(&wu->swu_ccb, ccb, ccb_link); + + DNPRINTF(SR_D_DIS, "%s: %s: sr_raid6: b_bcount: %d " + "b_blkno: %x b_flags 0x%0x b_data %p\n", + DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, + ccb->ccb_buf.b_bcount, ccb->ccb_buf.b_blkno, + ccb->ccb_buf.b_flags, ccb->ccb_buf.b_data); + + wu->swu_io_count++; + + return (0); +} + +/* Perform RAID6 parity calculation. 
+ * P=xor parity, Q=GF256 parity, D=data, gn=GF256 multiplier (e.g. gf_pow[disk#]) */ +void +sr_raid6_xorp(void *p, void *d, int len) +{ + uint32_t *pbuf = p, *data = d; + + /* len is in bytes; XOR one 32-bit word (4 bytes) per step */ + len >>= 2; + while (len--) + pbuf[len] ^= data[len]; +} + +void +sr_raid6_xorq(void *q, void *d, int len, int gn) +{ + uint8_t *qbuf = q, *data = d; + + /* GF256 multiply is per byte: qbuf[i] ^= data[i] * gn */ + while (len--) + qbuf[len] ^= gf_mul(data[len], gn); +} + +/* Create GF256 log/pow tables: polynomial = 0x11D */ +void +gf_init(void) +{ + int i; + uint8_t p = 1; + + /* use 2N pow table to avoid using % in multiply */ + for (i=0; i<256; i++) { + gf_log[p] = i; + gf_pow[i] = gf_pow[i+255] = p; + p = ((p << 1) ^ ((p & 0x80) ? 0x1D : 0x00)); + } +} + +/* GF256 multiplication using exponent/logarithm table */ +uint8_t +gf_mul(uint8_t a, uint8_t b) +{ + /* g^a * g^b = g^(a+b) */ + if (!a || !b) + return (0); + return gf_pow[gf_log[a] + gf_log[b]]; +} +/* GF256 multiplicative inverse via the log table; a must be non-zero */ +uint8_t +gf_inv(uint8_t a) +{ + return gf_pow[255 - gf_log[a]]; +} + diff --git a/sys/dev/softraidvar.h b/sys/dev/softraidvar.h index 04ba09af530..407e76bcfe6 100644 --- a/sys/dev/softraidvar.h +++ b/sys/dev/softraidvar.h @@ -1,4 +1,4 @@ -/* $OpenBSD: softraidvar.h,v 1.79 2009/07/12 16:31:56 jsing Exp $ */ +/* $OpenBSD: softraidvar.h,v 1.80 2009/07/23 15:15:25 jordan Exp $ */ /* * Copyright (c) 2006 Marco Peereboom <marco@peereboom.us> * Copyright (c) 2008 Chris Kuethe <ckuethe@openbsd.org> @@ -264,6 +264,8 @@ struct sr_workunit { int swu_flags; /* additional hints */ #define SR_WUF_REBUILD (1<<0) /* rebuild io */ #define SR_WUF_REBUILDIOCOMP (1<<1) /* rbuild io complete */ +#define SR_WUF_FAIL (1<<2) /* RAID6: failure */ +#define SR_WUF_FAILIOCOMP (1<<3) int swu_fake; /* faked wu */ /* workunit io range */ @@ -307,6 +309,12 @@ struct sr_raidp { int32_t srp_strip_bits; }; +/* RAID 6 */ +#define SR_RAID6_NOWU 16 +struct sr_raid6 { + int32_t sr6_strip_bits; +}; + /* CRYPTO */ #define SR_CRYPTO_NOWU 16 struct sr_crypto { @@ -394,6 +402,7 @@ struct sr_discipline
{ #define SR_MD_AOE_INIT 5 #define SR_MD_AOE_TARG 6 #define SR_MD_RAID4 7 +#define SR_MD_RAID6 8 char sd_name[10]; /* human readable dis name */ u_int8_t sd_scsibus; /* scsibus discipline uses */ struct scsi_link sd_link; /* link to midlayer */ @@ -402,6 +411,7 @@ struct sr_discipline { struct sr_raid0 mdd_raid0; struct sr_raid1 mdd_raid1; struct sr_raidp mdd_raidp; + struct sr_raid6 mdd_raid6; struct sr_crypto mdd_crypto; #ifdef AOE struct sr_aoe mdd_aoe; @@ -536,6 +546,7 @@ void sr_raid_startwu(struct sr_workunit *); void sr_raid0_discipline_init(struct sr_discipline *); void sr_raid1_discipline_init(struct sr_discipline *); void sr_raidp_discipline_init(struct sr_discipline *); +void sr_raid6_discipline_init(struct sr_discipline *); void sr_crypto_discipline_init(struct sr_discipline *); void sr_aoe_discipline_init(struct sr_discipline *); void sr_aoe_server_discipline_init(struct sr_discipline *); |