/* $OpenBSD: softraid_raid6.c,v 1.18 2010/07/02 09:20:26 jsing Exp $ */ /* * Copyright (c) 2009 Marco Peereboom <marco@peereboom.us> * Copyright (c) 2009 Jordan Hargrave <jordan@openbsd.org> * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include "bio.h" #include <sys/param.h> #include <sys/systm.h> #include <sys/buf.h> #include <sys/device.h> #include <sys/ioctl.h> #include <sys/proc.h> #include <sys/malloc.h> #include <sys/kernel.h> #include <sys/disk.h> #include <sys/rwlock.h> #include <sys/queue.h> #include <sys/fcntl.h> #include <sys/disklabel.h> #include <sys/mount.h> #include <sys/sensors.h> #include <sys/stat.h> #include <sys/conf.h> #include <sys/uio.h> #include <scsi/scsi_all.h> #include <scsi/scsiconf.h> #include <scsi/scsi_disk.h> #include <dev/softraidvar.h> #include <dev/rndvar.h> uint8_t *gf_map[256]; uint8_t gf_pow[768]; int gf_log[256]; /* RAID 6 functions. */ int sr_raid6_create(struct sr_discipline *, struct bioc_createraid *, int, int64_t); int sr_raid6_assemble(struct sr_discipline *, struct bioc_createraid *, int); int sr_raid6_alloc_resources(struct sr_discipline *); int sr_raid6_free_resources(struct sr_discipline *); int sr_raid6_rw(struct sr_workunit *); int sr_raid6_openings(struct sr_discipline *); void sr_raid6_intr(struct buf *); void sr_raid6_recreate_wu(struct sr_workunit *); void sr_raid6_set_chunk_state(struct sr_discipline *, int, int); void sr_raid6_set_vol_state(struct sr_discipline *); void sr_raid6_xorp(void *, void *, int); void sr_raid6_xorq(void *, void *, int, int); int sr_raid6_addio(struct sr_workunit *wu, int, daddr64_t, daddr64_t, void *, int, int, void *, void *, int); void sr_dump(void *, int); void sr_raid6_scrub(struct sr_discipline *); int sr_failio(struct sr_workunit *); void *sr_get_block(struct sr_discipline *, int); void sr_put_block(struct sr_discipline *, void *); void gf_init(void); uint8_t gf_inv(uint8_t); int gf_premul(uint8_t); uint8_t gf_mul(uint8_t, uint8_t); #define SR_NOFAIL 0x00 #define SR_FAILX (1L << 0) #define SR_FAILY (1L << 1) #define SR_FAILP (1L << 2) #define SR_FAILQ (1L << 3) #define M_FAIL 0x00 #define M_RX 0x01 #define M_RXP 0x02 #define M_RXQ 0x03 #define M_RXY 0x04 #define M_RFLG 0x0F #define M_WXPQ 0x10 #define M_WXY 0x20 #define M_WPQ 0x30 #define M_WFLG 0xF0 /* Mapping of Failure Flags to Read/Write state */ uint8_t sr_rwmode[16] = { [SR_FAILX+SR_FAILY+SR_FAILP] = M_FAIL, [SR_FAILX+SR_FAILY+SR_FAILQ] = M_FAIL, [SR_FAILX+SR_FAILP+SR_FAILQ] = M_FAIL, [SR_FAILY+SR_FAILP+SR_FAILQ] = M_FAIL, [SR_FAILX+SR_FAILY+SR_FAILP+SR_FAILQ] = M_FAIL, [SR_NOFAIL] = M_RX | M_WXPQ, [SR_FAILY] = M_RX | M_WXPQ, [SR_FAILP] = M_RX | M_WXPQ, [SR_FAILQ] = M_RX | M_WXPQ, [SR_FAILY+SR_FAILP] = M_RX | M_WXPQ, [SR_FAILY+SR_FAILQ] = M_RX | M_WXPQ, [SR_FAILP+SR_FAILQ] = M_RX | M_WXPQ, [SR_FAILX] = M_RXQ | M_WPQ, [SR_FAILX+SR_FAILQ] = M_RXQ | M_WPQ, [SR_FAILX+SR_FAILP] = M_RXP | M_WPQ, [SR_FAILX+SR_FAILY] = M_RXY | M_WXY, }; struct sr_raid6_opaque { int gn; void *pbuf; void *qbuf; }; /* discipline initialisation. */ void sr_raid6_discipline_init(struct sr_discipline *sd) { /* Initialize GF256 tables */ gf_init(); /* fill out discipline members. */ sd->sd_type = SR_MD_RAID6; sd->sd_capabilities = SR_CAP_SYSTEM_DISK | SR_CAP_AUTO_ASSEMBLE; sd->sd_max_wu = SR_RAID6_NOWU; /* setup discipline pointers. */ sd->sd_create = sr_raid6_create; sd->sd_assemble = sr_raid6_assemble; sd->sd_alloc_resources = sr_raid6_alloc_resources; sd->sd_free_resources = sr_raid6_free_resources; sd->sd_start_discipline = NULL; sd->sd_scsi_inquiry = sr_raid_inquiry; sd->sd_scsi_read_cap = sr_raid_read_cap; sd->sd_scsi_tur = sr_raid_tur; sd->sd_scsi_req_sense = sr_raid_request_sense; sd->sd_scsi_start_stop = sr_raid_start_stop; sd->sd_scsi_sync = sr_raid_sync; sd->sd_scsi_rw = sr_raid6_rw; sd->sd_set_chunk_state = sr_raid6_set_chunk_state; sd->sd_set_vol_state = sr_raid6_set_vol_state; sd->sd_openings = sr_raid6_openings; } int sr_raid6_create(struct sr_discipline *sd, struct bioc_createraid *bc, int no_chunk, int64_t coerced_size) { if (no_chunk < 4) return EINVAL; strlcpy(sd->sd_name, "RAID 6", sizeof(sd->sd_name)); /* * XXX add variable strip size later even though MAXPHYS is really * the clever value, users like * to tinker with that type of stuff. */ sd->sd_meta->ssdi.ssd_strip_size = MAXPHYS; sd->sd_meta->ssdi.ssd_size = (coerced_size & ~((sd->sd_meta->ssdi.ssd_strip_size >> DEV_BSHIFT) - 1)) * (no_chunk - 2); /* only if stripsize <= MAXPHYS */ sd->sd_max_ccb_per_wu = max(6, 2 * no_chunk); return 0; } int sr_raid6_assemble(struct sr_discipline *sd, struct bioc_createraid *bc, int no_chunk) { /* only if stripsize <= MAXPHYS */ sd->sd_max_ccb_per_wu = max(6, 2 * sd->sd_meta->ssdi.ssd_chunk_no); return 0; } int sr_raid6_openings(struct sr_discipline *sd) { return (sd->sd_max_wu >> 1); /* 2 wu's per IO */ } int sr_raid6_alloc_resources(struct sr_discipline *sd) { int rv = EINVAL; if (!sd) return (rv); DNPRINTF(SR_D_DIS, "%s: sr_raid6_alloc_resources\n", DEVNAME(sd->sd_sc)); if (sr_wu_alloc(sd)) goto bad; if (sr_ccb_alloc(sd)) goto bad; /* setup runtime values */ sd->mds.mdd_raid6.sr6_strip_bits = sr_validate_stripsize(sd->sd_meta->ssdi.ssd_strip_size); if (sd->mds.mdd_raid6.sr6_strip_bits == -1) goto bad; rv = 0; bad: return (rv); } int sr_raid6_free_resources(struct sr_discipline *sd) { int rv = EINVAL; if (!sd) return (rv); DNPRINTF(SR_D_DIS, "%s: sr_raid6_free_resources\n", DEVNAME(sd->sd_sc)); sr_wu_free(sd); sr_ccb_free(sd); rv = 0; return (rv); } void sr_raid6_set_chunk_state(struct sr_discipline *sd, int c, int new_state) { int old_state, s; /* XXX this is for RAID 0 */ DNPRINTF(SR_D_STATE, "%s: %s: %s: sr_raid_set_chunk_state %d -> %d\n", DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname, c, new_state); /* ok to go to splbio since this only happens in error path */ s = splbio(); old_state = sd->sd_vol.sv_chunks[c]->src_meta.scm_status; /* multiple IOs to the same chunk that fail will come through here */ if (old_state == new_state) goto done; switch (old_state) { case BIOC_SDONLINE: switch (new_state) { case BIOC_SDOFFLINE: case BIOC_SDSCRUB: break; default: goto die; } break; case BIOC_SDOFFLINE: if (new_state == BIOC_SDREBUILD) { ; } else goto die; break; case BIOC_SDSCRUB: switch (new_state) { case BIOC_SDONLINE: case BIOC_SDOFFLINE: break; default: goto die; } break; case BIOC_SDREBUILD: switch (new_state) { case BIOC_SDONLINE: case BIOC_SDOFFLINE: break; default: goto die; } break; default: die: splx(s); /* XXX */ panic("%s: %s: %s: invalid chunk state transition " "%d -> %d\n", DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname, old_state, new_state); /* NOTREACHED */ } sd->sd_vol.sv_chunks[c]->src_meta.scm_status = new_state; sd->sd_set_vol_state(sd); sd->sd_must_flush = 1; workq_add_task(NULL, 0, sr_meta_save_callback, sd, NULL); done: splx(s); } void sr_raid6_set_vol_state(struct sr_discipline *sd) { int states[SR_MAX_STATES]; int new_state, i, s, nd; int old_state = sd->sd_vol_status; /* XXX this is for RAID 0 */ DNPRINTF(SR_D_STATE, "%s: %s: sr_raid_set_vol_state\n", DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname); nd = sd->sd_meta->ssdi.ssd_chunk_no; for (i = 0; i < SR_MAX_STATES; i++) states[i] = 0; for (i = 0; i < nd; i++) { s = sd->sd_vol.sv_chunks[i]->src_meta.scm_status; if (s >= SR_MAX_STATES) panic("%s: %s: %s: invalid chunk state", DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, sd->sd_vol.sv_chunks[i]->src_meta.scmi.scm_devname); states[s]++; } if (states[BIOC_SDONLINE] == nd) new_state = BIOC_SVONLINE; else if (states[BIOC_SDONLINE] < nd - 2) new_state = BIOC_SVOFFLINE; else if (states[BIOC_SDSCRUB] != 0) new_state = BIOC_SVSCRUB; else if (states[BIOC_SDREBUILD] != 0) new_state = BIOC_SVREBUILD; else if (states[BIOC_SDONLINE] < nd) new_state = BIOC_SVDEGRADED; else { printf("old_state = %d, ", old_state); for (i = 0; i < nd; i++) printf("%d = %d, ", i, sd->sd_vol.sv_chunks[i]->src_meta.scm_status); panic("invalid new_state"); } DNPRINTF(SR_D_STATE, "%s: %s: sr_raid_set_vol_state %d -> %d\n", DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, old_state, new_state); switch (old_state) { case BIOC_SVONLINE: switch (new_state) { case BIOC_SVONLINE: /* can go to same state */ case BIOC_SVOFFLINE: case BIOC_SVDEGRADED: case BIOC_SVREBUILD: /* happens on boot */ break; default: goto die; } break; case BIOC_SVOFFLINE: /* XXX this might be a little too much */ goto die; case BIOC_SVSCRUB: switch (new_state) { case BIOC_SVONLINE: case BIOC_SVOFFLINE: case BIOC_SVDEGRADED: case BIOC_SVSCRUB: /* can go to same state */ break; default: goto die; } break; case BIOC_SVBUILDING: switch (new_state) { case BIOC_SVONLINE: case BIOC_SVOFFLINE: case BIOC_SVBUILDING: /* can go to the same state */ break; default: goto die; } break; case BIOC_SVREBUILD: switch (new_state) { case BIOC_SVONLINE: case BIOC_SVOFFLINE: case BIOC_SVDEGRADED: case BIOC_SVREBUILD: /* can go to the same state */ break; default: goto die; } break; case BIOC_SVDEGRADED: switch (new_state) { case BIOC_SVOFFLINE: case BIOC_SVREBUILD: case BIOC_SVDEGRADED: /* can go to the same state */ break; default: goto die; } break; default: die: panic("%s: %s: invalid volume state transition %d -> %d\n", DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, old_state, new_state); /* NOTREACHED */ } sd->sd_vol_status = new_state; } /* modes: * readq: sr_raid6_addio(i, lba, length, NULL, SCSI_DATA_IN, * SR_CCBF_FREEBUF, qbuf, NULL, 0); * readp: sr_raid6_addio(i, lba, length, NULL, SCSI_DATA_IN, * SR_CCBF_FREEBUF, pbuf, NULL, 0); * readx: sr_raid6_addio(i, lba, length, NULL, SCSI_DATA_IN, * SR_CCBF_FREEBUF, pbuf, qbuf, gf_pow[i]); */ int sr_raid6_rw(struct sr_workunit *wu) { struct sr_workunit *wu_w = NULL; struct sr_discipline *sd = wu->swu_dis; struct scsi_xfer *xs = wu->swu_xs; struct sr_chunk *scp; int s, fail, i, rwmode, gxinv, pxinv; daddr64_t blk, lbaoffs, strip_no, chunk, qchunk, pchunk, fchunk; daddr64_t strip_size, no_chunk, lba, chunk_offs, phys_offs; daddr64_t strip_bits, length, strip_offs, datalen, row_size; void *pbuf, *data, *qbuf; /* blk and scsi error will be handled by sr_validate_io */ if (sr_validate_io(wu, &blk, "sr_raid6_rw")) goto bad; strip_size = sd->sd_meta->ssdi.ssd_strip_size; strip_bits = sd->mds.mdd_raid6.sr6_strip_bits; no_chunk = sd->sd_meta->ssdi.ssd_chunk_no - 2; row_size = (no_chunk << strip_bits) >> DEV_BSHIFT; data = xs->data; datalen = xs->datalen; lbaoffs = blk << DEV_BSHIFT; rwmode = (xs->flags & SCSI_DATA_IN) ? M_RFLG : M_WFLG; if (xs->flags & SCSI_DATA_OUT) /* create write workunit */ if ((wu_w = sr_wu_get(sd, 0)) == NULL) { printf("%s: can't get wu_w", DEVNAME(sd->sd_sc)); goto bad; } wu->swu_blk_start = 0; while (datalen != 0) { strip_no = lbaoffs >> strip_bits; strip_offs = lbaoffs & (strip_size - 1); chunk_offs = (strip_no / no_chunk) << strip_bits; phys_offs = chunk_offs + strip_offs + (sd->sd_meta->ssd_data_offset << DEV_BSHIFT); /* get size remaining in this stripe */ length = MIN(strip_size - strip_offs, datalen); /* map disk offset to parity/data drive */ chunk = strip_no % no_chunk; qchunk = (no_chunk + 1) - ((strip_no / no_chunk) % (no_chunk+2)); if (qchunk == 0) pchunk = no_chunk + 1; else pchunk = qchunk - 1; if (chunk >= pchunk) chunk++; if (chunk >= qchunk) chunk++; lba = phys_offs >> DEV_BSHIFT; /* XXX big hammer.. exclude I/O from entire stripe */ if (wu->swu_blk_start == 0) wu->swu_blk_start = (strip_no / no_chunk) * row_size; wu->swu_blk_end = (strip_no / no_chunk) * row_size + (row_size - 1); fail = 0; fchunk = -1; /* Get disk-fail flags */ for (i=0; i< no_chunk+2; i++) { scp = sd->sd_vol.sv_chunks[i]; switch (scp->src_meta.scm_status) { case BIOC_SDOFFLINE: case BIOC_SDREBUILD: case BIOC_SDHOTSPARE: if (i == qchunk) fail |= SR_FAILQ; else if (i == pchunk) fail |= SR_FAILP; else if (i == chunk) fail |= SR_FAILX; else { /* dual data-disk failure */ fail |= SR_FAILY; fchunk = i; } break; } } if (xs->flags & SCSI_DATA_IN) { if (!(fail & SR_FAILX)) { /* drive is good. issue single read request */ if (sr_raid6_addio(wu, chunk, lba, length, data, xs->flags, 0, NULL, NULL, 0)) goto bad; } else if (fail & SR_FAILP) { /* Dx, P failed */ printf("Disk %llx offline, " "regenerating Dx+P\n", chunk); gxinv = gf_inv(gf_pow[chunk]); /* Calculate: Dx = (Q^Dz*gz)*inv(gx) */ memset(data, 0, length); if (sr_raid6_addio(wu, qchunk, lba, length, NULL, SCSI_DATA_IN, SR_CCBF_FREEBUF, NULL, data, gxinv)) goto bad; /* Read Dz * gz * inv(gx) */ for (i = 0; i < no_chunk+2; i++) { if (i == qchunk || i == pchunk || i == chunk) continue; if (sr_raid6_addio(wu, i, lba, length, NULL, SCSI_DATA_IN, SR_CCBF_FREEBUF, NULL, data, gf_mul(gf_pow[i], gxinv))) goto bad; } /* data will contain correct value on completion */ } else if (fail & SR_FAILY) { /* Dx, Dy failed */ printf("Disk %llx & %llx offline, " "regenerating Dx+Dy\n", chunk, fchunk); gxinv = gf_inv(gf_pow[chunk] ^ gf_pow[fchunk]); pxinv = gf_mul(gf_pow[fchunk], gxinv); /* read Q * inv(gx + gy) */ memset(data, 0, length); if (sr_raid6_addio(wu, qchunk, lba, length, NULL, SCSI_DATA_IN, SR_CCBF_FREEBUF, NULL, data, gxinv)) goto bad; /* read P * gy * inv(gx + gy) */ if (sr_raid6_addio(wu, pchunk, lba, length, NULL, SCSI_DATA_IN, SR_CCBF_FREEBUF, NULL, data, pxinv)) goto bad; /* Calculate: Dx*gx^Dy*gy = Q^(Dz*gz) ; Dx^Dy = P^Dz * Q: sr_raid6_xorp(qbuf, --, length); * P: sr_raid6_xorp(pbuf, --, length); * Dz: sr_raid6_xorp(pbuf, --, length); * sr_raid6_xorq(qbuf, --, length, gf_pow[i]); */ for (i = 0; i < no_chunk+2; i++) { if (i == qchunk || i == pchunk || i == chunk || i == fchunk) continue; /* read Dz * (gz + gy) * inv(gx + gy) */ if (sr_raid6_addio(wu, i, lba, length, NULL, SCSI_DATA_IN, SR_CCBF_FREEBUF, NULL, data, pxinv ^ gf_mul(gf_pow[i], gxinv))) goto bad; } } else { /* Two cases: single disk (Dx) or (Dx+Q) * Dx = Dz ^ P (same as RAID5) */ printf("Disk %llx offline, " "regenerating Dx%s\n", chunk, fail & SR_FAILQ ? "+Q" : " single"); /* Calculate: Dx = P^Dz * P: sr_raid6_xorp(data, ---, length); * Dz: sr_raid6_xorp(data, ---, length); */ memset(data, 0, length); for (i = 0; i < no_chunk+2; i++) { if (i != chunk && i != qchunk) { /* Read Dz */ if (sr_raid6_addio(wu, i, lba, length, NULL, SCSI_DATA_IN, SR_CCBF_FREEBUF, data, NULL, 0)) goto bad; } } /* data will contain correct value on completion */ } } else { /* XXX handle writes to failed/offline disk? */ if (fail & (SR_FAILX|SR_FAILQ|SR_FAILP)) goto bad; /* * initialize pbuf with contents of new data to be * written. This will be XORed with old data and old * parity in the intr routine. The result in pbuf * is the new parity data. */ qbuf = sr_get_block(sd, length); if (qbuf == NULL) goto bad; pbuf = sr_get_block(sd, length); if (pbuf == NULL) goto bad; /* Calulate P = Dn; Q = gn * Dn */ if (gf_premul(gf_pow[chunk])) goto bad; sr_raid6_xorp(pbuf, data, length); sr_raid6_xorq(qbuf, data, length, gf_pow[chunk]); /* Read old data: P ^= Dn' ; Q ^= (gn * Dn') */ if (sr_raid6_addio(wu, chunk, lba, length, NULL, SCSI_DATA_IN, SR_CCBF_FREEBUF, pbuf, qbuf, gf_pow[chunk])) goto bad; /* Read old xor-parity: P ^= P' */ if (sr_raid6_addio(wu, pchunk, lba, length, NULL, SCSI_DATA_IN, SR_CCBF_FREEBUF, pbuf, NULL, 0)) goto bad; /* Read old q-parity: Q ^= Q' */ if (sr_raid6_addio(wu, qchunk, lba, length, NULL, SCSI_DATA_IN, SR_CCBF_FREEBUF, qbuf, NULL, 0)) goto bad; /* write new data */ if (sr_raid6_addio(wu_w, chunk, lba, length, data, xs->flags, 0, NULL, NULL, 0)) goto bad; /* write new xor-parity */ if (sr_raid6_addio(wu_w, pchunk, lba, length, pbuf, xs->flags, SR_CCBF_FREEBUF, NULL, NULL, 0)) goto bad; /* write new q-parity */ if (sr_raid6_addio(wu_w, qchunk, lba, length, qbuf, xs->flags, SR_CCBF_FREEBUF, NULL, NULL, 0)) goto bad; } /* advance to next block */ lbaoffs += length; datalen -= length; data += length; } s = splbio(); if (wu_w) { /* collide write request with reads */ wu_w->swu_blk_start = wu->swu_blk_start; wu_w->swu_blk_end = wu->swu_blk_end; /* * put xs block in write request (scsi_done not called till * write completes) */ wu_w->swu_xs = wu->swu_xs; wu->swu_xs = NULL; wu_w->swu_state = SR_WU_DEFERRED; wu->swu_collider = wu_w; TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu_w, swu_link); } /* rebuild io, let rebuild routine deal with it */ if (wu->swu_flags & SR_WUF_REBUILD) goto queued; /* current io failed, restart */ if (wu->swu_state == SR_WU_RESTART) goto start; /* deferred io failed, don't restart */ if (wu->swu_state == SR_WU_REQUEUE) goto queued; if (sr_check_io_collision(wu)) goto queued; start: sr_raid_startwu(wu); queued: splx(s); return (0); bad: /* wu is unwound by sr_wu_put */ if (wu_w) sr_wu_put(wu_w); return (1); } /* Handle failure I/O completion */ int sr_failio(struct sr_workunit *wu) { struct sr_discipline *sd = wu->swu_dis; struct sr_ccb *ccb; if (!(wu->swu_flags & SR_WUF_FAIL)) return (0); /* Wu is a 'fake'.. don't do real I/O just intr */ TAILQ_INSERT_TAIL(&sd->sd_wu_pendq, wu, swu_link); TAILQ_FOREACH(ccb, &wu->swu_ccb, ccb_link) sr_raid6_intr(&ccb->ccb_buf); return (1); } void sr_raid6_intr(struct buf *bp) { struct sr_ccb *ccb = (struct sr_ccb *)bp; struct sr_workunit *wu = ccb->ccb_wu, *wup; struct sr_discipline *sd = wu->swu_dis; struct scsi_xfer *xs = wu->swu_xs; struct sr_softc *sc = sd->sd_sc; struct sr_raid6_opaque *pq = ccb->ccb_opaque; int s, pend; DNPRINTF(SR_D_INTR, "%s: sr_intr bp %p xs %p\n", DEVNAME(sc), bp, xs); DNPRINTF(SR_D_INTR, "%s: sr_intr: b_bcount: %d b_resid: %d" " b_flags: 0x%0x block: %lld target: %d\n", DEVNAME(sc), ccb->ccb_buf.b_bcount, ccb->ccb_buf.b_resid, ccb->ccb_buf.b_flags, ccb->ccb_buf.b_blkno, ccb->ccb_target); s = splbio(); if (ccb->ccb_buf.b_flags & B_ERROR) { DNPRINTF(SR_D_INTR, "%s: i/o error on block %lld target: %d\n", DEVNAME(sc), ccb->ccb_buf.b_blkno, ccb->ccb_target); printf("io error: disk %x\n", ccb->ccb_target); wu->swu_ios_failed++; ccb->ccb_state = SR_CCB_FAILED; if (ccb->ccb_target != -1) sd->sd_set_chunk_state(sd, ccb->ccb_target, BIOC_SDOFFLINE); else panic("%s: invalid target on wu: %p", DEVNAME(sc), wu); } else { ccb->ccb_state = SR_CCB_OK; wu->swu_ios_succeeded++; /* XOR data to result */ if (pq) { if (pq->pbuf) /* Calculate xor-parity */ sr_raid6_xorp(pq->pbuf, ccb->ccb_buf.b_data, ccb->ccb_buf.b_bcount); if (pq->qbuf) /* Calculate q-parity */ sr_raid6_xorq(pq->qbuf, ccb->ccb_buf.b_data, ccb->ccb_buf.b_bcount, pq->gn); free(pq, M_DEVBUF); ccb->ccb_opaque = NULL; } } /* free allocated data buffer */ if (ccb->ccb_flag & SR_CCBF_FREEBUF) { sr_put_block(sd, ccb->ccb_buf.b_data); ccb->ccb_buf.b_data = NULL; } wu->swu_ios_complete++; DNPRINTF(SR_D_INTR, "%s: sr_intr: comp: %d count: %d failed: %d\n", DEVNAME(sc), wu->swu_ios_complete, wu->swu_io_count, wu->swu_ios_failed); if (wu->swu_ios_complete >= wu->swu_io_count) { /* if all ios failed, retry reads and give up on writes */ if (wu->swu_ios_failed == wu->swu_ios_complete) { if (xs->flags & SCSI_DATA_IN) { printf("%s: retrying read on block %lld\n", DEVNAME(sc), ccb->ccb_buf.b_blkno); sr_ccb_put(ccb); TAILQ_INIT(&wu->swu_ccb); wu->swu_state = SR_WU_RESTART; if (sd->sd_scsi_rw(wu)) goto bad; else goto retry; } else { printf("%s: permanently fail write on block " "%lld\n", DEVNAME(sc), ccb->ccb_buf.b_blkno); xs->error = XS_DRIVER_STUFFUP; goto bad; } } if (xs != NULL) { xs->error = XS_NOERROR; xs->resid = 0; } pend = 0; TAILQ_FOREACH(wup, &sd->sd_wu_pendq, swu_link) { if (wu == wup) { /* wu on pendq, remove */ TAILQ_REMOVE(&sd->sd_wu_pendq, wu, swu_link); pend = 1; if (wu->swu_collider) { if (wu->swu_ios_failed) /* toss all ccbs and recreate */ sr_raid6_recreate_wu(wu->swu_collider); /* restart deferred wu */ wu->swu_collider->swu_state = SR_WU_INPROGRESS; TAILQ_REMOVE(&sd->sd_wu_defq, wu->swu_collider, swu_link); if (sr_failio(wu->swu_collider) == 0) sr_raid_startwu(wu->swu_collider); } break; } } if (!pend) printf("%s: wu: %p not on pending queue\n", DEVNAME(sc), wu); if (wu->swu_flags & SR_WUF_REBUILD) { if (wu->swu_xs->flags & SCSI_DATA_OUT) { wu->swu_flags |= SR_WUF_REBUILDIOCOMP; wakeup(wu); } } else { /* do not change the order of these 2 functions */ sr_wu_put(wu); if (xs != NULL) scsi_done(xs); } if (sd->sd_sync && sd->sd_wu_pending == 0) wakeup(sd); } retry: splx(s); return; bad: xs->error = XS_DRIVER_STUFFUP; if (wu->swu_flags & SR_WUF_REBUILD) { wu->swu_flags |= SR_WUF_REBUILDIOCOMP; wakeup(wu); } else { /* do not change the order of these 2 functions */ sr_wu_put(wu); scsi_done(xs); } splx(s); } void sr_raid6_recreate_wu(struct sr_workunit *wu) { struct sr_discipline *sd = wu->swu_dis; struct sr_workunit *wup = wu; struct sr_ccb *ccb; do { DNPRINTF(SR_D_INTR, "%s: sr_raid6_recreate_wu: %p\n", wup); /* toss all ccbs */ while ((ccb = TAILQ_FIRST(&wup->swu_ccb)) != NULL) { TAILQ_REMOVE(&wup->swu_ccb, ccb, ccb_link); sr_ccb_put(ccb); } TAILQ_INIT(&wup->swu_ccb); /* recreate ccbs */ wup->swu_state = SR_WU_REQUEUE; if (sd->sd_scsi_rw(wup)) panic("could not requeue io"); wup = wup->swu_collider; } while (wup); } int sr_raid6_addio(struct sr_workunit *wu, int dsk, daddr64_t blk, daddr64_t len, void *data, int flag, int ccbflag, void *pbuf, void *qbuf, int gn) { struct sr_discipline *sd = wu->swu_dis; struct sr_ccb *ccb; struct sr_raid6_opaque *pqbuf; ccb = sr_ccb_get(sd); if (!ccb) return (-1); /* allocate temporary buffer */ if (data == NULL) { data = sr_get_block(sd, len); if (data == NULL) return (-1); } DNPRINTF(0, "%sio: %d.%llx %llx %p:%p\n", flag & SCSI_DATA_IN ? "read" : "write", dsk, blk, len, pbuf, qbuf); ccb->ccb_flag = ccbflag; if (flag & SCSI_POLL) { ccb->ccb_buf.b_flags = 0; ccb->ccb_buf.b_iodone = NULL; } else { ccb->ccb_buf.b_flags = B_CALL; ccb->ccb_buf.b_iodone = sr_raid6_intr; } if (flag & SCSI_DATA_IN) ccb->ccb_buf.b_flags |= B_READ; else ccb->ccb_buf.b_flags |= B_WRITE; /* add offset for metadata */ ccb->ccb_buf.b_flags |= B_PHYS; ccb->ccb_buf.b_blkno = blk; ccb->ccb_buf.b_bcount = len; ccb->ccb_buf.b_bufsize = len; ccb->ccb_buf.b_resid = len; ccb->ccb_buf.b_data = data; ccb->ccb_buf.b_error = 0; ccb->ccb_buf.b_proc = curproc; ccb->ccb_buf.b_dev = sd->sd_vol.sv_chunks[dsk]->src_dev_mm; ccb->ccb_buf.b_vp = sd->sd_vol.sv_chunks[dsk]->src_vn; ccb->ccb_buf.b_bq = NULL; if ((ccb->ccb_buf.b_flags & B_READ) == 0) ccb->ccb_buf.b_vp->v_numoutput++; ccb->ccb_wu = wu; ccb->ccb_target = dsk; if (pbuf || qbuf) { if (qbuf && gf_premul(gn)) return (-1); pqbuf = malloc(sizeof(struct sr_raid6_opaque), M_DEVBUF, M_CANFAIL); if (pqbuf == NULL) { sr_ccb_put(ccb); return (-1); } pqbuf->pbuf = pbuf; pqbuf->qbuf = qbuf; pqbuf->gn = gn; ccb->ccb_opaque = pqbuf; } LIST_INIT(&ccb->ccb_buf.b_dep); TAILQ_INSERT_TAIL(&wu->swu_ccb, ccb, ccb_link); DNPRINTF(SR_D_DIS, "%s: %s: sr_raid6: b_bcount: %d " "b_blkno: %x b_flags 0x%0x b_data %p\n", DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, ccb->ccb_buf.b_bcount, ccb->ccb_buf.b_blkno, ccb->ccb_buf.b_flags, ccb->ccb_buf.b_data); wu->swu_io_count++; return (0); } /* Perform RAID6 parity calculation. * P=xor parity, Q=GF256 parity, D=data, gn=disk# */ void sr_raid6_xorp(void *p, void *d, int len) { uint8_t *pbuf = p, *data = d; while (len--) pbuf[len] ^= data[len]; } void sr_raid6_xorq(void *q, void *d, int len, int gn) { uint8_t *qbuf = q, *data = d; uint8_t *gn_map = gf_map[gn]; /* Have to do this a byte at a time */ /* Faster multiply.. gn is always constant */ while (len--) qbuf[len] ^= gn_map[data[len]]; } /* Create GF256 log/pow tables: polynomial = 0x11D */ void gf_init(void) { int i; uint8_t p = 1; /* use 2N pow table to avoid using % in multiply */ for (i=0; i<256; i++) { gf_log[p] = i; gf_pow[i] = gf_pow[i+255] = p; p = ((p << 1) ^ ((p & 0x80) ? 0x1D : 0x00)); } gf_log[0] = 512; } uint8_t gf_inv(uint8_t a) { return gf_pow[255 - gf_log[a]]; } uint8_t gf_mul(uint8_t a, uint8_t b) { return gf_pow[gf_log[a] + gf_log[b]]; } /* Precalculate multiplication tables for drive gn */ int gf_premul(uint8_t gn) { int i; if (gf_map[gn] != NULL) return (0); if ((gf_map[gn] = malloc(256, M_DEVBUF, M_CANFAIL)) == NULL) return (-1); for (i=0; i<256; i++) gf_map[gn][i] = gf_pow[gf_log[i] + gf_log[gn]]; return (0); }