diff options
author | Marco Peereboom <marco@cvs.openbsd.org> | 2009-06-10 03:24:03 +0000 |
---|---|---|
committer | Marco Peereboom <marco@cvs.openbsd.org> | 2009-06-10 03:24:03 +0000 |
commit | 8ac48e3106a31781ca1936d46d4308bbb159e58c (patch) | |
tree | 6a77c0f06ec34b5e6cc480abba6865295f19d6ee /sys/dev/softraid_raidp.c | |
parent | 2bc2b6103c65d8ce73638a950f705228d8533b83 (diff) |
Add framework for raid 4 & 5 so that we can work in tree.
Diffstat (limited to 'sys/dev/softraid_raidp.c')
-rw-r--r-- | sys/dev/softraid_raidp.c | 618 |
1 files changed, 618 insertions, 0 deletions
diff --git a/sys/dev/softraid_raidp.c b/sys/dev/softraid_raidp.c new file mode 100644 index 00000000000..28460c7b23e --- /dev/null +++ b/sys/dev/softraid_raidp.c @@ -0,0 +1,618 @@ +/* $OpenBSD: softraid_raidp.c,v 1.1 2009/06/10 03:24:02 marco Exp $ */ +/* + * Copyright (c) 2009 Marco Peereboom <marco@peereboom.us> + * Copyright (c) 2009 Jordan Hargrave <jordan@openbsd.org> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include "bio.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/buf.h> +#include <sys/device.h> +#include <sys/ioctl.h> +#include <sys/proc.h> +#include <sys/malloc.h> +#include <sys/kernel.h> +#include <sys/disk.h> +#include <sys/rwlock.h> +#include <sys/queue.h> +#include <sys/fcntl.h> +#include <sys/disklabel.h> +#include <sys/mount.h> +#include <sys/sensors.h> +#include <sys/stat.h> +#include <sys/conf.h> +#include <sys/uio.h> + +#include <scsi/scsi_all.h> +#include <scsi/scsiconf.h> +#include <scsi/scsi_disk.h> + +#include <dev/softraidvar.h> +#include <dev/rndvar.h> + +/* RAID P functions. */ +int sr_raidp_alloc_resources(struct sr_discipline *); +int sr_raidp_free_resources(struct sr_discipline *); +int sr_raidp_rw(struct sr_workunit *); +void sr_raidp_intr(struct buf *); +void sr_raidp_recreate_wu(struct sr_workunit *); +void sr_raidp_set_chunk_state(struct sr_discipline *, int, int); +void sr_raidp_set_vol_state(struct sr_discipline *); + +/* Discipline initialisation. */ +void +sr_raidp_discipline_init(struct sr_discipline *sd) +{ + + /* Fill out discipline members. */ + sd->sd_max_ccb_per_wu = sd->sd_meta->ssdi.ssd_chunk_no; + sd->sd_max_wu = SR_RAIDP_NOWU; + sd->sd_rebuild = 1; + + /* Setup discipline pointers. */ + sd->sd_alloc_resources = sr_raidp_alloc_resources; + sd->sd_free_resources = sr_raidp_free_resources; + sd->sd_start_discipline = NULL; + sd->sd_scsi_inquiry = sr_raid_inquiry; + sd->sd_scsi_read_cap = sr_raid_read_cap; + sd->sd_scsi_tur = sr_raid_tur; + sd->sd_scsi_req_sense = sr_raid_request_sense; + sd->sd_scsi_start_stop = sr_raid_start_stop; + sd->sd_scsi_sync = sr_raid_sync; + sd->sd_scsi_rw = sr_raidp_rw; + sd->sd_set_chunk_state = sr_raidp_set_chunk_state; + sd->sd_set_vol_state = sr_raidp_set_vol_state; +} + +int +sr_raidp_alloc_resources(struct sr_discipline *sd) +{ + int rv = EINVAL; + + if (!sd) + return (rv); + + DNPRINTF(SR_D_DIS, "%s: sr_raidp_alloc_resources\n", + DEVNAME(sd->sd_sc)); + + if (sr_wu_alloc(sd)) + goto bad; + if (sr_ccb_alloc(sd)) + goto bad; + + rv = 0; +bad: + return (rv); +} + +int +sr_raidp_free_resources(struct sr_discipline *sd) +{ + int rv = EINVAL; + + if (!sd) + return (rv); + + DNPRINTF(SR_D_DIS, "%s: sr_raidp_free_resources\n", + DEVNAME(sd->sd_sc)); + + sr_wu_free(sd); + sr_ccb_free(sd); + + rv = 0; + return (rv); +} + +void +sr_raidp_set_chunk_state(struct sr_discipline *sd, int c, int new_state) +{ + int old_state, s; + + DNPRINTF(SR_D_STATE, "%s: %s: %s: sr_raid_set_chunk_state %d -> %d\n", + DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, + sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname, c, new_state); + + /* ok to go to splbio since this only happens in error path */ + s = splbio(); + old_state = sd->sd_vol.sv_chunks[c]->src_meta.scm_status; + + /* multiple IOs to the same chunk that fail will come through here */ + if (old_state == new_state) + goto done; + + switch (old_state) { + case BIOC_SDONLINE: + switch (new_state) { + case BIOC_SDOFFLINE: + break; + case BIOC_SDSCRUB: + break; + default: + goto die; + } + break; + + case BIOC_SDOFFLINE: + if (new_state == BIOC_SDREBUILD) { + ; + } else + goto die; + break; + + case BIOC_SDSCRUB: + if (new_state == BIOC_SDONLINE) { + ; + } else + goto die; + break; + + case BIOC_SDREBUILD: + if (new_state == BIOC_SDONLINE) { + ; + } else + goto die; + break; + + case BIOC_SDHOTSPARE: + if (new_state == BIOC_SDREBUILD) { + ; + } else + goto die; + break; + + default: +die: + splx(s); /* XXX */ + panic("%s: %s: %s: invalid chunk state transition " + "%d -> %d\n", DEVNAME(sd->sd_sc), + sd->sd_meta->ssd_devname, + sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname, + old_state, new_state); + /* NOTREACHED */ + } + + sd->sd_vol.sv_chunks[c]->src_meta.scm_status = new_state; + sd->sd_set_vol_state(sd); + + sd->sd_must_flush = 1; + workq_add_task(NULL, 0, sr_meta_save_callback, sd, NULL); +done: + splx(s); +} + +void +sr_raidp_set_vol_state(struct sr_discipline *sd) +{ + int states[SR_MAX_STATES]; + int new_state, i, s, nd; + int old_state = sd->sd_vol_status; + + DNPRINTF(SR_D_STATE, "%s: %s: sr_raid_set_vol_state\n", + DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname); + + nd = sd->sd_meta->ssdi.ssd_chunk_no; + + for (i = 0; i < SR_MAX_STATES; i++) + states[i] = 0; + + for (i = 0; i < nd; i++) { + s = sd->sd_vol.sv_chunks[i]->src_meta.scm_status; + if (s >= SR_MAX_STATES) + panic("%s: %s: %s: invalid chunk state", + DEVNAME(sd->sd_sc), + sd->sd_meta->ssd_devname, + sd->sd_vol.sv_chunks[i]->src_meta.scmi.scm_devname); + states[s]++; + } + + if (states[BIOC_SDONLINE] == nd) + new_state = BIOC_SVONLINE; + else if (states[BIOC_SDONLINE] == 0) + new_state = BIOC_SVOFFLINE; + else if (states[BIOC_SDSCRUB] != 0) + new_state = BIOC_SVSCRUB; + else if (states[BIOC_SDREBUILD] != 0) + new_state = BIOC_SVREBUILD; + else if (states[BIOC_SDOFFLINE] != 0) + new_state = BIOC_SVDEGRADED; + else { + printf("old_state = %d, ", old_state); + for (i = 0; i < nd; i++) + printf("%d = %d, ", i, + sd->sd_vol.sv_chunks[i]->src_meta.scm_status); + panic("invalid new_state"); + } + + DNPRINTF(SR_D_STATE, "%s: %s: sr_raid_set_vol_state %d -> %d\n", + DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, + old_state, new_state); + + switch (old_state) { + case BIOC_SVONLINE: + switch (new_state) { + case BIOC_SVONLINE: /* can go to same state */ + case BIOC_SVOFFLINE: + case BIOC_SVDEGRADED: + case BIOC_SVREBUILD: /* happens on boot */ + break; + default: + goto die; + } + break; + + case BIOC_SVOFFLINE: + /* XXX this might be a little too much */ + goto die; + + case BIOC_SVSCRUB: + switch (new_state) { + case BIOC_SVONLINE: + case BIOC_SVOFFLINE: + case BIOC_SVDEGRADED: + case BIOC_SVSCRUB: /* can go to same state */ + break; + default: + goto die; + } + break; + + case BIOC_SVBUILDING: + switch (new_state) { + case BIOC_SVONLINE: + case BIOC_SVOFFLINE: + case BIOC_SVBUILDING: /* can go to the same state */ + break; + default: + goto die; + } + break; + + case BIOC_SVREBUILD: + switch (new_state) { + case BIOC_SVONLINE: + case BIOC_SVOFFLINE: + case BIOC_SVDEGRADED: + case BIOC_SVREBUILD: /* can go to the same state */ + break; + default: + goto die; + } + break; + + case BIOC_SVDEGRADED: + switch (new_state) { + case BIOC_SVOFFLINE: + case BIOC_SVREBUILD: + case BIOC_SVDEGRADED: /* can go to the same state */ + break; + default: + goto die; + } + break; + + default: +die: + panic("%s: %s: invalid volume state transition " + "%d -> %d\n", DEVNAME(sd->sd_sc), + sd->sd_meta->ssd_devname, + old_state, new_state); + /* NOTREACHED */ + } + + sd->sd_vol_status = new_state; +} + +int +sr_raidp_rw(struct sr_workunit *wu) +{ +#if 0 + struct sr_discipline *sd = wu->swu_dis; + struct scsi_xfer *xs = wu->swu_xs; +#endif + int s; + daddr64_t blk; +#if 0 + struct sr_ccb *ccb; + struct sr_chunk *scp; + int ios, x, i, s, rt; +#endif + /* blk and scsi error will be handled by sr_validate_io */ + if (sr_validate_io(wu, &blk, "sr_raidp_rw")) + goto bad; + + /* calculate physical block */ + blk += SR_META_SIZE + SR_META_OFFSET; + + /* XXX replace this with raid 4 & 5! */ + goto bad; +#if 0 + if (xs->flags & SCSI_DATA_IN) + ios = 1; + else + ios = sd->sd_meta->ssdi.ssd_chunk_no; + wu->swu_io_count = ios; + + for (i = 0; i < ios; i++) { + ccb = sr_ccb_get(sd); + if (!ccb) { + /* should never happen but handle more gracefully */ + printf("%s: %s: too many ccbs queued\n", + DEVNAME(sd->sd_sc), + sd->sd_meta->ssd_devname); + goto bad; + } + + if (xs->flags & SCSI_POLL) { + ccb->ccb_buf.b_flags = 0; + ccb->ccb_buf.b_iodone = NULL; + } else { + ccb->ccb_buf.b_flags = B_CALL; + ccb->ccb_buf.b_iodone = sr_raidp_intr; + } + + ccb->ccb_buf.b_flags |= B_PHYS; + ccb->ccb_buf.b_blkno = blk; + ccb->ccb_buf.b_bcount = xs->datalen; + ccb->ccb_buf.b_bufsize = xs->datalen; + ccb->ccb_buf.b_resid = xs->datalen; + ccb->ccb_buf.b_data = xs->data; + ccb->ccb_buf.b_error = 0; + ccb->ccb_buf.b_proc = curproc; + ccb->ccb_wu = wu; + + if (xs->flags & SCSI_DATA_IN) { + rt = 0; +ragain: + /* interleave reads */ + x = sd->mds.mdd_raidp.sr1_counter++ % + sd->sd_meta->ssdi.ssd_chunk_no; + scp = sd->sd_vol.sv_chunks[x]; + switch (scp->src_meta.scm_status) { + case BIOC_SDONLINE: + case BIOC_SDSCRUB: + ccb->ccb_buf.b_flags |= B_READ; + break; + + case BIOC_SDOFFLINE: + case BIOC_SDREBUILD: + case BIOC_SDHOTSPARE: + if (rt++ < sd->sd_meta->ssdi.ssd_chunk_no) + goto ragain; + + /* FALLTHROUGH */ + default: + /* volume offline */ + printf("%s: is offline, can't read\n", + DEVNAME(sd->sd_sc)); + sr_ccb_put(ccb); + goto bad; + } + } else { + /* writes go on all working disks */ + x = i; + scp = sd->sd_vol.sv_chunks[x]; + switch (scp->src_meta.scm_status) { + case BIOC_SDONLINE: + case BIOC_SDSCRUB: + case BIOC_SDREBUILD: + ccb->ccb_buf.b_flags |= B_WRITE; + break; + + case BIOC_SDHOTSPARE: /* should never happen */ + case BIOC_SDOFFLINE: + wu->swu_io_count--; + sr_ccb_put(ccb); + continue; + + default: + goto bad; + } + + } + ccb->ccb_target = x; + ccb->ccb_buf.b_dev = sd->sd_vol.sv_chunks[x]->src_dev_mm; + ccb->ccb_buf.b_vp = NULL; + + LIST_INIT(&ccb->ccb_buf.b_dep); + + TAILQ_INSERT_TAIL(&wu->swu_ccb, ccb, ccb_link); + + DNPRINTF(SR_D_DIS, "%s: %s: sr_raidp: b_bcount: %d " + "b_blkno: %x b_flags 0x%0x b_data %p\n", + DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, + ccb->ccb_buf.b_bcount, ccb->ccb_buf.b_blkno, + ccb->ccb_buf.b_flags, ccb->ccb_buf.b_data); + } +#endif + s = splbio(); + + /* rebuild io, let rebuild routine deal with it */ + if (wu->swu_flags & SR_WUF_REBUILD) + goto queued; + + /* current io failed, restart */ + if (wu->swu_state == SR_WU_RESTART) + goto start; + + /* deferred io failed, don't restart */ + if (wu->swu_state == SR_WU_REQUEUE) + goto queued; + + if (sr_check_io_collision(wu)) + goto queued; + +start: + sr_raid_startwu(wu); +queued: + splx(s); + return (0); +bad: + /* wu is unwound by sr_wu_put */ + return (1); +} + +void +sr_raidp_intr(struct buf *bp) +{ + struct sr_ccb *ccb = (struct sr_ccb *)bp; + struct sr_workunit *wu = ccb->ccb_wu, *wup; + struct sr_discipline *sd = wu->swu_dis; + struct scsi_xfer *xs = wu->swu_xs; + struct sr_softc *sc = sd->sd_sc; + int s, pend; + + DNPRINTF(SR_D_INTR, "%s: sr_intr bp %x xs %x\n", + DEVNAME(sc), bp, xs); + + DNPRINTF(SR_D_INTR, "%s: sr_intr: b_bcount: %d b_resid: %d" + " b_flags: 0x%0x block: %lld target: %d\n", DEVNAME(sc), + ccb->ccb_buf.b_bcount, ccb->ccb_buf.b_resid, ccb->ccb_buf.b_flags, + ccb->ccb_buf.b_blkno, ccb->ccb_target); + + s = splbio(); + + if (ccb->ccb_buf.b_flags & B_ERROR) { + DNPRINTF(SR_D_INTR, "%s: i/o error on block %lld target: %d\n", + DEVNAME(sc), ccb->ccb_buf.b_blkno, ccb->ccb_target); + wu->swu_ios_failed++; + ccb->ccb_state = SR_CCB_FAILED; + if (ccb->ccb_target != -1) + sd->sd_set_chunk_state(sd, ccb->ccb_target, + BIOC_SDOFFLINE); + else + panic("%s: invalid target on wu: %p", DEVNAME(sc), wu); + } else { + ccb->ccb_state = SR_CCB_OK; + wu->swu_ios_succeeded++; + } + wu->swu_ios_complete++; + + DNPRINTF(SR_D_INTR, "%s: sr_intr: comp: %d count: %d failed: %d\n", + DEVNAME(sc), wu->swu_ios_complete, wu->swu_io_count, + wu->swu_ios_failed); + + if (wu->swu_ios_complete >= wu->swu_io_count) { + /* if all ios failed, retry reads and give up on writes */ + if (wu->swu_ios_failed == wu->swu_ios_complete) { + if (xs->flags & SCSI_DATA_IN) { + printf("%s: retrying read on block %lld\n", + DEVNAME(sc), ccb->ccb_buf.b_blkno); + sr_ccb_put(ccb); + TAILQ_INIT(&wu->swu_ccb); + wu->swu_state = SR_WU_RESTART; + if (sd->sd_scsi_rw(wu)) + goto bad; + else + goto retry; + } else { + printf("%s: permanently fail write on block " + "%lld\n", DEVNAME(sc), + ccb->ccb_buf.b_blkno); + xs->error = XS_DRIVER_STUFFUP; + goto bad; + } + } + + xs->error = XS_NOERROR; + xs->resid = 0; + xs->flags |= ITSDONE; + + pend = 0; + TAILQ_FOREACH(wup, &sd->sd_wu_pendq, swu_link) { + if (wu == wup) { + /* wu on pendq, remove */ + TAILQ_REMOVE(&sd->sd_wu_pendq, wu, swu_link); + pend = 1; + + if (wu->swu_collider) { + if (wu->swu_ios_failed) + /* toss all ccbs and recreate */ + sr_raidp_recreate_wu(wu->swu_collider); + + /* restart deferred wu */ + wu->swu_collider->swu_state = + SR_WU_INPROGRESS; + TAILQ_REMOVE(&sd->sd_wu_defq, + wu->swu_collider, swu_link); + sr_raid_startwu(wu->swu_collider); + } + break; + } + } + + if (!pend) + printf("%s: wu: %p not on pending queue\n", + DEVNAME(sc), wu); + + if (wu->swu_flags & SR_WUF_REBUILD) { + if (wu->swu_xs->flags & SCSI_DATA_OUT) { + wu->swu_flags |= SR_WUF_REBUILDIOCOMP; + wakeup(wu); + } + } else { + /* do not change the order of these 2 functions */ + sr_wu_put(wu); + scsi_done(xs); + } + + if (sd->sd_sync && sd->sd_wu_pending == 0) + wakeup(sd); + } + +retry: + splx(s); + return; +bad: + xs->error = XS_DRIVER_STUFFUP; + xs->flags |= ITSDONE; + if (wu->swu_flags & SR_WUF_REBUILD) { + wu->swu_flags |= SR_WUF_REBUILDIOCOMP; + wakeup(wu); + } else { + /* do not change the order of these 2 functions */ + sr_wu_put(wu); + scsi_done(xs); + } + + splx(s); +} + +void +sr_raidp_recreate_wu(struct sr_workunit *wu) +{ + struct sr_discipline *sd = wu->swu_dis; + struct sr_workunit *wup = wu; + struct sr_ccb *ccb; + + do { + DNPRINTF(SR_D_INTR, "%s: sr_raidp_recreate_wu: %p\n", wup); + + /* toss all ccbs */ + while ((ccb = TAILQ_FIRST(&wup->swu_ccb)) != NULL) { + TAILQ_REMOVE(&wup->swu_ccb, ccb, ccb_link); + sr_ccb_put(ccb); + } + TAILQ_INIT(&wup->swu_ccb); + + /* recreate ccbs */ + wup->swu_state = SR_WU_REQUEUE; + if (sd->sd_scsi_rw(wup)) + panic("could not requeue io"); + + wup = wup->swu_collider; + } while (wup); +} |