author | Niklas Hallqvist <niklas@cvs.openbsd.org> | 1999-01-11 14:29:56 +0000
---|---|---
committer | Niklas Hallqvist <niklas@cvs.openbsd.org> | 1999-01-11 14:29:56 +0000
commit | 5a29b52d01b420bb61a3112d2d44740a0fa99601 |
tree | 7d6238740f53a56f5c76ba8256c785b13caaa24a /sys/dev/raidframe |
parent | 799a3ea9a9c07e091f5f4e62273c6f105cf86191 |
Import of CMU's RAIDframe via NetBSD.
Diffstat (limited to 'sys/dev/raidframe')
155 files changed, 56247 insertions, 0 deletions
diff --git a/sys/dev/raidframe/rf_acctrace.c b/sys/dev/raidframe/rf_acctrace.c new file mode 100644 index 00000000000..8e3c7a9b26a --- /dev/null +++ b/sys/dev/raidframe/rf_acctrace.c @@ -0,0 +1,295 @@ +/* $OpenBSD: rf_acctrace.c,v 1.1 1999/01/11 14:28:58 niklas Exp $ */ +/* $NetBSD: rf_acctrace.c,v 1.1 1998/11/13 04:20:26 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/***************************************************************************** + * + * acctrace.c -- code to support collecting information about each access + * + *****************************************************************************/ + +/* : + * Log: rf_acctrace.c,v + * Revision 1.29 1996/07/27 23:36:08 jimz + * Solaris port of simulator + * + * Revision 1.28 1996/07/17 21:00:58 jimz + * clean up timer interface, tracing + * + * Revision 1.27 1996/06/14 14:35:24 jimz + * clean up dfstrace protection + * + * Revision 1.26 1996/06/13 19:09:04 jimz + * remove trace.dat file before beginning + * + * Revision 1.25 1996/06/12 04:41:26 jimz + * tweaks to make genplot work with user-level driver + * (mainly change stat collection) + * + * Revision 1.24 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). + * + * Revision 1.23 1996/06/09 02:36:46 jimz + * lots of little crufty cleanup- fixup whitespace + * issues, comment #ifdefs, improve typing in some + * places (esp size-related) + * + * Revision 1.22 1996/06/05 18:06:02 jimz + * Major code cleanup. The Great Renaming is now done. + * Better modularity. Better typing. Fixed a bunch of + * synchronization bugs. Made a lot of global stuff + * per-desc or per-array. Removed dead code. 
+ * + * Revision 1.21 1996/05/31 22:26:54 jimz + * fix a lot of mapping problems, memory allocation problems + * found some weird lock issues, fixed 'em + * more code cleanup + * + * Revision 1.20 1996/05/30 23:22:16 jimz + * bugfixes of serialization, timing problems + * more cleanup + * + * Revision 1.19 1996/05/30 12:59:18 jimz + * make etimer happier, more portable + * + * Revision 1.18 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.17 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.16 1996/05/20 16:15:49 jimz + * switch to rf_{mutex,cond}_{init,destroy} + * + * Revision 1.15 1996/05/18 20:10:00 jimz + * bit of cleanup to compile cleanly in kernel, once again + * + * Revision 1.14 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.13 1995/11/30 16:26:43 wvcii + * added copyright info + * + */ + +#ifdef _KERNEL +#define KERNEL +#endif + +#include "rf_threadstuff.h" +#include "rf_types.h" +#include <sys/stat.h> +#include <sys/types.h> + +#ifdef KERNEL +#if !defined(__NetBSD__) && !defined(__OpenBSD__) +#include <dfstrace.h> +#endif /* !__NetBSD__ && !__OpenBSD__ */ +#if DFSTRACE > 0 +#include <sys/dfs_log.h> +#include <sys/dfstracebuf.h> +#endif /* DFSTRACE > 0 */ +#endif /* KERNEL */ + +#include "rf_debugMem.h" +#include "rf_acctrace.h" +#include "rf_general.h" +#include "rf_raid.h" +#include "rf_etimer.h" +#include "rf_hist.h" +#include "rf_shutdown.h" +#include "rf_sys.h" + +static long numTracesSoFar; +static int accessTraceBufCount = 0; +static RF_AccTraceEntry_t *access_tracebuf; +static long traceCount; + +int rf_stopCollectingTraces; +RF_DECLARE_MUTEX(rf_tracing_mutex) +int rf_trace_fd; + +static void rf_ShutdownAccessTrace(void *); + +static void rf_ShutdownAccessTrace(ignored) + void *ignored; +{ + if (rf_accessTraceBufSize) { + if (accessTraceBufCount) rf_FlushAccessTraceBuf(); +#ifndef KERNEL + close(rf_trace_fd); +#endif /* !KERNEL */ + RF_Free(access_tracebuf, rf_accessTraceBufSize * sizeof(RF_AccTraceEntry_t)); + } + rf_mutex_destroy(&rf_tracing_mutex); +#if defined(KERNEL) && DFSTRACE > 0 + printf("RAIDFRAME: %d trace entries were sent to dfstrace\n",traceCount); +#endif /* KERNEL && DFSTRACE > 0 */ +} + +int rf_ConfigureAccessTrace(listp) + RF_ShutdownList_t **listp; +{ + int rc; + + numTracesSoFar = accessTraceBufCount = rf_stopCollectingTraces = 0; + if (rf_accessTraceBufSize) { + RF_Malloc(access_tracebuf, rf_accessTraceBufSize * sizeof(RF_AccTraceEntry_t), (RF_AccTraceEntry_t *)); + accessTraceBufCount = 0; +#ifndef KERNEL + rc = unlink("trace.dat"); + if (rc && (errno != ENOENT)) { + perror("unlink"); + RF_ERRORMSG("Unable to remove existing trace.dat\n"); + return(errno); + } + if ((rf_trace_fd = open("trace.dat",O_WRONLY|O_CREAT|O_TRUNC, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH)) < 0 ) { + perror("Unable to open trace.dat for output"); + return(errno); + } +#endif /* !KERNEL */ + } + traceCount = 0; + numTracesSoFar = 0; + rc = rf_mutex_init(&rf_tracing_mutex); + if (rc) { + RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__, + __LINE__, rc); + } + rc = rf_ShutdownCreate(listp, rf_ShutdownAccessTrace, NULL); + if (rc) { + RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n", __FILE__, + __LINE__, rc); + if (rf_accessTraceBufSize) { + 
RF_Free(access_tracebuf, rf_accessTraceBufSize * sizeof(RF_AccTraceEntry_t)); +#ifndef KERNEL + close(rf_trace_fd); +#endif /* !KERNEL */ + rf_mutex_destroy(&rf_tracing_mutex); + } + } + return(rc); +} + +/* install a trace record. cause a flush to disk or to the trace collector daemon + * if the trace buffer is at least 1/2 full. + */ +void rf_LogTraceRec(raid, rec) + RF_Raid_t *raid; + RF_AccTraceEntry_t *rec; +{ + RF_AccTotals_t *acc = &raid->acc_totals; +#if 0 + RF_Etimer_t timer; + int i, n; +#endif + + if (rf_stopCollectingTraces || ((rf_maxNumTraces >= 0) && (numTracesSoFar >= rf_maxNumTraces))) + return; + +#ifndef KERNEL + if (rf_accessTraceBufSize) { + RF_LOCK_MUTEX(rf_tracing_mutex); + numTracesSoFar++; + bcopy((char *)rec, (char *)&access_tracebuf[ accessTraceBufCount++ ], sizeof(RF_AccTraceEntry_t)); + if (accessTraceBufCount == rf_accessTraceBufSize) + rf_FlushAccessTraceBuf(); + RF_UNLOCK_MUTEX(rf_tracing_mutex); + } +#endif /* !KERNEL */ +#if defined(KERNEL) && DFSTRACE > 0 + rec->index = traceCount++; + if (traceon & DFS_TRACE_RAIDFRAME) { + dfs_log(DFS_NOTE, (char *) rec, (int) sizeof(*rec), 0); + } +#endif /* KERNEL && DFSTRACE > 0 */ + /* update AccTotals for this device */ + if (!raid->keep_acc_totals) + return; + acc->num_log_ents++; + if (rec->reconacc) { + acc->recon_start_to_fetch_us += rec->specific.recon.recon_start_to_fetch_us; + acc->recon_fetch_to_return_us += rec->specific.recon.recon_fetch_to_return_us; + acc->recon_return_to_submit_us += rec->specific.recon.recon_return_to_submit_us; + acc->recon_num_phys_ios += rec->num_phys_ios; + acc->recon_phys_io_us += rec->phys_io_us; + acc->recon_diskwait_us += rec->diskwait_us; + acc->recon_reccount++; + } + else { + RF_HIST_ADD(acc->tot_hist, rec->total_us); + RF_HIST_ADD(acc->dw_hist, rec->diskwait_us); + /* count of physical ios which are too big. often due to thermal recalibration */ + /* if bigvals > 0, you should probably ignore this data set */ + if (rec->diskwait_us > 100000) + acc->bigvals++; + acc->total_us += rec->total_us; + acc->suspend_ovhd_us += rec->specific.user.suspend_ovhd_us; + acc->map_us += rec->specific.user.map_us; + acc->lock_us += rec->specific.user.lock_us; + acc->dag_create_us += rec->specific.user.dag_create_us; + acc->dag_retry_us += rec->specific.user.dag_retry_us; + acc->exec_us += rec->specific.user.exec_us; + acc->cleanup_us += rec->specific.user.cleanup_us; + acc->exec_engine_us += rec->specific.user.exec_engine_us; + acc->xor_us += rec->xor_us; + acc->q_us += rec->q_us; + acc->plog_us += rec->plog_us; + acc->diskqueue_us += rec->diskqueue_us; + acc->diskwait_us += rec->diskwait_us; + acc->num_phys_ios += rec->num_phys_ios; + acc->phys_io_us = rec->phys_io_us; + acc->user_reccount++; + } +} + + +/* assumes the tracing mutex is locked at entry. In order to allow this to be called + * from interrupt context, we don't do any copyouts here, but rather just wake trace + * buffer collector thread. + */ +void rf_FlushAccessTraceBuf() +{ +#ifndef KERNEL + int size = accessTraceBufCount * sizeof(RF_AccTraceEntry_t); + + if (write(rf_trace_fd, (char *) access_tracebuf, size) < size ) { + fprintf(stderr, "Unable to write traces to file. 
tracing disabled\n"); + RF_Free(access_tracebuf, rf_accessTraceBufSize * sizeof(RF_AccTraceEntry_t)); + rf_accessTraceBufSize = 0; + close(rf_trace_fd); + } +#endif /* !KERNEL */ + accessTraceBufCount = 0; +} diff --git a/sys/dev/raidframe/rf_acctrace.h b/sys/dev/raidframe/rf_acctrace.h new file mode 100644 index 00000000000..0b3441e3e49 --- /dev/null +++ b/sys/dev/raidframe/rf_acctrace.h @@ -0,0 +1,196 @@ +/* $OpenBSD: rf_acctrace.h,v 1.1 1999/01/11 14:28:58 niklas Exp $ */ +/* $NetBSD: rf_acctrace.h,v 1.1 1998/11/13 04:20:26 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/***************************************************************************** + * + * acctrace.h -- header file for acctrace.c + * + *****************************************************************************/ + +/* : + * + * Log: rf_acctrace.h,v + * Revision 1.32 1996/08/02 15:12:38 jimz + * remove dead code + * + * Revision 1.31 1996/07/27 14:34:39 jimz + * remove bogus semicolon + * + * Revision 1.30 1996/07/18 22:57:14 jimz + * port simulator to AIX + * + * Revision 1.29 1996/07/17 21:00:58 jimz + * clean up timer interface, tracing + * + * Revision 1.28 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). + * + * Revision 1.27 1996/06/09 02:36:46 jimz + * lots of little crufty cleanup- fixup whitespace + * issues, comment #ifdefs, improve typing in some + * places (esp size-related) + * / + * + * Revision 1.26 1996/06/05 18:06:02 jimz + * Major code cleanup. The Great Renaming is now done. + * Better modularity. Better typing. Fixed a bunch of + * synchronization bugs. Made a lot of global stuff + * per-desc or per-array. Removed dead code. 
+ * + * Revision 1.25 1996/05/31 22:26:54 jimz + * fix a lot of mapping problems, memory allocation problems + * found some weird lock issues, fixed 'em + * more code cleanup + * + * Revision 1.24 1996/05/30 12:59:18 jimz + * make etimer happier, more portable + * + * Revision 1.23 1996/05/28 12:34:30 jimz + * nail down size of reconacc + * + * Revision 1.22 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.21 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.20 1996/05/02 14:57:24 jimz + * change to boolean_t + * + * Revision 1.19 1995/12/14 18:37:06 jimz + * convert to rf_types.h types + * + * Revision 1.18 1995/11/30 16:26:49 wvcii + * added copyright info + * + * Revision 1.17 1995/09/30 19:49:23 jimz + * add AccTotals structure, for capturing totals in kernel + * + * Revision 1.16 1995/09/12 00:20:55 wvcii + * added support for tracing disk queue time + * + * Revision 1.15 95/09/06 19:23:12 wvcii + * increased MAX_IOS_PER_TRACE_ENTRY from 1 to 4 + * + */ + +#ifndef _RF__RF_ACCTRACE_H_ +#define _RF__RF_ACCTRACE_H_ + +#include "rf_types.h" +#include "rf_hist.h" +#include "rf_etimer.h" + +typedef struct RF_user_acc_stats_s { + RF_uint64 suspend_ovhd_us; /* us spent mucking in the access-suspension code */ + RF_uint64 map_us; /* us spent mapping the access */ + RF_uint64 lock_us; /* us spent locking & unlocking stripes, including time spent blocked */ + RF_uint64 dag_create_us; /* us spent creating the DAGs */ + RF_uint64 dag_retry_us; /* _total_ us spent retrying the op -- not broken down into components */ + RF_uint64 exec_us; /* us spent in DispatchDAG */ + RF_uint64 exec_engine_us; /* us spent in engine, not including blocking time */ + RF_uint64 cleanup_us; /* us spent tearing down the dag & maps, and generally cleaning up */ +} RF_user_acc_stats_t; + +typedef struct RF_recon_acc_stats_s { + RF_uint32 recon_start_to_fetch_us; + RF_uint32 recon_fetch_to_return_us; + RF_uint32 recon_return_to_submit_us; +} RF_recon_acc_stats_t; + +typedef struct RF_acctrace_entry_s { + union { + RF_user_acc_stats_t user; + RF_recon_acc_stats_t recon; + } specific; + RF_uint8 reconacc; /* whether this is a tracerec for a user acc or a recon acc */ + RF_uint64 xor_us; /* us spent doing XORs */ + RF_uint64 q_us; /* us spent doing XORs */ + RF_uint64 plog_us; /* us spent waiting to stuff parity into log */ + RF_uint64 diskqueue_us; /* _total_ us spent in disk queue(s), incl concurrent ops */ + RF_uint64 diskwait_us; /* _total_ us spent waiting actually waiting on the disk, incl concurrent ops */ + RF_uint64 total_us; /* total us spent on this access */ + RF_uint64 num_phys_ios; /* number of physical I/Os invoked */ + RF_uint64 phys_io_us; /* time of physical I/O */ + RF_Etimer_t tot_timer; /* a timer used to compute total access time */ + RF_Etimer_t timer; /* a generic timer val for timing events that live across procedure boundaries */ + RF_Etimer_t recon_timer; /* generic timer for recon stuff */ + RF_uint64 index; +} RF_AccTraceEntry_t; + +typedef struct RF_AccTotals_s { + /* user acc stats */ + RF_uint64 suspend_ovhd_us; + RF_uint64 map_us; + RF_uint64 lock_us; + RF_uint64 dag_create_us; + RF_uint64 dag_retry_us; + RF_uint64 exec_us; + RF_uint64 exec_engine_us; + RF_uint64 cleanup_us; + RF_uint64 user_reccount; + /* recon acc stats */ + RF_uint64 recon_start_to_fetch_us; 
+ RF_uint64 recon_fetch_to_return_us; + RF_uint64 recon_return_to_submit_us; + RF_uint64 recon_io_overflow_count; + RF_uint64 recon_phys_io_us; + RF_uint64 recon_num_phys_ios; + RF_uint64 recon_diskwait_us; + RF_uint64 recon_reccount; + /* trace entry stats */ + RF_uint64 xor_us; + RF_uint64 q_us; + RF_uint64 plog_us; + RF_uint64 diskqueue_us; + RF_uint64 diskwait_us; + RF_uint64 total_us; + RF_uint64 num_log_ents; + RF_uint64 phys_io_overflow_count; + RF_uint64 num_phys_ios; + RF_uint64 phys_io_us; + RF_uint64 bigvals; + /* histograms */ + RF_Hist_t dw_hist[RF_HIST_NUM_BUCKETS]; + RF_Hist_t tot_hist[RF_HIST_NUM_BUCKETS]; +} RF_AccTotals_t; + +#if RF_UTILITY == 0 +RF_DECLARE_EXTERN_MUTEX(rf_tracing_mutex) +#endif /* RF_UTILITY == 0 */ + +int rf_ConfigureAccessTrace(RF_ShutdownList_t **listp); +void rf_LogTraceRec(RF_Raid_t *raid, RF_AccTraceEntry_t *rec); +void rf_FlushAccessTraceBuf(void); + +#endif /* !_RF__RF_ACCTRACE_H_ */ diff --git a/sys/dev/raidframe/rf_alloclist.c b/sys/dev/raidframe/rf_alloclist.c new file mode 100644 index 00000000000..5f0de4a4070 --- /dev/null +++ b/sys/dev/raidframe/rf_alloclist.c @@ -0,0 +1,294 @@ +/* $OpenBSD: rf_alloclist.c,v 1.1 1999/01/11 14:28:58 niklas Exp $ */ +/* $NetBSD: rf_alloclist.c,v 1.1 1998/11/13 04:20:26 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* + * Log: rf_alloclist.c,v + * Revision 1.28 1996/07/27 23:36:08 jimz + * Solaris port of simulator + * + * Revision 1.27 1996/06/12 03:29:54 jimz + * don't barf just because we can't create an alloclist + * + * Revision 1.26 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). + * + * Revision 1.25 1996/06/09 02:36:46 jimz + * lots of little crufty cleanup- fixup whitespace + * issues, comment #ifdefs, improve typing in some + * places (esp size-related) + * + * Revision 1.24 1996/06/05 18:06:02 jimz + * Major code cleanup. The Great Renaming is now done. + * Better modularity. Better typing. Fixed a bunch of + * synchronization bugs. Made a lot of global stuff + * per-desc or per-array. Removed dead code. 
+ * + * Revision 1.23 1996/05/30 23:22:16 jimz + * bugfixes of serialization, timing problems + * more cleanup + * + * Revision 1.22 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.21 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.20 1996/05/20 16:15:59 jimz + * switch to rf_{mutex,cond}_{init,destroy} + * + * Revision 1.19 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.18 1996/05/16 22:27:45 jimz + * get rid of surreal_MakeAllocList (what was that, anyway?) + * + * Revision 1.17 1995/12/12 18:10:06 jimz + * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT + * fix 80-column brain damage in comments + * + * Revision 1.16 1995/11/30 16:27:07 wvcii + * added copyright info + * + * Revision 1.15 1995/10/05 20:37:56 jimz + * assert non-NULLness of pointer to FREE in FreeAllocList() + * + * Revision 1.14 1995/06/11 20:11:24 holland + * changed fl_hist,miss_count from long to int to get around weird kernel bug + * + * Revision 1.13 1995/05/01 13:28:00 holland + * parity range locks, locking disk requests, recon+parityscan in kernel, etc. + * + * Revision 1.12 1995/04/21 19:13:04 holland + * minor change to avoid a syntax error on DO_FREE + * + * Revision 1.11 1995/02/17 19:39:56 holland + * added size param to all calls to Free(). + * this is ignored at user level, but necessary in the kernel. + * + * Revision 1.10 1995/02/10 18:08:07 holland + * added DO_FREE macro to fix what I broke during kernelization + * + * Revision 1.9 1995/02/10 17:34:10 holland + * kernelization changes + * + * Revision 1.8 1995/02/03 22:31:36 holland + * many changes related to kernelization + * + * Revision 1.7 1995/02/01 15:13:05 holland + * moved #include of general.h out of raid.h and into each file + * + * Revision 1.6 1995/01/11 19:27:02 holland + * many changes related to performance tuning + * + * Revision 1.5 1994/11/29 20:53:10 danner + * Marks mods + * + * Revision 1.3 1994/11/19 21:01:07 danner + * First merge with mark + * + * Revision 1.1.1.1 1994/11/19 20:23:38 danner + * First PQ checkin + * + * Revision 1.2 1994/11/16 15:45:35 danner + * fixed free bug in FreeAllocList + * + * + */ + +/**************************************************************************** + * + * Alloclist.c -- code to manipulate allocation lists + * + * an allocation list is just a list of AllocListElem structures. Each + * such structure contains a fixed-size array of pointers. Calling + * FreeAList() causes each pointer to be freed. 
+ * + ***************************************************************************/ + +#include "rf_types.h" +#include "rf_threadstuff.h" +#include "rf_alloclist.h" +#include "rf_debugMem.h" +#include "rf_etimer.h" +#include "rf_general.h" +#include "rf_shutdown.h" +#include "rf_sys.h" + +RF_DECLARE_STATIC_MUTEX(alist_mutex) +static unsigned int fl_hit_count, fl_miss_count; + +static RF_AllocListElem_t *al_free_list=NULL; +static int al_free_list_count; + +#define RF_AL_FREELIST_MAX 256 + +#ifndef KERNEL +#define DO_FREE(_p,_sz) free((_p)) +#else /* !KERNEL */ +#define DO_FREE(_p,_sz) RF_Free((_p),(_sz)) +#endif /* !KERNEL */ + +static void rf_ShutdownAllocList(void *); + +static void rf_ShutdownAllocList(ignored) + void *ignored; +{ + RF_AllocListElem_t *p, *pt; + + for (p = al_free_list; p; ) { + pt = p; + p = p->next; + DO_FREE(pt, sizeof(*pt)); + } + rf_mutex_destroy(&alist_mutex); + /* + printf("Alloclist: Free list hit count %lu (%lu %%) miss count %lu (%lu %%)\n", + fl_hit_count, (100*fl_hit_count)/(fl_hit_count+fl_miss_count), + fl_miss_count, (100*fl_miss_count)/(fl_hit_count+fl_miss_count)); + */ +} + +int rf_ConfigureAllocList(listp) + RF_ShutdownList_t **listp; +{ + int rc; + + rc = rf_mutex_init(&alist_mutex); + if (rc) { + RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__, + __LINE__, rc); + return(rc); + } + al_free_list = NULL; + fl_hit_count = fl_miss_count = al_free_list_count = 0; + rc = rf_ShutdownCreate(listp, rf_ShutdownAllocList, NULL); + if (rc) { + RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n", + __FILE__, __LINE__, rc); + rf_mutex_destroy(&alist_mutex); + return(rc); + } + return(0); +} + + +/* we expect the lists to have at most one or two elements, so we're willing + * to search for the end. If you ever observe the lists growing longer, + * increase POINTERS_PER_ALLOC_LIST_ELEMENT. + */ +void rf_real_AddToAllocList(l, p, size, lockflag) + RF_AllocListElem_t *l; + void *p; + int size; + int lockflag; +{ + RF_AllocListElem_t *newelem; + + for ( ; l->next; l=l->next) + RF_ASSERT(l->numPointers == RF_POINTERS_PER_ALLOC_LIST_ELEMENT); /* find end of list */ + + RF_ASSERT(l->numPointers >= 0 && l->numPointers <= RF_POINTERS_PER_ALLOC_LIST_ELEMENT); + if (l->numPointers == RF_POINTERS_PER_ALLOC_LIST_ELEMENT) { + newelem = rf_real_MakeAllocList(lockflag); + l->next = newelem; + l = newelem; + } + l->pointers[ l->numPointers ] = p; + l->sizes [ l->numPointers ] = size; + l->numPointers++; + +} + + +/* we use the debug_mem_mutex here because we need to lock it anyway to call free. + * this is probably a bug somewhere else in the code, but when I call malloc/free + * outside of any lock I have endless trouble with malloc appearing to return the + * same pointer twice. Since we have to lock it anyway, we might as well use it + * as the lock around the al_free_list. Note that we can't call Free with the + * debug_mem_mutex locked. 
+ */ +void rf_FreeAllocList(l) + RF_AllocListElem_t *l; +{ + int i; + RF_AllocListElem_t *temp, *p; + + for (p=l; p; p=p->next) { + RF_ASSERT(p->numPointers >= 0 && p->numPointers <= RF_POINTERS_PER_ALLOC_LIST_ELEMENT); + for (i=0; i<p->numPointers; i++) { + RF_ASSERT(p->pointers[i]); + RF_Free(p->pointers[i], p->sizes[i]); + } + } +#ifndef KERNEL + RF_LOCK_MUTEX(rf_debug_mem_mutex); +#endif /* !KERNEL */ + while (l) { + temp = l; + l = l->next; + if (al_free_list_count > RF_AL_FREELIST_MAX) {DO_FREE(temp, sizeof(*temp));} + else {temp->next = al_free_list; al_free_list = temp; al_free_list_count++;} + } +#ifndef KERNEL + RF_UNLOCK_MUTEX(rf_debug_mem_mutex); +#endif /* !KERNEL */ +} + +RF_AllocListElem_t *rf_real_MakeAllocList(lockflag) + int lockflag; +{ + RF_AllocListElem_t *p; + +#ifndef KERNEL + if (lockflag) { RF_LOCK_MUTEX(rf_debug_mem_mutex); } +#endif /* !KERNEL */ + if (al_free_list) {fl_hit_count++; p = al_free_list; al_free_list = p->next; al_free_list_count--;} + else { + fl_miss_count++; +#ifndef KERNEL + p = (RF_AllocListElem_t *) malloc(sizeof(RF_AllocListElem_t)); /* can't use Malloc at user level b/c we already locked the mutex */ +#else /* !KERNEL */ + RF_Malloc(p, sizeof(RF_AllocListElem_t), (RF_AllocListElem_t *)); /* no allocation locking in kernel, so this is fine */ +#endif /* !KERNEL */ + } +#ifndef KERNEL + if (lockflag) { RF_UNLOCK_MUTEX(rf_debug_mem_mutex); } +#endif /* !KERNEL */ + if (p == NULL) { + return(NULL); + } + bzero((char *)p, sizeof(RF_AllocListElem_t)); + return(p); +} diff --git a/sys/dev/raidframe/rf_alloclist.h b/sys/dev/raidframe/rf_alloclist.h new file mode 100644 index 00000000000..b33f7a46e8b --- /dev/null +++ b/sys/dev/raidframe/rf_alloclist.h @@ -0,0 +1,84 @@ +/* $OpenBSD: rf_alloclist.h,v 1.1 1999/01/11 14:28:59 niklas Exp $ */ +/* $NetBSD: rf_alloclist.h,v 1.1 1998/11/13 04:20:26 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/**************************************************************************** + * + * alloclist.h -- header file for alloclist.c + * + ***************************************************************************/ + +/* : + * Log: rf_alloclist.h,v + * Revision 1.11 1996/07/18 22:57:14 jimz + * port simulator to AIX + * + * Revision 1.10 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). 
+ * + * Revision 1.9 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.8 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.7 1995/11/30 16:27:13 wvcii + * added copyright info + * + */ + +#ifndef _RF__RF_ALLOCLIST_H_ +#define _RF__RF_ALLOCLIST_H_ + +#include "rf_types.h" + +#define RF_POINTERS_PER_ALLOC_LIST_ELEMENT 20 + +struct RF_AllocListElem_s { + void *pointers[RF_POINTERS_PER_ALLOC_LIST_ELEMENT]; + int sizes[RF_POINTERS_PER_ALLOC_LIST_ELEMENT]; + int numPointers; + RF_AllocListElem_t *next; +}; + +#define rf_MakeAllocList(_ptr_) _ptr_ = rf_real_MakeAllocList(1); +#define rf_AddToAllocList(_l_,_ptr_,_sz_) rf_real_AddToAllocList((_l_), (_ptr_), (_sz_), 1) + +int rf_ConfigureAllocList(RF_ShutdownList_t **listp); + +#if RF_UTILITY == 0 +void rf_real_AddToAllocList(RF_AllocListElem_t *l, void *p, int size, int lockflag); +void rf_FreeAllocList(RF_AllocListElem_t *l); +RF_AllocListElem_t *rf_real_MakeAllocList(int lockflag); +#endif /* RF_UTILITY == 0 */ + +#endif /* !_RF__RF_ALLOCLIST_H_ */ diff --git a/sys/dev/raidframe/rf_archs.h b/sys/dev/raidframe/rf_archs.h new file mode 100644 index 00000000000..6a4850829ce --- /dev/null +++ b/sys/dev/raidframe/rf_archs.h @@ -0,0 +1,211 @@ +/* $OpenBSD: rf_archs.h,v 1.1 1999/01/11 14:28:59 niklas Exp $ */ +/* $NetBSD: rf_archs.h,v 1.1 1998/11/13 04:20:26 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* rf_archs.h -- defines for which architectures you want to + * include is some particular build of raidframe. Unfortunately, + * it's difficult to exclude declustering, P+Q, and distributed + * sparing because the code is intermixed with RAID5 code. This + * should be fixed. + * + * this is really intended only for use in the kernel, where I + * am worried about the size of the object module. At user level and + * in the simulator, I don't really care that much, so all the + * architectures can be compiled together. Note that by itself, turning + * off these defines does not affect the size of the executable; you + * have to edit the makefile for that. + * + * comment out any line below to eliminate that architecture. + * the list below includes all the modules that can be compiled + * out. 
+ * + * : + * Log: rf_archs.h,v + * Revision 1.32 1996/08/20 23:05:40 jimz + * define RF_KEEP_DISKSTATS to 1 + * + * Revision 1.31 1996/07/31 15:34:04 jimz + * include evenodd + * + * Revision 1.30 1996/07/27 23:36:08 jimz + * Solaris port of simulator + * + * Revision 1.29 1996/07/26 20:11:46 jimz + * only define RF_DEMO for CMU_PDL + * + * Revision 1.28 1996/07/26 20:10:57 jimz + * define RF_CMU_PDL only if it isn't already defined + * + * Revision 1.27 1996/07/18 22:57:14 jimz + * port simulator to AIX + * + * Revision 1.26 1996/06/17 14:38:33 jimz + * properly #if out RF_DEMO code + * fix bug in MakeConfig that was causing weird behavior + * in configuration routines (config was not zeroed at start) + * clean up genplot handling of stacks + * + * Revision 1.25 1996/06/14 21:24:59 jimz + * turn on RF_CMU_PDL by default + * + * Revision 1.24 1996/06/13 20:41:57 jimz + * add RF_INCLUDE_QUEUE_RANDOM (0) + * + * Revision 1.23 1996/06/11 18:12:36 jimz + * get rid of JOIN operations + * use ThreadGroup stuff instead + * fix some allocation/deallocation and sync bugs + * + * Revision 1.22 1996/06/10 22:24:55 wvcii + * added symbols for enabling forward or backward error + * recovery experiments + * + * Revision 1.21 1996/06/05 18:06:02 jimz + * Major code cleanup. The Great Renaming is now done. + * Better modularity. Better typing. Fixed a bunch of + * synchronization bugs. Made a lot of global stuff + * per-desc or per-array. Removed dead code. + * + * Revision 1.20 1996/05/30 11:29:41 jimz + * Numerous bug fixes. Stripe lock release code disagreed with the taking code + * about when stripes should be locked (I made it consistent: no parity, no lock) + * There was a lot of extra serialization of I/Os which I've removed- a lot of + * it was to calculate values for the cache code, which is no longer with us. + * More types, function, macro cleanup. Added code to properly quiesce the array + * on shutdown. Made a lot of stuff array-specific which was (bogusly) general + * before. Fixed memory allocation, freeing bugs. + * + * Revision 1.19 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.18 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.17 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.16 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.15 1996/05/15 22:32:59 jimz + * remove cache and vs stuff + * + * Revision 1.14 1995/11/30 16:27:34 wvcii + * added copyright info + * + * Revision 1.13 1995/11/28 21:23:44 amiri + * added the interleaved declustering architecture + * ('I'), with distributed sparing. + * + * Revision 1.12 1995/11/17 16:59:45 amiri + * don't INCLUDE_CHAINDECLUSTER in the kernel + * source. 
+ * + * Revision 1.11 1995/11/16 16:15:21 amiri + * don't include RAID5 with rotated sparing (INCLUDE_RAID5_RS) in kernel + * + * Revision 1.10 1995/10/12 17:40:47 jimz + * define INCLUDE_LS + * + * Revision 1.9 1995/10/11 06:56:47 jimz + * define INCLUDE_VS (sanity check for compilation) + * + * Revision 1.8 1995/10/05 18:56:24 jimz + * don't INCLUDE_VS + * + * Revision 1.7 1995/10/04 03:51:20 wvcii + * added raid 1 + * + * Revision 1.6 1995/09/07 09:59:29 wvcii + * unstable archs conditionally defined for !KERNEL makes + * + * + */ + +#ifndef _RF__RF_ARCHS_H_ +#define _RF__RF_ARCHS_H_ + +/* + * Turn off if you do not have CMU PDL support compiled + * into your kernel. + */ +#ifndef RF_CMU_PDL +#define RF_CMU_PDL 0 +#endif /* !RF_CMU_PDL */ + +/* + * Khalil's performance-displaying demo stuff. + * Relies on CMU meter tools. + */ +#ifndef KERNEL +#if RF_CMU_PDL > 0 +#define RF_DEMO 1 +#endif /* RF_CMU_PDL > 0 */ +#endif /* !KERNEL */ + +#define RF_INCLUDE_EVENODD 1 + +#define RF_INCLUDE_RAID5_RS 1 +#define RF_INCLUDE_PARITYLOGGING 1 + +#define RF_INCLUDE_CHAINDECLUSTER 1 +#define RF_INCLUDE_INTERDECLUSTER 1 + +#define RF_INCLUDE_RAID0 1 +#define RF_INCLUDE_RAID1 1 +#define RF_INCLUDE_RAID4 1 +#define RF_INCLUDE_RAID5 1 +#define RF_INCLUDE_RAID6 0 +#define RF_INCLUDE_DECL_PQ 0 + +#define RF_MEMORY_REDZONES 0 +#define RF_RECON_STATS 1 + +#define RF_INCLUDE_QUEUE_RANDOM 0 + +#define RF_KEEP_DISKSTATS 1 + +/* These two symbols enable nonstandard forms of error recovery. + * These modes are only valid for performance measurements and + * data corruption will occur if an error occurs when either + * forward or backward error recovery are enabled. In general + * both of the following two definitions should be commented + * out--this forces RAIDframe to use roll-away error recovery + * which does guarantee proper error recovery without data corruption + */ +/* #define RF_FORWARD 1 */ +/* #define RF_BACKWARD 1 */ + +#include "rf_options.h" + +#endif /* !_RF__RF_ARCHS_H_ */ diff --git a/sys/dev/raidframe/rf_aselect.c b/sys/dev/raidframe/rf_aselect.c new file mode 100644 index 00000000000..f6a1918b7a5 --- /dev/null +++ b/sys/dev/raidframe/rf_aselect.c @@ -0,0 +1,618 @@ +/* $OpenBSD: rf_aselect.c,v 1.1 1999/01/11 14:28:59 niklas Exp $ */ +/* $NetBSD: rf_aselect.c,v 1.1 1998/11/13 04:20:26 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland, William V. Courtright II + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. 
+ */ + +/***************************************************************************** + * + * aselect.c -- algorithm selection code + * + *****************************************************************************/ +/* + * : + * Log: rf_aselect.c,v + * Revision 1.35 1996/07/28 20:31:39 jimz + * i386netbsd port + * true/false fixup + * + * Revision 1.34 1996/07/27 18:39:39 jimz + * cleanup sweep + * + * Revision 1.33 1996/07/22 19:52:16 jimz + * switched node params to RF_DagParam_t, a union of + * a 64-bit int and a void *, for better portability + * attempted hpux port, but failed partway through for + * lack of a single C compiler capable of compiling all + * source files + * + * Revision 1.32 1996/06/12 03:29:40 jimz + * Note: things that call InitHdrNode should check + * for successful return. + * + * Revision 1.31 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.30 1996/06/05 18:06:02 jimz + * Major code cleanup. The Great Renaming is now done. + * Better modularity. Better typing. Fixed a bunch of + * synchronization bugs. Made a lot of global stuff + * per-desc or per-array. Removed dead code. + * + * Revision 1.29 1996/05/31 22:26:54 jimz + * fix a lot of mapping problems, memory allocation problems + * found some weird lock issues, fixed 'em + * more code cleanup + * + * Revision 1.28 1996/05/30 11:29:41 jimz + * Numerous bug fixes. Stripe lock release code disagreed with the taking code + * about when stripes should be locked (I made it consistent: no parity, no lock) + * There was a lot of extra serialization of I/Os which I've removed- a lot of + * it was to calculate values for the cache code, which is no longer with us. + * More types, function, macro cleanup. Added code to properly quiesce the array + * on shutdown. Made a lot of stuff array-specific which was (bogusly) general + * before. Fixed memory allocation, freeing bugs. 
+ * + * Revision 1.27 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.26 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.25 1996/05/24 04:28:55 jimz + * release cleanup ckpt + * + * Revision 1.24 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.23 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.22 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.21 1996/05/08 21:01:24 jimz + * fixed up enum type names that were conflicting with other + * enums and function names (ie, "panic") + * future naming trends will be towards RF_ and rf_ for + * everything raidframe-related + * + * Revision 1.20 1996/05/03 19:45:35 wvcii + * removed includes of old deg creation files + * updated SelectAlgorithm comments + * + * Revision 1.19 1995/12/12 18:10:06 jimz + * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT + * fix 80-column brain damage in comments + * + * Revision 1.18 1995/11/30 16:27:48 wvcii + * added copyright info + * + * Revision 1.17 1995/11/19 16:25:55 wvcii + * SelectAlgorithm now creates an array, returned in desc->dagArray + * return value is now int (1 = FAIL) + * + * Revision 1.16 1995/11/17 15:09:58 wvcii + * fixed bug in SelectAlgorithm in which multiple graphs per stripe are required + * + * Revision 1.15 1995/11/07 17:12:42 wvcii + * changed SelectAlgorithm as follows: + * + * dag creation funcs now create term nodes + * dag selection funcs no longer return numHdrSucc, numTermAnt + * there is now one dag hdr for each dag in a request, implying + * that SelectAlgorithm now returns a linked list of dag hdrs + * + */ + +#include "rf_archs.h" +#include "rf_types.h" +#include "rf_raid.h" +#include "rf_dag.h" +#include "rf_dagutils.h" +#include "rf_dagfuncs.h" +#include "rf_general.h" +#include "rf_desc.h" +#include "rf_map.h" + +#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL) +/* the function below is not used... so don't define it! 
*/ +#else +static void TransferDagMemory(RF_DagHeader_t *, RF_DagHeader_t *); +#endif + +static int InitHdrNode(RF_DagHeader_t **, RF_Raid_t *, int); +static void UpdateNodeHdrPtr(RF_DagHeader_t *, RF_DagNode_t *); +int rf_SelectAlgorithm(RF_RaidAccessDesc_t *, RF_RaidAccessFlags_t ); + + +/****************************************************************************** + * + * Create and Initialiaze a dag header and termination node + * + *****************************************************************************/ +static int InitHdrNode(hdr, raidPtr, memChunkEnable) + RF_DagHeader_t **hdr; + RF_Raid_t *raidPtr; + int memChunkEnable; +{ + /* create and initialize dag hdr */ + *hdr = rf_AllocDAGHeader(); + rf_MakeAllocList((*hdr)->allocList); + if ((*hdr)->allocList == NULL) { + rf_FreeDAGHeader(*hdr); + return(ENOMEM); + } + (*hdr)->status = rf_enable; + (*hdr)->numSuccedents = 0; + (*hdr)->raidPtr = raidPtr; + (*hdr)->next = NULL; + return(0); +} + +/****************************************************************************** + * + * Transfer allocation list and mem chunks from one dag to another + * + *****************************************************************************/ +#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL) +/* the function below is not used... so don't define it! */ +#else +static void TransferDagMemory(daga, dagb) + RF_DagHeader_t *daga; + RF_DagHeader_t *dagb; +{ + RF_AccessStripeMapHeader_t *end; + RF_AllocListElem_t *p; + int i, memChunksXfrd = 0, xtraChunksXfrd = 0; + + /* transfer allocList from dagb to daga */ + for (p = dagb->allocList; p ; p = p->next) + { + for (i = 0; i < p->numPointers; i++) + { + rf_AddToAllocList(daga->allocList, p->pointers[i], p->sizes[i]); + p->pointers[i] = NULL; + p->sizes[i] = 0; + } + p->numPointers = 0; + } + + /* transfer chunks from dagb to daga */ + while ((memChunksXfrd + xtraChunksXfrd < dagb->chunkIndex + dagb->xtraChunkIndex) && (daga->chunkIndex < RF_MAXCHUNKS)) + { + /* stuff chunks into daga's memChunk array */ + if (memChunksXfrd < dagb->chunkIndex) + { + daga->memChunk[daga->chunkIndex++] = dagb->memChunk[memChunksXfrd]; + dagb->memChunk[memChunksXfrd++] = NULL; + } + else + { + daga->memChunk[daga->xtraChunkIndex++] = dagb->xtraMemChunk[xtraChunksXfrd]; + dagb->xtraMemChunk[xtraChunksXfrd++] = NULL; + } + } + /* use escape hatch to hold excess chunks */ + while (memChunksXfrd + xtraChunksXfrd < dagb->chunkIndex + dagb->xtraChunkIndex) { + if (memChunksXfrd < dagb->chunkIndex) + { + daga->xtraMemChunk[daga->xtraChunkIndex++] = dagb->memChunk[memChunksXfrd]; + dagb->memChunk[memChunksXfrd++] = NULL; + } + else + { + daga->xtraMemChunk[daga->xtraChunkIndex++] = dagb->xtraMemChunk[xtraChunksXfrd]; + dagb->xtraMemChunk[xtraChunksXfrd++] = NULL; + } + } + RF_ASSERT((memChunksXfrd == dagb->chunkIndex) && (xtraChunksXfrd == dagb->xtraChunkIndex)); + RF_ASSERT(daga->chunkIndex <= RF_MAXCHUNKS); + RF_ASSERT(daga->xtraChunkIndex <= daga->xtraChunkCnt); + dagb->chunkIndex = 0; + dagb->xtraChunkIndex = 0; + + /* transfer asmList from dagb to daga */ + if (dagb->asmList) + { + if (daga->asmList) + { + end = daga->asmList; + while (end->next) + end = end->next; + end->next = dagb->asmList; + } + else + daga->asmList = dagb->asmList; + dagb->asmList = NULL; + } +} +#endif /* __NetBSD__ || __OpenBSD__ */ + +/***************************************************************************************** + * + * Ensure that all node->dagHdr fields in a dag are consistent + * + * IMPORTANT: This routine recursively 
searches all succedents of the node. If a + * succedent is encountered whose dagHdr ptr does not require adjusting, that node's + * succedents WILL NOT BE EXAMINED. + * + ****************************************************************************************/ +static void UpdateNodeHdrPtr(hdr, node) + RF_DagHeader_t *hdr; + RF_DagNode_t *node; +{ + int i; + RF_ASSERT(hdr != NULL && node != NULL); + for (i = 0; i < node->numSuccedents; i++) + if (node->succedents[i]->dagHdr != hdr) + UpdateNodeHdrPtr(hdr, node->succedents[i]); + node->dagHdr = hdr; +} + +/****************************************************************************** + * + * Create a DAG to do a read or write operation. + * + * create an array of dagLists, one list per parity stripe. + * return the lists in the array desc->dagArray. + * + * Normally, each list contains one dag for the entire stripe. In some + * tricky cases, we break this into multiple dags, either one per stripe + * unit or one per block (sector). When this occurs, these dags are returned + * as a linked list (dagList) which is executed sequentially (to preserve + * atomic parity updates in the stripe). + * + * dags which operate on independent parity goups (stripes) are returned in + * independent dagLists (distinct elements in desc->dagArray) and may be + * executed concurrently. + * + * Finally, if the SelectionFunc fails to create a dag for a block, we punt + * and return 1. + * + * The above process is performed in two phases: + * 1) create an array(s) of creation functions (eg stripeFuncs) + * 2) create dags and concatenate/merge to form the final dag. + * + * Because dag's are basic blocks (single entry, single exit, unconditional + * control flow, we can add the following optimizations (future work): + * first-pass optimizer to allow max concurrency (need all data dependencies) + * second-pass optimizer to eliminate common subexpressions (need true + * data dependencies) + * third-pass optimizer to eliminate dead code (need true data dependencies) + *****************************************************************************/ + +#define MAXNSTRIPES 50 + +int rf_SelectAlgorithm(desc, flags) + RF_RaidAccessDesc_t *desc; + RF_RaidAccessFlags_t flags; +{ + RF_AccessStripeMapHeader_t *asm_h = desc->asmap; + RF_IoType_t type = desc->type; + RF_Raid_t *raidPtr = desc->raidPtr; + void *bp = desc->bp; + + RF_AccessStripeMap_t *asmap = asm_h->stripeMap; + RF_AccessStripeMap_t *asm_p; + RF_DagHeader_t *dag_h = NULL, *tempdag_h, *lastdag_h; + int i, j, k; + RF_VoidFuncPtr *stripeFuncs, normalStripeFuncs[MAXNSTRIPES]; + RF_AccessStripeMap_t *asm_up, *asm_bp; + RF_AccessStripeMapHeader_t ***asmh_u, *endASMList; + RF_AccessStripeMapHeader_t ***asmh_b; + RF_VoidFuncPtr **stripeUnitFuncs, uFunc; + RF_VoidFuncPtr **blockFuncs, bFunc; + int numStripesBailed = 0, cantCreateDAGs = RF_FALSE; + int numStripeUnitsBailed = 0; + int stripeNum, numUnitDags = 0, stripeUnitNum, numBlockDags = 0; + RF_StripeNum_t numStripeUnits; + RF_SectorNum_t numBlocks; + RF_RaidAddr_t address; + int length; + RF_PhysDiskAddr_t *physPtr; + caddr_t buffer; + + lastdag_h = NULL; + asmh_u = asmh_b = NULL; + stripeUnitFuncs = NULL; + blockFuncs = NULL; + + /* get an array of dag-function creation pointers, try to avoid calling malloc */ + if (asm_h->numStripes <= MAXNSTRIPES) stripeFuncs = normalStripeFuncs; + else RF_Calloc(stripeFuncs, asm_h->numStripes, sizeof(RF_VoidFuncPtr), (RF_VoidFuncPtr *)); + + /* walk through the asm list once collecting information */ + /* attempt to find a single 
creation function for each stripe */ + desc->numStripes = 0; + for (i=0,asm_p = asmap; asm_p; asm_p=asm_p->next,i++) { + desc->numStripes++; + (raidPtr->Layout.map->SelectionFunc)(raidPtr, type, asm_p, &stripeFuncs[i]); + /* check to see if we found a creation func for this stripe */ + if (stripeFuncs[i] == (RF_VoidFuncPtr) NULL) + { + /* could not find creation function for entire stripe + so, let's see if we can find one for each stripe unit in the stripe */ + + if (numStripesBailed == 0) + { + /* one stripe map header for each stripe we bail on */ + RF_Malloc(asmh_u, sizeof(RF_AccessStripeMapHeader_t **) * asm_h->numStripes, (RF_AccessStripeMapHeader_t ***)); + /* create an array of ptrs to arrays of stripeFuncs */ + RF_Calloc(stripeUnitFuncs, asm_h->numStripes, sizeof(RF_VoidFuncPtr), (RF_VoidFuncPtr **)); + } + + /* create an array of creation funcs (called stripeFuncs) for this stripe */ + numStripeUnits = asm_p->numStripeUnitsAccessed; + RF_Calloc(stripeUnitFuncs[numStripesBailed], numStripeUnits, sizeof(RF_VoidFuncPtr), (RF_VoidFuncPtr *)); + RF_Malloc(asmh_u[numStripesBailed], numStripeUnits * sizeof(RF_AccessStripeMapHeader_t *), (RF_AccessStripeMapHeader_t **)); + + /* lookup array of stripeUnitFuncs for this stripe */ + for (j=0, physPtr = asm_p->physInfo; physPtr; physPtr = physPtr->next, j++) + { + /* remap for series of single stripe-unit accesses */ + address = physPtr->raidAddress; + length = physPtr->numSector; + buffer = physPtr->bufPtr; + + asmh_u[numStripesBailed][j] = rf_MapAccess(raidPtr, address, length, buffer, RF_DONT_REMAP); + asm_up = asmh_u[numStripesBailed][j]->stripeMap; + + /* get the creation func for this stripe unit */ + (raidPtr->Layout.map-> SelectionFunc)(raidPtr, type, asm_up, &(stripeUnitFuncs[numStripesBailed][j])); + + /* check to see if we found a creation func for this stripe unit */ + if (stripeUnitFuncs[numStripesBailed][j] == (RF_VoidFuncPtr) NULL) + { + /* could not find creation function for stripe unit so, + let's see if we can find one for each block in the stripe unit */ + if (numStripeUnitsBailed == 0) + { + /* one stripe map header for each stripe unit we bail on */ + RF_Malloc(asmh_b, sizeof(RF_AccessStripeMapHeader_t **) * asm_h->numStripes * raidPtr->Layout.numDataCol, (RF_AccessStripeMapHeader_t ***)); + /* create an array of ptrs to arrays of blockFuncs */ + RF_Calloc(blockFuncs, asm_h->numStripes * raidPtr->Layout.numDataCol, sizeof(RF_VoidFuncPtr), (RF_VoidFuncPtr **)); + } + + /* create an array of creation funcs (called blockFuncs) for this stripe unit */ + numBlocks = physPtr->numSector; + numBlockDags += numBlocks; + RF_Calloc(blockFuncs[numStripeUnitsBailed], numBlocks, sizeof(RF_VoidFuncPtr), (RF_VoidFuncPtr *)); + RF_Malloc(asmh_b[numStripeUnitsBailed], numBlocks * sizeof(RF_AccessStripeMapHeader_t *), (RF_AccessStripeMapHeader_t **)); + + /* lookup array of blockFuncs for this stripe unit */ + for (k=0; k < numBlocks; k++) + { + /* remap for series of single stripe-unit accesses */ + address = physPtr->raidAddress + k; + length = 1; + buffer = physPtr->bufPtr + (k * (1<<raidPtr->logBytesPerSector)); + + asmh_b[numStripeUnitsBailed][k] = rf_MapAccess(raidPtr, address, length, buffer, RF_DONT_REMAP); + asm_bp = asmh_b[numStripeUnitsBailed][k]->stripeMap; + + /* get the creation func for this stripe unit */ + (raidPtr->Layout.map-> SelectionFunc)(raidPtr, type, asm_bp, &(blockFuncs[numStripeUnitsBailed][k])); + + /* check to see if we found a creation func for this stripe unit */ + if (blockFuncs[numStripeUnitsBailed][k] == 
NULL) + cantCreateDAGs = RF_TRUE; + } + numStripeUnitsBailed++; + } + else + { + numUnitDags++; + } + } + RF_ASSERT(j == numStripeUnits); + numStripesBailed++; + } + } + + if (cantCreateDAGs) + { + /* free memory and punt */ + if (asm_h->numStripes > MAXNSTRIPES) + RF_Free(stripeFuncs, asm_h->numStripes * sizeof(RF_VoidFuncPtr)); + if (numStripesBailed > 0) + { + stripeNum = 0; + for (i = 0, asm_p = asmap; asm_p; asm_p = asm_p->next, i++) + if (stripeFuncs[i] == NULL) + { + numStripeUnits = asm_p->numStripeUnitsAccessed; + for (j = 0; j < numStripeUnits; j++) + rf_FreeAccessStripeMap(asmh_u[stripeNum][j]); + RF_Free(asmh_u[stripeNum], numStripeUnits * sizeof(RF_AccessStripeMapHeader_t *)); + RF_Free(stripeUnitFuncs[stripeNum], numStripeUnits * sizeof(RF_VoidFuncPtr)); + stripeNum++; + } + RF_ASSERT(stripeNum == numStripesBailed); + RF_Free(stripeUnitFuncs, asm_h->numStripes * sizeof(RF_VoidFuncPtr)); + RF_Free(asmh_u, asm_h->numStripes * sizeof(RF_AccessStripeMapHeader_t **)); + } + return(1); + } + else + { + /* begin dag creation */ + stripeNum = 0; + stripeUnitNum = 0; + + /* create an array of dagLists and fill them in */ + RF_CallocAndAdd(desc->dagArray, desc->numStripes, sizeof(RF_DagList_t), (RF_DagList_t *), desc->cleanupList); + + for (i=0, asm_p = asmap; asm_p; asm_p=asm_p->next,i++) { + /* grab dag header for this stripe */ + dag_h = NULL; + desc->dagArray[i].desc = desc; + + if (stripeFuncs[i] == (RF_VoidFuncPtr) NULL) + { + /* use bailout functions for this stripe */ + for (j = 0, physPtr = asm_p->physInfo; physPtr; physPtr=physPtr->next, j++) + { + uFunc = stripeUnitFuncs[stripeNum][j]; + if (uFunc == (RF_VoidFuncPtr) NULL) + { + /* use bailout functions for this stripe unit */ + for (k = 0; k < physPtr->numSector; k++) + { + /* create a dag for this block */ + InitHdrNode(&tempdag_h, raidPtr, rf_useMemChunks); + desc->dagArray[i].numDags++; + if (dag_h == NULL) { + dag_h = tempdag_h; + } + else { + lastdag_h->next = tempdag_h; + } + lastdag_h = tempdag_h; + + bFunc = blockFuncs[stripeUnitNum][k]; + RF_ASSERT(bFunc); + asm_bp = asmh_b[stripeUnitNum][k]->stripeMap; + (*bFunc)(raidPtr, asm_bp, tempdag_h, bp, flags, tempdag_h->allocList); + } + stripeUnitNum++; + } + else + { + /* create a dag for this unit */ + InitHdrNode(&tempdag_h, raidPtr, rf_useMemChunks); + desc->dagArray[i].numDags++; + if (dag_h == NULL) { + dag_h = tempdag_h; + } + else { + lastdag_h->next = tempdag_h; + } + lastdag_h = tempdag_h; + + asm_up = asmh_u[stripeNum][j]->stripeMap; + (*uFunc)(raidPtr, asm_up, tempdag_h, bp, flags, tempdag_h->allocList); + } + } + RF_ASSERT(j == asm_p->numStripeUnitsAccessed); + /* merge linked bailout dag to existing dag collection */ + stripeNum++; + } + else { + /* Create a dag for this parity stripe */ + InitHdrNode(&tempdag_h, raidPtr, rf_useMemChunks); + desc->dagArray[i].numDags++; + if (dag_h == NULL) { + dag_h = tempdag_h; + } + else { + lastdag_h->next = tempdag_h; + } + lastdag_h = tempdag_h; + + (stripeFuncs[i])(raidPtr, asm_p, tempdag_h, bp, flags, tempdag_h->allocList); + } + desc->dagArray[i].dags = dag_h; + } + RF_ASSERT(i == desc->numStripes); + + /* free memory */ + if (asm_h->numStripes > MAXNSTRIPES) + RF_Free(stripeFuncs, asm_h->numStripes * sizeof(RF_VoidFuncPtr)); + if ((numStripesBailed > 0) || (numStripeUnitsBailed > 0)) + { + stripeNum = 0; + stripeUnitNum = 0; + if (dag_h->asmList) + { + endASMList = dag_h->asmList; + while (endASMList->next) + endASMList = endASMList->next; + } + else + endASMList = NULL; + /* walk through io, stripe by stripe */ + 
for (i = 0, asm_p = asmap; asm_p; asm_p = asm_p->next, i++) + if (stripeFuncs[i] == NULL) + { + numStripeUnits = asm_p->numStripeUnitsAccessed; + /* walk through stripe, stripe unit by stripe unit */ + for (j = 0, physPtr = asm_p->physInfo; physPtr; physPtr = physPtr->next, j++) + { + if (stripeUnitFuncs[stripeNum][j] == NULL) + { + numBlocks = physPtr->numSector; + /* walk through stripe unit, block by block */ + for (k = 0; k < numBlocks; k++) + if (dag_h->asmList == NULL) + { + dag_h->asmList = asmh_b[stripeUnitNum][k]; + endASMList = dag_h->asmList; + } + else + { + endASMList->next = asmh_b[stripeUnitNum][k]; + endASMList = endASMList->next; + } + RF_Free(asmh_b[stripeUnitNum], numBlocks * sizeof(RF_AccessStripeMapHeader_t *)); + RF_Free(blockFuncs[stripeUnitNum], numBlocks * sizeof(RF_VoidFuncPtr)); + stripeUnitNum++; + } + if (dag_h->asmList == NULL) + { + dag_h->asmList = asmh_u[stripeNum][j]; + endASMList = dag_h->asmList; + } + else + { + endASMList->next = asmh_u[stripeNum][j]; + endASMList = endASMList->next; + } + } + RF_Free(asmh_u[stripeNum], numStripeUnits * sizeof(RF_AccessStripeMapHeader_t *)); + RF_Free(stripeUnitFuncs[stripeNum], numStripeUnits * sizeof(RF_VoidFuncPtr)); + stripeNum++; + } + RF_ASSERT(stripeNum == numStripesBailed); + RF_Free(stripeUnitFuncs, asm_h->numStripes * sizeof(RF_VoidFuncPtr)); + RF_Free(asmh_u, asm_h->numStripes * sizeof(RF_AccessStripeMapHeader_t **)); + if (numStripeUnitsBailed > 0) + { + RF_ASSERT(stripeUnitNum == numStripeUnitsBailed); + RF_Free(blockFuncs, raidPtr->Layout.numDataCol * asm_h->numStripes * sizeof(RF_VoidFuncPtr)); + RF_Free(asmh_b, raidPtr->Layout.numDataCol * asm_h->numStripes * sizeof(RF_AccessStripeMapHeader_t **)); + } + } + return(0); + } +} diff --git a/sys/dev/raidframe/rf_aselect.h b/sys/dev/raidframe/rf_aselect.h new file mode 100644 index 00000000000..1b1d3e51795 --- /dev/null +++ b/sys/dev/raidframe/rf_aselect.h @@ -0,0 +1,60 @@ +/* $OpenBSD: rf_aselect.h,v 1.1 1999/01/11 14:29:00 niklas Exp $ */ +/* $NetBSD: rf_aselect.h,v 1.1 1998/11/13 04:20:26 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland, William V. Courtright II + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. 
+ */ + +/***************************************************************************** + * + * aselect.h -- header file for algorithm selection code + * + *****************************************************************************/ +/* : + * Log: rf_aselect.h,v + * Revision 1.5 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.4 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.3 1995/11/30 16:28:00 wvcii + * added copyright info + * + * Revision 1.2 1995/11/19 16:20:46 wvcii + * changed SelectAlgorithm prototype + * + */ + +#ifndef _RF__RF_ASELECT_H_ +#define _RF__RF_ASELECT_H_ + +#include "rf_desc.h" + +int rf_SelectAlgorithm(RF_RaidAccessDesc_t *desc, RF_RaidAccessFlags_t flags); + +#endif /* !_RF__RF_ASELECT_H_ */ diff --git a/sys/dev/raidframe/rf_callback.c b/sys/dev/raidframe/rf_callback.c new file mode 100644 index 00000000000..dffd52fc7a6 --- /dev/null +++ b/sys/dev/raidframe/rf_callback.c @@ -0,0 +1,121 @@ +/* $OpenBSD: rf_callback.c,v 1.1 1999/01/11 14:29:00 niklas Exp $ */ +/* $NetBSD: rf_callback.c,v 1.1 1998/11/13 04:20:26 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Jim Zelenka + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/***************************************************************************************** + * + * callback.c -- code to manipulate callback descriptor + * + ****************************************************************************************/ + +/* : + * Log: rf_callback.c,v + * Revision 1.11 1996/06/17 03:18:04 jimz + * include shutdown.h for macroized ShutdownCreate + * + * Revision 1.10 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). 
+ * + * Revision 1.9 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.8 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.7 1996/05/17 16:30:41 jimz + * convert to RF_FREELIST stuff + * + * Revision 1.6 1995/12/01 15:16:04 root + * added copyright info + * + */ + +#ifndef _KERNEL +#ifdef __NetBSD__ +#include <unistd.h> +#endif /* __NetBSD__ */ +#endif + +#include "rf_types.h" +#include "rf_threadstuff.h" +#include "rf_callback.h" +#include "rf_debugMem.h" +#include "rf_freelist.h" +#include "rf_shutdown.h" + +static RF_FreeList_t *rf_callback_freelist; + +#define RF_MAX_FREE_CALLBACK 64 +#define RF_CALLBACK_INC 4 +#define RF_CALLBACK_INITIAL 4 + +static void rf_ShutdownCallback(void *); +static void rf_ShutdownCallback(ignored) + void *ignored; +{ + RF_FREELIST_DESTROY(rf_callback_freelist,next,(RF_CallbackDesc_t *)); +} + +int rf_ConfigureCallback(listp) + RF_ShutdownList_t **listp; +{ + int rc; + + RF_FREELIST_CREATE(rf_callback_freelist, RF_MAX_FREE_CALLBACK, + RF_CALLBACK_INC, sizeof(RF_CallbackDesc_t)); + if (rf_callback_freelist == NULL) + return(ENOMEM); + rc = rf_ShutdownCreate(listp, rf_ShutdownCallback, NULL); + if (rc) { + RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n", __FILE__, + __LINE__, rc); + rf_ShutdownCallback(NULL); + return(rc); + } + RF_FREELIST_PRIME(rf_callback_freelist, RF_CALLBACK_INITIAL,next, + (RF_CallbackDesc_t *)); + return(0); +} + +RF_CallbackDesc_t *rf_AllocCallbackDesc() +{ + RF_CallbackDesc_t *p; + + RF_FREELIST_GET(rf_callback_freelist,p,next,(RF_CallbackDesc_t *)); + return(p); +} + +void rf_FreeCallbackDesc(p) + RF_CallbackDesc_t *p; +{ + RF_FREELIST_FREE(rf_callback_freelist,p,next); +} diff --git a/sys/dev/raidframe/rf_callback.h b/sys/dev/raidframe/rf_callback.h new file mode 100644 index 00000000000..cb3db8ebbbd --- /dev/null +++ b/sys/dev/raidframe/rf_callback.h @@ -0,0 +1,92 @@ +/* $OpenBSD: rf_callback.h,v 1.1 1999/01/11 14:29:00 niklas Exp $ */ +/* $NetBSD: rf_callback.h,v 1.1 1998/11/13 04:20:26 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/***************************************************************************************** + * + * callback.h -- header file for callback.c + * + * the reconstruction code must manage concurrent I/Os on multiple drives. 
+ * it sometimes needs to suspend operation on a particular drive until some + * condition occurs. we can't block the thread, of course, or we wouldn't + * be able to manage our other outstanding I/Os. Instead we just suspend + * new activity on the indicated disk, and create a callback descriptor and + * put it someplace where it will get invoked when the condition that's + * stalling us has cleared. When the descriptor is invoked, it will call + * a function that will restart operation on the indicated disk. + * + ****************************************************************************************/ + +/* : + * Log: rf_callback.h,v + * Revision 1.8 1996/08/01 15:57:28 jimz + * minor cleanup + * + * Revision 1.7 1996/07/27 23:36:08 jimz + * Solaris port of simulator + * + * Revision 1.6 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). + * + * Revision 1.5 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.4 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.3 1996/05/17 16:30:46 jimz + * add prototypes + * + * Revision 1.2 1995/12/01 15:15:55 root + * added copyright info + * + */ + +#ifndef _RF__RF_CALLBACK_H_ +#define _RF__RF_CALLBACK_H_ + +#include "rf_types.h" + +struct RF_CallbackDesc_s { + void (*callbackFunc)(RF_CBParam_t); /* function to call */ + RF_CBParam_t callbackArg; /* args to give to function, or just info about this callback */ + RF_CBParam_t callbackArg2; + RF_RowCol_t row; /* disk row and column IDs to give to the callback func */ + RF_RowCol_t col; + RF_CallbackDesc_t *next; /* next entry in list */ +}; + +int rf_ConfigureCallback(RF_ShutdownList_t **listp); +RF_CallbackDesc_t *rf_AllocCallbackDesc(void); +void rf_FreeCallbackDesc(RF_CallbackDesc_t *p); + +#endif /* !_RF__RF_CALLBACK_H_ */ diff --git a/sys/dev/raidframe/rf_ccmn.h b/sys/dev/raidframe/rf_ccmn.h new file mode 100644 index 00000000000..f13778c0cd4 --- /dev/null +++ b/sys/dev/raidframe/rf_ccmn.h @@ -0,0 +1,115 @@ +/* $OpenBSD: rf_ccmn.h,v 1.1 1999/01/11 14:29:01 niklas Exp $ */ +/* $NetBSD: rf_ccmn.h,v 1.1 1998/11/13 04:20:26 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. 
+ */ + +/* rf_ccmn.h + * header file that declares the ccmn routines, and includes + * the files needed to use them. + */ + +/* : + * Log: rf_ccmn.h,v + * Revision 1.4 1996/07/18 22:57:14 jimz + * port simulator to AIX + * + * Revision 1.3 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.2 1995/12/01 15:16:45 root + * added copyright info + * + */ + +#ifndef _RF__RF_CCMN_H_ +#define _RF__RF_CCMN_H_ + +#ifdef __osf__ +#include <sys/errno.h> +#include <sys/types.h> +#include <sys/file.h> +#include <sys/param.h> +#include <sys/uio.h> +#include <sys/time.h> +#include <sys/buf.h> +#include <sys/ioctl.h> +#include <io/common/iotypes.h> +#include <io/cam/cam_debug.h> +#include <io/cam/cam.h> +#include <io/cam/dec_cam.h> +#include <io/cam/uagt.h> +#include <io/cam/scsi_all.h> +#include <io/cam/scsi_direct.h> + +#ifdef KERNEL +#include <sys/conf.h> +#include <sys/mtio.h> +#include <io/common/devio.h> +#include <io/common/devdriver.h> +#include <io/cam/scsi_status.h> +#include <io/cam/pdrv.h> +#include <io/common/pt.h> +#include <sys/disklabel.h> +#include <io/cam/cam_disk.h> +#include <io/cam/ccfg.h> + +extern void ccmn_init(); +extern long ccmn_open_unit(); +extern void ccmn_close_unit(); +extern u_long ccmn_send_ccb(); +extern void ccmn_rem_ccb(); +extern void ccmn_abort_que(); +extern void ccmn_term_que(); +extern CCB_HEADER *ccmn_get_ccb(); +extern void ccmn_rel_ccb(); +extern CCB_SCSIIO *ccmn_io_ccb_bld(); +extern CCB_GETDEV *ccmn_gdev_ccb_bld(); +extern CCB_SETDEV *ccmn_sdev_ccb_bld(); +extern CCB_SETASYNC *ccmn_sasy_ccb_bld(); +extern CCB_RELSIM *ccmn_rsq_ccb_bld(); +extern CCB_PATHINQ *ccmn_pinq_ccb_bld(); +extern CCB_ABORT *ccmn_abort_ccb_bld(); +extern CCB_TERMIO *ccmn_term_ccb_bld(); +extern CCB_RESETDEV *ccmn_bdr_ccb_bld(); +extern CCB_RESETBUS *ccmn_br_ccb_bld(); +extern CCB_SCSIIO *ccmn_tur(); +extern CCB_SCSIIO *ccmn_mode_select(); +extern u_long ccmn_ccb_status(); +extern struct buf *ccmn_get_bp(); +extern void ccmn_rel_bp(); +extern u_char *ccmn_get_dbuf(); +extern void ccmn_rel_dbuf(); + +extern struct device *camdinfo[]; +extern struct controller *camminfo[]; +extern PDRV_UNIT_ELEM pdrv_unit_table[]; + +#endif /* KERNEL */ +#endif /* __osf__ */ + +#endif /* !_RF__RF_CCMN_H_ */ diff --git a/sys/dev/raidframe/rf_chaindecluster.c b/sys/dev/raidframe/rf_chaindecluster.c new file mode 100644 index 00000000000..bbb7caa92ec --- /dev/null +++ b/sys/dev/raidframe/rf_chaindecluster.c @@ -0,0 +1,382 @@ +/* $OpenBSD: rf_chaindecluster.c,v 1.1 1999/01/11 14:29:01 niklas Exp $ */ +/* $NetBSD: rf_chaindecluster.c,v 1.1 1998/11/13 04:20:26 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Khalil Amiri + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 
+ * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/****************************************************************************** + * + * rf_chaindecluster.c -- implements chained declustering + * + *****************************************************************************/ + +/* : + * Log: rf_chaindecluster.c,v + * Revision 1.33 1996/08/02 13:20:34 jimz + * get rid of bogus (long) casts + * + * Revision 1.32 1996/07/31 16:56:18 jimz + * dataBytesPerStripe, sectorsPerDisk init arch-indep. + * + * Revision 1.31 1996/07/29 14:05:12 jimz + * fix numPUs/numRUs confusion (everything is now numRUs) + * clean up some commenting, return values + * + * Revision 1.30 1996/07/22 19:52:16 jimz + * switched node params to RF_DagParam_t, a union of + * a 64-bit int and a void *, for better portability + * attempted hpux port, but failed partway through for + * lack of a single C compiler capable of compiling all + * source files + * + * Revision 1.29 1996/07/18 22:57:14 jimz + * port simulator to AIX + * + * Revision 1.28 1996/06/19 17:53:48 jimz + * move GetNumSparePUs, InstallSpareTable ops into layout switch + * + * Revision 1.27 1996/06/11 15:19:57 wvcii + * added include of rf_chaindecluster.h + * fixed parameter list of rf_ConfigureChainDecluster + * + * Revision 1.26 1996/06/11 08:55:15 jimz + * improved error-checking at configuration time + * + * Revision 1.25 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). + * + * Revision 1.24 1996/06/07 22:26:27 jimz + * type-ify which_ru (RF_ReconUnitNum_t) + * + * Revision 1.23 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.22 1996/06/06 17:31:30 jimz + * use CreateMirrorPartitionReadDAG for mirrored reads + * + * Revision 1.21 1996/06/03 23:28:26 jimz + * more bugfixes + * check in tree to sync for IPDS runs with current bugfixes + * there still may be a problem with threads in the script test + * getting I/Os stuck- not trivially reproducible (runs ~50 times + * in a row without getting stuck) + * + * Revision 1.20 1996/06/02 17:31:48 jimz + * Moved a lot of global stuff into array structure, where it belongs. + * Fixed up paritylogging, pss modules in this manner. Some general + * code cleanup. Removed lots of dead code, some dead files. + * + * Revision 1.19 1996/05/31 22:26:54 jimz + * fix a lot of mapping problems, memory allocation problems + * found some weird lock issues, fixed 'em + * more code cleanup + * + * Revision 1.18 1996/05/31 16:13:28 amiri + * removed/added some commnets. + * + * Revision 1.17 1996/05/31 05:01:52 amiri + * fixed a bug related to sparing layout. 
+ * + * Revision 1.16 1996/05/30 23:22:16 jimz + * bugfixes of serialization, timing problems + * more cleanup + * + * Revision 1.15 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.14 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.13 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.12 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.11 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.10 1996/05/03 19:53:56 wvcii + * removed include of rf_redstripe.h + * moved dag creation routines to new dag library + * + */ + +#include "rf_archs.h" +#include "rf_types.h" +#include "rf_raid.h" +#include "rf_chaindecluster.h" +#include "rf_dag.h" +#include "rf_dagutils.h" +#include "rf_dagffrd.h" +#include "rf_dagffwr.h" +#include "rf_dagdegrd.h" +#include "rf_dagfuncs.h" +#include "rf_threadid.h" +#include "rf_general.h" +#include "rf_utils.h" + +typedef struct RF_ChaindeclusterConfigInfo_s { + RF_RowCol_t **stripeIdentifier; /* filled in at config time + * and used by IdentifyStripe */ + RF_StripeCount_t numSparingRegions; + RF_StripeCount_t stripeUnitsPerSparingRegion; + RF_SectorNum_t mirrorStripeOffset; +} RF_ChaindeclusterConfigInfo_t; + +int rf_ConfigureChainDecluster( + RF_ShutdownList_t **listp, + RF_Raid_t *raidPtr, + RF_Config_t *cfgPtr) +{ + RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; + RF_StripeCount_t num_used_stripeUnitsPerDisk; + RF_ChaindeclusterConfigInfo_t *info; + RF_RowCol_t i; + + /* create a Chained Declustering configuration structure */ + RF_MallocAndAdd(info, sizeof(RF_ChaindeclusterConfigInfo_t), (RF_ChaindeclusterConfigInfo_t *), raidPtr->cleanupList); + if (info == NULL) + return(ENOMEM); + layoutPtr->layoutSpecificInfo = (void *) info; + + /* fill in the config structure. 
*/ + info->stripeIdentifier = rf_make_2d_array(raidPtr->numCol, 2 , raidPtr->cleanupList); + if (info->stripeIdentifier == NULL) + return(ENOMEM); + for (i=0; i< raidPtr->numCol; i++) { + info->stripeIdentifier[i][0] = i % raidPtr->numCol; + info->stripeIdentifier[i][1] = (i+1) % raidPtr->numCol; + } + + RF_ASSERT(raidPtr->numRow == 1); + + /* fill in the remaining layout parameters */ + num_used_stripeUnitsPerDisk = layoutPtr->stripeUnitsPerDisk - (layoutPtr->stripeUnitsPerDisk % + (2*raidPtr->numCol-2) ); + info->numSparingRegions = num_used_stripeUnitsPerDisk / (2*raidPtr->numCol-2); + info->stripeUnitsPerSparingRegion = raidPtr->numCol * (raidPtr->numCol - 1); + info->mirrorStripeOffset = info->numSparingRegions * (raidPtr->numCol-1); + layoutPtr->numStripe = info->numSparingRegions * info->stripeUnitsPerSparingRegion; + layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit << raidPtr->logBytesPerSector; + layoutPtr->numDataCol = 1; + layoutPtr->dataSectorsPerStripe = layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit; + layoutPtr->numParityCol = 1; + + layoutPtr->dataStripeUnitsPerDisk = num_used_stripeUnitsPerDisk; + + raidPtr->sectorsPerDisk = + num_used_stripeUnitsPerDisk * layoutPtr->sectorsPerStripeUnit; + + raidPtr->totalSectors = + (layoutPtr->numStripe) * layoutPtr->sectorsPerStripeUnit; + + layoutPtr->stripeUnitsPerDisk = raidPtr->sectorsPerDisk / layoutPtr->sectorsPerStripeUnit; + + return(0); +} + +RF_ReconUnitCount_t rf_GetNumSpareRUsChainDecluster(raidPtr) + RF_Raid_t *raidPtr; +{ + RF_ChaindeclusterConfigInfo_t *info = (RF_ChaindeclusterConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo; + + /* + * The layout uses two stripe units per disk as spare within each + * sparing region. + */ + return (2*info->numSparingRegions); +} + + +/* Maps to the primary copy of the data, i.e. 
the first mirror pair */ +void rf_MapSectorChainDecluster( + RF_Raid_t *raidPtr, + RF_RaidAddr_t raidSector, + RF_RowCol_t *row, + RF_RowCol_t *col, + RF_SectorNum_t *diskSector, + int remap) +{ + RF_ChaindeclusterConfigInfo_t *info = (RF_ChaindeclusterConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo; + RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit; + RF_SectorNum_t index_within_region, index_within_disk; + RF_StripeNum_t sparing_region_id; + int col_before_remap; + + *row = 0; + sparing_region_id = SUID / info->stripeUnitsPerSparingRegion; + index_within_region = SUID % info->stripeUnitsPerSparingRegion; + index_within_disk = index_within_region / raidPtr->numCol; + col_before_remap = SUID % raidPtr->numCol; + + if (!remap) { + *col = col_before_remap; + *diskSector = ( index_within_disk + ( (raidPtr->numCol-1) * sparing_region_id) ) * + raidPtr->Layout.sectorsPerStripeUnit; + *diskSector += (raidSector % raidPtr->Layout.sectorsPerStripeUnit); + } + else { + /* remap sector to spare space...*/ + *diskSector = sparing_region_id * (raidPtr->numCol+1) * raidPtr->Layout.sectorsPerStripeUnit; + *diskSector += (raidPtr->numCol-1) * raidPtr->Layout.sectorsPerStripeUnit; + *diskSector += (raidSector % raidPtr->Layout.sectorsPerStripeUnit); + index_within_disk = index_within_region / raidPtr->numCol; + if (index_within_disk < col_before_remap ) + *col = index_within_disk; + else if (index_within_disk == raidPtr->numCol-2 ) { + *col = (col_before_remap+raidPtr->numCol-1) % raidPtr->numCol; + *diskSector += raidPtr->Layout.sectorsPerStripeUnit; + } + else + *col = (index_within_disk + 2) % raidPtr->numCol; + } + +} + + + +/* Maps to the second copy of the mirror pair, which is chain declustered. The second copy is contained + in the next disk (mod numCol) after the disk containing the primary copy. + The offset into the disk is one-half disk down */ +void rf_MapParityChainDecluster( + RF_Raid_t *raidPtr, + RF_RaidAddr_t raidSector, + RF_RowCol_t *row, + RF_RowCol_t *col, + RF_SectorNum_t *diskSector, + int remap) +{ + RF_ChaindeclusterConfigInfo_t *info = (RF_ChaindeclusterConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo; + RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit; + RF_SectorNum_t index_within_region, index_within_disk; + RF_StripeNum_t sparing_region_id; + int col_before_remap; + + *row = 0; + if (!remap) { + *col = SUID % raidPtr->numCol; + *col = (*col + 1) % raidPtr->numCol; + *diskSector = info->mirrorStripeOffset * raidPtr->Layout.sectorsPerStripeUnit; + *diskSector += ( SUID / raidPtr->numCol ) * raidPtr->Layout.sectorsPerStripeUnit; + *diskSector += (raidSector % raidPtr->Layout.sectorsPerStripeUnit); + } + else { + /* remap parity to spare space ... 
*/ + sparing_region_id = SUID / info->stripeUnitsPerSparingRegion; + index_within_region = SUID % info->stripeUnitsPerSparingRegion; + index_within_disk = index_within_region / raidPtr->numCol; + *diskSector = sparing_region_id * (raidPtr->numCol+1) * raidPtr->Layout.sectorsPerStripeUnit; + *diskSector += (raidPtr->numCol) * raidPtr->Layout.sectorsPerStripeUnit; + *diskSector += (raidSector % raidPtr->Layout.sectorsPerStripeUnit); + col_before_remap = SUID % raidPtr->numCol; + if (index_within_disk < col_before_remap) + *col = index_within_disk; + else if (index_within_disk == raidPtr->numCol-2 ) { + *col = (col_before_remap+2) % raidPtr->numCol; + *diskSector -= raidPtr->Layout.sectorsPerStripeUnit; + } + else + *col = (index_within_disk + 2) % raidPtr->numCol; + } + +} + +void rf_IdentifyStripeChainDecluster( + RF_Raid_t *raidPtr, + RF_RaidAddr_t addr, + RF_RowCol_t **diskids, + RF_RowCol_t *outRow) +{ + RF_ChaindeclusterConfigInfo_t *info = (RF_ChaindeclusterConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo; + RF_StripeNum_t SUID; + RF_RowCol_t col; + + SUID = addr / raidPtr->Layout.sectorsPerStripeUnit; + col = SUID % raidPtr->numCol; + *outRow = 0; + *diskids = info->stripeIdentifier[ col ]; +} + +void rf_MapSIDToPSIDChainDecluster( + RF_RaidLayout_t *layoutPtr, + RF_StripeNum_t stripeID, + RF_StripeNum_t *psID, + RF_ReconUnitNum_t *which_ru) +{ + *which_ru = 0; + *psID = stripeID; +} + +/****************************************************************************** + * select a graph to perform a single-stripe access + * + * Parameters: raidPtr - description of the physical array + * type - type of operation (read or write) requested + * asmap - logical & physical addresses for this access + * createFunc - function to use to create the graph (return value) + *****************************************************************************/ + +void rf_RAIDCDagSelect( + RF_Raid_t *raidPtr, + RF_IoType_t type, + RF_AccessStripeMap_t *asmap, + RF_VoidFuncPtr *createFunc) +#if 0 + void (**createFunc)(RF_Raid_t *, RF_AccessStripeMap_t *, + RF_DagHeader_t *, void *, RF_RaidAccessFlags_t, + RF_AllocListElem_t *)) +#endif +{ + RF_ASSERT(RF_IO_IS_R_OR_W(type)); + RF_ASSERT(raidPtr->numRow == 1); + + if (asmap->numDataFailed + asmap->numParityFailed > 1) { + RF_ERRORMSG("Multiple disks failed in a single group! Aborting I/O operation.\n"); + *createFunc = NULL; + return; + } + + *createFunc = (type == RF_IO_TYPE_READ) ? (RF_VoidFuncPtr)rf_CreateFaultFreeReadDAG :(RF_VoidFuncPtr) rf_CreateRaidOneWriteDAG; + + if (type == RF_IO_TYPE_READ) { + if ( ( raidPtr->status[0] == rf_rs_degraded ) || ( raidPtr->status[0] == rf_rs_reconstructing) ) + *createFunc = (RF_VoidFuncPtr)rf_CreateRaidCDegradedReadDAG; /* array status is degraded, implement workload shifting */ + else + *createFunc = (RF_VoidFuncPtr)rf_CreateMirrorPartitionReadDAG; /* array status not degraded, so use mirror partition dag */ + } + else + *createFunc = (RF_VoidFuncPtr)rf_CreateRaidOneWriteDAG; +} diff --git a/sys/dev/raidframe/rf_chaindecluster.h b/sys/dev/raidframe/rf_chaindecluster.h new file mode 100644 index 00000000000..52a94deac2f --- /dev/null +++ b/sys/dev/raidframe/rf_chaindecluster.h @@ -0,0 +1,123 @@ +/* $OpenBSD: rf_chaindecluster.h,v 1.1 1999/01/11 14:29:01 niklas Exp $ */ +/* $NetBSD: rf_chaindecluster.h,v 1.1 1998/11/13 04:20:26 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. 
+ * + * Author: Khalil Amiri + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* rf_chaindecluster.h + * header file for Chained Declustering + */ + +/* + * : + * Log: rf_chaindecluster.h,v + * Revision 1.14 1996/07/29 14:05:12 jimz + * fix numPUs/numRUs confusion (everything is now numRUs) + * clean up some commenting, return values + * + * Revision 1.13 1996/07/22 19:52:16 jimz + * switched node params to RF_DagParam_t, a union of + * a 64-bit int and a void *, for better portability + * attempted hpux port, but failed partway through for + * lack of a single C compiler capable of compiling all + * source files + * + * Revision 1.12 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). + * + * Revision 1.11 1996/06/07 22:26:27 jimz + * type-ify which_ru (RF_ReconUnitNum_t) + * + * Revision 1.10 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.9 1996/06/03 23:28:26 jimz + * more bugfixes + * check in tree to sync for IPDS runs with current bugfixes + * there still may be a problem with threads in the script test + * getting I/Os stuck- not trivially reproducible (runs ~50 times + * in a row without getting stuck) + * + * Revision 1.8 1996/05/31 22:26:54 jimz + * fix a lot of mapping problems, memory allocation problems + * found some weird lock issues, fixed 'em + * more code cleanup + * + * Revision 1.7 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.6 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.5 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.4 1996/02/22 16:45:59 amiri + * added declaration of dag selection function + * + * Revision 1.3 1995/12/01 15:16:56 root + * added copyright info + * + * Revision 1.2 1995/11/17 19:55:21 amiri + * prototyped MapParityChainDecluster + */ + +#ifndef _RF__RF_CHAINDECLUSTER_H_ +#define _RF__RF_CHAINDECLUSTER_H_ + +int rf_ConfigureChainDecluster(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr, + RF_Config_t *cfgPtr); +RF_ReconUnitCount_t rf_GetNumSpareRUsChainDecluster(RF_Raid_t *raidPtr); +void rf_MapSectorChainDecluster(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector, + RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap); 
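The two mapping routines declared here, and defined in rf_chaindecluster.c above, place the primary copy of stripe unit SUID on disk SUID % numCol and the mirror copy on the next disk (mod numCol), offset further down the disk. The stand-alone sketch below restates only the fault-free case: spare-space remapping is not modeled and the sparing-region bookkeeping is folded into a single mirrorStripeOffset. The struct and function names in the sketch are illustrative assumptions, not part of the RAIDframe API.

#include <stdio.h>

struct cd_layout {
	int	numCol;			/* disks in the (single-row) array */
	long	sectorsPerStripeUnit;
	long	mirrorStripeOffset;	/* first stripe unit of the mirror region */
};

/*
 * Fault-free chained-declustering mapping: the data copy of stripe unit
 * SUID goes to disk (SUID % numCol); the mirror copy goes to the next
 * disk (mod numCol), mirrorStripeOffset stripe units further down.
 */
static void
cd_map(const struct cd_layout *l, long raidSector, int mirror,
    int *col, long *diskSector)
{
	long suid = raidSector / l->sectorsPerStripeUnit;
	long offset = raidSector % l->sectorsPerStripeUnit;
	long suPerCol = suid / l->numCol;

	if (!mirror) {
		*col = (int)(suid % l->numCol);
		*diskSector = suPerCol * l->sectorsPerStripeUnit + offset;
	} else {
		*col = (int)((suid % l->numCol + 1) % l->numCol);
		*diskSector = (l->mirrorStripeOffset + suPerCol) *
		    l->sectorsPerStripeUnit + offset;
	}
}

int
main(void)
{
	struct cd_layout l = { 5, 64, 1000 };
	long sec;
	int col;

	cd_map(&l, 12345, 0, &col, &sec);
	printf("data   copy: col %d, sector %ld\n", col, sec);
	cd_map(&l, 12345, 1, &col, &sec);
	printf("mirror copy: col %d, sector %ld\n", col, sec);
	return (0);
}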
+void rf_MapParityChainDecluster(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector, + RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap); +void rf_IdentifyStripeChainDecluster(RF_Raid_t *raidPtr, RF_RaidAddr_t addr, + RF_RowCol_t **diskids, RF_RowCol_t *outRow); +void rf_MapSIDToPSIDChainDecluster(RF_RaidLayout_t *layoutPtr, + RF_StripeNum_t stripeID, RF_StripeNum_t *psID, + RF_ReconUnitNum_t *which_ru); +void rf_RAIDCDagSelect(RF_Raid_t *raidPtr, RF_IoType_t type, + RF_AccessStripeMap_t *asmap, + RF_VoidFuncPtr *); +#if 0 + void (**createFunc)(RF_Raid_t *, + RF_AccessStripeMap_t *, + RF_DagHeader_t *, + void *, + RF_RaidAccessFlags_t, + RF_AllocListElem_t *) +); +#endif + +#endif /* !_RF__RF_CHAINDECLUSTER_H_ */ diff --git a/sys/dev/raidframe/rf_configure.h b/sys/dev/raidframe/rf_configure.h new file mode 100644 index 00000000000..aee456c52a2 --- /dev/null +++ b/sys/dev/raidframe/rf_configure.h @@ -0,0 +1,127 @@ +/* $OpenBSD: rf_configure.h,v 1.1 1999/01/11 14:29:02 niklas Exp $ */ +/* $NetBSD: rf_configure.h,v 1.1 1998/11/13 04:20:26 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/******************************** + * + * rf_configure.h + * + * header file for raidframe configuration in the kernel version only. + * configuration is invoked via ioctl rather than at boot time + * + *******************************/ + +/* : + * Log: rf_configure.h,v + * Revision 1.16 1996/06/19 14:57:53 jimz + * move layout-specific config parsing hooks into RF_LayoutSW_t + * table in rf_layout.c + * + * Revision 1.15 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.14 1996/05/31 22:26:54 jimz + * fix a lot of mapping problems, memory allocation problems + * found some weird lock issues, fixed 'em + * more code cleanup + * + * Revision 1.13 1996/05/30 11:29:41 jimz + * Numerous bug fixes. Stripe lock release code disagreed with the taking code + * about when stripes should be locked (I made it consistent: no parity, no lock) + * There was a lot of extra serialization of I/Os which I've removed- a lot of + * it was to calculate values for the cache code, which is no longer with us. + * More types, function, macro cleanup. Added code to properly quiesce the array + * on shutdown. Made a lot of stuff array-specific which was (bogusly) general + * before. Fixed memory allocation, freeing bugs. 
+ * + * Revision 1.12 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.11 1996/05/24 01:59:45 jimz + * another checkpoint in code cleanup for release + * time to sync kernel tree + * + * Revision 1.10 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.9 1996/05/18 20:09:51 jimz + * bit of cleanup to compile cleanly in kernel, once again + * + * Revision 1.8 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.7 1995/12/01 15:16:26 root + * added copyright info + * + */ + +#ifndef _RF__RF_CONFIGURE_H_ +#define _RF__RF_CONFIGURE_H_ + +#include "rf_archs.h" +#include "rf_types.h" + +#include <sys/param.h> +#include <sys/proc.h> + +#include <sys/ioctl.h> + +/* the raidframe configuration, passed down through an ioctl. + * the driver can be reconfigured (with total loss of data) at any time, + * but it must be shut down first. + */ +struct RF_Config_s { + RF_RowCol_t numRow, numCol, numSpare; /* number of rows, columns, and spare disks */ + dev_t devs[RF_MAXROW][RF_MAXCOL]; /* device numbers for disks comprising array */ + char devnames[RF_MAXROW][RF_MAXCOL][50]; /* device names */ + dev_t spare_devs[RF_MAXSPARE]; /* device numbers for spare disks */ + char spare_names[RF_MAXSPARE][50]; /* device names */ + RF_SectorNum_t sectPerSU; /* sectors per stripe unit */ + RF_StripeNum_t SUsPerPU; /* stripe units per parity unit */ + RF_StripeNum_t SUsPerRU; /* stripe units per reconstruction unit */ + RF_ParityConfig_t parityConfig; /* identifies the RAID architecture to be used */ + RF_DiskQueueType_t diskQueueType; /* 'f' = fifo, 'c' = cvscan, not used in kernel */ + char maxOutstandingDiskReqs; /* # concurrent reqs to be sent to a disk. not used in kernel. */ + char debugVars[RF_MAXDBGV][50]; /* space for specifying debug variables & their values */ + unsigned int layoutSpecificSize; /* size in bytes of layout-specific info */ + void *layoutSpecific; /* a pointer to a layout-specific structure to be copied in */ +}; + +#ifndef KERNEL +int rf_MakeConfig(char *configname, RF_Config_t *cfgPtr); +int rf_MakeLayoutSpecificNULL(FILE *fp, RF_Config_t *cfgPtr, void *arg); +int rf_MakeLayoutSpecificDeclustered(FILE *configfp, RF_Config_t *cfgPtr, void *arg); +void *rf_ReadSpareTable(RF_SparetWait_t *req, char *fname); +#endif /* !KERNEL */ + +#endif /* !_RF__RF_CONFIGURE_H_ */ diff --git a/sys/dev/raidframe/rf_copyback.c b/sys/dev/raidframe/rf_copyback.c new file mode 100644 index 00000000000..b2fe641fded --- /dev/null +++ b/sys/dev/raidframe/rf_copyback.c @@ -0,0 +1,577 @@ +/* $OpenBSD: rf_copyback.c,v 1.1 1999/01/11 14:29:02 niklas Exp $ */ +/* $NetBSD: rf_copyback.c,v 1.1 1998/11/13 04:20:27 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. 
CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/***************************************************************************************** + * + * copyback.c -- code to copy reconstructed data back from spare space to + * the replaced disk. + * + * the code operates using callbacks on the I/Os to continue with the next + * unit to be copied back. We do this because a simple loop containing blocking I/Os + * will not work in the simulator. + * + ****************************************************************************************/ + +/* + * : + * Log: rf_copyback.c,v + * Revision 1.26 1996/08/06 22:26:00 jimz + * don't include sys/buf.h on linux + * + * Revision 1.25 1996/07/30 03:30:40 jimz + * include rf_types.h first + * + * Revision 1.24 1996/07/27 18:39:52 jimz + * cleanup sweep + * + * Revision 1.23 1996/07/18 22:57:14 jimz + * port simulator to AIX + * + * Revision 1.22 1996/07/11 19:08:00 jimz + * generalize reconstruction mechanism + * allow raid1 reconstructs via copyback (done with array + * quiesced, not online, therefore not disk-directed) + * + * Revision 1.21 1996/07/11 16:03:47 jimz + * fixed hanging bug in rf_CopybackWriteDoneProc() + * + * Revision 1.20 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). + * + * Revision 1.19 1996/06/09 02:36:46 jimz + * lots of little crufty cleanup- fixup whitespace + * issues, comment #ifdefs, improve typing in some + * places (esp size-related) + * + * Revision 1.18 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.17 1996/06/05 18:06:02 jimz + * Major code cleanup. The Great Renaming is now done. + * Better modularity. Better typing. Fixed a bunch of + * synchronization bugs. Made a lot of global stuff + * per-desc or per-array. Removed dead code. + * + * Revision 1.16 1996/06/03 23:28:26 jimz + * more bugfixes + * check in tree to sync for IPDS runs with current bugfixes + * there still may be a problem with threads in the script test + * getting I/Os stuck- not trivially reproducible (runs ~50 times + * in a row without getting stuck) + * + * Revision 1.15 1996/06/02 17:31:48 jimz + * Moved a lot of global stuff into array structure, where it belongs. + * Fixed up paritylogging, pss modules in this manner. Some general + * code cleanup. Removed lots of dead code, some dead files. + * + * Revision 1.14 1996/05/31 22:26:54 jimz + * fix a lot of mapping problems, memory allocation problems + * found some weird lock issues, fixed 'em + * more code cleanup + * + * Revision 1.13 1996/05/30 11:29:41 jimz + * Numerous bug fixes. Stripe lock release code disagreed with the taking code + * about when stripes should be locked (I made it consistent: no parity, no lock) + * There was a lot of extra serialization of I/Os which I've removed- a lot of + * it was to calculate values for the cache code, which is no longer with us. 
+ * More types, function, macro cleanup. Added code to properly quiesce the array + * on shutdown. Made a lot of stuff array-specific which was (bogusly) general + * before. Fixed memory allocation, freeing bugs. + * + * Revision 1.12 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.11 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.10 1996/05/24 01:59:45 jimz + * another checkpoint in code cleanup for release + * time to sync kernel tree + * + * Revision 1.9 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.8 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.7 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.6 1995/12/12 18:10:06 jimz + * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT + * fix 80-column brain damage in comments + * + * Revision 1.5 1995/12/01 15:15:31 root + * added copyright info + * + * Revision 1.4 1995/06/23 13:41:36 robby + * updeated to prototypes in rf_layout.h + * + */ + +#include "rf_types.h" +#include <sys/time.h> +#ifndef LINUX +#include <sys/buf.h> +#endif /* !LINUX */ +#include "rf_raid.h" +#include "rf_threadid.h" +#include "rf_mcpair.h" +#include "rf_acctrace.h" +#include "rf_etimer.h" +#include "rf_general.h" +#include "rf_utils.h" +#include "rf_copyback.h" +#if !defined(__NetBSD__) && !defined(__OpenBSD__) +#include "rf_camlayer.h" +#endif +#include "rf_decluster.h" +#include "rf_driver.h" +#include "rf_shutdown.h" +#include "rf_sys.h" + +#define RF_COPYBACK_DATA 0 +#define RF_COPYBACK_PARITY 1 + +int rf_copyback_in_progress; + +static int rf_CopybackReadDoneProc(RF_CopybackDesc_t *desc, int status); +static int rf_CopybackWriteDoneProc(RF_CopybackDesc_t *desc, int status); +static void rf_CopybackOne(RF_CopybackDesc_t *desc, int typ, + RF_RaidAddr_t addr, RF_RowCol_t testRow, RF_RowCol_t testCol, + RF_SectorNum_t testOffs); +static void rf_CopybackComplete(RF_CopybackDesc_t *desc, int status); + +int rf_ConfigureCopyback(listp) + RF_ShutdownList_t **listp; +{ + rf_copyback_in_progress = 0; + return(0); +} + +#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL) +#include <sys/types.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/ioctl.h> +#include <sys/fcntl.h> +#ifdef __NETBSD__ +#include <sys/vnode.h> +#endif + +int raidlookup __P((char *, struct proc *, struct vnode **)); +#endif + +/* do a complete copyback */ +void rf_CopybackReconstructedData(raidPtr) + RF_Raid_t *raidPtr; +{ +#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL) + int done,retcode; + RF_CopybackDesc_t *desc; + RF_RowCol_t frow, fcol; + RF_RaidDisk_t *badDisk; + char *databuf; + + struct partinfo dpart; + struct vnode *vp; + struct vattr va; + struct proc *proc; + +#else + int bus, targ, lun, done, retcode; + RF_CopybackDesc_t *desc; + RF_RowCol_t frow, fcol; + RF_RaidDisk_t *badDisk; + RF_DiskOp_t *tur_op; + char *databuf; +#endif + + done = 0; + fcol = 0; + for (frow=0; frow<raidPtr->numRow; frow++) { + for (fcol=0; fcol<raidPtr->numCol; fcol++) { + if (raidPtr->Disks[frow][fcol].status == rf_ds_dist_spared + || raidPtr->Disks[frow][fcol].status == 
rf_ds_spared) + { + done = 1; + break; + } + } + if (done) + break; + } + + if (frow == raidPtr->numRow) { + printf("COPYBACK: no disks need copyback\n"); + return; + } + + badDisk = &raidPtr->Disks[frow][fcol]; +#ifndef SIMULATE +#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL) + + proc = raidPtr->proc; /* XXX Yes, this is not nice.. */ + +#if 0 + printf("Pretending the disk is happy...\n"); + retcode = 0; /* XXX this should be set to something more realistic. */ +#endif + + /* This device may have been opened successfully the first time. + Close it before trying to open it again.. */ + + if (raidPtr->raid_cinfo[frow][fcol].ci_vp != NULL) { + printf("Closed the open device: %s\n", + raidPtr->Disks[frow][fcol].devname); + (void)vn_close(raidPtr->raid_cinfo[frow][fcol].ci_vp, + FREAD|FWRITE, proc->p_ucred, proc); + } + + printf("About to (re-)open the device: %s\n", + raidPtr->Disks[frow][fcol].devname); + + retcode = raidlookup(raidPtr->Disks[frow][fcol].devname, proc, &vp); + + if (retcode) { + printf("COPYBACK: raidlookup on device: %s failed: %d!\n", + raidPtr->Disks[frow][fcol].devname, retcode); + + /* XXX the component isn't responding properly... + must be still dead :-( */ + return; + + } else { + + /* Ok, so we can at least do a lookup... How about actually + getting a vp for it? */ + + if ((retcode = VOP_GETATTR(vp, &va, proc->p_ucred, proc)) != 0) { + return; + } + + retcode = VOP_IOCTL(vp, DIOCGPART, (caddr_t)&dpart, + FREAD, proc->p_ucred, proc); + if (retcode) { + return; + } + raidPtr->Disks[frow][fcol].blockSize = dpart.disklab->d_secsize; + + raidPtr->Disks[frow][fcol].numBlocks = dpart.part->p_size - + rf_protectedSectors; + + raidPtr->raid_cinfo[frow][fcol].ci_vp = vp; + raidPtr->raid_cinfo[frow][fcol].ci_dev = va.va_rdev; + + raidPtr->Disks[frow][fcol].dev = va.va_rdev; /* XXX or the above? 
*/ + + /* we allow the user to specify that only a fraction of the + * disks should be used this is just for debug: it speeds up + * the parity scan + */ + raidPtr->Disks[frow][fcol].numBlocks = + raidPtr->Disks[frow][fcol].numBlocks * + rf_sizePercentage / 100; + } +#else + if (rf_extract_ids(badDisk->devname, &bus, &targ, &lun)) { + printf("COPYBACK: unable to extract bus, target, lun from devname %s\n", + badDisk->devname); + return; + } + + /* TUR the disk that's marked as bad to be sure that it's actually alive */ + rf_SCSI_AllocTUR(&tur_op); + retcode = rf_SCSI_DoTUR(tur_op, bus, targ, lun, badDisk->dev); + rf_SCSI_FreeDiskOp(tur_op, 0); +#endif + + if (retcode) { + printf("COPYBACK: target disk failed TUR\n"); + return; + } +#endif /* !SIMULATE */ + + /* get a buffer to hold one SU */ + RF_Malloc(databuf, rf_RaidAddressToByte(raidPtr, raidPtr->Layout.sectorsPerStripeUnit), (char *)); + + /* create a descriptor */ + RF_Malloc(desc, sizeof(*desc), (RF_CopybackDesc_t *)); + desc->raidPtr = raidPtr; + desc->status = 0; + desc->frow = frow; + desc->fcol = fcol; + desc->spRow = badDisk->spareRow; + desc->spCol = badDisk->spareCol; + desc->stripeAddr = 0; + desc->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit; + desc->sectPerStripe = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.numDataCol; + desc->databuf = databuf; +#ifndef SIMULATE + desc->mcpair = rf_AllocMCPair(); +#endif /* !SIMULATE */ + + printf("COPYBACK: Quiescing the array\n"); + /* quiesce the array, since we don't want to code support for user accs here */ + rf_SuspendNewRequestsAndWait(raidPtr); + + /* adjust state of the array and of the disks */ + RF_LOCK_MUTEX(raidPtr->mutex); + raidPtr->Disks[desc->frow][desc->fcol].status = rf_ds_optimal; + raidPtr->status[desc->frow] = rf_rs_optimal; + rf_copyback_in_progress = 1; /* debug only */ + RF_UNLOCK_MUTEX(raidPtr->mutex); + + printf("COPYBACK: Beginning\n"); + RF_GETTIME(desc->starttime); + rf_ContinueCopyback(desc); +} + + +/* + * invoked via callback after a copyback I/O has completed to + * continue on with the next one + */ +void rf_ContinueCopyback(desc) + RF_CopybackDesc_t *desc; +{ + RF_SectorNum_t testOffs, stripeAddr; + RF_Raid_t *raidPtr = desc->raidPtr; + RF_RaidAddr_t addr; + RF_RowCol_t testRow, testCol; + int old_pctg, new_pctg, done; + struct timeval t, diff; + + old_pctg = (-1); + while (1) { + stripeAddr = desc->stripeAddr; + if (rf_prReconSched) { + old_pctg = 100 * desc->stripeAddr / raidPtr->totalSectors; + } + desc->stripeAddr += desc->sectPerStripe; + if (rf_prReconSched) { + new_pctg = 100 * desc->stripeAddr / raidPtr->totalSectors; + if (new_pctg != old_pctg) { + RF_GETTIME(t); + RF_TIMEVAL_DIFF(&desc->starttime, &t, &diff); + printf("%d %d.%06d\n",new_pctg, (int)diff.tv_sec, (int)diff.tv_usec); + } + } + + if (stripeAddr >= raidPtr->totalSectors) { + rf_CopybackComplete(desc, 0); + return; + } + + /* walk through the current stripe, su-by-su */ + for (done=0, addr = stripeAddr; addr < stripeAddr+desc->sectPerStripe; addr += desc->sectPerSU) { + + /* map the SU, disallowing remap to spare space */ + (raidPtr->Layout.map->MapSector)(raidPtr, addr, &testRow, &testCol, &testOffs, RF_DONT_REMAP); + + if (testRow == desc->frow && testCol == desc->fcol) { + rf_CopybackOne(desc, RF_COPYBACK_DATA, addr, testRow, testCol, testOffs); +#ifdef SIMULATE + return; +#else /* SIMULATE */ + done = 1; + break; +#endif /* SIMULATE */ + } + } + + if (!done) { + /* we didn't find the failed disk in the data part. check parity. 
*/ + + /* map the parity for this stripe, disallowing remap to spare space */ + (raidPtr->Layout.map->MapParity)(raidPtr, stripeAddr, &testRow, &testCol, &testOffs, RF_DONT_REMAP); + + if (testRow == desc->frow && testCol == desc->fcol) { + rf_CopybackOne(desc, RF_COPYBACK_PARITY, stripeAddr, testRow, testCol, testOffs); +#ifdef SIMULATE + return; +#endif /* SIMULATE */ + } + } + + /* check to see if the last read/write pair failed */ + if (desc->status) { + rf_CopybackComplete(desc, 1); + return; + } + + /* we didn't find any units to copy back in this stripe. Continue with the next one */ + } +} + + +/* copyback one unit */ +static void rf_CopybackOne(desc, typ, addr, testRow, testCol, testOffs) + RF_CopybackDesc_t *desc; + int typ; + RF_RaidAddr_t addr; + RF_RowCol_t testRow; + RF_RowCol_t testCol; + RF_SectorNum_t testOffs; +{ + RF_SectorCount_t sectPerSU = desc->sectPerSU; + RF_Raid_t *raidPtr = desc->raidPtr; + RF_RowCol_t spRow = desc->spRow; + RF_RowCol_t spCol = desc->spCol; + RF_SectorNum_t spOffs; + + /* find the spare spare location for this SU */ + if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) { + if (typ == RF_COPYBACK_DATA) + raidPtr->Layout.map->MapSector(raidPtr, addr, &spRow, &spCol, &spOffs, RF_REMAP); + else + raidPtr->Layout.map->MapParity(raidPtr, addr, &spRow, &spCol, &spOffs, RF_REMAP); + } else { + spOffs = testOffs; + } + + /* create reqs to read the old location & write the new */ + desc->readreq = rf_CreateDiskQueueData(RF_IO_TYPE_READ, spOffs, + sectPerSU, desc->databuf, 0L, 0, + (int (*)(void *,int)) rf_CopybackReadDoneProc, desc, + NULL, NULL, (void *) raidPtr, RF_DISKQUEUE_DATA_FLAGS_NONE, NULL); + desc->writereq = rf_CreateDiskQueueData(RF_IO_TYPE_WRITE, testOffs, + sectPerSU, desc->databuf, 0L, 0, + (int (*)(void *,int)) rf_CopybackWriteDoneProc, desc, + NULL, NULL, (void *) raidPtr, RF_DISKQUEUE_DATA_FLAGS_NONE, NULL); + desc->frow = testRow; + desc->fcol = testCol; + + /* enqueue the read. the write will go out as part of the callback on the read. + * at user-level & in the kernel, wait for the read-write pair to complete. + * in the simulator, just return, since everything will happen as callbacks + */ +#ifndef SIMULATE + RF_LOCK_MUTEX(desc->mcpair->mutex); + desc->mcpair->flag = 0; +#endif /* !SIMULATE */ + + rf_DiskIOEnqueue(&raidPtr->Queues[spRow][spCol], desc->readreq, RF_IO_NORMAL_PRIORITY); + +#ifndef SIMULATE + while (!desc->mcpair->flag) { + RF_WAIT_MCPAIR(desc->mcpair); + } + RF_UNLOCK_MUTEX(desc->mcpair->mutex); + rf_FreeDiskQueueData(desc->readreq); + rf_FreeDiskQueueData(desc->writereq); +#endif /* !SIMULATE */ +} + + +/* called at interrupt context when the read has completed. just send out the write */ +static int rf_CopybackReadDoneProc(desc, status) + RF_CopybackDesc_t *desc; + int status; +{ + if (status) { /* invoke the callback with bad status */ + printf("COPYBACK: copyback read failed. Aborting.\n"); + (desc->writereq->CompleteFunc)(desc, -100); + } + else { + rf_DiskIOEnqueue(&(desc->raidPtr->Queues[desc->frow][desc->fcol]), desc->writereq, RF_IO_NORMAL_PRIORITY); + } + return(0); +} + +/* called at interrupt context when the write has completed. + * at user level & in the kernel, wake up the copyback thread. + * in the simulator, invoke the next copyback directly. + * can't free diskqueuedata structs in the kernel b/c we're at interrupt context. + */ +static int rf_CopybackWriteDoneProc(desc, status) + RF_CopybackDesc_t *desc; + int status; +{ + if (status && status != -100) { + printf("COPYBACK: copyback write failed. 
Aborting.\n"); + } + +#ifdef SIMULATE + rf_FreeDiskQueueData(desc->readreq); + rf_FreeDiskQueueData(desc->writereq); + if (!status) + rf_ContinueCopyback(desc); + else + rf_CopybackComplete(desc, 1); +#else /* SIMULATE */ + desc->status = status; + rf_MCPairWakeupFunc(desc->mcpair); +#endif /* SIMULATE */ + return(0); +} + +/* invoked when the copyback has completed */ +static void rf_CopybackComplete(desc, status) + RF_CopybackDesc_t *desc; + int status; +{ + RF_Raid_t *raidPtr = desc->raidPtr; + struct timeval t, diff; + + if (!status) { + RF_LOCK_MUTEX(raidPtr->mutex); + if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) { + RF_ASSERT(raidPtr->Layout.map->parityConfig == 'D'); + rf_FreeSpareTable(raidPtr); + } else { + raidPtr->Disks[desc->spRow][desc->spCol].status = rf_ds_spare; + } + RF_UNLOCK_MUTEX(raidPtr->mutex); + + RF_GETTIME(t); + RF_TIMEVAL_DIFF(&desc->starttime, &t, &diff); + printf("Copyback time was %d.%06d seconds\n", + (int)diff.tv_sec, (int)diff.tv_usec); + } else printf("COPYBACK: Failure.\n"); + + RF_Free(desc->databuf, rf_RaidAddressToByte(raidPtr, desc->sectPerSU)); +#ifndef SIMULATE + rf_FreeMCPair(desc->mcpair); +#endif /* !SIMULATE */ + RF_Free(desc, sizeof(*desc)); + + rf_copyback_in_progress = 0; + rf_ResumeNewRequests(raidPtr); +} diff --git a/sys/dev/raidframe/rf_copyback.h b/sys/dev/raidframe/rf_copyback.h new file mode 100644 index 00000000000..59ef0630447 --- /dev/null +++ b/sys/dev/raidframe/rf_copyback.h @@ -0,0 +1,88 @@ +/* $OpenBSD: rf_copyback.h,v 1.1 1999/01/11 14:29:03 niklas Exp $ */ +/* $NetBSD: rf_copyback.h,v 1.1 1998/11/13 04:20:27 oster Exp $ */ +/* + * rf_copyback.h + */ +/* + * Copyright (c) 1996 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Jim Zelenka + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ +/* + * : + * Log: rf_copyback.h,v + * Revision 1.5 1996/07/11 19:08:00 jimz + * generalize reconstruction mechanism + * allow raid1 reconstructs via copyback (done with array + * quiesced, not online, therefore not disk-directed) + * + * Revision 1.4 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). 
+ * + * Revision 1.3 1996/05/24 01:59:45 jimz + * another checkpoint in code cleanup for release + * time to sync kernel tree + * + * Revision 1.2 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.1 1996/05/18 19:55:02 jimz + * Initial revision + * + */ + +#ifndef _RF__RF_COPYBACK_H_ +#define _RF__RF_COPYBACK_H_ + +#include "rf_types.h" + +typedef struct RF_CopybackDesc_s { + RF_Raid_t *raidPtr; + RF_RowCol_t frow; + RF_RowCol_t fcol; + RF_RowCol_t spRow; + RF_RowCol_t spCol; + int status; + RF_StripeNum_t stripeAddr; + RF_SectorCount_t sectPerSU; + RF_SectorCount_t sectPerStripe; + char *databuf; + RF_DiskQueueData_t *readreq; + RF_DiskQueueData_t *writereq; + struct timeval starttime; +#ifndef SIMULATE + RF_MCPair_t *mcpair; +#endif /* !SIMULATE */ +} RF_CopybackDesc_t; + +extern int rf_copyback_in_progress; + +int rf_ConfigureCopyback(RF_ShutdownList_t **listp); +void rf_CopybackReconstructedData(RF_Raid_t *raidPtr); +void rf_ContinueCopyback(RF_CopybackDesc_t *desc); + +#endif /* !_RF__RF_COPYBACK_H_ */ diff --git a/sys/dev/raidframe/rf_cpuutil.c b/sys/dev/raidframe/rf_cpuutil.c new file mode 100644 index 00000000000..1816740bfc3 --- /dev/null +++ b/sys/dev/raidframe/rf_cpuutil.c @@ -0,0 +1,195 @@ +/* $OpenBSD: rf_cpuutil.c,v 1.1 1999/01/11 14:29:03 niklas Exp $ */ +/* $NetBSD: rf_cpuutil.c,v 1.1 1998/11/13 04:20:27 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Authors: Mark Holland, Jim Zelenka + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. 
+ */ +/* + * rf_cpuutil.c + * + * track cpu utilization + */ + +#ifdef _KERNEL +#define KERNEL +#endif + +#include "rf_cpuutil.h" + +#ifndef KERNEL +#include <errno.h> +#endif /* !KERNEL */ +#include "rf_types.h" +#include "rf_general.h" +#include "rf_shutdown.h" +#include "rf_sys.h" +#ifdef __osf__ +#include <sys/table.h> +#endif /* __osf__ */ +#ifdef AIX +#include <nlist.h> +#include <sys/sysinfo.h> +#endif /* AIX */ +#ifdef KERNEL +#if !defined(__NetBSD__) && !defined(__OpenBSD__) +#include <sys/dk.h> +#endif /* __NetBSD__ && !__OpenBSD__ */ +#else /* KERNEL */ +extern int table(int id, int index, void *addr, int nel, u_int lel); +#endif /* KERNEL */ + +#ifdef __osf__ +static struct tbl_sysinfo start, stop; +#endif /* __osf__ */ + +#ifdef AIX +static int kmem_fd; +static off_t sysinfo_offset; +static struct sysinfo sysinfo_start, sysinfo_stop; +static struct nlist namelist[] = { + {{"sysinfo"}}, + {{""}}, +}; +#endif /* AIX */ + +#ifdef AIX +static void rf_ShutdownCpuMonitor(ignored) + void *ignored; +{ + close(kmem_fd); +} +#endif /* AIX */ + +int rf_ConfigureCpuMonitor(listp) + RF_ShutdownList_t **listp; +{ +#ifdef AIX + int rc; + + rc = knlist(namelist, 1, sizeof(struct nlist)); + if (rc) { + RF_ERRORMSG("Could not knlist() to config CPU monitor\n"); + return(errno); + } + if (namelist[0].n_value == 0) { + RF_ERRORMSG("Got bogus results from knlist() for CPU monitor\n"); + return(EIO); + } + sysinfo_offset = namelist[0].n_value; + kmem_fd = open("/dev/kmem", O_RDONLY); + if (kmem_fd < 0) { + perror("/dev/kmem"); + return(errno); + } + rc = rf_ShutdownCreate(listp, rf_ShutdownCpuMonitor, NULL); + if (rc) { + RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n", __FILE__, + __LINE__, rc); + rf_ShutdownCpuMonitor(NULL); + return(rc); + } +#endif /* AIX */ + return(0); +} + +void rf_start_cpu_monitor() +{ +#ifdef __osf__ +#ifndef KERNEL + if (table(TBL_SYSINFO, 0, &start, 1, sizeof(start)) != 1) { + printf("Unable to get sysinfo for cpu utilization monitor\n"); + perror("start_cpu_monitor"); + } +#else /* !KERNEL */ + /* start.si_user = cp_time[CP_USER]; + start.si_nice = cp_time[CP_NICE]; + start.si_sys = cp_time[CP_SYS]; + start.si_idle = cp_time[CP_IDLE]; + start.wait = cp_time[CP_WAIT]; */ +#endif /* !KERNEL */ +#endif /* __osf__ */ +#ifdef AIX + off_t off; + int rc; + + off = lseek(kmem_fd, sysinfo_offset, SEEK_SET); + RF_ASSERT(off == sysinfo_offset); + rc = read(kmem_fd, &sysinfo_start, sizeof(struct sysinfo)); + if (rc != sizeof(struct sysinfo)) { + RF_ERRORMSG2("Starting CPU monitor: rc=%d != %d\n", rc, + sizeof(struct sysinfo)); + } +#endif /* AIX */ +} + +void rf_stop_cpu_monitor() +{ +#ifdef __osf__ +#ifndef KERNEL + if (table(TBL_SYSINFO, 0, &stop, 1, sizeof(stop)) != 1) { + printf("Unable to get sysinfo for cpu utilization monitor\n"); + perror("stop_cpu_monitor"); + } +#else /* !KERNEL */ + /* stop.si_user = cp_time[CP_USER]; + stop.si_nice = cp_time[CP_NICE]; + stop.si_sys = cp_time[CP_SYS]; + stop.si_idle = cp_time[CP_IDLE]; + stop.wait = cp_time[CP_WAIT]; */ +#endif /* !KERNEL */ +#endif /* __osf__ */ +#ifdef AIX + off_t off; + int rc; + + off = lseek(kmem_fd, sysinfo_offset, SEEK_SET); + RF_ASSERT(off == sysinfo_offset); + rc = read(kmem_fd, &sysinfo_stop, sizeof(struct sysinfo)); + if (rc != sizeof(struct sysinfo)) { + RF_ERRORMSG2("Stopping CPU monitor: rc=%d != %d\n", rc, + sizeof(struct sysinfo)); + } +#endif /* AIX */ +} + +void rf_print_cpu_util(s) + char *s; +{ +#ifdef __osf__ + long totalticks, idleticks; + + idleticks = stop.si_idle - 
start.si_idle + stop.wait - start.wait; + totalticks = stop.si_user - start.si_user + stop.si_nice - start.si_nice + + stop.si_sys - start.si_sys + idleticks; + printf("CPU utilization during %s was %d %%\n", s, 100 - 100*idleticks/totalticks); +#endif /* __osf__ */ +#ifdef AIX + long idle; + + /* XXX compute a percentage here */ + idle = (long)(sysinfo_stop.cpu[CPU_IDLE] - sysinfo_start.cpu[CPU_IDLE]); + printf("%ld idle ticks during %s.\n", idle, s); +#endif /* AIX */ +} diff --git a/sys/dev/raidframe/rf_cpuutil.h b/sys/dev/raidframe/rf_cpuutil.h new file mode 100644 index 00000000000..72603d9aae6 --- /dev/null +++ b/sys/dev/raidframe/rf_cpuutil.h @@ -0,0 +1,57 @@ +/* $OpenBSD: rf_cpuutil.h,v 1.1 1999/01/11 14:29:03 niklas Exp $ */ +/* $NetBSD: rf_cpuutil.h,v 1.1 1998/11/13 04:20:27 oster Exp $ */ +/* + * rf_cpuutil.h + */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland, Jim Zelenka + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ +/* + * : + * Log: rf_cpuutil.h,v + * Revision 1.3 1996/07/18 22:57:14 jimz + * port simulator to AIX + * + * Revision 1.2 1996/05/24 01:59:45 jimz + * another checkpoint in code cleanup for release + * time to sync kernel tree + * + * Revision 1.1 1996/05/18 19:55:29 jimz + * Initial revision + * + */ + +#ifndef _RF__RF_CPUUTIL_H_ +#define _RF__RF_CPUUTIL_H_ + +#include "rf_types.h" + +int rf_ConfigureCpuMonitor(RF_ShutdownList_t **listp); +void rf_start_cpu_monitor(void); +void rf_stop_cpu_monitor(void); +void rf_print_cpu_util(char *s); + +#endif /* !_RF__RF_CPUUTIL_H_ */ diff --git a/sys/dev/raidframe/rf_cvscan.c b/sys/dev/raidframe/rf_cvscan.c new file mode 100644 index 00000000000..73a6e64d001 --- /dev/null +++ b/sys/dev/raidframe/rf_cvscan.c @@ -0,0 +1,450 @@ +/* $OpenBSD: rf_cvscan.c,v 1.1 1999/01/11 14:29:05 niklas Exp $ */ +/* $NetBSD: rf_cvscan.c,v 1.2 1998/11/18 15:13:51 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 
+ * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/******************************************************************************* + * + * cvscan.c -- prioritized cvscan disk queueing code. + * + * Nov 9, 1994, adapted from raidSim version (MCH) + * + ******************************************************************************/ + +/* + * : + * Log: rf_cvscan.c,v + * Revision 1.6 1996/07/27 23:36:08 jimz + * Solaris port of simulator + * + * Revision 1.5 1996/07/15 17:22:18 jimz + * nit-pick code cleanup + * resolve stdlib problems on DEC OSF + * + * Revision 1.4 1996/06/09 02:36:46 jimz + * lots of little crufty cleanup- fixup whitespace + * issues, comment #ifdefs, improve typing in some + * places (esp size-related) + * + * Revision 1.3 1996/06/07 22:26:27 jimz + * type-ify which_ru (RF_ReconUnitNum_t) + * + * Revision 1.2 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.1 1996/06/05 19:17:40 jimz + * Initial revision + * + */ + +#include "rf_types.h" +#include "rf_alloclist.h" +#include "rf_stripelocks.h" +#include "rf_layout.h" +#include "rf_diskqueue.h" +#include "rf_cvscan.h" +#include "rf_debugMem.h" +#include "rf_general.h" +#include "rf_sys.h" + +#define DO_CHECK_STATE(_hdr_) CheckCvscanState((_hdr_), __FILE__, __LINE__) + +#define pri_ok(p) ( ((p) == RF_IO_NORMAL_PRIORITY) || ((p) == RF_IO_LOW_PRIORITY)) + +static void CheckCvscanState(RF_CvscanHeader_t *hdr, char *file, int line) +{ + long i, key; + RF_DiskQueueData_t *tmp; + + if( hdr->left != (RF_DiskQueueData_t *) NULL ) + RF_ASSERT( hdr->left->sectorOffset < hdr->cur_block ); + for( key=hdr->cur_block, i=0, tmp=hdr->left; + tmp != (RF_DiskQueueData_t *) NULL; + key=tmp->sectorOffset, i++, tmp=tmp->next ) + RF_ASSERT( tmp->sectorOffset <= key + && tmp->priority == hdr->nxt_priority && pri_ok(tmp->priority) ); + RF_ASSERT( i == hdr->left_cnt ); + + for( key=hdr->cur_block, i=0, tmp=hdr->right; + tmp != (RF_DiskQueueData_t *) NULL; + key=tmp->sectorOffset, i++, tmp=tmp->next ) + { + RF_ASSERT(key <= tmp->sectorOffset); + RF_ASSERT(tmp->priority == hdr->nxt_priority); + RF_ASSERT(pri_ok(tmp->priority)); + } + RF_ASSERT( i == hdr->right_cnt ); + + for( key=hdr->nxt_priority-1, tmp=hdr->burner; + tmp != (RF_DiskQueueData_t *) NULL; + key=tmp->priority, tmp=tmp->next ) + { + RF_ASSERT(tmp); + RF_ASSERT(hdr); + RF_ASSERT(pri_ok(tmp->priority)); + RF_ASSERT(key >= tmp->priority); + RF_ASSERT(tmp->priority < hdr->nxt_priority); + } +} + + + +static void PriorityInsert(RF_DiskQueueData_t **list_ptr, RF_DiskQueueData_t *req ) +{ + /* + ** insert block pointed to by req in to list whose first + ** entry is pointed to by the pointer that list_ptr points to + ** ie., list_ptr is a grandparent of the first entry + */ + + for( ; (*list_ptr)!=(RF_DiskQueueData_t *)NULL && + (*list_ptr)->priority > req->priority; + list_ptr = &((*list_ptr)->next) ) {} + req->next = (*list_ptr); + (*list_ptr) = req; +} + + + +static void ReqInsert(RF_DiskQueueData_t **list_ptr, RF_DiskQueueData_t *req, RF_CvscanArmDir_t order) +{ + /* + ** insert block pointed to by req in to list whose first + ** entry is pointed to by the pointer that list_ptr 
points to + ** ie., list_ptr is a grandparent of the first entry + */ + + for( ; (*list_ptr)!=(RF_DiskQueueData_t *)NULL && + + ( (order==rf_cvscan_RIGHT && (*list_ptr)->sectorOffset <= req->sectorOffset) + || (order==rf_cvscan_LEFT && (*list_ptr)->sectorOffset > req->sectorOffset) ); + list_ptr = &((*list_ptr)->next) ) {} + req->next = (*list_ptr); + (*list_ptr) = req; +} + + + +static RF_DiskQueueData_t *ReqDequeue(RF_DiskQueueData_t **list_ptr) +{ + RF_DiskQueueData_t * ret = (*list_ptr); + if( (*list_ptr) != (RF_DiskQueueData_t *) NULL ) { + (*list_ptr) = (*list_ptr)->next; + } + return( ret ); +} + + + +static void ReBalance(RF_CvscanHeader_t *hdr) +{ + /* DO_CHECK_STATE(hdr); */ + while( hdr->right != (RF_DiskQueueData_t *) NULL + && hdr->right->sectorOffset < hdr->cur_block ) { + hdr->right_cnt--; + hdr->left_cnt++; + ReqInsert( &hdr->left, ReqDequeue( &hdr->right ), rf_cvscan_LEFT ); + } + /* DO_CHECK_STATE(hdr); */ +} + + + +static void Transfer(RF_DiskQueueData_t **to_list_ptr, RF_DiskQueueData_t **from_list_ptr ) +{ + RF_DiskQueueData_t *gp; + for( gp=(*from_list_ptr); gp != (RF_DiskQueueData_t *) NULL; ) { + RF_DiskQueueData_t *p = gp->next; + PriorityInsert( to_list_ptr, gp ); + gp = p; + } + (*from_list_ptr) = (RF_DiskQueueData_t *) NULL; +} + + + +static void RealEnqueue(RF_CvscanHeader_t *hdr, RF_DiskQueueData_t *req) +{ + RF_ASSERT(req->priority == RF_IO_NORMAL_PRIORITY || req->priority == RF_IO_LOW_PRIORITY); + + DO_CHECK_STATE(hdr); + if( hdr->left_cnt == 0 && hdr->right_cnt == 0 ) { + hdr->nxt_priority = req->priority; + } + if( req->priority > hdr->nxt_priority ) { + /* + ** dump all other outstanding requests on the back burner + */ + Transfer( &hdr->burner, &hdr->left ); + Transfer( &hdr->burner, &hdr->right ); + hdr->left_cnt = 0; + hdr->right_cnt = 0; + hdr->nxt_priority = req->priority; + } + if( req->priority < hdr->nxt_priority ) { + /* + ** yet another low priority task! 
+ */ + PriorityInsert( &hdr->burner, req ); + } else { + if( req->sectorOffset < hdr->cur_block ) { + /* this request is to the left of the current arms */ + ReqInsert( &hdr->left, req, rf_cvscan_LEFT ); + hdr->left_cnt++; + } else { + /* this request is to the right of the current arms */ + ReqInsert( &hdr->right, req, rf_cvscan_RIGHT ); + hdr->right_cnt++; + } + } + DO_CHECK_STATE(hdr); +} + + + +void rf_CvscanEnqueue(void *q_in, RF_DiskQueueData_t *elem, int priority) +{ + RF_CvscanHeader_t *hdr = (RF_CvscanHeader_t *) q_in; + RealEnqueue( hdr, elem /*req*/ ); +} + + + +RF_DiskQueueData_t *rf_CvscanDequeue(void *q_in) +{ + RF_CvscanHeader_t *hdr = (RF_CvscanHeader_t *) q_in; + long range, i, sum_dist_left, sum_dist_right; + RF_DiskQueueData_t *ret; + RF_DiskQueueData_t *tmp; + + DO_CHECK_STATE(hdr); + + if( hdr->left_cnt == 0 && hdr->right_cnt == 0 ) return( (RF_DiskQueueData_t *) NULL ); + + range = RF_MIN( hdr->range_for_avg, RF_MIN(hdr->left_cnt,hdr->right_cnt)); + for( i=0, tmp=hdr->left, sum_dist_left= + ((hdr->direction==rf_cvscan_RIGHT)?range*hdr->change_penalty:0); + tmp != (RF_DiskQueueData_t *) NULL && i < range; + tmp = tmp->next, i++ ) { + sum_dist_left += hdr->cur_block - tmp->sectorOffset; + } + for( i=0, tmp=hdr->right, sum_dist_right= + ((hdr->direction==rf_cvscan_LEFT)?range*hdr->change_penalty:0); + tmp != (RF_DiskQueueData_t *) NULL && i < range; + tmp = tmp->next, i++ ) { + sum_dist_right += tmp->sectorOffset - hdr->cur_block; + } + + if( hdr->right_cnt == 0 || sum_dist_left < sum_dist_right ) { + hdr->direction = rf_cvscan_LEFT; + hdr->cur_block = hdr->left->sectorOffset + hdr->left->numSector; + hdr->left_cnt = RF_MAX(hdr->left_cnt-1,0); + tmp = hdr->left; + ret = (ReqDequeue(&hdr->left))/*->parent*/; + } else { + hdr->direction = rf_cvscan_RIGHT; + hdr->cur_block = hdr->right->sectorOffset + hdr->right->numSector; + hdr->right_cnt = RF_MAX(hdr->right_cnt-1,0); + tmp = hdr->right; + ret = (ReqDequeue(&hdr->right))/*->parent*/; + } + ReBalance( hdr ); + + if( hdr->left_cnt == 0 && hdr->right_cnt == 0 + && hdr->burner != (RF_DiskQueueData_t *) NULL ) { + /* + ** restore low priority requests for next dequeue + */ + RF_DiskQueueData_t *burner = hdr->burner; + hdr->nxt_priority = burner->priority; + while( burner != (RF_DiskQueueData_t *) NULL + && burner->priority == hdr->nxt_priority ) { + RF_DiskQueueData_t *next = burner->next; + RealEnqueue( hdr, burner ); + burner = next; + } + hdr->burner = burner; + } + DO_CHECK_STATE(hdr); + return( ret ); +} + + + +RF_DiskQueueData_t *rf_CvscanPeek(void *q_in) +{ + RF_CvscanHeader_t *hdr = (RF_CvscanHeader_t *) q_in; + long range, i, sum_dist_left, sum_dist_right; + RF_DiskQueueData_t *tmp, *headElement; + + DO_CHECK_STATE(hdr); + + if( hdr->left_cnt == 0 && hdr->right_cnt == 0 ) + headElement = NULL; + else { + range = RF_MIN( hdr->range_for_avg, RF_MIN(hdr->left_cnt,hdr->right_cnt)); + for( i=0, tmp=hdr->left, sum_dist_left= + ((hdr->direction==rf_cvscan_RIGHT)?range*hdr->change_penalty:0); + tmp != (RF_DiskQueueData_t *) NULL && i < range; + tmp = tmp->next, i++ ) { + sum_dist_left += hdr->cur_block - tmp->sectorOffset; + } + for( i=0, tmp=hdr->right, sum_dist_right= + ((hdr->direction==rf_cvscan_LEFT)?range*hdr->change_penalty:0); + tmp != (RF_DiskQueueData_t *) NULL && i < range; + tmp = tmp->next, i++ ) { + sum_dist_right += tmp->sectorOffset - hdr->cur_block; + } + + if( hdr->right_cnt == 0 || sum_dist_left < sum_dist_right ) + headElement = hdr->left; + else + headElement = hdr->right; + } + return(headElement); +} + 
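[Editor's note] The two distance loops above are the whole of the CVSCAN decision: rf_CvscanDequeue sums the distance from the current arm position to the nearest range_for_avg pending requests on each side, charges the side the arm is not already moving toward a change penalty, and then services whichever side comes out cheaper. The short standalone sketch below is an editorial illustration only, not part of the imported sources; the array-based queues, names, and sample figures are assumptions standing in for the RF_DiskQueueData_t lists.

/*
 * Standalone sketch of the CVSCAN side-selection rule used by
 * rf_CvscanDequeue() above.  Plain arrays stand in for the
 * RF_DiskQueueData_t lists; all names and numbers are illustrative.
 */
#include <stdio.h>

/* cost of servicing one side: sum of distances from the current arm
 * position to the nearest n requests on that side, plus a change
 * penalty if the arm is currently moving toward the other side */
static long
side_cost(const long *offs, int n, long cur, int away, long penalty)
{
	long sum = away ? (long)n * penalty : 0;
	int i;

	for (i = 0; i < n; i++)
		sum += (offs[i] > cur) ? offs[i] - cur : cur - offs[i];
	return (sum);
}

int
main(void)
{
	long left[]  = { 90, 70, 40 };	/* pending requests below cur, nearest first */
	long right[] = { 130, 180 };	/* pending requests above cur, nearest first */
	int nleft = 3, nright = 2;
	long cur = 100, penalty = 20;
	int range_for_avg = 2, going_right = 1;
	int n, go_left;
	long dl, dr;

	/* as in rf_CvscanDequeue, the caller handles the both-empty case */
	n = range_for_avg;
	if (nleft < n)
		n = nleft;
	if (nright < n)
		n = nright;

	dl = side_cost(left, n, cur, going_right, penalty);
	dr = side_cost(right, n, cur, !going_right, penalty);

	go_left = (nright == 0 || dl < dr);
	printf("left cost %ld, right cost %ld -> service the %s side\n",
	    dl, dr, go_left ? "left" : "right");
	return (0);
}

With range_for_avg set to 1 and a zero penalty this selection degenerates to shortest-seek-time-first, while a very large penalty keeps the arm sweeping in one direction like SCAN, which is what the CVSCAN(1,0) / CVSCAN(1,infinity) remark that follows in the source refers to.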
+ + +/* +** CVSCAN( 1, 0 ) is Shortest Seek Time First (SSTF) +** lowest average response time +** CVSCAN( 1, infinity ) is SCAN +** lowest response time standard deviation +*/ + + +int rf_CvscanConfigure() +{ + return(0); +} + + + +void *rf_CvscanCreate(RF_SectorCount_t sectPerDisk, + RF_AllocListElem_t *clList, + RF_ShutdownList_t **listp) +{ + RF_CvscanHeader_t *hdr; + long range = 2; /* Currently no mechanism to change these */ + long penalty = sectPerDisk / 5; + + RF_MallocAndAdd(hdr, sizeof(RF_CvscanHeader_t), (RF_CvscanHeader_t *), clList); + bzero((char *)hdr, sizeof(RF_CvscanHeader_t)); + hdr->range_for_avg = RF_MAX( range, 1 ); + hdr->change_penalty = RF_MAX( penalty, 0 ); + hdr->direction = rf_cvscan_RIGHT; + hdr->cur_block = 0; + hdr->left_cnt = hdr->right_cnt = 0; + hdr->left = hdr->right = (RF_DiskQueueData_t *) NULL; + hdr->burner = (RF_DiskQueueData_t *) NULL; + DO_CHECK_STATE(hdr); + + return( (void *) hdr ); +} + + +#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL) +/* PrintCvscanQueue is not used, so we ignore it... */ +#else +static void PrintCvscanQueue(RF_CvscanHeader_t *hdr) +{ + RF_DiskQueueData_t *tmp; + + printf( "CVSCAN(%d,%d) at %d going %s\n", + (int)hdr->range_for_avg, + (int)hdr->change_penalty, + (int)hdr->cur_block, + (hdr->direction==rf_cvscan_LEFT)?"LEFT":"RIGHT" ); + printf( "\tLeft(%d): ", hdr->left_cnt ); + for( tmp = hdr->left; tmp != (RF_DiskQueueData_t *) NULL; tmp = tmp->next) + printf( "(%d,%ld,%d) ", + (int) tmp->sectorOffset, + (long) (tmp->sectorOffset + tmp->numSector), + tmp->priority ); + printf( "\n" ); + printf( "\tRight(%d): ", hdr->right_cnt ); + for( tmp = hdr->right; tmp != (RF_DiskQueueData_t *) NULL; tmp = tmp->next) + printf( "(%d,%ld,%d) ", + (int) tmp->sectorOffset, + (long) (tmp->sectorOffset + tmp->numSector), + tmp->priority ); + printf( "\n" ); + printf( "\tBurner: " ); + for( tmp = hdr->burner; tmp != (RF_DiskQueueData_t *) NULL; tmp = tmp->next) + printf( "(%d,%ld,%d) ", + (int) tmp->sectorOffset, + (long) (tmp->sectorOffset + tmp->numSector), + tmp->priority ); + printf( "\n" ); +} +#endif + + +/* promotes reconstruction accesses for the given stripeID to normal priority. + * returns 1 if an access was found and zero otherwise. Normally, we should + * only have one or zero entries in the burner queue, so execution time should + * be short. 
+ */ +int rf_CvscanPromote(void *q_in, RF_StripeNum_t parityStripeID, RF_ReconUnitNum_t which_ru) +{ + RF_CvscanHeader_t *hdr = (RF_CvscanHeader_t *) q_in; + RF_DiskQueueData_t *trailer = NULL, *tmp = hdr->burner, *tlist = NULL; + int retval=0; + + DO_CHECK_STATE(hdr); + while (tmp) { /* handle entries at the front of the list */ + if (tmp->parityStripeID == parityStripeID && tmp->which_ru == which_ru) { + hdr->burner = tmp->next; + tmp->priority = RF_IO_NORMAL_PRIORITY; + tmp->next = tlist; tlist=tmp; + tmp = hdr->burner; + } else break; + } + if (tmp) {trailer=tmp; tmp=tmp->next;} + while (tmp) { /* handle entries on the rest of the list */ + if (tmp->parityStripeID == parityStripeID && tmp->which_ru == which_ru) { + trailer->next = tmp->next; + tmp->priority = RF_IO_NORMAL_PRIORITY; + tmp->next = tlist; tlist=tmp; /* insert on a temp queue */ + tmp = trailer->next; + } else { + trailer=tmp; tmp=tmp->next; + } + } + while (tlist) { + retval++; + tmp = tlist->next; + RealEnqueue(hdr, tlist); + tlist = tmp; + } + RF_ASSERT(retval==0 || retval==1); + DO_CHECK_STATE((RF_CvscanHeader_t *)q_in); + return(retval); +} + diff --git a/sys/dev/raidframe/rf_cvscan.h b/sys/dev/raidframe/rf_cvscan.h new file mode 100644 index 00000000000..4347fb06a63 --- /dev/null +++ b/sys/dev/raidframe/rf_cvscan.h @@ -0,0 +1,97 @@ +/* $OpenBSD: rf_cvscan.h,v 1.1 1999/01/11 14:29:06 niklas Exp $ */ +/* $NetBSD: rf_cvscan.h,v 1.1 1998/11/13 04:20:27 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* +** Disk scheduling by CVSCAN( N, r ) +** +** Given a set of requests, partition them into one set on each +** side of the current arm position. The trick is to pick which +** side you are going to service next; once a side is picked you will +** service the closest request. +** Let there be n1 requests on one side and n2 requests on the other +** side. If one of n1 or n2 is zero, select the other side. +** If both n1 and n2 are nonzero, select a "range" for examination +** that is N' = min( n1, n2, N ). Average the distance from the +** current position to the nearest N' requests on each side giving +** d1 and d2. +** Suppose the last decision was to move toward set 2, then the +** current direction is toward set 2, and you will only switch to set +** 1 if d1+R < d2 where R is r*(total number of cylinders), r in [0,1]. +** +** I extend this by applying only to the set of requests that all +** share the same, highest priority level. 
+*/ + +/* : + * Log: rf_cvscan.h,v + * Revision 1.3 1996/06/07 22:26:27 jimz + * type-ify which_ru (RF_ReconUnitNum_t) + * + * Revision 1.2 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.1 1996/06/05 19:17:40 jimz + * Initial revision + * + */ + +#ifndef _RF__RF_CVSCAN_H_ +#define _RF__RF_CVSCAN_H_ + +#include "rf_diskqueue.h" + +typedef enum RF_CvscanArmDir_e { + rf_cvscan_LEFT, + rf_cvscan_RIGHT +} RF_CvscanArmDir_t; + +typedef struct RF_CvscanHeader_s { + long range_for_avg; /* CVSCAN param N */ + long change_penalty; /* CVSCAN param R */ + RF_CvscanArmDir_t direction; + RF_SectorNum_t cur_block; + int nxt_priority; + RF_DiskQueueData_t *left; + int left_cnt; + RF_DiskQueueData_t *right; + int right_cnt; + RF_DiskQueueData_t *burner; +} RF_CvscanHeader_t; + +int rf_CvscanConfigure(void); +void *rf_CvscanCreate(RF_SectorCount_t sect_per_disk, + RF_AllocListElem_t *cl_list, RF_ShutdownList_t **listp); +void rf_CvscanEnqueue(void *qptr, RF_DiskQueueData_t *req, int priority); +RF_DiskQueueData_t *rf_CvscanDequeue(void *qptr); +RF_DiskQueueData_t *rf_CvscanPeek(void *qptr); +int rf_CvscanPromote(void *qptr, RF_StripeNum_t parityStripeID, + RF_ReconUnitNum_t which_ru); + +#endif /* !_RF__RF_CVSCAN_H_ */ diff --git a/sys/dev/raidframe/rf_dag.h b/sys/dev/raidframe/rf_dag.h new file mode 100644 index 00000000000..f13fc3f76c3 --- /dev/null +++ b/sys/dev/raidframe/rf_dag.h @@ -0,0 +1,320 @@ +/* $OpenBSD: rf_dag.h,v 1.1 1999/01/11 14:29:06 niklas Exp $ */ +/* $NetBSD: rf_dag.h,v 1.1 1998/11/13 04:20:27 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: William V. Courtright II, Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. 
+ */ + +/**************************************************************************** + * * + * dag.h -- header file for DAG-related data structures * + * * + ****************************************************************************/ +/* + * + * : + * Log: rf_dag.h,v + * Revision 1.35 1996/11/05 18:38:37 jimz + * add patch from galvarez@cs.ucsd.edu (Guillermo Alvarez) + * to fix dag_params memory-sizing problem (should be an array + * of the type, not an array of pointers to the type) + * + * Revision 1.34 1996/07/28 20:31:39 jimz + * i386netbsd port + * true/false fixup + * + * Revision 1.33 1996/07/22 19:52:16 jimz + * switched node params to RF_DagParam_t, a union of + * a 64-bit int and a void *, for better portability + * attempted hpux port, but failed partway through for + * lack of a single C compiler capable of compiling all + * source files + * + * Revision 1.32 1996/06/10 22:22:13 wvcii + * added two node status types for use in backward error + * recovery experiments. + * + * Revision 1.31 1996/06/09 02:36:46 jimz + * lots of little crufty cleanup- fixup whitespace + * issues, comment #ifdefs, improve typing in some + * places (esp size-related) + * + * Revision 1.30 1996/06/07 22:49:18 jimz + * fix up raidPtr typing + * + * Revision 1.29 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.28 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.27 1996/05/24 04:28:55 jimz + * release cleanup ckpt + * + * Revision 1.26 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.25 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.24 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.23 1996/05/16 23:05:20 jimz + * Added dag_ptrs field, RF_DAG_PTRCACHESIZE + * + * The dag_ptrs field of the node is basically some scribble + * space to be used here. We could get rid of it, and always + * allocate the range of pointers, but that's expensive. So, + * we pick a "common case" size for the pointer cache. 
Hopefully, + * we'll find that: + * (1) Generally, nptrs doesn't exceed RF_DAG_PTRCACHESIZE by + * only a little bit (least efficient case) + * (2) Generally, ntprs isn't a lot less than RF_DAG_PTRCACHESIZE + * (wasted memory) + * + * Revision 1.22 1996/05/08 21:01:24 jimz + * fixed up enum type names that were conflicting with other + * enums and function names (ie, "panic") + * future naming trends will be towards RF_ and rf_ for + * everything raidframe-related + * + * Revision 1.21 1996/05/08 15:23:47 wvcii + * added new node states: undone, recover, panic + * + * Revision 1.20 1995/12/01 14:59:19 root + * increased MAX_ANTECEDENTS from 10 to 20 + * should consider getting rid of this (eliminate static array) + * + * Revision 1.19 1995/11/30 15:58:59 wvcii + * added copyright info + * + * Revision 1.18 1995/11/19 16:27:03 wvcii + * created struct dagList + * + * Revision 1.17 1995/11/07 15:43:01 wvcii + * added static array to DAGnode: antType + * added commitNode type + * added commit node counts to dag header + * added ptr (firstDag) to support multi-dag requests + * added succedent done/fired counts to nodes to support rollback + * added node status type "skipped" + * added hdr status types "rollForward, rollBackward" + * deleted hdr status type "disable" + * updated ResetNode & ResetDAGHeader to zero new fields + * + */ + +#ifndef _RF__RF_DAG_H_ +#define _RF__RF_DAG_H_ + +#include "rf_types.h" +#include "rf_threadstuff.h" +#include "rf_alloclist.h" +#include "rf_stripelocks.h" +#include "rf_layout.h" +#include "rf_dagflags.h" +#include "rf_acctrace.h" +#include "rf_memchunk.h" + +#define RF_THREAD_CONTEXT 0 /* we were invoked from thread context */ +#define RF_INTR_CONTEXT 1 /* we were invoked from interrupt context */ +#define RF_MAX_ANTECEDENTS 20 /* max num of antecedents a node may posses */ + +#ifdef KERNEL +#include <sys/buf.h> +#endif /* KERNEL */ + +struct RF_PropHeader_s { /* structure for propagation of results */ + int resultNum; /* bind result # resultNum */ + int paramNum; /* to parameter # paramNum */ + RF_PropHeader_t *next; /* linked list for multiple results/params */ +}; + +typedef enum RF_NodeStatus_e { + rf_bwd1, /* node is ready for undo logging (backward error recovery only) */ + rf_bwd2, /* node has completed undo logging (backward error recovery only) */ + rf_wait, /* node is waiting to be executed */ + rf_fired, /* node is currently executing its do function */ + rf_good, /* node successfully completed execution of its do function */ + rf_bad, /* node failed to successfully execute its do function */ + rf_skipped, /* not used anymore, used to imply a node was not executed */ + rf_recover, /* node is currently executing its undo function */ + rf_panic, /* node failed to successfully execute its undo function */ + rf_undone /* node successfully executed its undo function */ +} RF_NodeStatus_t; + +/* + * These were used to control skipping a node. + * Now, these are only used as comments. 
+ */ +typedef enum RF_AntecedentType_e { + rf_trueData, + rf_antiData, + rf_outputData, + rf_control +} RF_AntecedentType_t; + +#define RF_DAG_PTRCACHESIZE 40 +#define RF_DAG_PARAMCACHESIZE 12 + +typedef RF_uint8 RF_DagNodeFlags_t; + +struct RF_DagNode_s { + RF_NodeStatus_t status; /* current status of this node */ + int (*doFunc)(RF_DagNode_t *); /* normal function */ + int (*undoFunc)(RF_DagNode_t *); /* func to remove effect of doFunc */ + int (*wakeFunc)(RF_DagNode_t *, int status); /* func called when the node completes an I/O */ + int numParams; /* number of parameters required by *funcPtr */ + int numResults; /* number of results produced by *funcPtr */ + int numAntecedents; /* number of antecedents */ + int numAntDone; /* number of antecedents which have finished */ + int numSuccedents; /* number of succedents */ + int numSuccFired; /* incremented when a succedent is fired during forward execution */ + int numSuccDone; /* incremented when a succedent finishes during rollBackward */ + int commitNode; /* boolean flag - if true, this is a commit node */ + RF_DagNode_t **succedents; /* succedents, array size numSuccedents */ + RF_DagNode_t **antecedents; /* antecedents, array size numAntecedents */ + RF_AntecedentType_t antType[RF_MAX_ANTECEDENTS]; /* type of each antecedent */ + void **results; /* array of results produced by *funcPtr */ + RF_DagParam_t *params; /* array of parameters required by *funcPtr */ + RF_PropHeader_t **propList; /* propagation list, size numSuccedents */ + RF_DagHeader_t *dagHdr; /* ptr to head of dag containing this node */ + void *dagFuncData; /* dag execution func uses this for whatever it wants */ + RF_DagNode_t *next; + int nodeNum; /* used by PrintDAG for debug only */ + int visited; /* used to avoid re-visiting nodes on DAG walks */ + /* ANY CODE THAT USES THIS FIELD MUST MAINTAIN THE PROPERTY + * THAT AFTER IT FINISHES, ALL VISITED FLAGS IN THE DAG ARE IDENTICAL */ + char *name; /* debug only */ + RF_DagNodeFlags_t flags; /* see below */ + RF_DagNode_t *dag_ptrs[RF_DAG_PTRCACHESIZE]; /* cache for performance */ + RF_DagParam_t dag_params[RF_DAG_PARAMCACHESIZE]; /* cache for performance */ +}; + +/* + * Bit values for flags field of RF_DagNode_t + */ +#define RF_DAGNODE_FLAG_NONE 0x00 +#define RF_DAGNODE_FLAG_YIELD 0x01 /* in the kernel, yield the processor before firing this node */ + +/* enable - DAG ready for normal execution, no errors encountered + * rollForward - DAG encountered an error after commit point, rolling forward + * rollBackward - DAG encountered an error prior to commit point, rolling backward + */ +typedef enum RF_DagStatus_e { + rf_enable, + rf_rollForward, + rf_rollBackward +} RF_DagStatus_t; + +#define RF_MAX_HDR_SUCC 1 + +#define RF_MAXCHUNKS 10 + +struct RF_DagHeader_s { + RF_DagStatus_t status; /* status of this DAG */ + int numSuccedents; /* DAG may be a tree, i.e. 
may have > 1 root */ + int numCommitNodes; /* number of commit nodes in graph */ + int numCommits; /* number of commit nodes which have been fired */ + RF_DagNode_t *succedents[RF_MAX_HDR_SUCC]; /* array of succedents, size numSuccedents */ + RF_DagHeader_t *next; /* ptr to allow a list of dags */ + RF_AllocListElem_t *allocList; /* ptr to list of ptrs to be freed prior to freeing DAG */ + RF_AccessStripeMapHeader_t *asmList; /* list of access stripe maps to be freed */ + int nodeNum; /* used by PrintDAG for debug only */ + int numNodesCompleted; + RF_AccTraceEntry_t *tracerec; /* perf mon only */ + + void (*cbFunc)(void *); /* function to call when the dag completes */ + void *cbArg; /* argument for cbFunc */ + char *creator; /* name of function used to create this dag */ + + RF_Raid_t *raidPtr; /* the descriptor for the RAID device this DAG is for */ + void *bp; /* the bp for this I/O passed down from the file system. ignored outside kernel */ + + RF_ChunkDesc_t *memChunk[RF_MAXCHUNKS]; /* experimental- Chunks of memory to be retained upon DAG free for re-use */ + int chunkIndex; /* the idea is to avoid calls to alloc and free */ + + RF_ChunkDesc_t **xtraMemChunk; /* escape hatch which allows SelectAlgorithm to merge memChunks from several dags */ + int xtraChunkIndex; /* number of ptrs to valid chunks */ + int xtraChunkCnt; /* number of ptrs to chunks allocated */ + +#ifdef SIMULATE + int done; /* Tag to tell if termination node has been fired */ +#endif /* SIMULATE */ +}; + +struct RF_DagList_s { + /* common info for a list of dags which will be fired sequentially */ + int numDags; /* number of dags in the list */ + int numDagsFired; /* number of dags in list which have initiated execution */ + int numDagsDone; /* number of dags in list which have completed execution */ + RF_DagHeader_t *dags; /* list of dags */ + RF_RaidAccessDesc_t *desc; /* ptr to descriptor for this access */ + RF_AccTraceEntry_t tracerec; /* perf mon info for dags (not user info) */ +}; + +/* resets a node so that it can be fired again */ +#define RF_ResetNode(_n_) { \ + (_n_)->status = rf_wait; \ + (_n_)->numAntDone = 0; \ + (_n_)->numSuccFired = 0; \ + (_n_)->numSuccDone = 0; \ + (_n_)->next = NULL; \ +} + +#ifdef SIMULATE +#define RF_ResetDagHeader(_h_) { \ + (_h_)->done = RF_FALSE; \ + (_h_)->numNodesCompleted = 0; \ + (_h_)->numCommits = 0; \ + (_h_)->status = rf_enable; \ +} +#else /* SIMULATE */ +#define RF_ResetDagHeader(_h_) { \ + (_h_)->numNodesCompleted = 0; \ + (_h_)->numCommits = 0; \ + (_h_)->status = rf_enable; \ +} +#endif /* SIMULATE */ + +/* convience macro for declaring a create dag function */ + +#define RF_CREATE_DAG_FUNC_DECL(_name_) \ +void _name_ ( \ + RF_Raid_t *raidPtr, \ + RF_AccessStripeMap_t *asmap, \ + RF_DagHeader_t *dag_h, \ + void *bp, \ + RF_RaidAccessFlags_t flags, \ + RF_AllocListElem_t *allocList) + +#endif /* !_RF__RF_DAG_H_ */ diff --git a/sys/dev/raidframe/rf_dagdegrd.c b/sys/dev/raidframe/rf_dagdegrd.c new file mode 100644 index 00000000000..06390061306 --- /dev/null +++ b/sys/dev/raidframe/rf_dagdegrd.c @@ -0,0 +1,1212 @@ +/* $OpenBSD: rf_dagdegrd.c,v 1.1 1999/01/11 14:29:06 niklas Exp $ */ +/* $NetBSD: rf_dagdegrd.c,v 1.1 1998/11/13 04:20:27 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland, Daniel Stodolsky, William V. 
Courtright II + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* + * rf_dagdegrd.c + * + * code for creating degraded read DAGs + * + * : + * Log: rf_dagdegrd.c,v + * Revision 1.20 1996/11/05 21:10:40 jimz + * failed pda generalization + * + * Revision 1.19 1996/08/19 23:30:36 jimz + * fix chained declustered accesses in degraded mode when mirror copy is failed + * (workload shifting not allowed when there are no duplicate copies extant) + * + * Revision 1.18 1996/07/31 16:29:01 jimz + * asm/asmap re-fix (EO merge) + * + * Revision 1.17 1996/07/31 15:34:34 jimz + * evenodd changes; bugfixes for double-degraded archs, generalize + * some formerly PQ-only functions + * + * Revision 1.16 1996/07/28 20:31:39 jimz + * i386netbsd port + * true/false fixup + * + * Revision 1.15 1996/07/27 23:36:08 jimz + * Solaris port of simulator + * + * Revision 1.14 1996/07/22 19:52:16 jimz + * switched node params to RF_DagParam_t, a union of + * a 64-bit int and a void *, for better portability + * attempted hpux port, but failed partway through for + * lack of a single C compiler capable of compiling all + * source files + * + * Revision 1.13 1996/06/09 02:36:46 jimz + * lots of little crufty cleanup- fixup whitespace + * issues, comment #ifdefs, improve typing in some + * places (esp size-related) + * + * Revision 1.12 1996/06/07 22:26:27 jimz + * type-ify which_ru (RF_ReconUnitNum_t) + * + * Revision 1.11 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.10 1996/05/31 22:26:54 jimz + * fix a lot of mapping problems, memory allocation problems + * found some weird lock issues, fixed 'em + * more code cleanup + * + * Revision 1.9 1996/05/30 11:29:41 jimz + * Numerous bug fixes. Stripe lock release code disagreed with the taking code + * about when stripes should be locked (I made it consistent: no parity, no lock) + * There was a lot of extra serialization of I/Os which I've removed- a lot of + * it was to calculate values for the cache code, which is no longer with us. + * More types, function, macro cleanup. Added code to properly quiesce the array + * on shutdown. Made a lot of stuff array-specific which was (bogusly) general + * before. Fixed memory allocation, freeing bugs. 
+ * + * Revision 1.8 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.7 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.6 1996/05/24 04:28:55 jimz + * release cleanup ckpt + * + * Revision 1.5 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.4 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.3 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.2 1996/05/08 21:01:24 jimz + * fixed up enum type names that were conflicting with other + * enums and function names (ie, "panic") + * future naming trends will be towards RF_ and rf_ for + * everything raidframe-related + * + * Revision 1.1 1996/05/03 19:22:23 wvcii + * Initial revision + * + */ + +#include "rf_types.h" +#include "rf_raid.h" +#include "rf_dag.h" +#include "rf_dagutils.h" +#include "rf_dagfuncs.h" +#include "rf_threadid.h" +#include "rf_debugMem.h" +#include "rf_memchunk.h" +#include "rf_general.h" +#include "rf_dagdegrd.h" +#include "rf_sys.h" + + +/****************************************************************************** + * + * General comments on DAG creation: + * + * All DAGs in this file use roll-away error recovery. Each DAG has a single + * commit node, usually called "Cmt." If an error occurs before the Cmt node + * is reached, the execution engine will halt forward execution and work + * backward through the graph, executing the undo functions. Assuming that + * each node in the graph prior to the Cmt node are undoable and atomic - or - + * does not make changes to permanent state, the graph will fail atomically. + * If an error occurs after the Cmt node executes, the engine will roll-forward + * through the graph, blindly executing nodes until it reaches the end. + * If a graph reaches the end, it is assumed to have completed successfully. + * + * A graph has only 1 Cmt node. + * + */ + + +/****************************************************************************** + * + * The following wrappers map the standard DAG creation interface to the + * DAG creation routines. Additionally, these wrappers enable experimentation + * with new DAG structures by providing an extra level of indirection, allowing + * the DAG creation routines to be replaced at this single point. 
+ */ + +void rf_CreateRaidFiveDegradedReadDAG( + RF_Raid_t *raidPtr, + RF_AccessStripeMap_t *asmap, + RF_DagHeader_t *dag_h, + void *bp, + RF_RaidAccessFlags_t flags, + RF_AllocListElem_t *allocList) +{ + rf_CreateDegradedReadDAG(raidPtr, asmap, dag_h, bp, flags, allocList, + &rf_xorRecoveryFuncs); +} + + +/****************************************************************************** + * + * DAG creation code begins here + */ + + +/****************************************************************************** + * Create a degraded read DAG for RAID level 1 + * + * Hdr -> Nil -> R(p/s)d -> Commit -> Trm + * + * The "Rd" node reads data from the surviving disk in the mirror pair + * Rpd - read of primary copy + * Rsd - read of secondary copy + * + * Parameters: raidPtr - description of the physical array + * asmap - logical & physical addresses for this access + * bp - buffer ptr (for holding write data) + * flags - general flags (e.g. disk locking) + * allocList - list of memory allocated in DAG creation + *****************************************************************************/ + +void rf_CreateRaidOneDegradedReadDAG( + RF_Raid_t *raidPtr, + RF_AccessStripeMap_t *asmap, + RF_DagHeader_t *dag_h, + void *bp, + RF_RaidAccessFlags_t flags, + RF_AllocListElem_t *allocList) +{ + RF_DagNode_t *nodes, *rdNode, *blockNode, *commitNode, *termNode; + RF_StripeNum_t parityStripeID; + RF_ReconUnitNum_t which_ru; + RF_PhysDiskAddr_t *pda; + int useMirror, i; + + useMirror = 0; + parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout), + asmap->raidAddress, &which_ru); + if (rf_dagDebug) { + printf("[Creating RAID level 1 degraded read DAG]\n"); + } + dag_h->creator = "RaidOneDegradedReadDAG"; + /* alloc the Wnd nodes and the Wmir node */ + if (asmap->numDataFailed == 0) + useMirror = RF_FALSE; + else + useMirror = RF_TRUE; + + /* total number of nodes = 1 + (block + commit + terminator) */ + RF_CallocAndAdd(nodes, 4, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList); + i = 0; + rdNode = &nodes[i]; i++; + blockNode = &nodes[i]; i++; + commitNode = &nodes[i]; i++; + termNode = &nodes[i]; i++; + + /* this dag can not commit until the commit node is reached. 
errors prior + * to the commit point imply the dag has failed and must be retried + */ + dag_h->numCommitNodes = 1; + dag_h->numCommits = 0; + dag_h->numSuccedents = 1; + + /* initialize the block, commit, and terminator nodes */ + rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, + NULL, 1, 0, 0, 0, dag_h, "Nil", allocList); + rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc, + NULL, 1, 1, 0, 0, dag_h, "Cmt", allocList); + rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, + NULL, 0, 1, 0, 0, dag_h, "Trm", allocList); + + pda = asmap->physInfo; + RF_ASSERT(pda != NULL); + /* parityInfo must describe entire parity unit */ + RF_ASSERT(asmap->parityInfo->next == NULL); + + /* initialize the data node */ + if (!useMirror) { + /* read primary copy of data */ + rf_InitNode(rdNode, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, + rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rpd", allocList); + rdNode->params[0].p = pda; + rdNode->params[1].p = pda->bufPtr; + rdNode->params[2].v = parityStripeID; + rdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); + } + else { + /* read secondary copy of data */ + rf_InitNode(rdNode, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, + rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rsd", allocList); + rdNode->params[0].p = asmap->parityInfo; + rdNode->params[1].p = pda->bufPtr; + rdNode->params[2].v = parityStripeID; + rdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); + } + + /* connect header to block node */ + RF_ASSERT(dag_h->numSuccedents == 1); + RF_ASSERT(blockNode->numAntecedents == 0); + dag_h->succedents[0] = blockNode; + + /* connect block node to rdnode */ + RF_ASSERT(blockNode->numSuccedents == 1); + RF_ASSERT(rdNode->numAntecedents == 1); + blockNode->succedents[0] = rdNode; + rdNode->antecedents[0] = blockNode; + rdNode->antType[0] = rf_control; + + /* connect rdnode to commit node */ + RF_ASSERT(rdNode->numSuccedents == 1); + RF_ASSERT(commitNode->numAntecedents == 1); + rdNode->succedents[0] = commitNode; + commitNode->antecedents[0] = rdNode; + commitNode->antType[0] = rf_control; + + /* connect commit node to terminator */ + RF_ASSERT(commitNode->numSuccedents == 1); + RF_ASSERT(termNode->numAntecedents == 1); + RF_ASSERT(termNode->numSuccedents == 0); + commitNode->succedents[0] = termNode; + termNode->antecedents[0] = commitNode; + termNode->antType[0] = rf_control; +} + + + +/****************************************************************************** + * + * creates a DAG to perform a degraded-mode read of data within one stripe. + * This DAG is as follows: + * + * Hdr -> Block -> Rud -> Xor -> Cmt -> T + * -> Rrd -> + * -> Rp --> + * + * Each R node is a successor of the L node + * One successor arc from each R node goes to C, and the other to X + * There is one Rud for each chunk of surviving user data requested by the + * user, and one Rrd for each chunk of surviving user data _not_ being read by + * the user + * R = read, ud = user data, rd = recovery (surviving) data, p = parity + * X = XOR, C = Commit, T = terminate + * + * The block node guarantees a single source node. + * + * Note: The target buffer for the XOR node is set to the actual user buffer + * where the failed data is supposed to end up. This buffer is zero'd by the + * code here. Thus, if you create a degraded read dag, use it, and then + * re-use, you have to be sure to zero the target buffer prior to the re-use. 
+ * + * The recfunc argument at the end specifies the name and function used for + * the redundancy + * recovery function. + * + *****************************************************************************/ + +void rf_CreateDegradedReadDAG( + RF_Raid_t *raidPtr, + RF_AccessStripeMap_t *asmap, + RF_DagHeader_t *dag_h, + void *bp, + RF_RaidAccessFlags_t flags, + RF_AllocListElem_t *allocList, + RF_RedFuncs_t *recFunc) +{ + RF_DagNode_t *nodes, *rudNodes, *rrdNodes, *xorNode, *blockNode; + RF_DagNode_t *commitNode, *rpNode, *termNode; + int nNodes, nRrdNodes, nRudNodes, nXorBufs, i; + int j, paramNum; + RF_SectorCount_t sectorsPerSU; + RF_ReconUnitNum_t which_ru; + char *overlappingPDAs; /* a temporary array of flags */ + RF_AccessStripeMapHeader_t *new_asm_h[2]; + RF_PhysDiskAddr_t *pda, *parityPDA; + RF_StripeNum_t parityStripeID; + RF_PhysDiskAddr_t *failedPDA; + RF_RaidLayout_t *layoutPtr; + char *rpBuf; + + layoutPtr = &(raidPtr->Layout); + /* failedPDA points to the pda within the asm that targets the failed disk */ + failedPDA = asmap->failedPDAs[0]; + parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr, + asmap->raidAddress, &which_ru); + sectorsPerSU = layoutPtr->sectorsPerStripeUnit; + + if (rf_dagDebug) { + printf("[Creating degraded read DAG]\n"); + } + + RF_ASSERT( asmap->numDataFailed == 1 ); + dag_h->creator = "DegradedReadDAG"; + + /* + * generate two ASMs identifying the surviving data we need + * in order to recover the lost data + */ + + /* overlappingPDAs array must be zero'd */ + RF_Calloc(overlappingPDAs, asmap->numStripeUnitsAccessed, sizeof(char), (char *)); + rf_GenerateFailedAccessASMs(raidPtr, asmap, failedPDA, dag_h, new_asm_h, &nXorBufs, + &rpBuf, overlappingPDAs, allocList); + + /* + * create all the nodes at once + * + * -1 because no access is generated for the failed pda + */ + nRudNodes = asmap->numStripeUnitsAccessed-1; + nRrdNodes = ((new_asm_h[0]) ? new_asm_h[0]->stripeMap->numStripeUnitsAccessed : 0) + + ((new_asm_h[1]) ? 
new_asm_h[1]->stripeMap->numStripeUnitsAccessed : 0); + nNodes = 5 + nRudNodes + nRrdNodes; /* lock, unlock, xor, Rp, Rud, Rrd */ + RF_CallocAndAdd(nodes, nNodes, sizeof(RF_DagNode_t), (RF_DagNode_t *), + allocList); + i = 0; + blockNode = &nodes[i]; i++; + commitNode = &nodes[i]; i++; + xorNode = &nodes[i]; i++; + rpNode = &nodes[i]; i++; + termNode = &nodes[i]; i++; + rudNodes = &nodes[i]; i += nRudNodes; + rrdNodes = &nodes[i]; i += nRrdNodes; + RF_ASSERT(i == nNodes); + + /* initialize nodes */ + dag_h->numCommitNodes = 1; + dag_h->numCommits = 0; + /* this dag can not commit until the commit node is reached + * errors prior to the commit point imply the dag has failed + */ + dag_h->numSuccedents = 1; + + rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, + NULL, nRudNodes+nRrdNodes+1, 0, 0, 0, dag_h, "Nil", allocList); + rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc, + NULL, 1, 1, 0, 0, dag_h, "Cmt", allocList); + rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, + NULL, 0, 1, 0, 0, dag_h, "Trm", allocList); + rf_InitNode(xorNode, rf_wait, RF_FALSE, recFunc->simple, rf_NullNodeUndoFunc, + NULL, 1, nRudNodes+nRrdNodes+1, 2*nXorBufs+2, 1, dag_h, + recFunc->SimpleName, allocList); + + /* fill in the Rud nodes */ + for (pda=asmap->physInfo, i=0; i<nRudNodes; i++, pda=pda->next) { + if (pda == failedPDA) {i--; continue;} + rf_InitNode(&rudNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc, + rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, + "Rud", allocList); + RF_ASSERT(pda); + rudNodes[i].params[0].p = pda; + rudNodes[i].params[1].p = pda->bufPtr; + rudNodes[i].params[2].v = parityStripeID; + rudNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); + } + + /* fill in the Rrd nodes */ + i = 0; + if (new_asm_h[0]) { + for (pda=new_asm_h[0]->stripeMap->physInfo; + i<new_asm_h[0]->stripeMap->numStripeUnitsAccessed; + i++, pda=pda->next) + { + rf_InitNode(&rrdNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc, + rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, + dag_h, "Rrd", allocList); + RF_ASSERT(pda); + rrdNodes[i].params[0].p = pda; + rrdNodes[i].params[1].p = pda->bufPtr; + rrdNodes[i].params[2].v = parityStripeID; + rrdNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); + } + } + if (new_asm_h[1]) { + for (j=0,pda=new_asm_h[1]->stripeMap->physInfo; + j<new_asm_h[1]->stripeMap->numStripeUnitsAccessed; + j++, pda=pda->next) + { + rf_InitNode(&rrdNodes[i+j], rf_wait, RF_FALSE, rf_DiskReadFunc, + rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, + dag_h, "Rrd", allocList); + RF_ASSERT(pda); + rrdNodes[i+j].params[0].p = pda; + rrdNodes[i+j].params[1].p = pda->bufPtr; + rrdNodes[i+j].params[2].v = parityStripeID; + rrdNodes[i+j].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); + } + } + + /* make a PDA for the parity unit */ + RF_MallocAndAdd(parityPDA, sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList); + parityPDA->row = asmap->parityInfo->row; + parityPDA->col = asmap->parityInfo->col; + parityPDA->startSector = ((asmap->parityInfo->startSector / sectorsPerSU) + * sectorsPerSU) + (failedPDA->startSector % sectorsPerSU); + parityPDA->numSector = failedPDA->numSector; + + /* initialize the Rp node */ + rf_InitNode(rpNode, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, + rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rp ", allocList); + rpNode->params[0].p = parityPDA; + rpNode->params[1].p = 
rpBuf; + rpNode->params[2].v = parityStripeID; + rpNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); + + /* + * the last and nastiest step is to assign all + * the parameters of the Xor node + */ + paramNum=0; + for (i=0; i<nRrdNodes; i++) { + /* all the Rrd nodes need to be xored together */ + xorNode->params[paramNum++] = rrdNodes[i].params[0]; + xorNode->params[paramNum++] = rrdNodes[i].params[1]; + } + for (i=0; i<nRudNodes; i++) { + /* any Rud nodes that overlap the failed access need to be xored in */ + if (overlappingPDAs[i]) { + RF_MallocAndAdd(pda, sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList); + bcopy((char *)rudNodes[i].params[0].p, (char *)pda, sizeof(RF_PhysDiskAddr_t)); + rf_RangeRestrictPDA(raidPtr, failedPDA, pda, RF_RESTRICT_DOBUFFER, 0); + xorNode->params[paramNum++].p = pda; + xorNode->params[paramNum++].p = pda->bufPtr; + } + } + RF_Free(overlappingPDAs, asmap->numStripeUnitsAccessed * sizeof(char)); + + /* install parity pda as last set of params to be xor'd */ + xorNode->params[paramNum++].p = parityPDA; + xorNode->params[paramNum++].p = rpBuf; + + /* + * the last 2 params to the recovery xor node are + * the failed PDA and the raidPtr + */ + xorNode->params[paramNum++].p = failedPDA; + xorNode->params[paramNum++].p = raidPtr; + RF_ASSERT( paramNum == 2*nXorBufs+2 ); + + /* + * The xor node uses results[0] as the target buffer. + * Set pointer and zero the buffer. In the kernel, this + * may be a user buffer in which case we have to remap it. + */ + xorNode->results[0] = failedPDA->bufPtr; + RF_BZERO(bp, failedPDA->bufPtr, rf_RaidAddressToByte(raidPtr, + failedPDA->numSector)); + + /* connect nodes to form graph */ + /* connect the header to the block node */ + RF_ASSERT(dag_h->numSuccedents == 1); + RF_ASSERT(blockNode->numAntecedents == 0); + dag_h->succedents[0] = blockNode; + + /* connect the block node to the read nodes */ + RF_ASSERT(blockNode->numSuccedents == (1 + nRrdNodes + nRudNodes)); + RF_ASSERT(rpNode->numAntecedents == 1); + blockNode->succedents[0] = rpNode; + rpNode->antecedents[0] = blockNode; + rpNode->antType[0] = rf_control; + for (i = 0; i < nRrdNodes; i++) { + RF_ASSERT(rrdNodes[i].numSuccedents == 1); + blockNode->succedents[1 + i] = &rrdNodes[i]; + rrdNodes[i].antecedents[0] = blockNode; + rrdNodes[i].antType[0] = rf_control; + } + for (i = 0; i < nRudNodes; i++) { + RF_ASSERT(rudNodes[i].numSuccedents == 1); + blockNode->succedents[1 + nRrdNodes + i] = &rudNodes[i]; + rudNodes[i].antecedents[0] = blockNode; + rudNodes[i].antType[0] = rf_control; + } + + /* connect the read nodes to the xor node */ + RF_ASSERT(xorNode->numAntecedents == (1 + nRrdNodes + nRudNodes)); + RF_ASSERT(rpNode->numSuccedents == 1); + rpNode->succedents[0] = xorNode; + xorNode->antecedents[0] = rpNode; + xorNode->antType[0] = rf_trueData; + for (i = 0; i < nRrdNodes; i++) { + RF_ASSERT(rrdNodes[i].numSuccedents == 1); + rrdNodes[i].succedents[0] = xorNode; + xorNode->antecedents[1 + i] = &rrdNodes[i]; + xorNode->antType[1 + i] = rf_trueData; + } + for (i = 0; i < nRudNodes; i++) { + RF_ASSERT(rudNodes[i].numSuccedents == 1); + rudNodes[i].succedents[0] = xorNode; + xorNode->antecedents[1 + nRrdNodes + i] = &rudNodes[i]; + xorNode->antType[1 + nRrdNodes + i] = rf_trueData; + } + + /* connect the xor node to the commit node */ + RF_ASSERT(xorNode->numSuccedents == 1); + RF_ASSERT(commitNode->numAntecedents == 1); + xorNode->succedents[0] = commitNode; + commitNode->antecedents[0] = xorNode; + commitNode->antType[0] = rf_control; + 
+ /* connect the termNode to the commit node */ + RF_ASSERT(commitNode->numSuccedents == 1); + RF_ASSERT(termNode->numAntecedents == 1); + RF_ASSERT(termNode->numSuccedents == 0); + commitNode->succedents[0] = termNode; + termNode->antType[0] = rf_control; + termNode->antecedents[0] = commitNode; +} + + +/****************************************************************************** + * Create a degraded read DAG for Chained Declustering + * + * Hdr -> Nil -> R(p/s)d -> Cmt -> Trm + * + * The "Rd" node reads data from the surviving disk in the mirror pair + * Rpd - read of primary copy + * Rsd - read of secondary copy + * + * Parameters: raidPtr - description of the physical array + * asmap - logical & physical addresses for this access + * bp - buffer ptr (for holding write data) + * flags - general flags (e.g. disk locking) + * allocList - list of memory allocated in DAG creation + *****************************************************************************/ + +void rf_CreateRaidCDegradedReadDAG( + RF_Raid_t *raidPtr, + RF_AccessStripeMap_t *asmap, + RF_DagHeader_t *dag_h, + void *bp, + RF_RaidAccessFlags_t flags, + RF_AllocListElem_t *allocList) +{ + RF_DagNode_t *nodes, *rdNode, *blockNode, *commitNode, *termNode; + RF_StripeNum_t parityStripeID; + int useMirror, i, shiftable; + RF_ReconUnitNum_t which_ru; + RF_PhysDiskAddr_t *pda; + + if ((asmap->numDataFailed + asmap->numParityFailed) == 0) { + shiftable = RF_TRUE; + } + else { + shiftable = RF_FALSE; + } + useMirror = 0; + parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout), + asmap->raidAddress, &which_ru); + + if (rf_dagDebug) { + printf("[Creating RAID C degraded read DAG]\n"); + } + dag_h->creator = "RaidCDegradedReadDAG"; + /* alloc the Wnd nodes and the Wmir node */ + if (asmap->numDataFailed == 0) + useMirror = RF_FALSE; + else + useMirror = RF_TRUE; + + /* total number of nodes = 1 + (block + commit + terminator) */ + RF_CallocAndAdd(nodes, 4, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList); + i = 0; + rdNode = &nodes[i]; i++; + blockNode = &nodes[i]; i++; + commitNode = &nodes[i]; i++; + termNode = &nodes[i]; i++; + + /* + * This dag can not commit until the commit node is reached. + * Errors prior to the commit point imply the dag has failed + * and must be retried. 
+ */ + dag_h->numCommitNodes = 1; + dag_h->numCommits = 0; + dag_h->numSuccedents = 1; + + /* initialize the block, commit, and terminator nodes */ + rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, + NULL, 1, 0, 0, 0, dag_h, "Nil", allocList); + rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc, + NULL, 1, 1, 0, 0, dag_h, "Cmt", allocList); + rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, + NULL, 0, 1, 0, 0, dag_h, "Trm", allocList); + + pda = asmap->physInfo; + RF_ASSERT(pda != NULL); + /* parityInfo must describe entire parity unit */ + RF_ASSERT(asmap->parityInfo->next == NULL); + + /* initialize the data node */ + if (!useMirror) { + rf_InitNode(rdNode, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, + rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rpd", allocList); + if (shiftable && rf_compute_workload_shift(raidPtr, pda)) { + /* shift this read to the next disk in line */ + rdNode->params[0].p = asmap->parityInfo; + rdNode->params[1].p = pda->bufPtr; + rdNode->params[2].v = parityStripeID; + rdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); + } + else { + /* read primary copy */ + rdNode->params[0].p = pda; + rdNode->params[1].p = pda->bufPtr; + rdNode->params[2].v = parityStripeID; + rdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); + } + } + else { + /* read secondary copy of data */ + rf_InitNode(rdNode, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, + rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rsd", allocList); + rdNode->params[0].p = asmap->parityInfo; + rdNode->params[1].p = pda->bufPtr; + rdNode->params[2].v = parityStripeID; + rdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); + } + + /* connect header to block node */ + RF_ASSERT(dag_h->numSuccedents == 1); + RF_ASSERT(blockNode->numAntecedents == 0); + dag_h->succedents[0] = blockNode; + + /* connect block node to rdnode */ + RF_ASSERT(blockNode->numSuccedents == 1); + RF_ASSERT(rdNode->numAntecedents == 1); + blockNode->succedents[0] = rdNode; + rdNode->antecedents[0] = blockNode; + rdNode->antType[0] = rf_control; + + /* connect rdnode to commit node */ + RF_ASSERT(rdNode->numSuccedents == 1); + RF_ASSERT(commitNode->numAntecedents == 1); + rdNode->succedents[0] = commitNode; + commitNode->antecedents[0] = rdNode; + commitNode->antType[0] = rf_control; + + /* connect commit node to terminator */ + RF_ASSERT(commitNode->numSuccedents == 1); + RF_ASSERT(termNode->numAntecedents == 1); + RF_ASSERT(termNode->numSuccedents == 0); + commitNode->succedents[0] = termNode; + termNode->antecedents[0] = commitNode; + termNode->antType[0] = rf_control; +} + +/* + * XXX move this elsewhere? 
+ */ +void rf_DD_GenerateFailedAccessASMs( + RF_Raid_t *raidPtr, + RF_AccessStripeMap_t *asmap, + RF_PhysDiskAddr_t **pdap, + int *nNodep, + RF_PhysDiskAddr_t **pqpdap, + int *nPQNodep, + RF_AllocListElem_t *allocList) +{ + RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); + int PDAPerDisk,i; + RF_SectorCount_t secPerSU = layoutPtr->sectorsPerStripeUnit; + int numDataCol = layoutPtr->numDataCol; + int state; + RF_SectorNum_t suoff, suend; + unsigned firstDataCol, napdas, count; + RF_SectorNum_t fone_start, fone_end, ftwo_start = 0, ftwo_end = 0; + RF_PhysDiskAddr_t *fone = asmap->failedPDAs[0], *ftwo = asmap->failedPDAs[1]; + RF_PhysDiskAddr_t *pda_p; + RF_PhysDiskAddr_t *phys_p; + RF_RaidAddr_t sosAddr; + + /* determine how many pda's we will have to generate per unaccess stripe. + If there is only one failed data unit, it is one; if two, possibly two, + depending wether they overlap. */ + + fone_start = rf_StripeUnitOffset(layoutPtr,fone->startSector); + fone_end = fone_start + fone->numSector; + +#define CONS_PDA(if,start,num) \ + pda_p->row = asmap->if->row; pda_p->col = asmap->if->col; \ + pda_p->startSector = ((asmap->if->startSector / secPerSU) * secPerSU) + start; \ + pda_p->numSector = num; \ + pda_p->next = NULL; \ + RF_MallocAndAdd(pda_p->bufPtr,rf_RaidAddressToByte(raidPtr,num),(char *), allocList) + + if (asmap->numDataFailed==1) + { + PDAPerDisk = 1; + state = 1; + RF_MallocAndAdd(*pqpdap,2*sizeof(RF_PhysDiskAddr_t),(RF_PhysDiskAddr_t *), allocList); + pda_p = *pqpdap; + /* build p */ + CONS_PDA(parityInfo,fone_start,fone->numSector); + pda_p->type = RF_PDA_TYPE_PARITY; + pda_p++; + /* build q */ + CONS_PDA(qInfo,fone_start,fone->numSector); + pda_p->type = RF_PDA_TYPE_Q; + } + else + { + ftwo_start = rf_StripeUnitOffset(layoutPtr,ftwo->startSector); + ftwo_end = ftwo_start + ftwo->numSector; + if (fone->numSector + ftwo->numSector > secPerSU) + { + PDAPerDisk = 1; + state = 2; + RF_MallocAndAdd(*pqpdap,2*sizeof(RF_PhysDiskAddr_t),(RF_PhysDiskAddr_t *), allocList); + pda_p = *pqpdap; + CONS_PDA(parityInfo,0,secPerSU); + pda_p->type = RF_PDA_TYPE_PARITY; + pda_p++; + CONS_PDA(qInfo,0,secPerSU); + pda_p->type = RF_PDA_TYPE_Q; + } + else + { + PDAPerDisk = 2; + state = 3; + /* four of them, fone, then ftwo */ + RF_MallocAndAdd(*pqpdap,4*sizeof(RF_PhysDiskAddr_t),(RF_PhysDiskAddr_t *), allocList); + pda_p = *pqpdap; + CONS_PDA(parityInfo,fone_start,fone->numSector); + pda_p->type = RF_PDA_TYPE_PARITY; + pda_p++; + CONS_PDA(qInfo,fone_start,fone->numSector); + pda_p->type = RF_PDA_TYPE_Q; + pda_p++; + CONS_PDA(parityInfo,ftwo_start,ftwo->numSector); + pda_p->type = RF_PDA_TYPE_PARITY; + pda_p++; + CONS_PDA(qInfo,ftwo_start,ftwo->numSector); + pda_p->type = RF_PDA_TYPE_Q; + } + } + /* figure out number of nonaccessed pda */ + napdas = PDAPerDisk * (numDataCol - asmap->numStripeUnitsAccessed - (ftwo==NULL ? 1 : 0)); + *nPQNodep = PDAPerDisk; + + /* sweep over the over accessed pda's, figuring out the number of + additional pda's to generate. 
Of course, skip the failed ones */ + + count = 0; + for ( pda_p=asmap->physInfo; pda_p; pda_p= pda_p->next) + { + if ((pda_p == fone) || (pda_p == ftwo)) + continue; + suoff = rf_StripeUnitOffset(layoutPtr,pda_p->startSector); + suend = suoff + pda_p->numSector; + switch (state) + { + case 1: /* one failed PDA to overlap */ + /* if a PDA doesn't contain the failed unit, it can + only miss the start or end, not both */ + if ((suoff > fone_start) || (suend <fone_end)) + count++; + break; + case 2: /* whole stripe */ + if (suoff) /* leak at begining */ + count++; + if (suend < numDataCol) /* leak at end */ + count++; + break; + case 3: /* two disjoint units */ + if ((suoff > fone_start) || (suend <fone_end)) + count++; + if ((suoff > ftwo_start) || (suend <ftwo_end)) + count++; + break; + default: + RF_PANIC(); + } + } + + napdas += count; + *nNodep = napdas; + if (napdas == 0) return; /* short circuit */ + + /* allocate up our list of pda's */ + + RF_CallocAndAdd(pda_p, napdas, sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList); + *pdap = pda_p; + + /* linkem together */ + for (i=0; i < (napdas-1); i++) + pda_p[i].next = pda_p+(i+1); + + /* march through the one's up to the first accessed disk */ + firstDataCol = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),asmap->physInfo->raidAddress) % numDataCol; + sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress); + for (i=0; i < firstDataCol; i++) + { + if ((pda_p - (*pdap)) == napdas) + continue; + pda_p->type = RF_PDA_TYPE_DATA; + pda_p->raidAddress = sosAddr + (i * secPerSU); + (raidPtr->Layout.map->MapSector)(raidPtr,pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0); + /* skip over dead disks */ + if (RF_DEAD_DISK(raidPtr->Disks[pda_p->row][pda_p->col].status)) + continue; + switch (state) + { + case 1: /* fone */ + pda_p->numSector = fone->numSector; + pda_p->raidAddress += fone_start; + pda_p->startSector += fone_start; + RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList); + break; + case 2: /* full stripe */ + pda_p->numSector = secPerSU; + RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,secPerSU), (char *), allocList); + break; + case 3: /* two slabs */ + pda_p->numSector = fone->numSector; + pda_p->raidAddress += fone_start; + pda_p->startSector += fone_start; + RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList); + pda_p++; + pda_p->type = RF_PDA_TYPE_DATA; + pda_p->raidAddress = sosAddr + (i * secPerSU); + (raidPtr->Layout.map->MapSector)(raidPtr,pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0); + pda_p->numSector = ftwo->numSector; + pda_p->raidAddress += ftwo_start; + pda_p->startSector += ftwo_start; + RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList); + break; + default: + RF_PANIC(); + } + pda_p++; + } + + /* march through the touched stripe units */ + for (phys_p = asmap->physInfo; phys_p; phys_p = phys_p->next, i++) + { + if ((phys_p == asmap->failedPDAs[0]) || (phys_p == asmap->failedPDAs[1])) + continue; + suoff = rf_StripeUnitOffset(layoutPtr,phys_p->startSector); + suend = suoff + phys_p->numSector; + switch(state) + { + case 1: /* single buffer */ + if (suoff > fone_start) + { + RF_ASSERT( suend >= fone_end ); + /* The data read starts after the mapped access, + snip off the begining */ + pda_p->numSector = suoff - fone_start; + pda_p->raidAddress = sosAddr + (i*secPerSU) + 
fone_start; + (raidPtr->Layout.map->MapSector)(raidPtr,pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0); + RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList); + pda_p++; + } + if (suend < fone_end) + { + RF_ASSERT ( suoff <= fone_start); + /* The data read stops before the end of the failed access, extend */ + pda_p->numSector = fone_end - suend; + pda_p->raidAddress = sosAddr + (i*secPerSU) + suend; /* off by one? */ + (raidPtr->Layout.map->MapSector)(raidPtr,pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0); + RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList); + pda_p++; + } + break; + case 2: /* whole stripe unit */ + RF_ASSERT( (suoff == 0) || (suend == secPerSU)); + if (suend < secPerSU) + { /* short read, snip from end on */ + pda_p->numSector = secPerSU - suend; + pda_p->raidAddress = sosAddr + (i*secPerSU) + suend; /* off by one? */ + (raidPtr->Layout.map->MapSector)(raidPtr,pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0); + RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList); + pda_p++; + } + else + if (suoff > 0) + { /* short at front */ + pda_p->numSector = suoff; + pda_p->raidAddress = sosAddr + (i*secPerSU); + (raidPtr->Layout.map->MapSector)(raidPtr,pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0); + RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList); + pda_p++; + } + break; + case 3: /* two nonoverlapping failures */ + if ((suoff > fone_start) || (suend <fone_end)) + { + if (suoff > fone_start) + { + RF_ASSERT( suend >= fone_end ); + /* The data read starts after the mapped access, + snip off the begining */ + pda_p->numSector = suoff - fone_start; + pda_p->raidAddress = sosAddr + (i*secPerSU) + fone_start; + (raidPtr->Layout.map->MapSector)(raidPtr,pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0); + RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList); + pda_p++; + } + if (suend < fone_end) + { + RF_ASSERT ( suoff <= fone_start); + /* The data read stops before the end of the failed access, extend */ + pda_p->numSector = fone_end - suend; + pda_p->raidAddress = sosAddr + (i*secPerSU) + suend; /* off by one? */ + (raidPtr->Layout.map->MapSector)(raidPtr,pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0); + RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList); + pda_p++; + } + } + if ((suoff > ftwo_start) || (suend <ftwo_end)) + { + if (suoff > ftwo_start) + { + RF_ASSERT( suend >= ftwo_end ); + /* The data read starts after the mapped access, + snip off the begining */ + pda_p->numSector = suoff - ftwo_start; + pda_p->raidAddress = sosAddr + (i*secPerSU) + ftwo_start; + (raidPtr->Layout.map->MapSector)(raidPtr,pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0); + RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList); + pda_p++; + } + if (suend < ftwo_end) + { + RF_ASSERT ( suoff <= ftwo_start); + /* The data read stops before the end of the failed access, extend */ + pda_p->numSector = ftwo_end - suend; + pda_p->raidAddress = sosAddr + (i*secPerSU) + suend; /* off by one? 
*/ + (raidPtr->Layout.map->MapSector)(raidPtr,pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0); + RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList); + pda_p++; + } + } + break; + default: + RF_PANIC(); + } + } + + /* after the last accessed disk */ + for (; i < numDataCol; i++ ) + { + if ((pda_p - (*pdap)) == napdas) + continue; + pda_p->type = RF_PDA_TYPE_DATA; + pda_p->raidAddress = sosAddr + (i * secPerSU); + (raidPtr->Layout.map->MapSector)(raidPtr,pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0); + /* skip over dead disks */ + if (RF_DEAD_DISK(raidPtr->Disks[pda_p->row][pda_p->col].status)) + continue; + switch (state) + { + case 1: /* fone */ + pda_p->numSector = fone->numSector; + pda_p->raidAddress += fone_start; + pda_p->startSector += fone_start; + RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList); + break; + case 2: /* full stripe */ + pda_p->numSector = secPerSU; + RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,secPerSU), (char *), allocList); + break; + case 3: /* two slabs */ + pda_p->numSector = fone->numSector; + pda_p->raidAddress += fone_start; + pda_p->startSector += fone_start; + RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList); + pda_p++; + pda_p->type = RF_PDA_TYPE_DATA; + pda_p->raidAddress = sosAddr + (i * secPerSU); + (raidPtr->Layout.map->MapSector)(raidPtr,pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0); + pda_p->numSector = ftwo->numSector; + pda_p->raidAddress += ftwo_start; + pda_p->startSector += ftwo_start; + RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList); + break; + default: + RF_PANIC(); + } + pda_p++; + } + + RF_ASSERT (pda_p - *pdap == napdas); + return; +} + +#define INIT_DISK_NODE(node,name) \ +rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 2,1,4,0, dag_h, name, allocList); \ +(node)->succedents[0] = unblockNode; \ +(node)->succedents[1] = recoveryNode; \ +(node)->antecedents[0] = blockNode; \ +(node)->antType[0] = rf_control + +#define DISK_NODE_PARAMS(_node_,_p_) \ + (_node_).params[0].p = _p_ ; \ + (_node_).params[1].p = (_p_)->bufPtr; \ + (_node_).params[2].v = parityStripeID; \ + (_node_).params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru) + +void rf_DoubleDegRead( + RF_Raid_t *raidPtr, + RF_AccessStripeMap_t *asmap, + RF_DagHeader_t *dag_h, + void *bp, + RF_RaidAccessFlags_t flags, + RF_AllocListElem_t *allocList, + char *redundantReadNodeName, + char *recoveryNodeName, + int (*recovFunc)(RF_DagNode_t *)) +{ + RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); + RF_DagNode_t *nodes, *rudNodes, *rrdNodes, *recoveryNode, *blockNode, *unblockNode, *rpNodes, *rqNodes, *termNode; + RF_PhysDiskAddr_t *pda, *pqPDAs; + RF_PhysDiskAddr_t *npdas; + int nNodes, nRrdNodes, nRudNodes, i; + RF_ReconUnitNum_t which_ru; + int nReadNodes, nPQNodes; + RF_PhysDiskAddr_t *failedPDA = asmap->failedPDAs[0]; + RF_PhysDiskAddr_t *failedPDAtwo = asmap->failedPDAs[1]; + RF_StripeNum_t parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr, asmap->raidAddress, &which_ru); + + if (rf_dagDebug) printf("[Creating Double Degraded Read DAG]\n"); + rf_DD_GenerateFailedAccessASMs(raidPtr, asmap, &npdas, &nRrdNodes, &pqPDAs, &nPQNodes,allocList); + + nRudNodes = asmap->numStripeUnitsAccessed - (asmap->numDataFailed); + 
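+	/*
+	 * Read fan-out bookkeeping (descriptive note): the Rud nodes cover the
+	 * surviving user data actually accessed, the Rrd nodes cover the recovery
+	 * data generated above for the unaccessed and partially overlapping stripe
+	 * units, and 2*nPQNodes covers the P and Q reads (one Rp/Rq pair, or two
+	 * pairs when the two failed units require separate redundancy PDAs).
+	 */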
nReadNodes = nRrdNodes + nRudNodes + 2*nPQNodes; + nNodes = 4 /* block, unblock, recovery, term */ + nReadNodes; + + RF_CallocAndAdd(nodes, nNodes, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList); + i = 0; + blockNode = &nodes[i]; i += 1; + unblockNode = &nodes[i]; i += 1; + recoveryNode = &nodes[i]; i += 1; + termNode = &nodes[i]; i += 1; + rudNodes = &nodes[i]; i += nRudNodes; + rrdNodes = &nodes[i]; i += nRrdNodes; + rpNodes = &nodes[i]; i += nPQNodes; + rqNodes = &nodes[i]; i += nPQNodes; + RF_ASSERT(i == nNodes); + + dag_h->numSuccedents = 1; + dag_h->succedents[0] = blockNode; + dag_h->creator = "DoubleDegRead"; + dag_h->numCommits = 0; + dag_h->numCommitNodes = 1; /*unblock */ + + rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, 2, 0, 0, dag_h, "Trm", allocList); + termNode->antecedents[0] = unblockNode; + termNode->antType[0] = rf_control; + termNode->antecedents[1] = recoveryNode; + termNode->antType[1] = rf_control; + + /* init the block and unblock nodes */ + /* The block node has all nodes except itself, unblock and recovery as successors. Similarly for + predecessors of the unblock. */ + rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nReadNodes, 0, 0, 0, dag_h, "Nil", allocList); + rf_InitNode(unblockNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, 1, nReadNodes, 0, 0, dag_h, "Nil", allocList); + + for (i=0; i < nReadNodes; i++) + { + blockNode->succedents[i] = rudNodes+i; + unblockNode->antecedents[i] = rudNodes+i; + unblockNode->antType[i] = rf_control; + } + unblockNode->succedents[0] = termNode; + + /* The recovery node has all the reads as predecessors, and the term node as successors. It gets a pda as a param + from each of the read nodes plus the raidPtr. + For each failed unit is has a result pda. 
*/ + rf_InitNode(recoveryNode, rf_wait, RF_FALSE, recovFunc, rf_NullNodeUndoFunc, NULL, + 1, /* succesors */ + nReadNodes, /* preds */ + nReadNodes+2, /* params */ + asmap->numDataFailed, /* results */ + dag_h, recoveryNodeName, allocList); + + recoveryNode->succedents[0] = termNode; + for (i=0; i < nReadNodes; i++) { + recoveryNode->antecedents[i] = rudNodes+i; + recoveryNode->antType[i] = rf_trueData; + } + + /* build the read nodes, then come back and fill in recovery params and results */ + pda = asmap->physInfo; + for (i=0; i < nRudNodes; pda = pda->next) + { + if ((pda == failedPDA) || (pda == failedPDAtwo)) + continue; + INIT_DISK_NODE(rudNodes+i,"Rud"); + RF_ASSERT(pda); + DISK_NODE_PARAMS(rudNodes[i],pda); + i++; + } + + pda = npdas; + for (i=0; i < nRrdNodes; i++, pda = pda->next) + { + INIT_DISK_NODE(rrdNodes+i,"Rrd"); + RF_ASSERT(pda); + DISK_NODE_PARAMS(rrdNodes[i],pda); + } + + /* redundancy pdas */ + pda = pqPDAs; + INIT_DISK_NODE(rpNodes,"Rp"); + RF_ASSERT(pda); + DISK_NODE_PARAMS(rpNodes[0],pda); + pda++; + INIT_DISK_NODE(rqNodes,redundantReadNodeName ); + RF_ASSERT(pda); + DISK_NODE_PARAMS(rqNodes[0],pda); + if (nPQNodes==2) + { + pda++; + INIT_DISK_NODE(rpNodes+1,"Rp"); + RF_ASSERT(pda); + DISK_NODE_PARAMS(rpNodes[1],pda); + pda++; + INIT_DISK_NODE( rqNodes+1,redundantReadNodeName ); + RF_ASSERT(pda); + DISK_NODE_PARAMS(rqNodes[1],pda); + } + + /* fill in recovery node params */ + for (i=0; i < nReadNodes; i++) + recoveryNode->params[i] = rudNodes[i].params[0]; /* pda */ + recoveryNode->params[i++].p = (void *) raidPtr; + recoveryNode->params[i++].p = (void *) asmap; + recoveryNode->results[0] = failedPDA; + if (asmap->numDataFailed ==2 ) + recoveryNode->results[1] = failedPDAtwo; + + /* zero fill the target data buffers? */ +} diff --git a/sys/dev/raidframe/rf_dagdegrd.h b/sys/dev/raidframe/rf_dagdegrd.h new file mode 100644 index 00000000000..3e0bce1c7ff --- /dev/null +++ b/sys/dev/raidframe/rf_dagdegrd.h @@ -0,0 +1,88 @@ +/* $OpenBSD: rf_dagdegrd.h,v 1.1 1999/01/11 14:29:07 niklas Exp $ */ +/* $NetBSD: rf_dagdegrd.h,v 1.1 1998/11/13 04:20:27 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland, Daniel Stodolsky, William V. Courtright II + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. 
+ */ + +/* + * : + * Log: rf_dagdegrd.h,v + * Revision 1.6 1996/07/31 16:29:06 jimz + * asm/asmap re-fix (EO merge) + * + * Revision 1.5 1996/07/31 15:34:40 jimz + * evenodd changes; bugfixes for double-degraded archs, generalize + * some formerly PQ-only functions + * + * Revision 1.4 1996/07/22 19:52:16 jimz + * switched node params to RF_DagParam_t, a union of + * a 64-bit int and a void *, for better portability + * attempted hpux port, but failed partway through for + * lack of a single C compiler capable of compiling all + * source files + * + * Revision 1.3 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.2 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.1 1996/05/03 19:22:06 wvcii + * Initial revision + * + */ + +#ifndef _RF__RF_DAGDEGRD_H_ +#define _RF__RF_DAGDEGRD_H_ + +#include "rf_types.h" + +/* degraded read DAG creation routines */ +void rf_CreateRaidFiveDegradedReadDAG(RF_Raid_t *raidPtr, + RF_AccessStripeMap_t *asmap, RF_DagHeader_t *dag_h, void *bp, + RF_RaidAccessFlags_t flags, RF_AllocListElem_t *allocList); +void rf_CreateRaidOneDegradedReadDAG(RF_Raid_t *raidPtr, + RF_AccessStripeMap_t *asmap, RF_DagHeader_t *dag_h, void *bp, + RF_RaidAccessFlags_t flags, RF_AllocListElem_t *allocList); +void rf_CreateDegradedReadDAG(RF_Raid_t *raidPtr, + RF_AccessStripeMap_t *asmap, RF_DagHeader_t *dag_h, void *bp, + RF_RaidAccessFlags_t flags, RF_AllocListElem_t *allocList, + RF_RedFuncs_t *recFunc); +void rf_CreateRaidCDegradedReadDAG(RF_Raid_t *raidPtr, + RF_AccessStripeMap_t *asmap, RF_DagHeader_t *dag_h, void *bp, + RF_RaidAccessFlags_t flags, RF_AllocListElem_t *allocList); +void rf_DD_GenerateFailedAccessASMs(RF_Raid_t *raidPtr, + RF_AccessStripeMap_t *asmap, RF_PhysDiskAddr_t **pdap, + int *nNodep, RF_PhysDiskAddr_t **pqpdap, int *nPQNodep, + RF_AllocListElem_t *allocList); +void rf_DoubleDegRead(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap, + RF_DagHeader_t *dag_h, void *bp, RF_RaidAccessFlags_t flags, + RF_AllocListElem_t *allocList, char *redundantReadNodeName, + char *recoveryNodeName, int (*recovFunc)(RF_DagNode_t *)); + +#endif /* !_RF__RF_DAGDEGRD_H_ */ diff --git a/sys/dev/raidframe/rf_dagdegwr.c b/sys/dev/raidframe/rf_dagdegwr.c new file mode 100644 index 00000000000..a712dd1e83b --- /dev/null +++ b/sys/dev/raidframe/rf_dagdegwr.c @@ -0,0 +1,969 @@ +/* $OpenBSD: rf_dagdegwr.c,v 1.1 1999/01/11 14:29:07 niklas Exp $ */ +/* $NetBSD: rf_dagdegwr.c,v 1.1 1998/11/13 04:20:27 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland, Daniel Stodolsky, William V. Courtright II + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 
+ * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* + * rf_dagdegwr.c + * + * code for creating degraded write DAGs + * + * : + * Log: rf_dagdegwr.c,v + * Revision 1.23 1996/11/05 21:10:40 jimz + * failed pda generalization + * + * Revision 1.22 1996/08/23 14:49:48 jimz + * remove bogus assert from small write double deg DAG generator + * + * Revision 1.21 1996/08/21 05:09:44 jimz + * get rid of bogus fakery in DoubleDegSmallWrite + * + * Revision 1.20 1996/08/21 04:14:35 jimz + * cleanup doubledegsmallwrite + * NOTE: we need doubledeglargewrite + * + * Revision 1.19 1996/08/19 21:39:38 jimz + * CommonCreateSimpleDegradedWriteDAG() was unable to correctly create DAGs for + * complete stripe overwrite accesses- it assumed the necessity to read old + * data. Rather than do the "right" thing, and risk breaking a critical DAG so + * close to release, I made a no-op read node to stick in and link up in this + * case. Seems to work. + * + * Revision 1.18 1996/07/31 15:35:34 jimz + * evenodd changes; bugfixes for double-degraded archs, generalize + * some formerly PQ-only functions + * + * Revision 1.17 1996/07/28 20:31:39 jimz + * i386netbsd port + * true/false fixup + * + * Revision 1.16 1996/07/27 23:36:08 jimz + * Solaris port of simulator + * + * Revision 1.15 1996/07/27 16:30:19 jimz + * cleanup sweep + * + * Revision 1.14 1996/07/22 19:52:16 jimz + * switched node params to RF_DagParam_t, a union of + * a 64-bit int and a void *, for better portability + * attempted hpux port, but failed partway through for + * lack of a single C compiler capable of compiling all + * source files + * + * Revision 1.13 1996/06/09 02:36:46 jimz + * lots of little crufty cleanup- fixup whitespace + * issues, comment #ifdefs, improve typing in some + * places (esp size-related) + * + * Revision 1.12 1996/06/07 22:26:27 jimz + * type-ify which_ru (RF_ReconUnitNum_t) + * + * Revision 1.11 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.10 1996/05/31 22:26:54 jimz + * fix a lot of mapping problems, memory allocation problems + * found some weird lock issues, fixed 'em + * more code cleanup + * + * Revision 1.9 1996/05/30 11:29:41 jimz + * Numerous bug fixes. Stripe lock release code disagreed with the taking code + * about when stripes should be locked (I made it consistent: no parity, no lock) + * There was a lot of extra serialization of I/Os which I've removed- a lot of + * it was to calculate values for the cache code, which is no longer with us. + * More types, function, macro cleanup. Added code to properly quiesce the array + * on shutdown. Made a lot of stuff array-specific which was (bogusly) general + * before. Fixed memory allocation, freeing bugs. 
+ * + * Revision 1.8 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.7 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.6 1996/05/24 04:28:55 jimz + * release cleanup ckpt + * + * Revision 1.5 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.4 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.3 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.2 1996/05/08 21:01:24 jimz + * fixed up enum type names that were conflicting with other + * enums and function names (ie, "panic") + * future naming trends will be towards RF_ and rf_ for + * everything raidframe-related + * + * Revision 1.1 1996/05/03 19:21:50 wvcii + * Initial revision + * + */ + +#include "rf_types.h" +#include "rf_raid.h" +#include "rf_dag.h" +#include "rf_dagutils.h" +#include "rf_dagfuncs.h" +#include "rf_threadid.h" +#include "rf_debugMem.h" +#include "rf_memchunk.h" +#include "rf_general.h" +#include "rf_dagdegwr.h" +#include "rf_sys.h" + + +/****************************************************************************** + * + * General comments on DAG creation: + * + * All DAGs in this file use roll-away error recovery. Each DAG has a single + * commit node, usually called "Cmt." If an error occurs before the Cmt node + * is reached, the execution engine will halt forward execution and work + * backward through the graph, executing the undo functions. Assuming that + * each node in the graph prior to the Cmt node are undoable and atomic - or - + * does not make changes to permanent state, the graph will fail atomically. + * If an error occurs after the Cmt node executes, the engine will roll-forward + * through the graph, blindly executing nodes until it reaches the end. + * If a graph reaches the end, it is assumed to have completed successfully. + * + * A graph has only 1 Cmt node. + * + */ + + +/****************************************************************************** + * + * The following wrappers map the standard DAG creation interface to the + * DAG creation routines. Additionally, these wrappers enable experimentation + * with new DAG structures by providing an extra level of indirection, allowing + * the DAG creation routines to be replaced at this single point. + */ + +static RF_CREATE_DAG_FUNC_DECL(rf_CreateSimpleDegradedWriteDAG) +{ + rf_CommonCreateSimpleDegradedWriteDAG(raidPtr, asmap, dag_h, bp, + flags, allocList,1, rf_RecoveryXorFunc, RF_TRUE); +} + +void rf_CreateDegradedWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList) + RF_Raid_t *raidPtr; + RF_AccessStripeMap_t *asmap; + RF_DagHeader_t *dag_h; + void *bp; + RF_RaidAccessFlags_t flags; + RF_AllocListElem_t *allocList; +{ + RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); + RF_PhysDiskAddr_t *failedPDA = asmap->failedPDAs[0]; + + RF_ASSERT( asmap->numDataFailed == 1 ); + dag_h->creator = "DegradedWriteDAG"; + + /* if the access writes only a portion of the failed unit, and also writes + * some portion of at least one surviving unit, we create two DAGs, one for + * the failed component and one for the non-failed component, and do them + * sequentially. 
Note that the fact that we're accessing only a portion of + * the failed unit indicates that the access either starts or ends in the + * failed unit, and hence we need create only two dags. This is inefficient + * in that the same data or parity can get read and written twice using this + * structure. I need to fix this to do the access all at once. + */ + RF_ASSERT(!(asmap->numStripeUnitsAccessed != 1 && failedPDA->numSector != layoutPtr->sectorsPerStripeUnit)); + rf_CreateSimpleDegradedWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList); +} + + + +/****************************************************************************** + * + * DAG creation code begins here + */ + + + +/****************************************************************************** + * + * CommonCreateSimpleDegradedWriteDAG -- creates a DAG to do a degraded-mode + * write, which is as follows + * + * / {Wnq} --\ + * hdr -> blockNode -> Rod -> Xor -> Cmt -> Wnp ----> unblock -> term + * \ {Rod} / \ Wnd ---/ + * \ {Wnd} -/ + * + * commit nodes: Xor, Wnd + * + * IMPORTANT: + * This DAG generator does not work for double-degraded archs since it does not + * generate Q + * + * This dag is essentially identical to the large-write dag, except that the + * write to the failed data unit is suppressed. + * + * IMPORTANT: this dag does not work in the case where the access writes only + * a portion of the failed unit, and also writes some portion of at least one + * surviving SU. this case is handled in CreateDegradedWriteDAG above. + * + * The block & unblock nodes are leftovers from a previous version. They + * do nothing, but I haven't deleted them because it would be a tremendous + * effort to put them back in. + * + * This dag is used whenever a one of the data units in a write has failed. + * If it is the parity unit that failed, the nonredundant write dag (below) + * is used. + *****************************************************************************/ + +void rf_CommonCreateSimpleDegradedWriteDAG(raidPtr, asmap, dag_h, bp, flags, + allocList, nfaults, redFunc, allowBufferRecycle) + RF_Raid_t *raidPtr; + RF_AccessStripeMap_t *asmap; + RF_DagHeader_t *dag_h; + void *bp; + RF_RaidAccessFlags_t flags; + RF_AllocListElem_t *allocList; + int nfaults; + int (*redFunc)(RF_DagNode_t *); + int allowBufferRecycle; +{ + int nNodes, nRrdNodes, nWndNodes, nXorBufs, i, j, paramNum, rdnodesFaked; + RF_DagNode_t *blockNode, *unblockNode, *wnpNode, *wnqNode, *termNode; + RF_DagNode_t *nodes, *wndNodes, *rrdNodes, *xorNode, *commitNode; + RF_SectorCount_t sectorsPerSU; + RF_ReconUnitNum_t which_ru; + char *xorTargetBuf = NULL; /* the target buffer for the XOR operation */ + char *overlappingPDAs; /* a temporary array of flags */ + RF_AccessStripeMapHeader_t *new_asm_h[2]; + RF_PhysDiskAddr_t *pda, *parityPDA; + RF_StripeNum_t parityStripeID; + RF_PhysDiskAddr_t *failedPDA; + RF_RaidLayout_t *layoutPtr; + + layoutPtr = &(raidPtr->Layout); + parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr, asmap->raidAddress, + &which_ru); + sectorsPerSU = layoutPtr->sectorsPerStripeUnit; + /* failedPDA points to the pda within the asm that targets the failed disk */ + failedPDA = asmap->failedPDAs[0]; + + if (rf_dagDebug) + printf("[Creating degraded-write DAG]\n"); + + RF_ASSERT( asmap->numDataFailed == 1 ); + dag_h->creator = "SimpleDegradedWriteDAG"; + + /* + * Generate two ASMs identifying the surviving data + * we need in order to recover the lost data. 
+ */ + /* overlappingPDAs array must be zero'd */ + RF_Calloc(overlappingPDAs, asmap->numStripeUnitsAccessed, sizeof(char), (char *)); + rf_GenerateFailedAccessASMs(raidPtr, asmap, failedPDA, dag_h, new_asm_h, + &nXorBufs, NULL, overlappingPDAs, allocList); + + /* create all the nodes at once */ + nWndNodes = asmap->numStripeUnitsAccessed - 1; /* no access is generated + * for the failed pda */ + + nRrdNodes = ((new_asm_h[0]) ? new_asm_h[0]->stripeMap->numStripeUnitsAccessed : 0) + + ((new_asm_h[1]) ? new_asm_h[1]->stripeMap->numStripeUnitsAccessed : 0); + /* + * XXX + * + * There's a bug with a complete stripe overwrite- that means 0 reads + * of old data, and the rest of the DAG generation code doesn't like + * that. A release is coming, and I don't wanna risk breaking a critical + * DAG generator, so here's what I'm gonna do- if there's no read nodes, + * I'm gonna fake there being a read node, and I'm gonna swap in a + * no-op node in its place (to make all the link-up code happy). + * This should be fixed at some point. --jimz + */ + if (nRrdNodes == 0) { + nRrdNodes = 1; + rdnodesFaked = 1; + } + else { + rdnodesFaked = 0; + } + /* lock, unlock, xor, Wnd, Rrd, W(nfaults) */ + nNodes = 5 + nfaults + nWndNodes + nRrdNodes; + RF_CallocAndAdd(nodes, nNodes, sizeof(RF_DagNode_t), + (RF_DagNode_t *), allocList); + i = 0; + blockNode = &nodes[i]; i += 1; + commitNode = &nodes[i]; i += 1; + unblockNode = &nodes[i]; i += 1; + termNode = &nodes[i]; i += 1; + xorNode = &nodes[i]; i += 1; + wnpNode = &nodes[i]; i += 1; + wndNodes = &nodes[i]; i += nWndNodes; + rrdNodes = &nodes[i]; i += nRrdNodes; + if (nfaults == 2) { + wnqNode = &nodes[i]; i += 1; + } + else { + wnqNode = NULL; + } + RF_ASSERT(i == nNodes); + + /* this dag can not commit until all rrd and xor Nodes have completed */ + dag_h->numCommitNodes = 1; + dag_h->numCommits = 0; + dag_h->numSuccedents = 1; + + RF_ASSERT( nRrdNodes > 0 ); + rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, + NULL, nRrdNodes, 0, 0, 0, dag_h, "Nil", allocList); + rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc, + NULL, nWndNodes + nfaults, 1, 0, 0, dag_h, "Cmt", allocList); + rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, + NULL, 1, nWndNodes + nfaults, 0, 0, dag_h, "Nil", allocList); + rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, + NULL, 0, 1, 0, 0, dag_h, "Trm", allocList); + rf_InitNode(xorNode, rf_wait, RF_FALSE, redFunc, rf_NullNodeUndoFunc, NULL, 1, + nRrdNodes, 2*nXorBufs+2, nfaults, dag_h, "Xrc", allocList); + + /* + * Fill in the Rrd nodes. If any of the rrd buffers are the same size as + * the failed buffer, save a pointer to it so we can use it as the target + * of the XOR. The pdas in the rrd nodes have been range-restricted, so if + * a buffer is the same size as the failed buffer, it must also be at the + * same alignment within the SU. 
+ */ + i = 0; + if (new_asm_h[0]) { + for (i=0, pda=new_asm_h[0]->stripeMap->physInfo; + i<new_asm_h[0]->stripeMap->numStripeUnitsAccessed; + i++, pda=pda->next) + { + rf_InitNode(&rrdNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, + rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rrd", allocList); + RF_ASSERT(pda); + rrdNodes[i].params[0].p = pda; + rrdNodes[i].params[1].p = pda->bufPtr; + rrdNodes[i].params[2].v = parityStripeID; + rrdNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); + } + } + /* i now equals the number of stripe units accessed in new_asm_h[0] */ + if (new_asm_h[1]) { + for (j=0,pda=new_asm_h[1]->stripeMap->physInfo; + j<new_asm_h[1]->stripeMap->numStripeUnitsAccessed; + j++, pda=pda->next) + { + rf_InitNode(&rrdNodes[i+j], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, + rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rrd", allocList); + RF_ASSERT(pda); + rrdNodes[i+j].params[0].p = pda; + rrdNodes[i+j].params[1].p = pda->bufPtr; + rrdNodes[i+j].params[2].v = parityStripeID; + rrdNodes[i+j].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); + if (allowBufferRecycle && (pda->numSector == failedPDA->numSector)) + xorTargetBuf = pda->bufPtr; + } + } + if (rdnodesFaked) { + /* + * This is where we'll init that fake noop read node + * (XXX should the wakeup func be different?) + */ + rf_InitNode(&rrdNodes[0], rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, + NULL, 1, 1, 0, 0, dag_h, "RrN", allocList); + } + + /* + * Make a PDA for the parity unit. The parity PDA should start at + * the same offset into the SU as the failed PDA. + */ + /* + * Danner comment: + * I don't think this copy is really necessary. + * We are in one of two cases here. + * (1) The entire failed unit is written. Then asmap->parityInfo will + * describe the entire parity. + * (2) We are only writing a subset of the failed unit and nothing + * else. Then the asmap->parityInfo describes the failed unit and + * the copy can also be avoided. 
+ */ + + RF_MallocAndAdd(parityPDA, sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList); + parityPDA->row = asmap->parityInfo->row; + parityPDA->col = asmap->parityInfo->col; + parityPDA->startSector = ((asmap->parityInfo->startSector / sectorsPerSU) + * sectorsPerSU) + (failedPDA->startSector % sectorsPerSU); + parityPDA->numSector = failedPDA->numSector; + + if (!xorTargetBuf) { + RF_CallocAndAdd(xorTargetBuf, 1, + rf_RaidAddressToByte(raidPtr, failedPDA->numSector), (char *), allocList); + } + + /* init the Wnp node */ + rf_InitNode(wnpNode, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, + rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnp", allocList); + wnpNode->params[0].p = parityPDA; + wnpNode->params[1].p = xorTargetBuf; + wnpNode->params[2].v = parityStripeID; + wnpNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); + + /* fill in the Wnq Node */ + if (nfaults == 2) { + { + RF_MallocAndAdd(parityPDA, sizeof(RF_PhysDiskAddr_t), + (RF_PhysDiskAddr_t *), allocList); + parityPDA->row = asmap->qInfo->row; + parityPDA->col = asmap->qInfo->col; + parityPDA->startSector = ((asmap->qInfo->startSector / sectorsPerSU) + * sectorsPerSU) + (failedPDA->startSector % sectorsPerSU); + parityPDA->numSector = failedPDA->numSector; + + rf_InitNode(wnqNode, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, + rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnq", allocList); + wnqNode->params[0].p = parityPDA; + RF_CallocAndAdd(xorNode->results[1], 1, + rf_RaidAddressToByte(raidPtr, failedPDA->numSector), (char *), allocList); + wnqNode->params[1].p = xorNode->results[1]; + wnqNode->params[2].v = parityStripeID; + wnqNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); + } + } + + /* fill in the Wnd nodes */ + for (pda=asmap->physInfo, i=0; i<nWndNodes; i++, pda=pda->next) { + if (pda == failedPDA) { + i--; + continue; + } + rf_InitNode(&wndNodes[i], rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, + rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnd", allocList); + RF_ASSERT(pda); + wndNodes[i].params[0].p = pda; + wndNodes[i].params[1].p = pda->bufPtr; + wndNodes[i].params[2].v = parityStripeID; + wndNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); + } + + /* fill in the results of the xor node */ + xorNode->results[0] = xorTargetBuf; + + /* fill in the params of the xor node */ + + paramNum=0; + if (rdnodesFaked == 0) { + for (i=0; i<nRrdNodes; i++) { + /* all the Rrd nodes need to be xored together */ + xorNode->params[paramNum++] = rrdNodes[i].params[0]; + xorNode->params[paramNum++] = rrdNodes[i].params[1]; + } + } + for (i=0; i < nWndNodes; i++) { + /* any Wnd nodes that overlap the failed access need to be xored in */ + if (overlappingPDAs[i]) { + RF_MallocAndAdd(pda, sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList); + bcopy((char *)wndNodes[i].params[0].p, (char *)pda, sizeof(RF_PhysDiskAddr_t)); + rf_RangeRestrictPDA(raidPtr, failedPDA, pda, RF_RESTRICT_DOBUFFER, 0); + xorNode->params[paramNum++].p = pda; + xorNode->params[paramNum++].p = pda->bufPtr; + } + } + RF_Free(overlappingPDAs, asmap->numStripeUnitsAccessed * sizeof(char)); + + /* + * Install the failed PDA into the xor param list so that the + * new data gets xor'd in. + */ + xorNode->params[paramNum++].p = failedPDA; + xorNode->params[paramNum++].p = failedPDA->bufPtr; + + /* + * The last 2 params to the recovery xor node are always the failed + * PDA and the raidPtr. 
install the failedPDA even though we have just + * done so above. This allows us to use the same XOR function for both + * degraded reads and degraded writes. + */ + xorNode->params[paramNum++].p = failedPDA; + xorNode->params[paramNum++].p = raidPtr; + RF_ASSERT( paramNum == 2*nXorBufs+2 ); + + /* + * Code to link nodes begins here + */ + + /* link header to block node */ + RF_ASSERT(blockNode->numAntecedents == 0); + dag_h->succedents[0] = blockNode; + + /* link block node to rd nodes */ + RF_ASSERT(blockNode->numSuccedents == nRrdNodes); + for (i = 0; i < nRrdNodes; i++) { + RF_ASSERT(rrdNodes[i].numAntecedents == 1); + blockNode->succedents[i] = &rrdNodes[i]; + rrdNodes[i].antecedents[0] = blockNode; + rrdNodes[i].antType[0] = rf_control; + } + + /* link read nodes to xor node*/ + RF_ASSERT(xorNode->numAntecedents == nRrdNodes); + for (i = 0; i < nRrdNodes; i++) { + RF_ASSERT(rrdNodes[i].numSuccedents == 1); + rrdNodes[i].succedents[0] = xorNode; + xorNode->antecedents[i] = &rrdNodes[i]; + xorNode->antType[i] = rf_trueData; + } + + /* link xor node to commit node */ + RF_ASSERT(xorNode->numSuccedents == 1); + RF_ASSERT(commitNode->numAntecedents == 1); + xorNode->succedents[0] = commitNode; + commitNode->antecedents[0] = xorNode; + commitNode->antType[0] = rf_control; + + /* link commit node to wnd nodes */ + RF_ASSERT(commitNode->numSuccedents == nfaults + nWndNodes); + for (i = 0; i < nWndNodes; i++) { + RF_ASSERT(wndNodes[i].numAntecedents == 1); + commitNode->succedents[i] = &wndNodes[i]; + wndNodes[i].antecedents[0] = commitNode; + wndNodes[i].antType[0] = rf_control; + } + + /* link the commit node to wnp, wnq nodes */ + RF_ASSERT(wnpNode->numAntecedents == 1); + commitNode->succedents[nWndNodes] = wnpNode; + wnpNode->antecedents[0] = commitNode; + wnpNode->antType[0] = rf_control; + if (nfaults == 2) { + RF_ASSERT(wnqNode->numAntecedents == 1); + commitNode->succedents[nWndNodes + 1] = wnqNode; + wnqNode->antecedents[0] = commitNode; + wnqNode->antType[0] = rf_control; + } + + /* link write new data nodes to unblock node */ + RF_ASSERT(unblockNode->numAntecedents == (nWndNodes + nfaults)); + for(i = 0; i < nWndNodes; i++) { + RF_ASSERT(wndNodes[i].numSuccedents == 1); + wndNodes[i].succedents[0] = unblockNode; + unblockNode->antecedents[i] = &wndNodes[i]; + unblockNode->antType[i] = rf_control; + } + + /* link write new parity node to unblock node */ + RF_ASSERT(wnpNode->numSuccedents == 1); + wnpNode->succedents[0] = unblockNode; + unblockNode->antecedents[nWndNodes] = wnpNode; + unblockNode->antType[nWndNodes] = rf_control; + + /* link write new q node to unblock node */ + if (nfaults == 2) { + RF_ASSERT(wnqNode->numSuccedents == 1); + wnqNode->succedents[0] = unblockNode; + unblockNode->antecedents[nWndNodes+1] = wnqNode; + unblockNode->antType[nWndNodes+1] = rf_control; + } + + /* link unblock node to term node */ + RF_ASSERT(unblockNode->numSuccedents == 1); + RF_ASSERT(termNode->numAntecedents == 1); + RF_ASSERT(termNode->numSuccedents == 0); + unblockNode->succedents[0] = termNode; + termNode->antecedents[0] = unblockNode; + termNode->antType[0] = rf_control; +} + +#define CONS_PDA(if,start,num) \ + pda_p->row = asmap->if->row; pda_p->col = asmap->if->col; \ + pda_p->startSector = ((asmap->if->startSector / secPerSU) * secPerSU) + start; \ + pda_p->numSector = num; \ + pda_p->next = NULL; \ + RF_MallocAndAdd(pda_p->bufPtr,rf_RaidAddressToByte(raidPtr,num),(char *), allocList) + +void rf_WriteGenerateFailedAccessASMs( + RF_Raid_t *raidPtr, + RF_AccessStripeMap_t *asmap, + 
RF_PhysDiskAddr_t **pdap, + int *nNodep, + RF_PhysDiskAddr_t **pqpdap, + int *nPQNodep, + RF_AllocListElem_t *allocList) +{ + RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); + int PDAPerDisk,i; + RF_SectorCount_t secPerSU = layoutPtr->sectorsPerStripeUnit; + int numDataCol = layoutPtr->numDataCol; + int state; + unsigned napdas; + RF_SectorNum_t fone_start, fone_end, ftwo_start = 0, ftwo_end; + RF_PhysDiskAddr_t *fone = asmap->failedPDAs[0], *ftwo = asmap->failedPDAs[1]; + RF_PhysDiskAddr_t *pda_p; + RF_RaidAddr_t sosAddr; + + /* determine how many pda's we will have to generate per unaccess stripe. + If there is only one failed data unit, it is one; if two, possibly two, + depending wether they overlap. */ + + fone_start = rf_StripeUnitOffset(layoutPtr,fone->startSector); + fone_end = fone_start + fone->numSector; + + if (asmap->numDataFailed==1) + { + PDAPerDisk = 1; + state = 1; + RF_MallocAndAdd(*pqpdap,2*sizeof(RF_PhysDiskAddr_t),(RF_PhysDiskAddr_t *), allocList); + pda_p = *pqpdap; + /* build p */ + CONS_PDA(parityInfo,fone_start,fone->numSector); + pda_p->type = RF_PDA_TYPE_PARITY; + pda_p++; + /* build q */ + CONS_PDA(qInfo,fone_start,fone->numSector); + pda_p->type = RF_PDA_TYPE_Q; + } + else + { + ftwo_start = rf_StripeUnitOffset(layoutPtr,ftwo->startSector); + ftwo_end = ftwo_start + ftwo->numSector; + if (fone->numSector + ftwo->numSector > secPerSU) + { + PDAPerDisk = 1; + state = 2; + RF_MallocAndAdd(*pqpdap,2*sizeof(RF_PhysDiskAddr_t),(RF_PhysDiskAddr_t *), allocList); + pda_p = *pqpdap; + CONS_PDA(parityInfo,0,secPerSU); + pda_p->type = RF_PDA_TYPE_PARITY; + pda_p++; + CONS_PDA(qInfo,0,secPerSU); + pda_p->type = RF_PDA_TYPE_Q; + } + else + { + PDAPerDisk = 2; + state = 3; + /* four of them, fone, then ftwo */ + RF_MallocAndAdd(*pqpdap,4*sizeof(RF_PhysDiskAddr_t),(RF_PhysDiskAddr_t *), allocList); + pda_p = *pqpdap; + CONS_PDA(parityInfo,fone_start,fone->numSector); + pda_p->type = RF_PDA_TYPE_PARITY; + pda_p++; + CONS_PDA(qInfo,fone_start,fone->numSector); + pda_p->type = RF_PDA_TYPE_Q; + pda_p++; + CONS_PDA(parityInfo,ftwo_start,ftwo->numSector); + pda_p->type = RF_PDA_TYPE_PARITY; + pda_p++; + CONS_PDA(qInfo,ftwo_start,ftwo->numSector); + pda_p->type = RF_PDA_TYPE_Q; + } + } + /* figure out number of nonaccessed pda */ + napdas = PDAPerDisk * (numDataCol - 2); + *nPQNodep = PDAPerDisk; + + *nNodep = napdas; + if (napdas == 0) return; /* short circuit */ + + /* allocate up our list of pda's */ + + RF_CallocAndAdd(pda_p, napdas, sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList); + *pdap = pda_p; + + /* linkem together */ + for (i=0; i < (napdas-1); i++) + pda_p[i].next = pda_p+(i+1); + + sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress); + for (i=0; i < numDataCol; i++) + { + if ((pda_p - (*pdap)) == napdas) + continue; + pda_p->type = RF_PDA_TYPE_DATA; + pda_p->raidAddress = sosAddr + (i * secPerSU); + (raidPtr->Layout.map->MapSector)(raidPtr,pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0); + /* skip over dead disks */ + if (RF_DEAD_DISK(raidPtr->Disks[pda_p->row][pda_p->col].status)) + continue; + switch (state) + { + case 1: /* fone */ + pda_p->numSector = fone->numSector; + pda_p->raidAddress += fone_start; + pda_p->startSector += fone_start; + RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList); + break; + case 2: /* full stripe */ + pda_p->numSector = secPerSU; + RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,secPerSU), (char *), allocList); + 
break; + case 3: /* two slabs */ + pda_p->numSector = fone->numSector; + pda_p->raidAddress += fone_start; + pda_p->startSector += fone_start; + RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList); + pda_p++; + pda_p->type = RF_PDA_TYPE_DATA; + pda_p->raidAddress = sosAddr + (i * secPerSU); + (raidPtr->Layout.map->MapSector)(raidPtr,pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0); + pda_p->numSector = ftwo->numSector; + pda_p->raidAddress += ftwo_start; + pda_p->startSector += ftwo_start; + RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList); + break; + default: + RF_PANIC(); + } + pda_p++; + } + + RF_ASSERT (pda_p - *pdap == napdas); + return; +} + +#define DISK_NODE_PDA(node) ((node)->params[0].p) + +#define DISK_NODE_PARAMS(_node_,_p_) \ + (_node_).params[0].p = _p_ ; \ + (_node_).params[1].p = (_p_)->bufPtr; \ + (_node_).params[2].v = parityStripeID; \ + (_node_).params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru) + +void rf_DoubleDegSmallWrite( + RF_Raid_t *raidPtr, + RF_AccessStripeMap_t *asmap, + RF_DagHeader_t *dag_h, + void *bp, + RF_RaidAccessFlags_t flags, + RF_AllocListElem_t *allocList, + char *redundantReadNodeName, + char *redundantWriteNodeName, + char *recoveryNodeName, + int (*recovFunc)(RF_DagNode_t *)) +{ + RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); + RF_DagNode_t *nodes, *wudNodes, *rrdNodes, *recoveryNode, *blockNode, *unblockNode, *rpNodes,*rqNodes, *wpNodes, *wqNodes, *termNode; + RF_PhysDiskAddr_t *pda, *pqPDAs; + RF_PhysDiskAddr_t *npdas; + int nWriteNodes, nNodes, nReadNodes, nRrdNodes, nWudNodes, i; + RF_ReconUnitNum_t which_ru; + int nPQNodes; + RF_StripeNum_t parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr, asmap->raidAddress, &which_ru); + + /* simple small write case - + First part looks like a reconstruct-read of the failed data units. + Then a write of all data units not failed. */ + + + /* + Hdr + | + ------Block- + / / \ + Rrd Rrd ... Rrd Rp Rq + \ \ / + -------PQ----- + / \ \ + Wud Wp WQ + \ | / + --Unblock- + | + T + + Rrd = read recovery data (potentially none) + Wud = write user data (not incl. 
failed disks) + Wp = Write P (could be two) + Wq = Write Q (could be two) + + */ + + rf_WriteGenerateFailedAccessASMs(raidPtr, asmap, &npdas, &nRrdNodes, &pqPDAs, &nPQNodes,allocList); + + RF_ASSERT(asmap->numDataFailed == 1); + + nWudNodes = asmap->numStripeUnitsAccessed - (asmap->numDataFailed); + nReadNodes = nRrdNodes + 2*nPQNodes; + nWriteNodes = nWudNodes+ 2*nPQNodes; + nNodes = 4 + nReadNodes + nWriteNodes; + + RF_CallocAndAdd(nodes, nNodes, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList); + blockNode = nodes; + unblockNode = blockNode+1; + termNode = unblockNode+1; + recoveryNode = termNode+1; + rrdNodes = recoveryNode+1; + rpNodes = rrdNodes + nRrdNodes; + rqNodes = rpNodes + nPQNodes; + wudNodes = rqNodes + nPQNodes; + wpNodes = wudNodes + nWudNodes; + wqNodes = wpNodes + nPQNodes; + + dag_h->creator = "PQ_DDSimpleSmallWrite"; + dag_h->numSuccedents = 1; + dag_h->succedents[0] = blockNode; + rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm", allocList); + termNode->antecedents[0] = unblockNode; + termNode->antType[0] = rf_control; + + /* init the block and unblock nodes */ + /* The block node has all the read nodes as successors */ + rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nReadNodes, 0, 0, 0, dag_h, "Nil", allocList); + for (i=0; i < nReadNodes; i++) + blockNode->succedents[i] = rrdNodes+i; + + /* The unblock node has all the writes as successors */ + rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, 1, nWriteNodes, 0, 0, dag_h, "Nil", allocList); + for (i=0; i < nWriteNodes; i++) { + unblockNode->antecedents[i] = wudNodes+i; + unblockNode->antType[i] = rf_control; + } + unblockNode->succedents[0] = termNode; + +#define INIT_READ_NODE(node,name) \ + rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, name, allocList); \ + (node)->succedents[0] = recoveryNode; \ + (node)->antecedents[0] = blockNode; \ + (node)->antType[0] = rf_control; + + /* build the read nodes */ + pda = npdas; + for (i=0; i < nRrdNodes; i++, pda = pda->next) { + INIT_READ_NODE(rrdNodes+i,"rrd"); + DISK_NODE_PARAMS(rrdNodes[i],pda); + } + + /* read redundancy pdas */ + pda = pqPDAs; + INIT_READ_NODE(rpNodes,"Rp"); + RF_ASSERT(pda); + DISK_NODE_PARAMS(rpNodes[0],pda); + pda++; + INIT_READ_NODE(rqNodes, redundantReadNodeName ); + RF_ASSERT(pda); + DISK_NODE_PARAMS(rqNodes[0],pda); + if (nPQNodes==2) + { + pda++; + INIT_READ_NODE(rpNodes+1,"Rp"); + RF_ASSERT(pda); + DISK_NODE_PARAMS(rpNodes[1],pda); + pda++; + INIT_READ_NODE(rqNodes+1,redundantReadNodeName ); + RF_ASSERT(pda); + DISK_NODE_PARAMS(rqNodes[1],pda); + } + + /* the recovery node has all reads as precedessors and all writes as successors. + It generates a result for every write P or write Q node. + As parameters, it takes a pda per read and a pda per stripe of user data written. + It also takes as the last params the raidPtr and asm. + For results, it takes PDA for P & Q. 
*/ + + + rf_InitNode(recoveryNode, rf_wait, RF_FALSE, recovFunc, rf_NullNodeUndoFunc, NULL, + nWriteNodes, /* succesors */ + nReadNodes, /* preds */ + nReadNodes + nWudNodes + 3, /* params */ + 2 * nPQNodes, /* results */ + dag_h, recoveryNodeName, allocList); + + + + for (i=0; i < nReadNodes; i++ ) + { + recoveryNode->antecedents[i] = rrdNodes+i; + recoveryNode->antType[i] = rf_control; + recoveryNode->params[i].p = DISK_NODE_PDA(rrdNodes+i); + } + for (i=0; i < nWudNodes; i++) + { + recoveryNode->succedents[i] = wudNodes+i; + } + recoveryNode->params[nReadNodes+nWudNodes].p = asmap->failedPDAs[0]; + recoveryNode->params[nReadNodes+nWudNodes+1].p = raidPtr; + recoveryNode->params[nReadNodes+nWudNodes+2].p = asmap; + + for ( ; i < nWriteNodes; i++) + recoveryNode->succedents[i] = wudNodes+i; + + pda = pqPDAs; + recoveryNode->results[0] = pda; + pda++; + recoveryNode->results[1] = pda; + if ( nPQNodes == 2) + { + pda++; + recoveryNode->results[2] = pda; + pda++; + recoveryNode->results[3] = pda; + } + + /* fill writes */ +#define INIT_WRITE_NODE(node,name) \ + rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, name, allocList); \ + (node)->succedents[0] = unblockNode; \ + (node)->antecedents[0] = recoveryNode; \ + (node)->antType[0] = rf_control; + + pda = asmap->physInfo; + for (i=0; i < nWudNodes; i++) + { + INIT_WRITE_NODE(wudNodes+i,"Wd"); + DISK_NODE_PARAMS(wudNodes[i],pda); + recoveryNode->params[nReadNodes+i].p = DISK_NODE_PDA(wudNodes+i); + pda = pda->next; + } + /* write redundancy pdas */ + pda = pqPDAs; + INIT_WRITE_NODE(wpNodes,"Wp"); + RF_ASSERT(pda); + DISK_NODE_PARAMS(wpNodes[0],pda); + pda++; + INIT_WRITE_NODE(wqNodes,"Wq"); + RF_ASSERT(pda); + DISK_NODE_PARAMS(wqNodes[0],pda); + if (nPQNodes==2) + { + pda++; + INIT_WRITE_NODE(wpNodes+1,"Wp"); + RF_ASSERT(pda); + DISK_NODE_PARAMS(wpNodes[1],pda); + pda++; + INIT_WRITE_NODE(wqNodes+1,"Wq"); + RF_ASSERT(pda); + DISK_NODE_PARAMS(wqNodes[1],pda); + } +} diff --git a/sys/dev/raidframe/rf_dagdegwr.h b/sys/dev/raidframe/rf_dagdegwr.h new file mode 100644 index 00000000000..180c5f75668 --- /dev/null +++ b/sys/dev/raidframe/rf_dagdegwr.h @@ -0,0 +1,81 @@ +/* $OpenBSD: rf_dagdegwr.h,v 1.1 1999/01/11 14:29:08 niklas Exp $ */ +/* $NetBSD: rf_dagdegwr.h,v 1.1 1998/11/13 04:20:27 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland, Daniel Stodolsky, William V. Courtright II + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. 
+ */ + +/* + * : + * Log: rf_dagdegwr.h,v + * Revision 1.6 1996/07/31 16:30:28 jimz + * asm/asmap fix (EO merge) + * + * Revision 1.5 1996/07/31 15:35:38 jimz + * evenodd changes; bugfixes for double-degraded archs, generalize + * some formerly PQ-only functions + * + * Revision 1.4 1996/07/22 19:52:16 jimz + * switched node params to RF_DagParam_t, a union of + * a 64-bit int and a void *, for better portability + * attempted hpux port, but failed partway through for + * lack of a single C compiler capable of compiling all + * source files + * + * Revision 1.3 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.2 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.1 1996/05/03 19:21:28 wvcii + * Initial revision + * + */ + +#ifndef _RF__RF_DAGDEGWR_H_ +#define _RF__RF_DAGDEGWR_H_ + +/* degraded write DAG creation routines */ +void rf_CreateDegradedWriteDAG(RF_Raid_t *raidPtr, + RF_AccessStripeMap_t *asmap, RF_DagHeader_t *dag_h, void *bp, + RF_RaidAccessFlags_t flags, RF_AllocListElem_t *allocList); +void rf_CommonCreateSimpleDegradedWriteDAG(RF_Raid_t *raidPtr, + RF_AccessStripeMap_t *asmap, RF_DagHeader_t *dag_h, void *bp, + RF_RaidAccessFlags_t flags, RF_AllocListElem_t *allocList, + int nfaults, int (*redFunc)(RF_DagNode_t *), int allowBufferRecycle); +void rf_WriteGenerateFailedAccessASMs(RF_Raid_t *raidPtr, + RF_AccessStripeMap_t *asmap, RF_PhysDiskAddr_t **pdap, + int *nNodep, RF_PhysDiskAddr_t **pqpdap, + int *nPQNodep, RF_AllocListElem_t *allocList); +void rf_DoubleDegSmallWrite(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap, + RF_DagHeader_t *dag_h, void *bp, RF_RaidAccessFlags_t flags, + RF_AllocListElem_t *allocList, char *redundantReadNodeName, + char *redundantWriteNodeName, char *recoveryNodeName, + int (*recovFunc)(RF_DagNode_t *)); + +#endif /* !_RF__RF_DAGDEGWR_H_ */ diff --git a/sys/dev/raidframe/rf_dagffrd.c b/sys/dev/raidframe/rf_dagffrd.c new file mode 100644 index 00000000000..b831980cb0e --- /dev/null +++ b/sys/dev/raidframe/rf_dagffrd.c @@ -0,0 +1,500 @@ +/* $OpenBSD: rf_dagffrd.c,v 1.1 1999/01/11 14:29:08 niklas Exp $ */ +/* $NetBSD: rf_dagffrd.c,v 1.1 1998/11/13 04:20:27 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland, Daniel Stodolsky, William V. Courtright II + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. 
+ */ + +/* + * rf_dagffrd.c + * + * code for creating fault-free read DAGs + * + * : + * Log: rf_dagffrd.c,v + * Revision 1.14 1996/07/28 20:31:39 jimz + * i386netbsd port + * true/false fixup + * + * Revision 1.13 1996/07/22 19:52:16 jimz + * switched node params to RF_DagParam_t, a union of + * a 64-bit int and a void *, for better portability + * attempted hpux port, but failed partway through for + * lack of a single C compiler capable of compiling all + * source files + * + * Revision 1.12 1996/06/09 02:36:46 jimz + * lots of little crufty cleanup- fixup whitespace + * issues, comment #ifdefs, improve typing in some + * places (esp size-related) + * + * Revision 1.11 1996/06/06 17:30:44 jimz + * turn old Raid1 mirror read creation into a more generic function + * parameterized by an addtional parameter: type of mirrored read + * this is now used by other dag creation routines so chained declustering + * and raid1 can share dag creation code, but have different mirroring + * policies + * + * Revision 1.10 1996/05/31 22:26:54 jimz + * fix a lot of mapping problems, memory allocation problems + * found some weird lock issues, fixed 'em + * more code cleanup + * + * Revision 1.9 1996/05/30 11:29:41 jimz + * Numerous bug fixes. Stripe lock release code disagreed with the taking code + * about when stripes should be locked (I made it consistent: no parity, no lock) + * There was a lot of extra serialization of I/Os which I've removed- a lot of + * it was to calculate values for the cache code, which is no longer with us. + * More types, function, macro cleanup. Added code to properly quiesce the array + * on shutdown. Made a lot of stuff array-specific which was (bogusly) general + * before. Fixed memory allocation, freeing bugs. + * + * Revision 1.8 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.7 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.6 1996/05/24 04:28:55 jimz + * release cleanup ckpt + * + * Revision 1.5 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.4 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.3 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.2 1996/05/08 21:01:24 jimz + * fixed up enum type names that were conflicting with other + * enums and function names (ie, "panic") + * future naming trends will be towards RF_ and rf_ for + * everything raidframe-related + * + * Revision 1.1 1996/05/03 19:19:20 wvcii + * Initial revision + * + */ + +#include "rf_types.h" +#include "rf_raid.h" +#include "rf_dag.h" +#include "rf_dagutils.h" +#include "rf_dagfuncs.h" +#include "rf_threadid.h" +#include "rf_debugMem.h" +#include "rf_memchunk.h" +#include "rf_general.h" +#include "rf_dagffrd.h" + +/****************************************************************************** + * + * General comments on DAG creation: + * + * All DAGs in this file use roll-away error recovery. Each DAG has a single + * commit node, usually called "Cmt." If an error occurs before the Cmt node + * is reached, the execution engine will halt forward execution and work + * backward through the graph, executing the undo functions. 
Assuming that + * each node in the graph prior to the Cmt node are undoable and atomic - or - + * does not make changes to permanent state, the graph will fail atomically. + * If an error occurs after the Cmt node executes, the engine will roll-forward + * through the graph, blindly executing nodes until it reaches the end. + * If a graph reaches the end, it is assumed to have completed successfully. + * + * A graph has only 1 Cmt node. + * + */ + + +/****************************************************************************** + * + * The following wrappers map the standard DAG creation interface to the + * DAG creation routines. Additionally, these wrappers enable experimentation + * with new DAG structures by providing an extra level of indirection, allowing + * the DAG creation routines to be replaced at this single point. + */ + +void rf_CreateFaultFreeReadDAG( + RF_Raid_t *raidPtr, + RF_AccessStripeMap_t *asmap, + RF_DagHeader_t *dag_h, + void *bp, + RF_RaidAccessFlags_t flags, + RF_AllocListElem_t *allocList) +{ + rf_CreateNonredundantDAG(raidPtr, asmap, dag_h, bp, flags, allocList, + RF_IO_TYPE_READ); +} + + +/****************************************************************************** + * + * DAG creation code begins here + */ + +/****************************************************************************** + * + * creates a DAG to perform a nonredundant read or write of data within one + * stripe. + * For reads, this DAG is as follows: + * + * /---- read ----\ + * Header -- Block ---- read ---- Commit -- Terminate + * \---- read ----/ + * + * For writes, this DAG is as follows: + * + * /---- write ----\ + * Header -- Commit ---- write ---- Block -- Terminate + * \---- write ----/ + * + * There is one disk node per stripe unit accessed, and all disk nodes are in + * parallel. + * + * Tricky point here: The first disk node (read or write) is created + * normally. Subsequent disk nodes are created by copying the first one, + * and modifying a few params. The "succedents" and "antecedents" fields are + * _not_ re-created in each node, but rather left pointing to the same array + * that was malloc'd when the first node was created. Thus, it's essential + * that when this DAG is freed, the succedents and antecedents fields be freed + * in ONLY ONE of the read nodes. This does not apply to the "params" field + * because it is recreated for each READ node. + * + * Note that normal-priority accesses do not need to be tagged with their + * parity stripe ID, because they will never be promoted. Hence, I've + * commented-out the code to do this, and marked it with UNNEEDED. 
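Every graph in this file relies on that roll-away contract, so a linearized sketch of the policy may help. It is illustrative only: the real execution engine walks a general DAG rather than an array, and every name below is made up for the example.

struct rollaway_node {
	const char *name;
	int (*doFunc)(void);	/* returns 0 on success */
	void (*undoFunc)(void);
};

/* Execute nodes in order; "cmt" is the index of the commit node. */
static int
rollaway_execute(struct rollaway_node *n, int nnodes, int cmt)
{
	int i;

	for (i = 0; i < nnodes; i++) {
		if (n[i].doFunc() == 0)
			continue;
		if (i <= cmt) {
			/* failed before Cmt completed: undo what was done */
			while (--i >= 0)
				n[i].undoFunc();
			return (-1);	/* the graph fails atomically */
		}
		/* failed after Cmt: keep rolling forward to the end */
	}
	return (0);	/* reaching the end is treated as success */
}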
+ * + *****************************************************************************/ + +void rf_CreateNonredundantDAG( + RF_Raid_t *raidPtr, + RF_AccessStripeMap_t *asmap, + RF_DagHeader_t *dag_h, + void *bp, + RF_RaidAccessFlags_t flags, + RF_AllocListElem_t *allocList, + RF_IoType_t type) +{ + RF_DagNode_t *nodes, *diskNodes, *blockNode, *commitNode, *termNode; + RF_PhysDiskAddr_t *pda = asmap->physInfo; + int (*doFunc)(RF_DagNode_t *), (*undoFunc)(RF_DagNode_t *); + int i, n, totalNumNodes; + char *name; + + n = asmap->numStripeUnitsAccessed; + dag_h->creator = "NonredundantDAG"; + + RF_ASSERT(RF_IO_IS_R_OR_W(type)); + switch (type) { + case RF_IO_TYPE_READ: + doFunc = rf_DiskReadFunc; + undoFunc = rf_DiskReadUndoFunc; + name = "R "; + if (rf_dagDebug) printf("[Creating non-redundant read DAG]\n"); + break; + case RF_IO_TYPE_WRITE: + doFunc = rf_DiskWriteFunc; + undoFunc = rf_DiskWriteUndoFunc; + name = "W "; + if (rf_dagDebug) printf("[Creating non-redundant write DAG]\n"); + break; + default: + RF_PANIC(); + } + + /* + * For reads, the dag can not commit until the block node is reached. + * for writes, the dag commits immediately. + */ + dag_h->numCommitNodes = 1; + dag_h->numCommits = 0; + dag_h->numSuccedents = 1; + + /* + * Node count: + * 1 block node + * n data reads (or writes) + * 1 commit node + * 1 terminator node + */ + RF_ASSERT(n > 0); + totalNumNodes = n + 3; + RF_CallocAndAdd(nodes, totalNumNodes, sizeof(RF_DagNode_t), + (RF_DagNode_t *), allocList); + i = 0; + diskNodes = &nodes[i]; i += n; + blockNode = &nodes[i]; i += 1; + commitNode = &nodes[i]; i += 1; + termNode = &nodes[i]; i += 1; + RF_ASSERT(i == totalNumNodes); + + /* initialize nodes */ + switch (type) { + case RF_IO_TYPE_READ: + rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, + NULL, n, 0, 0, 0, dag_h, "Nil", allocList); + rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc, + NULL, 1, n, 0, 0, dag_h, "Cmt", allocList); + rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, + NULL, 0, 1, 0, 0, dag_h, "Trm", allocList); + break; + case RF_IO_TYPE_WRITE: + rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, + NULL, 1, 0, 0, 0, dag_h, "Nil", allocList); + rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc, + NULL, n, 1, 0, 0, dag_h, "Cmt", allocList); + rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, + NULL, 0, n, 0, 0, dag_h, "Trm", allocList); + break; + default: + RF_PANIC(); + } + + for (i = 0; i < n; i++) { + RF_ASSERT(pda != NULL); + rf_InitNode(&diskNodes[i], rf_wait, RF_FALSE, doFunc, undoFunc, rf_GenericWakeupFunc, + 1, 1, 4, 0, dag_h, name, allocList); + diskNodes[i].params[0].p = pda; + diskNodes[i].params[1].p = pda->bufPtr; + /* parity stripe id is not necessary */ + diskNodes[i].params[2].v = 0; + diskNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, 0); + pda = pda->next; + } + + /* + * Connect nodes. 
+ */ + + /* connect hdr to block node */ + RF_ASSERT(blockNode->numAntecedents == 0); + dag_h->succedents[0] = blockNode; + + if (type == RF_IO_TYPE_READ) { + /* connecting a nonredundant read DAG */ + RF_ASSERT(blockNode->numSuccedents == n); + RF_ASSERT(commitNode->numAntecedents == n); + for (i=0; i < n; i++) { + /* connect block node to each read node */ + RF_ASSERT(diskNodes[i].numAntecedents == 1); + blockNode->succedents[i] = &diskNodes[i]; + diskNodes[i].antecedents[0] = blockNode; + diskNodes[i].antType[0] = rf_control; + + /* connect each read node to the commit node */ + RF_ASSERT(diskNodes[i].numSuccedents == 1); + diskNodes[i].succedents[0] = commitNode; + commitNode->antecedents[i] = &diskNodes[i]; + commitNode->antType[i] = rf_control; + } + /* connect the commit node to the term node */ + RF_ASSERT(commitNode->numSuccedents == 1); + RF_ASSERT(termNode->numAntecedents == 1); + RF_ASSERT(termNode->numSuccedents == 0); + commitNode->succedents[0] = termNode; + termNode->antecedents[0] = commitNode; + termNode->antType[0] = rf_control; + } + else { + /* connecting a nonredundant write DAG */ + /* connect the block node to the commit node */ + RF_ASSERT(blockNode->numSuccedents == 1); + RF_ASSERT(commitNode->numAntecedents == 1); + blockNode->succedents[0] = commitNode; + commitNode->antecedents[0] = blockNode; + commitNode->antType[0] = rf_control; + + RF_ASSERT(commitNode->numSuccedents == n); + RF_ASSERT(termNode->numAntecedents == n); + RF_ASSERT(termNode->numSuccedents == 0); + for (i=0; i < n; i++) { + /* connect the commit node to each write node */ + RF_ASSERT(diskNodes[i].numAntecedents == 1); + commitNode->succedents[i] = &diskNodes[i]; + diskNodes[i].antecedents[0] = commitNode; + diskNodes[i].antType[0] = rf_control; + + /* connect each write node to the term node */ + RF_ASSERT(diskNodes[i].numSuccedents == 1); + diskNodes[i].succedents[0] = termNode; + termNode->antecedents[i] = &diskNodes[i]; + termNode->antType[i] = rf_control; + } + } +} + +/****************************************************************************** + * Create a fault-free read DAG for RAID level 1 + * + * Hdr -> Nil -> Rmir -> Cmt -> Trm + * + * The "Rmir" node schedules a read from the disk in the mirror pair with the + * shortest disk queue. the proper queue is selected at Rmir execution. this + * deferred mapping is unlike other archs in RAIDframe which generally fix + * mapping at DAG creation time. + * + * Parameters: raidPtr - description of the physical array + * asmap - logical & physical addresses for this access + * bp - buffer ptr (for holding read data) + * flags - general flags (e.g. disk locking) + * allocList - list of memory allocated in DAG creation + *****************************************************************************/ + +static void CreateMirrorReadDAG( + RF_Raid_t *raidPtr, + RF_AccessStripeMap_t *asmap, + RF_DagHeader_t *dag_h, + void *bp, + RF_RaidAccessFlags_t flags, + RF_AllocListElem_t *allocList, + int (*readfunc)(RF_DagNode_t *node)) +{ + RF_DagNode_t *readNodes, *nodes, *blockNode, *commitNode, *termNode; + RF_PhysDiskAddr_t *data_pda = asmap->physInfo; + RF_PhysDiskAddr_t *parity_pda = asmap->parityInfo; + int i, n, totalNumNodes; + + n = asmap->numStripeUnitsAccessed; + dag_h->creator = "RaidOneReadDAG"; + if (rf_dagDebug) { + printf("[Creating RAID level 1 read DAG]\n"); + } + + /* + * This dag can not commit until the commit node is reached + * errors prior to the commit point imply the dag has failed. 
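The point worth noting is that the copy is not chosen here at all: both the data pda and its mirror travel in the node's params, and the decision is deferred to the read function (rf_DiskReadMirrorIdleFunc or rf_DiskReadMirrorPartitionFunc, passed in as readfunc and not shown in this file). The fragment below only illustrates the shortest-queue idea; the struct and its queue_len field are stand-ins, not RAIDframe types.

struct mirror_copy {
	int queue_len;		/* outstanding requests on this copy's disk */
	long start_sector;
};

/* Decided when the Rmir node fires, not when the DAG is built. */
static const struct mirror_copy *
pick_shorter_queue(const struct mirror_copy *data,
    const struct mirror_copy *mirror)
{
	return (data->queue_len <= mirror->queue_len ? data : mirror);
}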
+ */ + dag_h->numCommitNodes = 1; + dag_h->numCommits = 0; + dag_h->numSuccedents = 1; + + /* + * Node count: + * n data reads + * 1 block node + * 1 commit node + * 1 terminator node + */ + RF_ASSERT(n > 0); + totalNumNodes = n + 3; + RF_CallocAndAdd(nodes, totalNumNodes, sizeof(RF_DagNode_t), + (RF_DagNode_t *), allocList); + i = 0; + readNodes = &nodes[i]; i += n; + blockNode = &nodes[i]; i += 1; + commitNode = &nodes[i]; i += 1; + termNode = &nodes[i]; i += 1; + RF_ASSERT(i == totalNumNodes); + + /* initialize nodes */ + rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, + rf_NullNodeUndoFunc, NULL, n, 0, 0, 0, dag_h, "Nil", allocList); + rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, + rf_NullNodeUndoFunc, NULL, 1, n, 0, 0, dag_h, "Cmt", allocList); + rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, + rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm", allocList); + + for (i = 0; i < n; i++) { + RF_ASSERT(data_pda != NULL); + RF_ASSERT(parity_pda != NULL); + rf_InitNode(&readNodes[i], rf_wait, RF_FALSE, readfunc, + rf_DiskReadMirrorUndoFunc, rf_GenericWakeupFunc, 1, 1, 5, 0, dag_h, + "Rmir", allocList); + readNodes[i].params[0].p = data_pda; + readNodes[i].params[1].p = data_pda->bufPtr; + /* parity stripe id is not necessary */ + readNodes[i].params[2].p = 0; + readNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, 0); + readNodes[i].params[4].p = parity_pda; + data_pda = data_pda->next; + parity_pda = parity_pda->next; + } + + /* + * Connect nodes + */ + + /* connect hdr to block node */ + RF_ASSERT(blockNode->numAntecedents == 0); + dag_h->succedents[0] = blockNode; + + /* connect block node to read nodes */ + RF_ASSERT(blockNode->numSuccedents == n); + for (i=0; i < n; i++) { + RF_ASSERT(readNodes[i].numAntecedents == 1); + blockNode->succedents[i] = &readNodes[i]; + readNodes[i].antecedents[0] = blockNode; + readNodes[i].antType[0] = rf_control; + } + + /* connect read nodes to commit node */ + RF_ASSERT(commitNode->numAntecedents == n); + for (i=0; i < n; i++) { + RF_ASSERT(readNodes[i].numSuccedents == 1); + readNodes[i].succedents[0] = commitNode; + commitNode->antecedents[i] = &readNodes[i]; + commitNode->antType[i] = rf_control; + } + + /* connect commit node to term node */ + RF_ASSERT(commitNode->numSuccedents == 1); + RF_ASSERT(termNode->numAntecedents == 1); + RF_ASSERT(termNode->numSuccedents == 0); + commitNode->succedents[0] = termNode; + termNode->antecedents[0] = commitNode; + termNode->antType[0] = rf_control; +} + +void rf_CreateMirrorIdleReadDAG( + RF_Raid_t *raidPtr, + RF_AccessStripeMap_t *asmap, + RF_DagHeader_t *dag_h, + void *bp, + RF_RaidAccessFlags_t flags, + RF_AllocListElem_t *allocList) +{ + CreateMirrorReadDAG(raidPtr, asmap, dag_h, bp, flags, allocList, + rf_DiskReadMirrorIdleFunc); +} + +void rf_CreateMirrorPartitionReadDAG( + RF_Raid_t *raidPtr, + RF_AccessStripeMap_t *asmap, + RF_DagHeader_t *dag_h, + void *bp, + RF_RaidAccessFlags_t flags, + RF_AllocListElem_t *allocList) +{ + CreateMirrorReadDAG(raidPtr, asmap, dag_h, bp, flags, allocList, + rf_DiskReadMirrorPartitionFunc); +} diff --git a/sys/dev/raidframe/rf_dagffrd.h b/sys/dev/raidframe/rf_dagffrd.h new file mode 100644 index 00000000000..61e3ee86241 --- /dev/null +++ b/sys/dev/raidframe/rf_dagffrd.h @@ -0,0 +1,75 @@ +/* $OpenBSD: rf_dagffrd.h,v 1.1 1999/01/11 14:29:08 niklas Exp $ */ +/* $NetBSD: rf_dagffrd.h,v 1.1 1998/11/13 04:20:27 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. 
+ * + * Author: Mark Holland, Daniel Stodolsky, William V. Courtright II + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* + * : + * Log: rf_dagffrd.h,v + * Revision 1.5 1996/07/22 19:52:16 jimz + * switched node params to RF_DagParam_t, a union of + * a 64-bit int and a void *, for better portability + * attempted hpux port, but failed partway through for + * lack of a single C compiler capable of compiling all + * source files + * + * Revision 1.4 1996/06/06 17:31:13 jimz + * new mirror read creation dags + * + * Revision 1.3 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.2 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.1 1996/05/03 19:19:53 wvcii + * Initial revision + * + */ + +#ifndef _RF__RF_DAGFFRD_H_ +#define _RF__RF_DAGFFRD_H_ + +#include "rf_types.h" + +/* fault-free read DAG creation routines */ +void rf_CreateFaultFreeReadDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap, + RF_DagHeader_t *dag_h, void *bp, RF_RaidAccessFlags_t flags, + RF_AllocListElem_t *allocList); +void rf_CreateNonredundantDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap, + RF_DagHeader_t *dag_h, void *bp, RF_RaidAccessFlags_t flags, + RF_AllocListElem_t *allocList, RF_IoType_t type); +void rf_CreateMirrorIdleReadDAG(RF_Raid_t *raidPtr, + RF_AccessStripeMap_t *asmap, RF_DagHeader_t *dag_h, void *bp, + RF_RaidAccessFlags_t flags, RF_AllocListElem_t *allocList); +void rf_CreateMirrorPartitionReadDAG(RF_Raid_t *raidPtr, + RF_AccessStripeMap_t *asmap, RF_DagHeader_t *dag_h, void *bp, + RF_RaidAccessFlags_t flags, RF_AllocListElem_t *allocList); + +#endif /* !_RF__RF_DAGFFRD_H_ */ diff --git a/sys/dev/raidframe/rf_dagffwr.c b/sys/dev/raidframe/rf_dagffwr.c new file mode 100644 index 00000000000..f502de1b293 --- /dev/null +++ b/sys/dev/raidframe/rf_dagffwr.c @@ -0,0 +1,2202 @@ +/* $OpenBSD: rf_dagffwr.c,v 1.1 1999/01/11 14:29:09 niklas Exp $ */ +/* $NetBSD: rf_dagffwr.c,v 1.1 1998/11/13 04:20:27 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland, Daniel Stodolsky, William V. Courtright II + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. 
+ * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* + * rf_dagff.c + * + * code for creating fault-free DAGs + * + * : + * Log: rf_dagffwr.c,v + * Revision 1.19 1996/07/31 15:35:24 jimz + * evenodd changes; bugfixes for double-degraded archs, generalize + * some formerly PQ-only functions + * + * Revision 1.18 1996/07/28 20:31:39 jimz + * i386netbsd port + * true/false fixup + * + * Revision 1.17 1996/07/27 18:40:24 jimz + * cleanup sweep + * + * Revision 1.16 1996/07/22 19:52:16 jimz + * switched node params to RF_DagParam_t, a union of + * a 64-bit int and a void *, for better portability + * attempted hpux port, but failed partway through for + * lack of a single C compiler capable of compiling all + * source files + * + * Revision 1.15 1996/06/11 01:27:50 jimz + * Fixed bug where diskthread shutdown would crash or hang. This + * turned out to be two distinct bugs: + * (1) [crash] The thread shutdown code wasn't properly waiting for + * all the diskthreads to complete. This caused diskthreads that were + * exiting+cleaning up to unlock a destroyed mutex. + * (2) [hang] TerminateDiskQueues wasn't locking, and DiskIODequeue + * only checked for termination _after_ a wakeup if the queues were + * empty. This was a race where the termination wakeup could be lost + * by the dequeueing thread, and the system would hang waiting for the + * thread to exit, while the thread waited for an I/O or a signal to + * check the termination flag. + * + * Revision 1.14 1996/06/10 22:24:01 wvcii + * added write dags which do not have a commit node and are + * used in forward and backward error recovery experiments. + * + * Revision 1.13 1996/06/07 22:26:27 jimz + * type-ify which_ru (RF_ReconUnitNum_t) + * + * Revision 1.12 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.11 1996/05/31 22:26:54 jimz + * fix a lot of mapping problems, memory allocation problems + * found some weird lock issues, fixed 'em + * more code cleanup + * + * Revision 1.10 1996/05/30 11:29:41 jimz + * Numerous bug fixes. Stripe lock release code disagreed with the taking code + * about when stripes should be locked (I made it consistent: no parity, no lock) + * There was a lot of extra serialization of I/Os which I've removed- a lot of + * it was to calculate values for the cache code, which is no longer with us. + * More types, function, macro cleanup. Added code to properly quiesce the array + * on shutdown. Made a lot of stuff array-specific which was (bogusly) general + * before. Fixed memory allocation, freeing bugs. 
+ * + * Revision 1.9 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.8 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.7 1996/05/24 04:28:55 jimz + * release cleanup ckpt + * + * Revision 1.6 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.5 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.4 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.3 1996/05/15 23:23:12 wvcii + * fixed bug in small write read old q node succedent initialization + * + * Revision 1.2 1996/05/08 21:01:24 jimz + * fixed up enum type names that were conflicting with other + * enums and function names (ie, "panic") + * future naming trends will be towards RF_ and rf_ for + * everything raidframe-related + * + * Revision 1.1 1996/05/03 19:20:45 wvcii + * Initial revision + * + */ + +#include "rf_types.h" +#include "rf_raid.h" +#include "rf_dag.h" +#include "rf_dagutils.h" +#include "rf_dagfuncs.h" +#include "rf_threadid.h" +#include "rf_debugMem.h" +#include "rf_dagffrd.h" +#include "rf_memchunk.h" +#include "rf_general.h" +#include "rf_dagffwr.h" + +/****************************************************************************** + * + * General comments on DAG creation: + * + * All DAGs in this file use roll-away error recovery. Each DAG has a single + * commit node, usually called "Cmt." If an error occurs before the Cmt node + * is reached, the execution engine will halt forward execution and work + * backward through the graph, executing the undo functions. Assuming that + * each node in the graph prior to the Cmt node are undoable and atomic - or - + * does not make changes to permanent state, the graph will fail atomically. + * If an error occurs after the Cmt node executes, the engine will roll-forward + * through the graph, blindly executing nodes until it reaches the end. + * If a graph reaches the end, it is assumed to have completed successfully. + * + * A graph has only 1 Cmt node. + * + */ + + +/****************************************************************************** + * + * The following wrappers map the standard DAG creation interface to the + * DAG creation routines. Additionally, these wrappers enable experimentation + * with new DAG structures by providing an extra level of indirection, allowing + * the DAG creation routines to be replaced at this single point. 
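In practice the "single point" works because every creation routine shares one call signature, so an experiment only has to repoint whichever entry selects the routine. A stand-in sketch follows; the opaque pointer and integer types substitute for RF_Raid_t, RF_AccessStripeMap_t, RF_DagHeader_t, RF_RaidAccessFlags_t and RF_AllocListElem_t, and the initializer name is hypothetical.

typedef void rf_dag_creator_f(void *raidPtr, void *asmap, void *dag_h,
    void *bp, unsigned long flags, void *allocList);

/* Swapping the DAG shape means repointing one entry, not editing callers. */
static rf_dag_creator_f *small_write_creator;	/* e.g. experimental_small_write_dag */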
+ */ + + +void rf_CreateNonRedundantWriteDAG( + RF_Raid_t *raidPtr, + RF_AccessStripeMap_t *asmap, + RF_DagHeader_t *dag_h, + void *bp, + RF_RaidAccessFlags_t flags, + RF_AllocListElem_t *allocList, + RF_IoType_t type) +{ + rf_CreateNonredundantDAG(raidPtr, asmap, dag_h, bp, flags, allocList, + RF_IO_TYPE_WRITE); +} + +void rf_CreateRAID0WriteDAG( + RF_Raid_t *raidPtr, + RF_AccessStripeMap_t *asmap, + RF_DagHeader_t *dag_h, + void *bp, + RF_RaidAccessFlags_t flags, + RF_AllocListElem_t *allocList, + RF_IoType_t type) +{ + rf_CreateNonredundantDAG(raidPtr, asmap, dag_h, bp, flags, allocList, + RF_IO_TYPE_WRITE); +} + +void rf_CreateSmallWriteDAG( + RF_Raid_t *raidPtr, + RF_AccessStripeMap_t *asmap, + RF_DagHeader_t *dag_h, + void *bp, + RF_RaidAccessFlags_t flags, + RF_AllocListElem_t *allocList) +{ +#if RF_FORWARD > 0 + rf_CommonCreateSmallWriteDAGFwd(raidPtr, asmap, dag_h, bp, flags, allocList, + &rf_xorFuncs, NULL); +#else /* RF_FORWARD > 0 */ +#if RF_BACKWARD > 0 + rf_CommonCreateSmallWriteDAGFwd(raidPtr, asmap, dag_h, bp, flags, allocList, + &rf_xorFuncs, NULL); +#else /* RF_BACKWARD > 0 */ + /* "normal" rollaway */ + rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, + &rf_xorFuncs, NULL); +#endif /* RF_BACKWARD > 0 */ +#endif /* RF_FORWARD > 0 */ +} + +void rf_CreateLargeWriteDAG( + RF_Raid_t *raidPtr, + RF_AccessStripeMap_t *asmap, + RF_DagHeader_t *dag_h, + void *bp, + RF_RaidAccessFlags_t flags, + RF_AllocListElem_t *allocList) +{ +#if RF_FORWARD > 0 + rf_CommonCreateLargeWriteDAGFwd(raidPtr, asmap, dag_h, bp, flags, allocList, + 1, rf_RegularXorFunc, RF_TRUE); +#else /* RF_FORWARD > 0 */ +#if RF_BACKWARD > 0 + rf_CommonCreateLargeWriteDAGFwd(raidPtr, asmap, dag_h, bp, flags, allocList, + 1, rf_RegularXorFunc, RF_TRUE); +#else /* RF_BACKWARD > 0 */ + /* "normal" rollaway */ + rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, + 1, rf_RegularXorFunc, RF_TRUE); +#endif /* RF_BACKWARD > 0 */ +#endif /* RF_FORWARD > 0 */ +} + + +/****************************************************************************** + * + * DAG creation code begins here + */ + + +/****************************************************************************** + * + * creates a DAG to perform a large-write operation: + * + * / Rod \ / Wnd \ + * H -- block- Rod - Xor - Cmt - Wnd --- T + * \ Rod / \ Wnp / + * \[Wnq]/ + * + * The XOR node also does the Q calculation in the P+Q architecture. + * All nodes are before the commit node (Cmt) are assumed to be atomic and + * undoable - or - they make no changes to permanent state. + * + * Rod = read old data + * Cmt = commit node + * Wnp = write new parity + * Wnd = write new data + * Wnq = write new "q" + * [] denotes optional segments in the graph + * + * Parameters: raidPtr - description of the physical array + * asmap - logical & physical addresses for this access + * bp - buffer ptr (holds write data) + * flags - general flags (e.g. 
disk locking) + * allocList - list of memory allocated in DAG creation + * nfaults - number of faults array can tolerate + * (equal to # redundancy units in stripe) + * redfuncs - list of redundancy generating functions + * + *****************************************************************************/ + +void rf_CommonCreateLargeWriteDAG( + RF_Raid_t *raidPtr, + RF_AccessStripeMap_t *asmap, + RF_DagHeader_t *dag_h, + void *bp, + RF_RaidAccessFlags_t flags, + RF_AllocListElem_t *allocList, + int nfaults, + int (*redFunc)(RF_DagNode_t *), + int allowBufferRecycle) +{ + RF_DagNode_t *nodes, *wndNodes, *rodNodes, *xorNode, *wnpNode; + RF_DagNode_t *wnqNode, *blockNode, *commitNode, *termNode; + int nWndNodes, nRodNodes, i, nodeNum, asmNum; + RF_AccessStripeMapHeader_t *new_asm_h[2]; + RF_StripeNum_t parityStripeID; + char *sosBuffer, *eosBuffer; + RF_ReconUnitNum_t which_ru; + RF_RaidLayout_t *layoutPtr; + RF_PhysDiskAddr_t *pda; + + layoutPtr = &(raidPtr->Layout); + parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr, asmap->raidAddress, + &which_ru); + + if (rf_dagDebug) { + printf("[Creating large-write DAG]\n"); + } + dag_h->creator = "LargeWriteDAG"; + + dag_h->numCommitNodes = 1; + dag_h->numCommits = 0; + dag_h->numSuccedents = 1; + + /* alloc the nodes: Wnd, xor, commit, block, term, and Wnp */ + nWndNodes = asmap->numStripeUnitsAccessed; + RF_CallocAndAdd(nodes, nWndNodes + 4 + nfaults, sizeof(RF_DagNode_t), + (RF_DagNode_t *), allocList); + i = 0; + wndNodes = &nodes[i]; i += nWndNodes; + xorNode = &nodes[i]; i += 1; + wnpNode = &nodes[i]; i += 1; + blockNode = &nodes[i]; i += 1; + commitNode = &nodes[i]; i += 1; + termNode = &nodes[i]; i += 1; + if (nfaults == 2) { + wnqNode = &nodes[i]; i += 1; + } + else { + wnqNode = NULL; + } + rf_MapUnaccessedPortionOfStripe(raidPtr, layoutPtr, asmap, dag_h, new_asm_h, + &nRodNodes, &sosBuffer, &eosBuffer, allocList); + if (nRodNodes > 0) { + RF_CallocAndAdd(rodNodes, nRodNodes, sizeof(RF_DagNode_t), + (RF_DagNode_t *), allocList); + } + else { + rodNodes = NULL; + } + + /* begin node initialization */ + if (nRodNodes > 0) { + rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, + NULL, nRodNodes, 0, 0, 0, dag_h, "Nil", allocList); + } + else { + rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, + NULL, 1, 0, 0, 0, dag_h, "Nil", allocList); + } + + rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, + nWndNodes + nfaults, 1, 0, 0, dag_h, "Cmt", allocList); + rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, + 0, nWndNodes + nfaults, 0, 0, dag_h, "Trm", allocList); + + /* initialize the Rod nodes */ + for (nodeNum = asmNum = 0; asmNum < 2; asmNum++) { + if (new_asm_h[asmNum]) { + pda = new_asm_h[asmNum]->stripeMap->physInfo; + while (pda) { + rf_InitNode(&rodNodes[nodeNum], rf_wait, RF_FALSE, rf_DiskReadFunc, + rf_DiskReadUndoFunc,rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, + "Rod", allocList); + rodNodes[nodeNum].params[0].p = pda; + rodNodes[nodeNum].params[1].p = pda->bufPtr; + rodNodes[nodeNum].params[2].v = parityStripeID; + rodNodes[nodeNum].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, + 0, 0, which_ru); + nodeNum++; + pda = pda->next; + } + } + } + RF_ASSERT(nodeNum == nRodNodes); + + /* initialize the wnd nodes */ + pda = asmap->physInfo; + for (i=0; i < nWndNodes; i++) { + rf_InitNode(&wndNodes[i], rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, + rf_GenericWakeupFunc, 1, 1, 4, 0, 
dag_h, "Wnd", allocList); + RF_ASSERT(pda != NULL); + wndNodes[i].params[0].p = pda; + wndNodes[i].params[1].p = pda->bufPtr; + wndNodes[i].params[2].v = parityStripeID; + wndNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); + pda = pda->next; + } + + /* initialize the redundancy node */ + if (nRodNodes > 0) { + rf_InitNode(xorNode, rf_wait, RF_FALSE, redFunc, rf_NullNodeUndoFunc, NULL, 1, + nRodNodes, 2 * (nWndNodes+nRodNodes) + 1, nfaults, dag_h, + "Xr ", allocList); + } + else { + rf_InitNode(xorNode, rf_wait, RF_FALSE, redFunc, rf_NullNodeUndoFunc, NULL, 1, + 1, 2 * (nWndNodes+nRodNodes) + 1, nfaults, dag_h, "Xr ", allocList); + } + xorNode->flags |= RF_DAGNODE_FLAG_YIELD; + for (i=0; i < nWndNodes; i++) { + xorNode->params[2*i+0] = wndNodes[i].params[0]; /* pda */ + xorNode->params[2*i+1] = wndNodes[i].params[1]; /* buf ptr */ + } + for (i=0; i < nRodNodes; i++) { + xorNode->params[2*(nWndNodes+i)+0] = rodNodes[i].params[0]; /* pda */ + xorNode->params[2*(nWndNodes+i)+1] = rodNodes[i].params[1]; /* buf ptr */ + } + /* xor node needs to get at RAID information */ + xorNode->params[2*(nWndNodes+nRodNodes)].p = raidPtr; + + /* + * Look for an Rod node that reads a complete SU. If none, alloc a buffer + * to receive the parity info. Note that we can't use a new data buffer + * because it will not have gotten written when the xor occurs. + */ + if (allowBufferRecycle) { + for (i = 0; i < nRodNodes; i++) { + if (((RF_PhysDiskAddr_t *)rodNodes[i].params[0].p)->numSector == raidPtr->Layout.sectorsPerStripeUnit) + break; + } + } + if ((!allowBufferRecycle) || (i == nRodNodes)) { + RF_CallocAndAdd(xorNode->results[0], 1, + rf_RaidAddressToByte(raidPtr, raidPtr->Layout.sectorsPerStripeUnit), + (void *), allocList); + } + else { + xorNode->results[0] = rodNodes[i].params[1].p; + } + + /* initialize the Wnp node */ + rf_InitNode(wnpNode, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, + rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnp", allocList); + wnpNode->params[0].p = asmap->parityInfo; + wnpNode->params[1].p = xorNode->results[0]; + wnpNode->params[2].v = parityStripeID; + wnpNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); + /* parityInfo must describe entire parity unit */ + RF_ASSERT(asmap->parityInfo->next == NULL); + + if (nfaults == 2) { + /* + * We never try to recycle a buffer for the Q calcuation + * in addition to the parity. This would cause two buffers + * to get smashed during the P and Q calculation, guaranteeing + * one would be wrong. + */ + RF_CallocAndAdd(xorNode->results[1], 1, + rf_RaidAddressToByte(raidPtr, raidPtr->Layout.sectorsPerStripeUnit), + (void *),allocList); + rf_InitNode(wnqNode, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, + rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnq", allocList); + wnqNode->params[0].p = asmap->qInfo; + wnqNode->params[1].p = xorNode->results[1]; + wnqNode->params[2].v = parityStripeID; + wnqNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); + /* parityInfo must describe entire parity unit */ + RF_ASSERT(asmap->parityInfo->next == NULL); + } + + /* + * Connect nodes to form graph. 
+ */ + + /* connect dag header to block node */ + RF_ASSERT(blockNode->numAntecedents == 0); + dag_h->succedents[0] = blockNode; + + if (nRodNodes > 0) { + /* connect the block node to the Rod nodes */ + RF_ASSERT(blockNode->numSuccedents == nRodNodes); + RF_ASSERT(xorNode->numAntecedents == nRodNodes); + for (i = 0; i < nRodNodes; i++) { + RF_ASSERT(rodNodes[i].numAntecedents == 1); + blockNode->succedents[i] = &rodNodes[i]; + rodNodes[i].antecedents[0] = blockNode; + rodNodes[i].antType[0] = rf_control; + + /* connect the Rod nodes to the Xor node */ + RF_ASSERT(rodNodes[i].numSuccedents == 1); + rodNodes[i].succedents[0] = xorNode; + xorNode->antecedents[i] = &rodNodes[i]; + xorNode->antType[i] = rf_trueData; + } + } + else { + /* connect the block node to the Xor node */ + RF_ASSERT(blockNode->numSuccedents == 1); + RF_ASSERT(xorNode->numAntecedents == 1); + blockNode->succedents[0] = xorNode; + xorNode->antecedents[0] = blockNode; + xorNode->antType[0] = rf_control; + } + + /* connect the xor node to the commit node */ + RF_ASSERT(xorNode->numSuccedents == 1); + RF_ASSERT(commitNode->numAntecedents == 1); + xorNode->succedents[0] = commitNode; + commitNode->antecedents[0] = xorNode; + commitNode->antType[0] = rf_control; + + /* connect the commit node to the write nodes */ + RF_ASSERT(commitNode->numSuccedents == nWndNodes + nfaults); + for (i = 0; i < nWndNodes; i++) { + RF_ASSERT(wndNodes->numAntecedents == 1); + commitNode->succedents[i] = &wndNodes[i]; + wndNodes[i].antecedents[0] = commitNode; + wndNodes[i].antType[0] = rf_control; + } + RF_ASSERT(wnpNode->numAntecedents == 1); + commitNode->succedents[nWndNodes] = wnpNode; + wnpNode->antecedents[0]= commitNode; + wnpNode->antType[0] = rf_trueData; + if (nfaults == 2) { + RF_ASSERT(wnqNode->numAntecedents == 1); + commitNode->succedents[nWndNodes + 1] = wnqNode; + wnqNode->antecedents[0] = commitNode; + wnqNode->antType[0] = rf_trueData; + } + + /* connect the write nodes to the term node */ + RF_ASSERT(termNode->numAntecedents == nWndNodes + nfaults); + RF_ASSERT(termNode->numSuccedents == 0); + for (i = 0; i < nWndNodes; i++) { + RF_ASSERT(wndNodes->numSuccedents == 1); + wndNodes[i].succedents[0] = termNode; + termNode->antecedents[i] = &wndNodes[i]; + termNode->antType[i] = rf_control; + } + RF_ASSERT(wnpNode->numSuccedents == 1); + wnpNode->succedents[0] = termNode; + termNode->antecedents[nWndNodes] = wnpNode; + termNode->antType[nWndNodes] = rf_control; + if (nfaults == 2) { + RF_ASSERT(wnqNode->numSuccedents == 1); + wnqNode->succedents[0] = termNode; + termNode->antecedents[nWndNodes + 1] = wnqNode; + termNode->antType[nWndNodes + 1] = rf_control; + } +} + +/****************************************************************************** + * + * creates a DAG to perform a small-write operation (either raid 5 or pq), + * which is as follows: + * + * Hdr -> Nil -> Rop -> Xor -> Cmt ----> Wnp [Unp] --> Trm + * \- Rod X / \----> Wnd [Und]-/ + * [\- Rod X / \---> Wnd [Und]-/] + * [\- Roq -> Q / \--> Wnq [Unq]-/] + * + * Rop = read old parity + * Rod = read old data + * Roq = read old "q" + * Cmt = commit node + * Und = unlock data disk + * Unp = unlock parity disk + * Unq = unlock q disk + * Wnp = write new parity + * Wnd = write new data + * Wnq = write new "q" + * [ ] denotes optional segments in the graph + * + * Parameters: raidPtr - description of the physical array + * asmap - logical & physical addresses for this access + * bp - buffer ptr (holds write data) + * flags - general flags (e.g. 
disk locking) + * allocList - list of memory allocated in DAG creation + * pfuncs - list of parity generating functions + * qfuncs - list of q generating functions + * + * A null qfuncs indicates single fault tolerant + *****************************************************************************/ + +void rf_CommonCreateSmallWriteDAG( + RF_Raid_t *raidPtr, + RF_AccessStripeMap_t *asmap, + RF_DagHeader_t *dag_h, + void *bp, + RF_RaidAccessFlags_t flags, + RF_AllocListElem_t *allocList, + RF_RedFuncs_t *pfuncs, + RF_RedFuncs_t *qfuncs) +{ + RF_DagNode_t *readDataNodes, *readParityNodes, *readQNodes, *termNode; + RF_DagNode_t *unlockDataNodes, *unlockParityNodes, *unlockQNodes; + RF_DagNode_t *xorNodes, *qNodes, *blockNode, *commitNode, *nodes; + RF_DagNode_t *writeDataNodes, *writeParityNodes, *writeQNodes; + int i, j, nNodes, totalNumNodes, lu_flag; + RF_ReconUnitNum_t which_ru; + int (*func)(RF_DagNode_t *), (*undoFunc)(RF_DagNode_t *); + int (*qfunc)(RF_DagNode_t *); + int numDataNodes, numParityNodes; + RF_StripeNum_t parityStripeID; + RF_PhysDiskAddr_t *pda; + char *name, *qname; + long nfaults; + + nfaults = qfuncs ? 2 : 1; + lu_flag = (rf_enableAtomicRMW) ? 1 : 0; /* lock/unlock flag */ + + parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout), + asmap->raidAddress, &which_ru); + pda = asmap->physInfo; + numDataNodes = asmap->numStripeUnitsAccessed; + numParityNodes = (asmap->parityInfo->next) ? 2 : 1; + + if (rf_dagDebug) { + printf("[Creating small-write DAG]\n"); + } + RF_ASSERT(numDataNodes > 0); + dag_h->creator = "SmallWriteDAG"; + + dag_h->numCommitNodes = 1; + dag_h->numCommits = 0; + dag_h->numSuccedents = 1; + + /* + * DAG creation occurs in four steps: + * 1. count the number of nodes in the DAG + * 2. create the nodes + * 3. initialize the nodes + * 4. connect the nodes + */ + + /* + * Step 1. compute number of nodes in the graph + */ + + /* number of nodes: + * a read and write for each data unit + * a redundancy computation node for each parity node (nfaults * nparity) + * a read and write for each parity unit + * a block and commit node (2) + * a terminate node + * if atomic RMW + * an unlock node for each data unit, redundancy unit + */ + totalNumNodes = (2 * numDataNodes) + (nfaults * numParityNodes) + + (nfaults * 2 * numParityNodes) + 3; + if (lu_flag) { + totalNumNodes += (numDataNodes + (nfaults * numParityNodes)); + } + + /* + * Step 2. create the nodes + */ + RF_CallocAndAdd(nodes, totalNumNodes, sizeof(RF_DagNode_t), + (RF_DagNode_t *), allocList); + i = 0; + blockNode = &nodes[i]; i += 1; + commitNode = &nodes[i]; i += 1; + readDataNodes = &nodes[i]; i += numDataNodes; + readParityNodes = &nodes[i]; i += numParityNodes; + writeDataNodes = &nodes[i]; i += numDataNodes; + writeParityNodes = &nodes[i]; i += numParityNodes; + xorNodes = &nodes[i]; i += numParityNodes; + termNode = &nodes[i]; i += 1; + if (lu_flag) { + unlockDataNodes = &nodes[i]; i += numDataNodes; + unlockParityNodes = &nodes[i]; i += numParityNodes; + } + else { + unlockDataNodes = unlockParityNodes = NULL; + } + if (nfaults == 2) { + readQNodes = &nodes[i]; i += numParityNodes; + writeQNodes = &nodes[i]; i += numParityNodes; + qNodes = &nodes[i]; i += numParityNodes; + if (lu_flag) { + unlockQNodes = &nodes[i]; i += numParityNodes; + } + else { + unlockQNodes = NULL; + } + } + else { + readQNodes = writeQNodes = qNodes = unlockQNodes = NULL; + } + RF_ASSERT(i == totalNumNodes); + + /* + * Step 3. 
initialize the nodes + */ + /* initialize block node (Nil) */ + nNodes = numDataNodes + (nfaults * numParityNodes); + rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, + NULL, nNodes, 0, 0, 0, dag_h, "Nil", allocList); + + /* initialize commit node (Cmt) */ + rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc, + NULL, nNodes, (nfaults * numParityNodes), 0, 0, dag_h, "Cmt", allocList); + + /* initialize terminate node (Trm) */ + rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, + NULL, 0, nNodes, 0, 0, dag_h, "Trm", allocList); + + /* initialize nodes which read old data (Rod) */ + for (i = 0; i < numDataNodes; i++) { + rf_InitNode(&readDataNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, + rf_GenericWakeupFunc, (nfaults * numParityNodes), 1, 4, 0, dag_h, + "Rod", allocList); + RF_ASSERT(pda != NULL); + /* physical disk addr desc */ + readDataNodes[i].params[0].p = pda; + /* buffer to hold old data */ + readDataNodes[i].params[1].p = rf_AllocBuffer(raidPtr, + dag_h, pda, allocList); + readDataNodes[i].params[2].v = parityStripeID; + readDataNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, + lu_flag, 0, which_ru); + pda = pda->next; + for (j = 0; j < readDataNodes[i].numSuccedents; j++) { + readDataNodes[i].propList[j] = NULL; + } + } + + /* initialize nodes which read old parity (Rop) */ + pda = asmap->parityInfo; i = 0; + for (i = 0; i < numParityNodes; i++) { + RF_ASSERT(pda != NULL); + rf_InitNode(&readParityNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc, + rf_DiskReadUndoFunc, rf_GenericWakeupFunc, numParityNodes, 1, 4, + 0, dag_h, "Rop", allocList); + readParityNodes[i].params[0].p = pda; + /* buffer to hold old parity */ + readParityNodes[i].params[1].p = rf_AllocBuffer(raidPtr, + dag_h, pda, allocList); + readParityNodes[i].params[2].v = parityStripeID; + readParityNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, + lu_flag, 0, which_ru); + pda = pda->next; + for (j = 0; j < readParityNodes[i].numSuccedents; j++) { + readParityNodes[i].propList[0] = NULL; + } + } + + /* initialize nodes which read old Q (Roq) */ + if (nfaults == 2) { + pda = asmap->qInfo; + for (i = 0; i < numParityNodes; i++) { + RF_ASSERT(pda != NULL); + rf_InitNode(&readQNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, + rf_GenericWakeupFunc, numParityNodes, 1, 4, 0, dag_h, "Roq", allocList); + readQNodes[i].params[0].p = pda; + /* buffer to hold old Q */ + readQNodes[i].params[1].p = rf_AllocBuffer(raidPtr, dag_h, pda, + allocList); + readQNodes[i].params[2].v = parityStripeID; + readQNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, + lu_flag, 0, which_ru); + pda = pda->next; + for (j = 0; j < readQNodes[i].numSuccedents; j++) { + readQNodes[i].propList[0] = NULL; + } + } + } + + /* initialize nodes which write new data (Wnd) */ + pda = asmap->physInfo; + for (i=0; i < numDataNodes; i++) { + RF_ASSERT(pda != NULL); + rf_InitNode(&writeDataNodes[i], rf_wait, RF_FALSE, rf_DiskWriteFunc, + rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, + "Wnd", allocList); + /* physical disk addr desc */ + writeDataNodes[i].params[0].p = pda; + /* buffer holding new data to be written */ + writeDataNodes[i].params[1].p = pda->bufPtr; + writeDataNodes[i].params[2].v = parityStripeID; + writeDataNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, + 0, 0, which_ru); + if (lu_flag) { + /* initialize node to unlock the disk queue */ + 
rf_InitNode(&unlockDataNodes[i], rf_wait, RF_FALSE, rf_DiskUnlockFunc, + rf_DiskUnlockUndoFunc, rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h, + "Und", allocList); + /* physical disk addr desc */ + unlockDataNodes[i].params[0].p = pda; + unlockDataNodes[i].params[1].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, + 0, lu_flag, which_ru); + } + pda = pda->next; + } + + /* + * Initialize nodes which compute new parity and Q. + */ + /* + * We use the simple XOR func in the double-XOR case, and when + * we're accessing only a portion of one stripe unit. The distinction + * between the two is that the regular XOR func assumes that the targbuf + * is a full SU in size, and examines the pda associated with the buffer + * to decide where within the buffer to XOR the data, whereas + * the simple XOR func just XORs the data into the start of the buffer. + */ + if ((numParityNodes==2) || ((numDataNodes == 1) + && (asmap->totalSectorsAccessed < raidPtr->Layout.sectorsPerStripeUnit))) + { + func = pfuncs->simple; undoFunc = rf_NullNodeUndoFunc; name = pfuncs->SimpleName; + if (qfuncs) { + qfunc = qfuncs->simple; + qname = qfuncs->SimpleName; + } + else { + qfunc = NULL; + qname = NULL; + } + } + else { + func = pfuncs->regular; + undoFunc = rf_NullNodeUndoFunc; + name = pfuncs->RegularName; + if (qfuncs) { + qfunc = qfuncs->regular; + qname = qfuncs->RegularName; + } + else { + qfunc = NULL; + qname = NULL; + } + } + /* + * Initialize the xor nodes: params are {pda,buf} + * from {Rod,Wnd,Rop} nodes, and raidPtr + */ + if (numParityNodes==2) { + /* double-xor case */ + for (i=0; i < numParityNodes; i++) { + /* note: no wakeup func for xor */ + rf_InitNode(&xorNodes[i], rf_wait, RF_FALSE, func, undoFunc, NULL, + 1, (numDataNodes + numParityNodes), 7, 1, dag_h, name, allocList); + xorNodes[i].flags |= RF_DAGNODE_FLAG_YIELD; + xorNodes[i].params[0] = readDataNodes[i].params[0]; + xorNodes[i].params[1] = readDataNodes[i].params[1]; + xorNodes[i].params[2] = readParityNodes[i].params[0]; + xorNodes[i].params[3] = readParityNodes[i].params[1]; + xorNodes[i].params[4] = writeDataNodes[i].params[0]; + xorNodes[i].params[5] = writeDataNodes[i].params[1]; + xorNodes[i].params[6].p = raidPtr; + /* use old parity buf as target buf */ + xorNodes[i].results[0] = readParityNodes[i].params[1].p; + if (nfaults == 2) { + /* note: no wakeup func for qor */ + rf_InitNode(&qNodes[i], rf_wait, RF_FALSE, qfunc, undoFunc, NULL, 1, + (numDataNodes + numParityNodes), 7, 1, dag_h, qname, allocList); + qNodes[i].params[0] = readDataNodes[i].params[0]; + qNodes[i].params[1] = readDataNodes[i].params[1]; + qNodes[i].params[2] = readQNodes[i].params[0]; + qNodes[i].params[3] = readQNodes[i].params[1]; + qNodes[i].params[4] = writeDataNodes[i].params[0]; + qNodes[i].params[5] = writeDataNodes[i].params[1]; + qNodes[i].params[6].p = raidPtr; + /* use old Q buf as target buf */ + qNodes[i].results[0] = readQNodes[i].params[1].p; + } + } + } + else { + /* there is only one xor node in this case */ + rf_InitNode(&xorNodes[0], rf_wait, RF_FALSE, func, undoFunc, NULL, 1, + (numDataNodes + numParityNodes), + (2 * (numDataNodes + numDataNodes + 1) + 1), 1, dag_h, name, allocList); + xorNodes[0].flags |= RF_DAGNODE_FLAG_YIELD; + for (i=0; i < numDataNodes + 1; i++) { + /* set up params related to Rod and Rop nodes */ + xorNodes[0].params[2*i+0] = readDataNodes[i].params[0]; /* pda */ + xorNodes[0].params[2*i+1] = readDataNodes[i].params[1]; /* buffer ptr */ + } + for (i=0; i < numDataNodes; i++) { + /* set up params related to Wnd and Wnp nodes */ + 
xorNodes[0].params[2*(numDataNodes+1+i)+0] = /* pda */ + writeDataNodes[i].params[0]; + xorNodes[0].params[2*(numDataNodes+1+i)+1] = /* buffer ptr */ + writeDataNodes[i].params[1]; + } + /* xor node needs to get at RAID information */ + xorNodes[0].params[2*(numDataNodes+numDataNodes+1)].p = raidPtr; + xorNodes[0].results[0] = readParityNodes[0].params[1].p; + if (nfaults == 2) { + rf_InitNode(&qNodes[0], rf_wait, RF_FALSE, qfunc, undoFunc, NULL, 1, + (numDataNodes + numParityNodes), + (2 * (numDataNodes + numDataNodes + 1) + 1), 1, dag_h, + qname, allocList); + for (i=0; i<numDataNodes; i++) { + /* set up params related to Rod */ + qNodes[0].params[2*i+0] = readDataNodes[i].params[0]; /* pda */ + qNodes[0].params[2*i+1] = readDataNodes[i].params[1]; /* buffer ptr */ + } + /* and read old q */ + qNodes[0].params[2*numDataNodes + 0] = /* pda */ + readQNodes[0].params[0]; + qNodes[0].params[2*numDataNodes + 1] = /* buffer ptr */ + readQNodes[0].params[1]; + for (i=0; i < numDataNodes; i++) { + /* set up params related to Wnd nodes */ + qNodes[0].params[2*(numDataNodes+1+i)+0] = /* pda */ + writeDataNodes[i].params[0]; + qNodes[0].params[2*(numDataNodes+1+i)+1] = /* buffer ptr */ + writeDataNodes[i].params[1]; + } + /* xor node needs to get at RAID information */ + qNodes[0].params[2*(numDataNodes+numDataNodes+1)].p = raidPtr; + qNodes[0].results[0] = readQNodes[0].params[1].p; + } + } + + /* initialize nodes which write new parity (Wnp) */ + pda = asmap->parityInfo; + for (i=0; i < numParityNodes; i++) { + rf_InitNode(&writeParityNodes[i], rf_wait, RF_FALSE, rf_DiskWriteFunc, + rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, + "Wnp", allocList); + RF_ASSERT(pda != NULL); + writeParityNodes[i].params[0].p = pda; /* param 1 (bufPtr) filled in by xor node */ + writeParityNodes[i].params[1].p = xorNodes[i].results[0]; /* buffer pointer for parity write operation */ + writeParityNodes[i].params[2].v = parityStripeID; + writeParityNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, + 0, 0, which_ru); + if (lu_flag) { + /* initialize node to unlock the disk queue */ + rf_InitNode(&unlockParityNodes[i], rf_wait, RF_FALSE, rf_DiskUnlockFunc, + rf_DiskUnlockUndoFunc, rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h, + "Unp", allocList); + unlockParityNodes[i].params[0].p = pda; /* physical disk addr desc */ + unlockParityNodes[i].params[1].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, + 0, lu_flag, which_ru); + } + pda = pda->next; + } + + /* initialize nodes which write new Q (Wnq) */ + if (nfaults == 2) { + pda = asmap->qInfo; + for (i=0; i < numParityNodes; i++) { + rf_InitNode(&writeQNodes[i], rf_wait, RF_FALSE, rf_DiskWriteFunc, + rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, + "Wnq", allocList); + RF_ASSERT(pda != NULL); + writeQNodes[i].params[0].p = pda; /* param 1 (bufPtr) filled in by xor node */ + writeQNodes[i].params[1].p = qNodes[i].results[0]; /* buffer pointer for parity write operation */ + writeQNodes[i].params[2].v = parityStripeID; + writeQNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, + 0, 0, which_ru); + if (lu_flag) { + /* initialize node to unlock the disk queue */ + rf_InitNode(&unlockQNodes[i], rf_wait, RF_FALSE, rf_DiskUnlockFunc, + rf_DiskUnlockUndoFunc, rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h, + "Unq", allocList); + unlockQNodes[i].params[0].p = pda; /* physical disk addr desc */ + unlockQNodes[i].params[1].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, + 0, lu_flag, which_ru); + } + pda = pda->next; + } + } + + /* + * Step 4. 
connect the nodes. + */ + + /* connect header to block node */ + dag_h->succedents[0] = blockNode; + + /* connect block node to read old data nodes */ + RF_ASSERT(blockNode->numSuccedents == (numDataNodes + (numParityNodes * nfaults))); + for (i = 0; i < numDataNodes; i++) { + blockNode->succedents[i] = &readDataNodes[i]; + RF_ASSERT(readDataNodes[i].numAntecedents == 1); + readDataNodes[i].antecedents[0]= blockNode; + readDataNodes[i].antType[0] = rf_control; + } + + /* connect block node to read old parity nodes */ + for (i = 0; i < numParityNodes; i++) { + blockNode->succedents[numDataNodes + i] = &readParityNodes[i]; + RF_ASSERT(readParityNodes[i].numAntecedents == 1); + readParityNodes[i].antecedents[0] = blockNode; + readParityNodes[i].antType[0] = rf_control; + } + + /* connect block node to read old Q nodes */ + if (nfaults == 2) { + for (i = 0; i < numParityNodes; i++) { + blockNode->succedents[numDataNodes + numParityNodes + i] = &readQNodes[i]; + RF_ASSERT(readQNodes[i].numAntecedents == 1); + readQNodes[i].antecedents[0] = blockNode; + readQNodes[i].antType[0] = rf_control; + } + } + + /* connect read old data nodes to xor nodes */ + for (i = 0; i < numDataNodes; i++) { + RF_ASSERT(readDataNodes[i].numSuccedents == (nfaults * numParityNodes)); + for (j = 0; j < numParityNodes; j++){ + RF_ASSERT(xorNodes[j].numAntecedents == numDataNodes + numParityNodes); + readDataNodes[i].succedents[j] = &xorNodes[j]; + xorNodes[j].antecedents[i] = &readDataNodes[i]; + xorNodes[j].antType[i] = rf_trueData; + } + } + + /* connect read old data nodes to q nodes */ + if (nfaults == 2) { + for (i = 0; i < numDataNodes; i++) { + for (j = 0; j < numParityNodes; j++) { + RF_ASSERT(qNodes[j].numAntecedents == numDataNodes + numParityNodes); + readDataNodes[i].succedents[numParityNodes + j] = &qNodes[j]; + qNodes[j].antecedents[i] = &readDataNodes[i]; + qNodes[j].antType[i] = rf_trueData; + } + } + } + + /* connect read old parity nodes to xor nodes */ + for (i = 0; i < numParityNodes; i++) { + RF_ASSERT(readParityNodes[i].numSuccedents == numParityNodes); + for (j = 0; j < numParityNodes; j++) { + readParityNodes[i].succedents[j] = &xorNodes[j]; + xorNodes[j].antecedents[numDataNodes + i] = &readParityNodes[i]; + xorNodes[j].antType[numDataNodes + i] = rf_trueData; + } + } + + /* connect read old q nodes to q nodes */ + if (nfaults == 2) { + for (i = 0; i < numParityNodes; i++) { + RF_ASSERT(readParityNodes[i].numSuccedents == numParityNodes); + for (j = 0; j < numParityNodes; j++) { + readQNodes[i].succedents[j] = &qNodes[j]; + qNodes[j].antecedents[numDataNodes + i] = &readQNodes[i]; + qNodes[j].antType[numDataNodes + i] = rf_trueData; + } + } + } + + /* connect xor nodes to commit node */ + RF_ASSERT(commitNode->numAntecedents == (nfaults * numParityNodes)); + for (i = 0; i < numParityNodes; i++) { + RF_ASSERT(xorNodes[i].numSuccedents == 1); + xorNodes[i].succedents[0] = commitNode; + commitNode->antecedents[i] = &xorNodes[i]; + commitNode->antType[i] = rf_control; + } + + /* connect q nodes to commit node */ + if (nfaults == 2) { + for (i = 0; i < numParityNodes; i++) { + RF_ASSERT(qNodes[i].numSuccedents == 1); + qNodes[i].succedents[0] = commitNode; + commitNode->antecedents[i + numParityNodes] = &qNodes[i]; + commitNode->antType[i + numParityNodes] = rf_control; + } + } + + /* connect commit node to write nodes */ + RF_ASSERT(commitNode->numSuccedents == (numDataNodes + (nfaults * numParityNodes))); + for (i = 0; i < numDataNodes; i++) { + RF_ASSERT(writeDataNodes[i].numAntecedents == 1); + 
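/*
 * Added note (illustrative, not part of the original import): every arc
 * in these graphs is recorded twice, once in the predecessor's
 * succedents[] array and once in the successor's antecedents[] and
 * antType[] arrays, and the RF_ASSERTs check the slot counts that were
 * declared to rf_InitNode().  The pattern used throughout is roughly:
 *
 *     from->succedents[fromSlot] = to;
 *     to->antecedents[toSlot]    = from;
 *     to->antType[toSlot]        = type;   (rf_control, rf_trueData, ...)
 */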
commitNode->succedents[i] = &writeDataNodes[i]; + writeDataNodes[i].antecedents[0] = commitNode; + writeDataNodes[i].antType[0] = rf_trueData; + } + for (i = 0; i < numParityNodes; i++) { + RF_ASSERT(writeParityNodes[i].numAntecedents == 1); + commitNode->succedents[i + numDataNodes] = &writeParityNodes[i]; + writeParityNodes[i].antecedents[0] = commitNode; + writeParityNodes[i].antType[0] = rf_trueData; + } + if (nfaults == 2) { + for (i = 0; i < numParityNodes; i++) { + RF_ASSERT(writeQNodes[i].numAntecedents == 1); + commitNode->succedents[i + numDataNodes + numParityNodes] = &writeQNodes[i]; + writeQNodes[i].antecedents[0] = commitNode; + writeQNodes[i].antType[0] = rf_trueData; + } + } + + RF_ASSERT(termNode->numAntecedents == (numDataNodes + (nfaults * numParityNodes))); + RF_ASSERT(termNode->numSuccedents == 0); + for (i = 0; i < numDataNodes; i++) { + if (lu_flag) { + /* connect write new data nodes to unlock nodes */ + RF_ASSERT(writeDataNodes[i].numSuccedents == 1); + RF_ASSERT(unlockDataNodes[i].numAntecedents == 1); + writeDataNodes[i].succedents[0] = &unlockDataNodes[i]; + unlockDataNodes[i].antecedents[0] = &writeDataNodes[i]; + unlockDataNodes[i].antType[0] = rf_control; + + /* connect unlock nodes to term node */ + RF_ASSERT(unlockDataNodes[i].numSuccedents == 1); + unlockDataNodes[i].succedents[0] = termNode; + termNode->antecedents[i] = &unlockDataNodes[i]; + termNode->antType[i] = rf_control; + } + else { + /* connect write new data nodes to term node */ + RF_ASSERT(writeDataNodes[i].numSuccedents == 1); + RF_ASSERT(termNode->numAntecedents == (numDataNodes + (nfaults * numParityNodes))); + writeDataNodes[i].succedents[0] = termNode; + termNode->antecedents[i] = &writeDataNodes[i]; + termNode->antType[i] = rf_control; + } + } + + for (i = 0; i < numParityNodes; i++) { + if (lu_flag) { + /* connect write new parity nodes to unlock nodes */ + RF_ASSERT(writeParityNodes[i].numSuccedents == 1); + RF_ASSERT(unlockParityNodes[i].numAntecedents == 1); + writeParityNodes[i].succedents[0] = &unlockParityNodes[i]; + unlockParityNodes[i].antecedents[0] = &writeParityNodes[i]; + unlockParityNodes[i].antType[0] = rf_control; + + /* connect unlock nodes to term node */ + RF_ASSERT(unlockParityNodes[i].numSuccedents == 1); + unlockParityNodes[i].succedents[0] = termNode; + termNode->antecedents[numDataNodes + i] = &unlockParityNodes[i]; + termNode->antType[numDataNodes + i] = rf_control; + } + else { + RF_ASSERT(writeParityNodes[i].numSuccedents == 1); + writeParityNodes[i].succedents[0] = termNode; + termNode->antecedents[numDataNodes + i] = &writeParityNodes[i]; + termNode->antType[numDataNodes + i] = rf_control; + } + } + + if (nfaults == 2) { + for (i = 0; i < numParityNodes; i++) { + if (lu_flag) { + /* connect write new Q nodes to unlock nodes */ + RF_ASSERT(writeQNodes[i].numSuccedents == 1); + RF_ASSERT(unlockQNodes[i].numAntecedents == 1); + writeQNodes[i].succedents[0] = &unlockQNodes[i]; + unlockQNodes[i].antecedents[0] = &writeQNodes[i]; + unlockQNodes[i].antType[0] = rf_control; + + /* connect unlock nodes to unblock node */ + RF_ASSERT(unlockQNodes[i].numSuccedents == 1); + unlockQNodes[i].succedents[0] = termNode; + termNode->antecedents[numDataNodes + numParityNodes + i] = &unlockQNodes[i]; + termNode->antType[numDataNodes + numParityNodes + i] = rf_control; + } + else { + RF_ASSERT(writeQNodes[i].numSuccedents == 1); + writeQNodes[i].succedents[0] = termNode; + termNode->antecedents[numDataNodes + numParityNodes + i] = &writeQNodes[i]; + termNode->antType[numDataNodes 
+ numParityNodes + i] = rf_control; + } + } + } +} + + +/****************************************************************************** + * create a write graph (fault-free or degraded) for RAID level 1 + * + * Hdr -> Commit -> Wpd -> Nil -> Trm + * -> Wsd -> + * + * The "Wpd" node writes data to the primary copy in the mirror pair + * The "Wsd" node writes data to the secondary copy in the mirror pair + * + * Parameters: raidPtr - description of the physical array + * asmap - logical & physical addresses for this access + * bp - buffer ptr (holds write data) + * flags - general flags (e.g. disk locking) + * allocList - list of memory allocated in DAG creation + *****************************************************************************/ + +void rf_CreateRaidOneWriteDAG( + RF_Raid_t *raidPtr, + RF_AccessStripeMap_t *asmap, + RF_DagHeader_t *dag_h, + void *bp, + RF_RaidAccessFlags_t flags, + RF_AllocListElem_t *allocList) +{ + RF_DagNode_t *unblockNode, *termNode, *commitNode; + RF_DagNode_t *nodes, *wndNode, *wmirNode; + int nWndNodes, nWmirNodes, i; + RF_ReconUnitNum_t which_ru; + RF_PhysDiskAddr_t *pda, *pdaP; + RF_StripeNum_t parityStripeID; + + parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout), + asmap->raidAddress, &which_ru); + if (rf_dagDebug) { + printf("[Creating RAID level 1 write DAG]\n"); + } + dag_h->creator = "RaidOneWriteDAG"; + + /* 2 implies access not SU aligned */ + nWmirNodes = (asmap->parityInfo->next) ? 2 : 1; + nWndNodes = (asmap->physInfo->next) ? 2 : 1; + + /* alloc the Wnd nodes and the Wmir node */ + if (asmap->numDataFailed == 1) + nWndNodes--; + if (asmap->numParityFailed == 1) + nWmirNodes--; + + /* total number of nodes = nWndNodes + nWmirNodes + (commit + unblock + terminator) */ + RF_CallocAndAdd(nodes, nWndNodes + nWmirNodes + 3, sizeof(RF_DagNode_t), + (RF_DagNode_t *), allocList); + i = 0; + wndNode = &nodes[i]; i += nWndNodes; + wmirNode = &nodes[i]; i += nWmirNodes; + commitNode = &nodes[i]; i += 1; + unblockNode = &nodes[i]; i += 1; + termNode = &nodes[i]; i += 1; + RF_ASSERT(i == (nWndNodes + nWmirNodes + 3)); + + /* this dag can commit immediately */ + dag_h->numCommitNodes = 1; + dag_h->numCommits = 0; + dag_h->numSuccedents = 1; + + /* initialize the commit, unblock, and term nodes */ + rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc, + NULL, (nWndNodes + nWmirNodes), 0, 0, 0, dag_h, "Cmt", allocList); + rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, + NULL, 1, (nWndNodes + nWmirNodes), 0, 0, dag_h, "Nil", allocList); + rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, + NULL, 0, 1, 0, 0, dag_h, "Trm", allocList); + + /* initialize the wnd nodes */ + if (nWndNodes > 0) { + pda = asmap->physInfo; + for (i = 0; i < nWndNodes; i++) { + rf_InitNode(&wndNode[i], rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, + rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wpd", allocList); + RF_ASSERT(pda != NULL); + wndNode[i].params[0].p = pda; + wndNode[i].params[1].p = pda->bufPtr; + wndNode[i].params[2].v = parityStripeID; + wndNode[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); + pda = pda->next; + } + RF_ASSERT(pda == NULL); + } + + /* initialize the mirror nodes */ + if (nWmirNodes > 0) { + pda = asmap->physInfo; + pdaP = asmap->parityInfo; + for (i = 0; i < nWmirNodes; i++) { + rf_InitNode(&wmirNode[i], rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, + rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wsd", 
allocList); + RF_ASSERT(pda != NULL); + wmirNode[i].params[0].p = pdaP; + wmirNode[i].params[1].p = pda->bufPtr; + wmirNode[i].params[2].v = parityStripeID; + wmirNode[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); + pda = pda->next; + pdaP = pdaP->next; + } + RF_ASSERT(pda == NULL); + RF_ASSERT(pdaP == NULL); + } + + /* link the header node to the commit node */ + RF_ASSERT(dag_h->numSuccedents == 1); + RF_ASSERT(commitNode->numAntecedents == 0); + dag_h->succedents[0] = commitNode; + + /* link the commit node to the write nodes */ + RF_ASSERT(commitNode->numSuccedents == (nWndNodes + nWmirNodes)); + for (i = 0; i < nWndNodes; i++) { + RF_ASSERT(wndNode[i].numAntecedents == 1); + commitNode->succedents[i] = &wndNode[i]; + wndNode[i].antecedents[0] = commitNode; + wndNode[i].antType[0] = rf_control; + } + for (i = 0; i < nWmirNodes; i++) { + RF_ASSERT(wmirNode[i].numAntecedents == 1); + commitNode->succedents[i + nWndNodes] = &wmirNode[i]; + wmirNode[i].antecedents[0] = commitNode; + wmirNode[i].antType[0] = rf_control; + } + + /* link the write nodes to the unblock node */ + RF_ASSERT(unblockNode->numAntecedents == (nWndNodes + nWmirNodes)); + for (i = 0; i < nWndNodes; i++) { + RF_ASSERT(wndNode[i].numSuccedents == 1); + wndNode[i].succedents[0] = unblockNode; + unblockNode->antecedents[i] = &wndNode[i]; + unblockNode->antType[i] = rf_control; + } + for (i = 0; i < nWmirNodes; i++) { + RF_ASSERT(wmirNode[i].numSuccedents == 1); + wmirNode[i].succedents[0] = unblockNode; + unblockNode->antecedents[i + nWndNodes] = &wmirNode[i]; + unblockNode->antType[i + nWndNodes] = rf_control; + } + + /* link the unblock node to the term node */ + RF_ASSERT(unblockNode->numSuccedents == 1); + RF_ASSERT(termNode->numAntecedents == 1); + RF_ASSERT(termNode->numSuccedents == 0); + unblockNode->succedents[0] = termNode; + termNode->antecedents[0] = unblockNode; + termNode->antType[0] = rf_control; +} + + + +/* DAGs which have no commit points. + * + * The following DAGs are used in forward and backward error recovery experiments. + * They are identical to the DAGs above this comment with the exception that the + * the commit points have been removed. 
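 * (Added clarification: in the committed graphs, the Cmt node separates
 * the read and redundancy-computation phase from the writes, so no new
 * data or parity is written until everything feeding the commit point
 * has succeeded.  With the commit node removed, these experimental
 * graphs start each write as soon as that write's own antecedents
 * complete.)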
+ */ + + + +void rf_CommonCreateLargeWriteDAGFwd( + RF_Raid_t *raidPtr, + RF_AccessStripeMap_t *asmap, + RF_DagHeader_t *dag_h, + void *bp, + RF_RaidAccessFlags_t flags, + RF_AllocListElem_t *allocList, + int nfaults, + int (*redFunc)(RF_DagNode_t *), + int allowBufferRecycle) +{ + RF_DagNode_t *nodes, *wndNodes, *rodNodes, *xorNode, *wnpNode; + RF_DagNode_t *wnqNode, *blockNode, *syncNode, *termNode; + int nWndNodes, nRodNodes, i, nodeNum, asmNum; + RF_AccessStripeMapHeader_t *new_asm_h[2]; + RF_StripeNum_t parityStripeID; + char *sosBuffer, *eosBuffer; + RF_ReconUnitNum_t which_ru; + RF_RaidLayout_t *layoutPtr; + RF_PhysDiskAddr_t *pda; + + layoutPtr = &(raidPtr->Layout); + parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout), asmap->raidAddress, &which_ru); + + if (rf_dagDebug) + printf("[Creating large-write DAG]\n"); + dag_h->creator = "LargeWriteDAGFwd"; + + dag_h->numCommitNodes = 0; + dag_h->numCommits = 0; + dag_h->numSuccedents = 1; + + /* alloc the nodes: Wnd, xor, commit, block, term, and Wnp */ + nWndNodes = asmap->numStripeUnitsAccessed; + RF_CallocAndAdd(nodes, nWndNodes + 4 + nfaults, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList); + i = 0; + wndNodes = &nodes[i]; i += nWndNodes; + xorNode = &nodes[i]; i += 1; + wnpNode = &nodes[i]; i += 1; + blockNode = &nodes[i]; i += 1; + syncNode = &nodes[i]; i += 1; + termNode = &nodes[i]; i += 1; + if (nfaults == 2) { + wnqNode = &nodes[i]; i += 1; + } + else { + wnqNode = NULL; + } + rf_MapUnaccessedPortionOfStripe(raidPtr, layoutPtr, asmap, dag_h, new_asm_h, &nRodNodes, &sosBuffer, &eosBuffer, allocList); + if (nRodNodes > 0) { + RF_CallocAndAdd(rodNodes, nRodNodes, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList); + } + else { + rodNodes = NULL; + } + + /* begin node initialization */ + if (nRodNodes > 0) { + rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nRodNodes, 0, 0, 0, dag_h, "Nil", allocList); + rf_InitNode(syncNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nWndNodes + 1, nRodNodes, 0, 0, dag_h, "Nil", allocList); + } + else { + rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, 1, 0, 0, 0, dag_h, "Nil", allocList); + rf_InitNode(syncNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nWndNodes + 1, 1, 0, 0, dag_h, "Nil", allocList); + } + + rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, nWndNodes + nfaults, 0, 0, dag_h, "Trm", allocList); + + /* initialize the Rod nodes */ + for (nodeNum = asmNum = 0; asmNum < 2; asmNum++) { + if (new_asm_h[asmNum]) { + pda = new_asm_h[asmNum]->stripeMap->physInfo; + while (pda) { + rf_InitNode(&rodNodes[nodeNum], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rod", allocList); + rodNodes[nodeNum].params[0].p = pda; + rodNodes[nodeNum].params[1].p = pda->bufPtr; + rodNodes[nodeNum].params[2].v = parityStripeID; + rodNodes[nodeNum].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); + nodeNum++; + pda=pda->next; + } + } + } + RF_ASSERT(nodeNum == nRodNodes); + + /* initialize the wnd nodes */ + pda = asmap->physInfo; + for (i=0; i < nWndNodes; i++) { + rf_InitNode(&wndNodes[i], rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnd", allocList); + RF_ASSERT(pda != NULL); + wndNodes[i].params[0].p = pda; + wndNodes[i].params[1].p = pda->bufPtr; + wndNodes[i].params[2].v = parityStripeID; + 
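/*
 * Added note: the disk I/O nodes in these graphs (Rod, Wnd, Wnp, Wnq)
 * all take the same four parameters: params[0] is the physical disk
 * address descriptor, params[1] the buffer pointer, params[2] the
 * parity stripe ID, and params[3] the value packed by RF_CREATE_PARAM3
 * from the I/O priority, the disk-queue lock/unlock information, and
 * the reconstruction unit number.  In this fault-free large-write path
 * the lock/unlock flags are left zero.
 */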
wndNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); + pda = pda->next; + } + + /* initialize the redundancy node */ + rf_InitNode(xorNode, rf_wait, RF_FALSE, redFunc, rf_NullNodeUndoFunc, NULL, 1, nfaults, 2 * (nWndNodes + nRodNodes) + 1, nfaults, dag_h, "Xr ", allocList); + xorNode->flags |= RF_DAGNODE_FLAG_YIELD; + for (i=0; i < nWndNodes; i++) { + xorNode->params[2*i+0] = wndNodes[i].params[0]; /* pda */ + xorNode->params[2*i+1] = wndNodes[i].params[1]; /* buf ptr */ + } + for (i=0; i < nRodNodes; i++) { + xorNode->params[2*(nWndNodes+i)+0] = rodNodes[i].params[0]; /* pda */ + xorNode->params[2*(nWndNodes+i)+1] = rodNodes[i].params[1]; /* buf ptr */ + } + xorNode->params[2*(nWndNodes+nRodNodes)].p = raidPtr; /* xor node needs to get at RAID information */ + + /* look for an Rod node that reads a complete SU. If none, alloc a buffer to receive the parity info. + * Note that we can't use a new data buffer because it will not have gotten written when the xor occurs. + */ + if (allowBufferRecycle) { + for (i = 0; i < nRodNodes; i++) + if (((RF_PhysDiskAddr_t *) rodNodes[i].params[0].p)->numSector == raidPtr->Layout.sectorsPerStripeUnit) + break; + } + if ((!allowBufferRecycle) || (i == nRodNodes)) { + RF_CallocAndAdd(xorNode->results[0], 1, rf_RaidAddressToByte(raidPtr, raidPtr->Layout.sectorsPerStripeUnit), (void *), allocList); + } + else + xorNode->results[0] = rodNodes[i].params[1].p; + + /* initialize the Wnp node */ + rf_InitNode(wnpNode, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnp", allocList); + wnpNode->params[0].p = asmap->parityInfo; + wnpNode->params[1].p = xorNode->results[0]; + wnpNode->params[2].v = parityStripeID; + wnpNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); + RF_ASSERT(asmap->parityInfo->next == NULL); /* parityInfo must describe entire parity unit */ + + if (nfaults == 2) + { + /* we never try to recycle a buffer for the Q calcuation in addition to the parity. + This would cause two buffers to get smashed during the P and Q calculation, + guaranteeing one would be wrong. 
+ */ + RF_CallocAndAdd(xorNode->results[1], 1, rf_RaidAddressToByte(raidPtr, raidPtr->Layout.sectorsPerStripeUnit), (void *), allocList); + rf_InitNode(wnqNode, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnq", allocList); + wnqNode->params[0].p = asmap->qInfo; + wnqNode->params[1].p = xorNode->results[1]; + wnqNode->params[2].v = parityStripeID; + wnqNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); + RF_ASSERT(asmap->parityInfo->next == NULL); /* parityInfo must describe entire parity unit */ + } + + + /* connect nodes to form graph */ + + /* connect dag header to block node */ + RF_ASSERT(blockNode->numAntecedents == 0); + dag_h->succedents[0] = blockNode; + + if (nRodNodes > 0) { + /* connect the block node to the Rod nodes */ + RF_ASSERT(blockNode->numSuccedents == nRodNodes); + RF_ASSERT(syncNode->numAntecedents == nRodNodes); + for (i = 0; i < nRodNodes; i++) { + RF_ASSERT(rodNodes[i].numAntecedents == 1); + blockNode->succedents[i] = &rodNodes[i]; + rodNodes[i].antecedents[0] = blockNode; + rodNodes[i].antType[0] = rf_control; + + /* connect the Rod nodes to the Nil node */ + RF_ASSERT(rodNodes[i].numSuccedents == 1); + rodNodes[i].succedents[0] = syncNode; + syncNode->antecedents[i] = &rodNodes[i]; + syncNode->antType[i] = rf_trueData; + } + } + else { + /* connect the block node to the Nil node */ + RF_ASSERT(blockNode->numSuccedents == 1); + RF_ASSERT(syncNode->numAntecedents == 1); + blockNode->succedents[0] = syncNode; + syncNode->antecedents[0] = blockNode; + syncNode->antType[0] = rf_control; + } + + /* connect the sync node to the Wnd nodes */ + RF_ASSERT(syncNode->numSuccedents == (1 + nWndNodes)); + for (i = 0; i < nWndNodes; i++) { + RF_ASSERT(wndNodes->numAntecedents == 1); + syncNode->succedents[i] = &wndNodes[i]; + wndNodes[i].antecedents[0] = syncNode; + wndNodes[i].antType[0] = rf_control; + } + + /* connect the sync node to the Xor node */ + RF_ASSERT(xorNode->numAntecedents == 1); + syncNode->succedents[nWndNodes] = xorNode; + xorNode->antecedents[0] = syncNode; + xorNode->antType[0] = rf_control; + + /* connect the xor node to the write parity node */ + RF_ASSERT(xorNode->numSuccedents == nfaults); + RF_ASSERT(wnpNode->numAntecedents == 1); + xorNode->succedents[0] = wnpNode; + wnpNode->antecedents[0]= xorNode; + wnpNode->antType[0] = rf_trueData; + if (nfaults == 2) { + RF_ASSERT(wnqNode->numAntecedents == 1); + xorNode->succedents[1] = wnqNode; + wnqNode->antecedents[0] = xorNode; + wnqNode->antType[0] = rf_trueData; + } + + /* connect the write nodes to the term node */ + RF_ASSERT(termNode->numAntecedents == nWndNodes + nfaults); + RF_ASSERT(termNode->numSuccedents == 0); + for (i = 0; i < nWndNodes; i++) { + RF_ASSERT(wndNodes->numSuccedents == 1); + wndNodes[i].succedents[0] = termNode; + termNode->antecedents[i] = &wndNodes[i]; + termNode->antType[i] = rf_control; + } + RF_ASSERT(wnpNode->numSuccedents == 1); + wnpNode->succedents[0] = termNode; + termNode->antecedents[nWndNodes] = wnpNode; + termNode->antType[nWndNodes] = rf_control; + if (nfaults == 2) { + RF_ASSERT(wnqNode->numSuccedents == 1); + wnqNode->succedents[0] = termNode; + termNode->antecedents[nWndNodes + 1] = wnqNode; + termNode->antType[nWndNodes + 1] = rf_control; + } +} + + +/****************************************************************************** + * + * creates a DAG to perform a small-write operation (either raid 5 or pq), + * which is as follows: + * + * Hdr -> Nil -> Rop - Xor - Wnp [Unp] -- Trm 
+ * \- Rod X- Wnd [Und] -------/ + * [\- Rod X- Wnd [Und] ------/] + * [\- Roq - Q --> Wnq [Unq]-/] + * + * Rop = read old parity + * Rod = read old data + * Roq = read old "q" + * Cmt = commit node + * Und = unlock data disk + * Unp = unlock parity disk + * Unq = unlock q disk + * Wnp = write new parity + * Wnd = write new data + * Wnq = write new "q" + * [ ] denotes optional segments in the graph + * + * Parameters: raidPtr - description of the physical array + * asmap - logical & physical addresses for this access + * bp - buffer ptr (holds write data) + * flags - general flags (e.g. disk locking) + * allocList - list of memory allocated in DAG creation + * pfuncs - list of parity generating functions + * qfuncs - list of q generating functions + * + * A null qfuncs indicates single fault tolerant + *****************************************************************************/ + +void rf_CommonCreateSmallWriteDAGFwd( + RF_Raid_t *raidPtr, + RF_AccessStripeMap_t *asmap, + RF_DagHeader_t *dag_h, + void *bp, + RF_RaidAccessFlags_t flags, + RF_AllocListElem_t *allocList, + RF_RedFuncs_t *pfuncs, + RF_RedFuncs_t *qfuncs) +{ + RF_DagNode_t *readDataNodes, *readParityNodes, *readQNodes, *termNode; + RF_DagNode_t *unlockDataNodes, *unlockParityNodes, *unlockQNodes; + RF_DagNode_t *xorNodes, *qNodes, *blockNode, *nodes; + RF_DagNode_t *writeDataNodes, *writeParityNodes, *writeQNodes; + int i, j, nNodes, totalNumNodes, lu_flag; + RF_ReconUnitNum_t which_ru; + int (*func)(RF_DagNode_t *), (*undoFunc)(RF_DagNode_t *); + int (*qfunc)(RF_DagNode_t *); + int numDataNodes, numParityNodes; + RF_StripeNum_t parityStripeID; + RF_PhysDiskAddr_t *pda; + char *name, *qname; + long nfaults; + + nfaults = qfuncs ? 2 : 1; + lu_flag = (rf_enableAtomicRMW) ? 1 : 0; /* lock/unlock flag */ + + parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout), asmap->raidAddress, &which_ru); + pda = asmap->physInfo; + numDataNodes = asmap->numStripeUnitsAccessed; + numParityNodes = (asmap->parityInfo->next) ? 2 : 1; + + if (rf_dagDebug) printf("[Creating small-write DAG]\n"); + RF_ASSERT(numDataNodes > 0); + dag_h->creator = "SmallWriteDAGFwd"; + + dag_h->numCommitNodes = 0; + dag_h->numCommits = 0; + dag_h->numSuccedents = 1; + + qfunc = NULL; + qname = NULL; + + /* DAG creation occurs in four steps: + 1. count the number of nodes in the DAG + 2. create the nodes + 3. initialize the nodes + 4. connect the nodes + */ + + /* Step 1. compute number of nodes in the graph */ + + /* number of nodes: + a read and write for each data unit + a redundancy computation node for each parity node (nfaults * nparity) + a read and write for each parity unit + a block node + a terminate node + if atomic RMW + an unlock node for each data unit, redundancy unit + */ + totalNumNodes = (2 * numDataNodes) + (nfaults * numParityNodes) + (nfaults * 2 * numParityNodes) + 2; + if (lu_flag) + totalNumNodes += (numDataNodes + (nfaults * numParityNodes)); + + + /* Step 2. 
create the nodes */ + RF_CallocAndAdd(nodes, totalNumNodes, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList); + i = 0; + blockNode = &nodes[i]; i += 1; + readDataNodes = &nodes[i]; i += numDataNodes; + readParityNodes = &nodes[i]; i += numParityNodes; + writeDataNodes = &nodes[i]; i += numDataNodes; + writeParityNodes = &nodes[i]; i += numParityNodes; + xorNodes = &nodes[i]; i += numParityNodes; + termNode = &nodes[i]; i += 1; + if (lu_flag) { + unlockDataNodes = &nodes[i]; i += numDataNodes; + unlockParityNodes = &nodes[i]; i += numParityNodes; + } + else { + unlockDataNodes = unlockParityNodes = NULL; + } + if (nfaults == 2) { + readQNodes = &nodes[i]; i += numParityNodes; + writeQNodes = &nodes[i]; i += numParityNodes; + qNodes = &nodes[i]; i += numParityNodes; + if (lu_flag) { + unlockQNodes = &nodes[i]; i += numParityNodes; + } + else { + unlockQNodes = NULL; + } + } + else { + readQNodes = writeQNodes = qNodes = unlockQNodes = NULL; + } + RF_ASSERT(i == totalNumNodes); + + /* Step 3. initialize the nodes */ + /* initialize block node (Nil) */ + nNodes = numDataNodes + (nfaults * numParityNodes); + rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nNodes, 0, 0, 0, dag_h, "Nil", allocList); + + /* initialize terminate node (Trm) */ + rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, nNodes, 0, 0, dag_h, "Trm", allocList); + + /* initialize nodes which read old data (Rod) */ + for (i = 0; i < numDataNodes; i++) { + rf_InitNode(&readDataNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, (numParityNodes * nfaults) + 1, 1, 4, 0, dag_h, "Rod", allocList); + RF_ASSERT(pda != NULL); + readDataNodes[i].params[0].p = pda; /* physical disk addr desc */ + readDataNodes[i].params[1].p = rf_AllocBuffer(raidPtr, dag_h, pda, allocList); /* buffer to hold old data */ + readDataNodes[i].params[2].v = parityStripeID; + readDataNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, lu_flag, 0, which_ru); + pda=pda->next; + for (j = 0; j < readDataNodes[i].numSuccedents; j++) + readDataNodes[i].propList[j] = NULL; + } + + /* initialize nodes which read old parity (Rop) */ + pda = asmap->parityInfo; i = 0; + for (i = 0; i < numParityNodes; i++) { + RF_ASSERT(pda != NULL); + rf_InitNode(&readParityNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, numParityNodes, 1, 4, 0, dag_h, "Rop", allocList); + readParityNodes[i].params[0].p = pda; + readParityNodes[i].params[1].p = rf_AllocBuffer(raidPtr, dag_h, pda, allocList); /* buffer to hold old parity */ + readParityNodes[i].params[2].v = parityStripeID; + readParityNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, lu_flag, 0, which_ru); + for (j = 0; j < readParityNodes[i].numSuccedents; j++) + readParityNodes[i].propList[0] = NULL; + pda=pda->next; + } + + /* initialize nodes which read old Q (Roq) */ + if (nfaults == 2) + { + pda = asmap->qInfo; + for (i = 0; i < numParityNodes; i++) { + RF_ASSERT(pda != NULL); + rf_InitNode(&readQNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, numParityNodes, 1, 4, 0, dag_h, "Roq", allocList); + readQNodes[i].params[0].p = pda; + readQNodes[i].params[1].p = rf_AllocBuffer(raidPtr, dag_h, pda, allocList); /* buffer to hold old Q */ + readQNodes[i].params[2].v = parityStripeID; + readQNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, lu_flag, 0, which_ru); + for (j = 0; j < readQNodes[i].numSuccedents; 
j++) + readQNodes[i].propList[0] = NULL; + pda=pda->next; + } + } + + /* initialize nodes which write new data (Wnd) */ + pda = asmap->physInfo; + for (i=0; i < numDataNodes; i++) { + RF_ASSERT(pda != NULL); + rf_InitNode(&writeDataNodes[i], rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnd", allocList); + writeDataNodes[i].params[0].p = pda; /* physical disk addr desc */ + writeDataNodes[i].params[1].p = pda->bufPtr; /* buffer holding new data to be written */ + writeDataNodes[i].params[2].v = parityStripeID; + writeDataNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); + + if (lu_flag) { + /* initialize node to unlock the disk queue */ + rf_InitNode(&unlockDataNodes[i], rf_wait, RF_FALSE, rf_DiskUnlockFunc, rf_DiskUnlockUndoFunc, rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h, "Und", allocList); + unlockDataNodes[i].params[0].p = pda; /* physical disk addr desc */ + unlockDataNodes[i].params[1].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, lu_flag, which_ru); + } + + pda = pda->next; + } + + + /* initialize nodes which compute new parity and Q */ + /* we use the simple XOR func in the double-XOR case, and when we're accessing only a portion of one stripe unit. + * the distinction between the two is that the regular XOR func assumes that the targbuf is a full SU in size, + * and examines the pda associated with the buffer to decide where within the buffer to XOR the data, whereas + * the simple XOR func just XORs the data into the start of the buffer. + */ + if ((numParityNodes==2) || ((numDataNodes == 1) && (asmap->totalSectorsAccessed < raidPtr->Layout.sectorsPerStripeUnit))) { + func = pfuncs->simple; undoFunc = rf_NullNodeUndoFunc; name = pfuncs->SimpleName; + if (qfuncs) { + qfunc = qfuncs->simple; + qname = qfuncs->SimpleName; + } + } + else { + func = pfuncs->regular; undoFunc = rf_NullNodeUndoFunc; name = pfuncs->RegularName; + if (qfuncs) { qfunc = qfuncs->regular; qname = qfuncs->RegularName;} + } + /* initialize the xor nodes: params are {pda,buf} from {Rod,Wnd,Rop} nodes, and raidPtr */ + if (numParityNodes==2) { /* double-xor case */ + for (i=0; i < numParityNodes; i++) { + rf_InitNode(&xorNodes[i], rf_wait, RF_FALSE, func, undoFunc, NULL, numParityNodes, numParityNodes + numDataNodes, 7, 1, dag_h, name, allocList); /* no wakeup func for xor */ + xorNodes[i].flags |= RF_DAGNODE_FLAG_YIELD; + xorNodes[i].params[0] = readDataNodes[i].params[0]; + xorNodes[i].params[1] = readDataNodes[i].params[1]; + xorNodes[i].params[2] = readParityNodes[i].params[0]; + xorNodes[i].params[3] = readParityNodes[i].params[1]; + xorNodes[i].params[4] = writeDataNodes[i].params[0]; + xorNodes[i].params[5] = writeDataNodes[i].params[1]; + xorNodes[i].params[6].p = raidPtr; + xorNodes[i].results[0] = readParityNodes[i].params[1].p; /* use old parity buf as target buf */ + if (nfaults==2) + { + rf_InitNode(&qNodes[i], rf_wait, RF_FALSE, qfunc, undoFunc, NULL, numParityNodes, numParityNodes + numDataNodes, 7, 1, dag_h, qname, allocList); /* no wakeup func for xor */ + qNodes[i].params[0] = readDataNodes[i].params[0]; + qNodes[i].params[1] = readDataNodes[i].params[1]; + qNodes[i].params[2] = readQNodes[i].params[0]; + qNodes[i].params[3] = readQNodes[i].params[1]; + qNodes[i].params[4] = writeDataNodes[i].params[0]; + qNodes[i].params[5] = writeDataNodes[i].params[1]; + qNodes[i].params[6].p = raidPtr; + qNodes[i].results[0] = readQNodes[i].params[1].p; /* use old Q buf as target buf */ + } + } + } + else { + /* there is only 
one xor node in this case */ + rf_InitNode(&xorNodes[0], rf_wait, RF_FALSE, func, undoFunc, NULL, numParityNodes, numParityNodes + numDataNodes, (2 * (numDataNodes + numDataNodes + 1) + 1), 1, dag_h, name, allocList); + xorNodes[0].flags |= RF_DAGNODE_FLAG_YIELD; + for (i=0; i < numDataNodes + 1; i++) { + /* set up params related to Rod and Rop nodes */ + xorNodes[0].params[2*i+0] = readDataNodes[i].params[0]; /* pda */ + xorNodes[0].params[2*i+1] = readDataNodes[i].params[1]; /* buffer pointer */ + } + for (i=0; i < numDataNodes; i++) { + /* set up params related to Wnd and Wnp nodes */ + xorNodes[0].params[2*(numDataNodes+1+i)+0] = writeDataNodes[i].params[0]; /* pda */ + xorNodes[0].params[2*(numDataNodes+1+i)+1] = writeDataNodes[i].params[1]; /* buffer pointer */ + } + xorNodes[0].params[2*(numDataNodes+numDataNodes+1)].p = raidPtr; /* xor node needs to get at RAID information */ + xorNodes[0].results[0] = readParityNodes[0].params[1].p; + if (nfaults==2) + { + rf_InitNode(&qNodes[0], rf_wait, RF_FALSE, qfunc, undoFunc, NULL, numParityNodes, numParityNodes + numDataNodes, (2 * (numDataNodes + numDataNodes + 1) + 1), 1, dag_h, qname, allocList); + for (i=0; i<numDataNodes; i++) { + /* set up params related to Rod */ + qNodes[0].params[2*i+0] = readDataNodes[i].params[0]; /* pda */ + qNodes[0].params[2*i+1] = readDataNodes[i].params[1]; /* buffer pointer */ + } + /* and read old q */ + qNodes[0].params[2*numDataNodes + 0] = readQNodes[0].params[0]; /* pda */ + qNodes[0].params[2*numDataNodes + 1] = readQNodes[0].params[1]; /* buffer pointer */ + for (i=0; i < numDataNodes; i++) { + /* set up params related to Wnd nodes */ + qNodes[0].params[2*(numDataNodes+1+i)+0] = writeDataNodes[i].params[0]; /* pda */ + qNodes[0].params[2*(numDataNodes+1+i)+1] = writeDataNodes[i].params[1]; /* buffer pointer */ + } + qNodes[0].params[2*(numDataNodes+numDataNodes+1)].p = raidPtr; /* xor node needs to get at RAID information */ + qNodes[0].results[0] = readQNodes[0].params[1].p; + } + } + + /* initialize nodes which write new parity (Wnp) */ + pda = asmap->parityInfo; + for (i=0; i < numParityNodes; i++) { + rf_InitNode(&writeParityNodes[i], rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, numParityNodes, 4, 0, dag_h, "Wnp", allocList); + RF_ASSERT(pda != NULL); + writeParityNodes[i].params[0].p = pda; /* param 1 (bufPtr) filled in by xor node */ + writeParityNodes[i].params[1].p = xorNodes[i].results[0]; /* buffer pointer for parity write operation */ + writeParityNodes[i].params[2].v = parityStripeID; + writeParityNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); + + if (lu_flag) { + /* initialize node to unlock the disk queue */ + rf_InitNode(&unlockParityNodes[i], rf_wait, RF_FALSE, rf_DiskUnlockFunc, rf_DiskUnlockUndoFunc, rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h, "Unp", allocList); + unlockParityNodes[i].params[0].p = pda; /* physical disk addr desc */ + unlockParityNodes[i].params[1].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, lu_flag, which_ru); + } + + pda = pda->next; + } + + /* initialize nodes which write new Q (Wnq) */ + if (nfaults == 2) + { + pda = asmap->qInfo; + for (i=0; i < numParityNodes; i++) { + rf_InitNode(&writeQNodes[i], rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, numParityNodes, 4, 0, dag_h, "Wnq", allocList); + RF_ASSERT(pda != NULL); + writeQNodes[i].params[0].p = pda; /* param 1 (bufPtr) filled in by xor node */ + writeQNodes[i].params[1].p = qNodes[i].results[0]; /* 
buffer pointer for parity write operation */ + writeQNodes[i].params[2].v = parityStripeID; + writeQNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); + + if (lu_flag) { + /* initialize node to unlock the disk queue */ + rf_InitNode(&unlockQNodes[i], rf_wait, RF_FALSE, rf_DiskUnlockFunc, rf_DiskUnlockUndoFunc, rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h, "Unq", allocList); + unlockQNodes[i].params[0].p = pda; /* physical disk addr desc */ + unlockQNodes[i].params[1].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, lu_flag, which_ru); + } + + pda = pda->next; + } + } + + /* Step 4. connect the nodes */ + + /* connect header to block node */ + dag_h->succedents[0] = blockNode; + + /* connect block node to read old data nodes */ + RF_ASSERT(blockNode->numSuccedents == (numDataNodes + (numParityNodes * nfaults))); + for (i = 0; i < numDataNodes; i++) { + blockNode->succedents[i] = &readDataNodes[i]; + RF_ASSERT(readDataNodes[i].numAntecedents == 1); + readDataNodes[i].antecedents[0]= blockNode; + readDataNodes[i].antType[0] = rf_control; + } + + /* connect block node to read old parity nodes */ + for (i = 0; i < numParityNodes; i++) { + blockNode->succedents[numDataNodes + i] = &readParityNodes[i]; + RF_ASSERT(readParityNodes[i].numAntecedents == 1); + readParityNodes[i].antecedents[0] = blockNode; + readParityNodes[i].antType[0] = rf_control; + } + + /* connect block node to read old Q nodes */ + if (nfaults == 2) + for (i = 0; i < numParityNodes; i++) { + blockNode->succedents[numDataNodes + numParityNodes + i] = &readQNodes[i]; + RF_ASSERT(readQNodes[i].numAntecedents == 1); + readQNodes[i].antecedents[0] = blockNode; + readQNodes[i].antType[0] = rf_control; + } + + /* connect read old data nodes to write new data nodes */ + for (i = 0; i < numDataNodes; i++) { + RF_ASSERT(readDataNodes[i].numSuccedents == ((nfaults * numParityNodes) + 1)); + RF_ASSERT(writeDataNodes[i].numAntecedents == 1); + readDataNodes[i].succedents[0] = &writeDataNodes[i]; + writeDataNodes[i].antecedents[0] = &readDataNodes[i]; + writeDataNodes[i].antType[0] = rf_antiData; + } + + /* connect read old data nodes to xor nodes */ + for (i = 0; i < numDataNodes; i++) { + for (j = 0; j < numParityNodes; j++){ + RF_ASSERT(xorNodes[j].numAntecedents == numDataNodes + numParityNodes); + readDataNodes[i].succedents[1 + j] = &xorNodes[j]; + xorNodes[j].antecedents[i] = &readDataNodes[i]; + xorNodes[j].antType[i] = rf_trueData; + } + } + + /* connect read old data nodes to q nodes */ + if (nfaults == 2) + for (i = 0; i < numDataNodes; i++) + for (j = 0; j < numParityNodes; j++){ + RF_ASSERT(qNodes[j].numAntecedents == numDataNodes + numParityNodes); + readDataNodes[i].succedents[1 + numParityNodes + j] = &qNodes[j]; + qNodes[j].antecedents[i] = &readDataNodes[i]; + qNodes[j].antType[i] = rf_trueData; + } + + /* connect read old parity nodes to xor nodes */ + for (i = 0; i < numParityNodes; i++) { + for (j = 0; j < numParityNodes; j++) { + RF_ASSERT(readParityNodes[i].numSuccedents == numParityNodes); + readParityNodes[i].succedents[j] = &xorNodes[j]; + xorNodes[j].antecedents[numDataNodes + i] = &readParityNodes[i]; + xorNodes[j].antType[numDataNodes + i] = rf_trueData; + } + } + + /* connect read old q nodes to q nodes */ + if (nfaults == 2) + for (i = 0; i < numParityNodes; i++) { + for (j = 0; j < numParityNodes; j++) { + RF_ASSERT(readQNodes[i].numSuccedents == numParityNodes); + readQNodes[i].succedents[j] = &qNodes[j]; + qNodes[j].antecedents[numDataNodes + i] = &readQNodes[i]; + 
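/*
 * Added note: three arc types appear in these graphs.  rf_trueData
 * marks an arc whose successor consumes data produced or read by its
 * antecedent (the Rod/Rop/Roq reads feeding the XOR and Q nodes),
 * rf_antiData marks the read-before-overwrite ordering between a Rod
 * node and the Wnd node that rewrites the same sectors, and rf_control
 * arcs impose ordering only.
 */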
qNodes[j].antType[numDataNodes + i] = rf_trueData; + } + } + + /* connect xor nodes to the write new parity nodes */ + for (i = 0; i < numParityNodes; i++) { + RF_ASSERT(writeParityNodes[i].numAntecedents == numParityNodes); + for (j = 0; j < numParityNodes; j++) { + RF_ASSERT(xorNodes[j].numSuccedents == numParityNodes); + xorNodes[i].succedents[j] = &writeParityNodes[j]; + writeParityNodes[j].antecedents[i] = &xorNodes[i]; + writeParityNodes[j].antType[i] = rf_trueData; + } + } + + /* connect q nodes to the write new q nodes */ + if (nfaults == 2) + for (i = 0; i < numParityNodes; i++) { + RF_ASSERT(writeQNodes[i].numAntecedents == numParityNodes); + for (j = 0; j < numParityNodes; j++) { + RF_ASSERT(qNodes[j].numSuccedents == 1); + qNodes[i].succedents[j] = &writeQNodes[j]; + writeQNodes[j].antecedents[i] = &qNodes[i]; + writeQNodes[j].antType[i] = rf_trueData; + } + } + + RF_ASSERT(termNode->numAntecedents == (numDataNodes + (nfaults * numParityNodes))); + RF_ASSERT(termNode->numSuccedents == 0); + for (i = 0; i < numDataNodes; i++) { + if (lu_flag) { + /* connect write new data nodes to unlock nodes */ + RF_ASSERT(writeDataNodes[i].numSuccedents == 1); + RF_ASSERT(unlockDataNodes[i].numAntecedents == 1); + writeDataNodes[i].succedents[0] = &unlockDataNodes[i]; + unlockDataNodes[i].antecedents[0] = &writeDataNodes[i]; + unlockDataNodes[i].antType[0] = rf_control; + + /* connect unlock nodes to term node */ + RF_ASSERT(unlockDataNodes[i].numSuccedents == 1); + unlockDataNodes[i].succedents[0] = termNode; + termNode->antecedents[i] = &unlockDataNodes[i]; + termNode->antType[i] = rf_control; + } + else { + /* connect write new data nodes to term node */ + RF_ASSERT(writeDataNodes[i].numSuccedents == 1); + RF_ASSERT(termNode->numAntecedents == (numDataNodes + (nfaults * numParityNodes))); + writeDataNodes[i].succedents[0] = termNode; + termNode->antecedents[i] = &writeDataNodes[i]; + termNode->antType[i] = rf_control; + } + } + + for (i = 0; i < numParityNodes; i++) { + if (lu_flag) { + /* connect write new parity nodes to unlock nodes */ + RF_ASSERT(writeParityNodes[i].numSuccedents == 1); + RF_ASSERT(unlockParityNodes[i].numAntecedents == 1); + writeParityNodes[i].succedents[0] = &unlockParityNodes[i]; + unlockParityNodes[i].antecedents[0] = &writeParityNodes[i]; + unlockParityNodes[i].antType[0] = rf_control; + + /* connect unlock nodes to term node */ + RF_ASSERT(unlockParityNodes[i].numSuccedents == 1); + unlockParityNodes[i].succedents[0] = termNode; + termNode->antecedents[numDataNodes + i] = &unlockParityNodes[i]; + termNode->antType[numDataNodes + i] = rf_control; + } + else { + RF_ASSERT(writeParityNodes[i].numSuccedents == 1); + writeParityNodes[i].succedents[0] = termNode; + termNode->antecedents[numDataNodes + i] = &writeParityNodes[i]; + termNode->antType[numDataNodes + i] = rf_control; + } + } + + if (nfaults == 2) + for (i = 0; i < numParityNodes; i++) { + if (lu_flag) { + /* connect write new Q nodes to unlock nodes */ + RF_ASSERT(writeQNodes[i].numSuccedents == 1); + RF_ASSERT(unlockQNodes[i].numAntecedents == 1); + writeQNodes[i].succedents[0] = &unlockQNodes[i]; + unlockQNodes[i].antecedents[0] = &writeQNodes[i]; + unlockQNodes[i].antType[0] = rf_control; + + /* connect unlock nodes to unblock node */ + RF_ASSERT(unlockQNodes[i].numSuccedents == 1); + unlockQNodes[i].succedents[0] = termNode; + termNode->antecedents[numDataNodes + numParityNodes + i] = &unlockQNodes[i]; + termNode->antType[numDataNodes + numParityNodes + i] = rf_control; + } + else { + 
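/*
 * Added note: this branch runs when rf_enableAtomicRMW (and hence
 * lu_flag) is clear; no Unq unlock nodes were allocated, so each Wnq
 * write is wired straight to the terminator.
 */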
RF_ASSERT(writeQNodes[i].numSuccedents == 1); + writeQNodes[i].succedents[0] = termNode; + termNode->antecedents[numDataNodes + numParityNodes + i] = &writeQNodes[i]; + termNode->antType[numDataNodes + numParityNodes + i] = rf_control; + } + } +} + + + +/****************************************************************************** + * create a write graph (fault-free or degraded) for RAID level 1 + * + * Hdr Nil -> Wpd -> Nil -> Trm + * Nil -> Wsd -> + * + * The "Wpd" node writes data to the primary copy in the mirror pair + * The "Wsd" node writes data to the secondary copy in the mirror pair + * + * Parameters: raidPtr - description of the physical array + * asmap - logical & physical addresses for this access + * bp - buffer ptr (holds write data) + * flags - general flags (e.g. disk locking) + * allocList - list of memory allocated in DAG creation + *****************************************************************************/ + +void rf_CreateRaidOneWriteDAGFwd( + RF_Raid_t *raidPtr, + RF_AccessStripeMap_t *asmap, + RF_DagHeader_t *dag_h, + void *bp, + RF_RaidAccessFlags_t flags, + RF_AllocListElem_t *allocList) +{ + RF_DagNode_t *blockNode, *unblockNode, *termNode; + RF_DagNode_t *nodes, *wndNode, *wmirNode; + int nWndNodes, nWmirNodes, i; + RF_ReconUnitNum_t which_ru; + RF_PhysDiskAddr_t *pda, *pdaP; + RF_StripeNum_t parityStripeID; + + parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout), + asmap->raidAddress, &which_ru); + if (rf_dagDebug) { + printf("[Creating RAID level 1 write DAG]\n"); + } + + nWmirNodes = (asmap->parityInfo->next) ? 2 : 1; /* 2 implies access not SU aligned */ + nWndNodes = (asmap->physInfo->next) ? 2 : 1; + + /* alloc the Wnd nodes and the Wmir node */ + if (asmap->numDataFailed == 1) + nWndNodes--; + if (asmap->numParityFailed == 1) + nWmirNodes--; + + /* total number of nodes = nWndNodes + nWmirNodes + (block + unblock + terminator) */ + RF_CallocAndAdd(nodes, nWndNodes + nWmirNodes + 3, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList); + i = 0; + wndNode = &nodes[i]; i += nWndNodes; + wmirNode = &nodes[i]; i += nWmirNodes; + blockNode = &nodes[i]; i += 1; + unblockNode = &nodes[i]; i += 1; + termNode = &nodes[i]; i += 1; + RF_ASSERT(i == (nWndNodes + nWmirNodes + 3)); + + /* this dag can commit immediately */ + dag_h->numCommitNodes = 0; + dag_h->numCommits = 0; + dag_h->numSuccedents = 1; + + /* initialize the unblock and term nodes */ + rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, (nWndNodes + nWmirNodes), 0, 0, 0, dag_h, "Nil", allocList); + rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, 1, (nWndNodes + nWmirNodes), 0, 0, dag_h, "Nil", allocList); + rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm", allocList); + + /* initialize the wnd nodes */ + if (nWndNodes > 0) { + pda = asmap->physInfo; + for (i = 0; i < nWndNodes; i++) { + rf_InitNode(&wndNode[i], rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wpd", allocList); + RF_ASSERT(pda != NULL); + wndNode[i].params[0].p = pda; + wndNode[i].params[1].p = pda->bufPtr; + wndNode[i].params[2].v = parityStripeID; + wndNode[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); + pda = pda->next; + } + RF_ASSERT(pda == NULL); + } + + /* initialize the mirror nodes */ + if (nWmirNodes > 0) { + pda = asmap->physInfo; + pdaP = asmap->parityInfo; + for (i = 0; i < nWmirNodes; i++) { + 
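/*
 * Added note: each mirror (Wsd) node takes its disk address from the
 * parityInfo chain (pdaP, the secondary copy of the pair) but writes
 * the same buffer that the corresponding primary pda carries, so both
 * copies of the mirror pair receive identical data.
 */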
rf_InitNode(&wmirNode[i], rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wsd", allocList); + RF_ASSERT(pda != NULL); + wmirNode[i].params[0].p = pdaP; + wmirNode[i].params[1].p = pda->bufPtr; + wmirNode[i].params[2].v = parityStripeID; + wmirNode[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); + pda = pda->next; + pdaP = pdaP->next; + } + RF_ASSERT(pda == NULL); + RF_ASSERT(pdaP == NULL); + } + + /* link the header node to the block node */ + RF_ASSERT(dag_h->numSuccedents == 1); + RF_ASSERT(blockNode->numAntecedents == 0); + dag_h->succedents[0] = blockNode; + + /* link the block node to the write nodes */ + RF_ASSERT(blockNode->numSuccedents == (nWndNodes + nWmirNodes)); + for (i = 0; i < nWndNodes; i++) { + RF_ASSERT(wndNode[i].numAntecedents == 1); + blockNode->succedents[i] = &wndNode[i]; + wndNode[i].antecedents[0] = blockNode; + wndNode[i].antType[0] = rf_control; + } + for (i = 0; i < nWmirNodes; i++) { + RF_ASSERT(wmirNode[i].numAntecedents == 1); + blockNode->succedents[i + nWndNodes] = &wmirNode[i]; + wmirNode[i].antecedents[0] = blockNode; + wmirNode[i].antType[0] = rf_control; + } + + /* link the write nodes to the unblock node */ + RF_ASSERT(unblockNode->numAntecedents == (nWndNodes + nWmirNodes)); + for (i = 0; i < nWndNodes; i++) { + RF_ASSERT(wndNode[i].numSuccedents == 1); + wndNode[i].succedents[0] = unblockNode; + unblockNode->antecedents[i] = &wndNode[i]; + unblockNode->antType[i] = rf_control; + } + for (i = 0; i < nWmirNodes; i++) { + RF_ASSERT(wmirNode[i].numSuccedents == 1); + wmirNode[i].succedents[0] = unblockNode; + unblockNode->antecedents[i + nWndNodes] = &wmirNode[i]; + unblockNode->antType[i + nWndNodes] = rf_control; + } + + /* link the unblock node to the term node */ + RF_ASSERT(unblockNode->numSuccedents == 1); + RF_ASSERT(termNode->numAntecedents == 1); + RF_ASSERT(termNode->numSuccedents == 0); + unblockNode->succedents[0] = termNode; + termNode->antecedents[0] = unblockNode; + termNode->antType[0] = rf_control; + + return; +} diff --git a/sys/dev/raidframe/rf_dagffwr.h b/sys/dev/raidframe/rf_dagffwr.h new file mode 100644 index 00000000000..69c7fdf4832 --- /dev/null +++ b/sys/dev/raidframe/rf_dagffwr.h @@ -0,0 +1,103 @@ +/* $OpenBSD: rf_dagffwr.h,v 1.1 1999/01/11 14:29:10 niklas Exp $ */ +/* $NetBSD: rf_dagffwr.h,v 1.1 1998/11/13 04:20:27 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland, Daniel Stodolsky, William V. Courtright II + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. 
+ */ + +/* + * : + * Log: rf_dagffwr.h,v + * Revision 1.6 1996/07/31 15:35:29 jimz + * evenodd changes; bugfixes for double-degraded archs, generalize + * some formerly PQ-only functions + * + * Revision 1.5 1996/07/22 19:52:16 jimz + * switched node params to RF_DagParam_t, a union of + * a 64-bit int and a void *, for better portability + * attempted hpux port, but failed partway through for + * lack of a single C compiler capable of compiling all + * source files + * + * Revision 1.4 1996/06/10 22:25:28 wvcii + * added write dags which do not have a commit node and are + * used in forward and backward error recovery experiments. + * + * Revision 1.3 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.2 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.1 1996/05/03 19:20:18 wvcii + * Initial revision + * + */ + +#ifndef _RF__RF_DAGFFWR_H_ +#define _RF__RF_DAGFFWR_H_ + +#include "rf_types.h" + +/* fault-free write DAG creation routines */ +void rf_CreateNonRedundantWriteDAG(RF_Raid_t *raidPtr, + RF_AccessStripeMap_t *asmap, RF_DagHeader_t *dag_h, void *bp, + RF_RaidAccessFlags_t flags, RF_AllocListElem_t *allocList, + RF_IoType_t type); +void rf_CreateRAID0WriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap, + RF_DagHeader_t *dag_h, void *bp, RF_RaidAccessFlags_t flags, + RF_AllocListElem_t *allocList, RF_IoType_t type); +void rf_CreateSmallWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap, + RF_DagHeader_t *dag_h, void *bp, RF_RaidAccessFlags_t flags, + RF_AllocListElem_t *allocList); +void rf_CreateLargeWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap, + RF_DagHeader_t *dag_h, void *bp, RF_RaidAccessFlags_t flags, + RF_AllocListElem_t *allocList); +void rf_CommonCreateLargeWriteDAG(RF_Raid_t *raidPtr, + RF_AccessStripeMap_t *asmap, RF_DagHeader_t *dag_h, void *bp, + RF_RaidAccessFlags_t flags, RF_AllocListElem_t *allocList, int nfaults, + int (*redFunc)(RF_DagNode_t *), int allowBufferRecycle); +void rf_CommonCreateLargeWriteDAGFwd(RF_Raid_t *raidPtr, + RF_AccessStripeMap_t *asmap, RF_DagHeader_t *dag_h, void *bp, + RF_RaidAccessFlags_t flags, RF_AllocListElem_t *allocList, int nfaults, + int (*redFunc)(RF_DagNode_t *), int allowBufferRecycle); +void rf_CommonCreateSmallWriteDAG(RF_Raid_t *raidPtr, + RF_AccessStripeMap_t *asmap, RF_DagHeader_t *dag_h, void *bp, + RF_RaidAccessFlags_t flags, RF_AllocListElem_t *allocList, + RF_RedFuncs_t *pfuncs, RF_RedFuncs_t *qfuncs); +void rf_CommonCreateSmallWriteDAGFwd(RF_Raid_t *raidPtr, + RF_AccessStripeMap_t *asmap, RF_DagHeader_t *dag_h, void *bp, + RF_RaidAccessFlags_t flags, RF_AllocListElem_t *allocList, + RF_RedFuncs_t *pfuncs, RF_RedFuncs_t *qfuncs); +void rf_CreateRaidOneWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap, + RF_DagHeader_t *dag_h, void *bp, RF_RaidAccessFlags_t flags, + RF_AllocListElem_t *allocList); +void rf_CreateRaidOneWriteDAGFwd(RF_Raid_t *raidPtr, + RF_AccessStripeMap_t *asmap, RF_DagHeader_t *dag_h, void *bp, + RF_RaidAccessFlags_t flags, RF_AllocListElem_t *allocList); + +#endif /* !_RF__RF_DAGFFWR_H_ */ diff --git a/sys/dev/raidframe/rf_dagflags.h b/sys/dev/raidframe/rf_dagflags.h new file mode 100644 index 00000000000..ac6f5ec5705 --- /dev/null +++ b/sys/dev/raidframe/rf_dagflags.h @@ -0,0 +1,86 @@ +/* $OpenBSD: rf_dagflags.h,v 1.1 1999/01/11 14:29:10 niklas Exp $ */ +/* $NetBSD: rf_dagflags.h,v 1.1 1998/11/13 04:20:27 oster Exp $ */ +/* + * 
Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/************************************************************************************** + * + * dagflags.h -- flags that can be given to DoAccess + * I pulled these out of dag.h because routines that call DoAccess may need these flags, + * but certainly do not need the declarations related to the DAG data structures. + * + **************************************************************************************/ + +/* : + * Log: rf_dagflags.h,v + * Revision 1.10 1996/06/13 19:08:23 jimz + * remove unused BD flag + * + * Revision 1.9 1996/05/30 11:29:41 jimz + * Numerous bug fixes. Stripe lock release code disagreed with the taking code + * about when stripes should be locked (I made it consistent: no parity, no lock) + * There was a lot of extra serialization of I/Os which I've removed- a lot of + * it was to calculate values for the cache code, which is no longer with us. + * More types, function, macro cleanup. Added code to properly quiesce the array + * on shutdown. Made a lot of stuff array-specific which was (bogusly) general + * before. Fixed memory allocation, freeing bugs. + * + * Revision 1.8 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.7 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.6 1995/12/01 15:59:40 root + * added copyright info + * + */ + +#ifndef _RF__RF_DAGFLAGS_H_ +#define _RF__RF_DAGFLAGS_H_ + +/* + * Bitmasks for the "flags" parameter (RF_RaidAccessFlags_t) used + * by DoAccess, SelectAlgorithm, and the DAG creation routines. + * + * If USE_DAG or USE_ASM is specified, neither the DAG nor the ASM + * will be modified, which means that you can't SUPRESS if you + * specify USE_DAG. 
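 *
 * (Added note) The values below are single-bit masks intended to be
 * OR'd together into the RF_RaidAccessFlags_t handed to DoAccess and
 * the DAG creation routines, e.g. (illustrative only):
 *
 *     RF_RaidAccessFlags_t flags;
 *     flags = RF_DAG_NONBLOCKING_IO | RF_DAG_SUPPRESS_LOCKS;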
+ */ + +#define RF_DAG_FLAGS_NONE 0 /* no flags */ +#define RF_DAG_SUPPRESS_LOCKS (1<<0) /* supress all stripe locks in the DAG */ +#define RF_DAG_RETURN_ASM (1<<1) /* create an ASM and return it instead of freeing it */ +#define RF_DAG_RETURN_DAG (1<<2) /* create a DAG and return it instead of freeing it */ +#define RF_DAG_NONBLOCKING_IO (1<<3) /* cause DoAccess to be non-blocking */ +#define RF_DAG_ACCESS_COMPLETE (1<<4) /* the access is complete */ +#define RF_DAG_DISPATCH_RETURNED (1<<5) /* used to handle the case where the dag invokes no I/O */ +#define RF_DAG_TEST_ACCESS (1<<6) /* this access came through rf_ioctl instead of rf_strategy */ + +#endif /* !_RF__RF_DAGFLAGS_H_ */ diff --git a/sys/dev/raidframe/rf_dagfuncs.c b/sys/dev/raidframe/rf_dagfuncs.c new file mode 100644 index 00000000000..78e23ed1d95 --- /dev/null +++ b/sys/dev/raidframe/rf_dagfuncs.c @@ -0,0 +1,1050 @@ +/* $OpenBSD: rf_dagfuncs.c,v 1.1 1999/01/11 14:29:10 niklas Exp $ */ +/* $NetBSD: rf_dagfuncs.c,v 1.1 1998/11/13 04:20:28 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland, William V. Courtright II + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* + * dagfuncs.c -- DAG node execution routines + * + * Rules: + * 1. Every DAG execution function must eventually cause node->status to + * get set to "good" or "bad", and "FinishNode" to be called. In the + * case of nodes that complete immediately (xor, NullNodeFunc, etc), + * the node execution function can do these two things directly. In + * the case of nodes that have to wait for some event (a disk read to + * complete, a lock to be released, etc) to occur before they can + * complete, this is typically achieved by having whatever module + * is doing the operation call GenericWakeupFunc upon completion. + * 2. DAG execution functions should check the status in the DAG header + * and NOP out their operations if the status is not "enable". However, + * execution functions that release resources must be sure to release + * them even when they NOP out the function that would use them. + * Functions that acquire resources should go ahead and acquire them + * even when they NOP, so that a downstream release node will not have + * to check to find out whether or not the acquire was suppressed. 
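 *
 * (Added sketch) Schematically -- shape only; names other than
 * GenericWakeupFunc and FinishNode are illustrative, not the actual
 * interface -- a node execution function looks like:
 *
 *     if the DAG header status is not "enable":
 *         skip the real work, but still release/acquire resources
 *         as rule 2 requires
 *     else:
 *         start the operation
 *     when the operation is done (immediately, or later via
 *     GenericWakeupFunc), set node->status to "good" or "bad" and
 *     let FinishNode run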
+ */ + +/* : + * Log: rf_dagfuncs.c,v + * Revision 1.64 1996/07/31 16:29:26 jimz + * LONGSHIFT -> RF_LONGSHIFT, defined in rf_types.h + * + * Revision 1.63 1996/07/30 04:00:20 jimz + * define LONGSHIFT for mips + * + * Revision 1.62 1996/07/28 20:31:39 jimz + * i386netbsd port + * true/false fixup + * + * Revision 1.61 1996/07/27 23:36:08 jimz + * Solaris port of simulator + * + * Revision 1.60 1996/07/22 19:52:16 jimz + * switched node params to RF_DagParam_t, a union of + * a 64-bit int and a void *, for better portability + * attempted hpux port, but failed partway through for + * lack of a single C compiler capable of compiling all + * source files + * + * Revision 1.59 1996/07/18 22:57:14 jimz + * port simulator to AIX + * + * Revision 1.58 1996/07/17 21:00:58 jimz + * clean up timer interface, tracing + * + * Revision 1.57 1996/07/15 17:22:18 jimz + * nit-pick code cleanup + * resolve stdlib problems on DEC OSF + * + * Revision 1.56 1996/06/11 01:27:50 jimz + * Fixed bug where diskthread shutdown would crash or hang. This + * turned out to be two distinct bugs: + * (1) [crash] The thread shutdown code wasn't properly waiting for + * all the diskthreads to complete. This caused diskthreads that were + * exiting+cleaning up to unlock a destroyed mutex. + * (2) [hang] TerminateDiskQueues wasn't locking, and DiskIODequeue + * only checked for termination _after_ a wakeup if the queues were + * empty. This was a race where the termination wakeup could be lost + * by the dequeueing thread, and the system would hang waiting for the + * thread to exit, while the thread waited for an I/O or a signal to + * check the termination flag. + * + * Revision 1.55 1996/06/10 22:23:18 wvcii + * disk and xor funcs now optionally support undo logging + * for backward error recovery experiments + * + * Revision 1.54 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). + * + * Revision 1.53 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.52 1996/06/06 17:28:44 jimz + * add new read mirror partition func, rename old read mirror + * to rf_DiskReadMirrorIdleFunc + * + * Revision 1.51 1996/06/03 23:28:26 jimz + * more bugfixes + * check in tree to sync for IPDS runs with current bugfixes + * there still may be a problem with threads in the script test + * getting I/Os stuck- not trivially reproducible (runs ~50 times + * in a row without getting stuck) + * + * Revision 1.50 1996/06/02 17:31:48 jimz + * Moved a lot of global stuff into array structure, where it belongs. + * Fixed up paritylogging, pss modules in this manner. Some general + * code cleanup. Removed lots of dead code, some dead files. + * + * Revision 1.49 1996/05/31 22:26:54 jimz + * fix a lot of mapping problems, memory allocation problems + * found some weird lock issues, fixed 'em + * more code cleanup + * + * Revision 1.48 1996/05/30 12:59:18 jimz + * make etimer happier, more portable + * + * Revision 1.47 1996/05/30 11:29:41 jimz + * Numerous bug fixes. Stripe lock release code disagreed with the taking code + * about when stripes should be locked (I made it consistent: no parity, no lock) + * There was a lot of extra serialization of I/Os which I've removed- a lot of + * it was to calculate values for the cache code, which is no longer with us. 
+ * More types, function, macro cleanup. Added code to properly quiesce the array + * on shutdown. Made a lot of stuff array-specific which was (bogusly) general + * before. Fixed memory allocation, freeing bugs. + * + * Revision 1.46 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.45 1996/05/24 04:28:55 jimz + * release cleanup ckpt + * + * Revision 1.44 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.43 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.42 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.41 1996/05/08 21:01:24 jimz + * fixed up enum type names that were conflicting with other + * enums and function names (ie, "panic") + * future naming trends will be towards RF_ and rf_ for + * everything raidframe-related + * + * Revision 1.40 1996/05/08 15:24:14 wvcii + * modified GenericWakeupFunc to use recover, undone, and panic node states + * + * Revision 1.39 1996/05/02 17:18:01 jimz + * fix up headers for user-land, following ccmn cleanup + * + * Revision 1.38 1996/05/01 16:26:51 jimz + * don't include rf_ccmn.h (get ready to phase out) + * + * Revision 1.37 1995/12/12 18:10:06 jimz + * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT + * fix 80-column brain damage in comments + * + * Revision 1.36 1995/12/04 19:19:09 wvcii + * modified DiskReadMirrorFunc + * - added fifth parameter, physical disk address of mirror copy + * - SelectIdleDisk conditionally swaps parameters 0 & 4 + * + * Revision 1.35 1995/12/01 15:58:33 root + * added copyright info + * + * Revision 1.34 1995/11/17 18:12:17 amiri + * Changed DiskReadMirrorFunc to use the generic mapping routines + * to find the mirror of the data, function was assuming RAID level 1. 
+ * + * Revision 1.33 1995/11/17 15:15:59 wvcii + * changes in DiskReadMirrorFunc + * - added ASSERTs + * - added call to MapParityRAID1 + * + * Revision 1.32 1995/11/07 16:25:50 wvcii + * added DiskUnlockFuncForThreads + * general debugging of undo functions (first time they were used) + * + * Revision 1.31 1995/09/06 19:23:36 wvcii + * fixed tracing for parity logging nodes + * + * Revision 1.30 95/07/07 00:13:01 wvcii + * added 4th parameter to ParityLogAppend + * + */ + +#ifdef _KERNEL +#define KERNEL +#endif + +#ifndef KERNEL +#include <errno.h> +#endif /* !KERNEL */ + +#include <sys/ioctl.h> +#include <sys/param.h> + +#include "rf_archs.h" +#include "rf_raid.h" +#include "rf_dag.h" +#include "rf_layout.h" +#include "rf_etimer.h" +#include "rf_acctrace.h" +#include "rf_diskqueue.h" +#include "rf_dagfuncs.h" +#include "rf_general.h" +#include "rf_engine.h" +#include "rf_dagutils.h" + +#ifdef KERNEL +#include "rf_kintf.h" +#endif /* KERNEL */ + +#if RF_INCLUDE_PARITYLOGGING > 0 +#include "rf_paritylog.h" +#endif /* RF_INCLUDE_PARITYLOGGING > 0 */ + +int (*rf_DiskReadFunc)(RF_DagNode_t *); +int (*rf_DiskWriteFunc)(RF_DagNode_t *); +int (*rf_DiskReadUndoFunc)(RF_DagNode_t *); +int (*rf_DiskWriteUndoFunc)(RF_DagNode_t *); +int (*rf_DiskUnlockFunc)(RF_DagNode_t *); +int (*rf_DiskUnlockUndoFunc)(RF_DagNode_t *); +int (*rf_RegularXorUndoFunc)(RF_DagNode_t *); +int (*rf_SimpleXorUndoFunc)(RF_DagNode_t *); +int (*rf_RecoveryXorUndoFunc)(RF_DagNode_t *); + +/***************************************************************************************** + * main (only) configuration routine for this module + ****************************************************************************************/ +int rf_ConfigureDAGFuncs(listp) + RF_ShutdownList_t **listp; +{ + RF_ASSERT( ((sizeof(long)==8) && RF_LONGSHIFT==3) || ((sizeof(long)==4) && RF_LONGSHIFT==2) ); + rf_DiskReadFunc = rf_DiskReadFuncForThreads; + rf_DiskReadUndoFunc = rf_DiskUndoFunc; + rf_DiskWriteFunc = rf_DiskWriteFuncForThreads; + rf_DiskWriteUndoFunc = rf_DiskUndoFunc; + rf_DiskUnlockFunc = rf_DiskUnlockFuncForThreads; + rf_DiskUnlockUndoFunc = rf_NullNodeUndoFunc; + rf_RegularXorUndoFunc = rf_NullNodeUndoFunc; + rf_SimpleXorUndoFunc = rf_NullNodeUndoFunc; + rf_RecoveryXorUndoFunc = rf_NullNodeUndoFunc; + return(0); +} + + + +/***************************************************************************************** + * the execution function associated with a terminate node + ****************************************************************************************/ +int rf_TerminateFunc(node) + RF_DagNode_t *node; +{ + RF_ASSERT(node->dagHdr->numCommits == node->dagHdr->numCommitNodes); + node->status = rf_good; + return(rf_FinishNode(node, RF_THREAD_CONTEXT)); +} + +int rf_TerminateUndoFunc(node) + RF_DagNode_t *node; +{ + return(0); +} + + +/***************************************************************************************** + * execution functions associated with a mirror node + * + * parameters: + * + * 0 - physical disk addres of data + * 1 - buffer for holding read data + * 2 - parity stripe ID + * 3 - flags + * 4 - physical disk address of mirror (parity) + * + ****************************************************************************************/ + +int rf_DiskReadMirrorIdleFunc(node) + RF_DagNode_t *node; +{ + /* select the mirror copy with the shortest queue and fill in node parameters + with physical disk address */ + + rf_SelectMirrorDiskIdle(node); + return(rf_DiskReadFunc(node)); +} + +int 
rf_DiskReadMirrorPartitionFunc(node) + RF_DagNode_t *node; +{ + /* select the mirror copy with the shortest queue and fill in node parameters + with physical disk address */ + + rf_SelectMirrorDiskPartition(node); + return(rf_DiskReadFunc(node)); +} + +int rf_DiskReadMirrorUndoFunc(node) + RF_DagNode_t *node; +{ + return(0); +} + + + +#if RF_INCLUDE_PARITYLOGGING > 0 +/***************************************************************************************** + * the execution function associated with a parity log update node + ****************************************************************************************/ +int rf_ParityLogUpdateFunc(node) + RF_DagNode_t *node; +{ + RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p; + caddr_t buf = (caddr_t) node->params[1].p; + RF_ParityLogData_t *logData; + RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; + RF_Etimer_t timer; + + if (node->dagHdr->status == rf_enable) + { + RF_ETIMER_START(timer); + logData = rf_CreateParityLogData(RF_UPDATE, pda, buf, + (RF_Raid_t *) (node->dagHdr->raidPtr), + node->wakeFunc, (void *) node, + node->dagHdr->tracerec, timer); + if (logData) + rf_ParityLogAppend(logData, RF_FALSE, NULL, RF_FALSE); + else + { + RF_ETIMER_STOP(timer); RF_ETIMER_EVAL(timer); tracerec->plog_us += RF_ETIMER_VAL_US(timer); + (node->wakeFunc)(node, ENOMEM); + } + } + return(0); +} + + +/***************************************************************************************** + * the execution function associated with a parity log overwrite node + ****************************************************************************************/ +int rf_ParityLogOverwriteFunc(node) + RF_DagNode_t *node; +{ + RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p; + caddr_t buf = (caddr_t) node->params[1].p; + RF_ParityLogData_t *logData; + RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; + RF_Etimer_t timer; + + if (node->dagHdr->status == rf_enable) + { + RF_ETIMER_START(timer); + logData = rf_CreateParityLogData(RF_OVERWRITE, pda, buf, (RF_Raid_t *) (node->dagHdr->raidPtr), + node->wakeFunc, (void *) node, node->dagHdr->tracerec, timer); + if (logData) + rf_ParityLogAppend(logData, RF_FALSE, NULL, RF_FALSE); + else + { + RF_ETIMER_STOP(timer); RF_ETIMER_EVAL(timer); tracerec->plog_us += RF_ETIMER_VAL_US(timer); + (node->wakeFunc)(node, ENOMEM); + } + } + return(0); +} + +#else /* RF_INCLUDE_PARITYLOGGING > 0 */ + +int rf_ParityLogUpdateFunc(node) + RF_DagNode_t *node; +{ + return(0); +} +int rf_ParityLogOverwriteFunc(node) + RF_DagNode_t *node; +{ + return(0); +} + +#endif /* RF_INCLUDE_PARITYLOGGING > 0 */ + +int rf_ParityLogUpdateUndoFunc(node) + RF_DagNode_t *node; +{ + return(0); +} + +int rf_ParityLogOverwriteUndoFunc(node) + RF_DagNode_t *node; +{ + return(0); +} + +/***************************************************************************************** + * the execution function associated with a NOP node + ****************************************************************************************/ +int rf_NullNodeFunc(node) + RF_DagNode_t *node; +{ + node->status = rf_good; + return(rf_FinishNode(node, RF_THREAD_CONTEXT)); +} + +int rf_NullNodeUndoFunc(node) + RF_DagNode_t *node; +{ + node->status = rf_undone; + return(rf_FinishNode(node, RF_THREAD_CONTEXT)); +} + + +/***************************************************************************************** + * the execution function associated with a disk-read node + 
****************************************************************************************/ +int rf_DiskReadFuncForThreads(node) + RF_DagNode_t *node; +{ + RF_DiskQueueData_t *req; + RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *)node->params[0].p; + caddr_t buf = (caddr_t)node->params[1].p; + RF_StripeNum_t parityStripeID = (RF_StripeNum_t)node->params[2].v; + unsigned priority = RF_EXTRACT_PRIORITY(node->params[3].v); + unsigned lock = RF_EXTRACT_LOCK_FLAG(node->params[3].v); + unsigned unlock = RF_EXTRACT_UNLOCK_FLAG(node->params[3].v); + unsigned which_ru = RF_EXTRACT_RU(node->params[3].v); + RF_DiskQueueDataFlags_t flags = 0; + RF_IoType_t iotype = (node->dagHdr->status == rf_enable) ? RF_IO_TYPE_READ : RF_IO_TYPE_NOP; + RF_DiskQueue_t **dqs = ((RF_Raid_t *) (node->dagHdr->raidPtr))->Queues; + void *b_proc = NULL; +#if RF_BACKWARD > 0 + caddr_t undoBuf; +#endif + +#ifdef KERNEL + if (node->dagHdr->bp) b_proc = (void *) ((struct buf *) node->dagHdr->bp)->b_proc; +#endif /* KERNEL */ + + RF_ASSERT( !(lock && unlock) ); + flags |= (lock) ? RF_LOCK_DISK_QUEUE : 0; + flags |= (unlock) ? RF_UNLOCK_DISK_QUEUE : 0; +#if RF_BACKWARD > 0 + /* allocate and zero the undo buffer. + * this is equivalent to copying the original buffer's contents to the undo buffer + * prior to performing the disk read. + * XXX hardcoded 512 bytes per sector! + */ + if (node->dagHdr->allocList == NULL) + rf_MakeAllocList(node->dagHdr->allocList); + RF_CallocAndAdd(undoBuf, 1, 512 * pda->numSector, (caddr_t), node->dagHdr->allocList); +#endif /* RF_BACKWARD > 0 */ + req = rf_CreateDiskQueueData(iotype, pda->startSector, pda->numSector, + buf, parityStripeID, which_ru, + (int (*)(void *,int)) node->wakeFunc, + node, NULL, node->dagHdr->tracerec, + (void *)(node->dagHdr->raidPtr), flags, b_proc); + if (!req) { + (node->wakeFunc)(node, ENOMEM); + } else { + node->dagFuncData = (void *) req; + rf_DiskIOEnqueue( &(dqs[pda->row][pda->col]), req, priority ); + } + return(0); +} + + +/***************************************************************************************** + * the execution function associated with a disk-write node + ****************************************************************************************/ +int rf_DiskWriteFuncForThreads(node) + RF_DagNode_t *node; +{ + RF_DiskQueueData_t *req; + RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *)node->params[0].p; + caddr_t buf = (caddr_t)node->params[1].p; + RF_StripeNum_t parityStripeID = (RF_StripeNum_t)node->params[2].v; + unsigned priority = RF_EXTRACT_PRIORITY(node->params[3].v); + unsigned lock = RF_EXTRACT_LOCK_FLAG(node->params[3].v); + unsigned unlock = RF_EXTRACT_UNLOCK_FLAG(node->params[3].v); + unsigned which_ru = RF_EXTRACT_RU(node->params[3].v); + RF_DiskQueueDataFlags_t flags = 0; + RF_IoType_t iotype = (node->dagHdr->status == rf_enable) ? RF_IO_TYPE_WRITE : RF_IO_TYPE_NOP; + RF_DiskQueue_t **dqs = ((RF_Raid_t *) (node->dagHdr->raidPtr))->Queues; + void *b_proc = NULL; +#if RF_BACKWARD > 0 + caddr_t undoBuf; +#endif + +#ifdef KERNEL + if (node->dagHdr->bp) b_proc = (void *) ((struct buf *) node->dagHdr->bp)->b_proc; +#endif /* KERNEL */ + +#if RF_BACKWARD > 0 + /* This area is used only for backward error recovery experiments + * First, schedule allocate a buffer and schedule a pre-read of the disk + * After the pre-read, proceed with the normal disk write + */ + if (node->status == rf_bwd2) { + /* just finished undo logging, now perform real function */ + node->status = rf_fired; + RF_ASSERT( !(lock && unlock) ); + flags |= (lock) ? 
RF_LOCK_DISK_QUEUE : 0; + flags |= (unlock) ? RF_UNLOCK_DISK_QUEUE : 0; + req = rf_CreateDiskQueueData(iotype, + pda->startSector, pda->numSector, buf, parityStripeID, which_ru, + node->wakeFunc, (void *) node, NULL, node->dagHdr->tracerec, + (void *) (node->dagHdr->raidPtr), flags, b_proc); + + if (!req) { + (node->wakeFunc)(node, ENOMEM); + } else { + node->dagFuncData = (void *) req; + rf_DiskIOEnqueue( &(dqs[pda->row][pda->col]), req, priority ); + } + } + + else { + /* node status should be rf_fired */ + /* schedule a disk pre-read */ + node->status = rf_bwd1; + RF_ASSERT( !(lock && unlock) ); + flags |= (lock) ? RF_LOCK_DISK_QUEUE : 0; + flags |= (unlock) ? RF_UNLOCK_DISK_QUEUE : 0; + if (node->dagHdr->allocList == NULL) + rf_MakeAllocList(node->dagHdr->allocList); + RF_CallocAndAdd(undoBuf, 1, 512 * pda->numSector, (caddr_t), node->dagHdr->allocList); + req = rf_CreateDiskQueueData(RF_IO_TYPE_READ, + pda->startSector, pda->numSector, undoBuf, parityStripeID, which_ru, + node->wakeFunc, (void *) node, NULL, node->dagHdr->tracerec, + (void *) (node->dagHdr->raidPtr), flags, b_proc); + + if (!req) { + (node->wakeFunc)(node, ENOMEM); + } else { + node->dagFuncData = (void *) req; + rf_DiskIOEnqueue( &(dqs[pda->row][pda->col]), req, priority ); + } + } + return(0); +#endif /* RF_BACKWARD > 0 */ + + /* normal processing (rollaway or forward recovery) begins here */ + RF_ASSERT( !(lock && unlock) ); + flags |= (lock) ? RF_LOCK_DISK_QUEUE : 0; + flags |= (unlock) ? RF_UNLOCK_DISK_QUEUE : 0; + req = rf_CreateDiskQueueData(iotype, pda->startSector, pda->numSector, + buf, parityStripeID, which_ru, + (int (*)(void *,int)) node->wakeFunc, + (void *) node, NULL, + node->dagHdr->tracerec, + (void *) (node->dagHdr->raidPtr), + flags, b_proc); + + if (!req) { + (node->wakeFunc)(node, ENOMEM); + } else { + node->dagFuncData = (void *) req; + rf_DiskIOEnqueue( &(dqs[pda->row][pda->col]), req, priority ); + } + + return(0); +} + +/***************************************************************************************** + * the undo function for disk nodes + * Note: this is not a proper undo of a write node, only locks are released. + * old data is not restored to disk! 
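The note above that old data is not restored by the undo is what the RF_BACKWARD path in the write function compensates for: when undo logging is compiled in, the old contents are pre-read before the write is ever issued. A descriptive sketch of the resulting sequence, using the status values and functions defined in this file:

	/*
	 * RF_BACKWARD write sequence (sketch only):
	 *
	 *   rf_fired -> rf_DiskWriteFuncForThreads() sets rf_bwd1 and enqueues
	 *               a READ of the old data into undoBuf
	 *   rf_bwd1  -> rf_GenericWakeupFunc() sets rf_bwd2, frees the request,
	 *               and calls rf_DiskWriteFuncForThreads() again
	 *   rf_bwd2  -> the write function resets the status to rf_fired and
	 *               enqueues the real WRITE
	 *   rf_fired -> rf_GenericWakeupFunc() records rf_good or rf_bad and
	 *               calls rf_FinishNode()
	 */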
+ ****************************************************************************************/ +int rf_DiskUndoFunc(node) + RF_DagNode_t *node; +{ + RF_DiskQueueData_t *req; + RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *)node->params[0].p; + RF_DiskQueue_t **dqs = ((RF_Raid_t *) (node->dagHdr->raidPtr))->Queues; + + req = rf_CreateDiskQueueData(RF_IO_TYPE_NOP, + 0L, 0, NULL, 0L, 0, + (int (*)(void *,int)) node->wakeFunc, + (void *) node, + NULL, node->dagHdr->tracerec, + (void *) (node->dagHdr->raidPtr), + RF_UNLOCK_DISK_QUEUE, NULL); + if (!req) + (node->wakeFunc)(node, ENOMEM); + else { + node->dagFuncData = (void *) req; + rf_DiskIOEnqueue( &(dqs[pda->row][pda->col]), req, RF_IO_NORMAL_PRIORITY ); + } + + return(0); +} + +/***************************************************************************************** + * the execution function associated with an "unlock disk queue" node + ****************************************************************************************/ +int rf_DiskUnlockFuncForThreads(node) + RF_DagNode_t *node; +{ + RF_DiskQueueData_t *req; + RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *)node->params[0].p; + RF_DiskQueue_t **dqs = ((RF_Raid_t *) (node->dagHdr->raidPtr))->Queues; + + req = rf_CreateDiskQueueData(RF_IO_TYPE_NOP, + 0L, 0, NULL, 0L, 0, + (int (*)(void *,int)) node->wakeFunc, + (void *) node, + NULL, node->dagHdr->tracerec, + (void *) (node->dagHdr->raidPtr), + RF_UNLOCK_DISK_QUEUE, NULL); + if (!req) + (node->wakeFunc)(node, ENOMEM); + else { + node->dagFuncData = (void *) req; + rf_DiskIOEnqueue( &(dqs[pda->row][pda->col]), req, RF_IO_NORMAL_PRIORITY ); + } + + return(0); +} + +/***************************************************************************************** + * Callback routine for DiskRead and DiskWrite nodes. When the disk op completes, + * the routine is called to set the node status and inform the execution engine that + * the node has fired. + ****************************************************************************************/ +int rf_GenericWakeupFunc(node, status) + RF_DagNode_t *node; + int status; +{ + switch (node->status) { + case rf_bwd1 : + node->status = rf_bwd2; + if (node->dagFuncData) + rf_FreeDiskQueueData((RF_DiskQueueData_t *) node->dagFuncData); + return(rf_DiskWriteFuncForThreads(node)); + break; + case rf_fired : + if (status) node->status = rf_bad; + else node->status = rf_good; + break; + case rf_recover : + /* probably should never reach this case */ + if (status) node->status = rf_panic; + else node->status = rf_undone; + break; + default : + RF_PANIC(); + break; + } + if (node->dagFuncData) + rf_FreeDiskQueueData((RF_DiskQueueData_t *) node->dagFuncData); + return(rf_FinishNode(node, RF_INTR_CONTEXT)); +} + + +/***************************************************************************************** + * there are three distinct types of xor nodes + * A "regular xor" is used in the fault-free case where the access spans a complete + * stripe unit. It assumes that the result buffer is one full stripe unit in size, + * and uses the stripe-unit-offset values that it computes from the PDAs to determine + * where within the stripe unit to XOR each argument buffer. + * + * A "simple xor" is used in the fault-free case where the access touches only a portion + * of one (or two, in some cases) stripe unit(s). It assumes that all the argument + * buffers are of the same size and have the same stripe unit offset. + * + * A "recovery xor" is used in the degraded-mode case. 
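A brief note on the wakeup convention used by all of the disk nodes above: the module that actually performs the I/O reports completion by calling the node's wakeFunc, which for these nodes is rf_GenericWakeupFunc. A minimal sketch of such a completion hook (the function name is hypothetical):

	/* Hypothetical completion hook: pass the finished node back to the DAG
	 * engine through its wake function (rf_GenericWakeupFunc for disk
	 * nodes), with 0 for success or an errno-style code for failure. */
	static int example_io_done(RF_DagNode_t *node, int error)
	{
		return ((node->wakeFunc)(node, error));
	}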
It's similar to the regular + * xor function except that it takes the failed PDA as an additional parameter, and + * uses it to determine what portions of the argument buffers need to be xor'd into + * the result buffer, and where in the result buffer they should go. + ****************************************************************************************/ + +/* xor the params together and store the result in the result field. + * assume the result field points to a buffer that is the size of one SU, + * and use the pda params to determine where within the buffer to XOR + * the input buffers. + */ +int rf_RegularXorFunc(node) + RF_DagNode_t *node; +{ + RF_Raid_t *raidPtr = (RF_Raid_t *)node->params[node->numParams-1].p; + RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; + RF_Etimer_t timer; + int i, retcode; +#if RF_BACKWARD > 0 + RF_PhysDiskAddr_t *pda; + caddr_t undoBuf; +#endif + + retcode = 0; + if (node->dagHdr->status == rf_enable) { + /* don't do the XOR if the input is the same as the output */ + RF_ETIMER_START(timer); + for (i=0; i<node->numParams-1; i+=2) if (node->params[i+1].p != node->results[0]) { +#if RF_BACKWARD > 0 + /* This section mimics undo logging for backward error recovery experiments b + * allocating and initializing a buffer + * XXX 512 byte sector size is hard coded! + */ + pda = node->params[i].p; + if (node->dagHdr->allocList == NULL) + rf_MakeAllocList(node->dagHdr->allocList); + RF_CallocAndAdd(undoBuf, 1, 512 * pda->numSector, (caddr_t), node->dagHdr->allocList); +#endif /* RF_BACKWARD > 0 */ + retcode = rf_XorIntoBuffer(raidPtr, (RF_PhysDiskAddr_t *) node->params[i].p, + (char *)node->params[i+1].p, (char *) node->results[0], node->dagHdr->bp); + } + RF_ETIMER_STOP(timer); RF_ETIMER_EVAL(timer); tracerec->xor_us += RF_ETIMER_VAL_US(timer); + } + return(rf_GenericWakeupFunc(node, retcode)); /* call wake func explicitly since no I/O in this node */ +} + +/* xor the inputs into the result buffer, ignoring placement issues */ +int rf_SimpleXorFunc(node) + RF_DagNode_t *node; +{ + RF_Raid_t *raidPtr = (RF_Raid_t *)node->params[node->numParams-1].p; + int i, retcode = 0; + RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; + RF_Etimer_t timer; +#if RF_BACKWARD > 0 + RF_PhysDiskAddr_t *pda; + caddr_t undoBuf; +#endif + + if (node->dagHdr->status == rf_enable) { + RF_ETIMER_START(timer); + /* don't do the XOR if the input is the same as the output */ + for (i=0; i<node->numParams-1; i+=2) if (node->params[i+1].p != node->results[0]) { +#if RF_BACKWARD > 0 + /* This section mimics undo logging for backward error recovery experiments b + * allocating and initializing a buffer + * XXX 512 byte sector size is hard coded! + */ + pda = node->params[i].p; + if (node->dagHdr->allocList == NULL) + rf_MakeAllocList(node->dagHdr->allocList); + RF_CallocAndAdd(undoBuf, 1, 512 * pda->numSector, (caddr_t), node->dagHdr->allocList); +#endif /* RF_BACKWARD > 0 */ + retcode = rf_bxor((char *)node->params[i+1].p, (char *) node->results[0], + rf_RaidAddressToByte(raidPtr, ((RF_PhysDiskAddr_t *)node->params[i].p)->numSector), + (struct buf *) node->dagHdr->bp); + } + RF_ETIMER_STOP(timer); RF_ETIMER_EVAL(timer); tracerec->xor_us += RF_ETIMER_VAL_US(timer); + } + + return(rf_GenericWakeupFunc(node, retcode)); /* call wake func explicitly since no I/O in this node */ +} + +/* this xor is used by the degraded-mode dag functions to recover lost data. + * the second-to-last parameter is the PDA for the failed portion of the access. 
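Both fault-free XOR functions above consume their parameters as (physical-disk-address, buffer) pairs, with the RF_Raid_t pointer stored as the final parameter. A minimal sketch of that convention, pulled out as a stand-alone walker (walk_xor_params and process_pair are illustrative names, not part of RAIDframe):

	/* Illustrative walker for the XOR-node parameter layout: params[] holds
	 * (pda, buffer) pairs and the last entry is the RF_Raid_t pointer.
	 * process_pair() stands in for the per-pair work (e.g. rf_XorIntoBuffer). */
	static void walk_xor_params(RF_DagNode_t *node,
	    void (*process_pair)(RF_Raid_t *, RF_PhysDiskAddr_t *, char *))
	{
		RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
		int i;

		for (i = 0; i < node->numParams - 1; i += 2) {
			RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[i].p;
			char *buf = (char *) node->params[i + 1].p;

			if (buf != (char *) node->results[0])	/* skip in-place inputs */
				process_pair(raidPtr, pda, buf);
		}
	}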
+ * the code here looks at this PDA and assumes that the xor target buffer is + * equal in size to the number of sectors in the failed PDA. It then uses + * the other PDAs in the parameter list to determine where within the target + * buffer the corresponding data should be xored. + */ +int rf_RecoveryXorFunc(node) + RF_DagNode_t *node; +{ + RF_Raid_t *raidPtr = (RF_Raid_t *)node->params[node->numParams-1].p; + RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &raidPtr->Layout; + RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *)node->params[node->numParams-2].p; + int i, retcode = 0; + RF_PhysDiskAddr_t *pda; + int suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr,failedPDA->startSector); + char *srcbuf, *destbuf; + RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; + RF_Etimer_t timer; +#if RF_BACKWARD > 0 + caddr_t undoBuf; +#endif + + if (node->dagHdr->status == rf_enable) { + RF_ETIMER_START(timer); + for (i=0; i<node->numParams-2; i+=2) if (node->params[i+1].p != node->results[0]) { + pda = (RF_PhysDiskAddr_t *)node->params[i].p; +#if RF_BACKWARD > 0 + /* This section mimics undo logging for backward error recovery experiments b + * allocating and initializing a buffer + * XXX 512 byte sector size is hard coded! + */ + if (node->dagHdr->allocList == NULL) + rf_MakeAllocList(node->dagHdr->allocList); + RF_CallocAndAdd(undoBuf, 1, 512 * pda->numSector, (caddr_t), node->dagHdr->allocList); +#endif /* RF_BACKWARD > 0 */ + srcbuf = (char *)node->params[i+1].p; + suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector); + destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr,suoffset-failedSUOffset); + retcode = rf_bxor(srcbuf, destbuf, rf_RaidAddressToByte(raidPtr, pda->numSector), node->dagHdr->bp); + } + RF_ETIMER_STOP(timer); RF_ETIMER_EVAL(timer); tracerec->xor_us += RF_ETIMER_VAL_US(timer); + } + return (rf_GenericWakeupFunc(node, retcode)); +} + +/***************************************************************************************** + * The next three functions are utilities used by the above xor-execution functions. + ****************************************************************************************/ + + +/* + * this is just a glorified buffer xor. targbuf points to a buffer that is one full stripe unit + * in size. srcbuf points to a buffer that may be less than 1 SU, but never more. When the + * access described by pda is one SU in size (which by implication means it's SU-aligned), + * all that happens is (targbuf) <- (srcbuf ^ targbuf). When the access is less than one + * SU in size the XOR occurs on only the portion of targbuf identified in the pda. + */ + +int rf_XorIntoBuffer(raidPtr, pda, srcbuf, targbuf, bp) + RF_Raid_t *raidPtr; + RF_PhysDiskAddr_t *pda; + char *srcbuf; + char *targbuf; + void *bp; +{ + char *targptr; + int sectPerSU = raidPtr->Layout.sectorsPerStripeUnit; + int SUOffset = pda->startSector % sectPerSU; + int length, retcode = 0; + + RF_ASSERT(pda->numSector <= sectPerSU); + + targptr = targbuf + rf_RaidAddressToByte(raidPtr, SUOffset); + length = rf_RaidAddressToByte(raidPtr, pda->numSector); + retcode = rf_bxor(srcbuf, targptr, length, bp); + return(retcode); +} + +/* it really should be the case that the buffer pointers (returned by malloc) + * are aligned to the natural word size of the machine, so this is the only + * case we optimize for. The length should always be a multiple of the sector + * size, so there should be no problem with leftover bytes at the end. 
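A concrete instance of the alignment assumption just stated: with 64-bit longs (sizeof(long) == 8 and RF_LONGSHIFT == 3, one of the two combinations asserted in rf_ConfigureDAGFuncs earlier in this file) and 512-byte sectors, a one-sector xor passes the word-alignment check in rf_bxor and is handed to rf_longword_bxor as 64 longword operations:

	/* Worked example under the stated assumptions (8-byte longs, 512-byte sectors). */
	int len = 512;				/* one sector of data */
	unsigned mask = sizeof(long) - 1;	/* 0x7                */
	/* (len & mask) == 0, so rf_bxor takes the longword path:             */
	/* rf_longword_bxor(src, dest, len >> RF_LONGSHIFT, bp), 512 >> 3 = 64 longwords */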
+ */ +int rf_bxor(src, dest, len, bp) + char *src; + char *dest; + int len; + void *bp; +{ + unsigned mask = sizeof(long) -1, retcode = 0; + + if ( !(((unsigned long) src) & mask) && !(((unsigned long) dest) & mask) && !(len&mask) ) { + retcode = rf_longword_bxor((unsigned long *) src, (unsigned long *) dest, len>>RF_LONGSHIFT, bp); + } else { + RF_ASSERT(0); + } + return(retcode); +} + +/* map a user buffer into kernel space, if necessary */ +#ifdef KERNEL +#if defined(__NetBSD__) || defined(__OpenBSD__) +/* XXX Not a clue if this is even close.. */ +#define REMAP_VA(_bp,x,y) (y) = (x) +#else +#define REMAP_VA(_bp,x,y) (y) = (unsigned long *) ((IS_SYS_VA(x)) ? (unsigned long *)(x) : (unsigned long *) rf_MapToKernelSpace((struct buf *) (_bp), (caddr_t)(x))) +#endif /* __NetBSD__ || __OpenBSD__ */ +#else /* KERNEL */ +#define REMAP_VA(_bp,x,y) (y) = (x) +#endif /* KERNEL */ + +/* When XORing in kernel mode, we need to map each user page to kernel space before we can access it. + * We don't want to assume anything about which input buffers are in kernel/user + * space, nor about their alignment, so in each loop we compute the maximum number + * of bytes that we can xor without crossing any page boundaries, and do only this many + * bytes before the next remap. + */ +int rf_longword_bxor(src, dest, len, bp) + register unsigned long *src; + register unsigned long *dest; + int len; /* longwords */ + void *bp; +{ + register unsigned long *end = src+len; + register unsigned long d0, d1, d2, d3, s0, s1, s2, s3; /* temps */ + register unsigned long *pg_src, *pg_dest; /* per-page source/dest pointers */ + int longs_this_time; /* # longwords to xor in the current iteration */ + + REMAP_VA(bp, src, pg_src); + REMAP_VA(bp, dest, pg_dest); + if (!pg_src || !pg_dest) return(EFAULT); + + while (len >= 4 ) { + longs_this_time = RF_MIN(len, RF_MIN(RF_BLIP(pg_src), RF_BLIP(pg_dest)) >> RF_LONGSHIFT); /* note len in longwords */ + src += longs_this_time; dest+= longs_this_time; len -= longs_this_time; + while (longs_this_time >= 4) { + d0 = pg_dest[0]; + d1 = pg_dest[1]; + d2 = pg_dest[2]; + d3 = pg_dest[3]; + s0 = pg_src[0]; + s1 = pg_src[1]; + s2 = pg_src[2]; + s3 = pg_src[3]; + pg_dest[0] = d0 ^ s0; + pg_dest[1] = d1 ^ s1; + pg_dest[2] = d2 ^ s2; + pg_dest[3] = d3 ^ s3; + pg_src += 4; + pg_dest += 4; + longs_this_time -= 4; + } + while (longs_this_time > 0) { /* cannot cross any page boundaries here */ + *pg_dest++ ^= *pg_src++; + longs_this_time--; + } + + /* either we're done, or we've reached a page boundary on one (or possibly both) of the pointers */ + if (len) { + if (RF_PAGE_ALIGNED(src)) REMAP_VA(bp, src, pg_src); + if (RF_PAGE_ALIGNED(dest)) REMAP_VA(bp, dest, pg_dest); + if (!pg_src || !pg_dest) return(EFAULT); + } + } + while (src < end) { + *pg_dest++ ^= *pg_src++; + src++; dest++; len--; + if (RF_PAGE_ALIGNED(src)) REMAP_VA(bp, src, pg_src); + if (RF_PAGE_ALIGNED(dest)) REMAP_VA(bp, dest, pg_dest); + } + RF_ASSERT(len == 0); + return(0); +} + + +/* + dst = a ^ b ^ c; + a may equal dst + see comment above longword_bxor +*/ +int rf_longword_bxor3(dst,a,b,c,len, bp) + register unsigned long *dst; + register unsigned long *a; + register unsigned long *b; + register unsigned long *c; + int len; /* length in longwords */ + void *bp; +{ + unsigned long a0,a1,a2,a3, b0,b1,b2,b3; + register unsigned long *pg_a, *pg_b, *pg_c, *pg_dst; /* per-page source/dest pointers */ + int longs_this_time; /* # longs to xor in the current iteration */ + char dst_is_a = 0; + + REMAP_VA(bp, a, pg_a); + REMAP_VA(bp, b, 
pg_b); + REMAP_VA(bp, c, pg_c); + if (a == dst) {pg_dst = pg_a; dst_is_a = 1;} else { REMAP_VA(bp, dst, pg_dst); } + + /* align dest to cache line. Can't cross a pg boundary on dst here. */ + while ((((unsigned long) pg_dst) & 0x1f)) { + *pg_dst++ = *pg_a++ ^ *pg_b++ ^ *pg_c++; + dst++; a++; b++; c++; + if (RF_PAGE_ALIGNED(a)) {REMAP_VA(bp, a, pg_a); if (!pg_a) return(EFAULT);} + if (RF_PAGE_ALIGNED(b)) {REMAP_VA(bp, a, pg_b); if (!pg_b) return(EFAULT);} + if (RF_PAGE_ALIGNED(c)) {REMAP_VA(bp, a, pg_c); if (!pg_c) return(EFAULT);} + len--; + } + + while (len > 4 ) { + longs_this_time = RF_MIN(len, RF_MIN(RF_BLIP(a), RF_MIN(RF_BLIP(b), RF_MIN(RF_BLIP(c), RF_BLIP(dst)))) >> RF_LONGSHIFT); + a+= longs_this_time; b+= longs_this_time; c+= longs_this_time; dst+=longs_this_time; len-=longs_this_time; + while (longs_this_time >= 4) { + a0 = pg_a[0]; longs_this_time -= 4; + + a1 = pg_a[1]; + a2 = pg_a[2]; + + a3 = pg_a[3]; pg_a += 4; + + b0 = pg_b[0]; + b1 = pg_b[1]; + + b2 = pg_b[2]; + b3 = pg_b[3]; + /* start dual issue */ + a0 ^= b0; b0 = pg_c[0]; + + pg_b += 4; a1 ^= b1; + + a2 ^= b2; a3 ^= b3; + + b1 = pg_c[1]; a0 ^= b0; + + b2 = pg_c[2]; a1 ^= b1; + + b3 = pg_c[3]; a2 ^= b2; + + pg_dst[0] = a0; a3 ^= b3; + pg_dst[1] = a1; pg_c += 4; + pg_dst[2] = a2; + pg_dst[3] = a3; pg_dst += 4; + } + while (longs_this_time > 0) { /* cannot cross any page boundaries here */ + *pg_dst++ = *pg_a++ ^ *pg_b++ ^ *pg_c++; + longs_this_time--; + } + + if (len) { + if (RF_PAGE_ALIGNED(a)) {REMAP_VA(bp, a, pg_a); if (!pg_a) return(EFAULT); if (dst_is_a) pg_dst = pg_a;} + if (RF_PAGE_ALIGNED(b)) {REMAP_VA(bp, b, pg_b); if (!pg_b) return(EFAULT);} + if (RF_PAGE_ALIGNED(c)) {REMAP_VA(bp, c, pg_c); if (!pg_c) return(EFAULT);} + if (!dst_is_a) if (RF_PAGE_ALIGNED(dst)) {REMAP_VA(bp, dst, pg_dst); if (!pg_dst) return(EFAULT);} + } + } + while (len) { + *pg_dst++ = *pg_a++ ^ *pg_b++ ^ *pg_c++; + dst++; a++; b++; c++; + if (RF_PAGE_ALIGNED(a)) {REMAP_VA(bp, a, pg_a); if (!pg_a) return(EFAULT); if (dst_is_a) pg_dst = pg_a;} + if (RF_PAGE_ALIGNED(b)) {REMAP_VA(bp, b, pg_b); if (!pg_b) return(EFAULT);} + if (RF_PAGE_ALIGNED(c)) {REMAP_VA(bp, c, pg_c); if (!pg_c) return(EFAULT);} + if (!dst_is_a) if (RF_PAGE_ALIGNED(dst)) {REMAP_VA(bp, dst, pg_dst); if (!pg_dst) return(EFAULT);} + len--; + } + return(0); +} + +int rf_bxor3(dst,a,b,c,len, bp) + register unsigned char *dst; + register unsigned char *a; + register unsigned char *b; + register unsigned char *c; + unsigned long len; + void *bp; +{ + RF_ASSERT(((RF_UL(dst)|RF_UL(a)|RF_UL(b)|RF_UL(c)|len) & 0x7) == 0); + + return(rf_longword_bxor3((unsigned long *)dst, (unsigned long *)a, + (unsigned long *)b, (unsigned long *)c, len>>RF_LONGSHIFT, bp)); +} diff --git a/sys/dev/raidframe/rf_dagfuncs.h b/sys/dev/raidframe/rf_dagfuncs.h new file mode 100644 index 00000000000..ab19b712421 --- /dev/null +++ b/sys/dev/raidframe/rf_dagfuncs.h @@ -0,0 +1,138 @@ +/* $OpenBSD: rf_dagfuncs.h,v 1.1 1999/01/11 14:29:11 niklas Exp $ */ +/* $NetBSD: rf_dagfuncs.h,v 1.1 1998/11/13 04:20:28 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland, William V. Courtright II, Jim Zelenka + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. 
+ * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/***************************************************************************************** + * + * dagfuncs.h -- header file for DAG node execution routines + * + ****************************************************************************************/ + +/* + * : + * Log: rf_dagfuncs.h,v + * Revision 1.17 1996/07/22 19:52:16 jimz + * switched node params to RF_DagParam_t, a union of + * a 64-bit int and a void *, for better portability + * attempted hpux port, but failed partway through for + * lack of a single C compiler capable of compiling all + * source files + * + * Revision 1.16 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). + * + * Revision 1.15 1996/06/06 17:27:20 jimz + * added another read mirror func (partitioning), changed names so dag + * creation routines can use the appropriate one + * + * Revision 1.14 1996/05/30 11:29:41 jimz + * Numerous bug fixes. Stripe lock release code disagreed with the taking code + * about when stripes should be locked (I made it consistent: no parity, no lock) + * There was a lot of extra serialization of I/Os which I've removed- a lot of + * it was to calculate values for the cache code, which is no longer with us. + * More types, function, macro cleanup. Added code to properly quiesce the array + * on shutdown. Made a lot of stuff array-specific which was (bogusly) general + * before. Fixed memory allocation, freeing bugs. 
+ * + * Revision 1.13 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.12 1996/05/24 04:28:55 jimz + * release cleanup ckpt + * + * Revision 1.11 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.10 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.9 1995/12/01 15:56:46 root + * added copyright info + * + * Revision 1.8 1995/11/07 16:25:23 wvcii + * added DiskUnlockFuncForThreads + * + */ + +#ifndef _RF__RF_DAGFUNCS_H_ +#define _RF__RF_DAGFUNCS_H_ + +int rf_ConfigureDAGFuncs(RF_ShutdownList_t **listp); +int rf_TerminateFunc(RF_DagNode_t *node); +int rf_TerminateUndoFunc(RF_DagNode_t *node); +int rf_DiskReadMirrorIdleFunc(RF_DagNode_t *node); +int rf_DiskReadMirrorPartitionFunc(RF_DagNode_t *node); +int rf_DiskReadMirrorUndoFunc(RF_DagNode_t *node); +int rf_ParityLogUpdateFunc(RF_DagNode_t *node); +int rf_ParityLogOverwriteFunc(RF_DagNode_t *node); +int rf_ParityLogUpdateUndoFunc(RF_DagNode_t *node); +int rf_ParityLogOverwriteUndoFunc(RF_DagNode_t *node); +int rf_NullNodeFunc(RF_DagNode_t *node); +int rf_NullNodeUndoFunc(RF_DagNode_t *node); +int rf_DiskReadFuncForThreads(RF_DagNode_t *node); +int rf_DiskWriteFuncForThreads(RF_DagNode_t *node); +int rf_DiskUndoFunc(RF_DagNode_t *node); +int rf_DiskUnlockFuncForThreads(RF_DagNode_t *node); +int rf_GenericWakeupFunc(RF_DagNode_t *node, int status); +int rf_RegularXorFunc(RF_DagNode_t *node); +int rf_SimpleXorFunc(RF_DagNode_t *node); +int rf_RecoveryXorFunc(RF_DagNode_t *node); +int rf_XorIntoBuffer(RF_Raid_t *raidPtr, RF_PhysDiskAddr_t *pda, char *srcbuf, + char *targbuf, void *bp); +int rf_bxor(char *src, char *dest, int len, void *bp); +int rf_longword_bxor(register unsigned long *src, register unsigned long *dest, + int len, void *bp); +int rf_longword_bxor3(register unsigned long *dest, register unsigned long *a, + register unsigned long *b, register unsigned long *c, int len, void *bp); +int rf_bxor3(unsigned char *dst, unsigned char *a, unsigned char *b, + unsigned char *c, unsigned long len, void *bp); + +/* function ptrs defined in ConfigureDAGFuncs() */ +extern int (*rf_DiskReadFunc)(RF_DagNode_t *); +extern int (*rf_DiskWriteFunc)(RF_DagNode_t *); +extern int (*rf_DiskReadUndoFunc)(RF_DagNode_t *); +extern int (*rf_DiskWriteUndoFunc)(RF_DagNode_t *); +extern int (*rf_DiskUnlockFunc)(RF_DagNode_t *); +extern int (*rf_DiskUnlockUndoFunc)(RF_DagNode_t *); +extern int (*rf_SimpleXorUndoFunc)(RF_DagNode_t *); +extern int (*rf_RegularXorUndoFunc)(RF_DagNode_t *); +extern int (*rf_RecoveryXorUndoFunc)(RF_DagNode_t *); + +/* macros for manipulating the param[3] in a read or write node */ +#define RF_CREATE_PARAM3(pri, lk, unlk, wru) (((RF_uint64)(((wru&0xFFFFFF)<<8)|((lk)?0x10:0)|((unlk)?0x20:0)|((pri)&0xF)) )) +#define RF_EXTRACT_PRIORITY(_x_) ((((unsigned) ((unsigned long)(_x_))) >> 0) & 0x0F) +#define RF_EXTRACT_LOCK_FLAG(_x_) ((((unsigned) ((unsigned long)(_x_))) >> 4) & 0x1) +#define RF_EXTRACT_UNLOCK_FLAG(_x_) ((((unsigned) ((unsigned long)(_x_))) >> 5) & 0x1) +#define RF_EXTRACT_RU(_x_) ((((unsigned) ((unsigned long)(_x_))) >> 8) & 0xFFFFFF) + +#endif /* !_RF__RF_DAGFUNCS_H_ */ diff --git a/sys/dev/raidframe/rf_dagutils.c b/sys/dev/raidframe/rf_dagutils.c new file mode 100644 index 00000000000..b050b832af6 --- /dev/null +++ b/sys/dev/raidframe/rf_dagutils.c @@ -0,0 +1,1406 @@ 
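One note on the macros at the end of rf_dagfuncs.h above: RF_CREATE_PARAM3 and the RF_EXTRACT_* macros are the two halves of the params[3] encoding consumed by the disk-node functions in rf_dagfuncs.c. A minimal round-trip sketch (the numeric values are made up for illustration, and the priority is assumed to fit in the low four bits as the macro requires):

	/* Pack priority, lock flag, unlock flag and reconstruction-unit number
	 * into a single 64-bit node parameter, then unpack it the way
	 * rf_DiskReadFuncForThreads/rf_DiskWriteFuncForThreads do. */
	RF_uint64 p3 = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 1 /*lk*/, 0 /*unlk*/, 7 /*wru*/);

	unsigned priority = RF_EXTRACT_PRIORITY(p3);	/* low 4 bits -> the priority */
	unsigned lock     = RF_EXTRACT_LOCK_FLAG(p3);	/* bit 4      -> 1            */
	unsigned unlock   = RF_EXTRACT_UNLOCK_FLAG(p3);	/* bit 5      -> 0            */
	unsigned which_ru = RF_EXTRACT_RU(p3);		/* bits 8..31 -> 7            */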
+/* $OpenBSD: rf_dagutils.c,v 1.1 1999/01/11 14:29:11 niklas Exp $ */ +/* $NetBSD: rf_dagutils.c,v 1.1 1998/11/13 04:20:28 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Authors: Mark Holland, William V. Courtright II, Jim Zelenka + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/****************************************************************************** + * + * rf_dagutils.c -- utility routines for manipulating dags + * + *****************************************************************************/ + +/* + * : + * Log: rf_dagutils.c,v + * Revision 1.55 1996/08/22 14:39:47 jimz + * reduce v/k fraction (better load balancing) + * + * Revision 1.54 1996/08/21 04:14:12 jimz + * minor workload shift tweaking + * + * Revision 1.53 1996/08/20 23:41:16 jimz + * fix up workload shift computation + * + * Revision 1.52 1996/08/20 22:34:16 jimz + * first cut at fixing workload shift + * needs work + * + * Revision 1.51 1996/08/20 16:51:16 jimz + * comment more verbosely compute_workload_shift() + * + * Revision 1.50 1996/08/11 00:40:50 jimz + * fix up broken comment + * + * Revision 1.49 1996/07/27 23:36:08 jimz + * Solaris port of simulator + * + * Revision 1.48 1996/07/27 18:40:01 jimz + * cleanup sweep + * + * Revision 1.47 1996/07/22 19:52:16 jimz + * switched node params to RF_DagParam_t, a union of + * a 64-bit int and a void *, for better portability + * attempted hpux port, but failed partway through for + * lack of a single C compiler capable of compiling all + * source files + * + * Revision 1.46 1996/07/18 22:57:14 jimz + * port simulator to AIX + * + * Revision 1.45 1996/06/17 03:24:59 jimz + * include shutdown.h for define of now-macroized ShutdownCreate + * + * Revision 1.44 1996/06/10 12:50:57 jimz + * Add counters to freelists to track number of allocations, frees, + * grows, max size, etc. Adjust a couple sets of PRIME params based + * on the results. + * + * Revision 1.43 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). + * + * Revision 1.42 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.41 1996/06/06 17:28:58 jimz + * make PrintNodeInfoString aware of new mirroring funcs + * + * Revision 1.40 1996/06/05 18:06:02 jimz + * Major code cleanup. The Great Renaming is now done. + * Better modularity. Better typing. Fixed a bunch of + * synchronization bugs. 
Made a lot of global stuff + * per-desc or per-array. Removed dead code. + * + * Revision 1.39 1996/06/03 23:28:26 jimz + * more bugfixes + * check in tree to sync for IPDS runs with current bugfixes + * there still may be a problem with threads in the script test + * getting I/Os stuck- not trivially reproducible (runs ~50 times + * in a row without getting stuck) + * + * Revision 1.38 1996/06/02 17:31:48 jimz + * Moved a lot of global stuff into array structure, where it belongs. + * Fixed up paritylogging, pss modules in this manner. Some general + * code cleanup. Removed lots of dead code, some dead files. + * + * Revision 1.37 1996/05/31 22:26:54 jimz + * fix a lot of mapping problems, memory allocation problems + * found some weird lock issues, fixed 'em + * more code cleanup + * + * Revision 1.36 1996/05/30 23:22:16 jimz + * bugfixes of serialization, timing problems + * more cleanup + * + * Revision 1.35 1996/05/30 11:29:41 jimz + * Numerous bug fixes. Stripe lock release code disagreed with the taking code + * about when stripes should be locked (I made it consistent: no parity, no lock) + * There was a lot of extra serialization of I/Os which I've removed- a lot of + * it was to calculate values for the cache code, which is no longer with us. + * More types, function, macro cleanup. Added code to properly quiesce the array + * on shutdown. Made a lot of stuff array-specific which was (bogusly) general + * before. Fixed memory allocation, freeing bugs. + * + * Revision 1.34 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.33 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.32 1996/05/24 04:28:55 jimz + * release cleanup ckpt + * + * Revision 1.31 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.30 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.29 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.28 1996/05/16 23:05:52 jimz + * changed InitNode() to use dag_ptrs field of node when appropriate + * (see rf_dag.h or comments within InitNode() for details) + * + * Revision 1.27 1996/05/16 15:37:19 jimz + * convert to RF_FREELIST stuff for dag headers + * + * Revision 1.26 1996/05/08 21:01:24 jimz + * fixed up enum type names that were conflicting with other + * enums and function names (ie, "panic") + * future naming trends will be towards RF_ and rf_ for + * everything raidframe-related + * + * Revision 1.25 1996/05/03 19:56:15 wvcii + * added misc routines from old dag creation files + * + * Revision 1.24 1995/12/12 18:10:06 jimz + * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT + * fix 80-column brain damage in comments + * + * Revision 1.23 1995/12/01 15:59:50 root + * added copyright info + * + * Revision 1.22 1995/11/17 15:14:12 wvcii + * PrintDAG now processes DiskReadMirrorFunc nodes + * + * Revision 1.21 1995/11/07 16:22:38 wvcii + * InitNode and InitNodeFromBuf now initialize commit fields + * beefed up ValidateDag + * prettied up PrintDAGList + * + */ + +#include "rf_archs.h" +#include "rf_types.h" +#include "rf_threadstuff.h" +#include "rf_raid.h" +#include "rf_dag.h" +#include "rf_dagutils.h" +#include "rf_dagfuncs.h" 
+#include "rf_general.h" +#include "rf_freelist.h" +#include "rf_map.h" +#include "rf_shutdown.h" +#include "rf_sys.h" + +#define SNUM_DIFF(_a_,_b_) (((_a_)>(_b_))?((_a_)-(_b_)):((_b_)-(_a_))) + +RF_RedFuncs_t rf_xorFuncs = { + rf_RegularXorFunc, "Reg Xr", + rf_SimpleXorFunc, "Simple Xr"}; + +RF_RedFuncs_t rf_xorRecoveryFuncs = { + rf_RecoveryXorFunc, "Recovery Xr", + rf_RecoveryXorFunc, "Recovery Xr"}; + +static void rf_RecurPrintDAG(RF_DagNode_t *, int, int); +static void rf_PrintDAG(RF_DagHeader_t *); +static int rf_ValidateBranch(RF_DagNode_t *, int *, int *, + RF_DagNode_t **, int ); +static void rf_ValidateBranchVisitedBits(RF_DagNode_t *, int, int); +static void rf_ValidateVisitedBits(RF_DagHeader_t *); + +/****************************************************************************** + * + * InitNode - initialize a dag node + * + * the size of the propList array is always the same as that of the + * successors array. + * + *****************************************************************************/ +void rf_InitNode( + RF_DagNode_t *node, + RF_NodeStatus_t initstatus, + int commit, + int (*doFunc)(RF_DagNode_t *node), + int (*undoFunc)(RF_DagNode_t *node), + int (*wakeFunc)(RF_DagNode_t *node,int status), + int nSucc, + int nAnte, + int nParam, + int nResult, + RF_DagHeader_t *hdr, + char *name, + RF_AllocListElem_t *alist) +{ + void **ptrs; + int nptrs; + + if (nAnte > RF_MAX_ANTECEDENTS) + RF_PANIC(); + node->status = initstatus; + node->commitNode = commit; + node->doFunc = doFunc; + node->undoFunc = undoFunc; + node->wakeFunc = wakeFunc; + node->numParams = nParam; + node->numResults = nResult; + node->numAntecedents = nAnte; + node->numAntDone = 0; + node->next = NULL; + node->numSuccedents = nSucc; + node->name = name; + node->dagHdr = hdr; + node->visited = 0; + + /* allocate all the pointers with one call to malloc */ + nptrs = nSucc+nAnte+nResult+nSucc; + + if (nptrs <= RF_DAG_PTRCACHESIZE) { + /* + * The dag_ptrs field of the node is basically some scribble + * space to be used here. We could get rid of it, and always + * allocate the range of pointers, but that's expensive. So, + * we pick a "common case" size for the pointer cache. Hopefully, + * we'll find that: + * (1) Generally, nptrs doesn't exceed RF_DAG_PTRCACHESIZE by + * only a little bit (least efficient case) + * (2) Generally, ntprs isn't a lot less than RF_DAG_PTRCACHESIZE + * (wasted memory) + */ + ptrs = (void **)node->dag_ptrs; + } + else { + RF_CallocAndAdd(ptrs, nptrs, sizeof(void *), (void **), alist); + } + node->succedents = (nSucc) ? (RF_DagNode_t **) ptrs : NULL; + node->antecedents = (nAnte) ? (RF_DagNode_t **) (ptrs+nSucc) : NULL; + node->results = (nResult) ? (void **) (ptrs+nSucc+nAnte) : NULL; + node->propList = (nSucc) ? 
(RF_PropHeader_t **) (ptrs+nSucc+nAnte+nResult) : NULL; + + if (nParam) { + if (nParam <= RF_DAG_PARAMCACHESIZE) { + node->params = (RF_DagParam_t *)node->dag_params; + } + else { + RF_CallocAndAdd(node->params, nParam, sizeof(RF_DagParam_t), (RF_DagParam_t *), alist); + } + } + else { + node->params = NULL; + } +} + + + +/****************************************************************************** + * + * allocation and deallocation routines + * + *****************************************************************************/ + +void rf_FreeDAG(dag_h) + RF_DagHeader_t *dag_h; +{ + RF_AccessStripeMapHeader_t *asmap, *t_asmap; + RF_DagHeader_t *nextDag; + int i; + + while (dag_h) { + nextDag = dag_h->next; + for (i=0; dag_h->memChunk[i] && i < RF_MAXCHUNKS; i++) { + /* release mem chunks */ + rf_ReleaseMemChunk(dag_h->memChunk[i]); + dag_h->memChunk[i] = NULL; + } + + RF_ASSERT(i == dag_h->chunkIndex); + if (dag_h->xtraChunkCnt > 0) { + /* free xtraMemChunks */ + for (i=0; dag_h->xtraMemChunk[i] && i < dag_h->xtraChunkIndex; i++) { + rf_ReleaseMemChunk(dag_h->xtraMemChunk[i]); + dag_h->xtraMemChunk[i] = NULL; + } + RF_ASSERT(i == dag_h->xtraChunkIndex); + /* free ptrs to xtraMemChunks */ + RF_Free(dag_h->xtraMemChunk, dag_h->xtraChunkCnt * sizeof(RF_ChunkDesc_t *)); + } + rf_FreeAllocList(dag_h->allocList); + for (asmap = dag_h->asmList; asmap;) { + t_asmap = asmap; + asmap = asmap->next; + rf_FreeAccessStripeMap(t_asmap); + } + rf_FreeDAGHeader(dag_h); + dag_h = nextDag; + } +} + +RF_PropHeader_t *rf_MakePropListEntry( + RF_DagHeader_t *dag_h, + int resultNum, + int paramNum, + RF_PropHeader_t *next, + RF_AllocListElem_t *allocList) +{ + RF_PropHeader_t *p; + + RF_CallocAndAdd(p, 1, sizeof(RF_PropHeader_t), + (RF_PropHeader_t *), allocList); + p->resultNum = resultNum; + p->paramNum = paramNum; + p->next = next; + return(p); +} + +static RF_FreeList_t *rf_dagh_freelist; + +#define RF_MAX_FREE_DAGH 128 +#define RF_DAGH_INC 16 +#define RF_DAGH_INITIAL 32 + +static void rf_ShutdownDAGs(void *); +static void rf_ShutdownDAGs(ignored) + void *ignored; +{ + RF_FREELIST_DESTROY(rf_dagh_freelist,next,(RF_DagHeader_t *)); +} + +int rf_ConfigureDAGs(listp) + RF_ShutdownList_t **listp; +{ + int rc; + + RF_FREELIST_CREATE(rf_dagh_freelist, RF_MAX_FREE_DAGH, + RF_DAGH_INC, sizeof(RF_DagHeader_t)); + if (rf_dagh_freelist == NULL) + return(ENOMEM); + rc = rf_ShutdownCreate(listp, rf_ShutdownDAGs, NULL); + if (rc) { + RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n", + __FILE__, __LINE__, rc); + rf_ShutdownDAGs(NULL); + return(rc); + } + RF_FREELIST_PRIME(rf_dagh_freelist, RF_DAGH_INITIAL,next, + (RF_DagHeader_t *)); + return(0); +} + +RF_DagHeader_t *rf_AllocDAGHeader() +{ + RF_DagHeader_t *dh; + + RF_FREELIST_GET(rf_dagh_freelist,dh,next,(RF_DagHeader_t *)); + if (dh) { + bzero((char *)dh, sizeof(RF_DagHeader_t)); + } + return(dh); +} + +void rf_FreeDAGHeader(RF_DagHeader_t *dh) +{ + RF_FREELIST_FREE(rf_dagh_freelist,dh,next); +} + +/* allocates a buffer big enough to hold the data described by pda */ +void *rf_AllocBuffer( + RF_Raid_t *raidPtr, + RF_DagHeader_t *dag_h, + RF_PhysDiskAddr_t *pda, + RF_AllocListElem_t *allocList) +{ + char *p; + + RF_MallocAndAdd(p, pda->numSector << raidPtr->logBytesPerSector, + (char *), allocList); + return((void *)p); +} + +/****************************************************************************** + * + * debug routines + * + *****************************************************************************/ + +char 
*rf_NodeStatusString(RF_DagNode_t *node) +{ + switch (node->status) { + case rf_wait: return("wait"); + case rf_fired: return("fired"); + case rf_good: return("good"); + case rf_bad: return("bad"); + default: return("?"); + } +} + +void rf_PrintNodeInfoString(RF_DagNode_t *node) +{ + RF_PhysDiskAddr_t *pda; + int (*df)(RF_DagNode_t *) = node->doFunc; + int i, lk, unlk; + void *bufPtr; + + if ((df==rf_DiskReadFunc) || (df==rf_DiskWriteFunc) + || (df==rf_DiskReadMirrorIdleFunc) + || (df == rf_DiskReadMirrorPartitionFunc)) + { + pda = (RF_PhysDiskAddr_t *)node->params[0].p; + bufPtr = (void *)node->params[1].p; + lk = RF_EXTRACT_LOCK_FLAG(node->params[3].v); + unlk = RF_EXTRACT_UNLOCK_FLAG(node->params[3].v); + RF_ASSERT( !(lk && unlk) ); + printf("r %d c %d offs %ld nsect %d buf 0x%lx %s\n", pda->row, pda->col, + (long)pda->startSector, (int) pda->numSector, (long)bufPtr, + (lk) ? "LOCK" : ((unlk) ? "UNLK" : " ")); + return; + } + + if (df == rf_DiskUnlockFunc) { + pda = (RF_PhysDiskAddr_t *)node->params[0].p; + lk = RF_EXTRACT_LOCK_FLAG(node->params[3].v); + unlk = RF_EXTRACT_UNLOCK_FLAG(node->params[3].v); + RF_ASSERT( !(lk && unlk) ); + printf("r %d c %d %s\n", pda->row, pda->col, + (lk) ? "LOCK" : ((unlk) ? "UNLK" : "nop")); + return; + } + + if ((df==rf_SimpleXorFunc) || (df==rf_RegularXorFunc) + || (df==rf_RecoveryXorFunc)) + { + printf("result buf 0x%lx\n",(long) node->results[0]); + for (i=0; i<node->numParams-1; i+=2) { + pda = (RF_PhysDiskAddr_t *)node->params[i].p; + bufPtr = (RF_PhysDiskAddr_t *)node->params[i+1].p; + printf(" buf 0x%lx r%d c%d offs %ld nsect %d\n", + (long)bufPtr, pda->row, pda->col, + (long)pda->startSector, (int)pda->numSector); + } + return; + } + +#if RF_INCLUDE_PARITYLOGGING > 0 + if (df==rf_ParityLogOverwriteFunc || df==rf_ParityLogUpdateFunc) { + for (i=0; i<node->numParams-1; i+=2) { + pda = (RF_PhysDiskAddr_t *)node->params[i].p; + bufPtr = (RF_PhysDiskAddr_t *)node->params[i+1].p; + printf(" r%d c%d offs %ld nsect %d buf 0x%lx\n", + pda->row, pda->col, (long) pda->startSector, + (int) pda->numSector, (long) bufPtr); + } + return; + } +#endif /* RF_INCLUDE_PARITYLOGGING > 0 */ + + if ((df==rf_TerminateFunc) || (df==rf_NullNodeFunc)) { + printf("\n"); + return; + } + + printf("?\n"); +} + +static void rf_RecurPrintDAG(node, depth, unvisited) + RF_DagNode_t *node; + int depth; + int unvisited; +{ + char *anttype; + int i; + + node->visited = (unvisited) ? 0 : 1; + printf("(%d) %d C%d %s: %s,s%d %d/%d,a%d/%d,p%d,r%d S{", depth, + node->nodeNum, node->commitNode, node->name, rf_NodeStatusString(node), + node->numSuccedents, node->numSuccFired, node->numSuccDone, + node->numAntecedents, node->numAntDone, node->numParams,node->numResults); + for (i=0; i<node->numSuccedents; i++) { + printf("%d%s", node->succedents[i]->nodeNum, + ((i==node->numSuccedents-1) ? "\0" : " ")); + } + printf("} A{"); + for (i=0; i<node->numAntecedents; i++) { + switch (node->antType[i]) { + case rf_trueData : + anttype = "T"; + break; + case rf_antiData : + anttype = "A"; + break; + case rf_outputData : + anttype = "O"; + break; + case rf_control : + anttype = "C"; + break; + default : + anttype = "?"; + break; + } + printf("%d(%s)%s", node->antecedents[i]->nodeNum, anttype, (i==node->numAntecedents-1) ? 
"\0" : " "); + } + printf("}; "); + rf_PrintNodeInfoString(node); + for (i=0; i<node->numSuccedents; i++) { + if (node->succedents[i]->visited == unvisited) + rf_RecurPrintDAG(node->succedents[i], depth+1, unvisited); + } +} + +static void rf_PrintDAG(dag_h) + RF_DagHeader_t *dag_h; +{ + int unvisited, i; + char *status; + + /* set dag status */ + switch (dag_h->status) { + case rf_enable : + status = "enable"; + break; + case rf_rollForward : + status = "rollForward"; + break; + case rf_rollBackward : + status = "rollBackward"; + break; + default : + status = "illegal!"; + break; + } + /* find out if visited bits are currently set or clear */ + unvisited = dag_h->succedents[0]->visited; + + printf("DAG type: %s\n", dag_h->creator); + printf("format is (depth) num commit type: status,nSucc nSuccFired/nSuccDone,nAnte/nAnteDone,nParam,nResult S{x} A{x(type)}; info\n"); + printf("(0) %d Hdr: %s, s%d, (commit %d/%d) S{", dag_h->nodeNum, + status, dag_h->numSuccedents, dag_h->numCommitNodes, dag_h->numCommits); + for (i=0; i<dag_h->numSuccedents; i++) { + printf("%d%s", dag_h->succedents[i]->nodeNum, + ((i==dag_h->numSuccedents-1) ? "\0" : " ")); + } + printf("};\n"); + for (i=0; i<dag_h->numSuccedents; i++) { + if (dag_h->succedents[i]->visited == unvisited) + rf_RecurPrintDAG(dag_h->succedents[i], 1, unvisited); + } +} + +/* assigns node numbers */ +int rf_AssignNodeNums(RF_DagHeader_t *dag_h) +{ + int unvisited, i, nnum; + RF_DagNode_t *node; + + nnum = 0; + unvisited = dag_h->succedents[0]->visited; + + dag_h->nodeNum = nnum++; + for (i=0; i<dag_h->numSuccedents; i++) { + node = dag_h->succedents[i]; + if (node->visited == unvisited) { + nnum = rf_RecurAssignNodeNums(dag_h->succedents[i], nnum, unvisited); + } + } + return(nnum); +} + +int rf_RecurAssignNodeNums(node, num, unvisited) + RF_DagNode_t *node; + int num; + int unvisited; +{ + int i; + + node->visited = (unvisited) ? 0 : 1; + + node->nodeNum = num++; + for (i=0; i<node->numSuccedents; i++) { + if (node->succedents[i]->visited == unvisited) { + num = rf_RecurAssignNodeNums(node->succedents[i], num, unvisited); + } + } + return(num); +} + +/* set the header pointers in each node to "newptr" */ +void rf_ResetDAGHeaderPointers(dag_h, newptr) + RF_DagHeader_t *dag_h; + RF_DagHeader_t *newptr; +{ + int i; + for (i=0; i<dag_h->numSuccedents; i++) + if (dag_h->succedents[i]->dagHdr != newptr) + rf_RecurResetDAGHeaderPointers(dag_h->succedents[i], newptr); +} + +void rf_RecurResetDAGHeaderPointers(node, newptr) + RF_DagNode_t *node; + RF_DagHeader_t *newptr; +{ + int i; + node->dagHdr = newptr; + for (i=0; i<node->numSuccedents; i++) + if (node->succedents[i]->dagHdr != newptr) + rf_RecurResetDAGHeaderPointers(node->succedents[i], newptr); +} + + +void rf_PrintDAGList(RF_DagHeader_t *dag_h) +{ + int i=0; + + for (; dag_h; dag_h=dag_h->next) { + rf_AssignNodeNums(dag_h); + printf("\n\nDAG %d IN LIST:\n",i++); + rf_PrintDAG(dag_h); + } +} + +static int rf_ValidateBranch(node, scount, acount, nodes, unvisited) + RF_DagNode_t *node; + int *scount; + int *acount; + RF_DagNode_t **nodes; + int unvisited; +{ + int i, retcode = 0; + + /* construct an array of node pointers indexed by node num */ + node->visited = (unvisited) ? 
0 : 1; + nodes[ node->nodeNum ] = node; + + if (node->next != NULL) { + printf("INVALID DAG: next pointer in node is not NULL\n"); + retcode = 1; + } + if (node->status != rf_wait) { + printf("INVALID DAG: Node status is not wait\n"); + retcode = 1; + } + if (node->numAntDone != 0) { + printf("INVALID DAG: numAntDone is not zero\n"); + retcode = 1; + } + if (node->doFunc == rf_TerminateFunc) { + if (node->numSuccedents != 0) { + printf("INVALID DAG: Terminator node has succedents\n"); + retcode = 1; + } + } else { + if (node->numSuccedents == 0) { + printf("INVALID DAG: Non-terminator node has no succedents\n"); + retcode = 1; + } + } + for (i=0; i<node->numSuccedents; i++) { + if (!node->succedents[i]) { + printf("INVALID DAG: succedent %d of node %s is NULL\n",i,node->name); + retcode = 1; + } + scount[ node->succedents[i]->nodeNum ]++; + } + for (i=0; i<node->numAntecedents; i++) { + if (!node->antecedents[i]) { + printf("INVALID DAG: antecedent %d of node %s is NULL\n",i,node->name); + retcode = 1; + } + acount[ node->antecedents[i]->nodeNum ]++; + } + for (i=0; i<node->numSuccedents; i++) { + if (node->succedents[i]->visited == unvisited) { + if (rf_ValidateBranch(node->succedents[i], scount, + acount, nodes, unvisited)) + { + retcode = 1; + } + } + } + return(retcode); +} + +static void rf_ValidateBranchVisitedBits(node, unvisited, rl) + RF_DagNode_t *node; + int unvisited; + int rl; +{ + int i; + + RF_ASSERT(node->visited == unvisited); + for (i=0; i<node->numSuccedents; i++) { + if (node->succedents[i] == NULL) { + printf("node=%lx node->succedents[%d] is NULL\n", (long)node, i); + RF_ASSERT(0); + } + rf_ValidateBranchVisitedBits(node->succedents[i],unvisited, rl+1); + } +} + +/* NOTE: never call this on a big dag, because it is exponential + * in execution time + */ +static void rf_ValidateVisitedBits(dag) + RF_DagHeader_t *dag; +{ + int i, unvisited; + + unvisited = dag->succedents[0]->visited; + + for (i=0; i<dag->numSuccedents; i++) { + if (dag->succedents[i] == NULL) { + printf("dag=%lx dag->succedents[%d] is NULL\n", (long) dag, i); + RF_ASSERT(0); + } + rf_ValidateBranchVisitedBits(dag->succedents[i],unvisited,0); + } +} + +/* validate a DAG. _at entry_ verify that: + * -- numNodesCompleted is zero + * -- node queue is null + * -- dag status is rf_enable + * -- next pointer is null on every node + * -- all nodes have status wait + * -- numAntDone is zero in all nodes + * -- terminator node has zero successors + * -- no other node besides terminator has zero successors + * -- no successor or antecedent pointer in a node is NULL + * -- number of times that each node appears as a successor of another node + * is equal to the antecedent count on that node + * -- number of times that each node appears as an antecedent of another node + * is equal to the succedent count on that node + * -- what else? 
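The successor/antecedent bookkeeping in the list above is the heart of the check that follows. As a stand-alone illustration (an editorial sketch, not part of the imported RAIDframe sources; the three-node graph and counts are invented), the following program tallies how often each node is named as a successor, the same way rf_ValidateDAG() below fills its scount[] array, and verifies that the tally matches each node's antecedent count.

#include <stdio.h>

#define NNODES 3

/* toy DAG: 0 -> 1, 0 -> 2, 1 -> 2 */
static int nsucc[NNODES]   = { 2, 1, 0 };            /* successors per node  */
static int nante[NNODES]   = { 0, 1, 2 };            /* antecedents per node */
static int succ[NNODES][2] = { { 1, 2 }, { 2, 0 }, { 0, 0 } };

int
main(void)
{
	int scount[NNODES] = { 0 };   /* times each node appears as a successor */
	int i, j, bad = 0;

	for (i = 0; i < NNODES; i++)
		for (j = 0; j < nsucc[i]; j++)
			scount[succ[i][j]]++;

	/* the invariant: appearances-as-successor == antecedent count */
	for (i = 0; i < NNODES; i++)
		if (scount[i] != nante[i])
			bad = 1;

	printf("%s\n", bad ? "INVALID DAG" : "DAG ok");
	return (bad);
}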
+ */
+int rf_ValidateDAG(dag_h)
+    RF_DagHeader_t *dag_h;
+{
+    int i, nodecount;
+    int *scount, *acount;      /* per-node successor and antecedent counts */
+    RF_DagNode_t **nodes;      /* array of ptrs to nodes in dag */
+    int retcode = 0;
+    int unvisited;
+    int commitNodeCount = 0;
+
+    if (rf_validateVisitedDebug)
+        rf_ValidateVisitedBits(dag_h);
+
+    if (dag_h->numNodesCompleted != 0) {
+        printf("INVALID DAG: num nodes completed is %d, should be 0\n",dag_h->numNodesCompleted);
+        retcode = 1; goto validate_dag_bad;
+    }
+    if (dag_h->status != rf_enable) {
+        printf("INVALID DAG: not enabled\n");
+        retcode = 1; goto validate_dag_bad;
+    }
+    if (dag_h->numCommits != 0) {
+        printf("INVALID DAG: numCommits != 0 (%d)\n",dag_h->numCommits);
+        retcode = 1; goto validate_dag_bad;
+    }
+    if (dag_h->numSuccedents != 1) {
+        /* currently, all dags must have only one succedent */
+        printf("INVALID DAG: numSuccedents != 1 (%d)\n",dag_h->numSuccedents);
+        retcode = 1; goto validate_dag_bad;
+    }
+    nodecount = rf_AssignNodeNums(dag_h);
+
+    unvisited = dag_h->succedents[0]->visited;
+
+    RF_Calloc(scount, nodecount, sizeof(int), (int *));
+    RF_Calloc(acount, nodecount, sizeof(int), (int *));
+    RF_Calloc(nodes, nodecount, sizeof(RF_DagNode_t *), (RF_DagNode_t **));
+    for (i=0; i<dag_h->numSuccedents; i++) {
+        if ((dag_h->succedents[i]->visited == unvisited)
+            && rf_ValidateBranch(dag_h->succedents[i], scount,
+                acount, nodes, unvisited))
+        {
+            retcode = 1;
+        }
+    }
+    /* start at 1 to skip the header node */
+    for (i=1; i<nodecount; i++) {
+        if ( nodes[i]->commitNode )
+            commitNodeCount++;
+        if ( nodes[i]->doFunc == NULL ) {
+            printf("INVALID DAG: node %s has an undefined doFunc\n", nodes[i]->name);
+            retcode = 1;
+            goto validate_dag_out;
+        }
+        if ( nodes[i]->undoFunc == NULL ) {
+            printf("INVALID DAG: node %s has an undefined undoFunc\n", nodes[i]->name);
+            retcode = 1;
+            goto validate_dag_out;
+        }
+        if ( nodes[i]->numAntecedents != scount[ nodes[i]->nodeNum ] ) {
+            printf("INVALID DAG: node %s has %d antecedents but appears as a succedent %d times\n",
+                nodes[i]->name, nodes[i]->numAntecedents, scount[nodes[i]->nodeNum]);
+            retcode = 1;
+            goto validate_dag_out;
+        }
+        if ( nodes[i]->numSuccedents != acount[ nodes[i]->nodeNum ] ) {
+            printf("INVALID DAG: node %s has %d succedents but appears as an antecedent %d times\n",
+                nodes[i]->name, nodes[i]->numSuccedents, acount[nodes[i]->nodeNum]);
+            retcode = 1;
+            goto validate_dag_out;
+        }
+    }
+
+    if ( dag_h->numCommitNodes != commitNodeCount ) {
+        printf("INVALID DAG: incorrect commit node count. hdr->numCommitNodes (%d) found (%d) commit nodes in graph\n",
+            dag_h->numCommitNodes, commitNodeCount);
+        retcode = 1;
+        goto validate_dag_out;
+    }
+
+validate_dag_out:
+    RF_Free(scount, nodecount*sizeof(int));
+    RF_Free(acount, nodecount*sizeof(int));
+    RF_Free(nodes, nodecount*sizeof(RF_DagNode_t *));
+    if (retcode)
+        rf_PrintDAGList(dag_h);
+
+    if (rf_validateVisitedDebug)
+        rf_ValidateVisitedBits(dag_h);
+
+    return(retcode);
+
+validate_dag_bad:
+    rf_PrintDAGList(dag_h);
+    return(retcode);
+}
+
+
+/******************************************************************************
+ *
+ * misc construction routines
+ *
+ *****************************************************************************/
+
+void rf_redirect_asm(
+    RF_Raid_t *raidPtr,
+    RF_AccessStripeMap_t *asmap)
+{
+    int ds = (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) ?
1 : 0; + int row = asmap->physInfo->row; + int fcol = raidPtr->reconControl[row]->fcol; + int srow = raidPtr->reconControl[row]->spareRow; + int scol = raidPtr->reconControl[row]->spareCol; + RF_PhysDiskAddr_t *pda; + + RF_ASSERT( raidPtr->status[row] == rf_rs_reconstructing ); + for (pda = asmap->physInfo; pda; pda=pda->next) { + if (pda->col == fcol) { + if (rf_dagDebug) { + if (!rf_CheckRUReconstructed(raidPtr->reconControl[row]->reconMap, + pda->startSector)) + { + RF_PANIC(); + } + } + /*printf("Remapped data for large write\n");*/ + if (ds) { + raidPtr->Layout.map->MapSector(raidPtr, pda->raidAddress, + &pda->row, &pda->col, &pda->startSector, RF_REMAP); + } + else { + pda->row = srow; pda->col = scol; + } + } + } + for (pda = asmap->parityInfo; pda; pda=pda->next) { + if (pda->col == fcol) { + if (rf_dagDebug) { + if (!rf_CheckRUReconstructed(raidPtr->reconControl[row]->reconMap, pda->startSector)) { + RF_PANIC(); + } + } + } + if (ds) { + (raidPtr->Layout.map->MapParity)(raidPtr, pda->raidAddress, &pda->row, &pda->col, &pda->startSector, RF_REMAP); + } + else { + pda->row = srow; pda->col = scol; + } + } +} + + +/* this routine allocates read buffers and generates stripe maps for the + * regions of the array from the start of the stripe to the start of the + * access, and from the end of the access to the end of the stripe. It also + * computes and returns the number of DAG nodes needed to read all this data. + * Note that this routine does the wrong thing if the access is fully + * contained within one stripe unit, so we RF_ASSERT against this case at the + * start. + */ +void rf_MapUnaccessedPortionOfStripe( + RF_Raid_t *raidPtr, + RF_RaidLayout_t *layoutPtr, /* in: layout information */ + RF_AccessStripeMap_t *asmap, /* in: access stripe map */ + RF_DagHeader_t *dag_h, /* in: header of the dag to create */ + RF_AccessStripeMapHeader_t **new_asm_h, /* in: ptr to array of 2 headers, to be filled in */ + int *nRodNodes, /* out: num nodes to be generated to read unaccessed data */ + char **sosBuffer, /* out: pointers to newly allocated buffer */ + char **eosBuffer, + RF_AllocListElem_t *allocList) +{ + RF_RaidAddr_t sosRaidAddress, eosRaidAddress; + RF_SectorNum_t sosNumSector, eosNumSector; + + RF_ASSERT( asmap->numStripeUnitsAccessed > (layoutPtr->numDataCol/2) ); + /* generate an access map for the region of the array from start of stripe + * to start of access */ + new_asm_h[0] = new_asm_h[1] = NULL; *nRodNodes = 0; + if (!rf_RaidAddressStripeAligned(layoutPtr, asmap->raidAddress)) { + sosRaidAddress = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress); + sosNumSector = asmap->raidAddress - sosRaidAddress; + RF_MallocAndAdd(*sosBuffer, rf_RaidAddressToByte(raidPtr, sosNumSector), (char *), allocList); + new_asm_h[0] = rf_MapAccess(raidPtr, sosRaidAddress, sosNumSector, *sosBuffer, RF_DONT_REMAP); + new_asm_h[0]->next = dag_h->asmList; + dag_h->asmList = new_asm_h[0]; + *nRodNodes += new_asm_h[0]->stripeMap->numStripeUnitsAccessed; + + RF_ASSERT(new_asm_h[0]->stripeMap->next == NULL); + /* we're totally within one stripe here */ + if (asmap->flags & RF_ASM_REDIR_LARGE_WRITE) + rf_redirect_asm(raidPtr, new_asm_h[0]->stripeMap); + } + /* generate an access map for the region of the array from end of access + * to end of stripe */ + if (!rf_RaidAddressStripeAligned(layoutPtr, asmap->endRaidAddress)) { + eosRaidAddress = asmap->endRaidAddress; + eosNumSector = rf_RaidAddressOfNextStripeBoundary(layoutPtr, eosRaidAddress) - eosRaidAddress; + RF_MallocAndAdd(*eosBuffer, 
rf_RaidAddressToByte(raidPtr, eosNumSector), (char *), allocList); + new_asm_h[1] = rf_MapAccess(raidPtr, eosRaidAddress, eosNumSector, *eosBuffer, RF_DONT_REMAP); + new_asm_h[1]->next = dag_h->asmList; + dag_h->asmList = new_asm_h[1]; + *nRodNodes += new_asm_h[1]->stripeMap->numStripeUnitsAccessed; + + RF_ASSERT(new_asm_h[1]->stripeMap->next == NULL); + /* we're totally within one stripe here */ + if (asmap->flags & RF_ASM_REDIR_LARGE_WRITE) + rf_redirect_asm(raidPtr, new_asm_h[1]->stripeMap); + } +} + + + +/* returns non-zero if the indicated ranges of stripe unit offsets overlap */ +int rf_PDAOverlap( + RF_RaidLayout_t *layoutPtr, + RF_PhysDiskAddr_t *src, + RF_PhysDiskAddr_t *dest) +{ + RF_SectorNum_t soffs = rf_StripeUnitOffset(layoutPtr, src->startSector); + RF_SectorNum_t doffs = rf_StripeUnitOffset(layoutPtr, dest->startSector); + /* use -1 to be sure we stay within SU */ + RF_SectorNum_t send = rf_StripeUnitOffset(layoutPtr, src->startSector + src->numSector-1); + RF_SectorNum_t dend = rf_StripeUnitOffset(layoutPtr, dest->startSector + dest->numSector-1); + return( (RF_MAX(soffs,doffs) <= RF_MIN(send,dend)) ? 1 : 0 ); +} + + +/* GenerateFailedAccessASMs + * + * this routine figures out what portion of the stripe needs to be read + * to effect the degraded read or write operation. It's primary function + * is to identify everything required to recover the data, and then + * eliminate anything that is already being accessed by the user. + * + * The main result is two new ASMs, one for the region from the start of the + * stripe to the start of the access, and one for the region from the end of + * the access to the end of the stripe. These ASMs describe everything that + * needs to be read to effect the degraded access. Other results are: + * nXorBufs -- the total number of buffers that need to be XORed together to + * recover the lost data, + * rpBufPtr -- ptr to a newly-allocated buffer to hold the parity. If NULL + * at entry, not allocated. + * overlappingPDAs -- + * describes which of the non-failed PDAs in the user access + * overlap data that needs to be read to effect recovery. + * overlappingPDAs[i]==1 if and only if, neglecting the failed + * PDA, the ith pda in the input asm overlaps data that needs + * to be read for recovery. + */ + /* in: asm - ASM for the actual access, one stripe only */ + /* in: faildPDA - which component of the access has failed */ + /* in: dag_h - header of the DAG we're going to create */ + /* out: new_asm_h - the two new ASMs */ + /* out: nXorBufs - the total number of xor bufs required */ + /* out: rpBufPtr - a buffer for the parity read */ +void rf_GenerateFailedAccessASMs( + RF_Raid_t *raidPtr, + RF_AccessStripeMap_t *asmap, + RF_PhysDiskAddr_t *failedPDA, + RF_DagHeader_t *dag_h, + RF_AccessStripeMapHeader_t **new_asm_h, + int *nXorBufs, + char **rpBufPtr, + char *overlappingPDAs, + RF_AllocListElem_t *allocList) +{ + RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); + + /* s=start, e=end, s=stripe, a=access, f=failed, su=stripe unit */ + RF_RaidAddr_t sosAddr, sosEndAddr, eosStartAddr, eosAddr; + + RF_SectorCount_t numSect[2], numParitySect; + RF_PhysDiskAddr_t *pda; + char *rdBuf, *bufP; + int foundit, i; + + bufP = NULL; + foundit = 0; + /* first compute the following raid addresses: + start of stripe, (sosAddr) + MIN(start of access, start of failed SU), (sosEndAddr) + MAX(end of access, end of failed SU), (eosStartAddr) + end of stripe (i.e. 
start of next stripe) (eosAddr) + */ + sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress); + sosEndAddr = RF_MIN(asmap->raidAddress, rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr,failedPDA->raidAddress)); + eosStartAddr = RF_MAX(asmap->endRaidAddress, rf_RaidAddressOfNextStripeUnitBoundary(layoutPtr, failedPDA->raidAddress)); + eosAddr = rf_RaidAddressOfNextStripeBoundary(layoutPtr, asmap->raidAddress); + + /* now generate access stripe maps for each of the above regions of the + * stripe. Use a dummy (NULL) buf ptr for now */ + + new_asm_h[0] = (sosAddr != sosEndAddr) ? rf_MapAccess(raidPtr, sosAddr, sosEndAddr-sosAddr, NULL, RF_DONT_REMAP) : NULL; + new_asm_h[1] = (eosStartAddr != eosAddr) ? rf_MapAccess(raidPtr, eosStartAddr, eosAddr-eosStartAddr, NULL, RF_DONT_REMAP) : NULL; + + /* walk through the PDAs and range-restrict each SU to the region of the + * SU touched on the failed PDA. also compute total data buffer space + * requirements in this step. Ignore the parity for now. */ + + numSect[0] = numSect[1] = 0; + if (new_asm_h[0]) { + new_asm_h[0]->next = dag_h->asmList; dag_h->asmList = new_asm_h[0]; + for (pda = new_asm_h[0]->stripeMap->physInfo; pda; pda = pda->next) { + rf_RangeRestrictPDA(raidPtr,failedPDA, pda, RF_RESTRICT_NOBUFFER, 0); numSect[0] += pda->numSector; + } + } + if (new_asm_h[1]) { + new_asm_h[1]->next = dag_h->asmList; dag_h->asmList = new_asm_h[1]; + for (pda = new_asm_h[1]->stripeMap->physInfo; pda; pda = pda->next) { + rf_RangeRestrictPDA(raidPtr,failedPDA, pda, RF_RESTRICT_NOBUFFER, 0); numSect[1] += pda->numSector; + } + } + numParitySect = failedPDA->numSector; + + /* allocate buffer space for the data & parity we have to read to recover + * from the failure */ + + if (numSect[0]+numSect[1]+ ((rpBufPtr) ? numParitySect : 0)) { /* don't allocate parity buf if not needed */ + RF_MallocAndAdd(rdBuf, rf_RaidAddressToByte(raidPtr,numSect[0]+numSect[1]+numParitySect), (char *), allocList); + bufP = rdBuf; + if (rf_degDagDebug) printf("Newly allocated buffer (%d bytes) is 0x%lx\n", + (int)rf_RaidAddressToByte(raidPtr,numSect[0]+numSect[1]+numParitySect), (unsigned long) bufP); + } + + /* now walk through the pdas one last time and assign buffer pointers + * (ugh!). Again, ignore the parity. also, count nodes to find out how + * many bufs need to be xored together */ + (*nXorBufs) = 1; /* in read case, 1 is for parity. In write case, 1 is for failed data */ + if (new_asm_h[0]) { + for (pda=new_asm_h[0]->stripeMap->physInfo; pda; pda=pda->next) {pda->bufPtr = bufP; bufP += rf_RaidAddressToByte(raidPtr,pda->numSector);} + *nXorBufs += new_asm_h[0]->stripeMap->numStripeUnitsAccessed; + } + if (new_asm_h[1]) { + for (pda=new_asm_h[1]->stripeMap->physInfo; pda; pda=pda->next) {pda->bufPtr = bufP; bufP += rf_RaidAddressToByte(raidPtr,pda->numSector);} + (*nXorBufs) += new_asm_h[1]->stripeMap->numStripeUnitsAccessed; + } + if (rpBufPtr) *rpBufPtr = bufP; /* the rest of the buffer is for parity */ + + /* the last step is to figure out how many more distinct buffers need to + * get xor'd to produce the missing unit. 
there's one for each user-data + * read node that overlaps the portion of the failed unit being accessed */ + + for (foundit=i=0,pda=asmap->physInfo; pda; i++,pda=pda->next) { + if (pda == failedPDA) {i--; foundit=1; continue;} + if (rf_PDAOverlap(layoutPtr, pda, failedPDA)) { + overlappingPDAs[i] = 1; + (*nXorBufs)++; + } + } + if (!foundit) {RF_ERRORMSG("GenerateFailedAccessASMs: did not find failedPDA in asm list\n"); RF_ASSERT(0);} + + if (rf_degDagDebug) { + if (new_asm_h[0]) { + printf("First asm:\n"); rf_PrintFullAccessStripeMap(new_asm_h[0], 1); + } + if (new_asm_h[1]) { + printf("Second asm:\n"); rf_PrintFullAccessStripeMap(new_asm_h[1], 1); + } + } +} + + +/* adjusts the offset and number of sectors in the destination pda so that + * it covers at most the region of the SU covered by the source PDA. This + * is exclusively a restriction: the number of sectors indicated by the + * target PDA can only shrink. + * + * For example: s = sectors within SU indicated by source PDA + * d = sectors within SU indicated by dest PDA + * r = results, stored in dest PDA + * + * |--------------- one stripe unit ---------------------| + * | sssssssssssssssssssssssssssssssss | + * | ddddddddddddddddddddddddddddddddddddddddddddd | + * | rrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrr | + * + * Another example: + * + * |--------------- one stripe unit ---------------------| + * | sssssssssssssssssssssssssssssssss | + * | ddddddddddddddddddddddd | + * | rrrrrrrrrrrrrrrr | + * + */ +void rf_RangeRestrictPDA( + RF_Raid_t *raidPtr, + RF_PhysDiskAddr_t *src, + RF_PhysDiskAddr_t *dest, + int dobuffer, + int doraidaddr) +{ + RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; + RF_SectorNum_t soffs = rf_StripeUnitOffset(layoutPtr, src->startSector); + RF_SectorNum_t doffs = rf_StripeUnitOffset(layoutPtr, dest->startSector); + RF_SectorNum_t send = rf_StripeUnitOffset(layoutPtr, src->startSector + src->numSector-1); /* use -1 to be sure we stay within SU */ + RF_SectorNum_t dend = rf_StripeUnitOffset(layoutPtr, dest->startSector + dest->numSector-1); + RF_SectorNum_t subAddr = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, dest->startSector); /* stripe unit boundary */ + + dest->startSector = subAddr + RF_MAX(soffs,doffs); + dest->numSector = subAddr + RF_MIN(send,dend) + 1 - dest->startSector; + + if (dobuffer) + dest->bufPtr += (soffs > doffs) ? rf_RaidAddressToByte(raidPtr,soffs-doffs) : 0; + if (doraidaddr) { + dest->raidAddress = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, dest->raidAddress) + + rf_StripeUnitOffset(layoutPtr, dest->startSector); + } +} + +/* + * Want the highest of these primes to be the largest one + * less than the max expected number of columns (won't hurt + * to be too small or too large, but won't be optimal, either) + * --jimz + */ +#define NLOWPRIMES 8 +static int lowprimes[NLOWPRIMES] = {2,3,5,7,11,13,17,19}; + +/***************************************************************************** + * compute the workload shift factor. 
(chained declustering) + * + * return nonzero if access should shift to secondary, otherwise, + * access is to primary + *****************************************************************************/ +int rf_compute_workload_shift( + RF_Raid_t *raidPtr, + RF_PhysDiskAddr_t *pda) +{ + /* + * variables: + * d = column of disk containing primary + * f = column of failed disk + * n = number of disks in array + * sd = "shift distance" (number of columns that d is to the right of f) + * row = row of array the access is in + * v = numerator of redirection ratio + * k = denominator of redirection ratio + */ + RF_RowCol_t d, f, sd, row, n; + int k, v, ret, i; + + row = pda->row; + n = raidPtr->numCol; + + /* assign column of primary copy to d */ + d = pda->col; + + /* assign column of dead disk to f */ + for(f=0;((!RF_DEAD_DISK(raidPtr->Disks[row][f].status))&&(f<n));f++); + + RF_ASSERT(f < n); + RF_ASSERT(f != d); + + sd = (f > d) ? (n + d - f) : (d - f); + RF_ASSERT(sd < n); + + /* + * v of every k accesses should be redirected + * + * v/k := (n-1-sd)/(n-1) + */ + v = (n-1-sd); + k = (n-1); + +#if 1 + /* + * XXX + * Is this worth it? + * + * Now reduce the fraction, by repeatedly factoring + * out primes (just like they teach in elementary school!) + */ + for(i=0;i<NLOWPRIMES;i++) { + if (lowprimes[i] > v) + break; + while (((v%lowprimes[i])==0) && ((k%lowprimes[i])==0)) { + v /= lowprimes[i]; + k /= lowprimes[i]; + } + } +#endif + + raidPtr->hist_diskreq[row][d]++; + if (raidPtr->hist_diskreq[row][d] > v) { + ret = 0; /* do not redirect */ + } + else { + ret = 1; /* redirect */ + } + +#if 0 + printf("d=%d f=%d sd=%d v=%d k=%d ret=%d h=%d\n", d, f, sd, v, k, ret, + raidPtr->hist_diskreq[row][d]); +#endif + + if (raidPtr->hist_diskreq[row][d] >= k) { + /* reset counter */ + raidPtr->hist_diskreq[row][d] = 0; + } + + return(ret); +} + +/* + * Disk selection routines + */ + +/* + * Selects the disk with the shortest queue from a mirror pair. 
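Before the mirror-selection routines below, a worked instance of the redirection ratio that rf_compute_workload_shift() above computes may help. This is an editorial sketch, not part of the imported sources; the array width and column numbers are invented, and the fraction reduction mirrors the lowprimes[] loop in the driver.

#include <stdio.h>

static const int lowprimes[] = { 2, 3, 5, 7, 11, 13, 17, 19 };

int
main(void)
{
	int n = 5;	/* columns in the (hypothetical) array */
	int f = 1;	/* column of the failed disk           */
	int d = 3;	/* column holding the primary copy     */
	int sd, v, k, i;

	sd = (f > d) ? (n + d - f) : (d - f);	/* shift distance: 2 */
	v = n - 1 - sd;				/* numerator:   2    */
	k = n - 1;				/* denominator: 4    */

	/* reduce v/k by small primes, as the driver does */
	for (i = 0; i < (int)(sizeof(lowprimes) / sizeof(lowprimes[0])); i++) {
		if (lowprimes[i] > v)
			break;
		while (v % lowprimes[i] == 0 && k % lowprimes[i] == 0) {
			v /= lowprimes[i];
			k /= lowprimes[i];
		}
	}

	/* prints "redirect 1 of every 2 accesses to the secondary copy" */
	printf("redirect %d of every %d accesses to the secondary copy\n", v, k);
	return (0);
}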
+ * Both the disk I/Os queued in RAIDframe as well as those at the physical + * disk are counted as members of the "queue" + */ +void rf_SelectMirrorDiskIdle(RF_DagNode_t *node) +{ + RF_Raid_t *raidPtr = (RF_Raid_t *) node->dagHdr->raidPtr; + RF_RowCol_t rowData, colData, rowMirror, colMirror; + int dataQueueLength, mirrorQueueLength, usemirror; + RF_PhysDiskAddr_t *data_pda = (RF_PhysDiskAddr_t *)node->params[0].p; + RF_PhysDiskAddr_t *mirror_pda = (RF_PhysDiskAddr_t *)node->params[4].p; + RF_PhysDiskAddr_t *tmp_pda; + RF_RaidDisk_t **disks = raidPtr->Disks; + RF_DiskQueue_t **dqs = raidPtr->Queues, *dataQueue, *mirrorQueue; + + /* return the [row col] of the disk with the shortest queue */ + rowData = data_pda->row; + colData = data_pda->col; + rowMirror = mirror_pda->row; + colMirror = mirror_pda->col; + dataQueue = &(dqs[rowData][colData]); + mirrorQueue = &(dqs[rowMirror][colMirror]); + +#ifdef RF_LOCK_QUEUES_TO_READ_LEN + RF_LOCK_QUEUE_MUTEX(dataQueue, "SelectMirrorDiskIdle"); +#endif /* RF_LOCK_QUEUES_TO_READ_LEN */ + dataQueueLength = dataQueue->queueLength + dataQueue->numOutstanding; +#ifdef RF_LOCK_QUEUES_TO_READ_LEN + RF_UNLOCK_QUEUE_MUTEX(dataQueue, "SelectMirrorDiskIdle"); + RF_LOCK_QUEUE_MUTEX(mirrorQueue, "SelectMirrorDiskIdle"); +#endif /* RF_LOCK_QUEUES_TO_READ_LEN */ + mirrorQueueLength = mirrorQueue->queueLength + mirrorQueue->numOutstanding; +#ifdef RF_LOCK_QUEUES_TO_READ_LEN + RF_UNLOCK_QUEUE_MUTEX(mirrorQueue, "SelectMirrorDiskIdle"); +#endif /* RF_LOCK_QUEUES_TO_READ_LEN */ + + usemirror = 0; + if (RF_DEAD_DISK(disks[rowMirror][colMirror].status)) { + usemirror = 0; + } + else if (RF_DEAD_DISK(disks[rowData][colData].status)) { + usemirror = 1; + } + else if (dataQueueLength < mirrorQueueLength) { + usemirror = 0; + } + else if (mirrorQueueLength < dataQueueLength) { + usemirror = 1; + } + else { + /* queues are equal length. attempt cleverness. */ + if (SNUM_DIFF(dataQueue->last_deq_sector,data_pda->startSector) + <= SNUM_DIFF(mirrorQueue->last_deq_sector,mirror_pda->startSector)) + { + usemirror = 0; + } + else { + usemirror = 1; + } + } + + if (usemirror) { + /* use mirror (parity) disk, swap params 0 & 4 */ + tmp_pda = data_pda; + node->params[0].p = mirror_pda; + node->params[4].p = tmp_pda; + } + else { + /* use data disk, leave param 0 unchanged */ + } + /* printf("dataQueueLength %d, mirrorQueueLength %d\n",dataQueueLength, mirrorQueueLength); */ +} + +/* + * Do simple partitioning. This assumes that + * the data and parity disks are laid out identically. 
+ */ +void rf_SelectMirrorDiskPartition(RF_DagNode_t *node) +{ + RF_Raid_t *raidPtr = (RF_Raid_t *) node->dagHdr->raidPtr; + RF_RowCol_t rowData, colData, rowMirror, colMirror; + RF_PhysDiskAddr_t *data_pda = (RF_PhysDiskAddr_t *)node->params[0].p; + RF_PhysDiskAddr_t *mirror_pda = (RF_PhysDiskAddr_t *)node->params[4].p; + RF_PhysDiskAddr_t *tmp_pda; + RF_RaidDisk_t **disks = raidPtr->Disks; + RF_DiskQueue_t **dqs = raidPtr->Queues, *dataQueue, *mirrorQueue; + int usemirror; + + /* return the [row col] of the disk with the shortest queue */ + rowData = data_pda->row; + colData = data_pda->col; + rowMirror = mirror_pda->row; + colMirror = mirror_pda->col; + dataQueue = &(dqs[rowData][colData]); + mirrorQueue = &(dqs[rowMirror][colMirror]); + + usemirror = 0; + if (RF_DEAD_DISK(disks[rowMirror][colMirror].status)) { + usemirror = 0; + } + else if (RF_DEAD_DISK(disks[rowData][colData].status)) { + usemirror = 1; + } + else if (data_pda->startSector < (disks[rowData][colData].numBlocks / 2)) { + usemirror = 0; + } + else { + usemirror = 1; + } + + if (usemirror) { + /* use mirror (parity) disk, swap params 0 & 4 */ + tmp_pda = data_pda; + node->params[0].p = mirror_pda; + node->params[4].p = tmp_pda; + } + else { + /* use data disk, leave param 0 unchanged */ + } +} diff --git a/sys/dev/raidframe/rf_dagutils.h b/sys/dev/raidframe/rf_dagutils.h new file mode 100644 index 00000000000..cb732879230 --- /dev/null +++ b/sys/dev/raidframe/rf_dagutils.h @@ -0,0 +1,192 @@ +/* $OpenBSD: rf_dagutils.h,v 1.1 1999/01/11 14:29:12 niklas Exp $ */ +/* $NetBSD: rf_dagutils.h,v 1.1 1998/11/13 04:20:28 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland, William V. Courtright II + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/************************************************************************* + * + * rf_dagutils.h -- header file for utility routines for manipulating DAGs + * + *************************************************************************/ + +/* + * : + * Log: rf_dagutils.h,v + * Revision 1.19 1996/07/22 19:52:16 jimz + * switched node params to RF_DagParam_t, a union of + * a 64-bit int and a void *, for better portability + * attempted hpux port, but failed partway through for + * lack of a single C compiler capable of compiling all + * source files + * + * Revision 1.18 1996/07/15 17:22:18 jimz + * nit-pick code cleanup + * resolve stdlib problems on DEC OSF + * + * Revision 1.17 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. 
Removed + * layout shutdown function (now subsumed by shutdown lists). + * + * Revision 1.16 1996/06/06 17:27:46 jimz + * added another select mirror func (partitioning), changed names so dag + * creation routines can use the appropriate one + * + * fixed old idle mirror func to pick closest arm if queue lengths are equal + * + * Revision 1.15 1996/06/03 23:28:26 jimz + * more bugfixes + * check in tree to sync for IPDS runs with current bugfixes + * there still may be a problem with threads in the script test + * getting I/Os stuck- not trivially reproducible (runs ~50 times + * in a row without getting stuck) + * + * Revision 1.14 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.13 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.12 1996/05/24 04:28:55 jimz + * release cleanup ckpt + * + * Revision 1.11 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.10 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.9 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.8 1996/05/08 21:01:24 jimz + * fixed up enum type names that were conflicting with other + * enums and function names (ie, "panic") + * future naming trends will be towards RF_ and rf_ for + * everything raidframe-related + * + * Revision 1.7 1996/05/03 19:55:27 wvcii + * added misc routines from old dag creation files + * + * Revision 1.6 1995/12/01 15:57:28 root + * added copyright info + * + * Revision 1.5 1995/11/07 16:21:36 wvcii + * modified InitNode and InitNodeFromBuf prototypes + * + */ + +#include "rf_types.h" +#include "rf_dagfuncs.h" +#include "rf_general.h" + +#ifndef _RF__RF_DAGUTILS_H_ +#define _RF__RF_DAGUTILS_H_ + +struct RF_RedFuncs_s { + int (*regular)(RF_DagNode_t *); + char *RegularName; + int (*simple)(RF_DagNode_t *); + char *SimpleName; +}; + +extern RF_RedFuncs_t rf_xorFuncs; +extern RF_RedFuncs_t rf_xorRecoveryFuncs; + +void rf_InitNode(RF_DagNode_t *node, RF_NodeStatus_t initstatus, + int commit, + int (*doFunc)(RF_DagNode_t *node), + int (*undoFunc)(RF_DagNode_t *node), + int (*wakeFunc)(RF_DagNode_t *node, int status), + int nSucc, int nAnte, int nParam, int nResult, + RF_DagHeader_t *hdr, char *name, RF_AllocListElem_t *alist); + +void rf_FreeDAG(RF_DagHeader_t *dag_h); + +RF_PropHeader_t *rf_MakePropListEntry(RF_DagHeader_t *dag_h, int resultNum, + int paramNum, RF_PropHeader_t *next, RF_AllocListElem_t *allocList); + +int rf_ConfigureDAGs(RF_ShutdownList_t **listp); + +RF_DagHeader_t *rf_AllocDAGHeader(void); + +void rf_FreeDAGHeader(RF_DagHeader_t *dh); + +void *rf_AllocBuffer(RF_Raid_t *raidPtr, RF_DagHeader_t *dag_h, + RF_PhysDiskAddr_t *pda, RF_AllocListElem_t *allocList); + +char *rf_NodeStatusString(RF_DagNode_t *node); + +void rf_PrintNodeInfoString(RF_DagNode_t *node); + +int rf_AssignNodeNums(RF_DagHeader_t *dag_h); + +int rf_RecurAssignNodeNums(RF_DagNode_t *node, int num, int unvisited); + +void rf_ResetDAGHeaderPointers(RF_DagHeader_t *dag_h, RF_DagHeader_t *newptr); + +void rf_RecurResetDAGHeaderPointers(RF_DagNode_t *node, RF_DagHeader_t *newptr); + +void rf_PrintDAGList(RF_DagHeader_t *dag_h); + +int rf_ValidateDAG(RF_DagHeader_t *dag_h); + 
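rf_PDAOverlap(), implemented earlier in rf_dagutils.c and declared a few lines below, reduces to a closed-interval overlap test on stripe-unit offsets. The stand-alone sketch here is an editorial illustration only; the stripe-unit size and sector numbers are invented, and a plain modulo stands in for rf_StripeUnitOffset(), which is an assumption about a simple layout.

#include <stdio.h>

int
main(void)
{
	long sectPerSU = 64;			/* sectors per stripe unit (made up) */
	long src_start = 130, src_len = 20;	/* source pda: sectors 130..149      */
	long dst_start = 145, dst_len = 30;	/* dest pda:   sectors 145..174      */

	/* offsets within the stripe unit; the -1 keeps the end inside the SU */
	long soffs = src_start % sectPerSU;			/* 2  */
	long send  = (src_start + src_len - 1) % sectPerSU;	/* 21 */
	long doffs = dst_start % sectPerSU;			/* 17 */
	long dend  = (dst_start + dst_len - 1) % sectPerSU;	/* 46 */

	long lo = (soffs > doffs) ? soffs : doffs;	/* RF_MAX of the starts */
	long hi = (send < dend) ? send : dend;		/* RF_MIN of the ends   */

	/* prints "overlap: 1" -- sectors 145..149 are covered by both pdas */
	printf("overlap: %d\n", lo <= hi ? 1 : 0);
	return (0);
}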
+void rf_redirect_asm(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap); + +void rf_MapUnaccessedPortionOfStripe(RF_Raid_t *raidPtr, + RF_RaidLayout_t *layoutPtr, + RF_AccessStripeMap_t *asmap, RF_DagHeader_t *dag_h, + RF_AccessStripeMapHeader_t **new_asm_h, int *nRodNodes, char **sosBuffer, + char **eosBuffer, RF_AllocListElem_t *allocList); + +int rf_PDAOverlap(RF_RaidLayout_t *layoutPtr, RF_PhysDiskAddr_t *src, + RF_PhysDiskAddr_t *dest); + +void rf_GenerateFailedAccessASMs(RF_Raid_t *raidPtr, + RF_AccessStripeMap_t *asmap, RF_PhysDiskAddr_t *failedPDA, + RF_DagHeader_t *dag_h, RF_AccessStripeMapHeader_t **new_asm_h, + int *nXorBufs, char **rpBufPtr, char *overlappingPDAs, + RF_AllocListElem_t *allocList); + +/* flags used by RangeRestrictPDA */ +#define RF_RESTRICT_NOBUFFER 0 +#define RF_RESTRICT_DOBUFFER 1 + +void rf_RangeRestrictPDA(RF_Raid_t *raidPtr, RF_PhysDiskAddr_t *src, + RF_PhysDiskAddr_t *dest, int dobuffer, int doraidaddr); + +int rf_compute_workload_shift(RF_Raid_t *raidPtr, RF_PhysDiskAddr_t *pda); +void rf_SelectMirrorDiskIdle(RF_DagNode_t *node); +void rf_SelectMirrorDiskPartition(RF_DagNode_t *node); + +#endif /* !_RF__RF_DAGUTILS_H_ */ diff --git a/sys/dev/raidframe/rf_debugMem.c b/sys/dev/raidframe/rf_debugMem.c new file mode 100644 index 00000000000..7d32463a11a --- /dev/null +++ b/sys/dev/raidframe/rf_debugMem.c @@ -0,0 +1,578 @@ +/* $OpenBSD: rf_debugMem.c,v 1.1 1999/01/11 14:29:12 niklas Exp $ */ +/* $NetBSD: rf_debugMem.c,v 1.1 1998/11/13 04:20:28 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Daniel Stodolsky, Mark Holland, Jim Zelenka + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* debugMem.c: memory usage debugging stuff. + * Malloc, Calloc, and Free are #defined everywhere + * to do_malloc, do_calloc, and do_free. + * + * if RF_UTILITY is nonzero, it means were compiling one of the + * raidframe utility programs, such as rfctrl or smd. In this + * case, we eliminate all references to the threads package + * and to the allocation list stuff. + */ + +/* : + * Log: rf_debugMem.c,v + * Revision 1.38 1996/08/20 14:45:43 jimz + * add debugging to track memory allocated (amount only, w/out + * excessive sanity checking) + * + * Revision 1.37 1996/07/27 23:36:08 jimz + * Solaris port of simulator + * + * Revision 1.36 1996/07/18 22:57:14 jimz + * port simulator to AIX + * + * Revision 1.35 1996/06/13 08:55:38 jimz + * make error messages refer to file, line of original + * allocation + * + * Revision 1.34 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. 
Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). + * + * Revision 1.33 1996/06/09 02:36:46 jimz + * lots of little crufty cleanup- fixup whitespace + * issues, comment #ifdefs, improve typing in some + * places (esp size-related) + * + * Revision 1.32 1996/06/05 18:06:02 jimz + * Major code cleanup. The Great Renaming is now done. + * Better modularity. Better typing. Fixed a bunch of + * synchronization bugs. Made a lot of global stuff + * per-desc or per-array. Removed dead code. + * + * Revision 1.31 1996/05/30 23:22:16 jimz + * bugfixes of serialization, timing problems + * more cleanup + * + * Revision 1.30 1996/05/30 11:29:41 jimz + * Numerous bug fixes. Stripe lock release code disagreed with the taking code + * about when stripes should be locked (I made it consistent: no parity, no lock) + * There was a lot of extra serialization of I/Os which I've removed- a lot of + * it was to calculate values for the cache code, which is no longer with us. + * More types, function, macro cleanup. Added code to properly quiesce the array + * on shutdown. Made a lot of stuff array-specific which was (bogusly) general + * before. Fixed memory allocation, freeing bugs. + * + * Revision 1.29 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.28 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.27 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.26 1996/05/21 18:53:46 jimz + * return NULL for failed allocations, not panic + * + * Revision 1.25 1996/05/20 16:14:19 jimz + * switch to rf_{mutex,cond}_{init,destroy} + * + * Revision 1.24 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.23 1996/05/17 12:42:35 jimz + * wrap get_threadid stuff in #ifndef UTILITY for utils which use + * redzone allocation stuff + * + * Revision 1.22 1996/05/16 23:06:09 jimz + * don't warn about NULL alists + * + * Revision 1.21 1996/05/16 22:25:02 jimz + * show allocations for [MC]allocAndAdd + * + * Revision 1.20 1996/05/15 18:30:22 jimz + * print memory allocation as well as frees if memDebug > 1 + * + * Revision 1.19 1996/05/07 17:41:17 jimz + * add "level 2" for memDebug, which will print freed address ranges + * + * Revision 1.18 1996/05/02 20:41:53 jimz + * really fix malloc problem out-of-kernel in memory_hash_insert() + * + * Revision 1.17 1996/05/02 20:04:29 jimz + * fixed malloc deadlock previous change introduced + * + * Revision 1.16 1996/05/01 16:27:26 jimz + * get rid of ALLOCMH + * stop using ccmn_ memory management + * + * Revision 1.15 1995/12/12 18:10:06 jimz + * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT + * fix 80-column brain damage in comments + * + * Revision 1.14 1995/12/01 15:56:17 root + * added copyright info + * + */ + +#include "rf_types.h" +#include "rf_sys.h" + +#if RF_UTILITY == 0 +#include "rf_threadstuff.h" +#include "rf_threadid.h" +#include "rf_options.h" +#else /* RF_UTILITY == 0 */ +#include "rf_utility.h" +#endif /* RF_UTILITY == 0 */ + +#ifndef KERNEL +#include <stdio.h> +#include <assert.h> +#endif /* !KERNEL */ +#include "rf_debugMem.h" +#include "rf_general.h" + +static long tot_mem_in_use = 0, max_mem = 0; + +/* Hash table of information about 
memory allocations */ +#define RF_MH_TABLESIZE 1000 + +struct mh_struct { + void *address; + int size; + int line; + char *filen; + char allocated; + struct mh_struct *next; +}; +static struct mh_struct *mh_table[RF_MH_TABLESIZE]; +RF_DECLARE_MUTEX(rf_debug_mem_mutex) +static int mh_table_initialized=0; + +static void memory_hash_insert(void *addr, int size, int line, char *filen); +static int memory_hash_remove(void *addr, int sz); + +#ifndef KERNEL /* no redzones or "real_" routines in the kernel */ + +static void rf_redzone_free_failed(void *ptr, int size, int line, char *file); + +void *rf_real_redzone_malloc(_size_) + int _size_; +{ + char *p; + + rf_validate_mh_table(); + p = malloc((_size_)+16); + if (p == NULL) + return(p); + RF_ASSERT (p); + *((long *) p) = (_size_) ; + ((char *) p)[(_size_)+8] = '!'; + ((char *) p)[(_size_)+15] = '!'; + p += 8; + return(p); +} + +void *rf_real_redzone_calloc(_n_,_size_) +int _n_,_size_; +{ + char *p; + int _sz_; + + rf_validate_mh_table(); + _sz_ = (_n_) * (_size_); + p = malloc((_sz_)+16); + if (p == NULL) + return(p); + bzero(p,(_sz_)+16); + *((long *) p) = (_sz_) ; + ((char *) p)[(_sz_)+8] = '!'; + ((char *) p)[(_sz_)+15] = '!'; + p += 8; + return(p); +} + +void rf_real_redzone_free(p, line, filen) +char *p; +int line; +char *filen; +{ + unsigned long _size_; + + rf_validate_mh_table(); + p -= 8; + _size_ = *((long *) p); + if ((((char *) p)[(_size_)+8] != '!') || (((char *) p)[(_size_)+15] != '!')) + rf_redzone_free_failed(p,(_size_),line,filen); + free(p); +} + +unsigned long rf_mem_alloc = 0; + +char *rf_real_Malloc(size, line, file) + int size; + int line; + char *file; +{ + void *pp; + char *p; + int tid; + + RF_LOCK_MUTEX(rf_debug_mem_mutex); + rf_redzone_malloc(pp, size); + p = pp; + if (p == NULL) { + RF_ERRORMSG3("Unable to malloc %d bytes at line %d file %s\n", size, + line, file); + } + if (rf_memAmtDebug) { + rf_mem_alloc += size; + printf("%lu size %d %s:%d\n", rf_mem_alloc, size, file, line); + } +#if RF_UTILITY == 0 + if (rf_memDebug > 1) { + rf_get_threadid(tid); + printf("[%d] malloc 0x%lx - 0x%lx (%d) %s %d\n", tid, p, p+size, size, + file, line); + } +#endif /* RF_UTILITY == 0 */ + if (rf_memDebug) + rf_record_malloc(p, size, line, file); + RF_UNLOCK_MUTEX(rf_debug_mem_mutex); + return(p); +} + +#if RF_UTILITY == 0 +char *rf_real_MallocAndAdd(size, alist, line, file) + int size; + RF_AllocListElem_t *alist; + int line; + char *file; +{ + void *pp; + char *p; + int tid; + + RF_LOCK_MUTEX(rf_debug_mem_mutex); + rf_redzone_malloc(pp, size); + p = pp; + if (p == NULL) { + RF_ERRORMSG3("Unable to malloc %d bytes at line %d file %s\n", size, + line, file); + } + if (rf_memAmtDebug) { + rf_mem_alloc += size; + printf("%lu size %d %s:%d\n", rf_mem_alloc, size, file, line); + } + if (rf_memDebug > 1) { + rf_get_threadid(tid); + printf("[%d] malloc+add 0x%lx - 0x%lx (%d) %s %d\n", tid, p, p+size, + size, file, line); + } + if (alist) { + rf_real_AddToAllocList(alist, pp, size, 0); + } + if (rf_memDebug) + rf_record_malloc(p, size, line, file); + RF_UNLOCK_MUTEX(rf_debug_mem_mutex); + return(p); +} +#endif /* RF_UTILITY == 0 */ + +char *rf_real_Calloc(nel, elsz, line, file) + int nel; + int elsz; + int line; + char *file; +{ + int tid, size; + void *pp; + char *p; + + size = nel * elsz; + RF_LOCK_MUTEX(rf_debug_mem_mutex); + rf_redzone_calloc(pp, nel, elsz); + p = pp; + if (p == NULL) { + RF_ERRORMSG4("Unable to calloc %d objects of size %d at line %d file %s\n", + nel, elsz, line, file); + return(NULL); + } + if (rf_memAmtDebug) { + 
rf_mem_alloc += size; + printf("%lu size %d %s:%d\n", rf_mem_alloc, size, file, line); + } +#if RF_UTILITY == 0 + if (rf_memDebug > 1) { + rf_get_threadid(tid); + printf("[%d] calloc 0x%lx - 0x%lx (%d,%d) %s %d\n", tid, p, p+size, nel, + elsz, file, line); + } +#endif /* RF_UTILITY == 0 */ + if (rf_memDebug) { + rf_record_malloc(p, size, line, file); + } + RF_UNLOCK_MUTEX(rf_debug_mem_mutex); + return(p); +} + +#if RF_UTILITY == 0 +char *rf_real_CallocAndAdd(nel, elsz, alist, line, file) + int nel; + int elsz; + RF_AllocListElem_t *alist; + int line; + char *file; +{ + int tid, size; + void *pp; + char *p; + + size = nel * elsz; + RF_LOCK_MUTEX(rf_debug_mem_mutex); + rf_redzone_calloc(pp, nel, elsz); + p = pp; + if (p == NULL) { + RF_ERRORMSG4("Unable to calloc %d objs of size %d at line %d file %s\n", + nel, elsz, line, file); + return(NULL); + } + if (rf_memAmtDebug) { + rf_mem_alloc += size; + printf("%lu size %d %s:%d\n", rf_mem_alloc, size, file, line); + } + if (rf_memDebug > 1) { + rf_get_threadid(tid); + printf("[%d] calloc+add 0x%lx - 0x%lx (%d,%d) %s %d\n", tid, p, + p+size, nel, elsz, file, line); + } + if (alist) { + rf_real_AddToAllocList(alist, pp, size, 0); + } + if (rf_memDebug) + rf_record_malloc(p, size, line, file); + RF_UNLOCK_MUTEX(rf_debug_mem_mutex); + return(p); +} +#endif /* RF_UTILITY == 0 */ + +void rf_real_Free(p, sz, line, file) + void *p; + int sz; + int line; + char *file; +{ + int tid; + +#if RF_UTILITY == 0 + if (rf_memDebug > 1) { + rf_get_threadid(tid); + printf("[%d] free 0x%lx - 0x%lx (%d) %s %d\n", tid, p, ((char *)p)+sz, sz, + file, line); + } +#endif /* RF_UTILITY == 0 */ + RF_LOCK_MUTEX(rf_debug_mem_mutex); + if (rf_memAmtDebug) { + rf_mem_alloc -= sz; + printf("%lu - size %d %s:%d\n", rf_mem_alloc, sz, file, line); + } + if (rf_memDebug) { + rf_unrecord_malloc(p,sz); + } + rf_redzone_free(p); + RF_UNLOCK_MUTEX(rf_debug_mem_mutex); +} + +void rf_validate_mh_table() +{ + int i, size; + struct mh_struct *p; + char *cp; + + return; + for (i=0; i<RF_MH_TABLESIZE; i++) { + for (p=mh_table[i]; p; p=p->next) if (p->allocated) { + cp = ((char *) p->address) - 8; + size = *((long *) cp); + if ((((char *) cp)[(size)+8] != '!') || (((char *) cp)[(size)+15] != '!')) { + rf_redzone_free_failed(cp,(size),__LINE__,__FILE__); + } + } + } +} + +static void rf_redzone_free_failed(ptr,size,line,file) + void *ptr; + int size; + int line; + char *file; +{ + RF_ERRORMSG4("Free of 0x%lx (recorded size %d) at %d of %s detected redzone overrun\n",ptr,size,line,file); + RF_ASSERT(0); +} + +#endif /* !KERNEL */ + +void rf_record_malloc(p, size, line, filen) +void *p; +int size, line; +char *filen; +{ + RF_ASSERT(size != 0); + + /*RF_LOCK_MUTEX(rf_debug_mem_mutex);*/ + memory_hash_insert(p, size, line, filen); + tot_mem_in_use += size; + /*RF_UNLOCK_MUTEX(rf_debug_mem_mutex);*/ + if ( (long) p == rf_memDebugAddress) { + printf("Allocate: debug address allocated from line %d file %s\n",line,filen); + } +} + +void rf_unrecord_malloc(p, sz) +void *p; +int sz; +{ + int size; + + /*RF_LOCK_MUTEX(rf_debug_mem_mutex);*/ + size = memory_hash_remove(p, sz); + tot_mem_in_use -= size; + /*RF_UNLOCK_MUTEX(rf_debug_mem_mutex);*/ + if ( (long) p == rf_memDebugAddress) { + printf("Free: Found debug address\n"); /* this is really only a flag line for gdb */ + } +} + +void rf_print_unfreed() +{ + int i, foundone=0; + struct mh_struct *p; + + for (i=0; i<RF_MH_TABLESIZE; i++) { + for (p=mh_table[i]; p; p=p->next) if (p->allocated) { + if (!foundone) printf("\n\nThere are unfreed memory 
locations at program shutdown:\n"); + foundone = 1; + printf("Addr 0x%lx Size %d line %d file %s\n", + (long)p->address,p->size,p->line,p->filen); + } + } + if (tot_mem_in_use) { + printf("%ld total bytes in use\n", tot_mem_in_use); + } +} + +int rf_ConfigureDebugMem(listp) + RF_ShutdownList_t **listp; +{ + int i, rc; + + rc = rf_create_managed_mutex(listp, &rf_debug_mem_mutex); + if (rc) { + RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__, + __LINE__, rc); + return(rc); + } + if (rf_memDebug) { + for (i=0; i<RF_MH_TABLESIZE; i++) + mh_table[i] = NULL; + mh_table_initialized=1; + } + return(0); +} + +#define HASHADDR(_a_) ( (((unsigned long) _a_)>>3) % RF_MH_TABLESIZE ) + +static void memory_hash_insert(addr, size, line, filen) +void *addr; +int size, line; +char *filen; +{ + unsigned long bucket = HASHADDR(addr); + struct mh_struct *p; + + RF_ASSERT(mh_table_initialized); + + /* search for this address in the hash table */ + for (p=mh_table[bucket]; p && (p->address != addr); p=p->next); + if (!p) { +#ifdef KERNEL + RF_Malloc(p,sizeof(struct mh_struct),(struct mh_struct *)); +#else /* KERNEL */ + p = (struct mh_struct *)malloc(sizeof(struct mh_struct)); +#endif /* KERNEL */ + RF_ASSERT(p); + p->next = mh_table[bucket]; + mh_table[bucket] = p; + p->address = addr; + p->allocated = 0; + } + if (p->allocated) { + printf("ERROR: reallocated address 0x%lx from line %d, file %s without intervening free\n",(long) addr, line, filen); + printf(" last allocated from line %d file %s\n",p->line, p->filen); + RF_ASSERT(0); + } + p->size = size; p->line = line; p->filen = filen; + p->allocated = 1; +} + +static int memory_hash_remove(addr, sz) +void *addr; +int sz; +{ + unsigned long bucket = HASHADDR(addr); + struct mh_struct *p; + + RF_ASSERT(mh_table_initialized); + for (p=mh_table[bucket]; p && (p->address != addr); p=p->next); + if (!p) { + printf("ERROR: freeing never-allocated address 0x%lx\n",(long) addr); + RF_PANIC(); + } + if (!p->allocated) { + printf("ERROR: freeing unallocated address 0x%lx. Last allocation line %d file %s\n",(long) addr, p->line, p->filen); + RF_PANIC(); + } + if (sz > 0 && p->size != sz) { /* you can suppress this error by using a negative value as the size to free */ + printf("ERROR: incorrect size at free for address 0x%lx: is %d should be %d. Alloc at line %d of file %s\n",(unsigned long) addr, sz, p->size,p->line, p->filen); + RF_PANIC(); + } + p->allocated = 0; + return(p->size); +} + +void rf_ReportMaxMem() +{ + printf("Max memory used: %d bytes\n",(int)max_mem); +#ifndef KERNEL + fflush(stdout); + fprintf(stderr,"Max memory used: %d bytes\n",max_mem); + fflush(stderr); +#endif /* !KERNEL */ +} diff --git a/sys/dev/raidframe/rf_debugMem.h b/sys/dev/raidframe/rf_debugMem.h new file mode 100644 index 00000000000..2b5f1545d12 --- /dev/null +++ b/sys/dev/raidframe/rf_debugMem.h @@ -0,0 +1,263 @@ +/* $OpenBSD: rf_debugMem.h,v 1.1 1999/01/11 14:29:12 niklas Exp $ */ +/* $NetBSD: rf_debugMem.h,v 1.1 1998/11/13 04:20:28 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Daniel Stodolsky, Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. 
+ * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* + * rf_debugMem.h -- memory leak debugging module + * + * IMPORTANT: if you put the lock/unlock mutex stuff back in here, you + * need to take it out of the routines in debugMem.c + * + * Log: rf_debugMem.h,v + * Revision 1.27 1996/07/18 22:57:14 jimz + * port simulator to AIX + * + * Revision 1.26 1996/06/11 13:46:43 jimz + * make bracing consistent around memory allocation macros + * + * Revision 1.25 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). + * + * Revision 1.24 1996/06/05 18:06:02 jimz + * Major code cleanup. The Great Renaming is now done. + * Better modularity. Better typing. Fixed a bunch of + * synchronization bugs. Made a lot of global stuff + * per-desc or per-array. Removed dead code. + * + * Revision 1.23 1996/05/30 11:29:41 jimz + * Numerous bug fixes. Stripe lock release code disagreed with the taking code + * about when stripes should be locked (I made it consistent: no parity, no lock) + * There was a lot of extra serialization of I/Os which I've removed- a lot of + * it was to calculate values for the cache code, which is no longer with us. + * More types, function, macro cleanup. Added code to properly quiesce the array + * on shutdown. Made a lot of stuff array-specific which was (bogusly) general + * before. Fixed memory allocation, freeing bugs. + * + * Revision 1.22 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.21 1996/05/23 22:17:40 jimz + * fix alloclist macro names for kernel + * + * Revision 1.20 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.19 1996/05/23 13:18:23 jimz + * include rf_options.h + * + * Revision 1.18 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.17 1996/05/21 18:51:54 jimz + * cleaned up macro args + * + * Revision 1.16 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.15 1996/05/01 16:26:22 jimz + * get rid of old ccmn stuff + * + * Revision 1.14 1995/12/01 15:58:09 root + * added copyright info + * + * Revision 1.13 1995/10/11 15:26:03 jimz + * zero memory after allocation in kernel (hide effects + * of uninitialized structs) + * + * Revision 1.12 1995/10/06 17:04:15 jimz + * make Malloc and Free in kernel use kernel malloc package, not cam + * dbufs (which is gross, and was exhausting cam zalloc limit) + * + * Revision 1.11 1995/05/01 13:28:00 holland + * parity range locks, locking disk requests, recon+parityscan in kernel, etc. 
+ * + * Revision 1.10 1995/04/24 13:25:51 holland + * rewrite to move disk queues, recon, & atomic RMW to kernel + * + * Revision 1.9 1995/02/17 19:39:56 holland + * added size param to all calls to Free(). + * this is ignored at user level, but necessary in the kernel. + * + * Revision 1.8 1995/02/10 17:34:10 holland + * kernelization changes + * + * Revision 1.7 1995/02/03 22:31:36 holland + * many changes related to kernelization + * + * Revision 1.6 1995/02/01 15:13:05 holland + * moved #include of general.h out of raid.h and into each file + * + * Revision 1.5 1995/02/01 14:25:19 holland + * began changes for kernelization: + * changed all instances of mutex_t and cond_t to DECLARE macros + * converted configuration code to use config structure + * + * Revision 1.4 1995/01/11 19:27:02 holland + * many changes related to performance tuning + * + * Revision 1.3 1994/11/29 21:34:56 danner + * Changed type of redzone_calloc and malloc to void *. + * + * Revision 1.2 1994/11/28 22:13:23 danner + * Many macros converted to functions. + * + */ + +#ifndef _RF__RF_DEBUGMEM_H_ +#define _RF__RF_DEBUGMEM_H_ + +#include "rf_archs.h" +#include "rf_alloclist.h" +#include "rf_options.h" + +#ifndef KERNEL + +#if !defined(__NetBSD__) && !defined(__OpenBSD__) +void *malloc(), *calloc(); +#endif +RF_DECLARE_EXTERN_MUTEX(rf_debug_mem_mutex) + +/* + * redzone malloc, calloc, and free allocate an extra 16 bytes on each + * malloc/calloc call to allow tracking of overflows on free. + */ + +#if RF_MEMORY_REDZONES > 0 +#define rf_redzone_malloc(_p_,_size_) _p_ = rf_real_redzone_malloc(_size_) +#define rf_redzone_calloc(_p_,_n_,_size_) _p_ = rf_real_redzone_calloc(_n_,_size_) +#define rf_redzone_free(_p_) rf_real_redzone_free(_p_, __LINE__, __FILE__) +#else /* RF_MEMORY_REDZONES > 0 */ +#define rf_redzone_malloc(_p_,_size_) _p_ = malloc(_size_) +#define rf_redzone_calloc(_p_,_nel_,_size_) _p_ = calloc(_nel_,_size_) +#define rf_redzone_free(_ptr_) free(_ptr_) +#endif /* RF_MEMORY_REDZONES > 0 */ + +#define RF_Malloc(_p_, _size_, _cast_) { \ + _p_ = _cast_ rf_real_Malloc(_size_, __LINE__, __FILE__); \ +} + +#define RF_MallocAndAdd(_p_, _size_, _cast_, _alist_) { \ + _p_ = _cast_ rf_real_MallocAndAdd(_size_, _alist_, __LINE__, __FILE__); \ +} + +#define RF_Calloc(_p_, _nel_, _elsz_, _cast_) { \ + _p_ = _cast_ rf_real_Calloc(_nel_, _elsz_, __LINE__, __FILE__); \ +} + +#define RF_CallocAndAdd(_p_, _nel_, _elsz_, _cast_, _alist_) { \ + _p_ = _cast_ rf_real_CallocAndAdd(_nel_, _elsz_, _alist_, __LINE__, __FILE__); \ +} + +#define RF_Free(__p_, _sz_) { \ + rf_real_Free(__p_, _sz_, __LINE__, __FILE__); \ +} + +#else /* KERNEL */ + +#include <sys/types.h> +#if defined(__NetBSD__) || defined(__OpenBSD__) +typedef u_int32_t U32; +#else +#include <io/common/iotypes.h> /* just to get defn of U32 */ +#endif /* __NetBSD__ || __OpenBSD__ */ +#include <sys/malloc.h> + + +#if defined(__NetBSD__) || defined(__OpenBSD__) + +#define RF_Malloc(_p_, _size_, _cast_) \ + { \ + _p_ = _cast_ malloc((u_long)_size_, M_DEVBUF, M_WAITOK); \ + bzero((char *)_p_, _size_); \ + if (rf_memDebug) rf_record_malloc(_p_, _size_, __LINE__, __FILE__); \ + } + +#else + +#define RF_Malloc(_p_, _size_, _cast_) \ + { \ + _p_ = _cast_ malloc((u_long)_size_, BUCKETINDEX(_size_), M_DEVBUF, M_WAITOK); \ + bzero((char *)_p_, _size_); \ + if (rf_memDebug) rf_record_malloc(_p_, _size_, __LINE__, __FILE__); \ + } +#endif /* __NetBSD__ || __OpenBSD__ */ + +#define RF_MallocAndAdd(__p_, __size_, __cast_, __alist_) \ + { \ + RF_Malloc(__p_, __size_, __cast_); \ + if 
(__alist_) rf_AddToAllocList(__alist_, __p_, __size_); \ + } + +#define RF_Calloc(_p_, _nel_, _elsz_, _cast_) \ + { \ + RF_Malloc( _p_, (_nel_) * (_elsz_), _cast_); \ + bzero( (_p_), (_nel_) * (_elsz_) ); \ + } + +#define RF_CallocAndAdd(__p,__nel,__elsz,__cast,__alist) \ + { \ + RF_Calloc(__p, __nel, __elsz, __cast); \ + if (__alist) rf_AddToAllocList(__alist, __p, (__nel)*(__elsz)); \ + } + +#define RF_Free(_p_, _sz_) \ + { \ + free((void *)(_p_), M_DEVBUF); \ + if (rf_memDebug) rf_unrecord_malloc(_p_, (U32) (_sz_)); \ + } + +#endif /* KERNEL */ + +#ifndef KERNEL +void *rf_real_redzone_malloc(int size); +void *rf_real_redzone_calloc(int n, int size); +void rf_real_redzone_free(char *p, int line, char *filen); +char *rf_real_Malloc(int size, int line, char *file); +char *rf_real_Calloc(int nel, int elsz, int line, char *file); +void rf_real_Free(void *p, int sz, int line, char *file); +void rf_validate_mh_table(void); +#if RF_UTILITY == 0 +char *rf_real_MallocAndAdd(int size, RF_AllocListElem_t *alist, int line, char *file); +char *rf_real_CallocAndAdd(int nel, int elsz, RF_AllocListElem_t *alist, int line, char *file); +#endif /* RF_UTILITY == 0 */ +#endif /* !KERNEL */ + +void rf_record_malloc(void *p, int size, int line, char *filen); +void rf_unrecord_malloc(void *p, int sz); +void rf_print_unfreed(void); +int rf_ConfigureDebugMem(RF_ShutdownList_t **listp); +void rf_ReportMaxMem(void); + +#endif /* !_RF__RF_DEBUGMEM_H_ */ diff --git a/sys/dev/raidframe/rf_debugprint.c b/sys/dev/raidframe/rf_debugprint.c new file mode 100644 index 00000000000..573d53ae71a --- /dev/null +++ b/sys/dev/raidframe/rf_debugprint.c @@ -0,0 +1,186 @@ +/* $OpenBSD: rf_debugprint.c,v 1.1 1999/01/11 14:29:13 niklas Exp $ */ +/* $NetBSD: rf_debugprint.c,v 1.1 1998/11/13 04:20:28 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* + * Code to do debug printfs. Calls to rf_debug_printf cause the corresponding + * information to be printed to a circular buffer rather than the screen. + * The point is to try and minimize the timing variations induced by the + * printfs, and to capture only the printf's immediately preceding a failure. + */ + +/* : + * Log: rf_debugprint.c,v + * Revision 1.13 1996/08/07 21:08:31 jimz + * remove bogus ; from mutex decl + * + * Revision 1.12 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). 
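A minimal usage sketch of the rf_debugMem.h allocation wrappers defined above (hypothetical type and field, for illustration only). The macros are statements that assign through their first argument and take the destination cast as a separate parameter; RF_Free() needs the allocation size so rf_unrecord_malloc() can match the record when rf_memDebug is set.

	struct rf_foo { int x; };	/* hypothetical type */
	struct rf_foo *p;

	RF_Malloc(p, sizeof(struct rf_foo), (struct rf_foo *));	/* kernel path also zeroes the block */
	if (p) {
		p->x = 42;
		RF_Free(p, sizeof(struct rf_foo));	/* size argument lets the debug code unrecord it */
	}

The same pattern appears later in this import, e.g. RF_Malloc(req, sizeof(*req), (RF_SparetWait_t *)) in rf_InstallSpareTable() below.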
+ * + * Revision 1.11 1996/06/05 18:06:02 jimz + * Major code cleanup. The Great Renaming is now done. + * Better modularity. Better typing. Fixed a bunch of + * synchronization bugs. Made a lot of global stuff + * per-desc or per-array. Removed dead code. + * + * Revision 1.10 1996/05/30 23:22:16 jimz + * bugfixes of serialization, timing problems + * more cleanup + * + * Revision 1.9 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.8 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.7 1996/05/20 16:16:06 jimz + * switch to rf_{mutex,cond}_{init,destroy} + * + * Revision 1.6 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.5 1995/12/01 16:00:45 root + * added copyright info + * + */ + +#include "rf_types.h" +#include "rf_threadstuff.h" +#include "rf_debugprint.h" +#include "rf_general.h" +#include "rf_options.h" + +#include <sys/param.h> + +struct RF_Entry_s { + char *cstring; + void *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8; +}; + +/* space for 1k lines */ +#define BUFSHIFT 10 +#define BUFSIZE (1<<BUFSHIFT) +#define BUFMASK (BUFSIZE-1) + +static struct RF_Entry_s rf_debugprint_buf[BUFSIZE]; +static int rf_debugprint_index = 0; +RF_DECLARE_STATIC_MUTEX(rf_debug_print_mutex) + +int rf_ConfigureDebugPrint(listp) + RF_ShutdownList_t **listp; +{ + int rc; + + rc = rf_create_managed_mutex(listp, &rf_debug_print_mutex); + if (rc) { + RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__, + __LINE__, rc); + return(rc); + } + rf_clear_debug_print_buffer(); + return(0); +} + +void rf_clear_debug_print_buffer() +{ + int i; + + for (i=0; i<BUFSIZE; i++) + rf_debugprint_buf[i].cstring = NULL; + rf_debugprint_index = 0; +} + +void rf_debug_printf(s,a1,a2,a3,a4,a5,a6,a7,a8) +char *s; +void *a1,*a2,*a3,*a4,*a5,*a6,*a7,*a8; +{ + int idx; + + if (rf_debugPrintUseBuffer) { + + RF_LOCK_MUTEX(rf_debug_print_mutex); + idx = rf_debugprint_index; + rf_debugprint_index = (rf_debugprint_index+1) & BUFMASK; + RF_UNLOCK_MUTEX(rf_debug_print_mutex); + + rf_debugprint_buf[idx].cstring = s; + rf_debugprint_buf[idx].a1 = a1; + rf_debugprint_buf[idx].a2 = a2; + rf_debugprint_buf[idx].a3 = a3; + rf_debugprint_buf[idx].a4 = a4; + rf_debugprint_buf[idx].a5 = a5; + rf_debugprint_buf[idx].a6 = a6; + rf_debugprint_buf[idx].a7 = a7; + rf_debugprint_buf[idx].a8 = a8; + } + else { + printf(s,a1,a2,a3,a4,a5,a6,a7,a8); + } +} + +void rf_print_debug_buffer() +{ + rf_spill_debug_buffer(NULL); +} + +void rf_spill_debug_buffer(fname) + char *fname; +{ + int i; +#ifndef KERNEL + FILE *fp; +#endif /* !KERNEL */ + + if (!rf_debugPrintUseBuffer) + return; + + RF_LOCK_MUTEX(rf_debug_print_mutex); +#ifndef KERNEL + fp = (fname) ? 
fopen(fname,"w") : stdout; + if (!fp) {printf("Unable to open file %s for writing\n",fname); return;} + for (i=rf_debugprint_index+1; i != rf_debugprint_index; i = (i+1)&BUFMASK) if (rf_debugprint_buf[i].cstring) + fprintf(fp,rf_debugprint_buf[i].cstring,rf_debugprint_buf[i].a1,rf_debugprint_buf[i].a2,rf_debugprint_buf[i].a3, + rf_debugprint_buf[i].a4,rf_debugprint_buf[i].a5,rf_debugprint_buf[i].a6,rf_debugprint_buf[i].a7,rf_debugprint_buf[i].a8); + fprintf(fp,rf_debugprint_buf[i].cstring,rf_debugprint_buf[i].a1,rf_debugprint_buf[i].a2,rf_debugprint_buf[i].a3, + rf_debugprint_buf[i].a4,rf_debugprint_buf[i].a5,rf_debugprint_buf[i].a6,rf_debugprint_buf[i].a7,rf_debugprint_buf[i].a8); + fclose(fp); +#else /* !KERNEL */ + for (i=rf_debugprint_index+1; i != rf_debugprint_index; i = (i+1)&BUFMASK) if (rf_debugprint_buf[i].cstring) + printf(rf_debugprint_buf[i].cstring,rf_debugprint_buf[i].a1,rf_debugprint_buf[i].a2,rf_debugprint_buf[i].a3, + rf_debugprint_buf[i].a4,rf_debugprint_buf[i].a5,rf_debugprint_buf[i].a6,rf_debugprint_buf[i].a7,rf_debugprint_buf[i].a8); + printf(rf_debugprint_buf[i].cstring,rf_debugprint_buf[i].a1,rf_debugprint_buf[i].a2,rf_debugprint_buf[i].a3, + rf_debugprint_buf[i].a4,rf_debugprint_buf[i].a5,rf_debugprint_buf[i].a6,rf_debugprint_buf[i].a7,rf_debugprint_buf[i].a8); +#endif /* !KERNEL */ + RF_UNLOCK_MUTEX(rf_debug_print_mutex); +} diff --git a/sys/dev/raidframe/rf_debugprint.h b/sys/dev/raidframe/rf_debugprint.h new file mode 100644 index 00000000000..6810fd0a6ee --- /dev/null +++ b/sys/dev/raidframe/rf_debugprint.h @@ -0,0 +1,64 @@ +/* $OpenBSD: rf_debugprint.h,v 1.1 1999/01/11 14:29:13 niklas Exp $ */ +/* $NetBSD: rf_debugprint.h,v 1.1 1998/11/13 04:20:28 oster Exp $ */ +/* + * rf_debugprint.h + */ +/* + * Copyright (c) 1996 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ +/* + * : + * Log: rf_debugprint.h,v + * Revision 1.4 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). 
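The rf_debug_printf() interface above takes a format string plus exactly eight pointer-sized arguments, and when rf_debugPrintUseBuffer is set the call only records the entry in the circular buffer, so nothing reaches the console until rf_print_debug_buffer() (or rf_spill_debug_buffer()) is invoked, typically from a failure path. A hypothetical call site, padding the unused argument slots with NULL:

	/* hypothetical call site: three values used, remaining slots padded with NULL */
	rf_debug_printf("recon: row %d col %d psid %d\n",
	    (void *)(long)row, (void *)(long)col, (void *)(long)psid,
	    NULL, NULL, NULL, NULL, NULL);

	/* after a failure, emit whatever the circular buffer captured */
	rf_print_debug_buffer();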
+ * + * Revision 1.3 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.2 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.1 1996/05/18 19:55:43 jimz + * Initial revision + * + */ + +#ifndef _RF__RF_DEBUGPRINT_H_ +#define _RF__RF_DEBUGPRINT_H_ + +int rf_ConfigureDebugPrint(RF_ShutdownList_t **listp); +void rf_clear_debug_print_buffer(void); +void rf_debug_printf(char *s, void *a1, void *a2, void *a3, void *a4, + void *a5, void *a6, void *a7, void *a8); +void rf_print_debug_buffer(void); +void rf_spill_debug_buffer(char *fname); + +#endif /* !_RF__RF_DEBUGPRINT_H_ */ diff --git a/sys/dev/raidframe/rf_decluster.c b/sys/dev/raidframe/rf_decluster.c new file mode 100644 index 00000000000..11cff33143a --- /dev/null +++ b/sys/dev/raidframe/rf_decluster.c @@ -0,0 +1,847 @@ +/* $OpenBSD: rf_decluster.c,v 1.1 1999/01/11 14:29:14 niklas Exp $ */ +/* $NetBSD: rf_decluster.c,v 1.1 1998/11/13 04:20:28 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/*---------------------------------------------------------------------- + * + * rf_decluster.c -- code related to the declustered layout + * + * Created 10-21-92 (MCH) + * + * Nov 93: adding support for distributed sparing. This code is a little + * complex: the basic layout used is as follows: + * let F = (v-1)/GCD(r,v-1). The spare space for each set of + * F consecutive fulltables is grouped together and placed after + * that set of tables. + * +------------------------------+ + * | F fulltables | + * | Spare Space | + * | F fulltables | + * | Spare Space | + * | ... | + * +------------------------------+ + * + *--------------------------------------------------------------------*/ + +/* + * : + * Log: rf_decluster.c,v + * Revision 1.51 1996/08/21 19:47:10 jimz + * fix bogus return values from config + * + * Revision 1.50 1996/08/20 22:41:42 jimz + * better diagnostics for bad blockdesigns + * + * Revision 1.49 1996/07/31 16:56:18 jimz + * dataBytesPerStripe, sectorsPerDisk init arch-indep. 
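A worked instance of the spare-space grouping sketched above, with hypothetical parameters: for v = 7 disks and r = 3, F = (v-1)/GCD(r, v-1) = 6/3 = 2, so the spare space belonging to every 2 consecutive fulltables is collected into one region placed after them. This is the same quantity the configuration code below computes as info->FullTablesPerSpareRegion = (v-1) / rf_gcd(r, v-1).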
+ * + * Revision 1.48 1996/07/29 14:05:12 jimz + * fix numPUs/numRUs confusion (everything is now numRUs) + * clean up some commenting, return values + * + * Revision 1.47 1996/07/27 23:36:08 jimz + * Solaris port of simulator + * + * Revision 1.46 1996/07/27 18:40:11 jimz + * cleanup sweep + * + * Revision 1.45 1996/07/18 22:57:14 jimz + * port simulator to AIX + * + * Revision 1.44 1996/07/13 00:00:59 jimz + * sanitized generalized reconstruction architecture + * cleaned up head sep, rbuf problems + * + * Revision 1.43 1996/06/19 17:53:48 jimz + * move GetNumSparePUs, InstallSpareTable ops into layout switch + * + * Revision 1.42 1996/06/17 03:23:48 jimz + * switch DeclusteredDS typing + * + * Revision 1.41 1996/06/11 08:55:15 jimz + * improved error-checking at configuration time + * + * Revision 1.40 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). + * + * Revision 1.39 1996/06/09 02:36:46 jimz + * lots of little crufty cleanup- fixup whitespace + * issues, comment #ifdefs, improve typing in some + * places (esp size-related) + * + * Revision 1.38 1996/06/07 22:26:27 jimz + * type-ify which_ru (RF_ReconUnitNum_t) + * + * Revision 1.37 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.36 1996/06/03 23:28:26 jimz + * more bugfixes + * check in tree to sync for IPDS runs with current bugfixes + * there still may be a problem with threads in the script test + * getting I/Os stuck- not trivially reproducible (runs ~50 times + * in a row without getting stuck) + * + * Revision 1.35 1996/06/02 17:31:48 jimz + * Moved a lot of global stuff into array structure, where it belongs. + * Fixed up paritylogging, pss modules in this manner. Some general + * code cleanup. Removed lots of dead code, some dead files. + * + * Revision 1.34 1996/05/30 23:22:16 jimz + * bugfixes of serialization, timing problems + * more cleanup + * + * Revision 1.33 1996/05/30 11:29:41 jimz + * Numerous bug fixes. Stripe lock release code disagreed with the taking code + * about when stripes should be locked (I made it consistent: no parity, no lock) + * There was a lot of extra serialization of I/Os which I've removed- a lot of + * it was to calculate values for the cache code, which is no longer with us. + * More types, function, macro cleanup. Added code to properly quiesce the array + * on shutdown. Made a lot of stuff array-specific which was (bogusly) general + * before. Fixed memory allocation, freeing bugs. 
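For orientation, the layoutSpecific buffer that rf_ConfigureDeclustered() parses below is laid out as the sparemap file name (RF_SPAREMAP_NAME_LEN bytes), six ints (b, v, k, r, lambda, noRotate), and then the b*k block design table, one byte per entry. A hypothetical packing helper on the configuration-utility side might look like this (names here are illustrative, not from the import):

	#include <string.h>

	/* hypothetical helper: field order must match the parsing order in rf_ConfigureDeclustered() */
	static void
	pack_decluster_layout(char *p, const char *sparemap_fname, int b, int v,
	    int k, int r, int lambda, int norotate, char **design)
	{
		int i, j;

		memcpy(p, sparemap_fname, RF_SPAREMAP_NAME_LEN); p += RF_SPAREMAP_NAME_LEN;
		memcpy(p, &b, sizeof(int)); p += sizeof(int);
		memcpy(p, &v, sizeof(int)); p += sizeof(int);
		memcpy(p, &k, sizeof(int)); p += sizeof(int);
		memcpy(p, &r, sizeof(int)); p += sizeof(int);
		memcpy(p, &lambda, sizeof(int)); p += sizeof(int);
		memcpy(p, &norotate, sizeof(int)); p += sizeof(int);
		for (i = 0; i < b; i++)			/* block design table: one disk index per byte */
			for (j = 0; j < k; j++)
				*p++ = design[i][j];
	}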
+ * + * Revision 1.32 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.31 1996/05/24 01:59:45 jimz + * another checkpoint in code cleanup for release + * time to sync kernel tree + * + * Revision 1.30 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.29 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.28 1995/12/12 18:10:06 jimz + * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT + * fix 80-column brain damage in comments + * + * Revision 1.27 1995/12/01 16:00:08 root + * added copyright info + * + * Revision 1.26 1995/11/28 21:35:12 amiri + * set the RF_BD_DECLUSTERED flag + * + * Revision 1.25 1995/11/17 18:56:00 wvcii + * added prototyping to MapParity + * + * Revision 1.24 1995/07/04 22:25:33 holland + * increased default num bufs + * + * Revision 1.23 1995/07/03 20:23:51 holland + * changed floating recon bufs & head sep yet again + * + * Revision 1.22 1995/07/03 18:12:14 holland + * changed the way the number of floating recon bufs & the head sep + * limit are set + * + * Revision 1.21 1995/07/02 15:07:42 holland + * bug fixes related to getting distributed sparing numbers + * + * Revision 1.20 1995/06/23 13:41:28 robby + * updeated to prototypes in rf_layout.h + * + */ + +#ifdef _KERNEL +#define KERNEL +#endif + + +#include "rf_types.h" +#include "rf_raid.h" +#include "rf_raidframe.h" +#include "rf_configure.h" +#include "rf_decluster.h" +#include "rf_debugMem.h" +#include "rf_utils.h" +#include "rf_alloclist.h" +#include "rf_general.h" +#include "rf_shutdown.h" +#include "rf_sys.h" + +extern int rf_copyback_in_progress; /* debug only */ + +/* found in rf_kintf.c */ +int rf_GetSpareTableFromDaemon(RF_SparetWait_t *req); + +/* configuration code */ + +int rf_ConfigureDeclustered( + RF_ShutdownList_t **listp, + RF_Raid_t *raidPtr, + RF_Config_t *cfgPtr) +{ + RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); + int b, v, k, r, lambda; /* block design params */ + int i, j; + RF_RowCol_t *first_avail_slot; + RF_StripeCount_t complete_FT_count, numCompleteFullTablesPerDisk; + RF_DeclusteredConfigInfo_t *info; + RF_StripeCount_t PUsPerDisk, spareRegionDepthInPUs, numCompleteSpareRegionsPerDisk, extraPUsPerDisk; + RF_StripeCount_t totSparePUsPerDisk; + RF_SectorNum_t diskOffsetOfLastFullTableInSUs; + RF_SectorCount_t SpareSpaceInSUs; + char *cfgBuf = (char *) (cfgPtr->layoutSpecific); + RF_StripeNum_t l, SUID; + + SUID = l = 0; + numCompleteSpareRegionsPerDisk = 0; + + /* 1. create layout specific structure */ + RF_MallocAndAdd(info, sizeof(RF_DeclusteredConfigInfo_t), (RF_DeclusteredConfigInfo_t *), raidPtr->cleanupList); + if (info == NULL) + return(ENOMEM); + layoutPtr->layoutSpecificInfo = (void *) info; + info->SpareTable = NULL; + + /* 2. 
extract parameters from the config structure */ + if (layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) { + (void) bcopy(cfgBuf, info->sparemap_fname, RF_SPAREMAP_NAME_LEN); + } + cfgBuf += RF_SPAREMAP_NAME_LEN; + + b = *( (int *) cfgBuf); cfgBuf += sizeof(int); + v = *( (int *) cfgBuf); cfgBuf += sizeof(int); + k = *( (int *) cfgBuf); cfgBuf += sizeof(int); + r = *( (int *) cfgBuf); cfgBuf += sizeof(int); + lambda = *( (int *) cfgBuf); cfgBuf += sizeof(int); + raidPtr->noRotate = *( (int *) cfgBuf); cfgBuf += sizeof(int); + + /* the sparemaps are generated assuming that parity is rotated, so we issue + * a warning if both distributed sparing and no-rotate are on at the same time + */ + if ((layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) && raidPtr->noRotate) { + RF_ERRORMSG("Warning: distributed sparing specified without parity rotation.\n"); + } + + if (raidPtr->numCol != v) { + RF_ERRORMSG2("RAID: config error: table element count (%d) not equal to no. of cols (%d)\n", v, raidPtr->numCol); + return(EINVAL); + } + + /* 3. set up the values used in the mapping code */ + info->BlocksPerTable = b; + info->Lambda = lambda; + info->NumParityReps = info->groupSize = k; + info->SUsPerTable = b * (k-1) * layoutPtr->SUsPerPU;/* b blks, k-1 SUs each */ + info->SUsPerFullTable = k * info->SUsPerTable; /* rot k times */ + info->PUsPerBlock = k-1; + info->SUsPerBlock = info->PUsPerBlock * layoutPtr->SUsPerPU; + info->TableDepthInPUs = (b*k) / v; + info->FullTableDepthInPUs = info->TableDepthInPUs * k; /* k repetitions */ + + /* used only in distributed sparing case */ + info->FullTablesPerSpareRegion = (v-1) / rf_gcd(r, v-1); /* (v-1)/gcd fulltables */ + info->TablesPerSpareRegion = k * info->FullTablesPerSpareRegion; + info->SpareSpaceDepthPerRegionInSUs = (r * info->TablesPerSpareRegion / (v-1)) * layoutPtr->SUsPerPU; + + /* check to make sure the block design is sufficiently small */ + if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) { + if (info->FullTableDepthInPUs * layoutPtr->SUsPerPU + info->SpareSpaceDepthPerRegionInSUs > layoutPtr->stripeUnitsPerDisk) { + RF_ERRORMSG3("RAID: config error: Full Table depth (%d) + Spare Space (%d) larger than disk size (%d) (BD too big)\n", + (int)info->FullTableDepthInPUs, + (int)info->SpareSpaceDepthPerRegionInSUs, + (int)layoutPtr->stripeUnitsPerDisk); + return(EINVAL); + } + } else { + if (info->TableDepthInPUs * layoutPtr->SUsPerPU > layoutPtr->stripeUnitsPerDisk) { + RF_ERRORMSG2("RAID: config error: Table depth (%d) larger than disk size (%d) (BD too big)\n", + (int)(info->TableDepthInPUs * layoutPtr->SUsPerPU), \ + (int)layoutPtr->stripeUnitsPerDisk); + return(EINVAL); + } + } + + + /* compute the size of each disk, and the number of tables in the last fulltable (which + * need not be complete) + */ + if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) { + + PUsPerDisk = layoutPtr->stripeUnitsPerDisk / layoutPtr->SUsPerPU; + spareRegionDepthInPUs = (info->TablesPerSpareRegion * info->TableDepthInPUs + + (info->TablesPerSpareRegion * info->TableDepthInPUs) / (v-1)); + info->SpareRegionDepthInSUs = spareRegionDepthInPUs * layoutPtr->SUsPerPU; + + numCompleteSpareRegionsPerDisk = PUsPerDisk / spareRegionDepthInPUs; + info->NumCompleteSRs = numCompleteSpareRegionsPerDisk; + extraPUsPerDisk = PUsPerDisk % spareRegionDepthInPUs; + + /* assume conservatively that we need the full amount of spare space in one region in order + * to provide spares for the partial spare region at the end of the array. 
We set "i" to + * the number of tables in the partial spare region. This may actually include some fulltables. + */ + extraPUsPerDisk -= (info->SpareSpaceDepthPerRegionInSUs / layoutPtr->SUsPerPU); + if (extraPUsPerDisk <= 0) i = 0; + else i = extraPUsPerDisk/info->TableDepthInPUs; + + complete_FT_count = raidPtr->numRow * (numCompleteSpareRegionsPerDisk * (info->TablesPerSpareRegion/k) + i/k); + info->FullTableLimitSUID = complete_FT_count * info->SUsPerFullTable; + info->ExtraTablesPerDisk = i % k; + + /* note that in the last spare region, the spare space is complete even though data/parity space is not */ + totSparePUsPerDisk = (numCompleteSpareRegionsPerDisk+1) * (info->SpareSpaceDepthPerRegionInSUs / layoutPtr->SUsPerPU); + info->TotSparePUsPerDisk = totSparePUsPerDisk; + + layoutPtr->stripeUnitsPerDisk = + ((complete_FT_count/raidPtr->numRow) * info->FullTableDepthInPUs + /* data & parity space */ + info->ExtraTablesPerDisk * info->TableDepthInPUs + + totSparePUsPerDisk /* spare space */ + ) * layoutPtr->SUsPerPU; + layoutPtr->dataStripeUnitsPerDisk = + (complete_FT_count * info->FullTableDepthInPUs + info->ExtraTablesPerDisk * info->TableDepthInPUs) + * layoutPtr->SUsPerPU * (k-1) / k; + + } else { + /* non-dist spare case: force each disk to contain an integral number of tables */ + layoutPtr->stripeUnitsPerDisk /= (info->TableDepthInPUs * layoutPtr->SUsPerPU); + layoutPtr->stripeUnitsPerDisk *= (info->TableDepthInPUs * layoutPtr->SUsPerPU); + + /* compute the number of tables in the last fulltable, which need not be complete */ + complete_FT_count = + ((layoutPtr->stripeUnitsPerDisk/layoutPtr->SUsPerPU) / info->FullTableDepthInPUs) * raidPtr->numRow; + + info->FullTableLimitSUID = complete_FT_count * info->SUsPerFullTable; + info->ExtraTablesPerDisk = + ((layoutPtr->stripeUnitsPerDisk/layoutPtr->SUsPerPU) / info->TableDepthInPUs) % k; + } + + raidPtr->sectorsPerDisk = layoutPtr->stripeUnitsPerDisk * layoutPtr->sectorsPerStripeUnit; + + /* find the disk offset of the stripe unit where the last fulltable starts */ + numCompleteFullTablesPerDisk = complete_FT_count / raidPtr->numRow; + diskOffsetOfLastFullTableInSUs = numCompleteFullTablesPerDisk * info->FullTableDepthInPUs * layoutPtr->SUsPerPU; + if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) { + SpareSpaceInSUs = numCompleteSpareRegionsPerDisk * info->SpareSpaceDepthPerRegionInSUs; + diskOffsetOfLastFullTableInSUs += SpareSpaceInSUs; + info->DiskOffsetOfLastSpareSpaceChunkInSUs = + diskOffsetOfLastFullTableInSUs + info->ExtraTablesPerDisk * info->TableDepthInPUs * layoutPtr->SUsPerPU; + } + info->DiskOffsetOfLastFullTableInSUs = diskOffsetOfLastFullTableInSUs; + info->numCompleteFullTablesPerDisk = numCompleteFullTablesPerDisk; + + /* 4. 
create and initialize the lookup tables */ + info->LayoutTable = rf_make_2d_array(b, k, raidPtr->cleanupList); + if (info->LayoutTable == NULL) + return(ENOMEM); + info->OffsetTable = rf_make_2d_array(b, k, raidPtr->cleanupList); + if (info->OffsetTable == NULL) + return(ENOMEM); + info->BlockTable = rf_make_2d_array(info->TableDepthInPUs*layoutPtr->SUsPerPU, raidPtr->numCol, raidPtr->cleanupList); + if (info->BlockTable == NULL) + return(ENOMEM); + + first_avail_slot = rf_make_1d_array(v, NULL); + if (first_avail_slot == NULL) + return(ENOMEM); + + for (i=0; i<b; i++) + for (j=0; j<k; j++) + info->LayoutTable[i][j] = *cfgBuf++; + + /* initialize offset table */ + for (i=0; i<b; i++) for (j=0; j<k; j++) { + info->OffsetTable[i][j] = first_avail_slot[ info->LayoutTable[i][j] ]; + first_avail_slot[ info->LayoutTable[i][j] ]++; + } + + /* initialize block table */ + for (SUID=l=0; l<layoutPtr->SUsPerPU; l++) { + for (i=0; i<b; i++) { + for (j=0; j<k; j++) { + info->BlockTable[ (info->OffsetTable[i][j] * layoutPtr->SUsPerPU) + l ] + [ info->LayoutTable[i][j] ] = SUID; + } + SUID++; + } + } + + rf_free_1d_array(first_avail_slot, v); + + /* 5. set up the remaining redundant-but-useful parameters */ + + raidPtr->totalSectors = (k*complete_FT_count + raidPtr->numRow*info->ExtraTablesPerDisk) * + info->SUsPerTable * layoutPtr->sectorsPerStripeUnit; + layoutPtr->numStripe = (raidPtr->totalSectors / layoutPtr->sectorsPerStripeUnit) / (k-1); + + /* strange evaluation order below to try and minimize overflow problems */ + + layoutPtr->dataSectorsPerStripe = (k-1) * layoutPtr->sectorsPerStripeUnit; + layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit << raidPtr->logBytesPerSector; + layoutPtr->numDataCol = k-1; + layoutPtr->numParityCol = 1; + + return(0); +} + +/* declustering with distributed sparing */ +static void rf_ShutdownDeclusteredDS(RF_ThreadArg_t); +static void rf_ShutdownDeclusteredDS(arg) + RF_ThreadArg_t arg; +{ + RF_DeclusteredConfigInfo_t *info; + RF_Raid_t *raidPtr; + + raidPtr = (RF_Raid_t *)arg; + info = (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo; + if (info->SpareTable) + rf_FreeSpareTable(raidPtr); +} + +int rf_ConfigureDeclusteredDS( + RF_ShutdownList_t **listp, + RF_Raid_t *raidPtr, + RF_Config_t *cfgPtr) +{ + int rc; + + rc = rf_ConfigureDeclustered(listp, raidPtr, cfgPtr); + if (rc) + return(rc); + rc = rf_ShutdownCreate(listp, rf_ShutdownDeclusteredDS, raidPtr); + if (rc) { + RF_ERRORMSG1("Got %d adding shutdown event for DeclusteredDS\n", rc); + rf_ShutdownDeclusteredDS(raidPtr); + return(rc); + } + return(0); +} + +void rf_MapSectorDeclustered(raidPtr, raidSector, row, col, diskSector, remap) + RF_Raid_t *raidPtr; + RF_RaidAddr_t raidSector; + RF_RowCol_t *row; + RF_RowCol_t *col; + RF_SectorNum_t *diskSector; + int remap; +{ + RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); + RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo; + RF_StripeNum_t SUID = raidSector / layoutPtr->sectorsPerStripeUnit; + RF_StripeNum_t FullTableID, FullTableOffset, TableID, TableOffset; + RF_StripeNum_t BlockID, BlockOffset, RepIndex; + RF_StripeCount_t sus_per_fulltable = info->SUsPerFullTable; + RF_StripeCount_t fulltable_depth = info->FullTableDepthInPUs * layoutPtr->SUsPerPU; + RF_StripeNum_t base_suid = 0, outSU, SpareRegion=0, SpareSpace=0; + + rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable, &fulltable_depth, &base_suid); + + FullTableID = SUID / sus_per_fulltable; /* fulltable ID within array 
(across rows) */ + if (raidPtr->numRow == 1) *row = 0; /* avoid a mod and a div in the common case */ + else { + *row = FullTableID % raidPtr->numRow; + FullTableID /= raidPtr->numRow; /* convert to fulltable ID on this disk */ + } + if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) { + SpareRegion = FullTableID / info->FullTablesPerSpareRegion; + SpareSpace = SpareRegion * info->SpareSpaceDepthPerRegionInSUs; + } + FullTableOffset = SUID % sus_per_fulltable; + TableID = FullTableOffset / info->SUsPerTable; + TableOffset = FullTableOffset - TableID * info->SUsPerTable; + BlockID = TableOffset / info->PUsPerBlock; + BlockOffset = TableOffset - BlockID * info->PUsPerBlock; + BlockID %= info->BlocksPerTable; + RepIndex = info->PUsPerBlock - TableID; + if (!raidPtr->noRotate) BlockOffset += ((BlockOffset >= RepIndex) ? 1 : 0); + *col = info->LayoutTable[BlockID][BlockOffset]; + + /* remap to distributed spare space if indicated */ + if (remap) { + RF_ASSERT( raidPtr->Disks[*row][*col].status == rf_ds_reconstructing || raidPtr->Disks[*row][*col].status == rf_ds_dist_spared || + (rf_copyback_in_progress && raidPtr->Disks[*row][*col].status == rf_ds_optimal)); + rf_remap_to_spare_space(layoutPtr, info, *row, FullTableID, TableID, BlockID, (base_suid) ? 1 : 0, SpareRegion, col, &outSU); + } else { + + outSU = base_suid; + outSU += FullTableID * fulltable_depth; /* offs to strt of FT */ + outSU += SpareSpace; /* skip rsvd spare space */ + outSU += TableID * info->TableDepthInPUs * layoutPtr->SUsPerPU; /* offs to strt of tble */ + outSU += info->OffsetTable[BlockID][BlockOffset] * layoutPtr->SUsPerPU; /* offs to the PU */ + } + outSU += TableOffset / (info->BlocksPerTable * info->PUsPerBlock); /* offs to the SU within a PU */ + + /* convert SUs to sectors, and, if not aligned to SU boundary, add in offset to sector. 
*/ + *diskSector = outSU*layoutPtr->sectorsPerStripeUnit + (raidSector % layoutPtr->sectorsPerStripeUnit); + + RF_ASSERT( *col != -1 ); +} + + +/* prototyping this inexplicably causes the compile of the layout table (rf_layout.c) to fail */ +void rf_MapParityDeclustered( + RF_Raid_t *raidPtr, + RF_RaidAddr_t raidSector, + RF_RowCol_t *row, + RF_RowCol_t *col, + RF_SectorNum_t *diskSector, + int remap) +{ + RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); + RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo; + RF_StripeNum_t SUID = raidSector / layoutPtr->sectorsPerStripeUnit; + RF_StripeNum_t FullTableID, FullTableOffset, TableID, TableOffset; + RF_StripeNum_t BlockID, BlockOffset, RepIndex; + RF_StripeCount_t sus_per_fulltable = info->SUsPerFullTable; + RF_StripeCount_t fulltable_depth = info->FullTableDepthInPUs * layoutPtr->SUsPerPU; + RF_StripeNum_t base_suid = 0, outSU, SpareRegion=0, SpareSpace=0; + + rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable, &fulltable_depth, &base_suid); + + /* compute row & (possibly) spare space exactly as before */ + FullTableID = SUID / sus_per_fulltable; + if (raidPtr->numRow == 1) *row = 0; /* avoid a mod and a div in the common case */ + else { + *row = FullTableID % raidPtr->numRow; + FullTableID /= raidPtr->numRow; /* convert to fulltable ID on this disk */ + } + if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) { + SpareRegion = FullTableID / info->FullTablesPerSpareRegion; + SpareSpace = SpareRegion * info->SpareSpaceDepthPerRegionInSUs; + } + + /* compute BlockID and RepIndex exactly as before */ + FullTableOffset = SUID % sus_per_fulltable; + TableID = FullTableOffset / info->SUsPerTable; + TableOffset = FullTableOffset - TableID * info->SUsPerTable; + /*TableOffset = FullTableOffset % info->SUsPerTable;*/ + /*BlockID = (TableOffset / info->PUsPerBlock) % info->BlocksPerTable;*/ + BlockID = TableOffset / info->PUsPerBlock; + /*BlockOffset = TableOffset % info->PUsPerBlock;*/ + BlockOffset = TableOffset - BlockID * info->PUsPerBlock; + BlockID %= info->BlocksPerTable; + + /* the parity block is in the position indicated by RepIndex */ + RepIndex = (raidPtr->noRotate) ? info->PUsPerBlock : info->PUsPerBlock - TableID; + *col = info->LayoutTable[BlockID][RepIndex]; + + if (remap) { + RF_ASSERT( raidPtr->Disks[*row][*col].status == rf_ds_reconstructing || raidPtr->Disks[*row][*col].status == rf_ds_dist_spared || + (rf_copyback_in_progress && raidPtr->Disks[*row][*col].status == rf_ds_optimal)); + rf_remap_to_spare_space(layoutPtr, info, *row, FullTableID, TableID, BlockID, (base_suid) ? 1 : 0, SpareRegion, col, &outSU); + } else { + + /* compute sector as before, except use RepIndex instead of BlockOffset */ + outSU = base_suid; + outSU += FullTableID * fulltable_depth; + outSU += SpareSpace; /* skip rsvd spare space */ + outSU += TableID * info->TableDepthInPUs * layoutPtr->SUsPerPU; + outSU += info->OffsetTable[BlockID][RepIndex] * layoutPtr->SUsPerPU; + } + + outSU += TableOffset / (info->BlocksPerTable * info->PUsPerBlock); + *diskSector = outSU*layoutPtr->sectorsPerStripeUnit + (raidSector % layoutPtr->sectorsPerStripeUnit); + + RF_ASSERT( *col != -1 ); +} + +/* returns an array of ints identifying the disks that comprise the stripe containing the indicated address. + * the caller must _never_ attempt to modify this array. 
+ */ +void rf_IdentifyStripeDeclustered( + RF_Raid_t *raidPtr, + RF_RaidAddr_t addr, + RF_RowCol_t **diskids, + RF_RowCol_t *outRow) +{ + RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); + RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo; + RF_StripeCount_t sus_per_fulltable = info->SUsPerFullTable; + RF_StripeCount_t fulltable_depth = info->FullTableDepthInPUs * layoutPtr->SUsPerPU; + RF_StripeNum_t base_suid = 0; + RF_StripeNum_t SUID = rf_RaidAddressToStripeUnitID(layoutPtr, addr); + RF_StripeNum_t stripeID, FullTableID; + int tableOffset; + + rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable, &fulltable_depth, &base_suid); + FullTableID = SUID / sus_per_fulltable; /* fulltable ID within array (across rows) */ + *outRow = FullTableID % raidPtr->numRow; + stripeID = rf_StripeUnitIDToStripeID(layoutPtr, SUID); /* find stripe offset into array */ + tableOffset = (stripeID % info->BlocksPerTable); /* find offset into block design table */ + *diskids = info->LayoutTable[tableOffset]; +} + +/* This returns the default head-separation limit, which is measured + * in "required units for reconstruction". Each time a disk fetches + * a unit, it bumps a counter. The head-sep code prohibits any disk + * from getting more than headSepLimit counter values ahead of any + * other. + * + * We assume here that the number of floating recon buffers is already + * set. There are r stripes to be reconstructed in each table, and so + * if we have a total of B buffers, we can have at most B/r tables + * under recon at any one time. In each table, lambda units are required + * from each disk, so given B buffers, the head sep limit has to be + * (lambda*B)/r units. We subtract one to avoid weird boundary cases. + * + * for example, suppose were given 50 buffers, r=19, and lambda=4 as in + * the 20.5 design. There are 19 stripes/table to be reconstructed, so + * we can have 50/19 tables concurrently under reconstruction, which means + * we can allow the fastest disk to get 50/19 tables ahead of the slower + * disk. There are lambda "required units" for each disk, so the fastest + * disk can get 4*50/19 = 10 counter values ahead of the slowest. + * + * If numBufsToAccumulate is not 1, we need to limit the head sep further + * because multiple bufs will be required for each stripe under recon. + */ +RF_HeadSepLimit_t rf_GetDefaultHeadSepLimitDeclustered( + RF_Raid_t *raidPtr) +{ + RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo; + + return(info->Lambda * raidPtr->numFloatingReconBufs / info->TableDepthInPUs / rf_numBufsToAccumulate); +} + +/* returns the default number of recon buffers to use. The value + * is somewhat arbitrary...it's intended to be large enough to allow + * for a reasonably large head-sep limit, but small enough that you + * don't use up all your system memory with buffers. + */ +int rf_GetDefaultNumFloatingReconBuffersDeclustered(RF_Raid_t *raidPtr) +{ + return(100 * rf_numBufsToAccumulate); +} + +/* sectors in the last fulltable of the array need to be handled + * specially since this fulltable can be incomplete. this function + * changes the values of certain params to handle this. + * + * the idea here is that MapSector et. al. figure out which disk the + * addressed unit lives on by computing the modulos of the unit number + * with the number of units per fulltable, table, etc. 
In the last + * fulltable, there are fewer units per fulltable, so we need to adjust + * the number of user data units per fulltable to reflect this. + * + * so, we (1) convert the fulltable size and depth parameters to + * the size of the partial fulltable at the end, (2) compute the + * disk sector offset where this fulltable starts, and (3) convert + * the users stripe unit number from an offset into the array to + * an offset into the last fulltable. + */ +void rf_decluster_adjust_params( + RF_RaidLayout_t *layoutPtr, + RF_StripeNum_t *SUID, + RF_StripeCount_t *sus_per_fulltable, + RF_StripeCount_t *fulltable_depth, + RF_StripeNum_t *base_suid) +{ + RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo; +#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL) + /* Nothing! */ +#else + char pc = layoutPtr->map->parityConfig; +#endif + + if (*SUID >= info->FullTableLimitSUID) { + /* new full table size is size of last full table on disk */ + *sus_per_fulltable = info->ExtraTablesPerDisk * info->SUsPerTable; + + /* new full table depth is corresponding depth */ + *fulltable_depth = info->ExtraTablesPerDisk * info->TableDepthInPUs * layoutPtr->SUsPerPU; + + /* set up the new base offset */ + *base_suid = info->DiskOffsetOfLastFullTableInSUs; + + /* convert users array address to an offset into the last fulltable */ + *SUID -= info->FullTableLimitSUID; + } +} + +/* + * map a stripe ID to a parity stripe ID. + * See comment above RaidAddressToParityStripeID in layout.c. + */ +void rf_MapSIDToPSIDDeclustered( + RF_RaidLayout_t *layoutPtr, + RF_StripeNum_t stripeID, + RF_StripeNum_t *psID, + RF_ReconUnitNum_t *which_ru) +{ + RF_DeclusteredConfigInfo_t *info; + + info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo; + + *psID = (stripeID / (layoutPtr->SUsPerPU * info->BlocksPerTable)) + * info->BlocksPerTable + (stripeID % info->BlocksPerTable); + *which_ru = (stripeID % (info->BlocksPerTable * layoutPtr->SUsPerPU)) + / info->BlocksPerTable; + RF_ASSERT( (*which_ru) < layoutPtr->SUsPerPU/layoutPtr->SUsPerRU); +} + +/* + * Called from MapSector and MapParity to retarget an access at the spare unit. + * Modifies the "col" and "outSU" parameters only. + */ +void rf_remap_to_spare_space( + RF_RaidLayout_t *layoutPtr, + RF_DeclusteredConfigInfo_t *info, + RF_RowCol_t row, + RF_StripeNum_t FullTableID, + RF_StripeNum_t TableID, + RF_SectorNum_t BlockID, + RF_StripeNum_t base_suid, + RF_StripeNum_t SpareRegion, + RF_RowCol_t *outCol, + RF_StripeNum_t *outSU) +{ + RF_StripeNum_t ftID, spareTableStartSU, TableInSpareRegion, lastSROffset, which_ft; + + /* + * note that FullTableID and hence SpareRegion may have gotten + * tweaked by rf_decluster_adjust_params. We detect this by + * noticing that base_suid is not 0. + */ + if (base_suid == 0) { + ftID = FullTableID; + } + else { + /* + * There may be > 1.0 full tables in the last (i.e. partial) + * spare region. find out which of these we're in. 
+ */ + lastSROffset = info->NumCompleteSRs * info->SpareRegionDepthInSUs; + which_ft = (info->DiskOffsetOfLastFullTableInSUs - lastSROffset) / (info->FullTableDepthInPUs * layoutPtr->SUsPerPU); + + /* compute the actual full table ID */ + ftID = info->DiskOffsetOfLastFullTableInSUs / (info->FullTableDepthInPUs * layoutPtr->SUsPerPU) + which_ft; + SpareRegion = info->NumCompleteSRs; + } + TableInSpareRegion = (ftID * info->NumParityReps + TableID) % info->TablesPerSpareRegion; + + *outCol = info->SpareTable[TableInSpareRegion][BlockID].spareDisk; + RF_ASSERT( *outCol != -1); + + spareTableStartSU = (SpareRegion == info->NumCompleteSRs) ? + info->DiskOffsetOfLastFullTableInSUs + info->ExtraTablesPerDisk * info->TableDepthInPUs * layoutPtr->SUsPerPU : + (SpareRegion+1) * info->SpareRegionDepthInSUs - info->SpareSpaceDepthPerRegionInSUs; + *outSU = spareTableStartSU + info->SpareTable[TableInSpareRegion][BlockID].spareBlockOffsetInSUs; + if (*outSU >= layoutPtr->stripeUnitsPerDisk) { + printf("rf_remap_to_spare_space: invalid remapped disk SU offset %ld\n",(long)*outSU); + } +} + +int rf_InstallSpareTable( + RF_Raid_t *raidPtr, + RF_RowCol_t frow, + RF_RowCol_t fcol) +{ + RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo; + RF_SparetWait_t *req; + int retcode; + + RF_Malloc(req, sizeof(*req), (RF_SparetWait_t *)); + req->C = raidPtr->numCol; + req->G = raidPtr->Layout.numDataCol + raidPtr->Layout.numParityCol; + req->fcol = fcol; + req->SUsPerPU = raidPtr->Layout.SUsPerPU; + req->TablesPerSpareRegion = info->TablesPerSpareRegion; + req->BlocksPerTable = info->BlocksPerTable; + req->TableDepthInPUs = info->TableDepthInPUs; + req->SpareSpaceDepthPerRegionInSUs = info->SpareSpaceDepthPerRegionInSUs; + +#ifndef KERNEL + info->SpareTable = rf_ReadSpareTable(req, info->sparemap_fname); + RF_Free(req, sizeof(*req)); + retcode = (info->SpareTable) ? 0 : 1; +#else /* !KERNEL */ + retcode = rf_GetSpareTableFromDaemon(req); + RF_ASSERT(!retcode); /* XXX -- fix this to recover gracefully -- XXX */ +#endif /* !KERNEL */ + + return(retcode); +} + +#ifdef KERNEL +/* + * Invoked via ioctl to install a spare table in the kernel. 
+ */ +int rf_SetSpareTable(raidPtr, data) + RF_Raid_t *raidPtr; + void *data; +{ + RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo; + RF_SpareTableEntry_t **ptrs; + int i, retcode; + + /* what we need to copyin is a 2-d array, so first copyin the user pointers to the rows in the table */ + RF_Malloc(ptrs, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *), (RF_SpareTableEntry_t **)); + retcode = copyin((caddr_t) data, (caddr_t) ptrs, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *)); + + if (retcode) return(retcode); + + /* now allocate kernel space for the row pointers */ + RF_Malloc(info->SpareTable, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *), (RF_SpareTableEntry_t **)); + + /* now allocate kernel space for each row in the table, and copy it in from user space */ + for (i=0; i<info->TablesPerSpareRegion; i++) { + RF_Malloc(info->SpareTable[i], info->BlocksPerTable * sizeof(RF_SpareTableEntry_t), (RF_SpareTableEntry_t *)); + retcode = copyin(ptrs[i], info->SpareTable[i], info->BlocksPerTable * sizeof(RF_SpareTableEntry_t)); + if (retcode) { + info->SpareTable = NULL; /* blow off the memory we've allocated */ + return(retcode); + } + } + + /* free up the temporary array we used */ + RF_Free(ptrs, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *)); + + return(0); +} +#endif /* KERNEL */ + +RF_ReconUnitCount_t rf_GetNumSpareRUsDeclustered(raidPtr) + RF_Raid_t *raidPtr; +{ + RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; + + return( ((RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo)->TotSparePUsPerDisk ); +} + + +void rf_FreeSpareTable(raidPtr) + RF_Raid_t *raidPtr; +{ + long i; + RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; + RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo; + RF_SpareTableEntry_t **table = info->SpareTable; + + for (i=0; i<info->TablesPerSpareRegion; i++) {RF_Free(table[i], info->BlocksPerTable * sizeof(RF_SpareTableEntry_t));} + RF_Free(table, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *)); + info->SpareTable = (RF_SpareTableEntry_t **) NULL; +} diff --git a/sys/dev/raidframe/rf_decluster.h b/sys/dev/raidframe/rf_decluster.h new file mode 100644 index 00000000000..5e08fa12a55 --- /dev/null +++ b/sys/dev/raidframe/rf_decluster.h @@ -0,0 +1,182 @@ +/* $OpenBSD: rf_decluster.h,v 1.1 1999/01/11 14:29:14 niklas Exp $ */ +/* $NetBSD: rf_decluster.h,v 1.1 1998/11/13 04:20:28 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. 
+ */ + +/*---------------------------------------------------------------------- + * + * decluster.h -- header file for declustered layout code + * + * Adapted from raidSim version July 1994 + * Created 10-21-92 (MCH) + * + *--------------------------------------------------------------------*/ + +/* + * : + * Log: rf_decluster.h,v + * Revision 1.20 1996/07/29 14:05:12 jimz + * fix numPUs/numRUs confusion (everything is now numRUs) + * clean up some commenting, return values + * + * Revision 1.19 1996/07/13 00:00:59 jimz + * sanitized generalized reconstruction architecture + * cleaned up head sep, rbuf problems + * + * Revision 1.18 1996/06/19 17:53:48 jimz + * move GetNumSparePUs, InstallSpareTable ops into layout switch + * + * Revision 1.17 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). + * + * Revision 1.16 1996/06/09 02:36:46 jimz + * lots of little crufty cleanup- fixup whitespace + * issues, comment #ifdefs, improve typing in some + * places (esp size-related) + * + * Revision 1.15 1996/06/07 22:26:27 jimz + * type-ify which_ru (RF_ReconUnitNum_t) + * + * Revision 1.14 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.13 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.12 1996/05/24 01:59:45 jimz + * another checkpoint in code cleanup for release + * time to sync kernel tree + * + * Revision 1.11 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.10 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.9 1995/12/01 15:58:23 root + * added copyright info + * + * Revision 1.8 1995/11/17 18:57:02 wvcii + * added prototyping to MapParity + * + * Revision 1.7 1995/07/02 15:08:31 holland + * bug fixes related to getting distributed sparing numbers + * + * Revision 1.6 1995/06/23 13:41:18 robby + * updeated to prototypes in rf_layout.h + * + */ + +#ifndef _RF__RF_DECLUSTER_H_ +#define _RF__RF_DECLUSTER_H_ + +#include "rf_types.h" + +/* + * These structures define the tables used to locate the spare unit + * associated with a particular data or parity unit, and to perform + * the associated inverse mapping. + */ +struct RF_SpareTableEntry_s { + u_int spareDisk; /* disk to which this block is spared */ + u_int spareBlockOffsetInSUs; /* offset into spare table for that disk */ +}; + +#define RF_SPAREMAP_NAME_LEN 128 + +/* this is the layout-specific info structure for the declustered layout. + */ +struct RF_DeclusteredConfigInfo_s { + RF_StripeCount_t groupSize; /* no. 
of stripe units per parity stripe */ + RF_RowCol_t **LayoutTable; /* the block design table */ + RF_RowCol_t **OffsetTable; /* the sector offset table */ + RF_RowCol_t **BlockTable; /* the block membership table */ + RF_StripeCount_t SUsPerFullTable; /* stripe units per full table */ + RF_StripeCount_t SUsPerTable; /* stripe units per table */ + RF_StripeCount_t PUsPerBlock; /* parity units per block */ + RF_StripeCount_t SUsPerBlock; /* stripe units per block */ + RF_StripeCount_t BlocksPerTable; /* block design tuples per table */ + RF_StripeCount_t NumParityReps; /* tables per full table */ + RF_StripeCount_t TableDepthInPUs; /* PUs on one disk in 1 table */ + RF_StripeCount_t FullTableDepthInPUs; /* PUs on one disk in 1 fulltable */ + RF_StripeCount_t FullTableLimitSUID; /* SU where partial fulltables start */ + RF_StripeCount_t ExtraTablesPerDisk; /* # of tables in last fulltable */ + RF_SectorNum_t DiskOffsetOfLastFullTableInSUs; /* disk offs of partial ft, if any */ + RF_StripeCount_t numCompleteFullTablesPerDisk; /* ft identifier of partial ft, if any */ + u_int Lambda; /* the pair count in the block design */ + + /* these are used only in the distributed-sparing case */ + RF_StripeCount_t FullTablesPerSpareRegion; /* # of ft's comprising 1 spare region */ + RF_StripeCount_t TablesPerSpareRegion; /* # of tables */ + RF_SectorCount_t SpareSpaceDepthPerRegionInSUs; /* spare space/disk/region */ + RF_SectorCount_t SpareRegionDepthInSUs; /* # of units/disk/region */ + RF_SectorNum_t DiskOffsetOfLastSpareSpaceChunkInSUs; /* locates sp space after partial ft */ + RF_StripeCount_t TotSparePUsPerDisk; /* total number of spare PUs per disk */ + RF_StripeCount_t NumCompleteSRs; + RF_SpareTableEntry_t **SpareTable; /* remap table for spare space */ + char sparemap_fname[RF_SPAREMAP_NAME_LEN]; /* where to find sparemap. 
not used in kernel */ +}; + +int rf_ConfigureDeclustered(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr, + RF_Config_t *cfgPtr); +int rf_ConfigureDeclusteredDS(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr, + RF_Config_t *cfgPtr); + +void rf_MapSectorDeclustered(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector, + RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap); +void rf_MapParityDeclustered(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector, + RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap); +void rf_IdentifyStripeDeclustered(RF_Raid_t *raidPtr, RF_RaidAddr_t addr, + RF_RowCol_t **diskids, RF_RowCol_t *outRow); +void rf_MapSIDToPSIDDeclustered(RF_RaidLayout_t *layoutPtr, + RF_StripeNum_t stripeID, RF_StripeNum_t *psID, + RF_ReconUnitNum_t *which_ru); +int rf_InstallSpareTable(RF_Raid_t *raidPtr, RF_RowCol_t frow, RF_RowCol_t fcol); +void rf_FreeSpareTable(RF_Raid_t *raidPtr); + +RF_HeadSepLimit_t rf_GetDefaultHeadSepLimitDeclustered(RF_Raid_t *raidPtr); +int rf_GetDefaultNumFloatingReconBuffersDeclustered(RF_Raid_t *raidPtr); + +void rf_decluster_adjust_params(RF_RaidLayout_t *layoutPtr, + RF_StripeNum_t *SUID, RF_StripeCount_t *sus_per_fulltable, + RF_StripeCount_t *fulltable_depth, RF_StripeNum_t *base_suid); +void rf_remap_to_spare_space( +RF_RaidLayout_t *layoutPtr, +RF_DeclusteredConfigInfo_t *info, RF_RowCol_t row, RF_StripeNum_t FullTableID, + RF_StripeNum_t TableID, RF_SectorNum_t BlockID, RF_StripeNum_t base_suid, + RF_StripeNum_t SpareRegion, RF_RowCol_t *outCol, RF_StripeNum_t *outSU); +int rf_SetSpareTable(RF_Raid_t *raidPtr, void *data); +RF_ReconUnitCount_t rf_GetNumSpareRUsDeclustered(RF_Raid_t *raidPtr); + +#endif /* !_RF__RF_DECLUSTER_H_ */ diff --git a/sys/dev/raidframe/rf_declusterPQ.c b/sys/dev/raidframe/rf_declusterPQ.c new file mode 100644 index 00000000000..75acfa32670 --- /dev/null +++ b/sys/dev/raidframe/rf_declusterPQ.c @@ -0,0 +1,589 @@ +/* $OpenBSD: rf_declusterPQ.c,v 1.1 1999/01/11 14:29:14 niklas Exp $ */ +/* $NetBSD: rf_declusterPQ.c,v 1.1 1998/11/13 04:20:28 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Authors: Daniel Stodolsky, Mark Holland, Jim Zelenka + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. 
+ */ + +/*-------------------------------------------------- + * rf_declusterPQ.c + * + * mapping code for declustered P & Q or declustered EvenOdd + * much code borrowed from rf_decluster.c + * + *--------------------------------------------------*/ + +/* + * $Header: /cvs/OpenBSD/src/sys/dev/raidframe/Attic/rf_declusterPQ.c,v 1.1 1999/01/11 14:29:14 niklas Exp $ + * + * Log: rf_declusterPQ.c,v + * Revision 1.34 1996/08/21 19:47:14 jimz + * fix bogus return values from config + * + * Revision 1.33 1996/08/21 15:09:16 jimz + * cleanup debugging spoo + * + * Revision 1.32 1996/08/21 04:13:36 jimz + * debug with EvenOdd + * + * Revision 1.31 1996/08/20 22:41:54 jimz + * 2 parity disks, not 1 + * + * Revision 1.30 1996/07/31 16:56:18 jimz + * dataBytesPerStripe, sectorsPerDisk init arch-indep. + * + * Revision 1.29 1996/07/18 22:57:14 jimz + * port simulator to AIX + * + * Revision 1.28 1996/07/13 00:00:59 jimz + * sanitized generalized reconstruction architecture + * cleaned up head sep, rbuf problems + * + * Revision 1.27 1996/06/11 08:45:12 jimz + * improved error-checking on array configuration + * + * Revision 1.26 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). + * + * Revision 1.25 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.24 1996/06/02 17:31:48 jimz + * Moved a lot of global stuff into array structure, where it belongs. + * Fixed up paritylogging, pss modules in this manner. Some general + * code cleanup. Removed lots of dead code, some dead files. + * + * Revision 1.23 1996/05/30 23:22:16 jimz + * bugfixes of serialization, timing problems + * more cleanup + * + * Revision 1.22 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.21 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.20 1996/05/24 01:59:45 jimz + * another checkpoint in code cleanup for release + * time to sync kernel tree + * + * Revision 1.19 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.18 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.17 1996/05/17 00:52:56 jimz + * RepIndex was not being initialized before the computation of + * RepIndexQ in MapQDeclusteredPQ(). I copied the initialization + * from MapParityDeclusteredPQ(). Hope that was right. + * + * Revision 1.16 1995/12/12 18:10:06 jimz + * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT + * fix 80-column brain damage in comments + * + * Revision 1.15 1995/12/01 15:57:46 root + * added copyright info + * + * Revision 1.14 1995/11/17 19:00:13 wvcii + * added prototyping to MapParity + * created MapQ + * + * Revision 1.13 1995/10/05 22:20:48 jimz + * free_1d_array() takes two args; provide them both + * + * Revision 1.12 1995/09/06 19:26:33 wvcii + * offset cfgBuf by sparemap length (ConfigureDeclusteredPQ) + * + * Revision 1.11 95/06/23 13:41:11 robby + * updeated to prototypes in rf_layout.h + * + * Revision 1.10 1995/05/02 22:46:53 holland + * minor code cleanups. 
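One structural difference from rf_decluster.c is worth keeping in mind for the configuration code that follows: the declustered P+Q (and EvenOdd) layout reserves two parity units per block, so PUsPerBlock becomes k-2 rather than k-1, and block designs with k <= 2 are rejected because they would leave no room for data. As a hypothetical example, with k = 5 each block carries three data PUs plus the P and Q units.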
+ * + * Revision 1.9 1995/03/15 20:45:23 holland + * distr sparing changes. + * + * Revision 1.8 1995/03/01 20:25:48 holland + * kernelization changes + * + * Revision 1.7 1995/02/17 19:39:56 holland + * added size param to all calls to Free(). + * this is ignored at user level, but necessary in the kernel. + * + * Revision 1.6 1995/02/10 17:34:10 holland + * kernelization changes + * + * Revision 1.5 1995/02/03 22:31:36 holland + * many changes related to kernelization + * + * Revision 1.4 1995/02/01 15:13:05 holland + * moved #include of general.h out of raid.h and into each file + * + * Revision 1.3 1995/02/01 14:25:19 holland + * began changes for kernelization: + * changed all instances of mutex_t and cond_t to DECLARE macros + * converted configuration code to use config structure + * + * Revision 1.2 1994/11/28 22:13:56 danner + * corrected some mapping bugs. + * + */ + +#include "rf_types.h" +#include "rf_raid.h" +#include "rf_configure.h" +#include "rf_decluster.h" +#include "rf_declusterPQ.h" +#include "rf_debugMem.h" +#include "rf_utils.h" +#include "rf_alloclist.h" +#include "rf_general.h" + +/* configuration code */ + +int rf_ConfigureDeclusteredPQ( + RF_ShutdownList_t **listp, + RF_Raid_t *raidPtr, + RF_Config_t *cfgPtr) +{ + RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); + int b, v, k, r, lambda; /* block design params */ + int i, j, l; + int *first_avail_slot; + int complete_FT_count, SUID; + RF_DeclusteredConfigInfo_t *info; + int numCompleteFullTablesPerDisk; + int PUsPerDisk, spareRegionDepthInPUs, numCompleteSpareRegionsPerDisk = 0, extraPUsPerDisk; + int totSparePUsPerDisk; + int diskOffsetOfLastFullTableInSUs, SpareSpaceInSUs; + char *cfgBuf = (char *) (cfgPtr->layoutSpecific); + + cfgBuf += RF_SPAREMAP_NAME_LEN; + + b = *( (int *) cfgBuf); cfgBuf += sizeof(int); + v = *( (int *) cfgBuf); cfgBuf += sizeof(int); + k = *( (int *) cfgBuf); cfgBuf += sizeof(int); + r = *( (int *) cfgBuf); cfgBuf += sizeof(int); + lambda = *( (int *) cfgBuf); cfgBuf += sizeof(int); + raidPtr->noRotate = *( (int *) cfgBuf); cfgBuf += sizeof(int); + + if (k <= 2) { + printf("RAIDFRAME: k=%d, minimum value 2\n", k); + return(EINVAL); + } + + /* 1. create layout specific structure */ + RF_MallocAndAdd(info, sizeof(RF_DeclusteredConfigInfo_t), (RF_DeclusteredConfigInfo_t *), raidPtr->cleanupList); + if (info == NULL) + return(ENOMEM); + layoutPtr->layoutSpecificInfo = (void *) info; + + /* the sparemaps are generated assuming that parity is rotated, so we issue + * a warning if both distributed sparing and no-rotate are on at the same time + */ + if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) && raidPtr->noRotate) { + RF_ERRORMSG("Warning: distributed sparing specified without parity rotation.\n"); + } + + if (raidPtr->numCol != v) { + RF_ERRORMSG2("RAID: config error: table element count (%d) not equal to no. of cols (%d)\n", v, raidPtr->numCol); + return(EINVAL); + } + + /* 3. 
set up the values used in devRaidMap */ + info->BlocksPerTable = b; + info->NumParityReps = info->groupSize = k; + info->PUsPerBlock = k-2; /* PQ */ + info->SUsPerTable = b * info->PUsPerBlock * layoutPtr->SUsPerPU;/* b blks, k-1 SUs each */ + info->SUsPerFullTable = k * info->SUsPerTable; /* rot k times */ + info->SUsPerBlock = info->PUsPerBlock * layoutPtr->SUsPerPU; + info->TableDepthInPUs = (b*k) / v; + info->FullTableDepthInPUs = info->TableDepthInPUs * k; /* k repetitions */ + + /* used only in distributed sparing case */ + info->FullTablesPerSpareRegion = (v-1) / rf_gcd(r, v-1); /* (v-1)/gcd fulltables */ + info->TablesPerSpareRegion = k * info->FullTablesPerSpareRegion; + info->SpareSpaceDepthPerRegionInSUs = (r * info->TablesPerSpareRegion / (v-1)) * layoutPtr->SUsPerPU; + + /* check to make sure the block design is sufficiently small */ + if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) { + if (info->FullTableDepthInPUs * layoutPtr->SUsPerPU + info->SpareSpaceDepthPerRegionInSUs > layoutPtr->stripeUnitsPerDisk) { + RF_ERRORMSG3("RAID: config error: Full Table depth (%d) + Spare Space (%d) larger than disk size (%d) (BD too big)\n", + (int)info->FullTableDepthInPUs, + (int)info->SpareSpaceDepthPerRegionInSUs, + (int)layoutPtr->stripeUnitsPerDisk); + return(EINVAL); + } + } else { + if (info->TableDepthInPUs * layoutPtr->SUsPerPU > layoutPtr->stripeUnitsPerDisk) { + RF_ERRORMSG2("RAID: config error: Table depth (%d) larger than disk size (%d) (BD too big)\n", + (int)(info->TableDepthInPUs * layoutPtr->SUsPerPU), + (int)layoutPtr->stripeUnitsPerDisk); + return(EINVAL); + } + } + + + /* compute the size of each disk, and the number of tables in the last fulltable (which + * need not be complete) + */ + if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) { + + PUsPerDisk = layoutPtr->stripeUnitsPerDisk / layoutPtr->SUsPerPU; + spareRegionDepthInPUs = (info->TablesPerSpareRegion * info->TableDepthInPUs + + (info->TablesPerSpareRegion * info->TableDepthInPUs) / (v-1)); + info->SpareRegionDepthInSUs = spareRegionDepthInPUs * layoutPtr->SUsPerPU; + + numCompleteSpareRegionsPerDisk = PUsPerDisk / spareRegionDepthInPUs; + info->NumCompleteSRs = numCompleteSpareRegionsPerDisk; + extraPUsPerDisk = PUsPerDisk % spareRegionDepthInPUs; + + /* assume conservatively that we need the full amount of spare space in one region in order + * to provide spares for the partial spare region at the end of the array. We set "i" to + * the number of tables in the partial spare region. This may actually include some fulltables. 
+ */ + extraPUsPerDisk -= (info->SpareSpaceDepthPerRegionInSUs / layoutPtr->SUsPerPU); + if (extraPUsPerDisk <= 0) i = 0; + else i = extraPUsPerDisk/info->TableDepthInPUs; + + complete_FT_count = raidPtr->numRow * (numCompleteSpareRegionsPerDisk * (info->TablesPerSpareRegion/k) + i/k); + info->FullTableLimitSUID = complete_FT_count * info->SUsPerFullTable; + info->ExtraTablesPerDisk = i % k; + + /* note that in the last spare region, the spare space is complete even though data/parity space is not */ + totSparePUsPerDisk = (numCompleteSpareRegionsPerDisk+1) * (info->SpareSpaceDepthPerRegionInSUs / layoutPtr->SUsPerPU); + info->TotSparePUsPerDisk = totSparePUsPerDisk; + + layoutPtr->stripeUnitsPerDisk = + ((complete_FT_count/raidPtr->numRow) * info->FullTableDepthInPUs + /* data & parity space */ + info->ExtraTablesPerDisk * info->TableDepthInPUs + + totSparePUsPerDisk /* spare space */ + ) * layoutPtr->SUsPerPU; + layoutPtr->dataStripeUnitsPerDisk = + (complete_FT_count * info->FullTableDepthInPUs + info->ExtraTablesPerDisk * info->TableDepthInPUs) + * layoutPtr->SUsPerPU * (k-1) / k; + + } else { + /* non-dist spare case: force each disk to contain an integral number of tables */ + layoutPtr->stripeUnitsPerDisk /= (info->TableDepthInPUs * layoutPtr->SUsPerPU); + layoutPtr->stripeUnitsPerDisk *= (info->TableDepthInPUs * layoutPtr->SUsPerPU); + + /* compute the number of tables in the last fulltable, which need not be complete */ + complete_FT_count = + ((layoutPtr->stripeUnitsPerDisk/layoutPtr->SUsPerPU) / info->FullTableDepthInPUs) * raidPtr->numRow; + + info->FullTableLimitSUID = complete_FT_count * info->SUsPerFullTable; + info->ExtraTablesPerDisk = + ((layoutPtr->stripeUnitsPerDisk/layoutPtr->SUsPerPU) / info->TableDepthInPUs) % k; + } + + raidPtr->sectorsPerDisk = layoutPtr->stripeUnitsPerDisk * layoutPtr->sectorsPerStripeUnit; + + /* find the disk offset of the stripe unit where the last fulltable starts */ + numCompleteFullTablesPerDisk = complete_FT_count / raidPtr->numRow; + diskOffsetOfLastFullTableInSUs = numCompleteFullTablesPerDisk * info->FullTableDepthInPUs * layoutPtr->SUsPerPU; + if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) { + SpareSpaceInSUs = numCompleteSpareRegionsPerDisk * info->SpareSpaceDepthPerRegionInSUs; + diskOffsetOfLastFullTableInSUs += SpareSpaceInSUs; + info->DiskOffsetOfLastSpareSpaceChunkInSUs = + diskOffsetOfLastFullTableInSUs + info->ExtraTablesPerDisk * info->TableDepthInPUs * layoutPtr->SUsPerPU; + } + info->DiskOffsetOfLastFullTableInSUs = diskOffsetOfLastFullTableInSUs; + info->numCompleteFullTablesPerDisk = numCompleteFullTablesPerDisk; + + /* 4. 
create and initialize the lookup tables */ + info->LayoutTable = rf_make_2d_array(b, k, raidPtr->cleanupList); + if (info->LayoutTable == NULL) + return(ENOMEM); + info->OffsetTable = rf_make_2d_array(b, k, raidPtr->cleanupList); + if (info->OffsetTable == NULL) + return(ENOMEM); + info->BlockTable = rf_make_2d_array(info->TableDepthInPUs*layoutPtr->SUsPerPU, raidPtr->numCol, raidPtr->cleanupList); + if (info->BlockTable == NULL) + return(ENOMEM); + + first_avail_slot = (int *) rf_make_1d_array(v, NULL); + if (first_avail_slot == NULL) + return(ENOMEM); + + for (i=0; i<b; i++) + for (j=0; j<k; j++) + info->LayoutTable[i][j] = *cfgBuf++; + + /* initialize offset table */ + for (i=0; i<b; i++) for (j=0; j<k; j++) { + info->OffsetTable[i][j] = first_avail_slot[ info->LayoutTable[i][j] ]; + first_avail_slot[ info->LayoutTable[i][j] ]++; + } + + /* initialize block table */ + for (SUID=l=0; l<layoutPtr->SUsPerPU; l++) { + for (i=0; i<b; i++) { + for (j=0; j<k; j++) { + info->BlockTable[ (info->OffsetTable[i][j] * layoutPtr->SUsPerPU) + l ] + [ info->LayoutTable[i][j] ] = SUID; + } + SUID++; + } + } + + rf_free_1d_array(first_avail_slot, v); + + /* 5. set up the remaining redundant-but-useful parameters */ + + raidPtr->totalSectors = (k*complete_FT_count + raidPtr->numRow*info->ExtraTablesPerDisk) * + info->SUsPerTable * layoutPtr->sectorsPerStripeUnit; + layoutPtr->numStripe = (raidPtr->totalSectors / layoutPtr->sectorsPerStripeUnit) / (k-2); + + /* strange evaluation order below to try and minimize overflow problems */ + + layoutPtr->dataSectorsPerStripe = (k-2) * layoutPtr->sectorsPerStripeUnit; + layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit << raidPtr->logBytesPerSector; + layoutPtr->numDataCol = k-2; + layoutPtr->numParityCol = 2; + + return(0); +} + +int rf_GetDefaultNumFloatingReconBuffersPQ(RF_Raid_t *raidPtr) +{ + int def_decl; + + def_decl = rf_GetDefaultNumFloatingReconBuffersDeclustered(raidPtr); + return(RF_MAX(3 * raidPtr->numCol, def_decl)); +} + +void rf_MapSectorDeclusteredPQ( + RF_Raid_t *raidPtr, + RF_RaidAddr_t raidSector, + RF_RowCol_t *row, + RF_RowCol_t *col, + RF_SectorNum_t *diskSector, + int remap) +{ + RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); + RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo; + RF_StripeNum_t SUID = raidSector / layoutPtr->sectorsPerStripeUnit; + RF_StripeNum_t FullTableID, FullTableOffset, TableID, TableOffset; + RF_StripeNum_t BlockID, BlockOffset, RepIndex; + RF_StripeCount_t sus_per_fulltable = info->SUsPerFullTable; + RF_StripeCount_t fulltable_depth = info->FullTableDepthInPUs * layoutPtr->SUsPerPU; + RF_StripeNum_t base_suid = 0, outSU, SpareRegion=0, SpareSpace=0; + + rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable, &fulltable_depth, &base_suid); + + FullTableID = SUID / sus_per_fulltable; /* fulltable ID within array (across rows) */ + *row = FullTableID % raidPtr->numRow; + FullTableID /= raidPtr->numRow; /* convert to fulltable ID on this disk */ + if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) { + SpareRegion = FullTableID / info->FullTablesPerSpareRegion; + SpareSpace = SpareRegion * info->SpareSpaceDepthPerRegionInSUs; + } + FullTableOffset = SUID % sus_per_fulltable; + TableID = FullTableOffset / info->SUsPerTable; + TableOffset = FullTableOffset - TableID * info->SUsPerTable; + BlockID = TableOffset / info->PUsPerBlock; + BlockOffset = TableOffset - BlockID * info->PUsPerBlock; + BlockID %= info->BlocksPerTable; + RF_ASSERT(BlockOffset < 
info->groupSize-2 ); + /* + TableIDs go from 0 .. GroupSize-1 inclusive. + PUsPerBlock is k-2. + We want the tableIDs to rotate from the + right, so use GroupSize + */ + RepIndex = info->groupSize - 1 - TableID; + RF_ASSERT(RepIndex >= 0); + if (!raidPtr->noRotate) + { + if (TableID==0) + BlockOffset++; /* P on last drive, Q on first */ + else + BlockOffset += ((BlockOffset >= RepIndex) ? 2 : 0); /* skip over PQ */ + RF_ASSERT(BlockOffset < info->groupSize); + *col = info->LayoutTable[BlockID][BlockOffset]; + } + + /* remap to distributed spare space if indicated */ + if (remap) { + rf_remap_to_spare_space(layoutPtr, info, *row, FullTableID, TableID, BlockID, (base_suid) ? 1 : 0, SpareRegion, col, &outSU); + } else { + + outSU = base_suid; + outSU += FullTableID * fulltable_depth; /* offs to strt of FT */ + outSU += SpareSpace; /* skip rsvd spare space */ + outSU += TableID * info->TableDepthInPUs * layoutPtr->SUsPerPU; /* offs to strt of tble */ + outSU += info->OffsetTable[BlockID][BlockOffset] * layoutPtr->SUsPerPU; /* offs to the PU */ + } + outSU += TableOffset / (info->BlocksPerTable * info->PUsPerBlock); /* offs to the SU within a PU */ + + /* convert SUs to sectors, and, if not aligned to SU boundary, add in offset to sector */ + *diskSector = outSU*layoutPtr->sectorsPerStripeUnit + (raidSector % layoutPtr->sectorsPerStripeUnit); +} + + +void rf_MapParityDeclusteredPQ( + RF_Raid_t *raidPtr, + RF_RaidAddr_t raidSector, + RF_RowCol_t *row, + RF_RowCol_t *col, + RF_SectorNum_t *diskSector, + int remap) +{ + RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); + RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo; + RF_StripeNum_t SUID = raidSector / layoutPtr->sectorsPerStripeUnit; + RF_StripeNum_t FullTableID, FullTableOffset, TableID, TableOffset; + RF_StripeNum_t BlockID, BlockOffset, RepIndex; + RF_StripeCount_t sus_per_fulltable = info->SUsPerFullTable; + RF_StripeCount_t fulltable_depth = info->FullTableDepthInPUs * layoutPtr->SUsPerPU; + RF_StripeNum_t base_suid = 0, outSU, SpareRegion, SpareSpace=0; + + rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable, &fulltable_depth, &base_suid); + + /* compute row & (possibly) spare space exactly as before */ + FullTableID = SUID / sus_per_fulltable; + *row = FullTableID % raidPtr->numRow; + FullTableID /= raidPtr->numRow; /* convert to fulltable ID on this disk */ + if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) { + SpareRegion = FullTableID / info->FullTablesPerSpareRegion; + SpareSpace = SpareRegion * info->SpareSpaceDepthPerRegionInSUs; + } + + /* compute BlockID and RepIndex exactly as before */ + FullTableOffset = SUID % sus_per_fulltable; + TableID = FullTableOffset / info->SUsPerTable; + TableOffset = FullTableOffset - TableID * info->SUsPerTable; + BlockID = TableOffset / info->PUsPerBlock; + BlockOffset = TableOffset - BlockID * info->PUsPerBlock; + BlockID %= info->BlocksPerTable; + + /* the parity block is in the position indicated by RepIndex */ + RepIndex = (raidPtr->noRotate) ? 
info->PUsPerBlock : info->groupSize - 1 - TableID; + *col = info->LayoutTable[BlockID][RepIndex]; + + if (remap) + RF_PANIC(); + + /* compute sector as before, except use RepIndex instead of BlockOffset */ + outSU = base_suid; + outSU += FullTableID * fulltable_depth; + outSU += SpareSpace; /* skip rsvd spare space */ + outSU += TableID * info->TableDepthInPUs * layoutPtr->SUsPerPU; + outSU += info->OffsetTable[BlockID][RepIndex] * layoutPtr->SUsPerPU; + outSU += TableOffset / (info->BlocksPerTable * info->PUsPerBlock); + + *diskSector = outSU*layoutPtr->sectorsPerStripeUnit + (raidSector % layoutPtr->sectorsPerStripeUnit); +} + +void rf_MapQDeclusteredPQ( + RF_Raid_t *raidPtr, + RF_RaidAddr_t raidSector, + RF_RowCol_t *row, + RF_RowCol_t *col, + RF_SectorNum_t *diskSector, + int remap) +{ + RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); + RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo; + RF_StripeNum_t SUID = raidSector / layoutPtr->sectorsPerStripeUnit; + RF_StripeNum_t FullTableID, FullTableOffset, TableID, TableOffset; + RF_StripeNum_t BlockID, BlockOffset, RepIndex, RepIndexQ; + RF_StripeCount_t sus_per_fulltable = info->SUsPerFullTable; + RF_StripeCount_t fulltable_depth = info->FullTableDepthInPUs * layoutPtr->SUsPerPU; + RF_StripeNum_t base_suid = 0, outSU, SpareRegion, SpareSpace=0; + + rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable, &fulltable_depth, &base_suid); + + /* compute row & (possibly) spare space exactly as before */ + FullTableID = SUID / sus_per_fulltable; + *row = FullTableID % raidPtr->numRow; + FullTableID /= raidPtr->numRow; /* convert to fulltable ID on this disk */ + if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) { + SpareRegion = FullTableID / info->FullTablesPerSpareRegion; + SpareSpace = SpareRegion * info->SpareSpaceDepthPerRegionInSUs; + } + + /* compute BlockID and RepIndex exactly as before */ + FullTableOffset = SUID % sus_per_fulltable; + TableID = FullTableOffset / info->SUsPerTable; + TableOffset = FullTableOffset - TableID * info->SUsPerTable; + BlockID = TableOffset / info->PUsPerBlock; + BlockOffset = TableOffset - BlockID * info->PUsPerBlock; + BlockID %= info->BlocksPerTable; + + /* the q block is in the position indicated by RepIndex */ + RepIndex = (raidPtr->noRotate) ? info->PUsPerBlock : info->groupSize - 1 - TableID; + RepIndexQ = ((RepIndex == (info->groupSize-1)) ? 0 : RepIndex+1); + *col = info->LayoutTable[BlockID][RepIndexQ]; + + if (remap) + RF_PANIC(); + + /* compute sector as before, except use RepIndex instead of BlockOffset */ + outSU = base_suid; + outSU += FullTableID * fulltable_depth; + outSU += SpareSpace; /* skip rsvd spare space */ + outSU += TableID * info->TableDepthInPUs * layoutPtr->SUsPerPU; + outSU += TableOffset / (info->BlocksPerTable * info->PUsPerBlock); + + outSU += info->OffsetTable[BlockID][RepIndexQ] * layoutPtr->SUsPerPU; + *diskSector = outSU*layoutPtr->sectorsPerStripeUnit + (raidSector % layoutPtr->sectorsPerStripeUnit); +} + +/* returns an array of ints identifying the disks that comprise the stripe containing the indicated address. + * the caller must _never_ attempt to modify this array. 
+ */ +void rf_IdentifyStripeDeclusteredPQ( + RF_Raid_t *raidPtr, + RF_RaidAddr_t addr, + RF_RowCol_t **diskids, + RF_RowCol_t *outRow) +{ + RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); + RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo; + RF_StripeCount_t sus_per_fulltable = info->SUsPerFullTable; + RF_StripeCount_t fulltable_depth = info->FullTableDepthInPUs * layoutPtr->SUsPerPU; + RF_StripeNum_t base_suid = 0; + RF_StripeNum_t SUID = rf_RaidAddressToStripeUnitID(layoutPtr, addr); + RF_StripeNum_t stripeID, FullTableID; + int tableOffset; + + rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable, &fulltable_depth, &base_suid); + FullTableID = SUID / sus_per_fulltable; /* fulltable ID within array (across rows) */ + *outRow = FullTableID % raidPtr->numRow; + stripeID = rf_StripeUnitIDToStripeID(layoutPtr, SUID); /* find stripe offset into array */ + tableOffset = (stripeID % info->BlocksPerTable); /* find offset into block design table */ + *diskids = info->LayoutTable[tableOffset]; +} diff --git a/sys/dev/raidframe/rf_declusterPQ.h b/sys/dev/raidframe/rf_declusterPQ.h new file mode 100644 index 00000000000..2ef5d4c220e --- /dev/null +++ b/sys/dev/raidframe/rf_declusterPQ.h @@ -0,0 +1,100 @@ +/* $OpenBSD: rf_declusterPQ.h,v 1.1 1999/01/11 14:29:14 niklas Exp $ */ +/* $NetBSD: rf_declusterPQ.h,v 1.1 1998/11/13 04:20:28 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Daniel Stodolsky, Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* : + * Log: rf_declusterPQ.h,v + * Revision 1.13 1996/08/20 22:42:08 jimz + * missing prototype of IdentifyStripeDeclusteredPQ added + * + * Revision 1.12 1996/07/13 00:00:59 jimz + * sanitized generalized reconstruction architecture + * cleaned up head sep, rbuf problems + * + * Revision 1.11 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). 
+ * + * Revision 1.10 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.9 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.8 1996/05/24 01:59:45 jimz + * another checkpoint in code cleanup for release + * time to sync kernel tree + * + * Revision 1.7 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.6 1995/12/01 15:59:20 root + * added copyright info + * + * Revision 1.5 1995/11/17 19:08:23 wvcii + * added prototyping to MapParity + * + * Revision 1.4 1995/11/07 15:30:33 wvcii + * changed PQDagSelect prototype + * function no longer generates numHdrSucc, numTermAnt + * removed ParityLoggingDagSelect prototype + * + * Revision 1.3 1995/06/23 13:40:57 robby + * updeated to prototypes in rf_layout.h + * + * Revision 1.2 1995/05/02 22:46:53 holland + * minor code cleanups. + * + * Revision 1.1 1994/11/19 20:26:57 danner + * Initial revision + * + */ + +#ifndef _RF__RF_DECLUSTERPQ_H_ +#define _RF__RF_DECLUSTERPQ_H_ + +#include "rf_types.h" + +int rf_ConfigureDeclusteredPQ(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr, + RF_Config_t *cfgPtr); +int rf_GetDefaultNumFloatingReconBuffersPQ(RF_Raid_t *raidPtr); +void rf_MapSectorDeclusteredPQ(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector, + RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap); +void rf_MapParityDeclusteredPQ(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector, + RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap); +void rf_MapQDeclusteredPQ(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector, + RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap); +void rf_IdentifyStripeDeclusteredPQ(RF_Raid_t *raidPtr, RF_RaidAddr_t addr, + RF_RowCol_t **diskids, RF_RowCol_t *outRow); + +#endif /* !_RF__RF_DECLUSTERPQ_H_ */ diff --git a/sys/dev/raidframe/rf_demo.c b/sys/dev/raidframe/rf_demo.c new file mode 100644 index 00000000000..91212482c37 --- /dev/null +++ b/sys/dev/raidframe/rf_demo.c @@ -0,0 +1,506 @@ +/* $OpenBSD: rf_demo.c,v 1.1 1999/01/11 14:29:15 niklas Exp $ */ +/* $NetBSD: rf_demo.c,v 1.1 1998/11/13 04:20:28 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland, Khalil Amiri + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/********************************************************************************** + * + * rf_demo.c -- code for supporting demos. 
this is not actually part of the driver. + * + **********************************************************************************/ + +/* : + * Log: rf_demo.c,v + * Revision 1.24 1996/06/17 14:38:33 jimz + * properly #if out RF_DEMO code + * fix bug in MakeConfig that was causing weird behavior + * in configuration routines (config was not zeroed at start) + * clean up genplot handling of stacks + * + * Revision 1.23 1996/06/17 03:23:09 jimz + * explicitly do pthread stuff (for join) + * NOTE: this should be changed! + * + * Revision 1.22 1996/06/14 23:15:38 jimz + * attempt to deal with thread GC problem + * + * Revision 1.21 1996/06/09 02:36:46 jimz + * lots of little crufty cleanup- fixup whitespace + * issues, comment #ifdefs, improve typing in some + * places (esp size-related) + * + * Revision 1.20 1996/06/05 18:06:02 jimz + * Major code cleanup. The Great Renaming is now done. + * Better modularity. Better typing. Fixed a bunch of + * synchronization bugs. Made a lot of global stuff + * per-desc or per-array. Removed dead code. + * + * Revision 1.19 1996/05/30 23:22:16 jimz + * bugfixes of serialization, timing problems + * more cleanup + * + * Revision 1.18 1996/05/30 11:29:41 jimz + * Numerous bug fixes. Stripe lock release code disagreed with the taking code + * about when stripes should be locked (I made it consistent: no parity, no lock) + * There was a lot of extra serialization of I/Os which I've removed- a lot of + * it was to calculate values for the cache code, which is no longer with us. + * More types, function, macro cleanup. Added code to properly quiesce the array + * on shutdown. Made a lot of stuff array-specific which was (bogusly) general + * before. Fixed memory allocation, freeing bugs. + * + * Revision 1.17 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.16 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.15 1996/05/20 16:14:08 jimz + * switch to rf_{mutex,cond}_{init,destroy} + * + * Revision 1.14 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.13 1995/12/01 15:56:07 root + * added copyright info + * + */ + +#include "rf_archs.h" + +#if RF_DEMO > 0 + +#include <stdio.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <strings.h> +#include <unistd.h> +#include <sys/time.h> +#include <signal.h> + +#include "rf_threadstuff.h" +#include "rf_demo.h" +#include "rf_utils.h" +#include "rf_general.h" +#include "rf_options.h" + +#ifdef SIMULATE +#include "rf_diskevent.h" +#endif /* SIMULATE */ + +static int doMax = 0; /* currently no way to set this */ + +/**************************************************************************************** + * fault-free demo code + ***************************************************************************************/ + +static int user_iops_meter = -1; +static int disk_iops_meter = -1; +static int max_user_meter = -1; +static int max_disk_meter = -1; +static int recon_pctg_meter = -1; +static int avg_resp_time_meter = -1; +static int recon_time_meter = -1; +static int ff_avg_resp_time_meter = -1; +static int deg_avg_resp_time_meter = -1; +static int recon_avg_resp_time_meter = -1; +static int user_ios_ff=0; +static int user_ios_deg=0; +static int user_ios_recon=0; +static long user_resp_time_sum_ff = 0; +static long 
user_resp_time_sum_deg = 0; +static long user_resp_time_sum_recon = 0; + +int rf_demo_op_mode = 0; + +RF_DECLARE_STATIC_MUTEX(iops_mutex) +static int user_ios_so_far, disk_ios_so_far, max_user, max_disk; +static long user_resp_time_sum_ms; +static int recon_pctg; +static struct timeval iops_starttime; +#ifndef SIMULATE +static RF_Thread_t update_thread_desc; +#endif /* !SIMULATE */ +static int meter_update_terminate; + +static int meter_update_interval = 2; /* seconds between meter updates */ +static int iops_initialized = 0, recon_initialized = 0; + +static char *demoMeterTags[] = {"FF", "Degr", "Recon"}; + +static int vpos=0; + +static int rf_CreateMeter(char *title, char *geom, char *color); +static void rf_UpdateMeter(int meterid, int value); +static void rf_DestroyMeter(int meterid, int killproc); + +void rf_startup_iops_demo(meter_vpos, C, G) + int meter_vpos; + int C; + int G; +{ + char buf[100], title[100]; + int rc; + + vpos = meter_vpos; + sprintf(buf, "%dx%d-0+%d",RF_DEMO_METER_WIDTH, RF_DEMO_METER_HEIGHT, vpos * (RF_DEMO_METER_HEIGHT+RF_DEMO_METER_VSPACE)); + sprintf(title,"%s %d/%d User IOs/sec",demoMeterTags[rf_demoMeterTag],C,G); + user_iops_meter = rf_CreateMeter(title, buf, "black"); + sprintf(buf, "%dx%d-%d+%d",RF_DEMO_METER_WIDTH, RF_DEMO_METER_HEIGHT, RF_DEMO_METER_WIDTH+RF_DEMO_METER_SPACING,vpos * (RF_DEMO_METER_HEIGHT+RF_DEMO_METER_VSPACE)); + sprintf(title,"%s %d/%d Disk IOs/sec",demoMeterTags[rf_demoMeterTag],C,G); + disk_iops_meter = rf_CreateMeter(title, buf, "red"); + if (doMax) { + sprintf(buf, "%dx%d-%d+%d",RF_DEMO_METER_WIDTH, RF_DEMO_METER_HEIGHT, 2*(RF_DEMO_METER_WIDTH+RF_DEMO_METER_SPACING),vpos * (RF_DEMO_METER_HEIGHT+RF_DEMO_METER_VSPACE)); + sprintf(title,"%s %d/%d Avg User IOs/s",demoMeterTags[rf_demoMeterTag],C,G); + max_user_meter = rf_CreateMeter(title, buf, "black"); + sprintf(buf, "%dx%d-%d+%d",RF_DEMO_METER_WIDTH, RF_DEMO_METER_HEIGHT, 3*(RF_DEMO_METER_WIDTH+RF_DEMO_METER_SPACING), vpos * (RF_DEMO_METER_HEIGHT+RF_DEMO_METER_VSPACE)); + sprintf(title,"%s %d/%d Avg Disk IOs/s",demoMeterTags[rf_demoMeterTag],C,G); + max_disk_meter = rf_CreateMeter(title, buf, "red"); + sprintf(buf, "%dx%d-%d+%d",RF_DEMO_METER_WIDTH, RF_DEMO_METER_HEIGHT, 4*(RF_DEMO_METER_WIDTH+RF_DEMO_METER_SPACING), vpos * (RF_DEMO_METER_HEIGHT+RF_DEMO_METER_VSPACE)); + } else { + sprintf(buf, "%dx%d-%d+%d",RF_DEMO_METER_WIDTH, RF_DEMO_METER_HEIGHT, 2*(RF_DEMO_METER_WIDTH+RF_DEMO_METER_SPACING), vpos * (RF_DEMO_METER_HEIGHT+RF_DEMO_METER_VSPACE)); + } + sprintf(title,"%s %d/%d Avg User Resp Time (ms)",demoMeterTags[rf_demoMeterTag],C,G); + avg_resp_time_meter = rf_CreateMeter(title, buf, "blue"); + rc = rf_mutex_init(&iops_mutex); + if (rc) { + RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__, + __LINE__, rc); + return; + } + user_ios_so_far = disk_ios_so_far = max_user = max_disk = 0; + user_resp_time_sum_ms = 0; + + meter_update_terminate = 0; +#ifndef SIMULATE + pthread_create(&update_thread_desc, raidframe_attr_default, (pthread_startroutine_t)rf_meter_update_thread, NULL); +#endif /* !SIMULATE */ + gettimeofday(&iops_starttime, NULL); + iops_initialized = 1; +} + + +void rf_update_user_stats(resptime) + int resptime; +{ + if (!iops_initialized && !recon_initialized) return; + RF_LOCK_MUTEX(iops_mutex); + user_ios_so_far++; + user_resp_time_sum_ms += resptime; + RF_UNLOCK_MUTEX(iops_mutex); +} + +void rf_update_disk_iops(val) + int val; +{ + if (!iops_initialized) return; + RF_LOCK_MUTEX(iops_mutex); + disk_ios_so_far += val; + RF_UNLOCK_MUTEX(iops_mutex); 
+} + +void rf_meter_update_thread() +{ + struct timeval now, diff; + int iops, resptime; + float secs; + +#ifndef SIMULATE + while (!meter_update_terminate) { + gettimeofday(&now, NULL); + RF_TIMEVAL_DIFF(&iops_starttime, &now, &diff); + secs = ((float) diff.tv_sec) + ((float) diff.tv_usec)/1000000.0; +#else /* !SIMULATE */ + secs = rf_cur_time; +#endif /* !SIMULATE */ + if (user_iops_meter >= 0) { + iops = (secs!=0.0) ? (int) (((float) user_ios_so_far) / secs) : 0; + rf_UpdateMeter(user_iops_meter, iops); + if (max_user_meter && iops > max_user) {max_user = iops; rf_UpdateMeter(max_user_meter, iops);} + } + + if (disk_iops_meter >= 0) { + iops = (secs!=0.0) ? (int) (((float) disk_ios_so_far) / secs) : 0; + rf_UpdateMeter(disk_iops_meter, iops); + if (max_disk_meter && iops > max_disk) {max_disk = iops; rf_UpdateMeter(max_disk_meter, iops);} + } + + if (recon_pctg_meter >= 0) { + rf_UpdateMeter(recon_pctg_meter, recon_pctg); + } + + switch (rf_demo_op_mode){ + case RF_DEMO_FAULT_FREE: + resptime = (user_ios_so_far != 0) ? user_resp_time_sum_ms / user_ios_so_far : 0; + if (resptime && (ff_avg_resp_time_meter >=0)) + rf_UpdateMeter(ff_avg_resp_time_meter, resptime); + user_ios_ff += user_ios_so_far; + user_resp_time_sum_ff += user_resp_time_sum_ms; + break; + case RF_DEMO_DEGRADED: + resptime = (user_ios_so_far != 0) ? user_resp_time_sum_ms / user_ios_so_far : 0; + if (resptime &&(deg_avg_resp_time_meter >=0)) + rf_UpdateMeter(deg_avg_resp_time_meter, resptime); + user_ios_deg += user_ios_so_far; + user_resp_time_sum_deg += user_resp_time_sum_ms; + case RF_DEMO_RECON: + resptime = (user_ios_so_far != 0) ? user_resp_time_sum_ms / user_ios_so_far : 0; + if (resptime && (recon_avg_resp_time_meter >= 0)) + rf_UpdateMeter(recon_avg_resp_time_meter, resptime); + user_ios_recon += user_ios_so_far; + user_resp_time_sum_recon += user_resp_time_sum_ms; + break; + default: printf("WARNING: demo meter update thread: Invalid op mode! \n"); + } + user_ios_so_far = 0; + user_resp_time_sum_ms = 0; +#ifndef SIMULATE + RF_DELAY_THREAD(1,0); + } +#endif /* !SIMULATE */ +} + +void rf_finish_iops_demo() +{ + long status; + + if (!iops_initialized) return; + iops_initialized = 0; /* make sure any subsequent update calls don't do anything */ + meter_update_terminate = 1; +#ifndef SIMULATE + pthread_join(update_thread_desc, (pthread_addr_t)&status); +#endif /* !SIMULATE */ + + rf_DestroyMeter(user_iops_meter, (doMax) ? 1 : 0); + rf_DestroyMeter(disk_iops_meter, (doMax) ? 1 : 0); + rf_DestroyMeter(max_user_meter, 0); + rf_DestroyMeter(max_disk_meter, 0); + rf_DestroyMeter(avg_resp_time_meter, 0); + rf_mutex_destroy(&iops_mutex); +} + +void rf_demo_update_mode(arg_mode) + int arg_mode; +{ + int hpos; + char buf[100], title[100]; + + switch (rf_demo_op_mode = arg_mode) { + case RF_DEMO_DEGRADED: + + /* freeze fault-free response time meter; create degraded mode meter */ + hpos=rf_demoMeterHpos+2; + sprintf(buf, "%dx%d-%d+%d",RF_DEMO_METER_WIDTH, RF_DEMO_METER_HEIGHT, hpos * (RF_DEMO_METER_WIDTH+RF_DEMO_METER_SPACING), vpos * (RF_DEMO_METER_HEIGHT+RF_DEMO_METER_VSPACE)); + sprintf(title,"Degraded Mode Average Response Time (ms)",demoMeterTags[rf_demoMeterTag]); + deg_avg_resp_time_meter = rf_CreateMeter(title, buf, "purple"); + rf_UpdateMeter(ff_avg_resp_time_meter, (user_ios_ff == 0)? 
0: user_resp_time_sum_ff/user_ios_ff); + break; + + case RF_DEMO_RECON: + + /* freeze degraded mode response time meter; create recon meters */ + hpos = rf_demoMeterHpos+1; + sprintf(buf, "%dx%d-%d+%d",RF_DEMO_METER_WIDTH, RF_DEMO_METER_HEIGHT, hpos * (RF_DEMO_METER_WIDTH+RF_DEMO_METER_SPACING), vpos * (RF_DEMO_METER_HEIGHT+RF_DEMO_METER_VSPACE)); + sprintf(title,"Reconstruction Average Response Time (ms)",demoMeterTags[rf_demoMeterTag]); + recon_avg_resp_time_meter = rf_CreateMeter(title, buf, "darkgreen"); + sprintf(buf, "%dx%d-%d+%d",RF_DEMO_METER_WIDTH, RF_DEMO_METER_HEIGHT, (rf_demoMeterHpos) * (RF_DEMO_METER_WIDTH + RF_DEMO_METER_SPACING), vpos * (RF_DEMO_METER_HEIGHT+RF_DEMO_METER_VSPACE)); + sprintf(title,"Percent Complete / Recon Time"); + recon_pctg_meter = rf_CreateMeter(title,buf,"red"); + rf_UpdateMeter(deg_avg_resp_time_meter, (user_ios_deg == 0)? 0: user_resp_time_sum_deg/user_ios_deg); + break; + + default: /*do nothing -- finish_recon_demo will update rest of meters */; + } + +} + + +/**************************************************************************************** + * reconstruction demo code + ***************************************************************************************/ + + +void rf_startup_recon_demo(meter_vpos, C, G, init) + int meter_vpos; + int C; + int G; + int init; +{ + char buf[100], title[100]; + int rc; + + vpos = meter_vpos; + if (init) { + /* init demo -- display ff resp time meter */ + sprintf(buf, "%dx%d-%d+%d",RF_DEMO_METER_WIDTH, RF_DEMO_METER_HEIGHT, (rf_demoMeterHpos+3) * (RF_DEMO_METER_WIDTH+RF_DEMO_METER_SPACING), vpos * (RF_DEMO_METER_HEIGHT+RF_DEMO_METER_VSPACE)); + sprintf(title,"%s %d/%d Fault-Free Avg User Resp Time (ms)",demoMeterTags[rf_demoMeterTag],C,G); + ff_avg_resp_time_meter = rf_CreateMeter(title, buf, "blue"); + } + rc = rf_mutex_init(&iops_mutex); + if (rc) { + RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__, + __LINE__, rc); + } + + meter_update_terminate = 0; +#ifndef SIMULATE + pthread_create(&update_thread_desc, raidframe_attr_default, (pthread_startroutine_t)rf_meter_update_thread, NULL); +#endif /* !SIMULATE */ + gettimeofday(&iops_starttime, NULL); + recon_initialized = 1; +} + +void rf_update_recon_meter(val) + int val; +{ + recon_pctg = val; +} + + +void rf_finish_recon_demo(etime) + struct timeval *etime; +{ + long status; + int hpos; + + hpos = rf_demoMeterHpos; + + recon_initialized = 0; /* make sure any subsequent + update calls don't do anything */ + recon_pctg = etime->tv_sec; /* display recon time on meter */ + + rf_UpdateMeter(recon_avg_resp_time_meter, (user_ios_recon == 0)? 
0: user_resp_time_sum_recon/user_ios_recon); + + rf_UpdateMeter(recon_pctg_meter, etime->tv_sec); + + meter_update_terminate = 1; + +#ifndef SIMULATE + pthread_join(update_thread_desc, (pthread_addr_t)&status); /* join the meter update thread */ +#endif /* !SIMULATE */ + rf_DestroyMeter(recon_pctg_meter, 0); + rf_DestroyMeter(ff_avg_resp_time_meter, 0); + rf_DestroyMeter(deg_avg_resp_time_meter, 0); + rf_DestroyMeter(recon_avg_resp_time_meter, 0); + rf_mutex_destroy(&iops_mutex); +} + + +/**************************************************************************************** + * meter manipulation code + ***************************************************************************************/ + +#define MAXMETERS 50 +static struct meter_info { int sd; int pid; char name[100]; } minfo[MAXMETERS]; +static int meter_num = 0; + +int rf_ConfigureMeters() +{ + int i; + for (i=0; i<MAXMETERS; i++) + minfo[i].sd = -1; + return(0); +} + +/* forks a dmeter process to create a 4-digit meter window + * "title" appears in the title bar of the meter window + * returns an integer handle (really a socket descriptor) by which + * the new meter can be accessed. + */ +static int rf_CreateMeter(title, geom, color) + char *title; + char *geom; + char *color; +{ + char geombuf[100], *clr; + int sd, pid, i, status; + struct sockaddr sa; + + if (!geom) sprintf(geombuf,"120x40-0+%d", 50*meter_num); else sprintf(geombuf, "%s", geom); + clr = (color) ? color : "black"; + sprintf(minfo[meter_num].name,"/tmp/xm_%d",meter_num); + unlink(minfo[meter_num].name); + + if ( !(pid = fork()) ) { + execlp("dmeter","dmeter","-noscroll","-t",title,"-geometry",geombuf,"-sa",minfo[meter_num].name,"-fg",clr,NULL); + perror("rf_CreateMeter: exec failed"); + return(-1); + } + + sd = socket(AF_UNIX,SOCK_STREAM,0); + sa.sa_family = AF_UNIX; + strcpy(sa.sa_data, minfo[meter_num].name); + for (i=0; i<50; i++) { /* this give us 25 seconds to get the meter running */ + if ( (status = connect(sd,&sa,sizeof(sa))) != -1) break; +#ifdef SIMULATE + sleep (1); +#else /* SIMULATE */ + RF_DELAY_THREAD(0, 500); +#endif /* SIMULATE */ + } + if (status == -1) { + perror("Unable to connect to meter"); + exit(1); + } + minfo[meter_num].sd = sd; + minfo[meter_num].pid = pid; + return(meter_num++); +} + +/* causes the meter to display the given value */ +void rf_UpdateMeter(meterid, value) + int meterid; + int value; +{ + if (write(minfo[meterid].sd, &value, sizeof(int)) < sizeof(int)) { + fprintf(stderr,"Unable to write to meter %d\n",meterid); + } +} + +void rf_DestroyMeter(meterid, killproc) + int meterid; + int killproc; +{ + close(minfo[meterid].sd); + if (killproc) kill(minfo[meterid].pid, SIGTERM); + minfo[meterid].sd = -1; +} + +int rf_ShutdownAllMeters() +{ + int i; + + for (i=0; i<MAXMETERS; i++) + if (minfo[i].sd >= 0) + rf_DestroyMeter(i, 0); + return(0); +} + +#endif /* RF_DEMO > 0 */ diff --git a/sys/dev/raidframe/rf_demo.h b/sys/dev/raidframe/rf_demo.h new file mode 100644 index 00000000000..90a20935d57 --- /dev/null +++ b/sys/dev/raidframe/rf_demo.h @@ -0,0 +1,83 @@ +/* $OpenBSD: rf_demo.h,v 1.1 1999/01/11 14:29:15 niklas Exp $ */ +/* $NetBSD: rf_demo.h,v 1.1 1998/11/13 04:20:28 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. 
+ * + * Author: Mark Holland, Khalil Amiri + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* rf_demo.h + * some constants for demo'ing software + */ + +/* : + * Log: rf_demo.h,v + * Revision 1.8 1996/06/14 23:15:38 jimz + * attempt to deal with thread GC problem + * + * Revision 1.7 1996/06/09 02:36:46 jimz + * lots of little crufty cleanup- fixup whitespace + * issues, comment #ifdefs, improve typing in some + * places (esp size-related) + * + * Revision 1.6 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.5 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.4 1995/12/01 15:58:53 root + * added copyright info + * + */ + +#ifndef _RF__RF_DEMO_H_ +#define _RF__RF_DEMO_H_ + +#include "rf_types.h" + +#define RF_DEMO_METER_WIDTH 300 /* how wide each meter is */ +#define RF_DEMO_METER_HEIGHT 150 /* how tall */ +#define RF_DEMO_METER_SPACING 15 /* how much space between horizontally */ +#define RF_DEMO_METER_VSPACE 20 /* how much space between vertically */ +#define RF_DEMO_FAULT_FREE 0 +#define RF_DEMO_DEGRADED 1 +#define RF_DEMO_RECON 2 + +void rf_startup_iops_demo(int meter_vpos, int C, int G); +void rf_update_user_stats(int resptime); +void rf_update_disk_iops(int val); +void rf_meter_update_thread(void); +void rf_finish_iops_demo(void); +void rf_demo_update_mode(int arg_mode); +void rf_startup_recon_demo(int meter_vpos, int C, int G, int init); +void rf_update_recon_meter(int val); +void rf_finish_recon_demo(struct timeval *etime); + +extern int rf_demo_op_mode; + +#endif /* !_RF__RF_DEMO_H_ */ diff --git a/sys/dev/raidframe/rf_desc.h b/sys/dev/raidframe/rf_desc.h new file mode 100644 index 00000000000..a1a8e4f3684 --- /dev/null +++ b/sys/dev/raidframe/rf_desc.h @@ -0,0 +1,181 @@ +/* $OpenBSD: rf_desc.h,v 1.1 1999/01/11 14:29:15 niklas Exp $ */ +/* $NetBSD: rf_desc.h,v 1.1 1998/11/13 04:20:28 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. 
CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* + * : + * Log: rf_desc.h,v + * Revision 1.29 1996/07/22 19:52:16 jimz + * switched node params to RF_DagParam_t, a union of + * a 64-bit int and a void *, for better portability + * attempted hpux port, but failed partway through for + * lack of a single C compiler capable of compiling all + * source files + * + * Revision 1.28 1996/06/07 22:49:22 jimz + * fix up raidPtr typing + * + * Revision 1.27 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.26 1996/06/05 18:06:02 jimz + * Major code cleanup. The Great Renaming is now done. + * Better modularity. Better typing. Fixed a bunch of + * synchronization bugs. Made a lot of global stuff + * per-desc or per-array. Removed dead code. + * + * Revision 1.25 1996/06/02 17:31:48 jimz + * Moved a lot of global stuff into array structure, where it belongs. + * Fixed up paritylogging, pss modules in this manner. Some general + * code cleanup. Removed lots of dead code, some dead files. + * + * Revision 1.24 1996/05/30 11:29:41 jimz + * Numerous bug fixes. Stripe lock release code disagreed with the taking code + * about when stripes should be locked (I made it consistent: no parity, no lock) + * There was a lot of extra serialization of I/Os which I've removed- a lot of + * it was to calculate values for the cache code, which is no longer with us. + * More types, function, macro cleanup. Added code to properly quiesce the array + * on shutdown. Made a lot of stuff array-specific which was (bogusly) general + * before. Fixed memory allocation, freeing bugs. 
+ * + * Revision 1.23 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.22 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.21 1996/05/24 04:28:55 jimz + * release cleanup ckpt + * + * Revision 1.20 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.19 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.18 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.17 1995/12/01 15:58:43 root + * added copyright info + * + * Revision 1.16 1995/11/19 16:31:30 wvcii + * descriptors now contain an array of dag lists as opposed to a dag header + * + * Revision 1.15 1995/11/07 16:24:17 wvcii + * updated def of _AccessState + * + */ + +#ifndef _RF__RF_DESC_H_ +#define _RF__RF_DESC_H_ + +#include "rf_archs.h" +#include "rf_types.h" +#include "rf_etimer.h" +#include "rf_dag.h" + +struct RF_RaidReconDesc_s { + RF_Raid_t *raidPtr; /* raid device descriptor */ + RF_RowCol_t row; /* row of failed disk */ + RF_RowCol_t col; /* col of failed disk */ + int state; /* how far along the reconstruction operation has gotten */ + RF_RaidDisk_t *spareDiskPtr; /* describes target disk for recon (not used in dist sparing) */ + int numDisksDone; /* the number of surviving disks that have completed their work */ + RF_RowCol_t srow; /* row ID of the spare disk (not used in dist sparing) */ + RF_RowCol_t scol; /* col ID of the spare disk (not used in dist sparing) */ +#ifdef KERNEL + /* + * Prevent recon from hogging CPU + */ + RF_Etimer_t recon_exec_timer; + RF_uint64 reconExecTimerRunning; + RF_uint64 reconExecTicks; + RF_uint64 maxReconExecTicks; +#endif /* KERNEL */ + +#if RF_RECON_STATS > 0 + RF_uint64 hsStallCount; /* head sep stall count */ + RF_uint64 numReconExecDelays; + RF_uint64 numReconEventWaits; +#endif /* RF_RECON_STATS > 0 */ + RF_RaidReconDesc_t *next; +}; + +struct RF_RaidAccessDesc_s { + RF_Raid_t *raidPtr; /* raid device descriptor */ + RF_IoType_t type; /* read or write */ + RF_RaidAddr_t raidAddress; /* starting address in raid address space */ + RF_SectorCount_t numBlocks; /* number of blocks (sectors) to transfer */ + RF_StripeCount_t numStripes; /* number of stripes involved in access */ + caddr_t bufPtr; /* pointer to data buffer */ + +#if !defined(KERNEL) && !defined(SIMULATE) + caddr_t obufPtr; /* real pointer to data buffer */ +#endif /* !KERNEL && !SIMULATE */ + + RF_RaidAccessFlags_t flags; /* flags controlling operation */ + int state; /* index into states telling how far along the RAID operation has gotten */ + RF_AccessState_t *states; /* array of states to be run */ + int status; /* pass/fail status of the last operation */ + RF_DagList_t *dagArray; /* array of dag lists, one list per stripe */ + RF_AccessStripeMapHeader_t *asmap; /* the asm for this I/O */ + void *bp; /* buf pointer for this RAID acc. 
ignored outside the kernel */ + RF_DagHeader_t **paramDAG; /* allows the DAG to be returned to the caller after I/O completion */ + RF_AccessStripeMapHeader_t **paramASM; /* allows the ASM to be returned to the caller after I/O completion */ + RF_AccTraceEntry_t tracerec; /* perf monitoring information for a user access (not for dag stats) */ + void (*callbackFunc)(RF_CBParam_t); /* callback function for this I/O */ + void *callbackArg; /* arg to give to callback func */ + int tid; /* debug only, user-level only: thread id of thr that did this access */ + + RF_AllocListElem_t *cleanupList; /* memory to be freed at the end of the access*/ + + RF_RaidAccessDesc_t *next; + RF_RaidAccessDesc_t *head; + + int numPending; + + RF_DECLARE_MUTEX(mutex) /* these are used to implement blocking I/O */ + RF_DECLARE_COND(cond) + +#ifdef SIMULATE + RF_Owner_t owner; + int async_flag; +#endif /* SIMULATE */ + + RF_Etimer_t timer; /* used for timing this access */ +}; + +#endif /* !_RF__RF_DESC_H_ */ diff --git a/sys/dev/raidframe/rf_diskevent.c b/sys/dev/raidframe/rf_diskevent.c new file mode 100644 index 00000000000..927f9ef0e29 --- /dev/null +++ b/sys/dev/raidframe/rf_diskevent.c @@ -0,0 +1,291 @@ +/* $OpenBSD: rf_diskevent.c,v 1.1 1999/01/11 14:29:16 niklas Exp $ */ +/* $NetBSD: rf_diskevent.c,v 1.1 1998/11/13 04:20:28 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Rachad Youssef + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* + * rf_diskevent. - support for disk device, by managing a heap of future events + * adapted from original code by David Kotz, Song Bac Toh (1994) + */ + +/* : + * Log: rf_diskevent.c,v + * Revision 1.18 1996/07/28 20:31:39 jimz + * i386netbsd port + * true/false fixup + * + * Revision 1.17 1996/07/27 16:05:19 jimz + * return ENOMEM if DDEventInit fails its call to InitHeap + * + * Revision 1.16 1996/06/10 12:06:24 jimz + * fix spelling errors + * + * Revision 1.15 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). + * + * Revision 1.14 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.13 1996/06/02 17:31:48 jimz + * Moved a lot of global stuff into array structure, where it belongs. + * Fixed up paritylogging, pss modules in this manner. Some general + * code cleanup. Removed lots of dead code, some dead files. 
+ * + * Revision 1.12 1996/05/30 11:29:41 jimz + * Numerous bug fixes. Stripe lock release code disagreed with the taking code + * about when stripes should be locked (I made it consistent: no parity, no lock) + * There was a lot of extra serialization of I/Os which I've removed- a lot of + * it was to calculate values for the cache code, which is no longer with us. + * More types, function, macro cleanup. Added code to properly quiesce the array + * on shutdown. Made a lot of stuff array-specific which was (bogusly) general + * before. Fixed memory allocation, freeing bugs. + * + * Revision 1.11 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.10 1996/05/24 04:28:55 jimz + * release cleanup ckpt + * + * Revision 1.9 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.8 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.7 1995/12/01 15:57:56 root + * added copyright info + * + */ + +#include "rf_types.h" +#include "rf_heap.h" +#include "rf_diskevent.h" +#include "rf_general.h" +#include "rf_dag.h" +#include "rf_diskthreads.h" +#include "rf_states.h" +#include "rf_shutdown.h" + +/* trace printing can be turned on/off in the Makefile */ + +RF_TICS_t rf_cur_time; +static RF_Owner_t cur_owner; +static RF_Heap_t heap; + +static void rf_DDEventShutdown(ignored) + void *ignored; +{ + rf_FreeHeap(heap); +} + +/* ======================================================================== */ +/* DDEventInit + * + * Initialize the event heap. + */ +int rf_DDEventInit(listp) + RF_ShutdownList_t **listp; +{ + int rc; + + heap = rf_InitHeap(RF_HEAP_MAX); /* initialize the heap */ + if (heap == NULL) + return(ENOMEM); + rc = rf_ShutdownCreate(listp, rf_DDEventShutdown, NULL); + if (rc) { + RF_ERRORMSG3("RAIDFRAME: failed creating shutdown event file %s line %d rc=%d\n", + __FILE__, __LINE__, rc); + rf_FreeHeap(heap); + return(rc); + } + rf_cur_time=(RF_TICS_t)0; + return(0); +} + + + +/* DDEventRequest + * + * Put an event request into the event heap. 
+ */ +void rf_DDEventRequest( + RF_TICS_t eventTime, + int (*CompleteFunc)(), + void *argument, + RF_Owner_t owner, + RF_RowCol_t row, + RF_RowCol_t col, + RF_Raid_t *raidPtr, + void *diskid) +{ + RF_HeapData_t *hpdat; + + RF_Malloc(hpdat,sizeof(RF_HeapData_t),(RF_HeapData_t *) ); + if (hpdat == NULL) { + fprintf(stderr, "DDEventRequest: malloc failed\n"); + RF_PANIC(); + } + + hpdat->eventTime = eventTime; + hpdat->CompleteFunc = CompleteFunc; + hpdat->argument = argument; + hpdat->owner = owner; + hpdat->row = row; + hpdat->col = col; + hpdat->raidPtr = raidPtr; + hpdat->diskid = diskid; + rf_AddHeap(heap, hpdat, (hpdat->eventTime)); +} + +void rf_DAGEventRequest( + RF_TICS_t eventTime, + RF_Owner_t owner, + RF_RowCol_t row, + RF_RowCol_t col, + RF_RaidAccessDesc_t *desc, + RF_Raid_t *raidPtr) +{ + RF_HeapData_t *hpdat; + + RF_Malloc(hpdat,sizeof(RF_HeapData_t),(RF_HeapData_t *) ); + if (hpdat == NULL) { + fprintf(stderr, "DDEventRequest: malloc failed\n"); + RF_PANIC(); + } + + hpdat->eventTime = eventTime; + hpdat->CompleteFunc = NULL; + hpdat->argument = NULL; + hpdat->owner = owner; + hpdat->row = row; + hpdat->col = col; + hpdat->desc=desc; + hpdat->raidPtr = raidPtr; + + rf_AddHeap(heap, hpdat, (hpdat->eventTime)); +} + + +/* ------------------------------------------------------------------------ */ +/* @SUBTITLE "Print out the request queue" */ +/* There is only 1 request queue so no argument is needed for this + function */ +void rf_DDPrintRequests() +{ + RF_HeapData_t *Hpdat; + RF_HeapKey_t Hpkey; + RF_Heap_t tempHp; + + printf("Events on heap:\n"); + + tempHp = rf_InitHeap(RF_HEAP_MAX); + while (rf_RemHeap(heap, &Hpdat, &Hpkey) != RF_HEAP_NONE) + { + printf ("at %5g HpKey there is: something for owner %d at disk %d %d\n",Hpkey, + Hpdat->owner,Hpdat->row,Hpdat->col); + rf_AddHeap(tempHp, Hpdat, Hpdat->eventTime); + } + + printf("END heap:\n"); + rf_FreeHeap(heap); /* free the empty old heap */ + + heap = tempHp; /* restore the recycled heap */ +} +/* ------------------------------------------------------------------------ */ + +int rf_ProcessEvent() +{ + RF_HeapData_t *Hpdat; + RF_HeapKey_t Hpkey; + int retcode; + + retcode = rf_RemHeap(heap, &Hpdat, &Hpkey); + + if (retcode==RF_HEAP_FOUND) { + if (rf_eventDebug) { + rf_DDPrintRequests(); + printf ("Now processing: at %5g something for owner %d at disk %d %d\n", + Hpkey, Hpdat->owner, Hpdat->row, Hpdat->col); + } + rf_cur_time=Hpkey; + + rf_SetCurrentOwner(Hpdat->owner); + + if (Hpdat->row>=0) {/* ongoing dag event */ + rf_SetDiskIdle (Hpdat->raidPtr, Hpdat->row, Hpdat->col); + if (Hpdat->diskid != NULL) { + rf_simulator_complete_io(Hpdat->diskid); + } + retcode=(Hpdat->CompleteFunc)(Hpdat->argument,0); + if (retcode==RF_HEAP_FOUND) + (((RF_DagNode_t *) (Hpdat->argument))->dagHdr->cbFunc)(((RF_DagNode_t *) (Hpdat->argument))->dagHdr->cbArg); + RF_Free(Hpdat,sizeof(RF_HeapData_t)); + return(retcode); + } + else { + /* this is a dag event or reconstruction event */ + if (Hpdat->row==RF_DD_DAGEVENT_ROW){ /* dag event */ + rf_ContinueRaidAccess(Hpdat->desc); + retcode = RF_FALSE; + RF_Free(Hpdat,sizeof(RF_HeapData_t)); + return (RF_FALSE); + } + else { + /* recon event */ + retcode=(Hpdat->CompleteFunc)(Hpdat->argument,0); + retcode = RF_FALSE; + RF_Free(Hpdat,sizeof(RF_HeapData_t)); + return (RF_FALSE); + } + } + } + if (rf_eventDebug) + printf("HEAP is empty\n"); + return(RF_DD_NOTHING_THERE); +} + +RF_Owner_t rf_GetCurrentOwner() +{ + return(cur_owner); +} + +void rf_SetCurrentOwner(RF_Owner_t owner) +{ + cur_owner=owner; +} + 
+RF_TICS_t rf_CurTime() +{ + return(rf_cur_time); +} diff --git a/sys/dev/raidframe/rf_diskevent.h b/sys/dev/raidframe/rf_diskevent.h new file mode 100644 index 00000000000..103ddde7d13 --- /dev/null +++ b/sys/dev/raidframe/rf_diskevent.h @@ -0,0 +1,97 @@ +/* $OpenBSD: rf_diskevent.h,v 1.1 1999/01/11 14:29:16 niklas Exp $ */ +/* $NetBSD: rf_diskevent.h,v 1.1 1998/11/13 04:20:28 oster Exp $ */ +/* + * rf_diskevent.h + * Adapted from original code by David Kotz (1994) + * + * The disk-device module is event driven. This module keeps the event + * request mechanism, which is based on proteus SimRequests, + * abstracted away from the bulk of the disk device code. + * + * Functions + * DDEventInit + * DDEventRequest + * DDEventPrint + * DDEventCancel + */ + +/* : + * Log: rf_diskevent.h,v + * Revision 1.10 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). + * + * Revision 1.9 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.8 1996/06/02 17:31:48 jimz + * Moved a lot of global stuff into array structure, where it belongs. + * Fixed up paritylogging, pss modules in this manner. Some general + * code cleanup. Removed lots of dead code, some dead files. + * + * Revision 1.7 1996/05/30 11:29:41 jimz + * Numerous bug fixes. Stripe lock release code disagreed with the taking code + * about when stripes should be locked (I made it consistent: no parity, no lock) + * There was a lot of extra serialization of I/Os which I've removed- a lot of + * it was to calculate values for the cache code, which is no longer with us. + * More types, function, macro cleanup. Added code to properly quiesce the array + * on shutdown. Made a lot of stuff array-specific which was (bogusly) general + * before. Fixed memory allocation, freeing bugs. 
+ * + * Revision 1.6 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.5 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.4 1995/12/01 15:57:16 root + * added copyright info + * + */ + +#ifndef _RF__RF_DISKEVENT_H_ +#define _RF__RF_DISKEVENT_H_ + +#include "rf_types.h" +#include "rf_heap.h" +#if !defined(__NetBSD__) && !defined(__OpenBSD__) +#include "time.h" +#endif + +#define RF_DD_NOTHING_THERE (-1) +#define RF_DD_DAGEVENT_ROW (-3) +#define RF_DD_DAGEVENT_COL RF_DD_DAGEVENT_ROW + +extern RF_TICS_t rf_cur_time; + +/* + * list of disk-device request types, + * initialized in diskdevice.c, + * used in diskevent.c + */ +typedef void (*RF_DDhandler)(int disk, RF_TICS_t eventTime); +struct RF_dd_handlers_s { + RF_DDhandler handler; /* function implementing this event type */ + char name[20]; /* name of that event type */ +}; +extern struct RF_dd_handlers_s rf_DDhandlers[]; + +int rf_DDEventInit(RF_ShutdownList_t **listp); +void rf_DDEventRequest(RF_TICS_t eventTime, int (*CompleteFunc)(), + void *argument, RF_Owner_t owner, RF_RowCol_t row, RF_RowCol_t col, + RF_Raid_t *raidPtr, void *diskid); +void rf_DAGEventRequest(RF_TICS_t eventTime, RF_Owner_t owner, + RF_RowCol_t row, RF_RowCol_t col, RF_RaidAccessDesc_t *desc, + RF_Raid_t *raidPtr); +void rf_DDPrintRequests(void); +int rf_ProcessEvent(void); +RF_Owner_t rf_GetCurrentOwner(void); +void rf_SetCurrentOwner(RF_Owner_t owner); +RF_TICS_t rf_CurTime(void); + +#endif /* !_RF__RF_DISKEVENT_H_ */ diff --git a/sys/dev/raidframe/rf_diskqueue.c b/sys/dev/raidframe/rf_diskqueue.c new file mode 100644 index 00000000000..cd01f3c531f --- /dev/null +++ b/sys/dev/raidframe/rf_diskqueue.c @@ -0,0 +1,929 @@ +/* $OpenBSD: rf_diskqueue.c,v 1.1 1999/01/11 14:29:17 niklas Exp $ */ +/* $NetBSD: rf_diskqueue.c,v 1.2 1998/12/03 14:58:24 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/**************************************************************************************** + * + * rf_diskqueue.c -- higher-level disk queue code + * + * the routines here are a generic wrapper around the actual queueing + * routines. The code here implements thread scheduling, synchronization, + * and locking ops (see below) on top of the lower-level queueing code. + * + * to support atomic RMW, we implement "locking operations". 
When a locking op + * is dispatched to the lower levels of the driver, the queue is locked, and no further + * I/Os are dispatched until the queue receives & completes a corresponding "unlocking + * operation". This code relies on the higher layers to guarantee that a locking + * op will always be eventually followed by an unlocking op. The model is that + * the higher layers are structured so locking and unlocking ops occur in pairs, i.e. + * an unlocking op cannot be generated until after a locking op reports completion. + * There is no good way to check to see that an unlocking op "corresponds" to the + * op that currently has the queue locked, so we make no such attempt. Since by + * definition there can be only one locking op outstanding on a disk, this should + * not be a problem. + * + * In the kernel, we allow multiple I/Os to be concurrently dispatched to the disk + * driver. In order to support locking ops in this environment, when we decide to + * do a locking op, we stop dispatching new I/Os and wait until all dispatched I/Os + * have completed before dispatching the locking op. + * + * Unfortunately, the code is different in the 3 different operating states + * (user level, kernel, simulator). In the kernel, I/O is non-blocking, and + * we have no disk threads to dispatch for us. Therefore, we have to dispatch + * new I/Os to the scsi driver at the time of enqueue, and also at the time + * of completion. At user level, I/O is blocking, and so only the disk threads + * may dispatch I/Os. Thus at user level, all we can do at enqueue time is + * enqueue and wake up the disk thread to do the dispatch. + * + ***************************************************************************************/ + +/* + * : + * + * Log: rf_diskqueue.c,v + * Revision 1.50 1996/08/07 21:08:38 jimz + * b_proc -> kb_proc + * + * Revision 1.49 1996/07/05 20:36:14 jimz + * make rf_ConfigureDiskQueueSystem return 0 + * + * Revision 1.48 1996/06/18 20:53:11 jimz + * fix up disk queueing (remove configure routine, + * add shutdown list arg to create routines) + * + * Revision 1.47 1996/06/14 14:16:36 jimz + * fix handling of bogus queue type + * + * Revision 1.46 1996/06/13 20:41:44 jimz + * add scan, cscan, random queueing + * + * Revision 1.45 1996/06/11 01:27:50 jimz + * Fixed bug where diskthread shutdown would crash or hang. This + * turned out to be two distinct bugs: + * (1) [crash] The thread shutdown code wasn't properly waiting for + * all the diskthreads to complete. This caused diskthreads that were + * exiting+cleaning up to unlock a destroyed mutex. + * (2) [hang] TerminateDiskQueues wasn't locking, and DiskIODequeue + * only checked for termination _after_ a wakeup if the queues were + * empty. This was a race where the termination wakeup could be lost + * by the dequeueing thread, and the system would hang waiting for the + * thread to exit, while the thread waited for an I/O or a signal to + * check the termination flag. + * + * Revision 1.44 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). 
+ * + * Revision 1.43 1996/06/09 02:36:46 jimz + * lots of little crufty cleanup- fixup whitespace + * issues, comment #ifdefs, improve typing in some + * places (esp size-related) + * + * Revision 1.42 1996/06/07 22:26:27 jimz + * type-ify which_ru (RF_ReconUnitNum_t) + * + * Revision 1.41 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.40 1996/06/06 17:28:04 jimz + * track sector number of last I/O dequeued + * + * Revision 1.39 1996/06/06 01:14:13 jimz + * fix crashing bug when tracerec is NULL (ie, from copyback) + * initialize req->queue + * + * Revision 1.38 1996/06/05 19:38:32 jimz + * fixed up disk queueing types config + * added sstf disk queueing + * fixed exit bug on diskthreads (ref-ing bad mem) + * + * Revision 1.37 1996/06/05 18:06:02 jimz + * Major code cleanup. The Great Renaming is now done. + * Better modularity. Better typing. Fixed a bunch of + * synchronization bugs. Made a lot of global stuff + * per-desc or per-array. Removed dead code. + * + * Revision 1.36 1996/05/30 23:22:16 jimz + * bugfixes of serialization, timing problems + * more cleanup + * + * Revision 1.35 1996/05/30 12:59:18 jimz + * make etimer happier, more portable + * + * Revision 1.34 1996/05/30 11:29:41 jimz + * Numerous bug fixes. Stripe lock release code disagreed with the taking code + * about when stripes should be locked (I made it consistent: no parity, no lock) + * There was a lot of extra serialization of I/Os which I've removed- a lot of + * it was to calculate values for the cache code, which is no longer with us. + * More types, function, macro cleanup. Added code to properly quiesce the array + * on shutdown. Made a lot of stuff array-specific which was (bogusly) general + * before. Fixed memory allocation, freeing bugs. 
+ * + * Revision 1.33 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.32 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.31 1996/05/24 01:59:45 jimz + * another checkpoint in code cleanup for release + * time to sync kernel tree + * + * Revision 1.30 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.29 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.28 1996/05/20 16:14:29 jimz + * switch to rf_{mutex,cond}_{init,destroy} + * + * Revision 1.27 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.26 1996/05/16 19:21:49 wvcii + * fixed typo in init_dqd + * + * Revision 1.25 1996/05/16 16:02:51 jimz + * switch to RF_FREELIST stuff for DiskQueueData + * + * Revision 1.24 1996/05/10 16:24:14 jimz + * new cvscan function names + * + * Revision 1.23 1996/05/01 16:27:54 jimz + * don't use ccmn bp management + * + * Revision 1.22 1995/12/12 18:10:06 jimz + * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT + * fix 80-column brain damage in comments + * + * Revision 1.21 1995/12/01 15:59:59 root + * added copyright info + * + * Revision 1.20 1995/11/07 16:27:20 wvcii + * added Peek() function to diskqueuesw + * non-locking accesses are never blocked (assume clients enforce proper + * respect for lock acquisition) + * + * Revision 1.19 1995/10/05 18:56:52 jimz + * fix req handling in IOComplete + * + * Revision 1.18 1995/10/04 20:13:50 wvcii + * added asserts to monitor numOutstanding queueLength + * + * Revision 1.17 1995/10/04 07:43:52 wvcii + * queue->numOutstanding now valid for user & sim + * added queue->queueLength + * user tested & verified, sim untested + * + * Revision 1.16 1995/09/12 00:21:19 wvcii + * added support for tracing disk queue time + * + */ + +#include "rf_types.h" +#include "rf_threadstuff.h" +#include "rf_threadid.h" +#include "rf_raid.h" +#include "rf_diskqueue.h" +#include "rf_alloclist.h" +#include "rf_acctrace.h" +#include "rf_etimer.h" +#include "rf_configure.h" +#include "rf_general.h" +#include "rf_freelist.h" +#include "rf_debugprint.h" +#include "rf_shutdown.h" +#include "rf_cvscan.h" +#include "rf_sstf.h" +#include "rf_fifo.h" + +#ifdef SIMULATE +#include "rf_diskevent.h" +#endif /* SIMULATE */ + +#if !defined(__NetBSD__) && !defined(__OpenBSD__) +extern struct buf *ubc_bufget(); +#endif + +static int init_dqd(RF_DiskQueueData_t *); +static void clean_dqd(RF_DiskQueueData_t *); +static void rf_ShutdownDiskQueueSystem(void *); +/* From rf_kintf.c */ +int rf_DispatchKernelIO(RF_DiskQueue_t *,RF_DiskQueueData_t *); + + +#define Dprintf1(s,a) if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),NULL,NULL,NULL,NULL,NULL,NULL,NULL) +#define Dprintf2(s,a,b) if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),NULL,NULL,NULL,NULL,NULL,NULL) +#define Dprintf3(s,a,b,c) if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),NULL,NULL,NULL,NULL,NULL) +#define Dprintf4(s,a,b,c,d) if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned 
long)d),NULL,NULL,NULL,NULL) +#define Dprintf5(s,a,b,c,d,e) if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),NULL,NULL,NULL) + +#if !defined(KERNEL) && !defined(SIMULATE) + +/* queue must be locked before invoking this */ +#define SIGNAL_DISK_QUEUE(_q_,_wh_) \ +{ \ + if ( (_q_)->numWaiting > 0) { \ + (_q_)->numWaiting--; \ + RF_SIGNAL_COND( ((_q_)->cond) ); \ + } \ +} + +/* queue must be locked before invoking this */ +#define WAIT_DISK_QUEUE(_q_,_wh_) \ +{ \ + (_q_)->numWaiting++; \ + RF_WAIT_COND( ((_q_)->cond), ((_q_)->mutex) ); \ +} + +#else /* !defined(KERNEL) && !defined(SIMULATE) */ + +#define SIGNAL_DISK_QUEUE(_q_,_wh_) +#define WAIT_DISK_QUEUE(_q_,_wh_) + +#endif /* !defined(KERNEL) && !defined(SIMULATE) */ + +/***************************************************************************************** + * + * the disk queue switch defines all the functions used in the different queueing + * disciplines + * queue ID, init routine, enqueue routine, dequeue routine + * + ****************************************************************************************/ + +static RF_DiskQueueSW_t diskqueuesw[] = { + {"fifo", /* FIFO */ + rf_FifoCreate, + rf_FifoEnqueue, + rf_FifoDequeue, + rf_FifoPeek, + rf_FifoPromote}, + + {"cvscan", /* cvscan */ + rf_CvscanCreate, + rf_CvscanEnqueue, + rf_CvscanDequeue, + rf_CvscanPeek, + rf_CvscanPromote }, + + {"sstf", /* shortest seek time first */ + rf_SstfCreate, + rf_SstfEnqueue, + rf_SstfDequeue, + rf_SstfPeek, + rf_SstfPromote}, + + {"scan", /* SCAN (two-way elevator) */ + rf_ScanCreate, + rf_SstfEnqueue, + rf_ScanDequeue, + rf_ScanPeek, + rf_SstfPromote}, + + {"cscan", /* CSCAN (one-way elevator) */ + rf_CscanCreate, + rf_SstfEnqueue, + rf_CscanDequeue, + rf_CscanPeek, + rf_SstfPromote}, + +#if !defined(KERNEL) && RF_INCLUDE_QUEUE_RANDOM > 0 + /* to make a point to Chris :-> */ + {"random", /* random */ + rf_FifoCreate, + rf_FifoEnqueue, + rf_RandomDequeue, + rf_RandomPeek, + rf_FifoPromote}, +#endif /* !KERNEL && RF_INCLUDE_QUEUE_RANDOM > 0 */ +}; +#define NUM_DISK_QUEUE_TYPES (sizeof(diskqueuesw)/sizeof(RF_DiskQueueSW_t)) + +static RF_FreeList_t *rf_dqd_freelist; + +#define RF_MAX_FREE_DQD 256 +#define RF_DQD_INC 16 +#define RF_DQD_INITIAL 64 + +#if defined(__NetBSD__) || defined(__OpenBSD__) +#ifdef _KERNEL +#include <sys/buf.h> +#endif +#endif + +static int init_dqd(dqd) + RF_DiskQueueData_t *dqd; +{ +#ifdef KERNEL +#if defined(__NetBSD__) || defined(__OpenBSD__) + /* XXX not sure if the following malloc is appropriate... probably not quite... */ + dqd->bp = (struct buf *) malloc( sizeof(struct buf), M_DEVBUF, M_NOWAIT); + memset(dqd->bp,0,sizeof(struct buf)); /* if you don't do it, nobody else will.. */ + /* XXX */ + /* printf("NEED TO IMPLEMENT THIS BETTER!\n"); */ +#else + dqd->bp = ubc_bufget(); +#endif + if (dqd->bp == NULL) { + return(ENOMEM); + } +#endif /* KERNEL */ + return(0); +} + +static void clean_dqd(dqd) + RF_DiskQueueData_t *dqd; +{ +#ifdef KERNEL +#if defined(__NetBSD__) || defined(__OpenBSD__) + /* printf("NEED TO IMPLEMENT THIS BETTER(2)!\n"); */ + /* XXX ? */ + free( dqd->bp, M_DEVBUF ); +#else + ubc_buffree(dqd->bp); +#endif + +#endif /* KERNEL */ +} + +/* configures a single disk queue */ +static int config_disk_queue( + RF_Raid_t *raidPtr, + RF_DiskQueue_t *diskqueue, + RF_RowCol_t r, /* row & col -- debug only. BZZT not any more... 
*/ + RF_RowCol_t c, + RF_DiskQueueSW_t *p, + RF_SectorCount_t sectPerDisk, + dev_t dev, + int maxOutstanding, + RF_ShutdownList_t **listp, + RF_AllocListElem_t *clList) +{ + int rc; + + diskqueue->row = r; + diskqueue->col = c; + diskqueue->qPtr = p; + diskqueue->qHdr = (p->Create)(sectPerDisk, clList, listp); + diskqueue->dev = dev; + diskqueue->numOutstanding = 0; + diskqueue->queueLength = 0; + diskqueue->maxOutstanding = maxOutstanding; + diskqueue->curPriority = RF_IO_NORMAL_PRIORITY; + diskqueue->nextLockingOp = NULL; + diskqueue->unlockingOp = NULL; + diskqueue->numWaiting=0; + diskqueue->flags = 0; + diskqueue->raidPtr = raidPtr; +#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL) + diskqueue->rf_cinfo = &raidPtr->raid_cinfo[r][c]; +#endif + rc = rf_create_managed_mutex(listp, &diskqueue->mutex); + if (rc) { + RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__, + __LINE__, rc); + return(rc); + } + rc = rf_create_managed_cond(listp, &diskqueue->cond); + if (rc) { + RF_ERRORMSG3("Unable to init cond file %s line %d rc=%d\n", __FILE__, + __LINE__, rc); + return(rc); + } + return(0); +} + +static void rf_ShutdownDiskQueueSystem(ignored) + void *ignored; +{ + RF_FREELIST_DESTROY_CLEAN(rf_dqd_freelist,next,(RF_DiskQueueData_t *),clean_dqd); +} + +int rf_ConfigureDiskQueueSystem(listp) + RF_ShutdownList_t **listp; +{ + int rc; + + RF_FREELIST_CREATE(rf_dqd_freelist, RF_MAX_FREE_DQD, + RF_DQD_INC, sizeof(RF_DiskQueueData_t)); + if (rf_dqd_freelist == NULL) + return(ENOMEM); + rc = rf_ShutdownCreate(listp, rf_ShutdownDiskQueueSystem, NULL); + if (rc) { + RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n", + __FILE__, __LINE__, rc); + rf_ShutdownDiskQueueSystem(NULL); + return(rc); + } + RF_FREELIST_PRIME_INIT(rf_dqd_freelist, RF_DQD_INITIAL,next, + (RF_DiskQueueData_t *),init_dqd); + return(0); +} + +#ifndef KERNEL +/* this is called prior to shutdown to wakeup everyone waiting on a disk queue + * and tell them to exit + */ +void rf_TerminateDiskQueues(raidPtr) + RF_Raid_t *raidPtr; +{ + RF_RowCol_t r, c; + + raidPtr->terminate_disk_queues = 1; + for (r=0; r<raidPtr->numRow; r++) { + for (c=0; c<raidPtr->numCol + ((r==0) ? raidPtr->numSpare : 0); c++) { + RF_LOCK_QUEUE_MUTEX(&raidPtr->Queues[r][c], "TerminateDiskQueues"); + RF_BROADCAST_COND(raidPtr->Queues[r][c].cond); + RF_UNLOCK_QUEUE_MUTEX(&raidPtr->Queues[r][c], "TerminateDiskQueues"); + } + } +} +#endif /* !KERNEL */ + +int rf_ConfigureDiskQueues( + RF_ShutdownList_t **listp, + RF_Raid_t *raidPtr, + RF_Config_t *cfgPtr) +{ + RF_DiskQueue_t **diskQueues, *spareQueues; + RF_DiskQueueSW_t *p; + RF_RowCol_t r, c; + int rc, i; + + raidPtr->maxQueueDepth = cfgPtr->maxOutstandingDiskReqs; + + for(p=NULL,i=0;i<NUM_DISK_QUEUE_TYPES;i++) { + if (!strcmp(diskqueuesw[i].queueType, cfgPtr->diskQueueType)) { + p = &diskqueuesw[i]; + break; + } + } + if (p == NULL) { + RF_ERRORMSG2("Unknown queue type \"%s\". Using %s\n",cfgPtr->diskQueueType, diskqueuesw[0].queueType); + p = &diskqueuesw[0]; + } + + RF_CallocAndAdd(diskQueues, raidPtr->numRow, sizeof(RF_DiskQueue_t *), (RF_DiskQueue_t **), raidPtr->cleanupList); + if (diskQueues == NULL) { + return(ENOMEM); + } + raidPtr->Queues = diskQueues; + for (r=0; r<raidPtr->numRow; r++) { + RF_CallocAndAdd(diskQueues[r], raidPtr->numCol + ((r==0) ? 
raidPtr->numSpare : 0), sizeof(RF_DiskQueue_t), (RF_DiskQueue_t *), raidPtr->cleanupList); + if (diskQueues[r] == NULL) + return(ENOMEM); + for (c=0; c<raidPtr->numCol; c++) { + rc = config_disk_queue(raidPtr, &diskQueues[r][c], r, c, p, + raidPtr->sectorsPerDisk, raidPtr->Disks[r][c].dev, + cfgPtr->maxOutstandingDiskReqs, listp, raidPtr->cleanupList); + if (rc) + return(rc); + } + } + + spareQueues = &raidPtr->Queues[0][raidPtr->numCol]; + for (r=0; r<raidPtr->numSpare; r++) { + rc = config_disk_queue(raidPtr, &spareQueues[r], + 0, raidPtr->numCol+r, p, + raidPtr->sectorsPerDisk, + raidPtr->Disks[0][raidPtr->numCol+r].dev, + cfgPtr->maxOutstandingDiskReqs, listp, + raidPtr->cleanupList); + if (rc) + return(rc); + } + return(0); +} + +/* Enqueue a disk I/O + * + * Unfortunately, we have to do things differently in the different + * environments (simulator, user-level, kernel). + * At user level, all I/O is blocking, so we have 1 or more threads/disk + * and the thread that enqueues is different from the thread that dequeues. + * In the kernel, I/O is non-blocking and so we'd like to have multiple + * I/Os outstanding on the physical disks when possible. + * + * when any request arrives at a queue, we have two choices: + * dispatch it to the lower levels + * queue it up + * + * kernel rules for when to do what: + * locking request: queue empty => dispatch and lock queue, + * else queue it + * unlocking req : always dispatch it + * normal req : queue empty => dispatch it & set priority + * queue not full & priority is ok => dispatch it + * else queue it + * + * user-level rules: + * always enqueue. In the special case of an unlocking op, enqueue + * in a special way that will cause the unlocking op to be the next + * thing dequeued. + * + * simulator rules: + * Do the same as at user level, with the sleeps and wakeups suppressed. 
+ */ +void rf_DiskIOEnqueue(queue, req, pri) + RF_DiskQueue_t *queue; + RF_DiskQueueData_t *req; + int pri; +{ + int tid; + + RF_ETIMER_START(req->qtime); + rf_get_threadid(tid); + RF_ASSERT(req->type == RF_IO_TYPE_NOP || req->numSector); + req->priority = pri; + + if (rf_queueDebug && (req->numSector == 0)) { + printf("Warning: Enqueueing zero-sector access\n"); + } + +#ifdef KERNEL + /* + * kernel + */ + RF_LOCK_QUEUE_MUTEX( queue, "DiskIOEnqueue" ); + /* locking request */ + if (RF_LOCKING_REQ(req)) { + if (RF_QUEUE_EMPTY(queue)) { + Dprintf3("Dispatching pri %d locking op to r %d c %d (queue empty)\n",pri,queue->row, queue->col); + RF_LOCK_QUEUE(queue); + rf_DispatchKernelIO(queue, req); + } else { + queue->queueLength++; /* increment count of number of requests waiting in this queue */ + Dprintf3("Enqueueing pri %d locking op to r %d c %d (queue not empty)\n",pri,queue->row, queue->col); + req->queue = (void *)queue; + (queue->qPtr->Enqueue)(queue->qHdr, req, pri); + } + } + /* unlocking request */ + else if (RF_UNLOCKING_REQ(req)) { /* we'll do the actual unlock when this I/O completes */ + Dprintf3("Dispatching pri %d unlocking op to r %d c %d\n",pri,queue->row, queue->col); + RF_ASSERT(RF_QUEUE_LOCKED(queue)); + rf_DispatchKernelIO(queue, req); + } + /* normal request */ + else if (RF_OK_TO_DISPATCH(queue, req)) { + Dprintf3("Dispatching pri %d regular op to r %d c %d (ok to dispatch)\n",pri,queue->row, queue->col); + rf_DispatchKernelIO(queue, req); + } else { + queue->queueLength++; /* increment count of number of requests waiting in this queue */ + Dprintf3("Enqueueing pri %d regular op to r %d c %d (not ok to dispatch)\n",pri,queue->row, queue->col); + req->queue = (void *)queue; + (queue->qPtr->Enqueue)(queue->qHdr, req, pri); + } + RF_UNLOCK_QUEUE_MUTEX( queue, "DiskIOEnqueue" ); + +#else /* KERNEL */ + /* + * user-level + */ + RF_LOCK_QUEUE_MUTEX( queue, "DiskIOEnqueue" ); + queue->queueLength++; /* increment count of number of requests waiting in this queue */ + /* unlocking request */ + if (RF_UNLOCKING_REQ(req)) { + Dprintf4("[%d] enqueueing pri %d unlocking op & signalling r %d c %d\n", tid, pri, queue->row, queue->col); + RF_ASSERT(RF_QUEUE_LOCKED(queue) && queue->unlockingOp == NULL); + queue->unlockingOp = req; + } + /* locking and normal requests */ + else { + req->queue = (void *)queue; + Dprintf5("[%d] enqueueing pri %d %s op & signalling r %d c %d\n", tid, pri, + (RF_LOCKING_REQ(req)) ? 
"locking" : "regular",queue->row,queue->col); + (queue->qPtr->Enqueue)(queue->qHdr, req, pri); + } + SIGNAL_DISK_QUEUE( queue, "DiskIOEnqueue"); + RF_UNLOCK_QUEUE_MUTEX( queue, "DiskIOEnqueue" ); +#endif /* KERNEL */ +} + +#if !defined(KERNEL) && !defined(SIMULATE) +/* user-level only: tell all threads to wake up & recheck the queue */ +void rf_BroadcastOnQueue(queue) + RF_DiskQueue_t *queue; +{ + int i; + + if (queue->maxOutstanding > 1) for (i=0; i<queue->maxOutstanding; i++) { + SIGNAL_DISK_QUEUE(queue, "BroadcastOnQueue" ); + } +} +#endif /* !KERNEL && !SIMULATE */ + +#ifndef KERNEL /* not used in kernel */ + +RF_DiskQueueData_t *rf_DiskIODequeue(queue) + RF_DiskQueue_t *queue; +{ + RF_DiskQueueData_t *p, *headItem; + int tid; + + rf_get_threadid(tid); + RF_LOCK_QUEUE_MUTEX( queue, "DiskIODequeue" ); + for (p=NULL; !p; ) { + if (queue->unlockingOp) { + /* unlocking request */ + RF_ASSERT(RF_QUEUE_LOCKED(queue)); + p = queue->unlockingOp; + queue->unlockingOp = NULL; + Dprintf4("[%d] dequeueing pri %d unlocking op r %d c %d\n", tid, p->priority, queue->row,queue->col); + } + else { + headItem = (queue->qPtr->Peek)(queue->qHdr); + if (headItem) { + if (RF_LOCKING_REQ(headItem)) { + /* locking request */ + if (!RF_QUEUE_LOCKED(queue)) { + /* queue isn't locked, so dequeue the request & lock the queue */ + p = (queue->qPtr->Dequeue)( queue->qHdr ); + if (p) + Dprintf4("[%d] dequeueing pri %d locking op r %d c %d\n", tid, p->priority, queue->row, queue->col); + else + Dprintf3("[%d] no dequeue -- raw queue empty r %d c %d\n", tid, queue->row, queue->col); + } + else { + /* queue already locked, no dequeue occurs */ + Dprintf3("[%d] no dequeue -- queue is locked r %d c %d\n", tid, queue->row, queue->col); + p = NULL; + } + } + else { + /* normal request, always dequeue and assume caller already has lock (if needed) */ + p = (queue->qPtr->Dequeue)( queue->qHdr ); + if (p) + Dprintf4("[%d] dequeueing pri %d regular op r %d c %d\n", tid, p->priority, queue->row, queue->col); + else + Dprintf3("[%d] no dequeue -- raw queue empty r %d c %d\n", tid, queue->row, queue->col); + } + } + else { + Dprintf3("[%d] no dequeue -- raw queue empty r %d c %d\n", tid, queue->row, queue->col); + } + } + + if (queue->raidPtr->terminate_disk_queues) { + p = NULL; + break; + } +#ifdef SIMULATE + break; /* in simulator, return NULL on empty queue instead of blocking */ +#else /* SIMULATE */ + if (!p) { + Dprintf3("[%d] nothing to dequeue: waiting r %d c %d\n", tid, queue->row, queue->col); + WAIT_DISK_QUEUE( queue, "DiskIODequeue" ); + } +#endif /* SIMULATE */ + } + + if (p) { + queue->queueLength--; /* decrement count of number of requests waiting in this queue */ + RF_ASSERT(queue->queueLength >= 0); + queue->numOutstanding++; + queue->last_deq_sector = p->sectorOffset; + /* record the amount of time this request spent in the disk queue */ + RF_ETIMER_STOP(p->qtime); + RF_ETIMER_EVAL(p->qtime); + if (p->tracerec) + p->tracerec->diskqueue_us += RF_ETIMER_VAL_US(p->qtime); + } + + if (p && RF_LOCKING_REQ(p)) { + RF_ASSERT(!RF_QUEUE_LOCKED(queue)); + Dprintf3("[%d] locking queue r %d c %d\n",tid,queue->row,queue->col); + RF_LOCK_QUEUE(queue); + } + RF_UNLOCK_QUEUE_MUTEX( queue, "DiskIODequeue" ); + + return(p); +} + +#else /* !KERNEL */ + +/* get the next set of I/Os started, kernel version only */ +void rf_DiskIOComplete(queue, req, status) + RF_DiskQueue_t *queue; + RF_DiskQueueData_t *req; + int status; +{ + int done=0; + + RF_LOCK_QUEUE_MUTEX( queue, "DiskIOComplete" ); + + /* unlock the queue: + (1) after an 
unlocking req completes + (2) after a locking req fails + */ + if (RF_UNLOCKING_REQ(req) || (RF_LOCKING_REQ(req) && status)) { + Dprintf2("DiskIOComplete: unlocking queue at r %d c %d\n", queue->row, queue->col); + RF_ASSERT(RF_QUEUE_LOCKED(queue) && (queue->unlockingOp == NULL)); + RF_UNLOCK_QUEUE(queue); + } + + queue->numOutstanding--; + RF_ASSERT(queue->numOutstanding >= 0); + + /* dispatch requests to the disk until we find one that we can't. */ + /* no reason to continue once we've filled up the queue */ + /* no reason to even start if the queue is locked */ + + while (!done && !RF_QUEUE_FULL(queue) && !RF_QUEUE_LOCKED(queue)) { + if (queue->nextLockingOp) { + req = queue->nextLockingOp; queue->nextLockingOp = NULL; + Dprintf3("DiskIOComplete: a pri %d locking req was pending at r %d c %d\n",req->priority,queue->row, queue->col); + } else { + req = (queue->qPtr->Dequeue)( queue->qHdr ); + if (req != NULL) { + Dprintf3("DiskIOComplete: extracting pri %d req from queue at r %d c %d\n",req->priority,queue->row, queue->col); + } else { + Dprintf1("DiskIOComplete: no more requests to extract.\n",""); + } + } + if (req) { + queue->queueLength--; /* decrement count of number of requests waiting in this queue */ + RF_ASSERT(queue->queueLength >= 0); + } + if (!req) done=1; + else if (RF_LOCKING_REQ(req)) { + if (RF_QUEUE_EMPTY(queue)) { /* dispatch it */ + Dprintf3("DiskIOComplete: dispatching pri %d locking req to r %d c %d (queue empty)\n",req->priority,queue->row, queue->col); + RF_LOCK_QUEUE(queue); + rf_DispatchKernelIO(queue, req); + done = 1; + } else { /* put it aside to wait for the queue to drain */ + Dprintf3("DiskIOComplete: postponing pri %d locking req to r %d c %d\n",req->priority,queue->row, queue->col); + RF_ASSERT(queue->nextLockingOp == NULL); + queue->nextLockingOp = req; + done = 1; + } + } else if (RF_UNLOCKING_REQ(req)) { /* should not happen: unlocking ops should not get queued */ + RF_ASSERT(RF_QUEUE_LOCKED(queue)); /* support it anyway for the future */ + Dprintf3("DiskIOComplete: dispatching pri %d unl req to r %d c %d (SHOULD NOT SEE THIS)\n",req->priority,queue->row, queue->col); + rf_DispatchKernelIO(queue, req); + done = 1; + } else if (RF_OK_TO_DISPATCH(queue, req)) { + Dprintf3("DiskIOComplete: dispatching pri %d regular req to r %d c %d (ok to dispatch)\n",req->priority,queue->row, queue->col); + rf_DispatchKernelIO(queue, req); + } else { /* we can't dispatch it, so just re-enqueue it. */ + /* potential trouble here if disk queues batch reqs */ + Dprintf3("DiskIOComplete: re-enqueueing pri %d regular req to r %d c %d\n",req->priority,queue->row, queue->col); + queue->queueLength++; + (queue->qPtr->Enqueue)(queue->qHdr, req, req->priority); + done = 1; + } + } + + RF_UNLOCK_QUEUE_MUTEX( queue, "DiskIOComplete" ); +} +#endif /* !KERNEL */ + +/* promotes accesses tagged with the given parityStripeID from low priority + * to normal priority. This promotion is optional, meaning that a queue + * need not implement it. If there is no promotion routine associated with + * a queue, this routine does nothing and returns -1. 
+ */ +int rf_DiskIOPromote(queue, parityStripeID, which_ru) + RF_DiskQueue_t *queue; + RF_StripeNum_t parityStripeID; + RF_ReconUnitNum_t which_ru; +{ + int retval; + + if (!queue->qPtr->Promote) + return(-1); + RF_LOCK_QUEUE_MUTEX( queue, "DiskIOPromote" ); + retval = (queue->qPtr->Promote)( queue->qHdr, parityStripeID, which_ru ); + RF_UNLOCK_QUEUE_MUTEX( queue, "DiskIOPromote" ); + return(retval); +} + +RF_DiskQueueData_t *rf_CreateDiskQueueData( + RF_IoType_t typ, + RF_SectorNum_t ssect, + RF_SectorCount_t nsect, + caddr_t buf, + RF_StripeNum_t parityStripeID, + RF_ReconUnitNum_t which_ru, + int (*wakeF)(void *,int), + void *arg, + RF_DiskQueueData_t *next, + RF_AccTraceEntry_t *tracerec, + void *raidPtr, + RF_DiskQueueDataFlags_t flags, + void *kb_proc) +{ + RF_DiskQueueData_t *p; + + RF_FREELIST_GET_INIT(rf_dqd_freelist,p,next,(RF_DiskQueueData_t *),init_dqd); + + p->sectorOffset = ssect + rf_protectedSectors; + p->numSector = nsect; + p->type = typ; + p->buf = buf; + p->parityStripeID= parityStripeID; + p->which_ru = which_ru; + p->CompleteFunc = wakeF; + p->argument = arg; + p->next = next; + p->tracerec = tracerec; + p->priority = RF_IO_NORMAL_PRIORITY; + p->AuxFunc = NULL; + p->buf2 = NULL; +#ifdef SIMULATE + p->owner = rf_GetCurrentOwner(); +#endif /* SIMULATE */ + p->raidPtr = raidPtr; + p->flags = flags; +#ifdef KERNEL + p->b_proc = kb_proc; +#endif /* KERNEL */ + return(p); +} + +RF_DiskQueueData_t *rf_CreateDiskQueueDataFull( + RF_IoType_t typ, + RF_SectorNum_t ssect, + RF_SectorCount_t nsect, + caddr_t buf, + RF_StripeNum_t parityStripeID, + RF_ReconUnitNum_t which_ru, + int (*wakeF)(void *,int), + void *arg, + RF_DiskQueueData_t *next, + RF_AccTraceEntry_t *tracerec, + int priority, + int (*AuxFunc)(void *,...), + caddr_t buf2, + void *raidPtr, + RF_DiskQueueDataFlags_t flags, + void *kb_proc) +{ + RF_DiskQueueData_t *p; + + RF_FREELIST_GET_INIT(rf_dqd_freelist,p,next,(RF_DiskQueueData_t *),init_dqd); + + p->sectorOffset = ssect + rf_protectedSectors; + p->numSector = nsect; + p->type = typ; + p->buf = buf; + p->parityStripeID= parityStripeID; + p->which_ru = which_ru; + p->CompleteFunc = wakeF; + p->argument = arg; + p->next = next; + p->tracerec = tracerec; + p->priority = priority; + p->AuxFunc = AuxFunc; + p->buf2 = buf2; +#ifdef SIMULATE + p->owner = rf_GetCurrentOwner(); +#endif /* SIMULATE */ + p->raidPtr = raidPtr; + p->flags = flags; +#ifdef KERNEL + p->b_proc = kb_proc; +#endif /* KERNEL */ + return(p); +} + +void rf_FreeDiskQueueData(p) + RF_DiskQueueData_t *p; +{ + RF_FREELIST_FREE_CLEAN(rf_dqd_freelist,p,next,clean_dqd); +} diff --git a/sys/dev/raidframe/rf_diskqueue.h b/sys/dev/raidframe/rf_diskqueue.h new file mode 100644 index 00000000000..20878553479 --- /dev/null +++ b/sys/dev/raidframe/rf_diskqueue.h @@ -0,0 +1,315 @@ +/* $OpenBSD: rf_diskqueue.h,v 1.1 1999/01/11 14:29:17 niklas Exp $ */ +/* $NetBSD: rf_diskqueue.h,v 1.1 1998/11/13 04:20:29 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. 
CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/***************************************************************************************** + * + * rf_diskqueue.h -- header file for disk queues + * + * see comments in rf_diskqueue.c + * + ****************************************************************************************/ +/* + * + * : + * + * Log: rf_diskqueue.h,v + * Revision 1.31 1996/08/07 21:08:49 jimz + * b_proc -> kb_proc (IRIX complained) + * + * Revision 1.30 1996/06/18 20:53:11 jimz + * fix up disk queueing (remove configure routine, + * add shutdown list arg to create routines) + * + * Revision 1.29 1996/06/13 20:38:19 jimz + * fix queue type in DiskQueueData + * + * Revision 1.28 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). + * + * Revision 1.27 1996/06/07 22:26:27 jimz + * type-ify which_ru (RF_ReconUnitNum_t) + * + * Revision 1.26 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.25 1996/06/06 17:29:12 jimz + * track arm position of last I/O dequeued + * + * Revision 1.24 1996/06/05 18:06:02 jimz + * Major code cleanup. The Great Renaming is now done. + * Better modularity. Better typing. Fixed a bunch of + * synchronization bugs. Made a lot of global stuff + * per-desc or per-array. Removed dead code. + * + * Revision 1.23 1996/06/02 17:31:48 jimz + * Moved a lot of global stuff into array structure, where it belongs. + * Fixed up paritylogging, pss modules in this manner. Some general + * code cleanup. Removed lots of dead code, some dead files. + * + * Revision 1.22 1996/05/30 23:22:16 jimz + * bugfixes of serialization, timing problems + * more cleanup + * + * Revision 1.21 1996/05/30 11:29:41 jimz + * Numerous bug fixes. Stripe lock release code disagreed with the taking code + * about when stripes should be locked (I made it consistent: no parity, no lock) + * There was a lot of extra serialization of I/Os which I've removed- a lot of + * it was to calculate values for the cache code, which is no longer with us. + * More types, function, macro cleanup. Added code to properly quiesce the array + * on shutdown. Made a lot of stuff array-specific which was (bogusly) general + * before. Fixed memory allocation, freeing bugs. 
+ * + * Revision 1.20 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.19 1996/05/24 01:59:45 jimz + * another checkpoint in code cleanup for release + * time to sync kernel tree + * + * Revision 1.18 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.17 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.16 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.15 1996/05/10 19:39:31 jimz + * add prev pointer to DiskQueueData + * + * Revision 1.14 1996/05/10 16:24:04 jimz + * mark old defines as deprecated, add RF_ defines + * + * Revision 1.13 1995/12/01 15:59:04 root + * added copyright info + * + * Revision 1.12 1995/11/07 16:26:44 wvcii + * added Peek() function to diskqueuesw + * + * Revision 1.11 1995/10/05 02:33:15 jimz + * made queue lens longs (less instructions to read :-) + * + * Revision 1.10 1995/10/04 07:07:07 wvcii + * queue->numOutstanding now valid for user & sim + * user tested & verified, sim untested + * + * Revision 1.9 1995/09/12 00:21:37 wvcii + * added support for tracing disk queue time + * + * Revision 1.8 95/04/24 13:25:51 holland + * rewrite to move disk queues, recon, & atomic RMW to kernel + * + * Revision 1.6.10.2 1995/04/03 20:13:56 holland + * added numOutstanding and maxOutstanding to support moving + * disk queues into kernel code + * + * Revision 1.6.10.1 1995/04/03 20:03:56 holland + * initial checkin on branch + * + * Revision 1.6 1995/03/03 18:34:33 rachad + * Simulator mechanism added + * + * Revision 1.5 1995/03/01 20:25:48 holland + * kernelization changes + * + * Revision 1.4 1995/02/03 22:31:36 holland + * many changes related to kernelization + * + * Revision 1.3 1995/02/01 14:25:19 holland + * began changes for kernelization: + * changed all instances of mutex_t and cond_t to DECLARE macros + * converted configuration code to use config structure + * + * Revision 1.2 1994/11/29 20:36:02 danner + * Added symbolic constants for io_type (e.g,IO_TYPE_READ) + * and support for READ_OP_WRITE + * + */ + + +#ifndef _RF__RF_DISKQUEUE_H_ +#define _RF__RF_DISKQUEUE_H_ + +#include "rf_threadstuff.h" +#include "rf_acctrace.h" +#include "rf_alloclist.h" +#include "rf_types.h" +#include "rf_etimer.h" + + +#ifdef _KERNEL +#if defined(__NetBSD__) +#include "rf_netbsd.h" +#elif defined(__OpenBSD__) +#include "rf_openbsd.h" +#endif +#endif + + +#define RF_IO_NORMAL_PRIORITY 1 +#define RF_IO_LOW_PRIORITY 0 + +/* the data held by a disk queue entry */ +struct RF_DiskQueueData_s { + RF_SectorNum_t sectorOffset; /* sector offset into the disk */ + RF_SectorCount_t numSector; /* number of sectors to read/write */ + RF_IoType_t type; /* read/write/nop */ + caddr_t buf; /* buffer pointer */ + RF_StripeNum_t parityStripeID; /* the RAID parity stripe ID this access is for */ + RF_ReconUnitNum_t which_ru; /* which RU within this parity stripe */ + int priority; /* the priority of this request */ + int (*CompleteFunc)(void *,int);/* function to be called upon completion */ + int (*AuxFunc)(void *,...); /* function called upon completion of the first I/O of a Read_Op_Write pair*/ + void *argument; /* argument to be passed to CompleteFunc */ +#ifdef SIMULATE + RF_Owner_t owner; /* which task is responsible 
for this request */ +#endif /* SIMULATE */ + void *raidPtr; /* needed for simulation */ + RF_AccTraceEntry_t *tracerec; /* perf mon only */ + RF_Etimer_t qtime; /* perf mon only - time request is in queue */ + long entryTime; + RF_DiskQueueData_t *next; + RF_DiskQueueData_t *prev; + caddr_t buf2; /* for read-op-write */ + dev_t dev; /* the device number for in-kernel version */ + RF_DiskQueue_t *queue; /* the disk queue to which this req is targeted */ + RF_DiskQueueDataFlags_t flags; /* flags controlling operation */ + +#ifdef KERNEL + struct proc *b_proc; /* the b_proc from the original bp passed into the driver for this I/O */ + struct buf *bp; /* a bp to use to get this I/O done */ +#endif /* KERNEL */ +}; + +#define RF_LOCK_DISK_QUEUE 0x01 +#define RF_UNLOCK_DISK_QUEUE 0x02 + +/* note: "Create" returns type-specific queue header pointer cast to (void *) */ +struct RF_DiskQueueSW_s { + RF_DiskQueueType_t queueType; + void *(*Create)(RF_SectorCount_t, RF_AllocListElem_t *, RF_ShutdownList_t **); /* creation routine -- one call per queue in system */ + void (*Enqueue)(void *,RF_DiskQueueData_t * ,int); /* enqueue routine */ + RF_DiskQueueData_t *(*Dequeue)(void *); /* dequeue routine */ + RF_DiskQueueData_t *(*Peek)(void *); /* peek at head of queue */ + + /* the rest are optional: they improve performance, but the driver will deal with it if they don't exist */ + int (*Promote)(void *, RF_StripeNum_t, RF_ReconUnitNum_t); /* promotes priority of tagged accesses */ +}; + +struct RF_DiskQueue_s { + RF_DiskQueueSW_t *qPtr; /* access point to queue functions */ + void *qHdr; /* queue header, of whatever type */ + RF_DECLARE_MUTEX(mutex) /* mutex locking data structures */ + RF_DECLARE_COND(cond) /* condition variable for synchronization */ + long numOutstanding; /* number of I/Os currently outstanding on disk */ + long maxOutstanding; /* max # of I/Os that can be outstanding on a disk (in-kernel only) */ + int curPriority; /* the priority of accs all that are currently outstanding */ + long queueLength; /* number of requests in queue */ + RF_DiskQueueData_t *nextLockingOp; /* a locking op that has arrived at the head of the queue & is waiting for drainage */ + RF_DiskQueueData_t *unlockingOp; /* used at user level to communicate unlocking op b/w user (or dag exec) & disk threads */ + int numWaiting; /* number of threads waiting on this variable. user-level only */ + RF_DiskQueueFlags_t flags; /* terminate, locked */ + RF_Raid_t *raidPtr; /* associated array */ + dev_t dev; /* device number for kernel version */ + RF_SectorNum_t last_deq_sector; /* last sector number dequeued or dispatched */ + int row, col; /* debug only */ +#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL) + struct raidcinfo *rf_cinfo; /* disks component info.. 
*/ +#endif +}; + +#define RF_DQ_LOCKED 0x02 /* no new accs allowed until queue is explicitly unlocked */ + +/* macros setting & returning information about queues and requests */ +#define RF_QUEUE_LOCKED(_q) ((_q)->flags & RF_DQ_LOCKED) +#define RF_QUEUE_EMPTY(_q) (((_q)->numOutstanding == 0) && ((_q)->nextLockingOp == NULL) && !RF_QUEUE_LOCKED(_q)) +#define RF_QUEUE_FULL(_q) ((_q)->numOutstanding == (_q)->maxOutstanding) + +#define RF_LOCK_QUEUE(_q) (_q)->flags |= RF_DQ_LOCKED +#define RF_UNLOCK_QUEUE(_q) (_q)->flags &= ~RF_DQ_LOCKED + +#define RF_LOCK_QUEUE_MUTEX(_q_,_wh_) RF_LOCK_MUTEX((_q_)->mutex) +#define RF_UNLOCK_QUEUE_MUTEX(_q_,_wh_) RF_UNLOCK_MUTEX((_q_)->mutex) + +#define RF_LOCKING_REQ(_r) ((_r)->flags & RF_LOCK_DISK_QUEUE) +#define RF_UNLOCKING_REQ(_r) ((_r)->flags & RF_UNLOCK_DISK_QUEUE) + +/* whether it is ok to dispatch a regular request */ +#define RF_OK_TO_DISPATCH(_q_,_r_) \ + (RF_QUEUE_EMPTY(_q_) || \ + (!RF_QUEUE_FULL(_q_) && ((_r_)->priority >= (_q_)->curPriority))) + +int rf_ConfigureDiskQueueSystem(RF_ShutdownList_t **listp); + +void rf_TerminateDiskQueues(RF_Raid_t *raidPtr); + +int rf_ConfigureDiskQueues(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr, + RF_Config_t *cfgPtr); + +void rf_DiskIOEnqueue(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req, int pri); + +#if !defined(KERNEL) && !defined(SIMULATE) +void rf_BroadcastOnQueue(RF_DiskQueue_t *queue); +#endif /* !KERNEL && !SIMULATE */ + +#ifndef KERNEL +RF_DiskQueueData_t *rf_DiskIODequeue(RF_DiskQueue_t *queue); +#else /* !KERNEL */ +void rf_DiskIOComplete(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req, int status); +#endif /* !KERNEL */ + +int rf_DiskIOPromote(RF_DiskQueue_t *queue, RF_StripeNum_t parityStripeID, + RF_ReconUnitNum_t which_ru); + +RF_DiskQueueData_t *rf_CreateDiskQueueData(RF_IoType_t typ, + RF_SectorNum_t ssect, RF_SectorCount_t nsect, caddr_t buf, + RF_StripeNum_t parityStripeID, RF_ReconUnitNum_t which_ru, + int (*wakeF)(void *, int), + void *arg, RF_DiskQueueData_t *next, RF_AccTraceEntry_t *tracerec, + void *raidPtr, RF_DiskQueueDataFlags_t flags, void *kb_proc); + +RF_DiskQueueData_t *rf_CreateDiskQueueDataFull(RF_IoType_t typ, + RF_SectorNum_t ssect, RF_SectorCount_t nsect, caddr_t buf, + RF_StripeNum_t parityStripeID, RF_ReconUnitNum_t which_ru, + int (*wakeF)(void *, int), + void *arg, RF_DiskQueueData_t *next, RF_AccTraceEntry_t *tracerec, + int priority, int (*AuxFunc)(void *,...), caddr_t buf2, + void *raidPtr, RF_DiskQueueDataFlags_t flags, void *kb_proc); + +void rf_FreeDiskQueueData(RF_DiskQueueData_t *p); + +#endif /* !_RF__RF_DISKQUEUE_H_ */ diff --git a/sys/dev/raidframe/rf_disks.c b/sys/dev/raidframe/rf_disks.c new file mode 100644 index 00000000000..fc89d407f47 --- /dev/null +++ b/sys/dev/raidframe/rf_disks.c @@ -0,0 +1,651 @@ +/* $OpenBSD: rf_disks.c,v 1.1 1999/01/11 14:29:17 niklas Exp $ */ +/* $NetBSD: rf_disks.c,v 1.2 1998/12/03 15:06:25 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. 
CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/*************************************************************** + * rf_disks.c -- code to perform operations on the actual disks + ***************************************************************/ + +/* : + * Log: rf_disks.c,v + * Revision 1.32 1996/07/27 18:40:24 jimz + * cleanup sweep + * + * Revision 1.31 1996/07/22 19:52:16 jimz + * switched node params to RF_DagParam_t, a union of + * a 64-bit int and a void *, for better portability + * attempted hpux port, but failed partway through for + * lack of a single C compiler capable of compiling all + * source files + * + * Revision 1.30 1996/07/19 16:11:21 jimz + * pass devname to DoReadCapacity + * + * Revision 1.29 1996/07/18 22:57:14 jimz + * port simulator to AIX + * + * Revision 1.28 1996/07/10 22:28:38 jimz + * get rid of obsolete row statuses (dead,degraded2) + * + * Revision 1.27 1996/06/10 12:06:14 jimz + * don't do any SCSI op stuff in simulator at all + * + * Revision 1.26 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). + * + * Revision 1.25 1996/06/09 02:36:46 jimz + * lots of little crufty cleanup- fixup whitespace + * issues, comment #ifdefs, improve typing in some + * places (esp size-related) + * + * Revision 1.24 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.23 1996/06/03 23:28:26 jimz + * more bugfixes + * check in tree to sync for IPDS runs with current bugfixes + * there still may be a problem with threads in the script test + * getting I/Os stuck- not trivially reproducible (runs ~50 times + * in a row without getting stuck) + * + * Revision 1.22 1996/06/02 17:31:48 jimz + * Moved a lot of global stuff into array structure, where it belongs. + * Fixed up paritylogging, pss modules in this manner. Some general + * code cleanup. Removed lots of dead code, some dead files. + * + * Revision 1.21 1996/05/30 23:22:16 jimz + * bugfixes of serialization, timing problems + * more cleanup + * + * Revision 1.20 1996/05/30 11:29:41 jimz + * Numerous bug fixes. Stripe lock release code disagreed with the taking code + * about when stripes should be locked (I made it consistent: no parity, no lock) + * There was a lot of extra serialization of I/Os which I've removed- a lot of + * it was to calculate values for the cache code, which is no longer with us. + * More types, function, macro cleanup. Added code to properly quiesce the array + * on shutdown. Made a lot of stuff array-specific which was (bogusly) general + * before. Fixed memory allocation, freeing bugs. 
+ * + * Revision 1.19 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.18 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.17 1996/05/24 01:59:45 jimz + * another checkpoint in code cleanup for release + * time to sync kernel tree + * + * Revision 1.16 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.15 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.14 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.13 1996/05/02 14:57:43 jimz + * initialize sectorMask + * + * Revision 1.12 1995/12/01 15:57:04 root + * added copyright info + * + */ + +#include "rf_types.h" +#include "rf_raid.h" +#include "rf_alloclist.h" +#include "rf_utils.h" +#include "rf_configure.h" +#include "rf_general.h" +#if !defined(__NetBSD__) && !defined(__OpenBSD__) +#include "rf_camlayer.h" +#endif +#include "rf_options.h" +#include "rf_sys.h" + +#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL) +#include <sys/types.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/ioctl.h> +#include <sys/fcntl.h> +#ifdef __NETBSD__ +#include <sys/vnode.h> +#endif + +int raidlookup __P((char *, struct proc *p, struct vnode **)); +#endif + +#ifdef SIMULATE +static char disk_db_file_name[120], disk_type_name[120]; +static double init_offset; +#endif /* SIMULATE */ + +#define DPRINTF6(a,b,c,d,e,f) if (rf_diskDebug) printf(a,b,c,d,e,f) +#define DPRINTF7(a,b,c,d,e,f,g) if (rf_diskDebug) printf(a,b,c,d,e,f,g) + +#include "rf_ccmn.h" + +/**************************************************************************************** + * + * initialize the disks comprising the array + * + * We want the spare disks to have regular row,col numbers so that we can easily + * substitue a spare for a failed disk. But, the driver code assumes throughout + * that the array contains numRow by numCol _non-spare_ disks, so it's not clear + * how to fit in the spares. This is an unfortunate holdover from raidSim. The + * quick and dirty fix is to make row zero bigger than the rest, and put all the + * spares in it. This probably needs to get changed eventually. 
+ * + ***************************************************************************************/ +int rf_ConfigureDisks( + RF_ShutdownList_t **listp, + RF_Raid_t *raidPtr, + RF_Config_t *cfgPtr) +{ + RF_RaidDisk_t **disks; + RF_SectorCount_t min_numblks = (RF_SectorCount_t)0x7FFFFFFFFFFFLL; + RF_RowCol_t r, c; + int bs, ret; + unsigned i, count, foundone=0, numFailuresThisRow; + RF_DiskOp_t *rdcap_op = NULL, *tur_op = NULL; + int num_rows_done,num_cols_done; + +#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL) + struct proc *proc = 0; +#endif +#ifndef SIMULATE +#if !defined(__NetBSD__) && !defined(__OpenBSD__) + ret = rf_SCSI_AllocReadCapacity(&rdcap_op); + if (ret) + goto fail; + ret = rf_SCSI_AllocTUR(&tur_op); + if (ret) + goto fail; +#endif /* !__NetBSD__ && !__OpenBSD__ */ +#endif /* !SIMULATE */ + + num_rows_done = 0; + num_cols_done = 0; + + + RF_CallocAndAdd(disks, raidPtr->numRow, sizeof(RF_RaidDisk_t *), (RF_RaidDisk_t **), raidPtr->cleanupList); + if (disks == NULL) { + ret = ENOMEM; + goto fail; + } + raidPtr->Disks = disks; + +#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL) + + proc = raidPtr->proc; /* Blah XXX */ + + /* get space for the device-specific stuff... */ + RF_CallocAndAdd(raidPtr->raid_cinfo, raidPtr->numRow, + sizeof(struct raidcinfo *), (struct raidcinfo **), + raidPtr->cleanupList); + if (raidPtr->raid_cinfo == NULL) { + ret = ENOMEM; + goto fail; + } +#endif + + for (r=0; r<raidPtr->numRow; r++) { + numFailuresThisRow = 0; + RF_CallocAndAdd(disks[r], raidPtr->numCol + ((r==0) ? raidPtr->numSpare : 0), sizeof(RF_RaidDisk_t), (RF_RaidDisk_t *), raidPtr->cleanupList); + if (disks[r] == NULL) { + ret = ENOMEM; + goto fail; + } + + /* get more space for device specific stuff.. */ + RF_CallocAndAdd(raidPtr->raid_cinfo[r], + raidPtr->numCol + ((r==0) ? 
raidPtr->numSpare : 0), + sizeof(struct raidcinfo), (struct raidcinfo *), + raidPtr->cleanupList); + if (raidPtr->raid_cinfo[r] == NULL) { + ret = ENOMEM; + goto fail; + } + + + for (c=0; c<raidPtr->numCol; c++) { + ret = rf_ConfigureDisk(raidPtr,&cfgPtr->devnames[r][c][0], + &disks[r][c], rdcap_op, tur_op, + cfgPtr->devs[r][c],r,c); + if (ret) + goto fail; + if (disks[r][c].status != rf_ds_optimal) { + numFailuresThisRow++; + } + else { + if (disks[r][c].numBlocks < min_numblks) + min_numblks = disks[r][c].numBlocks; + DPRINTF7("Disk at row %d col %d: dev %s numBlocks %ld blockSize %d (%ld MB)\n", + r,c,disks[r][c].devname, + (long int) disks[r][c].numBlocks, + disks[r][c].blockSize, + (long int) disks[r][c].numBlocks * disks[r][c].blockSize / 1024 / 1024); + } + num_cols_done++; + } + /* XXX fix for n-fault tolerant */ + if (numFailuresThisRow > 0) + raidPtr->status[r] = rf_rs_degraded; + num_rows_done++; + } +#ifndef SIMULATE +#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL) + /* we do nothing */ +#else + rf_SCSI_FreeDiskOp(rdcap_op, 1); rdcap_op = NULL; + rf_SCSI_FreeDiskOp(tur_op, 0); tur_op = NULL; +#endif +#endif /* !SIMULATE */ + /* all disks must be the same size & have the same block size, bs must be a power of 2 */ + bs = 0; + for (foundone=r=0; !foundone && r<raidPtr->numRow; r++) { + for (c=0; !foundone && c<raidPtr->numCol; c++) { + if (disks[r][c].status == rf_ds_optimal) { + bs = disks[r][c].blockSize; + foundone = 1; + } + } + } + if (!foundone) { + RF_ERRORMSG("RAIDFRAME: Did not find any live disks in the array.\n"); + ret = EINVAL; + goto fail; + } + for (count=0,i=1; i; i<<=1) if (bs & i) + count++; + if (count != 1) { + RF_ERRORMSG1("Error: block size on disks (%d) must be a power of 2\n",bs); + ret = EINVAL; + goto fail; + } + for (r=0; r<raidPtr->numRow; r++) { + for (c=0; c<raidPtr->numCol; c++) { + if (disks[r][c].status == rf_ds_optimal) { + if (disks[r][c].blockSize != bs) { + RF_ERRORMSG2("Error: block size of disk at r %d c %d different from disk at r 0 c 0\n",r,c); + ret = EINVAL; + goto fail; + } + if (disks[r][c].numBlocks != min_numblks) { + RF_ERRORMSG3("WARNING: truncating disk at r %d c %d to %d blocks\n", + r,c,(int) min_numblks); + disks[r][c].numBlocks = min_numblks; + } + } + } + } + + raidPtr->sectorsPerDisk = min_numblks; + raidPtr->logBytesPerSector = ffs(bs) - 1; + raidPtr->bytesPerSector = bs; + raidPtr->sectorMask = bs-1; + return(0); + +fail: + +#ifndef SIMULATE +#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL) + + for(r=0;r<raidPtr->numRow;r++) { + for(c=0;c<raidPtr->numCol;c++) { + /* Cleanup.. */ +#ifdef DEBUG + printf("Cleaning up row: %d col: %d\n",r,c); +#endif + if (raidPtr->raid_cinfo[r][c].ci_vp) { + (void)vn_close(raidPtr->raid_cinfo[r][c].ci_vp, + FREAD|FWRITE, proc->p_ucred, proc); + } + } + } + /* Space allocated for raid_vpp will get cleaned up at some other point */ + /* XXX Need more #ifdefs in the above... */ + +#else + + if (rdcap_op) rf_SCSI_FreeDiskOp(rdcap_op, 1); + if (tur_op) rf_SCSI_FreeDiskOp(tur_op, 0); + +#endif +#endif /* !SIMULATE */ + return(ret); +} + + +/**************************************************************************************** + * set up the data structures describing the spare disks in the array + * recall from the above comment that the spare disk descriptors are stored + * in row zero, which is specially expanded to hold them. 
+ ***************************************************************************************/ +int rf_ConfigureSpareDisks( + RF_ShutdownList_t **listp, + RF_Raid_t *raidPtr, + RF_Config_t *cfgPtr) +{ + char buf[256]; + int r,c,i, ret; + RF_DiskOp_t *rdcap_op = NULL, *tur_op = NULL; + unsigned bs; + RF_RaidDisk_t *disks; + int num_spares_done; + +#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL) + struct proc *proc; +#endif + +#ifndef SIMULATE +#if !defined(__NetBSD__) && !defined(__OpenBSD__) + ret = rf_SCSI_AllocReadCapacity(&rdcap_op); + if (ret) + goto fail; + ret = rf_SCSI_AllocTUR(&tur_op); + if (ret) + goto fail; +#endif /* !__NetBSD__ && !__OpenBSD__ */ +#endif /* !SIMULATE */ + + num_spares_done = 0; + +#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL) + proc = raidPtr->proc; + /* The space for the spares should have already been + allocated by ConfigureDisks() */ +#endif + + disks = &raidPtr->Disks[0][raidPtr->numCol]; + for (i=0; i<raidPtr->numSpare; i++) { + ret = rf_ConfigureDisk(raidPtr,&cfgPtr->spare_names[i][0], + &disks[i], rdcap_op, tur_op, + cfgPtr->spare_devs[i],0,raidPtr->numCol+i); + if (ret) + goto fail; + if (disks[i].status != rf_ds_optimal) { + RF_ERRORMSG1("Warning: spare disk %s failed TUR\n",buf); + } else { + disks[i].status = rf_ds_spare; /* change status to spare */ + DPRINTF6("Spare Disk %d: dev %s numBlocks %ld blockSize %d (%ld MB)\n",i, + disks[i].devname, + (long int) disks[i].numBlocks,disks[i].blockSize, + (long int) disks[i].numBlocks * disks[i].blockSize / 1024 / 1024); + } + num_spares_done++; + } +#ifndef SIMULATE +#if (defined(__NetBSD__) || defined(__OpenBSD__)) && (_KERNEL) + +#else + rf_SCSI_FreeDiskOp(rdcap_op, 1); rdcap_op = NULL; + rf_SCSI_FreeDiskOp(tur_op, 0); tur_op = NULL; +#endif +#endif /* !SIMULATE */ + + /* check sizes and block sizes on spare disks */ + bs = 1 << raidPtr->logBytesPerSector; + for (i=0; i<raidPtr->numSpare; i++) { + if (disks[i].blockSize != bs) { + RF_ERRORMSG3("Block size of %d on spare disk %s is not the same as on other disks (%d)\n",disks[i].blockSize, disks[i].devname, bs); + ret = EINVAL; + goto fail; + } + if (disks[i].numBlocks < raidPtr->sectorsPerDisk) { + RF_ERRORMSG3("Spare disk %s (%d blocks) is too small to serve as a spare (need %ld blocks)\n", + disks[i].devname, disks[i].blockSize, (long int)raidPtr->sectorsPerDisk); + ret = EINVAL; + goto fail; + } else if (disks[i].numBlocks > raidPtr->sectorsPerDisk) { + RF_ERRORMSG2("Warning: truncating spare disk %s to %ld blocks\n",disks[i].devname, (long int) raidPtr->sectorsPerDisk); + + disks[i].numBlocks = raidPtr->sectorsPerDisk; + } + } + + return(0); + +fail: +#ifndef SIMULATE +#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL) + + /* Release the hold on the main components. We've failed to allocate a + spare, and since we're failing, we need to free things.. */ + + for(r=0;r<raidPtr->numRow;r++) { + for(c=0;c<raidPtr->numCol;c++) { + /* Cleanup.. */ +#ifdef DEBUG + printf("Cleaning up row: %d col: %d\n",r,c); +#endif + if (raidPtr->raid_cinfo[r][c].ci_vp) { + (void)vn_close(raidPtr->raid_cinfo[r][c].ci_vp, + FREAD|FWRITE, proc->p_ucred, proc); + } + } + } + + for(i=0;i<raidPtr->numSpare;i++) { + /* Cleanup.. 
*/ +#ifdef DEBUG + printf("Cleaning up spare: %d\n",i); +#endif + if (raidPtr->raid_cinfo[0][raidPtr->numCol+i].ci_vp) { + (void)vn_close(raidPtr->raid_cinfo[0][raidPtr->numCol+i].ci_vp, + FREAD|FWRITE, proc->p_ucred, proc); + } + } + +#else + + if (rdcap_op) rf_SCSI_FreeDiskOp(rdcap_op, 1); + if (tur_op) rf_SCSI_FreeDiskOp(tur_op, 0); + +#endif + +#endif /* !SIMULATE */ + return(ret); +} + + + +/* configure a single disk in the array */ +int rf_ConfigureDisk(raidPtr, buf, diskPtr, rdcap_op, tur_op, dev, row, col) + RF_Raid_t *raidPtr; /* We need this down here too!! GO */ + char *buf; + RF_RaidDisk_t *diskPtr; + RF_DiskOp_t *rdcap_op; + RF_DiskOp_t *tur_op; + dev_t dev; /* device number used only in kernel */ + RF_RowCol_t row; + RF_RowCol_t col; +{ + char *p; +#ifdef SIMULATE + double init_offset; +#else /* SIMULATE */ +#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL) + int retcode; +#else + int busid, targid, lun, retcode; +#endif +#endif /* SIMULATE */ + +#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL) + struct partinfo dpart; + struct vnode *vp; + struct vattr va; + struct proc *proc; + int error; +#endif + +retcode = 0; + p = rf_find_non_white(buf); + if (p[strlen(p)-1] == '\n') { + /* strip off the newline */ + p[strlen(p)-1] = '\0'; + } + (void) strcpy(diskPtr->devname, p); + +#ifdef SIMULATE + + init_offset = 0.0; + rf_InitDisk(&diskPtr->diskState, disk_db_file_name,diskPtr->devname,0,0,init_offset,row,col); + rf_GeometryDoReadCapacity(&diskPtr->diskState, &diskPtr->numBlocks, &diskPtr->blockSize); + diskPtr->numBlocks = diskPtr->numBlocks * rf_sizePercentage / 100; + + /* we allow the user to specify that only a fraction of the disks should be used + * this is just for debug: it speeds up the parity scan + */ + +#else /* SIMULATE */ +#if !defined(__NetBSD__) && !defined(__OpenBSD__) + /* get bus, target, lun */ + retcode = rf_extract_ids(p, &busid, &targid, &lun); + if (retcode) + return(retcode); + + /* required in kernel, nop at user level */ + retcode = rf_SCSI_OpenUnit(dev); + if (retcode) + return(retcode); + + diskPtr->dev = dev; + if (rf_SCSI_DoTUR(tur_op, (u_char)busid, (u_char)targid, (u_char)lun, dev)) { + RF_ERRORMSG1("Disk %s failed TUR. Marked as dead.\n",diskPtr->devname); + diskPtr->status = rf_ds_failed; + } else { + diskPtr->status = rf_ds_optimal; + retcode = rf_SCSI_DoReadCapacity(raidPtr,rdcap_op, busid, targid, lun, dev, + &diskPtr->numBlocks, &diskPtr->blockSize, diskPtr->devname); + if (retcode) + return(retcode); + + /* we allow the user to specify that only a fraction of the disks should be used + * this is just for debug: it speeds up the parity scan + */ + diskPtr->numBlocks = diskPtr->numBlocks * rf_sizePercentage / 100; + } +#endif +#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL) + + proc = raidPtr->proc; /* XXX Yes, this is not nice.. */ + + /* Let's start by claiming the component is fine and well... */ + /* XXX not the case if the disk is toast.. */ + diskPtr->status = rf_ds_optimal; + + + raidPtr->raid_cinfo[row][col].ci_vp = NULL; + raidPtr->raid_cinfo[row][col].ci_dev = NULL; + + error = raidlookup(diskPtr->devname, proc, &vp); + if (error) { + printf("raidlookup on device: %s failed!\n",diskPtr->devname); + if (error == ENXIO) { + /* XXX the component isn't there... 
must be dead :-( */ + diskPtr->status = rf_ds_failed; + } else { + return(error); + } + } + + if (diskPtr->status == rf_ds_optimal) { + + if ((error = VOP_GETATTR(vp, &va, proc->p_ucred, proc)) != 0) { + return(error); + } + + error = VOP_IOCTL(vp, DIOCGPART, (caddr_t)&dpart, + FREAD, proc->p_ucred, proc); + if (error) { + return(error); + } + + + diskPtr->blockSize = dpart.disklab->d_secsize; + + diskPtr->numBlocks = dpart.part->p_size - rf_protectedSectors; + + raidPtr->raid_cinfo[row][col].ci_vp = vp; + raidPtr->raid_cinfo[row][col].ci_dev = va.va_rdev; + +#if 0 + diskPtr->dev = dev; +#endif + + diskPtr->dev = va.va_rdev; /* XXX or the above? */ + + /* we allow the user to specify that only a fraction of the disks should be used + * this is just for debug: it speeds up the parity scan + */ + diskPtr->numBlocks = diskPtr->numBlocks * rf_sizePercentage / 100; + + } + +#endif /* !__NetBSD__ && !__OpenBSD__ */ +#endif /* SIMULATE */ + + return(0); +} + +#ifdef SIMULATE + +void rf_default_disk_names() +{ + sprintf(disk_db_file_name,"disk.db"); + sprintf(disk_type_name,"HP2247"); +} + +void rf_set_disk_db_name(s) + char *s; +{ + strcpy(disk_db_file_name,s); +} + +void rf_set_disk_type_name(s) + char *s; +{ + strcpy(disk_type_name,s); +} + +#endif /* SIMULATE */ diff --git a/sys/dev/raidframe/rf_disks.h b/sys/dev/raidframe/rf_disks.h new file mode 100644 index 00000000000..8857391a8bd --- /dev/null +++ b/sys/dev/raidframe/rf_disks.h @@ -0,0 +1,161 @@ +/* $OpenBSD: rf_disks.h,v 1.1 1999/01/11 14:29:18 niklas Exp $ */ +/* $NetBSD: rf_disks.h,v 1.1 1998/11/13 04:20:29 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* + * rf_disks.h -- header file for code related to physical disks + */ + +/* : + * Log: rf_disks.h,v + * Revision 1.15 1996/08/20 23:05:13 jimz + * add nreads, nwrites to RaidDisk + * + * Revision 1.14 1996/06/17 03:20:15 jimz + * increase devname len to 56 + * + * Revision 1.13 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). 
+ * + * Revision 1.12 1996/06/09 02:36:46 jimz + * lots of little crufty cleanup- fixup whitespace + * issues, comment #ifdefs, improve typing in some + * places (esp size-related) + * + * Revision 1.11 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.10 1996/05/30 11:29:41 jimz + * Numerous bug fixes. Stripe lock release code disagreed with the taking code + * about when stripes should be locked (I made it consistent: no parity, no lock) + * There was a lot of extra serialization of I/Os which I've removed- a lot of + * it was to calculate values for the cache code, which is no longer with us. + * More types, function, macro cleanup. Added code to properly quiesce the array + * on shutdown. Made a lot of stuff array-specific which was (bogusly) general + * before. Fixed memory allocation, freeing bugs. + * + * Revision 1.9 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.8 1996/05/24 01:59:45 jimz + * another checkpoint in code cleanup for release + * time to sync kernel tree + * + * Revision 1.7 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.6 1996/05/02 22:06:57 jimz + * add RF_RaidDisk_t + * + * Revision 1.5 1995/12/01 15:56:53 root + * added copyright info + * + */ + +#ifndef _RF__RF_DISKS_H_ +#define _RF__RF_DISKS_H_ + +#include <sys/types.h> + +#include "rf_archs.h" +#include "rf_types.h" +#ifdef SIMULATE +#include "rf_geometry.h" +#endif /* SIMULATE */ + +/* + * A physical disk can be in one of several states: + * IF YOU ADD A STATE, CHECK TO SEE IF YOU NEED TO MODIFY RF_DEAD_DISK() BELOW. + */ +enum RF_DiskStatus_e { + rf_ds_optimal, /* no problems */ + rf_ds_failed, /* reconstruction ongoing */ + rf_ds_reconstructing, /* reconstruction complete to spare, dead disk not yet replaced */ + rf_ds_dist_spared, /* reconstruction complete to distributed spare space, dead disk not yet replaced */ + rf_ds_spared, /* reconstruction complete to distributed spare space, dead disk not yet replaced */ + rf_ds_spare, /* an available spare disk */ + rf_ds_used_spare /* a spare which has been used, and hence is not available */ +}; +typedef enum RF_DiskStatus_e RF_DiskStatus_t; + +struct RF_RaidDisk_s { + char devname[56]; /* name of device file */ + RF_DiskStatus_t status; /* whether it is up or down */ + RF_RowCol_t spareRow; /* if in status "spared", this identifies the spare disk */ + RF_RowCol_t spareCol; /* if in status "spared", this identifies the spare disk */ + RF_SectorCount_t numBlocks; /* number of blocks, obtained via READ CAPACITY */ + int blockSize; + /* XXX the folling is needed since we seem to need SIMULATE defined + in order to get user-land stuff to compile, but we *don't* want + this in the structure for the user-land utilities, as the + kernel doesn't know about it!! 
(and it messes up the size of + the structure, so there is a communication problem between + the kernel and the userland utils :-( GO */ +#if defined(SIMULATE) && !defined(RF_UTILITY) + RF_DiskState_t diskState; /* the name of the disk as used in the disk module */ +#endif /* SIMULATE */ +#if RF_KEEP_DISKSTATS > 0 + RF_uint64 nreads; + RF_uint64 nwrites; +#endif /* RF_KEEP_DISKSTATS > 0 */ + dev_t dev; +}; + +/* + * An RF_DiskOp_t ptr is really a pointer to a UAGT_CCB, but I want + * to isolate the cam layer from all other layers, so I typecast to/from + * RF_DiskOp_t * (i.e. void *) at the interfaces. + */ +typedef void RF_DiskOp_t; + +/* if a disk is in any of these states, it is inaccessible */ +#define RF_DEAD_DISK(_dstat_) (((_dstat_) == rf_ds_spared) || \ + ((_dstat_) == rf_ds_reconstructing) || ((_dstat_) == rf_ds_failed) || \ + ((_dstat_) == rf_ds_dist_spared)) + +int rf_ConfigureDisks(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr, + RF_Config_t *cfgPtr); +int rf_ConfigureSpareDisks(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr, + RF_Config_t *cfgPtr); +int rf_ConfigureDisk(RF_Raid_t *raidPtr, char *buf, RF_RaidDisk_t *diskPtr, + RF_DiskOp_t *rdcap_op, RF_DiskOp_t *tur_op, dev_t dev, + RF_RowCol_t row, RF_RowCol_t col); + +#ifdef SIMULATE +void rf_default_disk_names(void); +void rf_set_disk_db_name(char *s); +void rf_set_disk_type_name(char *s); +#endif /* SIMULATE */ + +#endif /* !_RF__RF_DISKS_H_ */ diff --git a/sys/dev/raidframe/rf_diskthreads.h b/sys/dev/raidframe/rf_diskthreads.h new file mode 100644 index 00000000000..60181759b6d --- /dev/null +++ b/sys/dev/raidframe/rf_diskthreads.h @@ -0,0 +1,103 @@ +/* $OpenBSD: rf_diskthreads.h,v 1.1 1999/01/11 14:29:18 niklas Exp $ */ +/* $NetBSD: rf_diskthreads.h,v 1.1 1998/11/13 04:20:29 oster Exp $ */ +/* + * rf_diskthreads.h + */ +/* + * Copyright (c) 1996 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Jim Zelenka + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ +/* + * : + * Log: rf_diskthreads.h,v + * Revision 1.7 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). 
+ * + * Revision 1.6 1996/06/09 02:36:46 jimz + * lots of little crufty cleanup- fixup whitespace + * issues, comment #ifdefs, improve typing in some + * places (esp size-related) + * + * Revision 1.5 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.4 1996/05/30 23:22:16 jimz + * bugfixes of serialization, timing problems + * more cleanup + * + * Revision 1.3 1996/05/30 11:29:41 jimz + * Numerous bug fixes. Stripe lock release code disagreed with the taking code + * about when stripes should be locked (I made it consistent: no parity, no lock) + * There was a lot of extra serialization of I/Os which I've removed- a lot of + * it was to calculate values for the cache code, which is no longer with us. + * More types, function, macro cleanup. Added code to properly quiesce the array + * on shutdown. Made a lot of stuff array-specific which was (bogusly) general + * before. Fixed memory allocation, freeing bugs. + * + * Revision 1.2 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.1 1996/05/18 19:55:58 jimz + * Initial revision + * + */ +/* + * rf_diskthreads.h -- types and prototypes for disk thread system + */ + +#ifndef _RF__RF_DISKTHREADS_H_ +#define _RF__RF_DISKTHREADS_H_ + +#include "rf_types.h" + +/* this is the information that a disk thread needs to do its job */ +struct RF_DiskId_s { + RF_DiskQueue_t *queue; + RF_Raid_t *raidPtr; + RF_RaidDisk_t *disk; + int fd; /* file descriptor */ + RF_RowCol_t row, col; /* debug only */ +#ifdef SIMULATE + int state; +#endif /* SIMULATE */ +}; + +int rf_ConfigureDiskThreads(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr, + RF_Config_t *cfgPtr); + +#ifdef SIMULATE +int rf_SetDiskIdle(RF_Raid_t *raidPtr, RF_RowCol_t r, RF_RowCol_t c); +int rf_ScanDiskQueues(RF_Raid_t *raidPtr); +void rf_simulator_complete_io(RF_DiskId_t *id); +void rf_PrintDiskStat(RF_Raid_t *raidPtr); +#else /* SIMULATE */ +int rf_ShutdownDiskThreads(RF_Raid_t *raidPtr); +#endif /* SIMULATE */ + +#endif /* !_RF__RF_DISKTHREADS_H_ */ diff --git a/sys/dev/raidframe/rf_driver.c b/sys/dev/raidframe/rf_driver.c new file mode 100644 index 00000000000..f8db8f5baf0 --- /dev/null +++ b/sys/dev/raidframe/rf_driver.c @@ -0,0 +1,1765 @@ +/* $OpenBSD: rf_driver.c,v 1.1 1999/01/11 14:29:18 niklas Exp $ */ +/* $NetBSD: rf_driver.c,v 1.2 1998/11/13 13:45:15 drochner Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland, Khalil Amiri, Claudson Bornstein, William V. Courtright II, + * Robby Findler, Daniel Stodolsky, Rachad Youssef, Jim Zelenka + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 
+ * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/****************************************************************************** + * + * rf_driver.c -- main setup, teardown, and access routines for the RAID driver + * + * all routines are prefixed with rf_ (raidframe), to avoid conficts. + * + ******************************************************************************/ + +/* + * : + * Log: rf_driver.c,v + * Revision 1.147 1996/08/21 04:12:46 jimz + * added hook for starting out req_hist w/ more distributed values + * (currently not done) + * + * Revision 1.146 1996/07/29 14:05:12 jimz + * fix numPUs/numRUs confusion (everything is now numRUs) + * clean up some commenting, return values + * + * Revision 1.145 1996/07/28 20:31:39 jimz + * i386netbsd port + * true/false fixup + * + * Revision 1.144 1996/07/27 18:40:24 jimz + * cleanup sweep + * + * Revision 1.143 1996/07/22 21:11:53 jimz + * fix formatting on DoAccess error msg + * + * Revision 1.142 1996/07/19 16:10:06 jimz + * added call to rf_ResetDebugOptions() in rf_ConfigureDebug() + * + * Revision 1.141 1996/07/18 22:57:14 jimz + * port simulator to AIX + * + * Revision 1.140 1996/07/17 21:00:58 jimz + * clean up timer interface, tracing + * + * Revision 1.139 1996/07/15 05:40:41 jimz + * some recon datastructure cleanup + * better handling of multiple failures + * added undocumented double-recon test + * + * Revision 1.138 1996/07/11 19:08:00 jimz + * generalize reconstruction mechanism + * allow raid1 reconstructs via copyback (done with array + * quiesced, not online, therefore not disk-directed) + * + * Revision 1.137 1996/07/10 22:28:00 jimz + * get rid of obsolete row statuses (dead,degraded2) + * + * Revision 1.136 1996/06/17 14:38:33 jimz + * properly #if out RF_DEMO code + * fix bug in MakeConfig that was causing weird behavior + * in configuration routines (config was not zeroed at start) + * clean up genplot handling of stacks + * + * Revision 1.135 1996/06/17 03:20:32 jimz + * move out raidframe_attr_default + * don't monkey with stack sizes + * + * Revision 1.134 1996/06/14 23:15:38 jimz + * attempt to deal with thread GC problem + * + * Revision 1.133 1996/06/14 21:24:08 jimz + * new ConfigureEtimer init + * moved out timer vars + * + * Revision 1.132 1996/06/14 16:19:03 jimz + * remove include of pdllib.h (beginning of PDL cleanup) + * + * Revision 1.131 1996/06/14 14:35:24 jimz + * clean up dfstrace protection + * + * Revision 1.130 1996/06/14 14:16:09 jimz + * engine config is now array-specific + * + * Revision 1.129 1996/06/13 19:08:10 jimz + * add debug var to force keep_acc_totals on + * + * Revision 1.128 1996/06/11 10:57:08 jimz + * init recon_done_proc_mutex + * + * Revision 1.127 1996/06/10 14:18:58 jimz + * move user, throughput stats into per-array structure + * + * Revision 1.126 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). 
+ * + * Revision 1.125 1996/06/09 02:36:46 jimz + * lots of little crufty cleanup- fixup whitespace + * issues, comment #ifdefs, improve typing in some + * places (esp size-related) + * + * Revision 1.124 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.123 1996/06/05 19:38:32 jimz + * fixed up disk queueing types config + * added sstf disk queueing + * fixed exit bug on diskthreads (ref-ing bad mem) + * + * Revision 1.122 1996/06/05 18:06:02 jimz + * Major code cleanup. The Great Renaming is now done. + * Better modularity. Better typing. Fixed a bunch of + * synchronization bugs. Made a lot of global stuff + * per-desc or per-array. Removed dead code. + * + * Revision 1.121 1996/06/03 23:28:26 jimz + * more bugfixes + * check in tree to sync for IPDS runs with current bugfixes + * there still may be a problem with threads in the script test + * getting I/Os stuck- not trivially reproducible (runs ~50 times + * in a row without getting stuck) + * + * Revision 1.120 1996/06/02 17:31:48 jimz + * Moved a lot of global stuff into array structure, where it belongs. + * Fixed up paritylogging, pss modules in this manner. Some general + * code cleanup. Removed lots of dead code, some dead files. + * + * Revision 1.119 1996/05/31 22:26:54 jimz + * fix a lot of mapping problems, memory allocation problems + * found some weird lock issues, fixed 'em + * more code cleanup + * + * Revision 1.118 1996/05/30 23:22:16 jimz + * bugfixes of serialization, timing problems + * more cleanup + * + * Revision 1.117 1996/05/30 16:28:33 jimz + * typo in rf_SignalQuiescenceLock() fixed + * + * Revision 1.116 1996/05/30 12:59:18 jimz + * make etimer happier, more portable + * + * Revision 1.115 1996/05/30 11:29:41 jimz + * Numerous bug fixes. Stripe lock release code disagreed with the taking code + * about when stripes should be locked (I made it consistent: no parity, no lock) + * There was a lot of extra serialization of I/Os which I've removed- a lot of + * it was to calculate values for the cache code, which is no longer with us. + * More types, function, macro cleanup. Added code to properly quiesce the array + * on shutdown. Made a lot of stuff array-specific which was (bogusly) general + * before. Fixed memory allocation, freeing bugs. 
+ * + * Revision 1.114 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.113 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.112 1996/05/24 04:28:55 jimz + * release cleanup ckpt + * + * Revision 1.111 1996/05/24 01:59:45 jimz + * another checkpoint in code cleanup for release + * time to sync kernel tree + * + * Revision 1.110 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.109 1996/05/23 00:39:56 jimz + * demoMode -> rf_demoMode + * + * Revision 1.108 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.107 1996/05/21 14:30:04 jimz + * idler_desc_mutex should be ifndef SIMULATE + * + * Revision 1.106 1996/05/20 19:31:12 jimz + * add atomic debug (mutex and cond leak finder) stuff + * + * Revision 1.105 1996/05/20 16:12:45 jimz + * switch to rf_{mutex,cond}_{init,destroy} + * + * Revision 1.104 1996/05/18 20:09:41 jimz + * bit of cleanup to compile cleanly in kernel, once again + * + * Revision 1.103 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.102 1996/05/16 21:20:51 jimz + * use FREELIST stuff to manage access descriptors + * + * Revision 1.101 1996/05/16 14:21:10 jimz + * remove bogus copies from write path on user + * + * Revision 1.100 1996/05/15 22:33:54 jimz + * appropriately #ifdef cache stuff + * + * Revision 1.99 1996/05/08 21:34:41 jimz + * #if 0 ShutdownCache() and ConfigureCache() + * + * Revision 1.98 1996/05/08 21:01:24 jimz + * fixed up enum type names that were conflicting with other + * enums and function names (ie, "panic") + * future naming trends will be towards RF_ and rf_ for + * everything raidframe-related + * + * Revision 1.97 1996/05/07 19:02:58 wvcii + * corrected header comment of rf_DoAccess() + * reordered free of desc in FreeRaidAccDesc() The desc is now + * freed last. + * + * Revision 1.96 1996/05/07 17:40:50 jimz + * add doDebug + * + * Revision 1.95 1996/05/06 21:35:23 jimz + * fixed ordering of cleanup and removed extra decrement of configureCount + * + * Revision 1.94 1996/05/06 18:44:14 jimz + * reorder cleanup to not blow alloclist out from under various modules + * zero raidPtr contents on config + * + * Revision 1.93 1996/05/04 17:06:53 jimz + * Fail the I/O with ENOSPC if reading past end of the array in the kernel. + * + * Revision 1.92 1996/05/03 19:44:22 wvcii + * debug vars degDagDebug and enableAtomicRMW now defined + * in this file. + * + * Revision 1.91 1995/12/12 18:10:06 jimz + * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT + * fix 80-column brain damage in comments + * + * Revision 1.90 1995/12/08 15:07:03 arw + * cache code cleanup + * + * Revision 1.89 1995/12/06 20:53:58 wvcii + * created debug var forceParityLogReint + * this variable forces reintegration of all parity logs at shutdown + * + * Revision 1.88 1995/12/01 15:59:10 root + * added copyright info + * + * Revision 1.87 1995/11/28 21:34:02 amiri + * modified SetReconfiguredMode so that it installs the + * spare table only if arch is declustered based on block designs + * + * Revision 1.86 1995/11/21 23:06:11 amiri + * added division by zero check in printing + * throughput stats. 
+ * + * Revision 1.85 1995/11/19 16:27:25 wvcii + * disableParityVerify now defined locally, only read from config + * file for !KERNEL compiles + * + * Revision 1.84 1995/11/17 15:08:31 wvcii + * added debug var disableParityVerify + * used in RealLoopTest to disable parity verification + * + * Revision 1.83 1995/11/07 15:48:43 wvcii + * deleted debug vars: suppressAtomicRMW, enableRollAway, concatDagDebug + * deleted debug vars: debugSelectUnit, debugSelectBlock + * added debug var: enableAtomicRMW + * + * Revision 1.82 1995/10/18 19:28:45 amiri + * added support for reconstruction demos in the + * simulator, by updating some simulator + * variables in Faildisk. + * + * Revision 1.81 1995/10/09 18:36:33 jimz + * move rf_StopThroughputStats() into FreeAccDesc() + * changed throughput output print format + * added user-level copy to write path to emulate kernel hack + * + * Revision 1.80 1995/10/09 18:07:47 wvcii + * moved call to rf_StopThroughputStats to rf_states.c + * + * Revision 1.79 1995/10/09 17:38:53 jimz + * quiesce an array for user-level testing before shutting it down + * (should this also be done in the kernel?) + * + * Revision 1.78 1995/10/09 15:35:43 wvcii + * added code to measure throughput in user mode + * + * Revision 1.77 1995/10/05 06:18:59 jimz + * Changed DDEventRequest() to take additional arg, used by simulator + * to cache diskid so queue length can be decremented on io complete + * (this is a hack to get around the fact that the event mechanism + * assumes it can dereference arbitrary handles on enqueued events) + * + * Revision 1.76 1995/10/04 07:25:10 jimz + * turn off bigstacks by default + * + * Revision 1.75 1995/10/04 07:24:34 jimz + * code for bigstacks in user process + * + * Revision 1.74 1995/09/26 21:42:51 wvcii + * removed calls to ConfigureCache, ShutdownCache when building kernel + * kernel currently does not support any cached architectures + * + * Revision 1.73 1995/09/20 21:05:35 jimz + * add missing unit arg to IO_BUF_ERR() in non-kernel case + * + * Revision 1.72 1995/09/19 23:02:44 jimz + * call RF_DKU_END_IO in the appropriate places + * + * Revision 1.71 1995/09/07 19:02:31 jimz + * mods to get raidframe to compile and link + * in kernel environment + * + * Revision 1.70 1995/09/06 19:24:01 wvcii + * added debug vars enableRollAway and debugRecovery + * + * Revision 1.69 1995/08/24 19:25:36 rachad + * Fixes to LSS GC in the simulater + * + * Revision 1.68 1995/07/28 21:43:42 robby + * checkin after leaving for Rice. Bye + * + * Revision 1.67 1995/07/26 18:06:52 cfb + * *** empty log message *** + * + * Revision 1.66 1995/07/26 03:25:24 robby + * fixed accesses mutex and updated call to ConfigureCache + * + * Revision 1.65 1995/07/25 14:36:52 rachad + * *** empty log message *** + * + * Revision 1.64 1995/07/21 19:29:05 robby + * added total_accesses + * + * Revision 1.63 1995/07/20 19:43:35 cfb + * *** empty log message *** + * + * Revision 1.62 1995/07/20 16:10:24 rachad + * *** empty log message *** + * + * Revision 1.61 1995/07/20 03:36:53 rachad + * Added suport for cache warming + * + * Revision 1.60 1995/07/17 22:31:31 cfb + * *** empty log message *** + * + * Revision 1.59 1995/07/16 17:02:23 cfb + * *** empty log message *** + * + * Revision 1.58 1995/07/16 15:19:27 cfb + * *** empty log message *** + * + * Revision 1.57 1995/07/16 03:17:01 cfb + * *** empty log message *** + * + * Revision 1.56 1995/07/13 16:11:59 cfb + * *** empty log message *** + * + * Revision 1.55 1995/07/13 15:42:40 cfb + * added cacheDebug variable ... 
+ * + * Revision 1.54 1995/07/13 14:28:27 rachad + * *** empty log message *** + * + * Revision 1.53 1995/07/10 21:48:52 robby + * added virtualStripingWarnings + * + * Revision 1.52 1995/07/10 20:41:13 rachad + * *** empty log message *** + * + * Revision 1.51 1995/07/09 19:46:49 cfb + * Added cache Shutdown + * + * Revision 1.50 1995/07/08 21:38:53 rachad + * Added support for interactive traces + * in the simulator + * + * Revision 1.49 1995/07/08 18:05:39 rachad + * Linked up Claudsons code with the real cache + * + * Revision 1.48 1995/07/07 16:00:22 cfb + * Added initialization of cacheDesc to AllocRaidAccDesc + * + * Revision 1.47 1995/07/06 14:22:37 rachad + * Merge complete + * + * Revision 1.46.50.2 1995/06/21 17:48:30 robby + * test + * + * Revision 1.46.50.1 1995/06/21 17:34:49 robby + * branching to work on "meta-dag" capabilities + * + * Revision 1.46.10.5 1995/07/03 21:58:34 holland + * added support for suppressing both stripe locks & large writes + * + * Revision 1.46.10.4 1995/06/27 03:42:48 holland + * typo fix + * + * Revision 1.46.10.3 1995/06/27 03:31:42 holland + * prototypes + * + * Revision 1.46.10.2 1995/06/27 03:17:57 holland + * fixed callback bug in kernel rf_DoAccess + * + * Revision 1.46.10.1 1995/06/25 14:32:44 holland + * initial checkin on new branch + * + * Revision 1.46 1995/06/13 17:52:41 holland + * added UserStats stuff + * + * Revision 1.45 1995/06/13 16:03:41 rachad + * *** empty log message *** + * + * Revision 1.44 1995/06/12 15:54:40 rachad + * Added garbege collection for log structured storage + * + * Revision 1.43 1995/06/09 18:01:09 holland + * various changes related to in-kernel recon, multiple-row arrays, + * trace extraction from kernel, etc. + * + * Revision 1.42 1995/06/08 19:52:28 rachad + * *** empty log message *** + * + * Revision 1.41 1995/06/08 00:11:49 robby + * added a debug variable -- showVirtualSizeRequirements + * + * Revision 1.40 1995/06/05 00:33:30 holland + * protectedSectors bug fix + * + * Revision 1.39 1995/06/01 22:45:03 holland + * made compilation of parity logging and virtual striping + * stuff conditional on some constants defined in rf_archs.h + * + * Revision 1.38 1995/06/01 21:52:37 holland + * replaced NULL sizes in calls to Free() by -1, and caused this + * to suppress the size-mismatch error + * + * Revision 1.37 1995/05/26 20:04:54 wvcii + * modified parity logging debug vars + * + * Revision 1.36 95/05/21 15:32:41 wvcii + * added debug vars: parityLogDebug, numParityRegions, numParityLogs, + * numReintegrationThreads + * + * Revision 1.35 95/05/19 20:58:21 holland + * cleanups on error cases in rf_DoAccess + * + * Revision 1.34 1995/05/16 17:35:53 holland + * added rf_copyback_in_progress. this is debug-only. + * + * Revision 1.33 1995/05/15 12:25:35 holland + * bug fix in test code: no stripe locks were getting acquired in RAID0 mode + * + * Revision 1.32 1995/05/10 18:54:12 holland + * bug fixes related to deadlock problem at time of disk failure + * eliminated read-op-write code + * beefed up parity checking in loop test + * various small changes & new ASSERTs + * + * Revision 1.31 1995/05/02 22:49:02 holland + * add shutdown calls for each architecture + * + * Revision 1.30 1995/05/01 14:43:37 holland + * merged changes from Bill + * + * Revision 1.29 1995/05/01 13:28:00 holland + * parity range locks, locking disk requests, recon+parityscan in kernel, etc. 
+ * + * Revision 1.28 1995/04/24 13:25:51 holland + * rewrite to move disk queues, recon, & atomic RMW to kernel + * + * Revision 1.27 1995/04/06 14:47:56 rachad + * merge completed + * + * Revision 1.26 1995/04/03 20:32:35 rachad + * added reconstruction to simulator + * + * Revision 1.25.10.2 1995/04/03 20:41:00 holland + * misc changes related to distributed sparing + * + * Revision 1.25.10.1 1995/03/17 20:04:01 holland + * initial checkin on new branch + * + * Revision 1.25 1995/03/15 20:34:30 holland + * changes for distributed sparing. + * + * Revision 1.24 1995/03/09 19:53:05 rachad + * *** empty log message *** + * + * Revision 1.23 1995/03/03 18:36:16 rachad + * Simulator mechanism added + * + * Revision 1.22 1995/03/01 20:25:48 holland + * kernelization changes + * + * Revision 1.21 1995/02/17 19:39:56 holland + * added size param to all calls to Free(). + * this is ignored at user level, but necessary in the kernel. + * + * Revision 1.20 1995/02/17 13:37:49 holland + * kernelization changes -- not yet complete + * + * Revision 1.19 1995/02/10 18:08:07 holland + * fixed a few things I broke during kernelization + * + * Revision 1.18 1995/02/10 17:34:10 holland + * kernelization changes + * + * Revision 1.17 1995/02/04 15:51:35 holland + * kernelization changes + * + * Revision 1.16 1995/02/03 22:31:36 holland + * many changes related to kernelization + * + * Revision 1.15 1995/02/01 15:13:05 holland + * moved #include of general.h out of raid.h and into each file + * + * Revision 1.14 1995/02/01 14:25:19 holland + * began changes for kernelization: + * changed all instances of mutex_t and cond_t to DECLARE macros + * converted configuration code to use config structure + * + * Revision 1.13 1995/01/30 14:53:46 holland + * extensive changes related to making DoIO non-blocking + * + * Revision 1.12 1995/01/25 00:26:21 holland + * eliminated support for aio + * + * Revision 1.11 1995/01/24 23:58:46 holland + * multi-way recon XOR, plus various small changes + * + * Revision 1.10 1995/01/11 19:27:02 holland + * various changes related to performance tuning + * + * Revision 1.9 1994/12/05 15:29:09 holland + * added trace run time limitation (maxTraceRunTimeSec) + * + * Revision 1.8 1994/12/05 04:18:12 holland + * various new control vars in the config file + * + * Revision 1.7 1994/11/29 23:11:36 holland + * tracerec bug on dag retry fixed + * + * Revision 1.6 1994/11/29 22:11:38 danner + * holland updates + * + * Revision 1.5 1994/11/29 21:09:47 danner + * Detailed tracing support (holland). + * + * Revision 1.4 1994/11/29 20:36:02 danner + * Added suppressAtomicRMW option. + * + * Revision 1.3 1994/11/21 15:34:06 danner + * Added ConfigureAllocList() call. 
+ * + */ + +#ifdef _KERNEL +#define KERNEL +#endif + +#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL) +#include <sys/types.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/ioctl.h> +#include <sys/fcntl.h> +#ifdef __NETBSD__ +#include <sys/vnode.h> +#endif +#endif + +#ifdef KERNEL +#if !defined(__NetBSD__) && !defined(__OpenBSD__) +#include <dkusage.h> +#include <dfstrace.h> +#endif /* !__NetBSD__ && !__OpenBSD__ */ +#endif /* KERNEL */ + +#include "rf_archs.h" +#include "rf_threadstuff.h" + +#ifndef KERNEL +#include <stdio.h> +#include <stdlib.h> +#endif /* KERNEL */ + +#include <sys/errno.h> + +#include "rf_raid.h" +#include "rf_dag.h" +#include "rf_aselect.h" +#include "rf_diskqueue.h" +#include "rf_parityscan.h" +#include "rf_alloclist.h" +#include "rf_threadid.h" +#include "rf_dagutils.h" +#include "rf_utils.h" +#include "rf_etimer.h" +#include "rf_acctrace.h" +#include "rf_configure.h" +#include "rf_general.h" +#include "rf_desc.h" +#include "rf_states.h" +#include "rf_freelist.h" +#include "rf_decluster.h" +#include "rf_map.h" +#include "rf_diskthreads.h" +#include "rf_revent.h" +#include "rf_callback.h" +#include "rf_engine.h" +#include "rf_memchunk.h" +#include "rf_mcpair.h" +#include "rf_nwayxor.h" +#include "rf_debugprint.h" +#include "rf_copyback.h" +#if !defined(__NetBSD__) && !defined(__OpenBSD__) +#include "rf_camlayer.h" +#endif +#include "rf_driver.h" +#include "rf_options.h" +#include "rf_shutdown.h" +#include "rf_sys.h" +#include "rf_cpuutil.h" + +#ifdef SIMULATE +#include "rf_diskevent.h" +#endif /* SIMULATE */ + +#ifdef KERNEL +#include <sys/buf.h> +#if !defined(__NetBSD__) && !defined(__OpenBSD__) +#include <io/common/devdriver.h> +#endif /* !__NetBSD__ && !__OpenBSD__ */ + +#if DFSTRACE > 0 +#include <sys/dfs_log.h> +#include <sys/dfstracebuf.h> +#endif /* DFSTRACE > 0 */ + +#if DKUSAGE > 0 +#include <sys/dkusage.h> +#include <io/common/iotypes.h> +#include <io/cam/dec_cam.h> +#include <io/cam/cam.h> +#include <io/cam/pdrv.h> +#endif /* DKUSAGE > 0 */ +#endif /* KERNEL */ + +#if RF_DEMO > 0 +#include "rf_demo.h" +#endif /* RF_DEMO > 0 */ + +/* rad == RF_RaidAccessDesc_t */ +static RF_FreeList_t *rf_rad_freelist; +#define RF_MAX_FREE_RAD 128 +#define RF_RAD_INC 16 +#define RF_RAD_INITIAL 32 + +/* debug variables */ +char rf_panicbuf[2048]; /* a buffer to hold an error msg when we panic */ + +/* main configuration routines */ +static int raidframe_booted = 0; + +static void rf_ConfigureDebug(RF_Config_t *cfgPtr); +static void set_debug_option(char *name, long val); +static void rf_UnconfigureArray(void); +static int init_rad(RF_RaidAccessDesc_t *); +static void clean_rad(RF_RaidAccessDesc_t *); +static void rf_ShutdownRDFreeList(void *); +static int rf_ConfigureRDFreeList(RF_ShutdownList_t **); + + +RF_DECLARE_MUTEX(rf_printf_mutex) /* debug only: avoids interleaved printfs by different stripes */ +RF_DECLARE_GLOBAL_THREADID /* declarations for threadid.h */ + +#if !defined(KERNEL) && !defined(SIMULATE) +static int rf_InitThroughputStats(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr, RF_Config_t *cfgPtr); +static void rf_StopThroughputStats(RF_Raid_t *raidPtr); +static void rf_PrintThroughputStats(RF_Raid_t *raidPtr); +#endif /* !KERNEL && !SIMULATE */ + +#ifdef KERNEL +#define SIGNAL_QUIESCENT_COND(_raid_) wakeup(&((_raid_)->accesses_suspended)) +#if !defined(__NetBSD__) && !defined(__OpenBSD__) +#define WAIT_FOR_QUIESCENCE(_raid_) \ + mpsleep(&((_raid_)->accesses_suspended), PZERO, "raidframe quiesce", 0, \ + (void *) 
simple_lock_addr((_raid_)->access_suspend_mutex), MS_LOCK_SIMPLE) +#else +#define WAIT_FOR_QUIESCENCE(_raid_) \ + tsleep(&((_raid_)->accesses_suspended),PRIBIO|PCATCH,"raidframe quiesce", 0); + +#endif +#if DKUSAGE > 0 +#define IO_BUF_ERR(bp, err, unit) { \ + bp->b_flags |= B_ERROR; \ + bp->b_resid = bp->b_bcount; \ + bp->b_error = err; \ + RF_DKU_END_IO(unit, bp); \ + biodone(bp); \ +} +#else +#define IO_BUF_ERR(bp, err, unit) { \ + bp->b_flags |= B_ERROR; \ + bp->b_resid = bp->b_bcount; \ + bp->b_error = err; \ + RF_DKU_END_IO(unit); \ + biodone(bp); \ +} +#endif /* DKUSAGE > 0 */ +#else /* KERNEL */ + +#define SIGNAL_QUIESCENT_COND(_raid_) RF_SIGNAL_COND((_raid_)->quiescent_cond) +#define WAIT_FOR_QUIESCENCE(_raid_) RF_WAIT_COND((_raid_)->quiescent_cond, (_raid_)->access_suspend_mutex) +#define IO_BUF_ERR(bp, err, unit) + +#endif /* KERNEL */ + +static int configureCount=0; /* number of active configurations */ +static int isconfigged=0; /* is basic raidframe (non per-array) stuff configged */ +RF_DECLARE_STATIC_MUTEX(configureMutex) /* used to lock the configuration stuff */ + +static RF_ShutdownList_t *globalShutdown; /* non array-specific stuff */ + +static int rf_ConfigureRDFreeList(RF_ShutdownList_t **listp); + +/* called at system boot time */ +int rf_BootRaidframe() +{ +#if 0 + long stacksize; +#endif + int rc; + + if (raidframe_booted) + return(EBUSY); + raidframe_booted = 1; + +#if RF_DEBUG_ATOMIC > 0 + rf_atent_init(); +#endif /* RF_DEBUG_ATOMIC > 0 */ + + rf_setup_threadid(); + rf_assign_threadid(); + +#if !defined(KERNEL) && !defined(SIMULATE) + if (RF_THREAD_ATTR_CREATE(raidframe_attr_default)) { + fprintf(stderr, "Unable to create default thread attr\n"); + exit(1); + } +#if 0 + stacksize = RF_THREAD_ATTR_GETSTACKSIZE(raidframe_attr_default); + if (stacksize < 0) { + fprintf(stderr, "Unable to get stack size of default thread attr\n"); + exit(1); + } + stacksize += 16384; + rc = RF_THREAD_ATTR_SETSTACKSIZE(raidframe_attr_default, stacksize); + if (rc) { + fprintf(stderr, "Unable to set stack size of default thread attr\n"); + exit(1); + } +#endif /* 0 */ +#endif /* !KERNEL && !SIMULATE */ + rc = rf_mutex_init(&configureMutex); + if (rc) { + RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__, + __LINE__, rc); + RF_PANIC(); + } + configureCount = 0; + isconfigged = 0; + globalShutdown = NULL; + return(0); +} + +/* + * This function is really just for debugging user-level stuff: it + * frees up all memory, other RAIDframe resources which might otherwise + * be kept around. This is used with systems like "sentinel" to detect + * memory leaks. 
+ */ +int rf_UnbootRaidframe() +{ + int rc; + + RF_LOCK_MUTEX(configureMutex); + if (configureCount) { + RF_UNLOCK_MUTEX(configureMutex); + return(EBUSY); + } + raidframe_booted = 0; + RF_UNLOCK_MUTEX(configureMutex); + rc = rf_mutex_destroy(&configureMutex); + if (rc) { + RF_ERRORMSG3("Unable to destroy mutex file %s line %d rc=%d\n", __FILE__, + __LINE__, rc); + RF_PANIC(); + } +#if RF_DEBUG_ATOMIC > 0 + rf_atent_shutdown(); +#endif /* RF_DEBUG_ATOMIC > 0 */ + return(0); +} + +/* + * Called whenever an array is shutdown + */ +static void rf_UnconfigureArray() +{ + int rc; + + RF_LOCK_MUTEX(configureMutex); + if (--configureCount == 0) { /* if no active configurations, shut everything down */ + isconfigged = 0; + + rc = rf_ShutdownList(&globalShutdown); + if (rc) { + RF_ERRORMSG1("RAIDFRAME: unable to do global shutdown, rc=%d\n", rc); + } + + rf_shutdown_threadid(); + + /* + * We must wait until now, because the AllocList module + * uses the DebugMem module. + */ + if (rf_memDebug) + rf_print_unfreed(); + } + RF_UNLOCK_MUTEX(configureMutex); +} + +/* + * Called to shut down an array. + */ +int rf_Shutdown(raidPtr) + RF_Raid_t *raidPtr; +{ +#if !defined(__NetBSD__) && !defined(__OpenBSD__) && !defined(_KERNEL) + int rc; +#endif + int r,c; + +#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL) + struct proc *p; +#endif + + if (!raidPtr->valid) { + RF_ERRORMSG("Attempt to shut down unconfigured RAIDframe driver. Aborting shutdown\n"); + return(EINVAL); + } + + /* + * wait for outstanding IOs to land + * As described in rf_raid.h, we use the rad_freelist lock + * to protect the per-array info about outstanding descs + * since we need to do freelist locking anyway, and this + * cuts down on the amount of serialization we've got going + * on. + */ + RF_FREELIST_DO_LOCK(rf_rad_freelist); + if (raidPtr->waitShutdown) { + RF_FREELIST_DO_UNLOCK(rf_rad_freelist); + return(EBUSY); + } + raidPtr->waitShutdown = 1; + while (raidPtr->nAccOutstanding) { + RF_WAIT_COND(raidPtr->outstandingCond, RF_FREELIST_MUTEX_OF(rf_rad_freelist)); + } + RF_FREELIST_DO_UNLOCK(rf_rad_freelist); + +#if !defined(KERNEL) && !defined(SIMULATE) + rf_PrintThroughputStats(raidPtr); +#endif /* !KERNEL && !SIMULATE */ + + raidPtr->valid = 0; + +#if !defined(KERNEL) && !defined(SIMULATE) + rf_TerminateDiskQueues(raidPtr); /* tell all disk queues to release any waiting threads */ + rf_ShutdownDiskThreads(raidPtr); /* wait for all threads to exit */ +#endif /* !KERNEL && !SIMULATE */ + +#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL) + /* We take this opportunity to close the vnodes like we should.. 
*/ + + p = raidPtr->proc; /* XXX */ + + for(r=0;r<raidPtr->numRow;r++) { + for(c=0;c<raidPtr->numCol;c++) { + printf("Closing vnode for row: %d col: %d\n",r,c); + if (raidPtr->raid_cinfo[r][c].ci_vp) { + (void)vn_close(raidPtr->raid_cinfo[r][c].ci_vp, + FREAD|FWRITE, p->p_ucred, p); + } else { + printf("vnode was NULL\n"); + } + + } + } + for(r=0;r<raidPtr->numSpare;r++) { + printf("Closing vnode for spare: %d\n",r); + if (raidPtr->raid_cinfo[0][raidPtr->numCol+r].ci_vp) { + (void)vn_close(raidPtr->raid_cinfo[0][raidPtr->numCol+r].ci_vp, + FREAD|FWRITE, p->p_ucred, p); + } else { + printf("vnode was NULL\n"); + } + } + + +#endif + + rf_ShutdownList(&raidPtr->shutdownList); + + rf_UnconfigureArray(); + + return(0); +} + +#define DO_INIT_CONFIGURE(f) { \ + rc = f (&globalShutdown); \ + if (rc) { \ + RF_ERRORMSG2("RAIDFRAME: failed %s with %d\n", RF_STRING(f), rc); \ + rf_ShutdownList(&globalShutdown); \ + configureCount--; \ + RF_UNLOCK_MUTEX(configureMutex); \ + return(rc); \ + } \ +} + +#define DO_RAID_FAIL() { \ + rf_ShutdownList(&raidPtr->shutdownList); \ + rf_UnconfigureArray(); \ +} + +#define DO_RAID_INIT_CONFIGURE(f) { \ + rc = f (&raidPtr->shutdownList, raidPtr, cfgPtr); \ + if (rc) { \ + RF_ERRORMSG2("RAIDFRAME: failed %s with %d\n", RF_STRING(f), rc); \ + DO_RAID_FAIL(); \ + return(rc); \ + } \ +} + +#define DO_RAID_MUTEX(_m_) { \ + rc = rf_create_managed_mutex(&raidPtr->shutdownList, (_m_)); \ + if (rc) { \ + RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", \ + __FILE__, __LINE__, rc); \ + DO_RAID_FAIL(); \ + return(rc); \ + } \ +} + +#define DO_RAID_COND(_c_) { \ + rc = rf_create_managed_cond(&raidPtr->shutdownList, (_c_)); \ + if (rc) { \ + RF_ERRORMSG3("Unable to init cond file %s line %d rc=%d\n", \ + __FILE__, __LINE__, rc); \ + DO_RAID_FAIL(); \ + return(rc); \ + } \ +} + +int rf_Configure(raidPtr, cfgPtr) + RF_Raid_t *raidPtr; + RF_Config_t *cfgPtr; +{ + RF_RowCol_t row, col; + int i, rc; + int unit; + struct proc *p; + + if (raidPtr->valid) { + RF_ERRORMSG("RAIDframe configuration not shut down. Aborting configure.\n"); + return(EINVAL); + } + + RF_LOCK_MUTEX(configureMutex); + configureCount++; + if (isconfigged == 0) { + rc = rf_create_managed_mutex(&globalShutdown, &rf_printf_mutex); + if (rc) { + RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__, + __LINE__, rc); + rf_ShutdownList(&globalShutdown); + return(rc); + } + + /* initialize globals */ + printf("RAIDFRAME: protectedSectors is %ld\n",rf_protectedSectors); + + rf_clear_debug_print_buffer(); + + DO_INIT_CONFIGURE(rf_ConfigureAllocList); + DO_INIT_CONFIGURE(rf_ConfigureEtimer); + /* + * Yes, this does make debugging general to the whole system instead + * of being array specific. Bummer, drag. 
+ */ + rf_ConfigureDebug(cfgPtr); + DO_INIT_CONFIGURE(rf_ConfigureDebugMem); +#ifdef SIMULATE + rf_default_disk_names(); + DO_INIT_CONFIGURE(rf_DDEventInit); +#endif /* SIMULATE */ + DO_INIT_CONFIGURE(rf_ConfigureAccessTrace); + DO_INIT_CONFIGURE(rf_ConfigureMapModule); + DO_INIT_CONFIGURE(rf_ConfigureReconEvent); + DO_INIT_CONFIGURE(rf_ConfigureCallback); + DO_INIT_CONFIGURE(rf_ConfigureMemChunk); + DO_INIT_CONFIGURE(rf_ConfigureRDFreeList); + DO_INIT_CONFIGURE(rf_ConfigureNWayXor); + DO_INIT_CONFIGURE(rf_ConfigureStripeLockFreeList); + DO_INIT_CONFIGURE(rf_ConfigureMCPair); +#ifndef SIMULATE +#if !defined(__NetBSD__) && !defined(__OpenBSD__) + DO_INIT_CONFIGURE(rf_ConfigureCamLayer); +#endif +#endif /* !SIMULATE */ + DO_INIT_CONFIGURE(rf_ConfigureDAGs); + DO_INIT_CONFIGURE(rf_ConfigureDAGFuncs); + DO_INIT_CONFIGURE(rf_ConfigureDebugPrint); + DO_INIT_CONFIGURE(rf_ConfigureReconstruction); + DO_INIT_CONFIGURE(rf_ConfigureCopyback); + DO_INIT_CONFIGURE(rf_ConfigureDiskQueueSystem); + DO_INIT_CONFIGURE(rf_ConfigureCpuMonitor); + isconfigged = 1; + } + RF_UNLOCK_MUTEX(configureMutex); + + /* + * Null out the entire raid descriptor to avoid problems when we reconfig. + * This also clears the valid bit. + */ + /* XXX this clearing should be moved UP to outside of here.... that, or + rf_Configure() needs to take more arguments... XXX */ +#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL) + unit = raidPtr->raidid; + p = raidPtr->proc; /* XXX save these... */ +#endif + bzero((char *)raidPtr, sizeof(RF_Raid_t)); +#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL) + raidPtr->raidid = unit; + raidPtr->proc = p; /* XXX and then recover them..*/ +#endif + DO_RAID_MUTEX(&raidPtr->mutex); + /* set up the cleanup list. Do this after ConfigureDebug so that value of memDebug will be set */ + + rf_MakeAllocList(raidPtr->cleanupList); + if (raidPtr->cleanupList == NULL) { + DO_RAID_FAIL(); + return(ENOMEM); + } + + rc = rf_ShutdownCreate(&raidPtr->shutdownList, + (void (*)(void *))rf_FreeAllocList, + raidPtr->cleanupList); + if (rc) { + RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n", + __FILE__, __LINE__, rc); + DO_RAID_FAIL(); + return(rc); + } + + raidPtr->numRow = cfgPtr->numRow; + raidPtr->numCol = cfgPtr->numCol; + raidPtr->numSpare = cfgPtr->numSpare; + +#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL) + /* XXX we don't even pretend to support more than one row + in the kernel... 
*/ + if (raidPtr->numRow != 1) { + RF_ERRORMSG("Only one row supported in kernel.\n"); + DO_RAID_FAIL(); + return(EINVAL); + } +#endif + + + + RF_CallocAndAdd(raidPtr->status, raidPtr->numRow, sizeof(RF_RowStatus_t), + (RF_RowStatus_t *), raidPtr->cleanupList); + if (raidPtr->status == NULL) { + DO_RAID_FAIL(); + return(ENOMEM); + } + + RF_CallocAndAdd(raidPtr->reconControl, raidPtr->numRow, + sizeof(RF_ReconCtrl_t *), (RF_ReconCtrl_t **), raidPtr->cleanupList); + if (raidPtr->reconControl == NULL) { + DO_RAID_FAIL(); + return(ENOMEM); + } + for (i=0; i<raidPtr->numRow; i++) { + raidPtr->status[i] = rf_rs_optimal; + raidPtr->reconControl[i] = NULL; + } + + DO_RAID_INIT_CONFIGURE(rf_ConfigureEngine); +#if !defined(KERNEL) && !defined(SIMULATE) + DO_RAID_INIT_CONFIGURE(rf_InitThroughputStats); +#endif /* !KERNEL && !SIMULATE */ + + DO_RAID_INIT_CONFIGURE(rf_ConfigureStripeLocks); + + DO_RAID_COND(&raidPtr->outstandingCond); + + raidPtr->nAccOutstanding = 0; + raidPtr->waitShutdown = 0; + + DO_RAID_MUTEX(&raidPtr->access_suspend_mutex); + DO_RAID_COND(&raidPtr->quiescent_cond); + + DO_RAID_COND(&raidPtr->waitForReconCond); + + DO_RAID_MUTEX(&raidPtr->recon_done_proc_mutex); + DO_RAID_INIT_CONFIGURE(rf_ConfigureDisks); + DO_RAID_INIT_CONFIGURE(rf_ConfigureSpareDisks); + /* do this after ConfigureDisks & ConfigureSpareDisks to be sure dev no. is set */ + DO_RAID_INIT_CONFIGURE(rf_ConfigureDiskQueues); +#ifndef KERNEL + DO_RAID_INIT_CONFIGURE(rf_ConfigureDiskThreads); +#endif /* !KERNEL */ + + DO_RAID_INIT_CONFIGURE(rf_ConfigureLayout); + + DO_RAID_INIT_CONFIGURE(rf_ConfigurePSStatus); + + for(row=0;row<raidPtr->numRow;row++) { + for(col=0;col<raidPtr->numCol;col++) { + /* + * XXX better distribution + */ + raidPtr->hist_diskreq[row][col] = 0; + } + } + + if (rf_keepAccTotals) { + raidPtr->keep_acc_totals = 1; + } + + rf_StartUserStats(raidPtr); + + raidPtr->valid = 1; + return(0); +} + +static int init_rad(desc) + RF_RaidAccessDesc_t *desc; +{ + int rc; + + rc = rf_mutex_init(&desc->mutex); + if (rc) { + RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__, + __LINE__, rc); + return(rc); + } + rc = rf_cond_init(&desc->cond); + if (rc) { + RF_ERRORMSG3("Unable to init cond file %s line %d rc=%d\n", __FILE__, + __LINE__, rc); + rf_mutex_destroy(&desc->mutex); + return(rc); + } + return(0); +} + +static void clean_rad(desc) + RF_RaidAccessDesc_t *desc; +{ + rf_mutex_destroy(&desc->mutex); + rf_cond_destroy(&desc->cond); +} + +static void rf_ShutdownRDFreeList(ignored) + void *ignored; +{ + RF_FREELIST_DESTROY_CLEAN(rf_rad_freelist,next,(RF_RaidAccessDesc_t *),clean_rad); +} + +static int rf_ConfigureRDFreeList(listp) + RF_ShutdownList_t **listp; +{ + int rc; + + RF_FREELIST_CREATE(rf_rad_freelist, RF_MAX_FREE_RAD, + RF_RAD_INC, sizeof(RF_RaidAccessDesc_t)); + if (rf_rad_freelist == NULL) { + return(ENOMEM); + } + rc = rf_ShutdownCreate(listp, rf_ShutdownRDFreeList, NULL); + if (rc) { + RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n", __FILE__, + __LINE__, rc); + rf_ShutdownRDFreeList(NULL); + return(rc); + } + RF_FREELIST_PRIME_INIT(rf_rad_freelist, RF_RAD_INITIAL,next, + (RF_RaidAccessDesc_t *),init_rad); + return(0); +} + +RF_RaidAccessDesc_t *rf_AllocRaidAccDesc( + RF_Raid_t *raidPtr, + RF_IoType_t type, + RF_RaidAddr_t raidAddress, + RF_SectorCount_t numBlocks, + caddr_t bufPtr, + void *bp, + RF_DagHeader_t **paramDAG, + RF_AccessStripeMapHeader_t **paramASM, + RF_RaidAccessFlags_t flags, + void (*cbF)(struct buf *), + void *cbA, + RF_AccessState_t 
*states) +{ + RF_RaidAccessDesc_t *desc; + + RF_FREELIST_GET_INIT_NOUNLOCK(rf_rad_freelist,desc,next,(RF_RaidAccessDesc_t *),init_rad); + if (raidPtr->waitShutdown) { + /* + * Actually, we're shutting the array down. Free the desc + * and return NULL. + */ + RF_FREELIST_DO_UNLOCK(rf_rad_freelist); + RF_FREELIST_FREE_CLEAN(rf_rad_freelist,desc,next,clean_rad); + return(NULL); + } + raidPtr->nAccOutstanding++; + RF_FREELIST_DO_UNLOCK(rf_rad_freelist); + + desc->raidPtr = (void*)raidPtr; + desc->type = type; + desc->raidAddress = raidAddress; + desc->numBlocks = numBlocks; + desc->bufPtr = bufPtr; + desc->bp = bp; + desc->paramDAG = paramDAG; + desc->paramASM = paramASM; + desc->flags = flags; + desc -> states = states; + desc -> state = 0; + + desc->status = 0; + bzero((char *)&desc->tracerec, sizeof(RF_AccTraceEntry_t)); + desc->callbackFunc= (void (*)(RF_CBParam_t))cbF; /* XXX */ + desc->callbackArg = cbA; + desc->next = NULL; + desc->head = desc; + desc->numPending = 0; + desc->cleanupList = NULL; + rf_MakeAllocList(desc->cleanupList); + rf_get_threadid(desc->tid); +#ifdef SIMULATE + desc->owner = rf_GetCurrentOwner(); +#endif /* SIMULATE */ + return(desc); +} + +void rf_FreeRaidAccDesc(RF_RaidAccessDesc_t *desc) +{ + RF_Raid_t *raidPtr = desc->raidPtr; + + RF_ASSERT(desc); + +#if !defined(KERNEL) && !defined(SIMULATE) + rf_StopThroughputStats(raidPtr); +#endif /* !KERNEL && !SIMULATE */ + + rf_FreeAllocList(desc->cleanupList); + RF_FREELIST_FREE_CLEAN_NOUNLOCK(rf_rad_freelist,desc,next,clean_rad); + raidPtr->nAccOutstanding--; + if (raidPtr->waitShutdown) { + RF_SIGNAL_COND(raidPtr->outstandingCond); + } + RF_FREELIST_DO_UNLOCK(rf_rad_freelist); +} + +#ifdef JIMZ +#define THREAD_NUMDESC 1024 +#define THREAD_NUM 600 +static RF_RaidAccessDesc_t *dest_hist[THREAD_NUM*THREAD_NUMDESC]; +int jimz_access_num[THREAD_NUM]; +#endif /* JIMZ */ + +/********************************************************************* + * Main routine for performing an access. + * Accesses are retried until a DAG can not be selected. This occurs + * when either the DAG library is incomplete or there are too many + * failures in a parity group. + ********************************************************************/ +int rf_DoAccess( + RF_Raid_t *raidPtr, + RF_IoType_t type, + int async_flag, + RF_RaidAddr_t raidAddress, + RF_SectorCount_t numBlocks, + caddr_t bufPtr, + void *bp_in, + RF_DagHeader_t **paramDAG, + RF_AccessStripeMapHeader_t **paramASM, + RF_RaidAccessFlags_t flags, + RF_RaidAccessDesc_t **paramDesc, + void (*cbF)(struct buf *), + void *cbA) +/* +type should be read or write +async_flag should be RF_TRUE or RF_FALSE +bp_in is a buf pointer. void * to facilitate ignoring it outside the kernel +*/ +{ + int tid; + RF_RaidAccessDesc_t *desc; + caddr_t lbufPtr = bufPtr; +#ifdef KERNEL + struct buf *bp = (struct buf *) bp_in; +#if DFSTRACE > 0 + struct { RF_uint64 raidAddr; int numBlocks; char type;} dfsrecord; +#endif /* DFSTRACE > 0 */ +#else /* KERNEL */ + void *bp = bp_in; +#endif /* KERNEL */ + + raidAddress += rf_raidSectorOffset; + + if (!raidPtr->valid) { + RF_ERRORMSG("RAIDframe driver not successfully configured. 
Rejecting access.\n"); + IO_BUF_ERR(bp, EINVAL, raidPtr->raidid); + return(EINVAL); + } + +#if defined(KERNEL) && DFSTRACE > 0 + if (rf_DFSTraceAccesses) { + dfsrecord.raidAddr = raidAddress; + dfsrecord.numBlocks = numBlocks; + dfsrecord.type = type; + dfs_log(DFS_NOTE, (char *) &dfsrecord, sizeof(dfsrecord), 0); + } +#endif /* KERNEL && DFSTRACE > 0 */ + + rf_get_threadid(tid); + if (rf_accessDebug) { + + printf("logBytes is: %d %d %d\n",raidPtr->raidid, + raidPtr->logBytesPerSector, + (int)rf_RaidAddressToByte(raidPtr,numBlocks)); + printf("[%d] %s raidAddr %d (stripeid %d-%d) numBlocks %d (%d bytes) buf 0x%lx\n",tid, + (type==RF_IO_TYPE_READ) ? "READ":"WRITE", (int)raidAddress, + (int) rf_RaidAddressToStripeID(&raidPtr->Layout, raidAddress), + (int) rf_RaidAddressToStripeID(&raidPtr->Layout, raidAddress+numBlocks-1), + (int) numBlocks, + (int) rf_RaidAddressToByte(raidPtr,numBlocks), + (long) bufPtr); + } + + if (raidAddress + numBlocks > raidPtr->totalSectors) { + + printf("DoAccess: raid addr %lu too large to access %lu sectors. Max legal addr is %lu\n", + (u_long)raidAddress,(u_long)numBlocks,(u_long)raidPtr->totalSectors); + +#ifdef KERNEL + if (type == RF_IO_TYPE_READ) { + IO_BUF_ERR(bp, ENOSPC, raidPtr->raidid); + return(ENOSPC); + } else { + IO_BUF_ERR(bp, ENOSPC, raidPtr->raidid); + return(ENOSPC); + } +#elif defined(SIMULATE) + RF_PANIC(); +#else /* SIMULATE */ + return(EIO); +#endif /* SIMULATE */ + } + +#if !defined(KERNEL) && !defined(SIMULATE) + rf_StartThroughputStats(raidPtr); +#endif /* !KERNEL && !SIMULATE */ + + desc = rf_AllocRaidAccDesc(raidPtr, type, raidAddress, + numBlocks, lbufPtr, bp, paramDAG, paramASM, + flags, cbF, cbA, raidPtr->Layout.map->states); + + if (desc == NULL) { + return(ENOMEM); + } +#ifdef JIMZ + dest_hist[(tid*THREAD_NUMDESC)+jimz_access_num[tid]]; jimz_access_num[tid]++; +#endif /* JIMZ */ + + RF_ETIMER_START(desc->tracerec.tot_timer); + +#ifdef SIMULATE + /* simulator uses paramDesc to continue dag from test function */ + desc->async_flag=async_flag; + + *paramDesc=desc; + + return(0); +#endif /* SIMULATE */ + + rf_ContinueRaidAccess(desc); + +#ifndef KERNEL + if (!(flags & RF_DAG_NONBLOCKING_IO)) { + RF_LOCK_MUTEX(desc->mutex); + while (!(desc->flags & RF_DAG_ACCESS_COMPLETE)) { + RF_WAIT_COND(desc->cond, desc->mutex); + } + RF_UNLOCK_MUTEX(desc->mutex); + rf_FreeRaidAccDesc(desc); + } +#endif /* !KERNEL */ + + return(0); +} + +/* force the array into reconfigured mode without doing reconstruction */ +int rf_SetReconfiguredMode(raidPtr, row, col) + RF_Raid_t *raidPtr; + int row; + int col; +{ + if (!(raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) { + printf("Can't set reconfigured mode in dedicated-spare array\n"); + RF_PANIC(); + } + RF_LOCK_MUTEX(raidPtr->mutex); + raidPtr->numFailures++; + raidPtr->Disks[row][col].status = rf_ds_dist_spared; + raidPtr->status[row] = rf_rs_reconfigured; + /* install spare table only if declustering + distributed sparing architecture. 
*/ + if ( raidPtr->Layout.map->flags & RF_BD_DECLUSTERED ) + rf_InstallSpareTable(raidPtr, row, col); + RF_UNLOCK_MUTEX(raidPtr->mutex); + return(0); +} + +extern int fail_row, fail_col, fail_time; +extern int delayed_recon; + +int rf_FailDisk( + RF_Raid_t *raidPtr, + int frow, + int fcol, + int initRecon) +{ + int tid; + + rf_get_threadid(tid); + printf("[%d] Failing disk r%d c%d\n",tid,frow,fcol); + RF_LOCK_MUTEX(raidPtr->mutex); + raidPtr->numFailures++; + raidPtr->Disks[frow][fcol].status = rf_ds_failed; + raidPtr->status[frow] = rf_rs_degraded; + RF_UNLOCK_MUTEX(raidPtr->mutex); +#ifdef SIMULATE +#if RF_DEMO > 0 + if (rf_demoMode) { + rf_demo_update_mode (RF_DEMO_DEGRADED); + fail_col = fcol; fail_row = frow; + fail_time = rf_CurTime(); /* XXX */ + if (initRecon) + delayed_recon = RF_TRUE; + } + else { + if (initRecon) + rf_ReconstructFailedDisk(raidPtr, frow, fcol); + } +#else /* RF_DEMO > 0 */ + if (initRecon) + rf_ReconstructFailedDisk(raidPtr, frow, fcol); +#endif /* RF_DEMO > 0 */ +#else /* SIMULATE */ + if (initRecon) + rf_ReconstructFailedDisk(raidPtr, frow, fcol); +#endif /* SIMULATE */ + return(0); +} + +#ifdef SIMULATE +extern RF_Owner_t recon_owner; + +void rf_ScheduleContinueReconstructFailedDisk(reconDesc) + RF_RaidReconDesc_t *reconDesc; +{ + rf_DDEventRequest(rf_CurTime(), rf_ContinueReconstructFailedDisk, + (void *) reconDesc, recon_owner, -4, -4, reconDesc->raidPtr, NULL); +} +#endif /* SIMULATE */ + +/* releases a thread that is waiting for the array to become quiesced. + * access_suspend_mutex should be locked upon calling this + */ +void rf_SignalQuiescenceLock(raidPtr, reconDesc) + RF_Raid_t *raidPtr; + RF_RaidReconDesc_t *reconDesc; +{ + int tid; + + if (rf_quiesceDebug) { + rf_get_threadid(tid); + printf("[%d] Signalling quiescence lock\n", tid); + } + raidPtr->access_suspend_release = 1; + + if (raidPtr->waiting_for_quiescence) { +#ifndef SIMULATE + SIGNAL_QUIESCENT_COND(raidPtr); +#else /* !SIMULATE */ + if (reconDesc) { + rf_ScheduleContinueReconstructFailedDisk(reconDesc); + } +#endif /* !SIMULATE */ + } +} + +/* suspends all new requests to the array. No effect on accesses that are in flight. */ +int rf_SuspendNewRequestsAndWait(raidPtr) + RF_Raid_t *raidPtr; +{ + if (rf_quiesceDebug) + printf("Suspending new reqs\n"); + + RF_LOCK_MUTEX(raidPtr->access_suspend_mutex); + raidPtr->accesses_suspended++; + raidPtr->waiting_for_quiescence = (raidPtr->accs_in_flight == 0) ? 
0 : 1; + +#ifndef SIMULATE + if (raidPtr->waiting_for_quiescence) { + raidPtr->access_suspend_release=0; + while (!raidPtr->access_suspend_release) { + printf("Suspending: Waiting for Quiescence\n"); + WAIT_FOR_QUIESCENCE(raidPtr); + raidPtr->waiting_for_quiescence = 0; + } + } + printf("Quiescence reached..\n"); +#endif /* !SIMULATE */ + + RF_UNLOCK_MUTEX(raidPtr->access_suspend_mutex); + return (raidPtr->waiting_for_quiescence); +} + +/* wake up everyone waiting for quiescence to be released */ +void rf_ResumeNewRequests(raidPtr) + RF_Raid_t *raidPtr; +{ + RF_CallbackDesc_t *t, *cb; + + if (rf_quiesceDebug) + printf("Resuming new reqs\n"); + + RF_LOCK_MUTEX(raidPtr->access_suspend_mutex); + raidPtr->accesses_suspended--; + if (raidPtr->accesses_suspended == 0) + cb = raidPtr->quiesce_wait_list; + else + cb = NULL; + raidPtr->quiesce_wait_list = NULL; + RF_UNLOCK_MUTEX(raidPtr->access_suspend_mutex); + + while (cb) { + t = cb; + cb = cb->next; + (t->callbackFunc)(t->callbackArg); + rf_FreeCallbackDesc(t); + } +} + +/***************************************************************************************** + * + * debug routines + * + ****************************************************************************************/ + +static void set_debug_option(name, val) + char *name; + long val; +{ + RF_DebugName_t *p; + + for (p = rf_debugNames; p->name; p++) { + if (!strcmp(p->name, name)) { + *(p->ptr) = val; + printf("[Set debug variable %s to %ld]\n",name,val); + return; + } + } + RF_ERRORMSG1("Unknown debug string \"%s\"\n",name); +} + + +/* would like to use sscanf here, but apparently not available in kernel */ +/*ARGSUSED*/ +static void rf_ConfigureDebug(cfgPtr) + RF_Config_t *cfgPtr; +{ + char *val_p, *name_p, *white_p; + long val; + int i; + + rf_ResetDebugOptions(); + for (i=0; cfgPtr->debugVars[i][0] && i < RF_MAXDBGV; i++) { + name_p = rf_find_non_white(&cfgPtr->debugVars[i][0]); + white_p = rf_find_white(name_p); /* skip to start of 2nd word */ + val_p = rf_find_non_white(white_p); + if (*val_p == '0' && *(val_p+1) == 'x') val = rf_htoi(val_p+2); + else val = rf_atoi(val_p); + *white_p = '\0'; + set_debug_option(name_p, val); + } +} + +/* performance monitoring stuff */ + +#define TIMEVAL_TO_US(t) (((long) t.tv_sec) * 1000000L + (long) t.tv_usec) + +#if !defined(KERNEL) && !defined(SIMULATE) + +/* + * Throughput stats currently only used in user-level RAIDframe + */ + +static int rf_InitThroughputStats( + RF_ShutdownList_t **listp, + RF_Raid_t *raidPtr, + RF_Config_t *cfgPtr) +{ + int rc; + + /* these used by user-level raidframe only */ + rc = rf_create_managed_mutex(listp, &raidPtr->throughputstats.mutex); + if (rc) { + RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__, + __LINE__, rc); + return(rc); + } + raidPtr->throughputstats.sum_io_us = 0; + raidPtr->throughputstats.num_ios = 0; + raidPtr->throughputstats.num_out_ios = 0; + return(0); +} + +void rf_StartThroughputStats(RF_Raid_t *raidPtr) +{ + RF_LOCK_MUTEX(raidPtr->throughputstats.mutex); + raidPtr->throughputstats.num_ios++; + raidPtr->throughputstats.num_out_ios++; + if (raidPtr->throughputstats.num_out_ios == 1) + RF_GETTIME(raidPtr->throughputstats.start); + RF_UNLOCK_MUTEX(raidPtr->throughputstats.mutex); +} + +static void rf_StopThroughputStats(RF_Raid_t *raidPtr) +{ + struct timeval diff; + + RF_LOCK_MUTEX(raidPtr->throughputstats.mutex); + raidPtr->throughputstats.num_out_ios--; + if (raidPtr->throughputstats.num_out_ios == 0) { + RF_GETTIME(raidPtr->throughputstats.stop); + 
RF_TIMEVAL_DIFF(&raidPtr->throughputstats.start, &raidPtr->throughputstats.stop, &diff); + raidPtr->throughputstats.sum_io_us += TIMEVAL_TO_US(diff); + } + RF_UNLOCK_MUTEX(raidPtr->throughputstats.mutex); +} + +static void rf_PrintThroughputStats(RF_Raid_t *raidPtr) +{ + RF_ASSERT(raidPtr->throughputstats.num_out_ios == 0); + if ( raidPtr->throughputstats.sum_io_us != 0 ) { + printf("[Througphut: %8.2f IOs/second]\n", raidPtr->throughputstats.num_ios + / (raidPtr->throughputstats.sum_io_us / 1000000.0)); + } +} + +#endif /* !KERNEL && !SIMULATE */ + +void rf_StartUserStats(RF_Raid_t *raidPtr) +{ + RF_GETTIME(raidPtr->userstats.start); + raidPtr->userstats.sum_io_us = 0; + raidPtr->userstats.num_ios = 0; + raidPtr->userstats.num_sect_moved = 0; +} + +void rf_StopUserStats(RF_Raid_t *raidPtr) +{ + RF_GETTIME(raidPtr->userstats.stop); +} + +void rf_UpdateUserStats(raidPtr, rt, numsect) + RF_Raid_t *raidPtr; + int rt; /* resp time in us */ + int numsect; /* number of sectors for this access */ +{ + raidPtr->userstats.sum_io_us += rt; + raidPtr->userstats.num_ios++; + raidPtr->userstats.num_sect_moved += numsect; +} + +void rf_PrintUserStats(RF_Raid_t *raidPtr) +{ + long elapsed_us, mbs, mbs_frac; + struct timeval diff; + + RF_TIMEVAL_DIFF(&raidPtr->userstats.start, &raidPtr->userstats.stop, &diff); + elapsed_us = TIMEVAL_TO_US(diff); + + /* 2000 sectors per megabyte, 10000000 microseconds per second */ + if (elapsed_us) + mbs = (raidPtr->userstats.num_sect_moved / 2000) / (elapsed_us / 1000000); + else + mbs = 0; + + /* this computes only the first digit of the fractional mb/s moved */ + if (elapsed_us) { + mbs_frac = ((raidPtr->userstats.num_sect_moved / 200) / (elapsed_us / 1000000)) + - (mbs * 10); + } + else { + mbs_frac = 0; + } + + printf("Number of I/Os: %ld\n",raidPtr->userstats.num_ios); + printf("Elapsed time (us): %ld\n",elapsed_us); + printf("User I/Os per second: %ld\n",RF_DB0_CHECK(raidPtr->userstats.num_ios, (elapsed_us/1000000))); + printf("Average user response time: %ld us\n",RF_DB0_CHECK(raidPtr->userstats.sum_io_us, raidPtr->userstats.num_ios)); + printf("Total sectors moved: %ld\n",raidPtr->userstats.num_sect_moved); + printf("Average access size (sect): %ld\n",RF_DB0_CHECK(raidPtr->userstats.num_sect_moved, raidPtr->userstats.num_ios)); + printf("Achieved data rate: %ld.%ld MB/sec\n",mbs,mbs_frac); +} diff --git a/sys/dev/raidframe/rf_driver.h b/sys/dev/raidframe/rf_driver.h new file mode 100644 index 00000000000..7c9a1c4084b --- /dev/null +++ b/sys/dev/raidframe/rf_driver.h @@ -0,0 +1,126 @@ +/* $OpenBSD: rf_driver.h,v 1.1 1999/01/11 14:29:19 niklas Exp $ */ +/* $NetBSD: rf_driver.h,v 1.1 1998/11/13 04:20:29 oster Exp $ */ +/* + * rf_driver.h + */ +/* + * Copyright (c) 1996 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Jim Zelenka + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 
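A minimal standalone sketch of the fixed-point data-rate arithmetic used by rf_PrintUserStats() in rf_driver.c above, with hypothetical sample values. The 2000-sectors-per-megabyte constant assumes roughly 512-byte sectors, and since the code divides elapsed microseconds by 1000000, the "10000000 microseconds per second" in the original comment reads like a typo. Nothing below is RAIDframe code; it only mirrors the mbs/mbs_frac computation.

/*
 * Sketch only: mirrors the mbs/mbs_frac computation in
 * rf_PrintUserStats(); the totals below are made up.
 */
#include <stdio.h>

int
main(void)
{
	long num_sect_moved = 123456;	/* hypothetical sector total */
	long elapsed_us = 7300000;	/* hypothetical: 7.3 seconds */
	long mbs, mbs_frac;

	/* whole MB/s: sectors -> MB, divided by whole elapsed seconds */
	mbs = (num_sect_moved / 2000) / (elapsed_us / 1000000);
	/* first fractional digit, same computation scaled by 10 */
	mbs_frac = ((num_sect_moved / 200) / (elapsed_us / 1000000)) - (mbs * 10);

	printf("Achieved data rate: %ld.%ld MB/sec\n", mbs, mbs_frac);
	return (0);
}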
+ * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ +/* + * : + * Log: rf_driver.h,v + * Revision 1.11 1996/07/11 19:08:00 jimz + * generalize reconstruction mechanism + * allow raid1 reconstructs via copyback (done with array + * quiesced, not online, therefore not disk-directed) + * + * Revision 1.10 1996/06/10 14:18:58 jimz + * move user, throughput stats into per-array structure + * + * Revision 1.9 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.8 1996/06/05 18:06:02 jimz + * Major code cleanup. The Great Renaming is now done. + * Better modularity. Better typing. Fixed a bunch of + * synchronization bugs. Made a lot of global stuff + * per-desc or per-array. Removed dead code. + * + * Revision 1.7 1996/05/30 11:29:41 jimz + * Numerous bug fixes. Stripe lock release code disagreed with the taking code + * about when stripes should be locked (I made it consistent: no parity, no lock) + * There was a lot of extra serialization of I/Os which I've removed- a lot of + * it was to calculate values for the cache code, which is no longer with us. + * More types, function, macro cleanup. Added code to properly quiesce the array + * on shutdown. Made a lot of stuff array-specific which was (bogusly) general + * before. Fixed memory allocation, freeing bugs. + * + * Revision 1.6 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.5 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.4 1996/05/24 04:28:55 jimz + * release cleanup ckpt + * + * Revision 1.3 1996/05/24 01:59:45 jimz + * another checkpoint in code cleanup for release + * time to sync kernel tree + * + * Revision 1.2 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.1 1996/05/18 19:56:10 jimz + * Initial revision + * + */ + +#ifndef _RF__RF_DRIVER_H_ +#define _RF__RF_DRIVER_H_ + +#include "rf_threadstuff.h" +#include "rf_types.h" + +RF_DECLARE_EXTERN_MUTEX(rf_printf_mutex) + +int rf_BootRaidframe(void); +int rf_UnbootRaidframe(void); +int rf_Shutdown(RF_Raid_t *raidPtr); +int rf_Configure(RF_Raid_t *raidPtr, RF_Config_t *cfgPtr); +RF_RaidAccessDesc_t *rf_AllocRaidAccDesc(RF_Raid_t *raidPtr, RF_IoType_t type, + RF_RaidAddr_t raidAddress, RF_SectorCount_t numBlocks, caddr_t bufPtr, + void *bp, RF_DagHeader_t **paramDAG, RF_AccessStripeMapHeader_t **paramASM, + RF_RaidAccessFlags_t flags, void (*cbF)(struct buf *), void *cbA, + RF_AccessState_t *states); +void rf_FreeRaidAccDesc(RF_RaidAccessDesc_t *desc); +int rf_DoAccess(RF_Raid_t *raidPtr, RF_IoType_t type, int async_flag, + RF_RaidAddr_t raidAddress, RF_SectorCount_t numBlocks, caddr_t bufPtr, + void *bp_in, RF_DagHeader_t **paramDAG, + RF_AccessStripeMapHeader_t **paramASM, RF_RaidAccessFlags_t flags, + RF_RaidAccessDesc_t **paramDesc, void (*cbF)(struct buf *), void *cbA); +int rf_SetReconfiguredMode(RF_Raid_t *raidPtr, RF_RowCol_t row, + RF_RowCol_t col); +int rf_FailDisk(RF_Raid_t *raidPtr, RF_RowCol_t frow, RF_RowCol_t fcol, + int initRecon); +#ifdef SIMULATE 
+void rf_ScheduleContinueReconstructFailedDisk(RF_RaidReconDesc_t *reconDesc); +#endif /* SIMULATE */ +void rf_SignalQuiescenceLock(RF_Raid_t *raidPtr, RF_RaidReconDesc_t *reconDesc); +int rf_SuspendNewRequestsAndWait(RF_Raid_t *raidPtr); +void rf_ResumeNewRequests(RF_Raid_t *raidPtr); +void rf_StartThroughputStats(RF_Raid_t *raidPtr); +void rf_StartUserStats(RF_Raid_t *raidPtr); +void rf_StopUserStats(RF_Raid_t *raidPtr); +void rf_UpdateUserStats(RF_Raid_t *raidPtr, int rt, int numsect); +void rf_PrintUserStats(RF_Raid_t *raidPtr); + +#endif /* !_RF__RF_DRIVER_H_ */ diff --git a/sys/dev/raidframe/rf_engine.c b/sys/dev/raidframe/rf_engine.c new file mode 100644 index 00000000000..c99782cbed5 --- /dev/null +++ b/sys/dev/raidframe/rf_engine.c @@ -0,0 +1,1096 @@ +/* $OpenBSD: rf_engine.c,v 1.1 1999/01/11 14:29:19 niklas Exp $ */ +/* $NetBSD: rf_engine.c,v 1.2 1998/11/13 11:48:26 simonb Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: William V. Courtright II, Mark Holland, Rachad Youssef + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/**************************************************************************** + * * + * engine.c -- code for DAG execution engine * + * * + * Modified to work as follows (holland): * + * A user-thread calls into DispatchDAG, which fires off the nodes that * + * are direct successors to the header node. DispatchDAG then returns, * + * and the rest of the I/O continues asynchronously. As each node * + * completes, the node execution function calls FinishNode(). FinishNode * + * scans the list of successors to the node and increments the antecedent * + * counts. Each node that becomes enabled is placed on a central node * + * queue. A dedicated dag-execution thread grabs nodes off of this * + * queue and fires them. * + * * + * NULL nodes are never fired. * + * * + * Terminator nodes are never fired, but rather cause the callback * + * associated with the DAG to be invoked. * + * * + * If a node fails, the dag either rolls forward to the completion or * + * rolls back, undoing previously-completed nodes and fails atomically. * + * The direction of recovery is determined by the location of the failed * + * node in the graph. If the failure occured before the commit node in * + * the graph, backward recovery is used. Otherwise, forward recovery is * + * used. 
* + * * + ****************************************************************************/ + +/* + * : + * + * Log: rf_engine.c,v + * Revision 1.56 1996/07/28 20:31:39 jimz + * i386netbsd port + * true/false fixup + * + * Revision 1.55 1996/07/22 19:52:16 jimz + * switched node params to RF_DagParam_t, a union of + * a 64-bit int and a void *, for better portability + * attempted hpux port, but failed partway through for + * lack of a single C compiler capable of compiling all + * source files + * + * Revision 1.54 1996/07/17 21:00:58 jimz + * clean up timer interface, tracing + * + * Revision 1.53 1996/07/15 17:22:18 jimz + * nit-pick code cleanup + * resolve stdlib problems on DEC OSF + * + * Revision 1.52 1996/06/17 03:17:08 jimz + * correctly shut down engine thread in kernel + * + * Revision 1.51 1996/06/14 15:02:10 jimz + * make new engine code happy in simulator + * + * Revision 1.50 1996/06/14 14:19:48 jimz + * use diskgroup to control engine thread, make all engine-thread-related + * stuff per-array + * + * Revision 1.49 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). + * + * Revision 1.48 1996/06/09 02:36:46 jimz + * lots of little crufty cleanup- fixup whitespace + * issues, comment #ifdefs, improve typing in some + * places (esp size-related) + * + * Revision 1.47 1996/06/06 01:23:23 jimz + * fix bug in node traversal when firing multiple nodes simultaneously + * + * Revision 1.46 1996/06/05 18:06:02 jimz + * Major code cleanup. The Great Renaming is now done. + * Better modularity. Better typing. Fixed a bunch of + * synchronization bugs. Made a lot of global stuff + * per-desc or per-array. Removed dead code. + * + * Revision 1.45 1996/05/30 12:59:18 jimz + * make etimer happier, more portable + * + * Revision 1.44 1996/05/30 11:29:41 jimz + * Numerous bug fixes. Stripe lock release code disagreed with the taking code + * about when stripes should be locked (I made it consistent: no parity, no lock) + * There was a lot of extra serialization of I/Os which I've removed- a lot of + * it was to calculate values for the cache code, which is no longer with us. + * More types, function, macro cleanup. Added code to properly quiesce the array + * on shutdown. Made a lot of stuff array-specific which was (bogusly) general + * before. Fixed memory allocation, freeing bugs. 
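The execution model described in the engine.c header comment above (antecedent counts, a queue of newly enabled nodes, a dedicated firing thread) can be reduced to a small toy. The sketch below uses purely illustrative names and shows only the antecedent-counting part: a node becomes ready once all of its antecedents have completed. A real engine would place the ready node on the node queue for its execution thread rather than recursing directly.

/* Sketch only: illustrative names and types, not RAIDframe's. */
#include <stdio.h>

struct toy_node {
	const char       *name;
	int               numAntecedents;  /* total predecessors */
	int               numAntDone;      /* predecessors completed so far */
	int               numSuccedents;
	struct toy_node **succedents;
};

/* Called when a node completes; fires successors whose counts are full. */
static void
toy_finish_node(struct toy_node *node)
{
	int i;

	printf("completed %s\n", node->name);
	for (i = 0; i < node->numSuccedents; i++) {
		struct toy_node *s = node->succedents[i];

		if (++s->numAntDone == s->numAntecedents)
			toy_finish_node(s);  /* engine would queue this instead */
	}
}

int
main(void)
{
	struct toy_node C = { "C", 2, 0, 0, NULL };
	struct toy_node *cs[] = { &C };
	struct toy_node A = { "A", 0, 0, 1, cs };
	struct toy_node B = { "B", 0, 0, 1, cs };

	/* the direct successors of the header node get fired first */
	toy_finish_node(&A);
	toy_finish_node(&B);	/* C runs only after both A and B are done */
	return (0);
}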
+ * + * Revision 1.43 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.42 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.41 1996/05/24 04:28:55 jimz + * release cleanup ckpt + * + * Revision 1.40 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.39 1996/05/20 16:15:17 jimz + * switch to rf_{mutex,cond}_{init,destroy} + * + * Revision 1.38 1996/05/18 20:09:54 jimz + * bit of cleanup to compile cleanly in kernel, once again + * + * Revision 1.37 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.36 1996/05/15 20:24:19 wvcii + * fixed syntax bug in SIMULATE clause above ProcessNode + * + * Revision 1.35 1996/05/08 21:01:24 jimz + * fixed up enum type names that were conflicting with other + * enums and function names (ie, "panic") + * future naming trends will be towards RF_ and rf_ for + * everything raidframe-related + * + * Revision 1.34 1996/05/08 15:25:28 wvcii + * eliminated dead code + * merged common cases (sim/user/kernel) + * entire node lists (arrays) now fired atomically + * reordered source code for readability + * beefed-up & corrected comments + * + * Revision 1.33 1996/05/07 19:39:40 jimz + * 1. fixed problems in PropogateResults() with nodes being referenced + * after they were no longer valid + * 2. fixed problems in PropogateResults() with the node list being + * incorrectly threaded + * + * Revision 1.32 1996/05/07 19:03:56 wvcii + * in PropagateResults, fixed a bug in the rollBackward case: + * node data is copied before the call to FinishNode which + * frees the node and destroys its data. 
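A second detail of the engine description above worth isolating is the commit barrier: when a node fails, the direction of recovery depends only on whether any commit node has already fired, or whether the DAG has no commit nodes at all. The hedged sketch below uses illustrative names but follows the same counters that ProcessNode() consults further down.

/* Sketch only: the roll-forward/roll-backward decision in isolation. */
#include <stdio.h>

enum toy_recovery { TOY_ROLL_FORWARD, TOY_ROLL_BACKWARD };

static enum toy_recovery
toy_recovery_direction(int numCommits, int numCommitNodes)
{
	if (numCommits > 0 || numCommitNodes == 0)
		return (TOY_ROLL_FORWARD);	/* crossed the commit barrier */
	return (TOY_ROLL_BACKWARD);		/* never reached the barrier */
}

int
main(void)
{
	printf("%d\n", toy_recovery_direction(0, 3));	/* backward (1) */
	printf("%d\n", toy_recovery_direction(1, 3));	/* forward  (0) */
	return (0);
}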
+ * + * Revision 1.31 1996/05/07 17:45:17 jimz + * remove old #if 0 code from PropogateResults() (was kept in + * previous version for archival purposes (rcsdiff)) + * + * Revision 1.30 1996/05/07 17:44:19 jimz + * fix threading of nodes to be fired in PropagateResults() + * fix iteration through skiplist in PropagateResults() + * fix incorrect accesses to freed memory (dereferencing a + * node that was freed by the action of calling FinishNode() + * on it, which in turn completed its DAG) in PropagateResults() + * + * Revision 1.29 1996/05/02 15:04:15 wvcii + * fixed bad array index in PropagateResults + * + * Revision 1.28 1995/12/12 18:10:06 jimz + * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT + * fix 80-column brain damage in comments + * + * Revision 1.27 1995/12/08 15:07:03 arw + * cache code cleanup + * + * Revision 1.26 1995/11/07 16:18:01 wvcii + * numerous changes associated with roll-away error recovery + * when a node fails, dag enters rollForward or rollBackward state + * + * Revision 1.25 1995/09/06 19:27:17 wvcii + * added debug vars enableRollAway and debugRecovery + * + */ + +#ifdef _KERNEL +#define KERNEL +#endif + +#include "rf_threadstuff.h" + +#ifndef KERNEL +#include <stdio.h> +#include <stdlib.h> +#endif /* !KERNEL */ + +#include <sys/errno.h> + +#include "rf_dag.h" +#include "rf_engine.h" +#include "rf_threadid.h" +#include "rf_etimer.h" +#include "rf_general.h" +#include "rf_dagutils.h" +#include "rf_shutdown.h" +#include "rf_raid.h" + +#ifndef SIMULATE +static void DAGExecutionThread(RF_ThreadArg_t arg); +#endif /* !SIMULATE */ + +#define DO_INIT(_l_,_r_) { \ + int _rc; \ + _rc = rf_create_managed_mutex(_l_,&(_r_)->node_queue_mutex); \ + if (_rc) { \ + return(_rc); \ + } \ + _rc = rf_create_managed_cond(_l_,&(_r_)->node_queue_cond); \ + if (_rc) { \ + return(_rc); \ + } \ +} + +/* synchronization primitives for this file. DO_WAIT should be enclosed in a while loop. */ +#ifndef KERNEL + +#define DO_LOCK(_r_) RF_LOCK_MUTEX((_r_)->node_queue_mutex) +#define DO_UNLOCK(_r_) RF_UNLOCK_MUTEX((_r_)->node_queue_mutex) +#define DO_WAIT(_r_) RF_WAIT_COND((_r_)->node_queue_cond, (_r_)->node_queue_mutex) +#define DO_SIGNAL(_r_) RF_SIGNAL_COND((_r_)->node_queue_cond) + +#else /* !KERNEL */ + +/* + * XXX Is this spl-ing really necessary? + */ +#define DO_LOCK(_r_) { ks = splbio(); RF_LOCK_MUTEX((_r_)->node_queue_mutex); } +#define DO_UNLOCK(_r_) { RF_UNLOCK_MUTEX((_r_)->node_queue_mutex); splx(ks); } +#if !defined(__NetBSD__) && !defined(__OpenBSD__) +#define DO_WAIT(_r_) mpsleep(&(_r_)->node_queue, PZERO, "raidframe nq", 0, (void *) simple_lock_addr((_r_)->node_queue_mutex), MS_LOCK_SIMPLE) +#else +#define DO_WAIT(_r_) tsleep(&(_r_)->node_queue, PRIBIO | PCATCH, "raidframe nq",0) +#endif +#define DO_SIGNAL(_r_) wakeup(&(_r_)->node_queue) + +#endif /* !KERNEL */ + +static void rf_ShutdownEngine(void *); + +static void rf_ShutdownEngine(arg) + void *arg; +{ + RF_Raid_t *raidPtr; + + raidPtr = (RF_Raid_t *)arg; +#ifndef SIMULATE + raidPtr->shutdown_engine = 1; + DO_SIGNAL(raidPtr); + /* XXX something is missing here... 
*/ +#ifdef DEBUG + printf("IGNORING WAIT_STOP\n"); +#endif +#if 0 + RF_THREADGROUP_WAIT_STOP(&raidPtr->engine_tg); +#endif +#endif /* !SIMULATE */ +} + +int rf_ConfigureEngine( + RF_ShutdownList_t **listp, + RF_Raid_t *raidPtr, + RF_Config_t *cfgPtr) +{ + int rc, tid=0; + + if (rf_engineDebug) { + rf_get_threadid(tid); + } + + DO_INIT(listp,raidPtr); + + raidPtr->node_queue = NULL; + raidPtr->dags_in_flight = 0; + +#ifndef SIMULATE + rc = rf_init_managed_threadgroup(listp, &raidPtr->engine_tg); + if (rc) + return(rc); + + /* we create the execution thread only once per system boot. + * no need to check return code b/c the kernel panics if it can't create the thread. + */ + if (rf_engineDebug) { + printf("[%d] Creating engine thread\n", tid); + } + + if (RF_CREATE_THREAD(raidPtr->engine_thread, DAGExecutionThread, raidPtr)) { + RF_ERRORMSG("RAIDFRAME: Unable to create engine thread\n"); + return(ENOMEM); + } + if (rf_engineDebug) { + printf("[%d] Created engine thread\n", tid); + } + RF_THREADGROUP_STARTED(&raidPtr->engine_tg); + /* XXX something is missing here... */ +#ifdef debug + printf("Skipping the WAIT_START!!\n"); +#endif +#if 0 + RF_THREADGROUP_WAIT_START(&raidPtr->engine_tg); +#endif + /* engine thread is now running and waiting for work */ + if (rf_engineDebug) { + printf("[%d] Engine thread running and waiting for events\n", tid); + } +#endif /* !SIMULATE */ + + rc = rf_ShutdownCreate(listp, rf_ShutdownEngine, raidPtr); + if (rc) { + RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n", __FILE__, + __LINE__, rc); + rf_ShutdownEngine(NULL); + } + + return(rc); +} + +static int BranchDone(RF_DagNode_t *node) +{ + int i; + + /* return true if forward execution is completed for a node and it's succedents */ + switch (node->status) { + case rf_wait : + /* should never be called in this state */ + RF_PANIC(); + break; + case rf_fired : + /* node is currently executing, so we're not done */ + return(RF_FALSE); + case rf_good : + for (i = 0; i < node->numSuccedents; i++) /* for each succedent */ + if (!BranchDone(node->succedents[i])) /* recursively check branch */ + return RF_FALSE; + return RF_TRUE; /* node and all succedent branches aren't in fired state */ + break; + case rf_bad : + /* succedents can't fire */ + return(RF_TRUE); + case rf_recover : + /* should never be called in this state */ + RF_PANIC(); + break; + case rf_undone : + case rf_panic : + /* XXX need to fix this case */ + /* for now, assume that we're done */ + return(RF_TRUE); + break; + default : + /* illegal node status */ + RF_PANIC(); + break; + } +} + +#ifdef SIMULATE +/* this is only ifdef SIMULATE because nothing else needs it */ +/* recursively determine if a DAG has completed execution */ +static int DAGDone(RF_DagHeader_t *dag) +{ + int i; + + for (i = 0; i < dag->numSuccedents; i++) + if (!BranchDone(dag->succedents[i])) + return RF_FALSE; + return RF_TRUE; +} +#endif /* SIMULATE */ + +static int NodeReady(RF_DagNode_t *node) +{ + int ready; + + switch (node->dagHdr->status) { + case rf_enable : + case rf_rollForward : + if ((node->status == rf_wait) && (node->numAntecedents == node->numAntDone)) + ready = RF_TRUE; + else + ready = RF_FALSE; + break; + case rf_rollBackward : + RF_ASSERT(node->numSuccDone <= node->numSuccedents); + RF_ASSERT(node->numSuccFired <= node->numSuccedents); + RF_ASSERT(node->numSuccFired <= node->numSuccDone); + if ((node->status == rf_good) && (node->numSuccDone == node->numSuccedents)) + ready = RF_TRUE; + else + ready = RF_FALSE; + break; + default : + 
printf("Execution engine found illegal DAG status in NodeReady\n"); + RF_PANIC(); + break; + } + + return(ready); +} + + + +/* user context and dag-exec-thread context: + * Fire a node. The node's status field determines which function, do or undo, + * to be fired. + * This routine assumes that the node's status field has alread been set to + * "fired" or "recover" to indicate the direction of execution. + */ +static void FireNode(RF_DagNode_t *node) +{ + int tid; + + switch (node->status) { + case rf_fired : + /* fire the do function of a node */ + if (rf_engineDebug) { + rf_get_threadid(tid); + printf("[%d] Firing node 0x%lx (%s)\n",tid,(unsigned long) node, node->name); + } +#ifdef KERNEL + if (node->flags & RF_DAGNODE_FLAG_YIELD) { +#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL) + /* thread_block(); */ + /* printf("Need to block the thread here...\n"); */ + /* XXX thread_block is actually mentioned in + /usr/include/vm/vm_extern.h */ +#else + thread_block(); +#endif + } +#endif /* KERNEL */ + (*(node->doFunc)) (node); + break; + case rf_recover : + /* fire the undo function of a node */ + if (rf_engineDebug || 1) { + rf_get_threadid(tid); + printf("[%d] Firing (undo) node 0x%lx (%s)\n",tid,(unsigned long) node, node->name); + } +#ifdef KERNEL + if (node->flags & RF_DAGNODE_FLAG_YIELD) +#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL) + /* thread_block(); */ + /* printf("Need to block the thread here...\n"); */ + /* XXX thread_block is actually mentioned in + /usr/include/vm/vm_extern.h */ +#else + thread_block(); +#endif +#endif /* KERNEL */ + (*(node->undoFunc)) (node); + break; + default : + RF_PANIC(); + break; + } +} + + + +/* user context: + * Attempt to fire each node in a linear array. + * The entire list is fired atomically. + */ +static void FireNodeArray( + int numNodes, + RF_DagNode_t **nodeList) +{ + RF_DagStatus_t dstat; + RF_DagNode_t *node; + int i, j; + + /* first, mark all nodes which are ready to be fired */ + for (i = 0; i < numNodes; i++) { + node = nodeList[i]; + dstat = node->dagHdr->status; + RF_ASSERT((node->status == rf_wait) || (node->status == rf_good)); + if (NodeReady(node)) { + if ((dstat == rf_enable) || (dstat == rf_rollForward)) { + RF_ASSERT(node->status == rf_wait); + if (node->commitNode) + node->dagHdr->numCommits++; + node->status = rf_fired; + for (j = 0; j < node->numAntecedents; j++) + node->antecedents[j]->numSuccFired++; + } + else { + RF_ASSERT(dstat == rf_rollBackward); + RF_ASSERT(node->status == rf_good); + RF_ASSERT(node->commitNode == RF_FALSE); /* only one commit node per graph */ + node->status = rf_recover; + } + } + } + /* now, fire the nodes */ + for (i = 0; i < numNodes; i++) { + if ((nodeList[i]->status == rf_fired) || (nodeList[i]->status == rf_recover)) + FireNode(nodeList[i]); + } +} + + +#ifndef SIMULATE +/* user context: + * Attempt to fire each node in a linked list. + * The entire list is fired atomically. 
+ */ +static void FireNodeList(RF_DagNode_t *nodeList) +{ + RF_DagNode_t *node, *next; + RF_DagStatus_t dstat; + int j; + + if (nodeList) { + /* first, mark all nodes which are ready to be fired */ + for (node = nodeList; node; node = next) { + next = node->next; + dstat = node->dagHdr->status; + RF_ASSERT((node->status == rf_wait) || (node->status == rf_good)); + if (NodeReady(node)) { + if ((dstat == rf_enable) || (dstat == rf_rollForward)) { + RF_ASSERT(node->status == rf_wait); + if (node->commitNode) + node->dagHdr->numCommits++; + node->status = rf_fired; + for (j = 0; j < node->numAntecedents; j++) + node->antecedents[j]->numSuccFired++; + } + else { + RF_ASSERT(dstat == rf_rollBackward); + RF_ASSERT(node->status == rf_good); + RF_ASSERT(node->commitNode == RF_FALSE); /* only one commit node per graph */ + node->status = rf_recover; + } + } + } + /* now, fire the nodes */ + for (node = nodeList; node; node = next) { + next = node->next; + if ((node->status == rf_fired) || (node->status == rf_recover)) + FireNode(node); + } + } +} +#endif /* !SIMULATE */ + + + +/* interrupt context: + * for each succedent + * propagate required results from node to succedent + * increment succedent's numAntDone + * place newly-enable nodes on node queue for firing + * + * To save context switches, we don't place NIL nodes on the node queue, + * but rather just process them as if they had fired. Note that NIL nodes + * that are the direct successors of the header will actually get fired by + * DispatchDAG, which is fine because no context switches are involved. + * + * Important: when running at user level, this can be called by any + * disk thread, and so the increment and check of the antecedent count + * must be locked. I used the node queue mutex and locked down the + * entire function, but this is certainly overkill. 
+ */ +static void PropagateResults( + RF_DagNode_t *node, + int context) +{ + RF_DagNode_t *s, *a; + RF_Raid_t *raidPtr; + int tid, i, ks; +#ifdef SIMULATE + RF_PropHeader_t *p; /* prop list for succedent i */ +#else /* SIMULATE */ + RF_DagNode_t *finishlist = NULL; /* a list of NIL nodes to be finished */ + RF_DagNode_t *skiplist = NULL; /* list of nodes with failed truedata antecedents */ + RF_DagNode_t *firelist = NULL; /* a list of nodes to be fired */ + RF_DagNode_t *q = NULL, *qh = NULL, *next; + int j, skipNode; +#endif /* SIMULATE */ + + rf_get_threadid(tid); + + raidPtr = node->dagHdr->raidPtr; + + DO_LOCK(raidPtr); + + /* debug - validate fire counts */ + for (i = 0; i < node->numAntecedents; i++) { + a = *(node->antecedents + i); + RF_ASSERT(a->numSuccFired >= a->numSuccDone); + RF_ASSERT(a->numSuccFired <= a->numSuccedents); + a->numSuccDone++; + } + + switch (node->dagHdr->status) { + case rf_enable : + case rf_rollForward : +#ifdef SIMULATE + /* currently we never propagate results unless in simulation */ + for (i = 0; i < node->numSuccedents; i++) { + s = *(node->succedents + i); + RF_ASSERT(s->status == rf_wait); + (s->numAntDone)++; + if (node->propList == NULL) + /* null propList implies no results to be propagated */ + p = NULL; + else + /* p=head of prop list for succedent i */ + p = *(node->propList + i); + while (p != NULL) { + /* bind node results to succedent's parameters */ +#if 0 + *(s->params + p->paramNum) = *(node->results + p->resultNum); +#else + s->params[p->paramNum].p = node->results[p->resultNum]; +#endif + p = p->next; + } + } +#else /* SIMULATE */ + for (i = 0; i < node->numSuccedents; i++) { + s = *(node->succedents + i); + RF_ASSERT(s->status == rf_wait); + (s->numAntDone)++; + if (s->numAntDone == s->numAntecedents) { + /* look for NIL nodes */ + if (s->doFunc == rf_NullNodeFunc) { + /* don't fire NIL nodes, just process them */ + s->next = finishlist; + finishlist = s; + } + else { + /* look to see if the node is to be skipped */ + skipNode = RF_FALSE; + for (j = 0; j < s->numAntecedents; j++) + if ((s->antType[j] == rf_trueData) && (s->antecedents[j]->status == rf_bad)) + skipNode = RF_TRUE; + if (skipNode) { + /* this node has one or more failed true data dependencies, so skip it */ + s->next = skiplist; + skiplist = s; + } + else + /* add s to list of nodes (q) to execute */ + if (context != RF_INTR_CONTEXT) { + /* we only have to enqueue if we're at intr context */ + s->next = firelist; /* put node on a list to be fired after we unlock */ + firelist = s; + } else { /* enqueue the node for the dag exec thread to fire */ + RF_ASSERT(NodeReady(s)); + if (q) { + q->next = s; + q = s; + } + else { + qh = q = s; + qh->next = NULL; + } + } + } + } + } + + if (q) { + /* xfer our local list of nodes to the node queue */ + q->next = raidPtr->node_queue; raidPtr->node_queue = qh; + DO_SIGNAL(raidPtr); + } + DO_UNLOCK(raidPtr); + + for (; skiplist; skiplist = next) { + next = skiplist->next; + skiplist->status = rf_skipped; + for (i = 0; i < skiplist->numAntecedents; i++) { + skiplist->antecedents[i]->numSuccFired++; + } + if (skiplist->commitNode) { + skiplist->dagHdr->numCommits++; + } + rf_FinishNode(skiplist, context); + } + for (; finishlist; finishlist = next) { + /* NIL nodes: no need to fire them */ + next = finishlist->next; + finishlist->status = rf_good; + for (i = 0; i < finishlist->numAntecedents; i++) { + finishlist->antecedents[i]->numSuccFired++; + } + if (finishlist->commitNode) + finishlist->dagHdr->numCommits++; + /* + * Okay, here we're 
calling rf_FinishNode() on nodes that + * have the null function as their work proc. Such a node + * could be the terminal node in a DAG. If so, it will + * cause the DAG to complete, which will in turn free + * memory used by the DAG, which includes the node in + * question. Thus, we must avoid referencing the node + * at all after calling rf_FinishNode() on it. + */ + rf_FinishNode(finishlist, context); /* recursive call */ + } + /* fire all nodes in firelist */ + FireNodeList(firelist); +#endif /* SIMULATE */ + break; + + case rf_rollBackward : +#ifdef SIMULATE +#else /* SIMULATE */ + for (i = 0; i < node->numAntecedents; i++) { + a = *(node->antecedents + i); + RF_ASSERT(a->status == rf_good); + RF_ASSERT(a->numSuccDone <= a->numSuccedents); + RF_ASSERT(a->numSuccDone <= a->numSuccFired); + + if (a->numSuccDone == a->numSuccFired) { + if (a->undoFunc == rf_NullNodeFunc) { + /* don't fire NIL nodes, just process them */ + a->next = finishlist; + finishlist = a; + } else { + if (context != RF_INTR_CONTEXT) { + /* we only have to enqueue if we're at intr context */ + a->next = firelist; /* put node on a list to be fired after we unlock */ + firelist = a; + } else { /* enqueue the node for the dag exec thread to fire */ + RF_ASSERT(NodeReady(a)); + if (q) { + q->next = a; + q = a; + } + else { + qh = q = a; + qh->next = NULL; + } + } + } + } + + } + if (q) { + /* xfer our local list of nodes to the node queue */ + q->next = raidPtr->node_queue; raidPtr->node_queue = qh; + DO_SIGNAL(raidPtr); + } + DO_UNLOCK(raidPtr); + for (; finishlist; finishlist = next) { /* NIL nodes: no need to fire them */ + next = finishlist->next; + finishlist->status = rf_good; + /* + * Okay, here we're calling rf_FinishNode() on nodes that + * have the null function as their work proc. Such a node + * could be the first node in a DAG. If so, it will + * cause the DAG to complete, which will in turn free + * memory used by the DAG, which includes the node in + * question. Thus, we must avoid referencing the node + * at all after calling rf_FinishNode() on it. + */ + rf_FinishNode(finishlist, context); /* recursive call */ + } + /* fire all nodes in firelist */ + FireNodeList(firelist); +#endif /* SIMULATE */ + + break; + default : + printf("Engine found illegal DAG status in PropagateResults()\n"); + RF_PANIC(); + break; + } +} + + + +/* + * Process a fired node which has completed + */ +static void ProcessNode( + RF_DagNode_t *node, + int context) +{ + RF_Raid_t *raidPtr; + int tid; + + raidPtr = node->dagHdr->raidPtr; + + switch (node->status) { + case rf_good : + /* normal case, don't need to do anything */ + break; + case rf_bad : + if ((node->dagHdr->numCommits > 0) || (node->dagHdr->numCommitNodes == 0)) { + node->dagHdr->status = rf_rollForward; /* crossed commit barrier */ + if (rf_engineDebug || 1) { + rf_get_threadid(tid); + printf("[%d] node (%s) returned fail, rolling forward\n", tid, node->name); + } + } + else { + node->dagHdr->status = rf_rollBackward; /* never reached commit barrier */ + if (rf_engineDebug || 1) { + rf_get_threadid(tid); + printf("[%d] node (%s) returned fail, rolling backward\n", tid, node->name); + } + } + break; + case rf_undone : + /* normal rollBackward case, don't need to do anything */ + break; + case rf_panic : + /* an undo node failed!!! */ + printf("UNDO of a node failed!!!/n"); + break; + default : + printf("node finished execution with an illegal status!!!\n"); + RF_PANIC(); + break; + } + +#ifdef SIMULATE + /* simulator fires nodes here. 
+ * user/kernel rely upon PropagateResults to do this. + * XXX seems like this code should be merged so that the same thing happens for + * both sim, user, and kernel. -wvcii + */ + switch (node->dagHdr->status) { + case rf_enable : + case rf_rollForward : + if (node->numSuccedents == 0) { + /* process terminal node */ + if (rf_engineDebug) if (!DAGDone(node->dagHdr)) { + rf_get_threadid(tid); + printf("[%d] ProcessNode: !!!done but dag still in flight\n",tid); + RF_PANIC(); + } + if (rf_engineDebug) printf("[%d] ProcessNode: !!!done will return true\n",tid); + /* Mark dag as done */ + (node->dagHdr)->done=RF_TRUE; + raidPtr->dags_in_flight--; + } + else { + PropagateResults(node, context); + FireNodeArray(node->numSuccedents, node->succedents); + } + break; + case rf_rollBackward : + if (node->numAntecedents == 0) { + /* reached head of dag, we're done */ + if (rf_engineDebug) if (!DAGDone(node->dagHdr)) { + rf_get_threadid(tid); + printf("[%d] ProcessNode: !!!done but dag still in flight\n",tid); + RF_PANIC(); + } + if (rf_engineDebug) printf("[%d] ProcessNode: !!!done will return true\n",tid); + /* Mark dag as done */ + (node->dagHdr)->done=RF_TRUE; + raidPtr->dags_in_flight--; + } + else { + PropagateResults(node, context); + FireNodeArray(node->numAntecedents, node->antecedents); + } + break; + default : + RF_PANIC(); + break; + } + + +#else /* SIMULATE */ + /* enqueue node's succedents (antecedents if rollBackward) for execution */ + PropagateResults(node, context); +#endif /* SIMULATE */ +} + + + +/* user context or dag-exec-thread context: + * This is the first step in post-processing a newly-completed node. + * This routine is called by each node execution function to mark the node + * as complete and fire off any successors that have been enabled. + */ +int rf_FinishNode( + RF_DagNode_t *node, + int context) +{ + /* as far as I can tell, retcode is not used -wvcii */ + int retcode = RF_FALSE; + node->dagHdr->numNodesCompleted++; + ProcessNode(node, context); + +#ifdef SIMULATE + if ((node->dagHdr)->done == RF_TRUE) + retcode = RF_TRUE; +#endif /* SIMULATE */ + + return(retcode); +} + + +/* user context: + * submit dag for execution, return non-zero if we have to wait for completion. + * if and only if we return non-zero, we'll cause cbFunc to get invoked with + * cbArg when the DAG has completed. + * + * for now we always return 1. If the DAG does not cause any I/O, then the callback + * may get invoked before DispatchDAG returns. There's code in state 5 of ContinueRaidAccess + * to handle this. + * + * All we do here is fire the direct successors of the header node. The + * DAG execution thread does the rest of the dag processing. + */ +int rf_DispatchDAG( + RF_DagHeader_t *dag, + void (*cbFunc)(void *), + void *cbArg) +{ + RF_Raid_t *raidPtr; + int tid; + + raidPtr = dag->raidPtr; + if (dag->tracerec) { + RF_ETIMER_START(dag->tracerec->timer); + } + + if (rf_engineDebug || rf_validateDAGDebug) { + if (rf_ValidateDAG(dag)) + RF_PANIC(); + } + if (rf_engineDebug) { + rf_get_threadid(tid); + printf("[%d] Entering DispatchDAG\n",tid); + } + + raidPtr->dags_in_flight++; /* debug only: blow off proper locking */ + dag->cbFunc = cbFunc; + dag->cbArg = cbArg; + dag->numNodesCompleted = 0; + dag->status = rf_enable; + FireNodeArray(dag->numSuccedents, dag->succedents); + return(1); +} + +/* dedicated kernel thread: + * the thread that handles all DAG node firing. 
+ * To minimize locking and unlocking, we grab a copy of the entire node queue and then set the + * node queue to NULL before doing any firing of nodes. This way we only have to release the + * lock once. Of course, it's probably rare that there's more than one node in the queue at + * any one time, but it sometimes happens. + * + * In the kernel, this thread runs at spl0 and is not swappable. I copied these + * characteristics from the aio_completion_thread. + */ + +#ifndef SIMULATE +static void DAGExecutionThread(RF_ThreadArg_t arg) +{ + RF_DagNode_t *nd, *local_nq, *term_nq, *fire_nq; + RF_Raid_t *raidPtr; + int ks, tid; + int s; +#if !defined(__NetBSD__) && !defined(__OpenBSD__) + RF_Thread_t thread; +#endif + + raidPtr = (RF_Raid_t *)arg; + + rf_assign_threadid(); + if (rf_engineDebug) { + rf_get_threadid(tid); + printf("[%d] Engine thread is running\n", tid); + } + +#ifdef KERNEL +#if !defined(__NetBSD__) && !defined(__OpenBSD__) + thread = current_thread(); + thread_swappable(thread, RF_FALSE); + thread->priority = thread->sched_pri = BASEPRI_SYSTEM; + s = spl0(); +#endif + /* XXX what to put here XXX */ + + s=splbio(); + +#endif /* KERNEL */ + + RF_THREADGROUP_RUNNING(&raidPtr->engine_tg); + + DO_LOCK(raidPtr); + while (!raidPtr->shutdown_engine) { + + while (raidPtr->node_queue != NULL) { + local_nq = raidPtr->node_queue; + fire_nq = NULL; + term_nq = NULL; + raidPtr->node_queue = NULL; + DO_UNLOCK(raidPtr); + + /* first, strip out the terminal nodes */ + while (local_nq) { + nd = local_nq; + local_nq = local_nq->next; + switch(nd->dagHdr->status) { + case rf_enable : + case rf_rollForward : + if (nd->numSuccedents == 0) { + /* end of the dag, add to callback list */ + nd->next = term_nq; + term_nq = nd; + } + else { + /* not the end, add to the fire queue */ + nd->next = fire_nq; + fire_nq = nd; + } + break; + case rf_rollBackward : + if (nd->numAntecedents == 0) { + /* end of the dag, add to the callback list */ + nd->next = term_nq; + term_nq = nd; + } + else { + /* not the end, add to the fire queue */ + nd->next = fire_nq; + fire_nq = nd; + } + break; + default : + RF_PANIC(); + break; + } + } + + /* execute callback of dags which have reached the terminal node */ + while (term_nq) { + nd = term_nq; + term_nq = term_nq->next; + nd->next = NULL; + (nd->dagHdr->cbFunc)(nd->dagHdr->cbArg); + raidPtr->dags_in_flight--; /* debug only */ + } + + /* fire remaining nodes */ + FireNodeList(fire_nq); + + DO_LOCK(raidPtr); + } + while (!raidPtr->shutdown_engine && raidPtr->node_queue == NULL) + DO_WAIT(raidPtr); + } + DO_UNLOCK(raidPtr); + + RF_THREADGROUP_DONE(&raidPtr->engine_tg); +#ifdef KERNEL +#if defined(__NetBSD__) || defined(__OpenBSD__) + splx(s); + kthread_exit(0); +#else + splx(s); + thread_terminate(thread); + thread_halt_self(); +#endif +#endif /* KERNEL */ +} + +#endif /* !SIMULATE */ diff --git a/sys/dev/raidframe/rf_engine.h b/sys/dev/raidframe/rf_engine.h new file mode 100644 index 00000000000..c3186aa791f --- /dev/null +++ b/sys/dev/raidframe/rf_engine.h @@ -0,0 +1,75 @@ +/* $OpenBSD: rf_engine.h,v 1.1 1999/01/11 14:29:19 niklas Exp $ */ +/* $NetBSD: rf_engine.h,v 1.1 1998/11/13 04:20:29 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: William V. 
Courtright II, Mark Holland, Jim Zelenka + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/********************************************************** + * * + * engine.h -- header file for execution engine functions * + * * + **********************************************************/ + +/* : + * Log: rf_engine.h,v + * Revision 1.11 1996/06/14 14:16:22 jimz + * new decl of ConfigureEngine + * + * Revision 1.10 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). + * + * Revision 1.9 1996/05/30 12:59:18 jimz + * make etimer happier, more portable + * + * Revision 1.8 1996/05/24 04:28:55 jimz + * release cleanup ckpt + * + * Revision 1.7 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.6 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.5 1995/12/01 18:12:17 root + * added copyright info + * + */ + +#ifndef _RF__RF_ENGINE_H_ +#define _RF__RF_ENGINE_H_ + +int rf_ConfigureEngine(RF_ShutdownList_t **listp, + RF_Raid_t *raidPtr, RF_Config_t *cfgPtr); + +int rf_FinishNode(RF_DagNode_t *node, int context); /* return finished node to engine */ + +int rf_DispatchDAG(RF_DagHeader_t *dag, void (*cbFunc)(void *), void *cbArg); /* execute dag */ + +#endif /* !_RF__RF_ENGINE_H_ */ diff --git a/sys/dev/raidframe/rf_etimer.h b/sys/dev/raidframe/rf_etimer.h new file mode 100644 index 00000000000..5d78b80eac2 --- /dev/null +++ b/sys/dev/raidframe/rf_etimer.h @@ -0,0 +1,353 @@ +/* $OpenBSD: rf_etimer.h,v 1.1 1999/01/11 14:29:20 niklas Exp $ */ +/* $NetBSD: rf_etimer.h,v 1.1 1998/11/13 04:20:29 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 
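Returning to DAGExecutionThread() in rf_engine.c above: its comment notes that the thread grabs the whole node queue and sets it to NULL before firing anything, so the lock is taken and released once per batch rather than once per node. Below is a minimal pthread-based sketch of that drain-the-queue-under-one-lock pattern with made-up work items; none of it is RAIDframe code, and the names are invented for illustration.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

struct work {
	struct work *next;
	int          id;
};

static pthread_mutex_t q_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  q_cond = PTHREAD_COND_INITIALIZER;
static struct work    *queue;
static int             shutting_down;

static void *
worker(void *arg)
{
	struct work *batch, *w;

	(void)arg;
	pthread_mutex_lock(&q_mutex);
	while (!shutting_down) {
		while (queue != NULL) {
			batch = queue;		/* grab the whole queue ... */
			queue = NULL;		/* ... and empty it */
			pthread_mutex_unlock(&q_mutex);

			/* fire the batch without holding the lock */
			while ((w = batch) != NULL) {
				batch = w->next;
				printf("firing work item %d\n", w->id);
				free(w);
			}
			pthread_mutex_lock(&q_mutex);
		}
		while (!shutting_down && queue == NULL)
			pthread_cond_wait(&q_cond, &q_mutex);
	}
	pthread_mutex_unlock(&q_mutex);
	return (NULL);
}

int
main(void)
{
	pthread_t t;
	int i;

	pthread_create(&t, NULL, worker, NULL);
	for (i = 0; i < 3; i++) {
		struct work *w = malloc(sizeof(*w));

		w->id = i;
		pthread_mutex_lock(&q_mutex);
		w->next = queue;
		queue = w;
		pthread_cond_signal(&q_cond);
		pthread_mutex_unlock(&q_mutex);
	}
	sleep(1);			/* let the worker drain the queue */
	pthread_mutex_lock(&q_mutex);
	shutting_down = 1;
	pthread_cond_signal(&q_cond);
	pthread_mutex_unlock(&q_mutex);
	pthread_join(t, NULL);
	return (0);
}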
+ * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* rf_etimer.h -- header file for code related to accurate timing + * This code currently assumes that the elapsed time between START_TIMER + * and START_TIMER is less than the period of the cycle counter. This + * means the events you want to time must be less than: + * clock speed max time + * ---------- -------- + * 175 MHz 24 sec + * 150 MHz 28 sec + * 125 MHz 34 sec + * + * + * : + * Log: rf_etimer.h,v + * Revision 1.32 1996/08/13 18:11:09 jimz + * want MACH&&!__osf__, not just MACH for mach timing (MACH defined under OSF/1) + * + * Revision 1.31 1996/08/12 20:11:38 jimz + * use read_real_time() on AIX4+ + * + * Revision 1.30 1996/08/09 18:48:12 jimz + * for now, use gettimeofday() on MACH + * (should eventually use better clock stuff) + * + * Revision 1.29 1996/08/07 21:09:08 jimz + * add IRIX as a gettimeofday system + * + * Revision 1.28 1996/08/06 22:25:23 jimz + * add LINUX_I386 + * + * Revision 1.27 1996/07/30 04:45:53 jimz + * add ultrix stuff + * + * Revision 1.26 1996/07/28 20:31:39 jimz + * i386netbsd port + * true/false fixup + * + * Revision 1.25 1996/07/27 23:36:08 jimz + * Solaris port of simulator + * + * Revision 1.24 1996/07/27 18:40:24 jimz + * cleanup sweep + * + * Revision 1.23 1996/07/22 19:52:16 jimz + * switched node params to RF_DagParam_t, a union of + * a 64-bit int and a void *, for better portability + * attempted hpux port, but failed partway through for + * lack of a single C compiler capable of compiling all + * source files + * + * Revision 1.22 1996/07/18 22:57:14 jimz + * port simulator to AIX + * + * Revision 1.21 1996/07/17 21:00:58 jimz + * clean up timer interface, tracing + * + * Revision 1.20 1996/07/17 14:26:28 jimz + * rf_scc -> rf_rpcc + * + * Revision 1.19 1996/06/14 21:24:48 jimz + * move out ConfigureEtimer + * + * Revision 1.18 1996/06/03 23:28:26 jimz + * more bugfixes + * check in tree to sync for IPDS runs with current bugfixes + * there still may be a problem with threads in the script test + * getting I/Os stuck- not trivially reproducible (runs ~50 times + * in a row without getting stuck) + * + * Revision 1.17 1996/05/30 23:22:16 jimz + * bugfixes of serialization, timing problems + * more cleanup + * + * Revision 1.16 1996/05/30 12:59:18 jimz + * make etimer happier, more portable + * + * Revision 1.15 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.14 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.13 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.12 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.11 1995/12/01 18:10:40 root + * added copyright info + * + * Revision 1.10 1995/09/29 14:27:32 wvcii + * removed printfs from ConfigureEtimer() + * + * Revision 1.9 95/09/19 22:57:31 jimz + * added kernel version of ConfigureEtimer + * + * Revision 1.8 1995/09/14 13:03:04 amiri + * 
set default CPU speed to 125Mhz to avoid divide by zero problems. + * + * Revision 1.7 1995/09/11 19:04:36 wvcii + * timer autoconfigs using pdl routine to check cpu speed + * value may still be overridden via config debug var timerTicksPerSec + * + */ + + +#ifndef _RF__RF_TIMER_H_ +#define _RF__RF_TIMER_H_ + +#include "rf_options.h" + +#ifdef _KERNEL +#define KERNEL +#endif + +#if defined(__NetBSD__) || defined(__OpenBSD__) + +#ifdef KERNEL +extern unsigned int rpcc(void); +#define rf_read_cycle_counter rpcc +#else /* KERNEL */ +#if !defined(__NetBSD__) && !defined(__OpenBSD__) +/* XXX does this function even exist anywhere??? GO */ +extern unsigned int rf_rpcc(); +#endif +#define rf_read_cycle_counter rf_rpcc +#endif /* KERNEL */ + +#define RF_DEF_TIMER_MAX_VAL 0xFFFFFFFF + +typedef struct RF_EtimerVal_s { + unsigned ccnt; /* cycle count */ +} RF_EtimerVal_t; + +struct RF_Etimer_s { + RF_EtimerVal_t st; + RF_EtimerVal_t et; + unsigned long ticks; /* elapsed time in ticks */ +}; + +extern long rf_timer_max_val; +extern long rf_timer_ticks_per_second; +extern unsigned long rf_timer_ticks_per_usec; + +#define RF_ETIMER_TICKS2US(_tcks_) ( (_tcks_) / rf_timer_ticks_per_usec ) +#define RF_ETIMER_START(_t_) { (_t_).st.ccnt = rf_read_cycle_counter(); } +#define RF_ETIMER_STOP(_t_) { (_t_).et.ccnt = rf_read_cycle_counter(); } +#define RF_ETIMER_EVAL(_t_) { \ + if ((_t_).st.ccnt < (_t_).et.ccnt) \ + (_t_).ticks = (_t_).et.ccnt - (_t_).st.ccnt; \ + else \ + (_t_).ticks = rf_timer_max_val - ((_t_).st.ccnt - (_t_).et.ccnt); \ +} + +#define RF_ETIMER_VAL_TICKS(_t_) ((_t_).ticks) +#define RF_ETIMER_VAL_US(_t_) (RF_ETIMER_TICKS2US((_t_).ticks)) +#define RF_ETIMER_VAL_MS(_t_) (RF_ETIMER_TICKS2US((_t_).ticks)/1000) + +#endif /* __NetBSD__ || __OpenBSD__ */ + + +#if defined(__alpha) && !defined(__NetBSD__) && !defined(__OpenBSD__) + +#ifdef KERNEL +extern unsigned int rpcc(); +#define rf_read_cycle_counter rpcc +#else /* KERNEL */ +extern unsigned int rf_rpcc(); +#define rf_read_cycle_counter rf_rpcc +#endif /* KERNEL */ + +#define RF_DEF_TIMER_MAX_VAL 0xFFFFFFFF + +typedef struct RF_EtimerVal_s { + unsigned ccnt; /* cycle count */ +} RF_EtimerVal_t; + +struct RF_Etimer_s { + RF_EtimerVal_t st; + RF_EtimerVal_t et; + unsigned long ticks; /* elapsed time in ticks */ +}; + +extern long rf_timer_max_val; +extern long rf_timer_ticks_per_second; +extern unsigned long rf_timer_ticks_per_usec; + +#define RF_ETIMER_TICKS2US(_tcks_) ( (_tcks_) / rf_timer_ticks_per_usec ) +#define RF_ETIMER_START(_t_) { (_t_).st.ccnt = rf_read_cycle_counter(); } +#define RF_ETIMER_STOP(_t_) { (_t_).et.ccnt = rf_read_cycle_counter(); } +#define RF_ETIMER_EVAL(_t_) { \ + if ((_t_).st.ccnt < (_t_).et.ccnt) \ + (_t_).ticks = (_t_).et.ccnt - (_t_).st.ccnt; \ + else \ + (_t_).ticks = rf_timer_max_val - ((_t_).st.ccnt - (_t_).et.ccnt); \ +} + +#define RF_ETIMER_VAL_TICKS(_t_) ((_t_).ticks) +#define RF_ETIMER_VAL_US(_t_) (RF_ETIMER_TICKS2US((_t_).ticks)) +#define RF_ETIMER_VAL_MS(_t_) (RF_ETIMER_TICKS2US((_t_).ticks)/1000) + +#endif /* __alpha */ + +#ifdef _IBMR2 + +extern void rf_rtclock(unsigned int *secs, unsigned int *nsecs); + +#define RF_MSEC_PER_SEC 1000 +#define RF_USEC_PER_SEC 1000000 +#define RF_NSEC_PER_SEC 1000000000 + +typedef struct RF_EtimerVal_s { + unsigned int secs; + unsigned int nsecs; +} RF_EtimerVal_t; + +struct RF_Etimer_s { + RF_EtimerVal_t start; + RF_EtimerVal_t end; + RF_EtimerVal_t elapsed; +}; + +#if RF_AIXVERS >= 4 + +#include <sys/time.h> + +#define RF_ETIMER_START(_t_) { \ + timebasestruct_t tb; \ + tb.flag = 
1; \ + read_real_time(&tb, TIMEBASE_SZ); \ + (_t_).start.secs = tb.tb_high; \ + (_t_).start.nsecs = tb.tb_low; \ +} + +#define RF_ETIMER_STOP(_t_) { \ + timebasestruct_t tb; \ + tb.flag = 1; \ + read_real_time(&tb, TIMEBASE_SZ); \ + (_t_).end.secs = tb.tb_high; \ + (_t_).end.nsecs = tb.tb_low; \ +} + +#else /* RF_AIXVERS >= 4 */ + +#define RF_ETIMER_START(_t_) { \ + rf_rtclock(&((_t_).start.secs), &((_t_).start.nsecs)); \ +} + +#define RF_ETIMER_STOP(_t_) { \ + rf_rtclock(&((_t_).end.secs), &((_t_).end.nsecs)); \ +} + +#endif /* RF_AIXVERS >= 4 */ + +#define RF_ETIMER_EVAL(_t_) { \ + if ((_t_).end.nsecs >= (_t_).start.nsecs) { \ + (_t_).elapsed.nsecs = (_t_).end.nsecs - (_t_).start.nsecs; \ + (_t_).elapsed.secs = (_t_).end.secs - (_t_).start.nsecs; \ + } \ + else { \ + (_t_).elapsed.nsecs = RF_NSEC_PER_SEC + (_t_).end.nsecs; \ + (_t_).elapsed.nsecs -= (_t_).start.nsecs; \ + (_t_).elapsed.secs = (_t_).end.secs - (_t_).start.secs + 1; \ + } \ +} + +#define RF_ETIMER_VAL_US(_t_) (((_t_).elapsed.secs*RF_USEC_PER_SEC)+((_t_).elapsed.nsecs/1000)) +#define RF_ETIMER_VAL_MS(_t_) (((_t_).elapsed.secs*RF_MSEC_PER_SEC)+((_t_).elapsed.nsecs/1000000)) + +#endif /* _IBMR2 */ + +/* + * XXX investigate better timing for these + */ +#if defined(hpux) || defined(sun) || defined(NETBSD_I386) || defined(OPENBSD_I386) || defined(ultrix) || defined(LINUX_I386) || defined(IRIX) || (defined(MACH) && !defined(__osf__)) +#include <sys/time.h> + +#define RF_USEC_PER_SEC 1000000 + +struct RF_Etimer_s { + struct timeval start; + struct timeval end; + struct timeval elapsed; +}; +#if !defined(__NetBSD__) && !defined(__OpenBSD__) +#define RF_ETIMER_START(_t_) { \ + gettimeofday(&((_t_).start), NULL); \ +} + +#define RF_ETIMER_STOP(_t_) { \ + gettimeofday(&((_t_).end), NULL); \ +} + +#else +#define RF_ETIMER_START(_t_) { \ +} +/* XXX these just drop off the end of the world... */ +#define RF_ETIMER_STOP(_t_) { \ +} +#endif + +#define RF_ETIMER_EVAL(_t_) { \ + if ((_t_).end.tv_usec >= (_t_).start.tv_usec) { \ + (_t_).elapsed.tv_usec = (_t_).end.tv_usec - (_t_).start.tv_usec; \ + (_t_).elapsed.tv_sec = (_t_).end.tv_sec - (_t_).start.tv_usec; \ + } \ + else { \ + (_t_).elapsed.tv_usec = RF_USEC_PER_SEC + (_t_).end.tv_usec; \ + (_t_).elapsed.tv_usec -= (_t_).start.tv_usec; \ + (_t_).elapsed.tv_sec = (_t_).end.tv_sec - (_t_).start.tv_sec + 1; \ + } \ +} + +#define RF_ETIMER_VAL_US(_t_) (((_t_).elapsed.tv_sec*RF_USEC_PER_SEC)+(_t_).elapsed.tv_usec) +#define RF_ETIMER_VAL_MS(_t_) (((_t_).elapsed.tv_sec*RF_MSEC_PER_SEC)+((_t_).elapsed.tv_usec/1000)) + +#endif /* hpux || sun || NETBSD_I386 || OPENBSD_I386 || ultrix || LINUX_I386 || IRIX || (MACH && !__osf__) */ + +#endif /* !_RF__RF_TIMER_H_ */ diff --git a/sys/dev/raidframe/rf_evenodd.c b/sys/dev/raidframe/rf_evenodd.c new file mode 100644 index 00000000000..90d18653cda --- /dev/null +++ b/sys/dev/raidframe/rf_evenodd.c @@ -0,0 +1,556 @@ +/* $OpenBSD: rf_evenodd.c,v 1.1 1999/01/11 14:29:21 niklas Exp $ */ +/* $NetBSD: rf_evenodd.c,v 1.1 1998/11/13 04:20:29 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Chang-Ming Wu + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. 
+ * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/***************************************************************************************** + * + * rf_evenodd.c -- implements EVENODD array architecture + * + ****************************************************************************************/ + +#include "rf_archs.h" + +#if RF_INCLUDE_EVENODD > 0 + +#include "rf_types.h" +#include "rf_raid.h" +#include "rf_dag.h" +#include "rf_dagffrd.h" +#include "rf_dagffwr.h" +#include "rf_dagdegrd.h" +#include "rf_dagdegwr.h" +#include "rf_dagutils.h" +#include "rf_dagfuncs.h" +#include "rf_threadid.h" +#include "rf_etimer.h" +#include "rf_general.h" +#include "rf_evenodd.h" +#include "rf_configure.h" +#include "rf_parityscan.h" +#include "rf_utils.h" +#include "rf_map.h" +#include "rf_pq.h" +#include "rf_mcpair.h" +#include "rf_sys.h" +#include "rf_evenodd.h" +#include "rf_evenodd_dagfuncs.h" +#include "rf_evenodd_dags.h" +#include "rf_engine.h" + +typedef struct RF_EvenOddConfigInfo_s { + RF_RowCol_t **stripeIdentifier; /* filled in at config time & used by IdentifyStripe */ +} RF_EvenOddConfigInfo_t; + +int rf_ConfigureEvenOdd(listp, raidPtr, cfgPtr) + RF_ShutdownList_t **listp; + RF_Raid_t *raidPtr; + RF_Config_t *cfgPtr; +{ + RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; + RF_EvenOddConfigInfo_t *info; + RF_RowCol_t i, j, startdisk; + + RF_MallocAndAdd(info, sizeof(RF_EvenOddConfigInfo_t), (RF_EvenOddConfigInfo_t *), raidPtr->cleanupList); + layoutPtr->layoutSpecificInfo = (void *) info; + + RF_ASSERT(raidPtr->numRow == 1); + + info->stripeIdentifier = rf_make_2d_array(raidPtr->numCol, raidPtr->numCol, raidPtr->cleanupList); + startdisk = 0; + for (i=0; i<raidPtr->numCol; i++) { + for (j=0; j<raidPtr->numCol; j++) { + info->stripeIdentifier[i][j] = (startdisk + j) % raidPtr->numCol; + } + if ((startdisk -= 2) < 0) startdisk += raidPtr->numCol; + } + + /* fill in the remaining layout parameters */ + layoutPtr->numStripe = layoutPtr->stripeUnitsPerDisk; + layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit << raidPtr->logBytesPerSector; + layoutPtr->numDataCol = raidPtr->numCol-2; /* ORIG: layoutPtr->numDataCol = raidPtr->numCol-1; */ +#if RF_EO_MATRIX_DIM > 17 + if (raidPtr->numCol <= 17){ + printf("Number of stripe units in a parity stripe is smaller than 17. Please\n"); + printf("define the macro RF_EO_MATRIX_DIM in file rf_evenodd_dagfuncs.h to \n"); + printf("be 17 to increase performance. \n"); + return(EINVAL); + } +#elif RF_EO_MATRIX_DIM == 17 + if (raidPtr->numCol > 17) { + printf("Number of stripe units in a parity stripe is bigger than 17. Please\n"); + printf("define the macro RF_EO_MATRIX_DIM in file rf_evenodd_dagfuncs.h to \n"); + printf("be 257 for encoding and decoding functions to work. 
\n"); + return(EINVAL); + } +#endif + layoutPtr->dataSectorsPerStripe = layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit; + layoutPtr->numParityCol = 2; + layoutPtr->dataStripeUnitsPerDisk = layoutPtr->stripeUnitsPerDisk; + raidPtr->sectorsPerDisk = layoutPtr->stripeUnitsPerDisk * layoutPtr->sectorsPerStripeUnit; + + raidPtr->totalSectors = layoutPtr->stripeUnitsPerDisk * layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit; + + return(0); +} + +int rf_GetDefaultNumFloatingReconBuffersEvenOdd(RF_Raid_t *raidPtr) +{ + return(20); +} + +RF_HeadSepLimit_t rf_GetDefaultHeadSepLimitEvenOdd(RF_Raid_t *raidPtr) +{ + return(10); +} + +void rf_IdentifyStripeEvenOdd( + RF_Raid_t *raidPtr, + RF_RaidAddr_t addr, + RF_RowCol_t **diskids, + RF_RowCol_t *outRow) +{ + RF_StripeNum_t stripeID = rf_RaidAddressToStripeID(&raidPtr->Layout, addr); + RF_EvenOddConfigInfo_t *info = (RF_EvenOddConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo; + + *outRow = 0; + *diskids = info->stripeIdentifier[ stripeID % raidPtr->numCol ]; +} + +/* The layout of stripe unit on the disks are: c0 c1 c2 c3 c4 + + 0 1 2 E P + 5 E P 3 4 + P 6 7 8 E + 10 11 E P 9 + E P 12 13 14 + .... + + We use the MapSectorRAID5 to map data information because the routine can be shown to map exactly + the layout of data stripe unit as shown above although we have 2 redundant information now. + But for E and P, we use rf_MapEEvenOdd and rf_MapParityEvenOdd which are different method from raid-5. +*/ + + +void rf_MapParityEvenOdd( + RF_Raid_t *raidPtr, + RF_RaidAddr_t raidSector, + RF_RowCol_t *row, + RF_RowCol_t *col, + RF_SectorNum_t *diskSector, + int remap) +{ + RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit; + RF_StripeNum_t endSUIDofthisStrip = (SUID/raidPtr->Layout.numDataCol + 1)*raidPtr->Layout.numDataCol - 1; + + *row = 0; + *col = ( endSUIDofthisStrip + 2)%raidPtr->numCol; + *diskSector = (SUID / (raidPtr->Layout.numDataCol)) * raidPtr->Layout.sectorsPerStripeUnit + + (raidSector % raidPtr->Layout.sectorsPerStripeUnit); +} + +void rf_MapEEvenOdd( + RF_Raid_t *raidPtr, + RF_RaidAddr_t raidSector, + RF_RowCol_t *row, + RF_RowCol_t *col, + RF_SectorNum_t *diskSector, + int remap) +{ + RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit; + RF_StripeNum_t endSUIDofthisStrip = (SUID/raidPtr->Layout.numDataCol + 1)*raidPtr->Layout.numDataCol - 1; + + *row = 0; + *col = ( endSUIDofthisStrip + 1)%raidPtr->numCol; + *diskSector = (SUID / (raidPtr->Layout.numDataCol)) * raidPtr->Layout.sectorsPerStripeUnit + + (raidSector % raidPtr->Layout.sectorsPerStripeUnit); +} + +void rf_EODagSelect( + RF_Raid_t *raidPtr, + RF_IoType_t type, + RF_AccessStripeMap_t *asmap, + RF_VoidFuncPtr *createFunc) +{ + RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); + unsigned ndfail = asmap->numDataFailed; + unsigned npfail = asmap->numParityFailed +asmap->numQFailed; + unsigned ntfail = npfail + ndfail; + + RF_ASSERT(RF_IO_IS_R_OR_W(type)); + if (ntfail > 2) + { + RF_ERRORMSG("more than two disks failed in a single group! Aborting I/O operation.\n"); + /* *infoFunc = */ *createFunc = NULL; + return; + } + + /* ok, we can do this I/O */ + if (type == RF_IO_TYPE_READ) + { + switch (ndfail) + { + case 0: + /* fault free read */ + *createFunc = (RF_VoidFuncPtr)rf_CreateFaultFreeReadDAG; /* same as raid 5 */ + break; + case 1: + /* lost a single data unit */ + /* two cases: + (1) parity is not lost. + do a normal raid 5 reconstruct read. + (2) parity is lost. + do a reconstruct read using "e". 
+ */ + if (ntfail == 2) /* also lost redundancy */ + { + if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_PARITY) + *createFunc = (RF_VoidFuncPtr)rf_EO_110_CreateReadDAG; + else + *createFunc = (RF_VoidFuncPtr)rf_EO_101_CreateReadDAG; + } + else + { + /* P and E are ok. But is there a failure + in some unaccessed data unit? + */ + if (rf_NumFailedDataUnitsInStripe(raidPtr,asmap)==2) + *createFunc = (RF_VoidFuncPtr)rf_EO_200_CreateReadDAG; + else + *createFunc = (RF_VoidFuncPtr)rf_EO_100_CreateReadDAG; + } + break; + case 2: + /* *createFunc = rf_EO_200_CreateReadDAG; */ + *createFunc = NULL; + break; + } + return; + } + + /* a write */ + switch (ntfail) + { + case 0: /* fault free */ + if (rf_suppressLocksAndLargeWrites || + (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) && (layoutPtr->numDataCol != 1)) || + (asmap->parityInfo->next != NULL) || (asmap->qInfo->next != NULL) || rf_CheckStripeForFailures(raidPtr, asmap))) { + + *createFunc = (RF_VoidFuncPtr)rf_EOCreateSmallWriteDAG; + } + else { + *createFunc = (RF_VoidFuncPtr)rf_EOCreateLargeWriteDAG; + } + break; + + case 1: /* single disk fault */ + if (npfail==1) + { + RF_ASSERT ((asmap->failedPDAs[0]->type == RF_PDA_TYPE_PARITY) || (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q)); + if (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q) + { /* q died, treat like normal mode raid5 write.*/ + if (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) || (asmap->numStripeUnitsAccessed == 1)) + || (asmap->parityInfo->next!=NULL) || rf_NumFailedDataUnitsInStripe(raidPtr,asmap)) + *createFunc = (RF_VoidFuncPtr)rf_EO_001_CreateSmallWriteDAG; + else + *createFunc = (RF_VoidFuncPtr)rf_EO_001_CreateLargeWriteDAG; + } + else + { /* parity died, small write only updating Q */ + if (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) || (asmap->numStripeUnitsAccessed == 1)) + || (asmap->qInfo->next!=NULL) || rf_NumFailedDataUnitsInStripe(raidPtr,asmap)) + *createFunc = (RF_VoidFuncPtr)rf_EO_010_CreateSmallWriteDAG; + else + *createFunc = (RF_VoidFuncPtr)rf_EO_010_CreateLargeWriteDAG; + } + } + else + { /* data missing. + Do a P reconstruct write if only a single data unit + is lost in the stripe, otherwise a reconstruct + write which employnig both P and E units. 
*/ + if (rf_NumFailedDataUnitsInStripe(raidPtr,asmap)==2) + { + if (asmap->numStripeUnitsAccessed == 1) + *createFunc = (RF_VoidFuncPtr)rf_EO_200_CreateWriteDAG; + else + *createFunc = NULL; /* No direct support for this case now, like that in Raid-5 */ + } + else + { + if (asmap->numStripeUnitsAccessed != 1 && asmap->failedPDAs[0]->numSector != layoutPtr->sectorsPerStripeUnit) + *createFunc = NULL; /* No direct support for this case now, like that in Raid-5 */ + else *createFunc = (RF_VoidFuncPtr)rf_EO_100_CreateWriteDAG; + } + } + break; + + case 2: /* two disk faults */ + switch (npfail) + { + case 2: /* both p and q dead */ + *createFunc = (RF_VoidFuncPtr)rf_EO_011_CreateWriteDAG; + break; + case 1: /* either p or q and dead data */ + RF_ASSERT(asmap->failedPDAs[0]->type == RF_PDA_TYPE_DATA); + RF_ASSERT ((asmap->failedPDAs[1]->type == RF_PDA_TYPE_PARITY) || (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q)); + if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q) + { + if(asmap->numStripeUnitsAccessed != 1 && asmap->failedPDAs[0]->numSector != layoutPtr->sectorsPerStripeUnit) + *createFunc = NULL; /* In both PQ and EvenOdd, no direct support for this case now, like that in Raid-5 */ + else + *createFunc = (RF_VoidFuncPtr)rf_EO_101_CreateWriteDAG; + } + else + { + if (asmap->numStripeUnitsAccessed != 1 && asmap->failedPDAs[0]->numSector != layoutPtr->sectorsPerStripeUnit) + *createFunc = NULL; /* No direct support for this case, like that in Raid-5 */ + else + *createFunc = (RF_VoidFuncPtr)rf_EO_110_CreateWriteDAG; + } + break; + case 0: /* double data loss */ + /* if(asmap->failedPDAs[0]->numSector + asmap->failedPDAs[1]->numSector == 2 * layoutPtr->sectorsPerStripeUnit ) + *createFunc = rf_EOCreateLargeWriteDAG; + else */ + *createFunc = NULL; /* currently, in Evenodd, No support for simultaneous access of both failed SUs */ + break; + } + break; + + default: /* more than 2 disk faults */ + *createFunc = NULL; + RF_PANIC(); + } + return; +} + + +int rf_VerifyParityEvenOdd(raidPtr, raidAddr, parityPDA, correct_it, flags) + RF_Raid_t *raidPtr; + RF_RaidAddr_t raidAddr; + RF_PhysDiskAddr_t *parityPDA; + int correct_it; + RF_RaidAccessFlags_t flags; +{ + RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); + RF_RaidAddr_t startAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, raidAddr); + RF_SectorCount_t numsector = parityPDA->numSector; + int numbytes = rf_RaidAddressToByte(raidPtr, numsector); + int bytesPerStripe = numbytes * layoutPtr->numDataCol; + RF_DagHeader_t *rd_dag_h, *wr_dag_h; /* read, write dag */ + RF_DagNode_t *blockNode, *unblockNode, *wrBlock, *wrUnblock; + RF_AccessStripeMapHeader_t *asm_h; + RF_AccessStripeMap_t *asmap; + RF_AllocListElem_t *alloclist; + RF_PhysDiskAddr_t *pda; + char *pbuf, *buf, *end_p, *p; + char *redundantbuf2; + int redundantTwoErr = 0, redundantOneErr = 0; + int parity_cant_correct = RF_FALSE, red2_cant_correct = RF_FALSE, parity_corrected = RF_FALSE, red2_corrected = RF_FALSE; + int i, retcode; + RF_ReconUnitNum_t which_ru; + RF_StripeNum_t psID = rf_RaidAddressToParityStripeID(layoutPtr, raidAddr, &which_ru); + int stripeWidth = layoutPtr->numDataCol + layoutPtr->numParityCol; + RF_AccTraceEntry_t tracerec; + RF_MCPair_t *mcpair; + + retcode = RF_PARITY_OKAY; + + mcpair = rf_AllocMCPair(); + rf_MakeAllocList(alloclist); + RF_MallocAndAdd(buf, numbytes * (layoutPtr->numDataCol + layoutPtr->numParityCol), (char *), alloclist); + RF_CallocAndAdd(pbuf, 1, numbytes, (char *), alloclist); /* use calloc to make sure buffer is zeroed */ + end_p = buf + 
bytesPerStripe; + RF_CallocAndAdd(redundantbuf2, 1, numbytes, (char *), alloclist); /* use calloc to make sure buffer is zeroed */ + + rd_dag_h = rf_MakeSimpleDAG(raidPtr, stripeWidth, numbytes, buf, rf_DiskReadFunc, rf_DiskReadUndoFunc, + "Rod", alloclist, flags, RF_IO_NORMAL_PRIORITY); + blockNode = rd_dag_h->succedents[0]; + unblockNode = blockNode->succedents[0]->succedents[0]; + + /* map the stripe and fill in the PDAs in the dag */ + asm_h = rf_MapAccess(raidPtr, startAddr, layoutPtr->dataSectorsPerStripe, buf, RF_DONT_REMAP); + asmap = asm_h->stripeMap; + + for (pda=asmap->physInfo,i=0; i<layoutPtr->numDataCol; i++,pda=pda->next) { + RF_ASSERT(pda); + rf_RangeRestrictPDA(raidPtr, parityPDA, pda, 0, 1); + RF_ASSERT(pda->numSector != 0); + if (rf_TryToRedirectPDA(raidPtr, pda, 0)) goto out; /* no way to verify parity if disk is dead. return w/ good status */ + blockNode->succedents[i]->params[0].p = pda; + blockNode->succedents[i]->params[2].v = psID; + blockNode->succedents[i]->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); + } + + RF_ASSERT(!asmap->parityInfo->next); + rf_RangeRestrictPDA(raidPtr, parityPDA, asmap->parityInfo, 0, 1); + RF_ASSERT(asmap->parityInfo->numSector != 0); + if (rf_TryToRedirectPDA(raidPtr, asmap->parityInfo, 1)) + goto out; + blockNode->succedents[ layoutPtr->numDataCol ]->params[0].p = asmap->parityInfo; + + RF_ASSERT(!asmap->qInfo->next); + rf_RangeRestrictPDA(raidPtr, parityPDA, asmap->qInfo, 0, 1); + RF_ASSERT(asmap->qInfo->numSector != 0); + if (rf_TryToRedirectPDA(raidPtr, asmap->qInfo, 1)) goto out; + /* + * if the disk is dead, b/c no reconstruction is implemented right now, + * the function "rf_TryToRedirectPDA" always returns one, which causes + * a jump to out and a return w/ good status + */ + blockNode->succedents[ layoutPtr->numDataCol +1 ]->params[0].p = asmap->qInfo; + + /* fire off the DAG */ + bzero((char *)&tracerec,sizeof(tracerec)); + rd_dag_h->tracerec = &tracerec; + + if (rf_verifyParityDebug) { + printf("Parity verify read dag:\n"); + rf_PrintDAGList(rd_dag_h); + } + + RF_LOCK_MUTEX(mcpair->mutex); + mcpair->flag = 0; + rf_DispatchDAG(rd_dag_h, (void (*)(void *))rf_MCPairWakeupFunc, + (void *) mcpair); + while (!mcpair->flag) RF_WAIT_COND(mcpair->cond, mcpair->mutex); + RF_UNLOCK_MUTEX(mcpair->mutex); + if (rd_dag_h->status != rf_enable) { + RF_ERRORMSG("Unable to verify parity: can't read the stripe\n"); + retcode = RF_PARITY_COULD_NOT_VERIFY; + goto out; + } + + for (p=buf, i=0; p<end_p; p+=numbytes, i++) { + rf_e_encToBuf(raidPtr, i, p, RF_EO_MATRIX_DIM - 2, redundantbuf2, numsector); + /* + * the corresponding columns in the EvenOdd encoding matrix for these p pointers, which point + * to the data buffer in a full stripe, run sequentially from 0 to layoutPtr->numDataCol-1 + */ + rf_bxor(p, pbuf, numbytes, NULL); + } + RF_ASSERT(i==layoutPtr->numDataCol); + + for (i=0; i<numbytes; i++) { + if (pbuf[i] != buf[bytesPerStripe+i]) { + if (!correct_it) { + RF_ERRORMSG3("Parity verify error: byte %d of parity is 0x%x should be 0x%x\n", + i,(u_char) buf[bytesPerStripe+i],(u_char) pbuf[i]); + } + redundantOneErr = 1; + break; + } + } + + for (i=0; i<numbytes; i++) { + if (redundantbuf2[i] != buf[bytesPerStripe+numbytes+i]) { + if (!correct_it) { + RF_ERRORMSG3("Parity verify error: byte %d of second redundant information is 0x%x should be 0x%x\n", + i,(u_char) buf[bytesPerStripe+numbytes+i],(u_char) redundantbuf2[i]); + } + redundantTwoErr = 1; + break; + } + } + if (redundantOneErr || redundantTwoErr ) + retcode = RF_PARITY_BAD; + + 
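/*
 * Editor's sketch, not part of the imported file: since buf[] holds the
 * data columns followed by the parity column and then the "E" column,
 * the two byte-comparison loops above reduce to
 *
 *	redundantOneErr = (bcmp(pbuf, buf + bytesPerStripe, numbytes) != 0);
 *	redundantTwoErr = (bcmp(redundantbuf2,
 *	    buf + bytesPerStripe + numbytes, numbytes) != 0);
 *
 * assuming bcmp() is usable in this context, as bzero() already is.  The
 * explicit loops are kept so that the first mismatching byte can be
 * reported via RF_ERRORMSG3() when correct_it is not set.
 */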
/* correct the first redundant disk, ie parity if it is error */ + if (redundantOneErr && correct_it) { + wr_dag_h = rf_MakeSimpleDAG(raidPtr, 1, numbytes, pbuf, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, + "Wnp", alloclist, flags, RF_IO_NORMAL_PRIORITY); + wrBlock = wr_dag_h->succedents[0]; wrUnblock = wrBlock->succedents[0]->succedents[0]; + wrBlock->succedents[0]->params[0].p = asmap->parityInfo; + wrBlock->succedents[0]->params[2].v = psID; + wrBlock->succedents[0]->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); + bzero((char *)&tracerec,sizeof(tracerec)); + wr_dag_h->tracerec = &tracerec; + if (rf_verifyParityDebug) { + printf("Parity verify write dag:\n"); + rf_PrintDAGList(wr_dag_h); + } + RF_LOCK_MUTEX(mcpair->mutex); + mcpair->flag = 0; + rf_DispatchDAG(wr_dag_h, (void (*)(void *))rf_MCPairWakeupFunc, + (void *) mcpair); + while (!mcpair->flag) + RF_WAIT_COND(mcpair->cond, mcpair->mutex); + RF_UNLOCK_MUTEX(mcpair->mutex); + if (wr_dag_h->status != rf_enable) { + RF_ERRORMSG("Unable to correct parity in VerifyParity: can't write the stripe\n"); + parity_cant_correct = RF_TRUE; + } else { + parity_corrected = RF_TRUE; + } + rf_FreeDAG(wr_dag_h); + } + + if (redundantTwoErr && correct_it) { + wr_dag_h = rf_MakeSimpleDAG(raidPtr, 1, numbytes, redundantbuf2, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, + "Wnred2", alloclist, flags, RF_IO_NORMAL_PRIORITY); + wrBlock = wr_dag_h->succedents[0]; wrUnblock = wrBlock->succedents[0]->succedents[0]; + wrBlock->succedents[0]->params[0].p = asmap->qInfo; + wrBlock->succedents[0]->params[2].v = psID; + wrBlock->succedents[0]->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); + bzero((char *)&tracerec,sizeof(tracerec)); + wr_dag_h->tracerec = &tracerec; + if (rf_verifyParityDebug) { + printf("Dag of write new second redundant information in parity verify :\n"); + rf_PrintDAGList(wr_dag_h); + } + RF_LOCK_MUTEX(mcpair->mutex); + mcpair->flag = 0; + rf_DispatchDAG(wr_dag_h, (void (*)(void *))rf_MCPairWakeupFunc, + (void *) mcpair); + while (!mcpair->flag) + RF_WAIT_COND(mcpair->cond, mcpair->mutex); + RF_UNLOCK_MUTEX(mcpair->mutex); + if (wr_dag_h->status != rf_enable) { + RF_ERRORMSG("Unable to correct second redundant information in VerifyParity: can't write the stripe\n"); + red2_cant_correct = RF_TRUE; + } else { + red2_corrected = RF_TRUE; + } + rf_FreeDAG(wr_dag_h); + } + if ( (redundantOneErr && parity_cant_correct) || + (redundantTwoErr && red2_cant_correct )) + retcode = RF_PARITY_COULD_NOT_CORRECT; + if ( (retcode = RF_PARITY_BAD) && parity_corrected && red2_corrected ) + retcode = RF_PARITY_CORRECTED; + + +out: + rf_FreeAccessStripeMap(asm_h); + rf_FreeAllocList(alloclist); + rf_FreeDAG(rd_dag_h); + rf_FreeMCPair(mcpair); + return(retcode); +} + +#endif /* RF_INCLUDE_EVENODD > 0 */ diff --git a/sys/dev/raidframe/rf_evenodd.h b/sys/dev/raidframe/rf_evenodd.h new file mode 100644 index 00000000000..24e5a811447 --- /dev/null +++ b/sys/dev/raidframe/rf_evenodd.h @@ -0,0 +1,49 @@ +/* $OpenBSD: rf_evenodd.h,v 1.1 1999/01/11 14:29:21 niklas Exp $ */ +/* $NetBSD: rf_evenodd.h,v 1.1 1998/11/13 04:20:29 oster Exp $ */ +/* + * Copyright (c) 1995, 1996 Carnegie-Mellon University. + * All rights reserved. 
+ * + * Author: Chang-Ming Wu + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +#ifndef _RF__RF_EVENODD_H_ +#define _RF__RF_EVENODD_H_ + +/* extern declerations of the failure mode functions. */ +int rf_ConfigureEvenOdd(RF_ShutdownList_t **shutdownListp, RF_Raid_t *raidPtr, + RF_Config_t *cfgPtr); +int rf_GetDefaultNumFloatingReconBuffersEvenOdd(RF_Raid_t *raidPtr); +RF_HeadSepLimit_t rf_GetDefaultHeadSepLimitEvenOdd(RF_Raid_t *raidPtr); +void rf_IdentifyStripeEvenOdd(RF_Raid_t *raidPtr, RF_RaidAddr_t addr, + RF_RowCol_t **diskids, RF_RowCol_t *outrow); +void rf_MapParityEvenOdd(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector, + RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap); +void rf_MapEEvenOdd(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector, + RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap); +void rf_EODagSelect(RF_Raid_t *raidPtr, RF_IoType_t type, + RF_AccessStripeMap_t *asmap, RF_VoidFuncPtr *createFunc); +int rf_VerifyParityEvenOdd(RF_Raid_t *raidPtr, RF_RaidAddr_t raidAddr, + RF_PhysDiskAddr_t *parityPDA, int correct_it, RF_RaidAccessFlags_t flags); + +#endif /* !_RF__RF_EVENODD_H_ */ diff --git a/sys/dev/raidframe/rf_evenodd_dagfuncs.c b/sys/dev/raidframe/rf_evenodd_dagfuncs.c new file mode 100644 index 00000000000..2762ac725af --- /dev/null +++ b/sys/dev/raidframe/rf_evenodd_dagfuncs.c @@ -0,0 +1,887 @@ +/* $OpenBSD: rf_evenodd_dagfuncs.c,v 1.1 1999/01/11 14:29:21 niklas Exp $ */ +/* $NetBSD: rf_evenodd_dagfuncs.c,v 1.1 1998/11/13 04:20:29 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: ChangMing Wu + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* + * Code for RAID-EVENODD architecture. 
+ */ + +#include "rf_types.h" +#include "rf_raid.h" +#include "rf_dag.h" +#include "rf_dagffrd.h" +#include "rf_dagffwr.h" +#include "rf_dagdegrd.h" +#include "rf_dagdegwr.h" +#include "rf_dagutils.h" +#include "rf_dagfuncs.h" +#include "rf_threadid.h" +#include "rf_etimer.h" +#include "rf_general.h" +#include "rf_configure.h" +#include "rf_parityscan.h" +#include "rf_sys.h" +#include "rf_evenodd.h" +#include "rf_evenodd_dagfuncs.h" + +/* These redundant functions are for small write */ +RF_RedFuncs_t rf_EOSmallWritePFuncs = { rf_RegularXorFunc, "Regular Old-New P", rf_SimpleXorFunc, "Simple Old-New P" }; +RF_RedFuncs_t rf_EOSmallWriteEFuncs = { rf_RegularONEFunc, "Regular Old-New E", rf_SimpleONEFunc, "Regular Old-New E" }; + +/* These redundant functions are for degraded read */ +RF_RedFuncs_t rf_eoPRecoveryFuncs = { rf_RecoveryXorFunc, "Recovery Xr", rf_RecoveryXorFunc, "Recovery Xr"}; +RF_RedFuncs_t rf_eoERecoveryFuncs = { rf_RecoveryEFunc, "Recovery E Func", rf_RecoveryEFunc, "Recovery E Func" }; + +/********************************************************************************************** + * the following encoding node functions is used in EO_000_CreateLargeWriteDAG + **********************************************************************************************/ +int rf_RegularPEFunc(node) + RF_DagNode_t *node; +{ + rf_RegularESubroutine(node,node->results[1]); + rf_RegularXorFunc(node); /* does the wakeup here! */ +#if 1 + return(0); /* XXX This was missing... GO */ +#endif +} + + +/************************************************************************************************ + * For EO_001_CreateSmallWriteDAG, there are (i)RegularONEFunc() and (ii)SimpleONEFunc() to + * be used. The previous case is when write access at least sectors of full stripe unit. + * The later function is used when the write access two stripe units but with total sectors + * less than sectors per SU. In this case, the access of parity and 'E' are shown as disconnected + * areas in their stripe unit and parity write and 'E' write are both devided into two distinct + * writes( totally four). This simple old-new write and regular old-new write happen as in RAID-5 + ************************************************************************************************/ + +/* Algorithm: + 1. Store the difference of old data and new data in the Rod buffer. + 2. then encode this buffer into the buffer which already have old 'E' information inside it, + the result can be shown to be the new 'E' information. + 3. xor the Wnd buffer into the difference buffer to recover the original old data. + Here we have another alternative: to allocate a temporary buffer for storing the difference of + old data and new data, then encode temp buf into old 'E' buf to form new 'E', but this approach + take the same speed as the previous, and need more memory. 
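[Editor's aside, not part of the imported comment: in C-style xor notation the three numbered steps above compute diff = Dold ^ Dnew (left in the Rod buffer by the first rf_bxor), Enew = Eold ^ enc(diff) (rf_e_encToBuf, valid because the EvenOdd encoding is xor-linear), and Dold = diff ^ Dnew (the second rf_bxor restores the Rod buffer); Dold, Dnew, Eold and enc() are shorthand for this illustration, not identifiers from the code.]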
+*/ +int rf_RegularONEFunc(node) + RF_DagNode_t *node; +{ + RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams-1].p; + RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &raidPtr->Layout; + int EpdaIndex = (node->numParams-1)/2 - 1; /* the parameter of node where you can find e-pda */ + int i, k, retcode = 0; + int suoffset, length; + RF_RowCol_t scol; + char *srcbuf, *destbuf; + RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; + RF_Etimer_t timer; + RF_PhysDiskAddr_t *pda, *EPDA = (RF_PhysDiskAddr_t *) node->params[EpdaIndex].p; + int ESUOffset = rf_StripeUnitOffset(layoutPtr, EPDA->startSector); /* generally zero */ + + RF_ASSERT( EPDA->type == RF_PDA_TYPE_Q ); + RF_ASSERT(ESUOffset == 0); + + RF_ETIMER_START(timer); + + /* Xor the Wnd buffer into Rod buffer, the difference of old data and new data is stored in Rod buffer */ + for( k=0; k< EpdaIndex; k += 2) { + length = rf_RaidAddressToByte(raidPtr, ((RF_PhysDiskAddr_t *)node->params[k].p)->numSector ); + retcode = rf_bxor( node->params[k+EpdaIndex+3].p, node->params[k+1].p, length, node->dagHdr->bp); + } + /* Start to encoding the buffer storing the difference of old data and new data into 'E' buffer */ + for (i=0; i<EpdaIndex; i+=2) if (node->params[i+1].p != node->results[0]) { /* results[0] is buf ptr of E */ + pda = (RF_PhysDiskAddr_t *) node->params[i].p; + srcbuf = (char *) node->params[i+1].p; + scol = rf_EUCol(layoutPtr, pda->raidAddress ); + suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector); + destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr,suoffset); + rf_e_encToBuf(raidPtr, scol, srcbuf, RF_EO_MATRIX_DIM - 2, destbuf, pda->numSector); + } + /* Recover the original old data to be used by parity encoding function in XorNode */ + for( k=0; k< EpdaIndex; k += 2) { + length = rf_RaidAddressToByte(raidPtr, ((RF_PhysDiskAddr_t *)node->params[k].p)->numSector ); + retcode = rf_bxor( node->params[k+EpdaIndex+3].p, node->params[k+1].p, length, node->dagHdr->bp); + } + RF_ETIMER_STOP(timer); + RF_ETIMER_EVAL(timer); + tracerec->q_us += RF_ETIMER_VAL_US(timer); + rf_GenericWakeupFunc(node, 0); +#if 1 + return(0); /* XXX this was missing.. 
GO */ +#endif +} + +int rf_SimpleONEFunc(node) + RF_DagNode_t *node; +{ + RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams-1].p; + RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &raidPtr->Layout; + RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p; + int retcode = 0; + char *srcbuf, *destbuf; + RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; + int length; + RF_RowCol_t scol; + RF_Etimer_t timer; + + RF_ASSERT( ((RF_PhysDiskAddr_t *)node->params[2].p)->type == RF_PDA_TYPE_Q ); + if (node->dagHdr->status == rf_enable) { + RF_ETIMER_START(timer); + length = rf_RaidAddressToByte(raidPtr, ((RF_PhysDiskAddr_t *)node->params[4].p)->numSector );/* this is a pda of writeDataNodes */ + /* bxor to buffer of readDataNodes */ + retcode = rf_bxor( node->params[5].p, node->params[1].p, length, node->dagHdr->bp); + /* find out the corresponding colume in encoding matrix for write colume to be encoded into redundant disk 'E' */ + scol = rf_EUCol(layoutPtr, pda->raidAddress ); + srcbuf = node->params[1].p; + destbuf = node->params[3].p; + /* Start encoding process */ + rf_e_encToBuf(raidPtr, scol, srcbuf, RF_EO_MATRIX_DIM - 2, destbuf, pda->numSector); + rf_bxor( node->params[5].p, node->params[1].p, length, node->dagHdr->bp); + RF_ETIMER_STOP(timer); RF_ETIMER_EVAL(timer); tracerec->q_us += RF_ETIMER_VAL_US(timer); + + } + return(rf_GenericWakeupFunc(node, retcode)); /* call wake func explicitly since no I/O in this node */ +} + + +/****** called by rf_RegularPEFunc(node) and rf_RegularEFunc(node) in f.f. large write ********/ +void rf_RegularESubroutine(node, ebuf) + RF_DagNode_t *node; + char *ebuf; +{ + RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams-1].p; + RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &raidPtr->Layout; + RF_PhysDiskAddr_t *pda; + int i, suoffset; + RF_RowCol_t scol; + char *srcbuf, *destbuf; + RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; + RF_Etimer_t timer; + + RF_ETIMER_START(timer); + for (i=0; i<node->numParams-2; i+=2) { + RF_ASSERT( node->params[i+1].p != ebuf ); + pda = (RF_PhysDiskAddr_t *) node->params[i].p; + suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector); + scol = rf_EUCol(layoutPtr, pda->raidAddress ); + srcbuf = (char *) node->params[i+1].p; + destbuf = ebuf + rf_RaidAddressToByte(raidPtr, suoffset ); + rf_e_encToBuf(raidPtr, scol, srcbuf, RF_EO_MATRIX_DIM - 2, destbuf, pda->numSector); + } + RF_ETIMER_STOP(timer); + RF_ETIMER_EVAL(timer); + tracerec->xor_us += RF_ETIMER_VAL_US(timer); +} + + +/******************************************************************************************* + * Used in EO_001_CreateLargeWriteDAG + ******************************************************************************************/ +int rf_RegularEFunc(node) + RF_DagNode_t *node; +{ + rf_RegularESubroutine(node, node->results[0]); + rf_GenericWakeupFunc(node, 0); +#if 1 + return(0); /* XXX this was missing?.. GO */ +#endif +} + +/******************************************************************************************* + * This degraded function allow only two case: + * 1. when write access the full failed stripe unit, then the access can be more than + * one tripe units. + * 2. when write access only part of the failed SU, we assume accesses of more than + * one stripe unit is not allowed so that the write can be dealt with like a + * large write. + * The following function is based on these assumptions. So except in the second case, + * it looks the same as a large write encodeing function. 
But this is not exactly the + * normal way for doing a degraded write, since raidframe have to break cases of access + * other than the above two into smaller accesses. We may have to change + * DegrESubroutin in the future. + *******************************************************************************************/ +void rf_DegrESubroutine(node, ebuf) + RF_DagNode_t *node; + char *ebuf; +{ + RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams-1].p; + RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &raidPtr->Layout; + RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams-2].p; + RF_PhysDiskAddr_t *pda; + int i, suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr, failedPDA->startSector); + RF_RowCol_t scol; + char *srcbuf, *destbuf; + RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; + RF_Etimer_t timer; + + RF_ETIMER_START(timer); + for (i=0; i<node->numParams-2; i+=2) { + RF_ASSERT( node->params[i+1].p != ebuf ); + pda = (RF_PhysDiskAddr_t *) node->params[i].p; + suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector); + scol = rf_EUCol(layoutPtr, pda->raidAddress ); + srcbuf = (char *) node->params[i+1].p; + destbuf = ebuf + rf_RaidAddressToByte(raidPtr, suoffset-failedSUOffset); + rf_e_encToBuf(raidPtr, scol, srcbuf, RF_EO_MATRIX_DIM - 2, destbuf, pda->numSector); + } + + RF_ETIMER_STOP(timer); RF_ETIMER_EVAL(timer); tracerec->q_us += RF_ETIMER_VAL_US(timer); +} + + +/************************************************************************************** + * This function is used in case where one data disk failed and both redundant disks + * alive. It is used in the EO_100_CreateWriteDAG. Note: if there is another disk + * failed in the stripe but not accessed at this time, then we should, instead, use + * the rf_EOWriteDoubleRecoveryFunc(). + **************************************************************************************/ +int rf_Degraded_100_EOFunc(node) + RF_DagNode_t *node; +{ + rf_DegrESubroutine(node, node->results[1]); + rf_RecoveryXorFunc(node); /* does the wakeup here! */ +#if 1 + return(0); /* XXX this was missing... SHould these be void functions??? GO */ +#endif +} + +/************************************************************************************** + * This function is to encode one sector in one of the data disks to the E disk. + * However, in evenodd this function can also be used as decoding function to recover + * data from dead disk in the case of parity failure and a single data failure. 
+ **************************************************************************************/ +void rf_e_EncOneSect( + RF_RowCol_t srcLogicCol, + char *srcSecbuf, + RF_RowCol_t destLogicCol, + char *destSecbuf, + int bytesPerSector) +{ + int S_index; /* index of the EU in the src col which need be Xored into all EUs in a dest sector */ + int numRowInEncMatix = (RF_EO_MATRIX_DIM) -1; + RF_RowCol_t j, indexInDest, /* row index of an encoding unit in the destination colume of encoding matrix */ + indexInSrc; /* row index of an encoding unit in the source colume used for recovery */ + int bytesPerEU = bytesPerSector/numRowInEncMatix; + +#if RF_EO_MATRIX_DIM > 17 + int shortsPerEU = bytesPerEU/sizeof(short); + short *destShortBuf, *srcShortBuf1, *srcShortBuf2; + register short temp1; +#elif RF_EO_MATRIX_DIM == 17 + int longsPerEU = bytesPerEU/sizeof(long); + long *destLongBuf, *srcLongBuf1, *srcLongBuf2; + register long temp1; +#endif + +#if RF_EO_MATRIX_DIM > 17 + RF_ASSERT( sizeof(short) == 2 || sizeof(short) == 1 ); + RF_ASSERT( bytesPerEU % sizeof(short) == 0 ); +#elif RF_EO_MATRIX_DIM == 17 + RF_ASSERT( sizeof(long) == 8 || sizeof(long) == 4 ); + RF_ASSERT( bytesPerEU % sizeof(long) == 0); +#endif + + S_index = rf_EO_Mod( ( RF_EO_MATRIX_DIM -1 + destLogicCol - srcLogicCol), RF_EO_MATRIX_DIM); +#if RF_EO_MATRIX_DIM > 17 + srcShortBuf1 = (short *)(srcSecbuf + S_index * bytesPerEU); +#elif RF_EO_MATRIX_DIM == 17 + srcLongBuf1 = (long *)(srcSecbuf + S_index * bytesPerEU); +#endif + + for( indexInDest = 0; indexInDest < numRowInEncMatix ; indexInDest++){ + indexInSrc = rf_EO_Mod( (indexInDest + destLogicCol - srcLogicCol), RF_EO_MATRIX_DIM ); + +#if RF_EO_MATRIX_DIM > 17 + destShortBuf = (short *)(destSecbuf + indexInDest * bytesPerEU); + srcShortBuf2 = (short *)(srcSecbuf + indexInSrc * bytesPerEU); + for(j=0; j < shortsPerEU; j++) { + temp1 = destShortBuf[j]^srcShortBuf1[j]; + /* note: S_index won't be at the end row for any src col! */ + if(indexInSrc != RF_EO_MATRIX_DIM -1) destShortBuf[j] = (srcShortBuf2[j])^temp1; + /* if indexInSrc is at the end row, ie. RF_EO_MATRIX_DIM -1, then all elements are zero! */ + else destShortBuf[j] = temp1; + } + +#elif RF_EO_MATRIX_DIM == 17 + destLongBuf = (long *)(destSecbuf + indexInDest * bytesPerEU); + srcLongBuf2 = (long *)(srcSecbuf + indexInSrc * bytesPerEU); + for(j=0; j < longsPerEU; j++) { + temp1 = destLongBuf[j]^srcLongBuf1[j]; + if(indexInSrc != RF_EO_MATRIX_DIM -1) destLongBuf[j] = (srcLongBuf2[j])^temp1; + else destLongBuf[j] = temp1; + } +#endif + } +} + +void rf_e_encToBuf( + RF_Raid_t *raidPtr, + RF_RowCol_t srcLogicCol, + char *srcbuf, + RF_RowCol_t destLogicCol, + char *destbuf, + int numSector) +{ + int i, bytesPerSector = rf_RaidAddressToByte(raidPtr, 1); + + for (i=0; i < numSector; i++) + { + rf_e_EncOneSect( srcLogicCol, srcbuf, destLogicCol, destbuf, bytesPerSector); + srcbuf += bytesPerSector; + destbuf += bytesPerSector; + } +} + +/************************************************************************************** + * when parity die and one data die, We use second redundant information, 'E', + * to recover the data in dead disk. 
This function is used in the recovery node of + * for EO_110_CreateReadDAG + **************************************************************************************/ +int rf_RecoveryEFunc(node) + RF_DagNode_t *node; +{ + RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams-1].p; + RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &raidPtr->Layout; + RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams-2].p; + RF_RowCol_t scol, /*source logical column*/ + fcol = rf_EUCol(layoutPtr, failedPDA->raidAddress ); /* logical column of failed SU */ + int i; + RF_PhysDiskAddr_t *pda; + int suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr,failedPDA->startSector); + char *srcbuf, *destbuf; + RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; + RF_Etimer_t timer; + + bzero( (char *)node->results[0], rf_RaidAddressToByte(raidPtr,failedPDA->numSector)); + if (node->dagHdr->status == rf_enable) { + RF_ETIMER_START(timer); + for (i=0; i<node->numParams-2; i+=2) if (node->params[i+1].p != node->results[0]) { + pda = (RF_PhysDiskAddr_t *) node->params[i].p; + if( i == node->numParams - 4 ) scol = RF_EO_MATRIX_DIM - 2; /* the colume of redundant E */ + else scol = rf_EUCol(layoutPtr, pda->raidAddress ); + srcbuf = (char *) node->params[i+1].p; + suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector); + destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr,suoffset-failedSUOffset); + rf_e_encToBuf(raidPtr, scol, srcbuf, fcol, destbuf, pda->numSector); + } + RF_ETIMER_STOP(timer); + RF_ETIMER_EVAL(timer); + tracerec->xor_us += RF_ETIMER_VAL_US(timer); + } + return (rf_GenericWakeupFunc(node, 0)); /* node execute successfully */ +} + +/************************************************************************************** + * This function is used in the case where one data and the parity have filed. + * (in EO_110_CreateWriteDAG ) + **************************************************************************************/ +int rf_EO_DegradedWriteEFunc(RF_DagNode_t *node) +{ + rf_DegrESubroutine(node, node->results[0]); + rf_GenericWakeupFunc(node, 0); +#if 1 + return(0); /* XXX Yet another one!! 
GO */ +#endif +} + + + +/************************************************************************************** + * THE FUNCTION IS FOR DOUBLE DEGRADED READ AND WRITE CASES + **************************************************************************************/ + +void rf_doubleEOdecode( + RF_Raid_t *raidPtr, + char **rrdbuf, + char **dest, + RF_RowCol_t *fcol, + char *pbuf, + char *ebuf) +{ + RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &(raidPtr->Layout); + int i, j, k, f1, f2, row; + int rrdrow, erow, count = 0; + int bytesPerSector = rf_RaidAddressToByte(raidPtr, 1 ); + int numRowInEncMatix = (RF_EO_MATRIX_DIM) -1; +#if 0 + int pcol = (RF_EO_MATRIX_DIM) - 1; +#endif + int ecol = (RF_EO_MATRIX_DIM) - 2; + int bytesPerEU = bytesPerSector/numRowInEncMatix; + int numDataCol = layoutPtr->numDataCol; +#if RF_EO_MATRIX_DIM > 17 + int shortsPerEU = bytesPerEU/sizeof(short); + short *rrdbuf_current, *pbuf_current, *ebuf_current; + short *dest_smaller, *dest_smaller_current, *dest_larger, *dest_larger_current; + register short *temp; + short *P; + + RF_ASSERT( bytesPerEU % sizeof(short) == 0); + RF_Malloc(P, bytesPerEU, (short *)); + RF_Malloc(temp, bytesPerEU, (short *)); +#elif RF_EO_MATRIX_DIM == 17 + int longsPerEU = bytesPerEU/sizeof(long); + long *rrdbuf_current, *pbuf_current, *ebuf_current; + long *dest_smaller, *dest_smaller_current, *dest_larger, *dest_larger_current; + register long *temp; + long *P; + + RF_ASSERT( bytesPerEU % sizeof(long) == 0); + RF_Malloc(P, bytesPerEU, (long *)); + RF_Malloc(temp, bytesPerEU, (long *)); +#endif + RF_ASSERT( *((long *)dest[0]) == 0); + RF_ASSERT( *((long *)dest[1]) == 0); + bzero((char *)P, bytesPerEU); + bzero((char *)temp, bytesPerEU); + RF_ASSERT( *P == 0 ); + /* calculate the 'P' parameter, which, not parity, is the Xor of all elements in + the last two column, ie. 'E' and 'parity' colume, see the Ref. paper by Blaum, et al 1993 */ + for( i=0; i< numRowInEncMatix; i++) + for( k=0; k< longsPerEU; k++) { +#if RF_EO_MATRIX_DIM > 17 + ebuf_current = ((short *)ebuf) + i*shortsPerEU + k; + pbuf_current = ((short *)pbuf) + i*shortsPerEU + k; +#elif RF_EO_MATRIX_DIM == 17 + ebuf_current = ((long *)ebuf) + i*longsPerEU + k; + pbuf_current = ((long *)pbuf) + i*longsPerEU + k; +#endif + P[k] ^= *ebuf_current; + P[k] ^= *pbuf_current; + } + RF_ASSERT( fcol[0] != fcol[1] ); + if( fcol[0] < fcol[1] ) { +#if RF_EO_MATRIX_DIM > 17 + dest_smaller = (short *)(dest[0]); + dest_larger = (short *)(dest[1]); +#elif RF_EO_MATRIX_DIM == 17 + dest_smaller = (long *)(dest[0]); + dest_larger = (long *)(dest[1]); +#endif + f1 = fcol[0]; + f2 = fcol[1]; + } + else { +#if RF_EO_MATRIX_DIM > 17 + dest_smaller = (short *)(dest[1]); + dest_larger = (short *)(dest[0]); +#elif RF_EO_MATRIX_DIM == 17 + dest_smaller = (long *)(dest[1]); + dest_larger = (long *)(dest[0]); +#endif + f1 = fcol[1]; + f2 = fcol[0]; + } + row = (RF_EO_MATRIX_DIM) -1; + while( (row = rf_EO_Mod( (row+f1-f2), RF_EO_MATRIX_DIM )) != ( (RF_EO_MATRIX_DIM) -1) ) + { +#if RF_EO_MATRIX_DIM > 17 + dest_larger_current = dest_larger + row*shortsPerEU; + dest_smaller_current = dest_smaller + row*shortsPerEU; +#elif RF_EO_MATRIX_DIM == 17 + dest_larger_current = dest_larger + row*longsPerEU; + dest_smaller_current = dest_smaller + row*longsPerEU; +#endif + /** Do the diagonal recovery. Initially, temp[k] = (failed 1), + which is the failed data in the colume which has smaller col index. 
**/ + /* step 1: ^(SUM of nonfailed in-diagonal A(rrdrow,0..m-3)) */ + for( j=0; j< numDataCol; j++) + { + if( j == f1 || j == f2 ) continue; + rrdrow = rf_EO_Mod( (row+f2-j), RF_EO_MATRIX_DIM ); + if ( rrdrow != (RF_EO_MATRIX_DIM) -1 ) { +#if RF_EO_MATRIX_DIM > 17 + rrdbuf_current = (short *)(rrdbuf[j]) + rrdrow * shortsPerEU; + for (k=0; k< shortsPerEU; k++) temp[k] ^= *(rrdbuf_current + k); +#elif RF_EO_MATRIX_DIM == 17 + rrdbuf_current = (long *)(rrdbuf[j]) + rrdrow * longsPerEU; + for (k=0; k< longsPerEU; k++) temp[k] ^= *(rrdbuf_current + k); +#endif + } + } + /* step 2: ^E(erow,m-2), If erow is at the buttom row, don't Xor into it + E(erow,m-2) = (principle diagonal) ^ (failed 1) ^ (failed 2) + ^ ( SUM of nonfailed in-diagonal A(rrdrow,0..m-3) ) + After this step, temp[k] = (principle diagonal) ^ (failed 2) */ + + erow = rf_EO_Mod( (row+f2-ecol), (RF_EO_MATRIX_DIM) ); + if ( erow != (RF_EO_MATRIX_DIM) -1) { +#if RF_EO_MATRIX_DIM > 17 + ebuf_current = (short *)ebuf + shortsPerEU * erow; + for (k=0; k< shortsPerEU; k++) temp[k] ^= *(ebuf_current+k); +#elif RF_EO_MATRIX_DIM == 17 + ebuf_current = (long *)ebuf + longsPerEU * erow; + for (k=0; k< longsPerEU; k++) temp[k] ^= *(ebuf_current+k); +#endif + } + /* step 3: ^P to obtain the failed data (failed 2). + P can be proved to be actually (principle diagonal) + After this step, temp[k] = (failed 2), the failed data to be recovered */ +#if RF_EO_MATRIX_DIM > 17 + for (k=0; k< shortsPerEU; k++) temp[k] ^= P[k]; + /* Put the data to the destination buffer */ + for (k=0; k< shortsPerEU; k++) dest_larger_current[k] = temp[k]; +#elif RF_EO_MATRIX_DIM == 17 + for (k=0; k< longsPerEU; k++) temp[k] ^= P[k]; + /* Put the data to the destination buffer */ + for (k=0; k< longsPerEU; k++) dest_larger_current[k] = temp[k]; +#endif + + /** THE FOLLOWING DO THE HORIZONTAL XOR **/ + /* step 1: ^(SUM of A(row,0..m-3)), ie. 
all nonfailed data columes */ + for (j=0; j< numDataCol; j++) + { + if( j == f1 || j == f2 ) continue; +#if RF_EO_MATRIX_DIM > 17 + rrdbuf_current = (short *)(rrdbuf[j]) + row * shortsPerEU; + for (k=0; k< shortsPerEU; k++) temp[k] ^= *(rrdbuf_current+k); +#elif RF_EO_MATRIX_DIM == 17 + rrdbuf_current = (long *)(rrdbuf[j]) + row * longsPerEU; + for (k=0; k< longsPerEU; k++) temp[k] ^= *(rrdbuf_current+k); +#endif + } + /* step 2: ^A(row,m-1) */ + /* step 3: Put the data to the destination buffer */ +#if RF_EO_MATRIX_DIM > 17 + pbuf_current = (short *)pbuf + shortsPerEU * row; + for (k=0; k< shortsPerEU; k++) temp[k] ^= *(pbuf_current+k); + for (k=0; k< shortsPerEU; k++) dest_smaller_current[k] = temp[k]; +#elif RF_EO_MATRIX_DIM == 17 + pbuf_current = (long *)pbuf + longsPerEU * row; + for (k=0; k< longsPerEU; k++) temp[k] ^= *(pbuf_current+k); + for (k=0; k< longsPerEU; k++) dest_smaller_current[k] = temp[k]; +#endif + count++; + } + /* Check if all Encoding Unit in the data buffer have been decoded, + according EvenOdd theory, if "RF_EO_MATRIX_DIM" is a prime number, + this algorithm will covered all buffer */ + RF_ASSERT( count == numRowInEncMatix ); + RF_Free((char *)P, bytesPerEU); + RF_Free((char *)temp, bytesPerEU); +} + + +/*************************************************************************************** +* This function is called by double degragded read +* EO_200_CreateReadDAG +* +***************************************************************************************/ +int rf_EvenOddDoubleRecoveryFunc(node) + RF_DagNode_t *node; +{ + int ndataParam = 0; + int np = node->numParams; + RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np-1].p; + RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np-2].p; + RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &(raidPtr->Layout); + int i, prm, sector, nresults = node->numResults; + RF_SectorCount_t secPerSU = layoutPtr->sectorsPerStripeUnit; + unsigned sosAddr; + int two = 0, mallc_one= 0, mallc_two = 0; /* flags to indicate if memory is allocated */ + int bytesPerSector = rf_RaidAddressToByte(raidPtr, 1 ); + RF_PhysDiskAddr_t *ppda,*ppda2,*epda,*epda2,*pda, *pda0, *pda1, npda; + RF_RowCol_t fcol[2], fsuoff[2], fsuend[2], numDataCol = layoutPtr->numDataCol; + char **buf, *ebuf, *pbuf, *dest[2]; + long *suoff=NULL, *suend=NULL, *prmToCol=NULL, psuoff, esuoff; + RF_SectorNum_t startSector, endSector; + RF_Etimer_t timer; + RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; + + RF_ETIMER_START(timer); + + /* Find out the number of parameters which are pdas for data information */ + for (i = 0; i<= np; i++) + if( ((RF_PhysDiskAddr_t *)node->params[i].p)->type != RF_PDA_TYPE_DATA) {ndataParam = i ; break; } + + RF_Malloc(buf, numDataCol*sizeof(char *), (char **)); + if (ndataParam != 0 ){ + RF_Malloc(suoff, ndataParam*sizeof(long), (long *) ); + RF_Malloc(suend, ndataParam*sizeof(long), (long *) ); + RF_Malloc(prmToCol, ndataParam*sizeof(long), (long *) ); + } + + if (asmap->failedPDAs[1] && + (asmap->failedPDAs[1]->numSector + asmap->failedPDAs[0]->numSector < secPerSU)) { + RF_ASSERT(0); /* currently, no support for this situation */ + ppda = node->params[np-6].p; + ppda2 = node->params[np-5].p; + RF_ASSERT( ppda2->type == RF_PDA_TYPE_PARITY ); + epda = node->params[np-4].p; + epda2 = node->params[np-3].p; + RF_ASSERT( epda2->type == RF_PDA_TYPE_Q ); + two = 1; + } + else { + ppda = node->params[np-4].p; + epda = node->params[np-3].p; + psuoff = rf_StripeUnitOffset(layoutPtr, ppda->startSector); + esuoff = 
rf_StripeUnitOffset(layoutPtr, epda->startSector); + RF_ASSERT( psuoff == esuoff ); + } + /* + the followings have three goals: + 1. determine the startSector to begin decoding and endSector to end decoding. + 2. determine the colume numbers of the two failed disks. + 3. determine the offset and end offset of the access within each failed stripe unit. + */ + if( nresults == 1 ) { + /* find the startSector to begin decoding */ + pda = node->results[0]; + bzero(pda->bufPtr, bytesPerSector*pda->numSector ); + fsuoff[0] = rf_StripeUnitOffset(layoutPtr, pda->startSector ); + fsuend[0] = fsuoff[0] + pda->numSector; + startSector = fsuoff[0]; + endSector = fsuend[0]; + + /* find out the the column of failed disk being accessed */ + fcol[0] = rf_EUCol(layoutPtr, pda->raidAddress ); + + /* find out the other failed colume not accessed */ + sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress); + for (i=0; i < numDataCol; i++) { + npda.raidAddress = sosAddr + (i * secPerSU); + (raidPtr->Layout.map->MapSector)(raidPtr, npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0); + /* skip over dead disks */ + if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status)) + if (i != fcol[0]) break; + } + RF_ASSERT (i < numDataCol); + fcol[1] = i; + } + else { + RF_ASSERT ( nresults == 2 ); + pda0 = node->results[0]; bzero(pda0->bufPtr, bytesPerSector*pda0->numSector ); + pda1 = node->results[1]; bzero(pda1->bufPtr, bytesPerSector*pda1->numSector ); + /* determine the failed colume numbers of the two failed disks. */ + fcol[0] = rf_EUCol(layoutPtr, pda0->raidAddress ); + fcol[1] = rf_EUCol(layoutPtr, pda1->raidAddress ); + /* determine the offset and end offset of the access within each failed stripe unit. */ + fsuoff[0] = rf_StripeUnitOffset(layoutPtr, pda0->startSector ); + fsuend[0] = fsuoff[0] + pda0->numSector; + fsuoff[1] = rf_StripeUnitOffset(layoutPtr, pda1->startSector ); + fsuend[1] = fsuoff[1] + pda1->numSector; + /* determine the startSector to begin decoding */ + startSector = RF_MIN( pda0->startSector, pda1->startSector ); + /* determine the endSector to end decoding */ + endSector = RF_MAX( fsuend[0], fsuend[1] ); + } + /* + assign the beginning sector and the end sector for each parameter + find out the corresponding colume # for each parameter + */ + for( prm=0; prm < ndataParam; prm++ ) { + pda = node->params[prm].p; + suoff[prm] = rf_StripeUnitOffset(layoutPtr, pda->startSector); + suend[prm] = suoff[prm] + pda->numSector; + prmToCol[prm] = rf_EUCol(layoutPtr, pda->raidAddress ); + } + /* 'sector' is the sector for the current decoding algorithm. For each sector in the failed SU, + find out the corresponding parameters that cover the current sector and that are needed for + decoding of this sector in failed SU. 2. Find out if sector is in the shadow of any accessed + failed SU. If not, malloc a temporary space of a sector in size. + */ + for( sector = startSector; sector < endSector; sector++ ){ + if ( nresults == 2 ) + if( !(fsuoff[0]<=sector && sector<fsuend[0]) && !(fsuoff[1]<=sector && sector<fsuend[1]) )continue; + for( prm=0; prm < ndataParam; prm++ ) + if( suoff[prm] <= sector && sector < suend[prm] ) + buf[(prmToCol[prm])] = ((RF_PhysDiskAddr_t *)node->params[prm].p)->bufPtr + + rf_RaidAddressToByte(raidPtr, sector-suoff[prm]); + /* find out if sector is in the shadow of any accessed failed SU. If yes, assign dest[0], dest[1] to point + at suitable position of the buffer corresponding to failed SUs. 
if no, malloc a temporary space of + a sector in size for destination of decoding. + */ + RF_ASSERT( nresults == 1 || nresults == 2 ); + if ( nresults == 1) { + dest[0] = ((RF_PhysDiskAddr_t *)node->results[0])->bufPtr + rf_RaidAddressToByte(raidPtr, sector-fsuoff[0]); + /* Always malloc temp buffer to dest[1] */ + RF_Malloc( dest[1], bytesPerSector, (char *) ); + bzero(dest[1],bytesPerSector); mallc_two = 1; } + else { + if( fsuoff[0] <= sector && sector < fsuend[0] ) + dest[0] = ((RF_PhysDiskAddr_t *)node->results[0])->bufPtr + rf_RaidAddressToByte(raidPtr, sector-fsuoff[0]); + else { RF_Malloc( dest[0], bytesPerSector, (char *) ); + bzero(dest[0],bytesPerSector); mallc_one = 1; } + if( fsuoff[1] <= sector && sector < fsuend[1] ) + dest[1] = ((RF_PhysDiskAddr_t *)node->results[1])->bufPtr + rf_RaidAddressToByte(raidPtr, sector-fsuoff[1]); + else { RF_Malloc( dest[1], bytesPerSector, (char *) ); + bzero(dest[1],bytesPerSector); mallc_two = 1; } + RF_ASSERT( mallc_one == 0 || mallc_two == 0 ); + } + pbuf = ppda->bufPtr + rf_RaidAddressToByte(raidPtr, sector-psuoff ); + ebuf = epda->bufPtr + rf_RaidAddressToByte(raidPtr, sector-esuoff ); + /* + * After finish finding all needed sectors, call doubleEOdecode function for decoding + * one sector to destination. + */ + rf_doubleEOdecode(raidPtr, buf, dest, fcol, pbuf, ebuf ); + /* free all allocated memory, and mark flag to indicate no memory is being allocated */ + if( mallc_one == 1) RF_Free( dest[0], bytesPerSector ); + if( mallc_two == 1) RF_Free( dest[1], bytesPerSector ); + mallc_one = mallc_two = 0; + } + RF_Free(buf, numDataCol*sizeof(char *)); + if (ndataParam != 0){ + RF_Free(suoff, ndataParam*sizeof(long)); + RF_Free(suend, ndataParam*sizeof(long)); + RF_Free(prmToCol, ndataParam*sizeof(long)); + } + + RF_ETIMER_STOP(timer); + RF_ETIMER_EVAL(timer); + if (tracerec) { + tracerec->q_us += RF_ETIMER_VAL_US(timer); + } + rf_GenericWakeupFunc(node,0); +#if 1 + return(0); /* XXX is this even close!!?!?!!? GO */ +#endif +} + + +/* currently, only access of one of the two failed SU is allowed in this function. + * also, asmap->numStripeUnitsAccessed is limited to be one, the RaidFrame will break large access into + * many accesses of single stripe unit. + */ + +int rf_EOWriteDoubleRecoveryFunc(node) + RF_DagNode_t *node; +{ + int np = node->numParams; + RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np-1].p; + RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np-2].p; + RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &(raidPtr->Layout); + RF_SectorNum_t sector; + RF_RowCol_t col, scol; + int prm, i, j; + RF_SectorCount_t secPerSU = layoutPtr->sectorsPerStripeUnit; + unsigned sosAddr; + unsigned bytesPerSector = rf_RaidAddressToByte(raidPtr, 1 ); + RF_int64 numbytes; + RF_SectorNum_t startSector, endSector; + RF_PhysDiskAddr_t *ppda,*epda,*pda, *fpda, npda; + RF_RowCol_t fcol[2], numDataCol = layoutPtr->numDataCol; + char **buf; /* buf[0], buf[1], buf[2], ...etc. 
point to the buffers storing data read from col0, col1, col2 */ + char *ebuf, *pbuf, *dest[2], *olddata[2]; + RF_Etimer_t timer; + RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; + + RF_ASSERT( asmap->numDataFailed == 1 ); /* currently we only support this case, where the other failed SU is not being accessed */ + RF_ETIMER_START(timer); + RF_Malloc(buf, numDataCol*sizeof(char *), (char **)); + + ppda = node->results[0]; /* Instead of being buffers, node->results[0] and [1] are Ppda and Epda */ + epda = node->results[1]; + fpda = asmap->failedPDAs[0]; + + /* First, recover the failed old SU using EvenOdd double decoding */ + /* determine the startSector and endSector for decoding */ + startSector = rf_StripeUnitOffset(layoutPtr, fpda->startSector ); + endSector = startSector + fpda->numSector; + /* Assign the buf[col] pointers to point to each non-failed column, and initialize pbuf + and ebuf to point at the beginning of the source and destination buffers */ + for( prm=0; prm < numDataCol-2; prm++ ) { + pda = (RF_PhysDiskAddr_t *)node->params[prm].p; + col = rf_EUCol(layoutPtr, pda->raidAddress ); + buf[col] = pda->bufPtr; + } + /* pbuf and ebuf: their values will change as the double recovery decoding goes on */ + pbuf = ppda->bufPtr; + ebuf = epda->bufPtr; + /* find out the logical column numbers in the encoding matrix of the two failed columns */ + fcol[0] = rf_EUCol(layoutPtr, fpda->raidAddress ); + + /* find out the other failed column, which is not accessed this time */ + sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress); + for (i=0; i < numDataCol; i++) { + npda.raidAddress = sosAddr + (i * secPerSU); + (raidPtr->Layout.map->MapSector)(raidPtr, npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0); + /* skip over dead disks */ + if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status)) + if (i != fcol[0]) break; + } + RF_ASSERT (i < numDataCol); + fcol[1] = i; + /* allocate temporary space to hold the recovered failed SU */ + numbytes = fpda->numSector * bytesPerSector; + RF_Malloc(olddata[0], numbytes, (char *) ); + RF_Malloc(olddata[1], numbytes, (char *) ); + dest[0] = olddata[0]; + dest[1] = olddata[1]; + bzero(olddata[0], numbytes); + bzero(olddata[1], numbytes); + /* Begin the recovery decoding; initially buf[j], ebuf, pbuf and dest[j] already + point at the beginning of the source and destination buffers */ + for( sector = startSector, i=0; sector < endSector; sector++ , i++){ + rf_doubleEOdecode(raidPtr, buf, dest, fcol, pbuf, ebuf ); + for (j=0; j < numDataCol; j++) + if( ( j != fcol[0]) && ( j != fcol[1] ) ) buf[j] += bytesPerSector; + dest[0] += bytesPerSector; + dest[1] += bytesPerSector; + ebuf += bytesPerSector; + pbuf += bytesPerSector; + } + /* After recovery, the buffer pointed to by olddata[0] holds the old failed data. + With the new write data and this old data, use a small write to calculate + the new redundant information. + */ + /* node->params[ 0, ... PDAPerDisk * (numDataCol - 2)-1 ] are Pdas of Rrd; + params[ PDAPerDisk*(numDataCol - 2), ... PDAPerDisk*numDataCol -1 ] are Pdas of Rp, ( Rp2 ), Re, ( Re2 ) ; + params[ PDAPerDisk*numDataCol, ... PDAPerDisk*numDataCol +asmap->numStripeUnitsAccessed -asmap->numDataFailed-1] + are Pdas of wudNodes; + For the current implementation, we assume the simplest case: + asmap->numStripeUnitsAccessed == 1 and asmap->numDataFailed == 1, i.e. PDAPerDisk = 1; + then node->params[numDataCol] must be the new data to be written to the failed disk.
We first bxor the new data + into the old recovered data, then do the same things as a small write. + */ + + rf_bxor( ((RF_PhysDiskAddr_t *)node->params[numDataCol].p)->bufPtr, olddata[0], numbytes, node->dagHdr->bp); + /* do the new 'E' calculation */ + /* find out the corresponding column in the encoding matrix for the write column to be encoded into redundant disk 'E' */ + scol = rf_EUCol(layoutPtr, fpda->raidAddress ); + /* olddata[0] is now the source buffer pointer; epda->bufPtr is the dest buffer pointer */ + rf_e_encToBuf(raidPtr, scol, olddata[0], RF_EO_MATRIX_DIM - 2, epda->bufPtr, fpda->numSector); + + /* do the new 'P' calculation */ + rf_bxor( olddata[0], ppda->bufPtr, numbytes, node->dagHdr->bp); + /* Free the allocated buffers */ + RF_Free( olddata[0], numbytes ); + RF_Free( olddata[1], numbytes ); + RF_Free( buf, numDataCol*sizeof(char *)); + + RF_ETIMER_STOP(timer); + RF_ETIMER_EVAL(timer); + if (tracerec) { + tracerec->q_us += RF_ETIMER_VAL_US(timer); + } + + rf_GenericWakeupFunc(node,0); + return(0); +} diff --git a/sys/dev/raidframe/rf_evenodd_dagfuncs.h b/sys/dev/raidframe/rf_evenodd_dagfuncs.h new file mode 100644 index 00000000000..9773e57cedd --- /dev/null +++ b/sys/dev/raidframe/rf_evenodd_dagfuncs.h @@ -0,0 +1,77 @@ +/* $OpenBSD: rf_evenodd_dagfuncs.h,v 1.1 1999/01/11 14:29:22 niklas Exp $ */ +/* $NetBSD: rf_evenodd_dagfuncs.h,v 1.1 1998/11/13 04:20:29 oster Exp $ */ +/* + * rf_evenodd_dagfuncs.h + */ +/* + * Copyright (c) 1996 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Chang-Ming Wu + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes.
+ */ + +#ifndef _RF__RF_EVENODD_DAGFUNCS_H_ +#define _RF__RF_EVENODD_DAGFUNCS_H_ + +extern RF_RedFuncs_t rf_EOSmallWriteEFuncs; +extern RF_RedFuncs_t rf_EOSmallWritePFuncs; +extern RF_RedFuncs_t rf_eoERecoveryFuncs; +extern RF_RedFuncs_t rf_eoPRecoveryFuncs; +extern RF_RedFuncs_t rf_eoERecoveryFuncs; + +int rf_RegularPEFunc(RF_DagNode_t *node); +int rf_RegularONEFunc(RF_DagNode_t *node); +int rf_SimpleONEFunc(RF_DagNode_t *node); +void rf_RegularESubroutine(RF_DagNode_t *node, char *ebuf); +int rf_RegularEFunc(RF_DagNode_t *node); +void rf_DegrESubroutine(RF_DagNode_t *node, char *ebuf); +int rf_Degraded_100_EOFunc(RF_DagNode_t *node); +void rf_e_EncOneSect(RF_RowCol_t srcLogicCol, char *srcSecbuf, + RF_RowCol_t destLogicCol, char *destSecbuf, int bytesPerSector); +void rf_e_encToBuf(RF_Raid_t *raidPtr, RF_RowCol_t srcLogicCol, + char *srcbuf, RF_RowCol_t destLogicCol, char *destbuf, int numSector); +int rf_RecoveryEFunc(RF_DagNode_t *node); +int rf_EO_DegradedWriteEFunc(RF_DagNode_t *node); +void rf_doubleEOdecode(RF_Raid_t *raidPtr, char **rrdbuf, char **dest, + RF_RowCol_t *fcol, char *pbuf, char *ebuf); +int rf_EvenOddDoubleRecoveryFunc(RF_DagNode_t *node); +int rf_EOWriteDoubleRecoveryFunc(RF_DagNode_t *node); + +#define rf_EUCol(_layoutPtr_, _addr_ ) \ +( (_addr_)%( (_layoutPtr_)->dataSectorsPerStripe ) )/((_layoutPtr_)->sectorsPerStripeUnit) + +#define rf_EO_Mod( _int1_, _int2_ ) \ +( ((_int1_) < 0)? (((_int1_)+(_int2_))%(_int2_)) : (_int1_)%(_int2_) ) + +#define rf_OffsetOfNextEUBoundary(_offset_, sec_per_eu) ((_offset_)/(sec_per_eu) + 1)*(sec_per_eu) + +#define RF_EO_MATRIX_DIM 17 + +/* + * RF_EO_MATRIX_DIM should be a prime number, and "bytesPerSector" should be + * divisible by (RF_EO_MATRIX_DIM - 1) to fully encode and utilize the space + * in a sector; this number could also be 17. The latter case does not apply + * to disk arrays with more than 17 columns in total. + */ + +#endif /* !_RF__RF_EVENODD_DAGFUNCS_H_ */ diff --git a/sys/dev/raidframe/rf_evenodd_dags.c b/sys/dev/raidframe/rf_evenodd_dags.c new file mode 100644 index 00000000000..775fd5008f9 --- /dev/null +++ b/sys/dev/raidframe/rf_evenodd_dags.c @@ -0,0 +1,199 @@ +/* $OpenBSD: rf_evenodd_dags.c,v 1.1 1999/01/11 14:29:22 niklas Exp $ */ +/* $NetBSD: rf_evenodd_dags.c,v 1.1 1998/11/13 04:20:29 oster Exp $ */ +/* + * rf_evenodd_dags.c + */ +/* + * Copyright (c) 1996 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Chang-Ming Wu + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes.
+ */ + +#include "rf_archs.h" + +#if RF_INCLUDE_EVENODD > 0 + +#include "rf_types.h" +#include "rf_raid.h" +#include "rf_dag.h" +#include "rf_dagfuncs.h" +#include "rf_dagutils.h" +#include "rf_etimer.h" +#include "rf_acctrace.h" +#include "rf_general.h" +#include "rf_evenodd_dags.h" +#include "rf_evenodd.h" +#include "rf_evenodd_dagfuncs.h" +#include "rf_pq.h" +#include "rf_dagdegrd.h" +#include "rf_dagdegwr.h" +#include "rf_dagffwr.h" + + +/* + * Lost one data. + * Use P to reconstruct missing data. + */ +RF_CREATE_DAG_FUNC_DECL(rf_EO_100_CreateReadDAG) +{ + rf_CreateDegradedReadDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_eoPRecoveryFuncs); +} + +/* + * Lost data + E. + * Use P to reconstruct missing data. + */ +RF_CREATE_DAG_FUNC_DECL(rf_EO_101_CreateReadDAG) +{ + rf_CreateDegradedReadDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_eoPRecoveryFuncs); +} + +/* + * Lost data + P. + * Make E look like P, and use Eor for Xor, and we can + * use degraded read DAG. + */ +RF_CREATE_DAG_FUNC_DECL(rf_EO_110_CreateReadDAG) +{ + RF_PhysDiskAddr_t *temp; + /* swap P and E pointers to fake out the DegradedReadDAG code */ + temp = asmap->parityInfo; asmap->parityInfo = asmap->qInfo; asmap->qInfo = temp; + rf_CreateDegradedReadDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_eoERecoveryFuncs); +} + +/* + * Lost two data. + */ +RF_CREATE_DAG_FUNC_DECL(rf_EOCreateDoubleDegradedReadDAG) +{ + rf_EO_DoubleDegRead(raidPtr, asmap, dag_h, bp, flags, allocList); +} + +/* + * Lost two data. + */ +RF_CREATE_DAG_FUNC_DECL(rf_EO_200_CreateReadDAG) +{ + rf_EOCreateDoubleDegradedReadDAG(raidPtr, asmap, dag_h, bp, flags, allocList); +} + +RF_CREATE_DAG_FUNC_DECL(rf_EO_100_CreateWriteDAG) +{ + if (asmap->numStripeUnitsAccessed != 1 && + asmap->failedPDAs[0]->numSector != raidPtr->Layout.sectorsPerStripeUnit) + RF_PANIC(); + rf_CommonCreateSimpleDegradedWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 2, (int (*)(RF_DagNode_t *))rf_Degraded_100_EOFunc, RF_TRUE); +} + +/* + * E is dead. Small write. + */ +RF_CREATE_DAG_FUNC_DECL(rf_EO_001_CreateSmallWriteDAG) +{ + rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_EOSmallWritePFuncs, NULL); +} + +/* + * E is dead. Large write. + */ +RF_CREATE_DAG_FUNC_DECL(rf_EO_001_CreateLargeWriteDAG) +{ + rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 1, rf_RegularPFunc, RF_TRUE); +} + +/* + * P is dead. Small write. + * Swap E + P, use single-degraded stuff. + */ +RF_CREATE_DAG_FUNC_DECL(rf_EO_010_CreateSmallWriteDAG) +{ + RF_PhysDiskAddr_t *temp; + /* swap P and E pointers to fake out the DegradedReadDAG code */ + temp = asmap->parityInfo; asmap->parityInfo = asmap->qInfo; asmap->qInfo = temp; + rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_EOSmallWriteEFuncs, NULL); +} + +/* + * P is dead. Large write. + * Swap E + P, use single-degraded stuff. 
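+ * (The asmap->parityInfo and asmap->qInfo pointers are swapped below so that the + * generic large-write code computes E where it would normally compute P; the + * small-write case above uses the same trick.)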
+ */ +RF_CREATE_DAG_FUNC_DECL(rf_EO_010_CreateLargeWriteDAG) +{ + RF_PhysDiskAddr_t *temp; + /* swap P and E pointers to fake out the code */ + temp = asmap->parityInfo; asmap->parityInfo = asmap->qInfo; asmap->qInfo = temp; + rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 1, rf_RegularEFunc, RF_FALSE); +} + +RF_CREATE_DAG_FUNC_DECL(rf_EO_011_CreateWriteDAG) +{ + rf_CreateNonRedundantWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, + RF_IO_TYPE_WRITE); +} + +RF_CREATE_DAG_FUNC_DECL(rf_EO_110_CreateWriteDAG) +{ + RF_PhysDiskAddr_t *temp; + + if (asmap->numStripeUnitsAccessed != 1 && + asmap->failedPDAs[0]->numSector != raidPtr->Layout.sectorsPerStripeUnit) + { + RF_PANIC(); + } + /* swap P and E to fake out parity code */ + temp = asmap->parityInfo; asmap->parityInfo = asmap->qInfo; asmap->qInfo = temp; + rf_CommonCreateSimpleDegradedWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList,1, (int (*)(RF_DagNode_t *))rf_EO_DegradedWriteEFunc, RF_FALSE); + /* is the regular E func the right one to call? */ +} + +RF_CREATE_DAG_FUNC_DECL(rf_EO_101_CreateWriteDAG) +{ + if (asmap->numStripeUnitsAccessed != 1 && + asmap->failedPDAs[0]->numSector != raidPtr->Layout.sectorsPerStripeUnit) + RF_PANIC(); + rf_CommonCreateSimpleDegradedWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList,1, rf_RecoveryXorFunc, RF_TRUE); +} + +RF_CREATE_DAG_FUNC_DECL(rf_EO_DoubleDegRead) +{ + rf_DoubleDegRead(raidPtr, asmap, dag_h, bp, flags, allocList, + "Re", "EvenOddRecovery", rf_EvenOddDoubleRecoveryFunc); +} + +RF_CREATE_DAG_FUNC_DECL(rf_EOCreateSmallWriteDAG) +{ + rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_pFuncs, &rf_EOSmallWriteEFuncs); +} + +RF_CREATE_DAG_FUNC_DECL(rf_EOCreateLargeWriteDAG) +{ + rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 2, rf_RegularPEFunc, RF_FALSE); +} + +RF_CREATE_DAG_FUNC_DECL(rf_EO_200_CreateWriteDAG) +{ + rf_DoubleDegSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList, "Re", "We", "EOWrDDRecovery", rf_EOWriteDoubleRecoveryFunc); +} + +#endif /* RF_INCLUDE_EVENODD > 0 */ diff --git a/sys/dev/raidframe/rf_evenodd_dags.h b/sys/dev/raidframe/rf_evenodd_dags.h new file mode 100644 index 00000000000..3d125e8aa25 --- /dev/null +++ b/sys/dev/raidframe/rf_evenodd_dags.h @@ -0,0 +1,64 @@ +/* $OpenBSD: rf_evenodd_dags.h,v 1.1 1999/01/11 14:29:22 niklas Exp $ */ +/* $NetBSD: rf_evenodd_dags.h,v 1.1 1998/11/13 04:20:29 oster Exp $ */ +/* + * rf_evenodd_dags.h + */ +/* + * Copyright (c) 1996 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Chang-Ming Wu + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. 
+ */ + +#ifndef _RF__RF_EVENODD_DAGS_H_ +#define _RF__RF_EVENODD_DAGS_H_ + +#include "rf_types.h" + +#if RF_UTILITY == 0 +#include "rf_dag.h" + +/* extern decl's of the failure mode EO functions. + * swiped from rf_pqdeg.h + */ + +RF_CREATE_DAG_FUNC_DECL(rf_EO_100_CreateReadDAG); +RF_CREATE_DAG_FUNC_DECL(rf_EO_101_CreateReadDAG); +RF_CREATE_DAG_FUNC_DECL(rf_EO_110_CreateReadDAG); +RF_CREATE_DAG_FUNC_DECL(rf_EO_200_CreateReadDAG); +RF_CREATE_DAG_FUNC_DECL(rf_EOCreateDoubleDegradedReadDAG); +RF_CREATE_DAG_FUNC_DECL(rf_EO_100_CreateWriteDAG); +RF_CREATE_DAG_FUNC_DECL(rf_EO_010_CreateSmallWriteDAG); +RF_CREATE_DAG_FUNC_DECL(rf_EO_001_CreateSmallWriteDAG); +RF_CREATE_DAG_FUNC_DECL(rf_EO_010_CreateLargeWriteDAG); +RF_CREATE_DAG_FUNC_DECL(rf_EO_001_CreateLargeWriteDAG); +RF_CREATE_DAG_FUNC_DECL(rf_EO_011_CreateWriteDAG); +RF_CREATE_DAG_FUNC_DECL(rf_EO_110_CreateWriteDAG); +RF_CREATE_DAG_FUNC_DECL(rf_EO_101_CreateWriteDAG); +RF_CREATE_DAG_FUNC_DECL(rf_EO_DoubleDegRead); +RF_CREATE_DAG_FUNC_DECL(rf_EOCreateSmallWriteDAG); +RF_CREATE_DAG_FUNC_DECL(rf_EOCreateLargeWriteDAG); +RF_CREATE_DAG_FUNC_DECL(rf_EO_200_CreateWriteDAG); +#endif /* RF_UTILITY == 0 */ + +#endif /* !_RF__RF_EVENODD_DAGS_H_ */ diff --git a/sys/dev/raidframe/rf_fifo.c b/sys/dev/raidframe/rf_fifo.c new file mode 100644 index 00000000000..63367aeb4ab --- /dev/null +++ b/sys/dev/raidframe/rf_fifo.c @@ -0,0 +1,371 @@ +/* $OpenBSD: rf_fifo.c,v 1.1 1999/01/11 14:29:22 niklas Exp $ */ +/* $NetBSD: rf_fifo.c,v 1.1 1998/11/13 04:20:29 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/*************************************************** + * + * rf_fifo.c -- prioritized fifo queue code. + * There are only two priority levels: hi and lo. 
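+ * Normal-priority requests are always served first: the low-priority queue is + * drained only when the normal-priority queue is empty (see rf_FifoDequeue), + * and rf_FifoPromote can later move low-priority requests to the normal queue.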
+ * + * Aug 4, 1994, adapted from raidSim version (MCH) + * + ***************************************************/ + +/* + * : + * Log: rf_fifo.c,v + * Revision 1.20 1996/06/18 20:53:11 jimz + * fix up disk queueing (remove configure routine, + * add shutdown list arg to create routines) + * + * Revision 1.19 1996/06/14 00:08:21 jimz + * make happier in all environments + * + * Revision 1.18 1996/06/13 20:41:24 jimz + * add random queueing + * + * Revision 1.17 1996/06/09 02:36:46 jimz + * lots of little crufty cleanup- fixup whitespace + * issues, comment #ifdefs, improve typing in some + * places (esp size-related) + * + * Revision 1.16 1996/06/07 22:26:27 jimz + * type-ify which_ru (RF_ReconUnitNum_t) + * + * Revision 1.15 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.14 1996/06/06 01:15:02 jimz + * added debugging + * + * Revision 1.13 1996/05/30 23:22:16 jimz + * bugfixes of serialization, timing problems + * more cleanup + * + * Revision 1.12 1996/05/30 11:29:41 jimz + * Numerous bug fixes. Stripe lock release code disagreed with the taking code + * about when stripes should be locked (I made it consistent: no parity, no lock) + * There was a lot of extra serialization of I/Os which I've removed- a lot of + * it was to calculate values for the cache code, which is no longer with us. + * More types, function, macro cleanup. Added code to properly quiesce the array + * on shutdown. Made a lot of stuff array-specific which was (bogusly) general + * before. Fixed memory allocation, freeing bugs. + * + * Revision 1.11 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.10 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.9 1995/12/12 18:10:06 jimz + * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT + * fix 80-column brain damage in comments + * + * Revision 1.8 1995/12/01 18:22:15 root + * added copyright info + * + * Revision 1.7 1995/11/07 15:32:16 wvcii + * added function FifoPeek() + * + */ + +#include "rf_types.h" +#include "rf_alloclist.h" +#include "rf_stripelocks.h" +#include "rf_layout.h" +#include "rf_diskqueue.h" +#include "rf_fifo.h" +#include "rf_debugMem.h" +#include "rf_general.h" +#include "rf_threadid.h" +#include "rf_options.h" + +#if !defined(KERNEL) && RF_INCLUDE_QUEUE_RANDOM > 0 +#include "rf_randmacros.h" +RF_DECLARE_STATIC_RANDOM +#endif /* !KERNEL && RF_INCLUDE_QUEUE_RANDOM > 0 */ + +/* just malloc a header, zero it (via calloc), and return it */ +/*ARGSUSED*/ +void *rf_FifoCreate(sectPerDisk, clList, listp) + RF_SectorCount_t sectPerDisk; + RF_AllocListElem_t *clList; + RF_ShutdownList_t **listp; +{ + RF_FifoHeader_t *q; + +#if !defined(KERNEL) && RF_INCLUDE_QUEUE_RANDOM > 0 + RF_INIT_STATIC_RANDOM(1); +#endif /* !KERNEL && RF_INCLUDE_QUEUE_RANDOM > 0 */ + RF_CallocAndAdd(q, 1, sizeof(RF_FifoHeader_t), (RF_FifoHeader_t *), clList); + q->hq_count = q->lq_count = 0; +#if !defined(KERNEL) && RF_INCLUDE_QUEUE_RANDOM > 0 + q->rval = (long)RF_STATIC_RANDOM(); +#endif /* !KERNEL && RF_INCLUDE_QUEUE_RANDOM > 0 */ + return((void *)q); +} + +void rf_FifoEnqueue(q_in, elem, priority) + void *q_in; + RF_DiskQueueData_t *elem; + int priority; +{ + RF_FifoHeader_t *q = (RF_FifoHeader_t *)q_in; + + RF_ASSERT(priority == RF_IO_NORMAL_PRIORITY || priority == RF_IO_LOW_PRIORITY); + + elem->next = NULL; + if (priority == 
RF_IO_NORMAL_PRIORITY) { + if (!q->hq_tail) { + RF_ASSERT(q->hq_count == 0 && q->hq_head == NULL); + q->hq_head = q->hq_tail = elem; + } else { + RF_ASSERT(q->hq_count != 0 && q->hq_head != NULL); + q->hq_tail->next = elem; + q->hq_tail = elem; + } + q->hq_count++; + } + else { + RF_ASSERT(elem->next == NULL); + if (rf_fifoDebug) { + int tid; + rf_get_threadid(tid); + printf("[%d] fifo: ENQ lopri\n", tid); + } + if (!q->lq_tail) { + RF_ASSERT(q->lq_count == 0 && q->lq_head == NULL); + q->lq_head = q->lq_tail = elem; + } else { + RF_ASSERT(q->lq_count != 0 && q->lq_head != NULL); + q->lq_tail->next = elem; + q->lq_tail = elem; + } + q->lq_count++; + } + if ((q->hq_count + q->lq_count)!= elem->queue->queueLength) { + printf("Queue lengths differ!: %d %d %d\n", + q->hq_count, q->lq_count, (int)elem->queue->queueLength); + printf("%d %d %d %d\n", + (int)elem->queue->numOutstanding, + (int)elem->queue->maxOutstanding, + (int)elem->queue->row, + (int)elem->queue->col); + } + RF_ASSERT((q->hq_count + q->lq_count) == elem->queue->queueLength); +} + +RF_DiskQueueData_t *rf_FifoDequeue(q_in) + void *q_in; +{ + RF_FifoHeader_t *q = (RF_FifoHeader_t *) q_in; + RF_DiskQueueData_t *nd; + + RF_ASSERT(q); + if (q->hq_head) { + RF_ASSERT(q->hq_count != 0 && q->hq_tail != NULL); + nd = q->hq_head; q->hq_head = q->hq_head->next; + if (!q->hq_head) q->hq_tail = NULL; + nd->next = NULL; + q->hq_count--; + } else if (q->lq_head) { + RF_ASSERT(q->lq_count != 0 && q->lq_tail != NULL); + nd = q->lq_head; q->lq_head = q->lq_head->next; + if (!q->lq_head) q->lq_tail = NULL; + nd->next = NULL; + q->lq_count--; + if (rf_fifoDebug) { + int tid; + rf_get_threadid(tid); + printf("[%d] fifo: DEQ lopri %lx\n", tid, (long)nd); + } + } else { + RF_ASSERT(q->hq_count == 0 && q->lq_count == 0 && q->hq_tail == NULL && q->lq_tail == NULL); + nd = NULL; + } + return(nd); +} + +/* This never gets used!! No loss (I hope) if we don't include it... 
GO */ +#if !defined(__NetBSD__) && !defined(__OpenBSD__) && !defined(_KERNEL) + +static RF_DiskQueueData_t *n_in_q(headp, tailp, countp, n, deq) + RF_DiskQueueData_t **headp; + RF_DiskQueueData_t **tailp; + int *countp; + int n; + int deq; +{ + RF_DiskQueueData_t *r, *s; + int i; + + for(s=NULL,i=n,r=*headp;r;s=r,r=r->next) { + if (i == 0) + break; + i--; + } + RF_ASSERT(r != NULL); + if (deq == 0) + return(r); + if (s) { + s->next = r->next; + } + else { + *headp = r->next; + } + if (*tailp == r) + *tailp = s; + (*countp)--; + return(r); +} +#endif + +#if !defined(KERNEL) && RF_INCLUDE_QUEUE_RANDOM > 0 +RF_DiskQueueData_t *rf_RandomPeek(q_in) + void *q_in; +{ + RF_FifoHeader_t *q = (RF_FifoHeader_t *) q_in; + RF_DiskQueueData_t *req; + int n; + + if (q->hq_head) { + n = q->rval % q->hq_count; + req = n_in_q(&q->hq_head, &q->hq_tail, &q->hq_count, n, 0); + } + else { + RF_ASSERT(q->hq_count == 0); + if (q->lq_head == NULL) { + RF_ASSERT(q->lq_count == 0); + return(NULL); + } + n = q->rval % q->lq_count; + req = n_in_q(&q->lq_head, &q->lq_tail, &q->lq_count, n, 0); + } + RF_ASSERT((q->hq_count + q->lq_count) == req->queue->queueLength); + RF_ASSERT(req != NULL); + return(req); +} + +RF_DiskQueueData_t *rf_RandomDequeue(q_in) + void *q_in; +{ + RF_FifoHeader_t *q = (RF_FifoHeader_t *) q_in; + RF_DiskQueueData_t *req; + int n; + + if (q->hq_head) { + n = q->rval % q->hq_count; + q->rval = (long)RF_STATIC_RANDOM(); + req = n_in_q(&q->hq_head, &q->hq_tail, &q->hq_count, n, 1); + } + else { + RF_ASSERT(q->hq_count == 0); + if (q->lq_head == NULL) { + RF_ASSERT(q->lq_count == 0); + return(NULL); + } + n = q->rval % q->lq_count; + q->rval = (long)RF_STATIC_RANDOM(); + req = n_in_q(&q->lq_head, &q->lq_tail, &q->lq_count, n, 1); + } + RF_ASSERT((q->hq_count + q->lq_count) == (req->queue->queueLength-1)); + return(req); +} +#endif /* !KERNEL && RF_INCLUDE_QUEUE_RANDOM > 0 */ + +/* Return ptr to item at head of queue. Used to examine request + * info without actually dequeueing the request. + */ +RF_DiskQueueData_t *rf_FifoPeek(void *q_in) +{ + RF_DiskQueueData_t *headElement = NULL; + RF_FifoHeader_t *q = (RF_FifoHeader_t *) q_in; + + RF_ASSERT(q); + if (q->hq_head) + headElement = q->hq_head; + else if (q->lq_head) + headElement = q->lq_head; + return(headElement); +} + +/* We sometimes need to promote a low priority access to a regular priority access. + * Currently, this is only used when the user wants to write a stripe which is currently + * under reconstruction. + * This routine will promote all accesses tagged with the indicated parityStripeID from + * the low priority queue to the end of the normal priority queue. + * We assume the queue is locked upon entry. 
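+ * The return value is the number of requests promoted; the sanity check at the + * end of the routine expects this to be either 0 or 1.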
+ */ +int rf_FifoPromote(q_in, parityStripeID, which_ru) + void *q_in; + RF_StripeNum_t parityStripeID; + RF_ReconUnitNum_t which_ru; +{ + RF_FifoHeader_t *q = (RF_FifoHeader_t *) q_in; + RF_DiskQueueData_t *lp = q->lq_head, *pt = NULL; /* lp = lo-pri queue pointer, pt = trailer */ + int retval = 0; + + while (lp) { + + /* search for the indicated parity stripe in the low-pri queue */ + if (lp->parityStripeID == parityStripeID && lp->which_ru == which_ru) { + /*printf("FifoPromote: promoting access for psid %ld\n",parityStripeID);*/ + if (pt) pt->next = lp->next; /* delete an entry other than the first */ + else q->lq_head = lp->next; /* delete the head entry */ + + if (!q->lq_head) q->lq_tail = NULL; /* we deleted the only entry */ + else if (lp == q->lq_tail) q->lq_tail = pt; /* we deleted the tail entry */ + + lp->next = NULL; + q->lq_count--; + + if (q->hq_tail) {q->hq_tail->next = lp; q->hq_tail = lp;} /* append to hi-priority queue */ + else {q->hq_head = q->hq_tail = lp;} + q->hq_count++; + + /*UpdateShortestSeekFinishTimeForced(lp->requestPtr, lp->diskState);*/ /* deal with this later, if ever */ + + lp = (pt) ? pt->next : q->lq_head; /* reset low-pri pointer and continue */ + retval++; + + } else {pt = lp; lp = lp->next;} + } + + /* sanity check. delete this if you ever put more than one entry in the low-pri queue */ + RF_ASSERT(retval == 0 || retval == 1); + if (rf_fifoDebug) { + int tid; + rf_get_threadid(tid); + printf("[%d] fifo: promote %d\n", tid, retval); + } + return(retval); +} diff --git a/sys/dev/raidframe/rf_fifo.h b/sys/dev/raidframe/rf_fifo.h new file mode 100644 index 00000000000..44d2cc577f4 --- /dev/null +++ b/sys/dev/raidframe/rf_fifo.h @@ -0,0 +1,115 @@ +/* $OpenBSD: rf_fifo.h,v 1.1 1999/01/11 14:29:23 niklas Exp $ */ +/* $NetBSD: rf_fifo.h,v 1.1 1998/11/13 04:20:30 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* + * rf_fifo.h -- prioritized FIFO queue code. 
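+ * Declares RF_FifoHeader_t and the create/enqueue/dequeue/peek/promote + * entry points implemented in rf_fifo.c.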
+ * + * 4-9-93 Created (MCH) + */ + +/* + * : + * Log: rf_fifo.h,v + * Revision 1.12 1996/06/18 20:53:11 jimz + * fix up disk queueing (remove configure routine, + * add shutdown list arg to create routines) + * + * Revision 1.11 1996/06/13 20:41:28 jimz + * add random queueing + * + * Revision 1.10 1996/06/13 20:38:28 jimz + * add random dequeue, peek + * + * Revision 1.9 1996/06/09 02:36:46 jimz + * lots of little crufty cleanup- fixup whitespace + * issues, comment #ifdefs, improve typing in some + * places (esp size-related) + * + * Revision 1.8 1996/06/07 22:26:27 jimz + * type-ify which_ru (RF_ReconUnitNum_t) + * + * Revision 1.7 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.6 1996/05/30 23:22:16 jimz + * bugfixes of serialization, timing problems + * more cleanup + * + * Revision 1.5 1996/05/30 11:29:41 jimz + * Numerous bug fixes. Stripe lock release code disagreed with the taking code + * about when stripes should be locked (I made it consistent: no parity, no lock) + * There was a lot of extra serialization of I/Os which I've removed- a lot of + * it was to calculate values for the cache code, which is no longer with us. + * More types, function, macro cleanup. Added code to properly quiesce the array + * on shutdown. Made a lot of stuff array-specific which was (bogusly) general + * before. Fixed memory allocation, freeing bugs. + * + * Revision 1.4 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.3 1995/12/01 18:22:26 root + * added copyright info + * + * Revision 1.2 1995/11/07 15:31:57 wvcii + * added Peek() function + * + */ + +#ifndef _RF__RF_FIFO_H_ +#define _RF__RF_FIFO_H_ + +#include "rf_archs.h" +#include "rf_types.h" +#include "rf_diskqueue.h" + +typedef struct RF_FifoHeader_s { + RF_DiskQueueData_t *hq_head, *hq_tail; /* high priority requests */ + RF_DiskQueueData_t *lq_head, *lq_tail; /* low priority requests */ + int hq_count, lq_count; /* debug only */ +#if !defined(KERNEL) && RF_INCLUDE_QUEUE_RANDOM > 0 + long rval; /* next random number (random qpolicy) */ +#endif /* !KERNEL && RF_INCLUDE_QUEUE_RANDOM > 0 */ +} RF_FifoHeader_t; + +extern void *rf_FifoCreate(RF_SectorCount_t sectPerDisk, + RF_AllocListElem_t *clList, RF_ShutdownList_t **listp); +extern void rf_FifoEnqueue(void *q_in, RF_DiskQueueData_t *elem, + int priority); +extern RF_DiskQueueData_t *rf_FifoDequeue(void *q_in); +extern RF_DiskQueueData_t *rf_FifoPeek(void *q_in); +extern int rf_FifoPromote(void *q_in, RF_StripeNum_t parityStripeID, + RF_ReconUnitNum_t which_ru); +#if !defined(KERNEL) && RF_INCLUDE_QUEUE_RANDOM > 0 +extern RF_DiskQueueData_t *rf_RandomDequeue(void *q_in); +extern RF_DiskQueueData_t *rf_RandomPeek(void *q_in); +#endif /* !KERNEL && RF_INCLUDE_QUEUE_RANDOM > 0 */ + +#endif /* !_RF__RF_FIFO_H_ */ diff --git a/sys/dev/raidframe/rf_freelist.h b/sys/dev/raidframe/rf_freelist.h new file mode 100644 index 00000000000..8f8e4f5120d --- /dev/null +++ b/sys/dev/raidframe/rf_freelist.h @@ -0,0 +1,734 @@ +/* $OpenBSD: rf_freelist.h,v 1.1 1999/01/11 14:29:23 niklas Exp $ */ +/* $NetBSD: rf_freelist.h,v 1.1 1998/11/13 04:20:30 oster Exp $ */ +/* + * rf_freelist.h + */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. 
+ * + * Author: Jim Zelenka + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ +/* + * : + * Log: rf_freelist.h,v + * Revision 1.13 1996/06/10 12:50:57 jimz + * Add counters to freelists to track number of allocations, frees, + * grows, max size, etc. Adjust a couple sets of PRIME params based + * on the results. + * + * Revision 1.12 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). + * + * Revision 1.11 1996/06/05 18:06:02 jimz + * Major code cleanup. The Great Renaming is now done. + * Better modularity. Better typing. Fixed a bunch of + * synchronization bugs. Made a lot of global stuff + * per-desc or per-array. Removed dead code. + * + * Revision 1.10 1996/06/02 17:31:48 jimz + * Moved a lot of global stuff into array structure, where it belongs. + * Fixed up paritylogging, pss modules in this manner. Some general + * code cleanup. Removed lots of dead code, some dead files. + * + * Revision 1.9 1996/05/31 22:26:54 jimz + * fix a lot of mapping problems, memory allocation problems + * found some weird lock issues, fixed 'em + * more code cleanup + * + * Revision 1.8 1996/05/30 11:29:41 jimz + * Numerous bug fixes. Stripe lock release code disagreed with the taking code + * about when stripes should be locked (I made it consistent: no parity, no lock) + * There was a lot of extra serialization of I/Os which I've removed- a lot of + * it was to calculate values for the cache code, which is no longer with us. + * More types, function, macro cleanup. Added code to properly quiesce the array + * on shutdown. Made a lot of stuff array-specific which was (bogusly) general + * before. Fixed memory allocation, freeing bugs. 
+ * + * Revision 1.7 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.6 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.5 1996/05/20 16:16:12 jimz + * switch to rf_{mutex,cond}_{init,destroy} + * + * Revision 1.4 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.3 1996/05/16 16:04:52 jimz + * allow init func to fail for FREELIST ops + * + * Revision 1.2 1996/05/16 14:54:08 jimz + * added _INIT and _CLEAN versions of ops for objects with + * internal allocations + * + * Revision 1.1 1996/05/15 23:37:53 jimz + * Initial revision + * + */ +/* + * rf_freelist.h -- code to manage counted freelists + * + * Keep an arena of fixed-size objects. When a new object is needed, + * allocate it as necessary. When an object is freed, either put it + * in the arena, or really free it, depending on the maximum arena + * size. + */ + +#ifndef _RF__RF_FREELIST_H_ +#define _RF__RF_FREELIST_H_ + +#include "rf_types.h" +#include "rf_debugMem.h" +#include "rf_general.h" +#include "rf_threadstuff.h" + +#define RF_FREELIST_STATS 0 + +#if RF_FREELIST_STATS > 0 +typedef struct RF_FreeListStats_s { + char *file; + int line; + int allocations; + int frees; + int max_free; + int grows; + int outstanding; + int max_outstanding; +} RF_FreeListStats_t; + +#define RF_FREELIST_STAT_INIT(_fl_) { \ + bzero((char *)&((_fl_)->stats), sizeof(RF_FreeListStats_t)); \ + (_fl_)->stats.file = __FILE__; \ + (_fl_)->stats.line = __LINE__; \ +} + +#define RF_FREELIST_STAT_ALLOC(_fl_) { \ + (_fl_)->stats.allocations++; \ + (_fl_)->stats.outstanding++; \ + if ((_fl_)->stats.outstanding > (_fl_)->stats.max_outstanding) \ + (_fl_)->stats.max_outstanding = (_fl_)->stats.outstanding; \ +} + +#define RF_FREELIST_STAT_FREE_UPDATE(_fl_) { \ + if ((_fl_)->free_cnt > (_fl_)->stats.max_free) \ + (_fl_)->stats.max_free = (_fl_)->free_cnt; \ +} + +#define RF_FREELIST_STAT_FREE(_fl_) { \ + (_fl_)->stats.frees++; \ + (_fl_)->stats.outstanding--; \ + RF_FREELIST_STAT_FREE_UPDATE(_fl_); \ +} + +#define RF_FREELIST_STAT_GROW(_fl_) { \ + (_fl_)->stats.grows++; \ + RF_FREELIST_STAT_FREE_UPDATE(_fl_); \ +} + +#define RF_FREELIST_STAT_REPORT(_fl_) { \ + printf("Freelist at %s %d (%s)\n", (_fl_)->stats.file, (_fl_)->stats.line, RF_STRING(_fl_)); \ + printf(" %d allocations, %d frees\n", (_fl_)->stats.allocations, (_fl_)->stats.frees); \ + printf(" %d grows\n", (_fl_)->stats.grows); \ + printf(" %d outstanding\n", (_fl_)->stats.outstanding); \ + printf(" %d free (max)\n", (_fl_)->stats.max_free); \ + printf(" %d outstanding (max)\n", (_fl_)->stats.max_outstanding); \ +} + +#else /* RF_FREELIST_STATS > 0 */ + +#define RF_FREELIST_STAT_INIT(_fl_) +#define RF_FREELIST_STAT_ALLOC(_fl_) +#define RF_FREELIST_STAT_FREE_UPDATE(_fl_) +#define RF_FREELIST_STAT_FREE(_fl_) +#define RF_FREELIST_STAT_GROW(_fl_) +#define RF_FREELIST_STAT_REPORT(_fl_) + +#endif /* RF_FREELIST_STATS > 0 */ + +struct RF_FreeList_s { + void *objlist; /* list of free obj */ + int free_cnt; /* how many free obj */ + int max_free_cnt; /* max free arena size */ + int obj_inc; /* how many to allocate at a time */ + int obj_size; /* size of objects */ + RF_DECLARE_MUTEX(lock) +#if RF_FREELIST_STATS > 0 + RF_FreeListStats_t stats; /* statistics */ +#endif /* RF_FREELIST_STATS > 0 */ +}; + +/* + * fl = freelist + * maxcnt = max number of 
items in arena + * inc = how many to allocate at a time + * size = size of object + */ +#define RF_FREELIST_CREATE(_fl_,_maxcnt_,_inc_,_size_) { \ + int rc; \ + RF_ASSERT((_inc_) > 0); \ + RF_Malloc(_fl_, sizeof(RF_FreeList_t), (RF_FreeList_t *)); \ + (_fl_)->objlist = NULL; \ + (_fl_)->free_cnt = 0; \ + (_fl_)->max_free_cnt = _maxcnt_; \ + (_fl_)->obj_inc = _inc_; \ + (_fl_)->obj_size = _size_; \ + rc = rf_mutex_init(&(_fl_)->lock); \ + if (rc) { \ + RF_Free(_fl_, sizeof(RF_FreeList_t)); \ + _fl_ = NULL; \ + } \ + RF_FREELIST_STAT_INIT(_fl_); \ +} + +/* + * fl = freelist + * cnt = number to prime with + * nextp = name of "next" pointer in obj + * cast = object cast + */ +#define RF_FREELIST_PRIME(_fl_,_cnt_,_nextp_,_cast_) { \ + void *_p; \ + int _i; \ + RF_LOCK_MUTEX((_fl_)->lock); \ + for(_i=0;_i<(_cnt_);_i++) { \ + RF_Calloc(_p,1,(_fl_)->obj_size,(void *)); \ + if (_p) { \ + (_cast_(_p))->_nextp_ = (_fl_)->objlist; \ + (_fl_)->objlist = _p; \ + (_fl_)->free_cnt++; \ + } \ + else { \ + break; \ + } \ + } \ + RF_FREELIST_STAT_FREE_UPDATE(_fl_); \ + RF_UNLOCK_MUTEX((_fl_)->lock); \ +} + +#define RF_FREELIST_MUTEX_OF(_fl_) ((_fl_)->lock) + +#define RF_FREELIST_DO_UNLOCK(_fl_) { \ + RF_UNLOCK_MUTEX((_fl_)->lock); \ +} + +#define RF_FREELIST_DO_LOCK(_fl_) { \ + RF_LOCK_MUTEX((_fl_)->lock); \ +} + +/* + * fl = freelist + * cnt = number to prime with + * nextp = name of "next" pointer in obj + * cast = object cast + * init = func to call to init obj + */ +#define RF_FREELIST_PRIME_INIT(_fl_,_cnt_,_nextp_,_cast_,_init_) { \ + void *_p; \ + int _i; \ + RF_LOCK_MUTEX((_fl_)->lock); \ + for(_i=0;_i<(_cnt_);_i++) { \ + RF_Calloc(_p,1,(_fl_)->obj_size,(void *)); \ + if (_init_ (_cast_ _p)) { \ + RF_Free(_p,(_fl_)->obj_size); \ + _p = NULL; \ + } \ + if (_p) { \ + (_cast_(_p))->_nextp_ = (_fl_)->objlist; \ + (_fl_)->objlist = _p; \ + (_fl_)->free_cnt++; \ + } \ + else { \ + break; \ + } \ + } \ + RF_FREELIST_STAT_FREE_UPDATE(_fl_); \ + RF_UNLOCK_MUTEX((_fl_)->lock); \ +} + +/* + * fl = freelist + * cnt = number to prime with + * nextp = name of "next" pointer in obj + * cast = object cast + * init = func to call to init obj + * arg = arg to init obj func + */ +#define RF_FREELIST_PRIME_INIT_ARG(_fl_,_cnt_,_nextp_,_cast_,_init_,_arg_) { \ + void *_p; \ + int _i; \ + RF_LOCK_MUTEX((_fl_)->lock); \ + for(_i=0;_i<(_cnt_);_i++) { \ + RF_Calloc(_p,1,(_fl_)->obj_size,(void *)); \ + if (_init_ (_cast_ _p,_arg_)) { \ + RF_Free(_p,(_fl_)->obj_size); \ + _p = NULL; \ + } \ + if (_p) { \ + (_cast_(_p))->_nextp_ = (_fl_)->objlist; \ + (_fl_)->objlist = _p; \ + (_fl_)->free_cnt++; \ + } \ + else { \ + break; \ + } \ + } \ + RF_FREELIST_STAT_FREE_UPDATE(_fl_); \ + RF_UNLOCK_MUTEX((_fl_)->lock); \ +} + +/* + * fl = freelist + * obj = object to allocate + * nextp = name of "next" pointer in obj + * cast = cast of obj assignment + * init = init obj func + */ +#define RF_FREELIST_GET_INIT(_fl_,_obj_,_nextp_,_cast_,_init_) { \ + void *_p; \ + int _i; \ + RF_LOCK_MUTEX((_fl_)->lock); \ + RF_ASSERT(sizeof(*(_obj_))==((_fl_)->obj_size)); \ + if (_fl_->objlist) { \ + _obj_ = _cast_((_fl_)->objlist); \ + (_fl_)->objlist = (void *)((_obj_)->_nextp_); \ + (_fl_)->free_cnt--; \ + } \ + else { \ + /* \ + * Allocate one at a time so we can free \ + * one at a time without cleverness when arena \ + * is full. 
\ + */ \ + RF_Calloc(_obj_,1,(_fl_)->obj_size,_cast_); \ + if (_obj_) { \ + if (_init_ (_obj_)) { \ + RF_Free(_obj_,(_fl_)->obj_size); \ + _obj_ = NULL; \ + } \ + else { \ + for(_i=1;_i<(_fl_)->obj_inc;_i++) { \ + RF_Calloc(_p,1,(_fl_)->obj_size,(void *)); \ + if (_p) { \ + if (_init_ (_p)) { \ + RF_Free(_p,(_fl_)->obj_size); \ + _p = NULL; \ + break; \ + } \ + (_cast_(_p))->_nextp_ = (_fl_)->objlist; \ + (_fl_)->objlist = _p; \ + } \ + else { \ + break; \ + } \ + } \ + } \ + } \ + RF_FREELIST_STAT_GROW(_fl_); \ + } \ + RF_FREELIST_STAT_ALLOC(_fl_); \ + RF_UNLOCK_MUTEX((_fl_)->lock); \ +} + +/* + * fl = freelist + * obj = object to allocate + * nextp = name of "next" pointer in obj + * cast = cast of obj assignment + * init = init obj func + * arg = arg to init obj func + */ +#define RF_FREELIST_GET_INIT_ARG(_fl_,_obj_,_nextp_,_cast_,_init_,_arg_) { \ + void *_p; \ + int _i; \ + RF_LOCK_MUTEX((_fl_)->lock); \ + RF_ASSERT(sizeof(*(_obj_))==((_fl_)->obj_size)); \ + if (_fl_->objlist) { \ + _obj_ = _cast_((_fl_)->objlist); \ + (_fl_)->objlist = (void *)((_obj_)->_nextp_); \ + (_fl_)->free_cnt--; \ + } \ + else { \ + /* \ + * Allocate one at a time so we can free \ + * one at a time without cleverness when arena \ + * is full. \ + */ \ + RF_Calloc(_obj_,1,(_fl_)->obj_size,_cast_); \ + if (_obj_) { \ + if (_init_ (_obj_,_arg_)) { \ + RF_Free(_obj_,(_fl_)->obj_size); \ + _obj_ = NULL; \ + } \ + else { \ + for(_i=1;_i<(_fl_)->obj_inc;_i++) { \ + RF_Calloc(_p,1,(_fl_)->obj_size,(void *)); \ + if (_p) { \ + if (_init_ (_p,_arg_)) { \ + RF_Free(_p,(_fl_)->obj_size); \ + _p = NULL; \ + break; \ + } \ + (_cast_(_p))->_nextp_ = (_fl_)->objlist; \ + (_fl_)->objlist = _p; \ + } \ + else { \ + break; \ + } \ + } \ + } \ + } \ + RF_FREELIST_STAT_GROW(_fl_); \ + } \ + RF_FREELIST_STAT_ALLOC(_fl_); \ + RF_UNLOCK_MUTEX((_fl_)->lock); \ +} + +/* + * fl = freelist + * obj = object to allocate + * nextp = name of "next" pointer in obj + * cast = cast of obj assignment + * init = init obj func + */ +#define RF_FREELIST_GET_INIT_NOUNLOCK(_fl_,_obj_,_nextp_,_cast_,_init_) { \ + void *_p; \ + int _i; \ + RF_LOCK_MUTEX((_fl_)->lock); \ + RF_ASSERT(sizeof(*(_obj_))==((_fl_)->obj_size)); \ + if (_fl_->objlist) { \ + _obj_ = _cast_((_fl_)->objlist); \ + (_fl_)->objlist = (void *)((_obj_)->_nextp_); \ + (_fl_)->free_cnt--; \ + } \ + else { \ + /* \ + * Allocate one at a time so we can free \ + * one at a time without cleverness when arena \ + * is full. 
\ + */ \ + RF_Calloc(_obj_,1,(_fl_)->obj_size,_cast_); \ + if (_obj_) { \ + if (_init_ (_obj_)) { \ + RF_Free(_obj_,(_fl_)->obj_size); \ + _obj_ = NULL; \ + } \ + else { \ + for(_i=1;_i<(_fl_)->obj_inc;_i++) { \ + RF_Calloc(_p,1,(_fl_)->obj_size,(void *)); \ + if (_p) { \ + if (_init_ (_p)) { \ + RF_Free(_p,(_fl_)->obj_size); \ + _p = NULL; \ + break; \ + } \ + (_cast_(_p))->_nextp_ = (_fl_)->objlist; \ + (_fl_)->objlist = _p; \ + } \ + else { \ + break; \ + } \ + } \ + } \ + } \ + RF_FREELIST_STAT_GROW(_fl_); \ + } \ + RF_FREELIST_STAT_ALLOC(_fl_); \ +} + +/* + * fl = freelist + * obj = object to allocate + * nextp = name of "next" pointer in obj + * cast = cast of obj assignment + */ +#define RF_FREELIST_GET(_fl_,_obj_,_nextp_,_cast_) { \ + void *_p; \ + int _i; \ + RF_LOCK_MUTEX((_fl_)->lock); \ + RF_ASSERT(sizeof(*(_obj_))==((_fl_)->obj_size)); \ + if (_fl_->objlist) { \ + _obj_ = _cast_((_fl_)->objlist); \ + (_fl_)->objlist = (void *)((_obj_)->_nextp_); \ + (_fl_)->free_cnt--; \ + } \ + else { \ + /* \ + * Allocate one at a time so we can free \ + * one at a time without cleverness when arena \ + * is full. \ + */ \ + RF_Calloc(_obj_,1,(_fl_)->obj_size,_cast_); \ + if (_obj_) { \ + for(_i=1;_i<(_fl_)->obj_inc;_i++) { \ + RF_Calloc(_p,1,(_fl_)->obj_size,(void *)); \ + if (_p) { \ + (_cast_(_p))->_nextp_ = (_fl_)->objlist; \ + (_fl_)->objlist = _p; \ + } \ + else { \ + break; \ + } \ + } \ + } \ + RF_FREELIST_STAT_GROW(_fl_); \ + } \ + RF_FREELIST_STAT_ALLOC(_fl_); \ + RF_UNLOCK_MUTEX((_fl_)->lock); \ +} + +/* + * fl = freelist + * obj = object to allocate + * nextp = name of "next" pointer in obj + * cast = cast of obj assignment + * num = num objs to return + */ +#define RF_FREELIST_GET_N(_fl_,_obj_,_nextp_,_cast_,_num_) { \ + void *_p, *_l, *_f; \ + int _i, _n; \ + _l = _f = NULL; \ + _n = 0; \ + RF_LOCK_MUTEX((_fl_)->lock); \ + RF_ASSERT(sizeof(*(_obj_))==((_fl_)->obj_size)); \ + for(_n=0;_n<_num_;_n++) { \ + if (_fl_->objlist) { \ + _obj_ = _cast_((_fl_)->objlist); \ + (_fl_)->objlist = (void *)((_obj_)->_nextp_); \ + (_fl_)->free_cnt--; \ + } \ + else { \ + /* \ + * Allocate one at a time so we can free \ + * one at a time without cleverness when arena \ + * is full. 
\ + */ \ + RF_Calloc(_obj_,1,(_fl_)->obj_size,_cast_); \ + if (_obj_) { \ + for(_i=1;_i<(_fl_)->obj_inc;_i++) { \ + RF_Calloc(_p,1,(_fl_)->obj_size,(void *)); \ + if (_p) { \ + (_cast_(_p))->_nextp_ = (_fl_)->objlist; \ + (_fl_)->objlist = _p; \ + } \ + else { \ + break; \ + } \ + } \ + } \ + RF_FREELIST_STAT_GROW(_fl_); \ + } \ + if (_f == NULL) \ + _f = _obj_; \ + if (_obj_) { \ + (_cast_(_obj_))->_nextp_ = _l; \ + _l = _obj_; \ + RF_FREELIST_STAT_ALLOC(_fl_); \ + } \ + else { \ + (_cast_(_f))->_nextp_ = (_fl_)->objlist; \ + (_fl_)->objlist = _l; \ + _n = _num_; \ + } \ + } \ + RF_UNLOCK_MUTEX((_fl_)->lock); \ +} + +/* + * fl = freelist + * obj = object to free + * nextp = name of "next" pointer in obj + */ +#define RF_FREELIST_FREE(_fl_,_obj_,_nextp_) { \ + RF_LOCK_MUTEX((_fl_)->lock); \ + if ((_fl_)->free_cnt == (_fl_)->max_free_cnt) { \ + RF_Free(_obj_,(_fl_)->obj_size); \ + } \ + else { \ + RF_ASSERT((_fl_)->free_cnt < (_fl_)->max_free_cnt); \ + (_obj_)->_nextp_ = (_fl_)->objlist; \ + (_fl_)->objlist = (void *)(_obj_); \ + (_fl_)->free_cnt++; \ + } \ + RF_FREELIST_STAT_FREE(_fl_); \ + RF_UNLOCK_MUTEX((_fl_)->lock); \ +} + +/* + * fl = freelist + * obj = object to free + * nextp = name of "next" pointer in obj + * num = num to free (debugging) + */ +#define RF_FREELIST_FREE_N(_fl_,_obj_,_nextp_,_cast_,_num_) { \ + void *_no; \ + int _n; \ + _n = 0; \ + RF_LOCK_MUTEX((_fl_)->lock); \ + while(_obj_) { \ + _no = (_cast_(_obj_))->_nextp_; \ + if ((_fl_)->free_cnt == (_fl_)->max_free_cnt) { \ + RF_Free(_obj_,(_fl_)->obj_size); \ + } \ + else { \ + RF_ASSERT((_fl_)->free_cnt < (_fl_)->max_free_cnt); \ + (_obj_)->_nextp_ = (_fl_)->objlist; \ + (_fl_)->objlist = (void *)(_obj_); \ + (_fl_)->free_cnt++; \ + } \ + _n++; \ + _obj_ = _no; \ + RF_FREELIST_STAT_FREE(_fl_); \ + } \ + RF_ASSERT(_n==(_num_)); \ + RF_UNLOCK_MUTEX((_fl_)->lock); \ +} + +/* + * fl = freelist + * obj = object to free + * nextp = name of "next" pointer in obj + * clean = undo for init + */ +#define RF_FREELIST_FREE_CLEAN(_fl_,_obj_,_nextp_,_clean_) { \ + RF_LOCK_MUTEX((_fl_)->lock); \ + if ((_fl_)->free_cnt == (_fl_)->max_free_cnt) { \ + _clean_ (_obj_); \ + RF_Free(_obj_,(_fl_)->obj_size); \ + } \ + else { \ + RF_ASSERT((_fl_)->free_cnt < (_fl_)->max_free_cnt); \ + (_obj_)->_nextp_ = (_fl_)->objlist; \ + (_fl_)->objlist = (void *)(_obj_); \ + (_fl_)->free_cnt++; \ + } \ + RF_FREELIST_STAT_FREE(_fl_); \ + RF_UNLOCK_MUTEX((_fl_)->lock); \ +} + +/* + * fl = freelist + * obj = object to free + * nextp = name of "next" pointer in obj + * clean = undo for init + * arg = arg for undo func + */ +#define RF_FREELIST_FREE_CLEAN_ARG(_fl_,_obj_,_nextp_,_clean_,_arg_) { \ + RF_LOCK_MUTEX((_fl_)->lock); \ + if ((_fl_)->free_cnt == (_fl_)->max_free_cnt) { \ + _clean_ (_obj_,_arg_); \ + RF_Free(_obj_,(_fl_)->obj_size); \ + } \ + else { \ + RF_ASSERT((_fl_)->free_cnt < (_fl_)->max_free_cnt); \ + (_obj_)->_nextp_ = (_fl_)->objlist; \ + (_fl_)->objlist = (void *)(_obj_); \ + (_fl_)->free_cnt++; \ + } \ + RF_FREELIST_STAT_FREE(_fl_); \ + RF_UNLOCK_MUTEX((_fl_)->lock); \ +} + +/* + * fl = freelist + * obj = object to free + * nextp = name of "next" pointer in obj + * clean = undo for init + */ +#define RF_FREELIST_FREE_CLEAN_NOUNLOCK(_fl_,_obj_,_nextp_,_clean_) { \ + RF_LOCK_MUTEX((_fl_)->lock); \ + if ((_fl_)->free_cnt == (_fl_)->max_free_cnt) { \ + _clean_ (_obj_); \ + RF_Free(_obj_,(_fl_)->obj_size); \ + } \ + else { \ + RF_ASSERT((_fl_)->free_cnt < (_fl_)->max_free_cnt); \ + (_obj_)->_nextp_ = (_fl_)->objlist; \ + (_fl_)->objlist = (void 
*)(_obj_); \ + (_fl_)->free_cnt++; \ + } \ + RF_FREELIST_STAT_FREE(_fl_); \ +} + +/* + * fl = freelist + * nextp = name of "next" pointer in obj + * cast = cast to object type + */ +#define RF_FREELIST_DESTROY(_fl_,_nextp_,_cast_) { \ + void *_cur, *_next; \ + RF_FREELIST_STAT_REPORT(_fl_); \ + rf_mutex_destroy(&((_fl_)->lock)); \ + for(_cur=(_fl_)->objlist;_cur;_cur=_next) { \ + _next = (_cast_ _cur)->_nextp_; \ + RF_Free(_cur,(_fl_)->obj_size); \ + } \ + RF_Free(_fl_,sizeof(RF_FreeList_t)); \ +} + +/* + * fl = freelist + * nextp = name of "next" pointer in obj + * cast = cast to object type + * clean = func to undo obj init + */ +#define RF_FREELIST_DESTROY_CLEAN(_fl_,_nextp_,_cast_,_clean_) { \ + void *_cur, *_next; \ + RF_FREELIST_STAT_REPORT(_fl_); \ + rf_mutex_destroy(&((_fl_)->lock)); \ + for(_cur=(_fl_)->objlist;_cur;_cur=_next) { \ + _next = (_cast_ _cur)->_nextp_; \ + _clean_ (_cur); \ + RF_Free(_cur,(_fl_)->obj_size); \ + } \ + RF_Free(_fl_,sizeof(RF_FreeList_t)); \ +} + +/* + * fl = freelist + * nextp = name of "next" pointer in obj + * cast = cast to object type + * clean = func to undo obj init + * arg = arg for undo func + */ +#define RF_FREELIST_DESTROY_CLEAN_ARG(_fl_,_nextp_,_cast_,_clean_,_arg_) { \ + void *_cur, *_next; \ + RF_FREELIST_STAT_REPORT(_fl_); \ + rf_mutex_destroy(&((_fl_)->lock)); \ + for(_cur=(_fl_)->objlist;_cur;_cur=_next) { \ + _next = (_cast_ _cur)->_nextp_; \ + _clean_ (_cur,_arg_); \ + RF_Free(_cur,(_fl_)->obj_size); \ + } \ + RF_Free(_fl_,sizeof(RF_FreeList_t)); \ +} + +#endif /* !_RF__RF_FREELIST_H_ */ diff --git a/sys/dev/raidframe/rf_general.h b/sys/dev/raidframe/rf_general.h new file mode 100644 index 00000000000..3879520133f --- /dev/null +++ b/sys/dev/raidframe/rf_general.h @@ -0,0 +1,269 @@ +/* $OpenBSD: rf_general.h,v 1.1 1999/01/11 14:29:23 niklas Exp $ */ +/* $NetBSD: rf_general.h,v 1.1 1998/11/13 04:20:30 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. 
+ */ + +/* + * rf_general.h -- some general-use definitions + */ + +/* + * : + * Log: rf_general.h,v + * Revision 1.26 1996/08/09 16:44:57 jimz + * sunos port + * + * Revision 1.25 1996/08/07 21:08:57 jimz + * get NBPG defined for IRIX + * + * Revision 1.24 1996/08/06 22:02:06 jimz + * include linux/user.h for linux to get NBPG + * + * Revision 1.23 1996/07/27 23:36:08 jimz + * Solaris port of simulator + * + * Revision 1.22 1996/06/09 02:36:46 jimz + * lots of little crufty cleanup- fixup whitespace + * issues, comment #ifdefs, improve typing in some + * places (esp size-related) + * + * Revision 1.21 1996/05/30 23:22:16 jimz + * bugfixes of serialization, timing problems + * more cleanup + * + * Revision 1.20 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.19 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.18 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.17 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.16 1996/05/21 18:53:13 jimz + * be sure that noop macros don't confuse conditionals and loops + * + * Revision 1.15 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.14 1996/05/08 21:01:24 jimz + * fixed up enum type names that were conflicting with other + * enums and function names (ie, "panic") + * future naming trends will be towards RF_ and rf_ for + * everything raidframe-related + * + * Revision 1.13 1995/12/12 18:10:06 jimz + * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT + * fix 80-column brain damage in comments + * + * Revision 1.12 1995/12/01 18:29:08 root + * added copyright info + * + * Revision 1.11 1995/09/19 22:59:52 jimz + * Add kernel macro RF_DKU_END_IO(). When DKUSAGE is not defined, + * this is a no-op. When it is defined, it calls dku_end_io() + * correctly given a raidframe unit number and a buf pointer. + * + * Revision 1.10 1995/07/03 18:13:56 holland + * changed kernel defn of GETTIME + * + * Revision 1.9 1995/07/02 15:07:42 holland + * bug fixes related to getting distributed sparing numbers + * + * Revision 1.8 1995/06/12 15:54:40 rachad + * Added garbege collection for log structured storage + * + * Revision 1.7 1995/06/03 19:18:16 holland + * changes related to kernelization: access traces + * changes related to distributed sparing: some bug fixes + * + * Revision 1.6 1995/05/01 13:28:00 holland + * parity range locks, locking disk requests, recon+parityscan in kernel, etc. + * + * Revision 1.5 1995/04/06 14:47:56 rachad + * merge completed + * + * Revision 1.4 1995/03/15 20:45:23 holland + * distr sparing changes. + * + * Revision 1.3 1995/02/03 22:31:36 holland + * many changes related to kernelization + * + * Revision 1.2 1994/11/29 21:37:10 danner + * Added divide by zero check. 
+ * + */ + +/*#define NOASSERT*/ + +#ifndef _RF__RF_GENERAL_H_ +#define _RF__RF_GENERAL_H_ + +#ifdef _KERNEL +#define KERNEL +#endif + +#if !defined(KERNEL) && !defined(NOASSERT) +#include <assert.h> +#endif /* !KERNEL && !NOASSERT */ + +/* error reporting and handling */ + +#ifndef KERNEL + +#define RF_ERRORMSG(s) fprintf(stderr,(s)) +#define RF_ERRORMSG1(s,a) fprintf(stderr,(s),(a)) +#define RF_ERRORMSG2(s,a,b) fprintf(stderr,(s),(a),(b)) +#define RF_ERRORMSG3(s,a,b,c) fprintf(stderr,(s),(a),(b),(c)) +#define RF_ERRORMSG4(s,a,b,c,d) fprintf(stderr,(s),(a),(b),(c),(d)) +#define RF_ERRORMSG5(s,a,b,c,d,e) fprintf(stderr,(s),(a),(b),(c),(d),(e)) +#ifndef NOASSERT +#define RF_ASSERT(x) {assert(x);} +#else /* !NOASSERT */ +#define RF_ASSERT(x) {/*noop*/} +#endif /* !NOASSERT */ +#define RF_PANIC() {printf("YIKES! Something terrible happened at line %d of file %s. Use a debugger.\n",__LINE__,__FILE__); abort();} + +#else /* !KERNEL */ +#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL) +#include<sys/systm.h> /* printf, sprintf, and friends */ +#endif +#define RF_ERRORMSG(s) printf((s)) +#define RF_ERRORMSG1(s,a) printf((s),(a)) +#define RF_ERRORMSG2(s,a,b) printf((s),(a),(b)) +#define RF_ERRORMSG3(s,a,b,c) printf((s),(a),(b),(c)) +#define RF_ERRORMSG4(s,a,b,c,d) printf((s),(a),(b),(c),(d)) +#define RF_ERRORMSG5(s,a,b,c,d,e) printf((s),(a),(b),(c),(d),(e)) +#define perror(x) +extern char rf_panicbuf[]; +#define RF_PANIC() {sprintf(rf_panicbuf,"raidframe error at line %d file %s",__LINE__,__FILE__); panic(rf_panicbuf);} + +#ifdef RF_ASSERT +#undef RF_ASSERT +#endif /* RF_ASSERT */ +#ifndef NOASSERT +#define RF_ASSERT(_x_) { \ + if (!(_x_)) { \ + sprintf(rf_panicbuf, \ + "raidframe error at line %d file %s (failed asserting %s)\n", \ + __LINE__, __FILE__, #_x_); \ + panic(rf_panicbuf); \ + } \ +} +#else /* !NOASSERT */ +#define RF_ASSERT(x) {/*noop*/} +#endif /* !NOASSERT */ + +#endif /* !KERNEL */ + +/* random stuff */ +#define RF_MAX(a,b) (((a) > (b)) ? (a) : (b)) +#define RF_MIN(a,b) (((a) < (b)) ? (a) : (b)) + +/* divide-by-zero check */ +#define RF_DB0_CHECK(a,b) ( ((b)==0) ? 0 : (a)/(b) ) + +/* get time of day */ +#ifdef KERNEL +#if !defined(__NetBSD__) && !defined(__OpenBSD__) +extern struct timeval time; +#endif /* !__NetBSD__ && !__OpenBSD__ */ +#define RF_GETTIME(_t) microtime(&(_t)) +#else /* KERNEL */ +#define RF_GETTIME(_t) gettimeofday(&(_t), NULL); +#endif /* KERNEL */ + +/* + * zero memory- not all bzero calls go through here, only + * those which in the kernel may have a user address + */ +#ifdef KERNEL +#if !defined(__NetBSD__) && !defined(__OpenBSD__) +#define RF_BZERO(_bp,_b,_l) if (IS_SYS_VA(_b)) bzero(_b,_l); else rf_BzeroWithRemap(_bp,_b,_l) +#else + +#define RF_BZERO(_bp,_b,_l) bzero(_b,_l) /* XXX This is likely incorrect. 
GO*/ +#endif /* __NetBSD__ || __OpenBSD__ */ +#else /* KERNEL */ +#define RF_BZERO(_bp,_b,_l) bzero(_b,_l) +#endif /* KERNEL */ + +#ifdef sun +#include <sys/param.h> +#ifndef NBPG +#define NBPG PAGESIZE +#endif /* !NBPG */ +#endif /* sun */ + +#ifdef IRIX +#include <sys/tfp.h> +#define NBPG _PAGESZ +#endif /* IRIX */ + +#ifdef LINUX +#include <linux/user.h> +#endif /* LINUX */ + +#define RF_UL(x) ((unsigned long) (x)) +#define RF_PGMASK RF_UL(NBPG-1) +#define RF_BLIP(x) (NBPG - (RF_UL(x) & RF_PGMASK)) /* bytes left in page */ +#define RF_PAGE_ALIGNED(x) ((RF_UL(x) & RF_PGMASK) == 0) + +#ifdef KERNEL +#if !defined(__NetBSD__) && !defined(__OpenBSD__) +#include <dkusage.h> +#endif +#if DKUSAGE > 0 +#define RF_DKU_END_IO(_unit_,_bp_) { \ + int s = splbio(); \ + dku_end_io(DKU_RAIDFRAME_BUS, _unit_, 0, \ + (((_bp_)->b_flags&(B_READ|B_WRITE) == B_READ) ? \ + CAM_DIR_IN : CAM_DIR_OUT), \ + (_bp_)->b_bcount); \ + splx(s); \ +} +#else /* DKUSAGE > 0 */ +#define RF_DKU_END_IO(unit) { /* noop */ } +#endif /* DKUSAGE > 0 */ +#endif /* KERNEL */ + +#ifdef __STDC__ +#define RF_STRING(_str_) #_str_ +#else /* __STDC__ */ +#define RF_STRING(_str_) "_str_" +#endif /* __STDC__ */ + +#endif /* !_RF__RF_GENERAL_H_ */ diff --git a/sys/dev/raidframe/rf_geniq.c b/sys/dev/raidframe/rf_geniq.c new file mode 100644 index 00000000000..bfe55cb87d2 --- /dev/null +++ b/sys/dev/raidframe/rf_geniq.c @@ -0,0 +1,199 @@ +/* $NetBSD: rf_geniq.c,v 1.1 1998/11/13 04:20:30 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Daniel Stodolsky + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. 
+ */ + +/* rf_geniq.c + * code which implements Reed-Solomon encoding for RAID level 6 + */ + +/* : + * Log: rf_geniq.c,v + * Revision 1.12 1996/07/29 16:37:00 jimz + * remove archs.h include to avoid VPATH problems in kernel + * rf_invertq.c now must include archs.h before invertq.h + * + * Revision 1.11 1996/07/29 15:04:16 jimz + * correct rf_archs.h path for kernel + * + * Revision 1.10 1996/07/27 23:36:08 jimz + * Solaris port of simulator + * + * Revision 1.9 1996/07/18 22:57:14 jimz + * port simulator to AIX + * + * Revision 1.8 1996/07/15 17:22:18 jimz + * nit-pick code cleanup + * resolve stdlib problems on DEC OSF + * + * Revision 1.7 1996/06/09 02:36:46 jimz + * lots of little crufty cleanup- fixup whitespace + * issues, comment #ifdefs, improve typing in some + * places (esp size-related) + * + * Revision 1.6 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.5 1995/12/01 18:29:18 root + * added copyright info + * + */ + +#define RF_UTILITY 1 +#include "rf_pqdeg.h" + +/* + five bit lfsr + poly - feedback connections + + val = value; +*/ +int lsfr_shift(val,poly) +unsigned val, poly; +{ + unsigned new; + unsigned int i; + unsigned high = (val >> 4) & 1; + unsigned bit; + + new = (poly & 1) ? high : 0; + + for (i=1; i <=4; i++) + { + bit = (val >> (i-1)) & 1; + if (poly & (1<<i)) /* there is a feedback connection */ + new = new | ((bit ^ high)<<i); + else + new = new | (bit << i); + } + return new; +} + +/* generate Q matricies for the data */ + +RF_ua32_t rf_qfor[32]; + +void main() +{ + unsigned int i,j,l,a,b; + unsigned int val; + unsigned int r; + unsigned int m,p,q; + + RF_ua32_t k; + + printf("/*\n"); + printf(" * rf_invertq.h\n"); + printf(" */\n"); + printf("/*\n"); + printf(" * GENERATED FILE -- DO NOT EDIT\n"); + printf(" */\n"); + printf("\n"); + printf("#ifndef _RF__RF_INVERTQ_H_\n"); + printf("#define _RF__RF_INVERTQ_H_\n"); + printf("\n"); + printf("/*\n"); + printf(" * rf_geniq.c must include rf_archs.h before including\n"); + printf(" * this file (to get VPATH magic right with the way we\n"); + printf(" * generate this file in kernel trees)\n"); + printf(" */\n"); + printf("/* #include \"rf_archs.h\" */\n"); + printf("\n"); + printf("#if (RF_INCLUDE_PQ > 0) || (RF_INCLUDE_RAID6 > 0)\n"); + printf("\n"); + printf("#define RF_Q_COLS 32\n"); + printf("RF_ua32_t rf_rn = {\n"); + k[0] = 1; + for (j=0 ; j < 31; j++) + k[j+1] = lsfr_shift(k[j],5); + for (j=0; j < 32; j++) + printf("%d, ",k[j]); + printf("};\n"); + + printf("RF_ua32_t rf_qfor[32] = {\n"); + for (i=0; i < 32; i++) + { + printf("/* i = %d */ { 0, ",i); + rf_qfor[i][0] = 0; + for (j=1; j < 32; j++) + { + val = j; + for (l=0; l < i; l++) + val = lsfr_shift(val,5); + rf_qfor[i][j] = val; + printf("%d, ",val); + } + printf("},\n"); + } + printf("};\n"); + printf("#define RF_Q_DATA_COL(col_num) rf_rn[col_num],rf_qfor[28-(col_num)]\n"); + + /* generate the inverse tables. (i,j,p,q) */ + /* The table just stores a. 
Get b back from + the parity */ + printf("#ifdef KERNEL\n"); + printf("RF_ua1024_t rf_qinv[1]; /* don't compile monster table into kernel */\n"); + printf("#elif defined(NO_PQ)\n"); + printf("RF_ua1024_t rf_qinv[29*29];\n"); + printf("#else /* !KERNEL && NO_PQ */\n"); + printf("RF_ua1024_t rf_qinv[29*29] = {\n"); + for (i=0; i < 29; i++) + { + for (j =0; j < 29; j++) + { + printf("/* i %d, j %d */{ ",i,j); + if (i==j) + for (l=0; l < 1023; l++) printf("0, "); + else + { + for (p=0; p < 32; p++) + for (q=0; q < 32; q++) + { + /* What are a, b such that + a ^ b = p; and + qfor[(28-i)][a ^ rf_rn[i+1]] ^ qfor[(28-j)][b ^ rf_rn[j+1]] = q. + Solve by guessing a. Then testing. + */ + for ( a =0 ; a < 32; a++ ) + { + b = a ^ p; + if ( (rf_qfor[28-i][a^ k[i+1]] ^ rf_qfor[28-j][b ^ k[j+1]]) == q ) + break; + } + if (a == 32) printf("unable to solve %d %d %d %d\n",i,j,p,q); + printf("%d,",a); + } + } + printf("},\n"); + } + } + printf("};\n"); + printf("\n#endif /* (RF_INCLUDE_PQ > 0) || (RF_INCLUDE_RAID6 > 0) */\n\n"); + printf("#endif /* !KERNEL && NO_PQ */\n"); + printf("#endif /* !_RF__RF_INVERTQ_H_ */\n"); + exit(0); +} diff --git a/sys/dev/raidframe/rf_geometry.c b/sys/dev/raidframe/rf_geometry.c new file mode 100644 index 00000000000..15da7cdda8e --- /dev/null +++ b/sys/dev/raidframe/rf_geometry.c @@ -0,0 +1,891 @@ +/* $OpenBSD: rf_geometry.c,v 1.1 1999/01/11 14:29:24 niklas Exp $ */ +/* $NetBSD: rf_geometry.c,v 1.1 1998/11/13 04:20:30 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* + * Changes: + * 10/24/91 Changes to support disk bus contention model + * (MCH) 1. Added media_done_time param to Access_time() + * + * 08/18/92 Geometry routines have been modified to support zone-bit + * recording. + * (AS) 1. Each routine which originally referenced the variable + * 'disk->geom->sectors_per_track' has been modified, + * since the number of sectors per track varies on disks + * with zone-bit recording. + */ + +/* : + * Log: rf_geometry.c,v + * Revision 1.18 1996/08/11 00:40:57 jimz + * fix up broken comment + * + * Revision 1.17 1996/07/28 20:31:39 jimz + * i386netbsd port + * true/false fixup + * + * Revision 1.16 1996/07/18 22:57:14 jimz + * port simulator to AIX + * + * Revision 1.15 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.14 1996/06/05 18:06:02 jimz + * Major code cleanup. The Great Renaming is now done. + * Better modularity. Better typing. Fixed a bunch of + * synchronization bugs. 
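The Q-matrix generator above is driven by lsfr_shift(), a five-bit linear feedback shift register: the register shifts left one position per call, and poly selects which bit positions get the old high bit XORed in. With poly = 5, the tap pattern main() in rf_geniq.c uses to fill the rf_rn table, the register steps through every non-zero 5-bit value before repeating, beginning 1, 2, 4, 8, 16, 5, 10, ... The stand-alone program below reproduces the same recurrence and prints that cycle; it is an illustration only, not part of the generator.

#include <stdio.h>

/* Same recurrence as lsfr_shift() in rf_geniq.c above. */
static unsigned lfsr5(unsigned val, unsigned poly)
{
    unsigned high = (val >> 4) & 1;
    unsigned out = (poly & 1) ? high : 0;
    unsigned i, bit;

    for (i = 1; i <= 4; i++) {
        bit = (val >> (i - 1)) & 1;
        out |= ((poly & (1u << i)) ? (bit ^ high) : bit) << i;
    }
    return out;
}

int main(void)
{
    unsigned v = 1, i;

    /* poly = 5 visits all 31 non-zero 5-bit states before repeating. */
    for (i = 0; i < 31; i++) {
        printf("%u ", v);        /* 1 2 4 8 16 5 10 20 13 26 ... */
        v = lfsr5(v, 5);
    }
    printf("\n");
    return 0;
}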
Made a lot of global stuff + * per-desc or per-array. Removed dead code. + * + * Revision 1.13 1996/05/30 23:22:16 jimz + * bugfixes of serialization, timing problems + * more cleanup + * + * Revision 1.12 1996/05/30 11:29:41 jimz + * Numerous bug fixes. Stripe lock release code disagreed with the taking code + * about when stripes should be locked (I made it consistent: no parity, no lock) + * There was a lot of extra serialization of I/Os which I've removed- a lot of + * it was to calculate values for the cache code, which is no longer with us. + * More types, function, macro cleanup. Added code to properly quiesce the array + * on shutdown. Made a lot of stuff array-specific which was (bogusly) general + * before. Fixed memory allocation, freeing bugs. + * + * Revision 1.11 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.10 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.9 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.8 1995/12/12 18:10:06 jimz + * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT + * fix 80-column brain damage in comments + * + * Revision 1.7 1995/12/01 18:29:34 root + * added copyright info + * + */ + +#include "rf_types.h" +#include "rf_geometry.h" +#include "rf_raid.h" +#include "rf_general.h" +#include "rf_debugMem.h" + +#define DISK_DB "disk_db" +#define DISK_NAME "HP2247" + +#define ABS_DIFF(a,b) ( ((a)>(b)) ? ((a)-(b)) : ((b)-(a)) ) + +static RF_GeometryList_t *geom_list = (RF_GeometryList_t *) NULL; + +RF_TICS_t rf_globalSpinup = 1.5; + +#define NM_LGTH 80 +#define NM_PATN " %80s" + +static RF_GeometryList_t *Fetch_geometry_db(FILE *fd); +static void Format_disk(RF_DiskState_t *disk, long sectors_per_block); +static long Find_cyl(RF_SectorNum_t block, RF_DiskState_t *disk); +static long Find_track(RF_SectorNum_t block, RF_DiskState_t *disk); +static long Find_phys_sector(RF_SectorNum_t block, RF_DiskState_t *disk); +static RF_TICS_t Delay_to(RF_TICS_t cur_time, RF_SectorNum_t block, + RF_DiskState_t *disk); +static RF_TICS_t Seek_time(long to_cyl, long to_track, long from_cyl, + long from_track, RF_DiskState_t *disk); +static RF_TICS_t Seek(RF_TICS_t cur_time, RF_SectorNum_t block, + RF_DiskState_t *disk, long update); +static RF_TICS_t Rotate(RF_TICS_t cur_time, RF_SectorNum_t block, + RF_DiskState_t *disk, long update); +static RF_TICS_t Seek_Rotate(RF_TICS_t cur_time, RF_SectorNum_t block, + RF_DiskState_t *disk, long update); +static RF_TICS_t GAP(long sec_per_track, RF_DiskState_t *disk); +static RF_TICS_t Block_access_time(RF_TICS_t cur_time, RF_SectorNum_t block, + RF_SectorCount_t numblocks, RF_DiskState_t *disk, long update); +static void Zero_stats(RF_DiskState_t *disk); +static RF_TICS_t Update_stats(RF_TICS_t cur_time, RF_TICS_t seek, RF_TICS_t rotate, + RF_TICS_t transfer, RF_DiskState_t *disk); +static void rf_DiskParam(long numCyls, RF_TICS_t minSeek, RF_TICS_t avgSeek, RF_TICS_t maxSeek, + RF_TICS_t *a, RF_TICS_t *b, RF_TICS_t *c); + +static RF_GeometryList_t *Fetch_geometry_db(fd) + FILE *fd; +{ + long ret, lineno; + char name[NM_LGTH], title[20]; + RF_GeometryList_t * list = (RF_GeometryList_t *) NULL, + ** next_ptr = & list; + + if( RF_MAX_DISKNAME_LEN<NM_LGTH ) RF_PANIC(); + lineno = 0; + while( (ret = fscanf( fd, " %20s", title )) != EOF ) { + 
float tmp_f1, tmp_f2, tmp_f3, tmp_f4; + float tmp_f5=0.0; + float tmp_f6=0.0; + RF_Geometry_t *g; + long i, x, y, z, num_cylinders; + RF_ZoneList_t ** znext_ptr; + + if( ret == 1 && strncmp( "enddisk", title, 8 ) == 0 ) break; + + RF_Calloc(*next_ptr, 1, sizeof(RF_GeometryList_t), (RF_GeometryList_t *)); + (*next_ptr)->next = (RF_GeometryList_t *) NULL; + RF_Calloc(g, 1, sizeof(RF_Geometry_t), (RF_Geometry_t *)); + (*next_ptr)->disk = g; + next_ptr = &( (*next_ptr)->next ); /*prep for next iteration */ + lineno++; + if (fscanf( fd, NM_PATN, name ) != 1) { + fprintf(stderr,"Disk DB Error: Can't get disk name from disk db\n"); + fprintf(stderr,"lineno=%d\n", lineno); + fprintf(stderr,"name=\"%s\"\n", name); + exit(1); + } + lineno++; + if ( (fscanf(fd, " tracks per cylinder %ld", &(g->tracks_per_cyl)) != 1) || g->tracks_per_cyl <= 0) { + fprintf(stderr,"Disk DB Error: Missing or invalid tracks/cyl for disk %s\n", name); exit(1); + } + lineno++; + if ( (fscanf(fd, " number of disk zones %ld", &(g->num_zones)) != 1) || g->num_zones <= 0) { + fprintf(stderr,"Disk DB Error: Missing or invalid number of zones for disk %s\n", name); exit(1); + } + + + + /* This section of code creates the linked list which + contains the disk's zone information. */ + g->zbr_data = (RF_ZoneList_t *) NULL; + znext_ptr = &(g->zbr_data); + num_cylinders = 0; + + /* This for-loop reads in the cylinder count, the sectors + per track, and track skew for each zone on the disk. */ + for (i=1; i <= g->num_zones; i++) { + lineno++; + if ( (fscanf(fd, " number of cylinders in zone %ld", &x) != 1) || x < 1) { + fprintf(stderr,"Disk DB Error: Zone %ld: Missing or invalid cyls/zone for disk %s\n", i, name); exit(1); + } + lineno++; + if ( (fscanf(fd, " sectors per track in zone %ld", &y) != 1) || y < 1 ) { + fprintf(stderr,"Disk DB Error: Zone %ld: Missing or invalid sectors/track for disk %s\n", i, name); exit(1); + } + lineno++; + if ( (fscanf(fd, " track skew in zone %ld", &z) != 1) || z < 0 ) { + fprintf(stderr,"Disk DB Error: Zone %ld: Missing or invalid track skew for disk %s\n",i, name); exit(1); + } + + RF_Calloc(*znext_ptr, 1, sizeof(RF_ZoneList_t), (RF_ZoneList_t *)); + (*znext_ptr)->next = (RF_ZoneList_t *) NULL; + (*znext_ptr)->zone.num_cylinders = x; + (*znext_ptr)->zone.sec_per_track = y; + (*znext_ptr)->zone.track_skew = z; + (*znext_ptr)->zone.num_sectors = + (*znext_ptr)->zone.num_cylinders * + g->tracks_per_cyl * + (*znext_ptr)->zone.sec_per_track; + znext_ptr = &((*znext_ptr)->next); + num_cylinders = num_cylinders + x; + } /* End of for-loop */ + + lineno++; + if ( (fscanf(fd, " revolution time %f", &tmp_f1) != 1) || tmp_f1 <= 0) { + fprintf(stderr,"Disk DB Error: Missing or invalid revolution time for disk %s\n",name); exit(1); + } + lineno++; + if ( (fscanf(fd, " 1 cylinder seek time %f", &tmp_f2 ) != 1) || tmp_f2 <= 0) { + fprintf(stderr,"Disk DB Error: Missing or invalid 1-cyl seek time for disk %s\n",name); exit(1); + } + lineno++; + if ( (fscanf(fd, " max stroke seek time %f", &tmp_f3) != 1) || tmp_f3 <= 0) { + fprintf(stderr,"Disk DB Error: Missing or invalid max seek time for disk %s\n",name); exit(1); + } + lineno++; + if ( (fscanf(fd, " average seek time %f", &tmp_f4) != 1) || tmp_f4 <= 0) { + fprintf(stderr,"Disk DB Error: Missing or invalid avg seek time for disk %s\n",name); exit(1); + } + lineno++; + if ( (fscanf(fd, " time to sleep %f", &tmp_f5) != 1) || tmp_f4 <= 0) { + fprintf(stderr,"Disk DB Error: Missing or invalid time to sleep for disk %s\n",name); exit(1); + } + lineno++; + if ( 
(fscanf(fd, " time to spinup %f", &tmp_f6) != 1) || tmp_f4 <= 0) { + fprintf(stderr,"Disk DB Error: Missing or invalid time to sleep for disk %s\n",name); exit(1); + } + strcpy( g->disk_name, name ); + g->revolution_time = tmp_f1; + g->seek_one_cyl = tmp_f2; + g->seek_max_stroke = tmp_f3; + g->seek_avg = tmp_f4; + g->time_to_sleep = tmp_f5; + g->time_to_spinup = tmp_f6; + /* convert disk specs to seek equation coeff */ + rf_DiskParam( num_cylinders, g->seek_one_cyl, + g->seek_avg, g->seek_max_stroke, + &g->seek_sqrt_coeff, &g->seek_linear_coeff, + &g->seek_constant_coeff ); + } + return( list ); +} + +static void Format_disk(disk, sectors_per_block) + RF_DiskState_t *disk; + long sectors_per_block; +{ + long sector_count = 0; + RF_ZoneList_t *z; + + if( disk == (RF_DiskState_t *) NULL ) RF_PANIC(); + if( disk->geom == (RF_Geometry_t *) NULL ) RF_PANIC(); + if( sectors_per_block <=0 ) RF_PANIC(); + + disk->sectors_per_block = sectors_per_block; + z = disk->geom->zbr_data; + /* This while-loop visits each disk zone and computes the total + number of sectors on the disk. */ + while (z != (RF_ZoneList_t *) NULL) { + sector_count = sector_count + (z->zone.num_cylinders * + disk->geom->tracks_per_cyl * + z->zone.sec_per_track); + z = z->next; + } + + disk->last_block_index = (sector_count / sectors_per_block) - 1; +} + +void rf_InitDisk( disk, disk_db, disk_name, init_cyl, init_track, init_offset, row, col) + RF_DiskState_t *disk; + char *disk_db; + char *disk_name; + long init_cyl; + long init_track; + RF_TICS_t init_offset; + int row; + int col; +{ + RF_GeometryList_t *gp; + FILE *f; + + RF_ASSERT( disk != (RF_DiskState_t *) NULL ); + + disk->cur_cyl = init_cyl; + disk->cur_track = init_track; + disk->index_offset = init_offset; + disk->geom = (RF_Geometry_t *) NULL; + disk->queueFinishTime = 0.0; + disk->lastBlock = 0; + disk->row=row; + disk->col=col; + Zero_stats(disk); + + if (strncmp(disk_name,"/dev",4 )==0) strcpy(disk_name,"HP2247"); + + if( geom_list == (RF_GeometryList_t *) NULL ) { + f = fopen(disk_db,"r"); + if (f == NULL) { + fprintf(stderr, "ERROR: RAIDframe could not open disk db %s\n", disk_db); + exit(1); + } + geom_list = Fetch_geometry_db( f ); + fclose( f ); + } + for( gp = geom_list; gp != (RF_GeometryList_t *) NULL; gp = gp->next ) { + RF_ASSERT( gp->disk != (RF_Geometry_t *) NULL + && gp->disk->disk_name != (char *) NULL ); + if( strncmp( disk_name, gp->disk->disk_name, RF_MAX_DISKNAME_LEN ) + == 0 ) { + disk->geom = gp->disk; + break; + } + } + if( disk->geom == (RF_Geometry_t *) NULL ) { + fprintf( stderr, "Disk %s not found in database %s\n", + disk_name, disk_db ); + exit(1); + } + + Format_disk( disk, 1 ); +} + +static long Find_cyl( block, disk ) + RF_SectorNum_t block; + RF_DiskState_t *disk; +{ + RF_ZoneList_t * z; + long tmp; + + long log_sector = block * disk->sectors_per_block; + long cylinder = 0; + z = disk->geom->zbr_data; + /* This while-loop finds the zone to which log_sector belongs, + computes the starting cylinder number of this zone, and + computes the sector offset into this zone. */ + while (log_sector >= z->zone.num_sectors) { + log_sector = log_sector - z->zone.num_sectors; + cylinder = cylinder + z->zone.num_cylinders; + z = z->next; + } + + /* The cylinder to which log_sector belongs equals the starting + cylinder number of its zone plus the cylinder offset into + the zone. 
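For reference, the fscanf() calls in Fetch_geometry_db() above imply a plain-text database with one labelled field per line and an arbitrary marker token in front of each entry; only the token "enddisk" is treated specially, as the end of the file's entries. A stanza along the following lines would parse. The numeric values are purely illustrative, and the entry name matters only in that rf_InitDisk() rewrites any "/dev..." name to "HP2247", so a usable database is expected to contain an HP2247 entry.

disk
HP2247
tracks per cylinder 13
number of disk zones 1
number of cylinders in zone 1981
sectors per track in zone 56
track skew in zone 6
revolution time 11.1
1 cylinder seek time 2.5
max stroke seek time 25.0
average seek time 10.0
time to sleep 5000.0
time to spinup 2000.0
enddisk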
*/ + tmp = cylinder + (log_sector / (z->zone.sec_per_track * + disk->geom->tracks_per_cyl)); + + return( tmp ); +} + +static long Find_track( block, disk ) + RF_SectorNum_t block; + RF_DiskState_t *disk; +{ + RF_ZoneList_t * z; + long tmp; + + long log_sector = block * disk->sectors_per_block; + long track = 0; + z = disk->geom->zbr_data; + /* This while-loop finds the zone to which log_sector belongs, + computes the starting track number of this zone, and computes + the sector offset into this zone. */ + while (log_sector >= z->zone.num_sectors) { + log_sector = log_sector - z->zone.num_sectors; + track = track + (z->zone.num_cylinders * + disk->geom->tracks_per_cyl); + z = z->next; + } + + /* The track to which log_sector belongs equals the starting + track number of its zone plus the track offset into the zone, + modulo the number of tracks per cylinder on the disk. */ + tmp = (track + (log_sector / z->zone.sec_per_track)) % + disk->geom->tracks_per_cyl; + + return( tmp ); +} + +/* + ** The position of a logical sector relative to the index mark on any track + ** is not simple. A simple organization would be: +** +** track 0 : 0, 1, 2, 3, ... N-1 +** track 1 : N,N+1,N+2,N+3, ... 2N-1 +** ^ +** Index mark just before this point +** +** This is not good because sequential access of sectors N-1 then N +** will require a full revolution in between (because track switch requires +** a couple of sectors to recalibrate from embedded servo). So frequently +** sequentially numbered sectors are physically skewed so that the next +** accessible sector after N-1 will be N (with a skew of 2) +** +** track 0 : 0, 1, 2, 3, ... N-1 +** track 1 : 2N-2,2N-1, N, N+1, ... 2N-3 +** ^ +** Index mark just before this point +** +** Layout gets even more complex with cylinder boundaries. Seek time +** is A + B*M where M is the number of cylinders to seek over. On a sequential +** access that crosses a cylinder boundary, the disk will rotate for +** A+B seconds, then "track skew" sectors (inter-sector gaps actually) +** before it can access another sector, so the cylinder to cylinder skew +** is "track skew" + CEIL( sectors_per_track*(A+B)/revolution_time ). +** +** So if sector 0 is 0 sectors from the index mark on the first track, +** where is sector X relative to the index mark on its track? +** +** ( ( X % sectors_per_track ) basic relative position ** +** + track_skew * ( X / sectors_per_track ) skewed for each track ** +** + CEIL( sectors_per_track*(A+B)/revolution_time ) +** * ( X / sectors_per_cylinder ) skewed more for each cyl ** +** ) % sectors_per_track wrapped around in the track ** +** +** +*/ + +static long Find_phys_sector(block, disk) + RF_SectorNum_t block; + RF_DiskState_t *disk; +{ + long phys = 0; + RF_ZoneList_t * z; + long previous_spt = 1; + long sector = block * disk->sectors_per_block; + + z = disk->geom->zbr_data; + /* This while-loop finds the zone to which sector belongs, + and computes the physical sector up to that zone. */ + while (sector >= z->zone.num_sectors) { + sector = sector - z->zone.num_sectors; + /* By first multiplying 'phys' by the sectors per track in + the current zone divided by the sectors per track in the + previous zone, we convert a given physical sector in one + zone to an equivalent physical sector in another zone. 
*/ + phys = ((phys * z->zone.sec_per_track / previous_spt) + + (((z->zone.num_sectors - 1) % z->zone.sec_per_track) + + (z->zone.track_skew * z->zone.num_cylinders * + disk->geom->tracks_per_cyl) + + (long) ceil( (double) z->zone.sec_per_track * + (disk->geom->seek_constant_coeff) / + disk->geom->revolution_time) * + z->zone.num_cylinders)) % + z->zone.sec_per_track; + previous_spt = z->zone.sec_per_track; + z = z->next; + } + + /* The final physical sector equals the physical sector up to + the particular zone, plus the physical sector caused by the + sector offset into this zone. */ + phys = ((phys * z->zone.sec_per_track / previous_spt) + + ((sector % z->zone.sec_per_track) + + (z->zone.track_skew * (sector / z->zone.sec_per_track)) + + (long) ceil( (RF_TICS_t) z->zone.sec_per_track * + (disk->geom->seek_constant_coeff) / + disk->geom->revolution_time) * + (sector / (z->zone.sec_per_track * + disk->geom->tracks_per_cyl)))) % + z->zone.sec_per_track; + + + return( phys ); +} + +/* + ** When each disk starts up, its index mark is a fraction (f) of a rotation + ** ahead from its heads (in the direction of rotation). The sector + ** under its heads is at a fraction f of a rotation from the index + ** mark. After T time has past, T/rotation_time revolutions have occured, so + ** the sector under the heads is at a fraction FRAC(f+T/rotation_time) of a + ** rotation from the index mark. If the target block is at physical sector + ** X relative to its index mark, then it is at fraction (X/sectors_per_track), + ** so the rotational delay is + ** ((X/sectors_per_track)-FRAC(f+T/rotation_time)) * revolution_time + ** if this is positive, otherwise it is + ** (1+(X/sectors_per_track)-FRAC(f+T/rotation_time)) * revolution_time + */ + +#define FRAC(a) ( (a) - (long) floor(a) ) + +static RF_TICS_t Delay_to(cur_time, block, disk) + RF_TICS_t cur_time; + RF_SectorNum_t block; + RF_DiskState_t *disk; +{ + RF_TICS_t tmp; + RF_ZoneList_t *z; + + long sector = block * disk->sectors_per_block; + z = disk->geom->zbr_data; + /* This while-loop finds the zone to which sector belongs. */ + while (sector >= z->zone.num_sectors) { + sector = sector - z->zone.num_sectors; + z = z->next; + } + + tmp = ( + (RF_TICS_t) Find_phys_sector(block,disk)/z->zone.sec_per_track + - FRAC(disk->index_offset+cur_time/disk->geom->revolution_time) + ) * disk->geom->revolution_time; + if( tmp < 0 ) tmp += disk->geom->revolution_time; + if( tmp < 0 ) RF_PANIC(); + return( tmp ); +} + +/* Hmmm...they seem to be computing the head switch time as + * equal to the track skew penalty. Is this an approximation? + * (MCH) + */ +static RF_TICS_t Seek_time( to_cyl, to_track, from_cyl, from_track, disk ) + long to_cyl; + long to_track; + long from_cyl; + long from_track; + RF_DiskState_t *disk; +{ + long cyls = ABS_DIFF( from_cyl, to_cyl ) - 1; + RF_TICS_t seek = 0.0; + RF_ZoneList_t * z; + + /* printf("Seek_time: from_cyl %ld, to_cyl %ld, from_trk %ld, to_trk %ld\n",from_cyl, to_cyl, from_track, to_track); */ + if( from_cyl != to_cyl ) { + z = disk->geom->zbr_data; + /* This while-loop finds the zone to which to_cyl belongs. 
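To make the two comment blocks above concrete (the track/cylinder skew formula ahead of Find_phys_sector() and the rotational-delay formula ahead of Delay_to()), here is a single-zone simplification with invented numbers. CYL_SKEW stands in for the CEIL(sectors_per_track*(A+B)/revolution_time) term; every constant is illustrative, not taken from a real drive.

#include <math.h>
#include <stdio.h>

#define SPT      100          /* sectors per track */
#define TRACKS   4            /* tracks per cylinder */
#define SKEW     2            /* track skew, in sectors */
#define CYL_SKEW 3            /* extra skew per cylinder switch, in sectors */
#define REV_MS   10.0         /* revolution time */

#define FRAC(a)  ((a) - floor(a))

/* physical position of logical sector x relative to the index mark */
static long phys_sector(long x)
{
    return (x % SPT
            + SKEW * (x / SPT)
            + CYL_SKEW * (x / (SPT * TRACKS))) % SPT;
}

/* rotational delay to sector x at time t, with powerup index offset f */
static double rot_delay(long x, double t, double f)
{
    double d = ((double)phys_sector(x) / SPT - FRAC(f + t / REV_MS)) * REV_MS;
    return (d < 0.0) ? d + REV_MS : d;
}

int main(void)
{
    /* Sector 99 ends track 0 at offset 99; sector 100 starts track 1 at
     * offset 2, so a head switch after sector 99 does not lose a turn. */
    printf("%ld %ld\n", phys_sector(99), phys_sector(100));  /* 99 2 */

    /* Heads a quarter turn past the index at t=0, asking at t=7ms for a
     * sector halfway round its track: (0.5 - FRAC(0.95)) * 10 + 10 = 5.5 */
    printf("%.1f\n", rot_delay(50, 7.0, 0.25));
    return 0;
}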
*/ + while (to_cyl >= z->zone.num_cylinders) { + to_cyl = to_cyl - z->zone.num_cylinders; + z = z->next; + } + + seek = disk->geom->seek_constant_coeff + + disk->geom->seek_linear_coeff * cyls + + disk->geom->seek_sqrt_coeff * sqrt( (double) cyls ) + + z->zone.track_skew * disk->geom->revolution_time / + z->zone.sec_per_track; + + } else if( from_track != to_track ) { + /* from_track and to_track must lie in the same zone. */ + z = disk->geom->zbr_data; + /* This while-loop finds the zone to which from_cyl belongs. */ + while (from_cyl >= z->zone.num_cylinders) { + from_cyl = from_cyl - z->zone.num_cylinders; + z = z->next; + } + + seek = z->zone.track_skew + * disk->geom->revolution_time + / z->zone.sec_per_track; + } + return( seek ); +} + +static RF_TICS_t Seek(cur_time, block, disk, update) + RF_TICS_t cur_time; + RF_SectorNum_t block; + RF_DiskState_t *disk; + long update; +{ + long cur_cyl, cur_track; + /* + ** current location is derived from the time, + ** current track and current cylinder + ** + ** update current location as you go + */ + + RF_ASSERT( block <= disk->last_block_index ); + cur_cyl = disk->cur_cyl; + cur_track = disk->cur_track; + if (update) { + disk->cur_cyl = Find_cyl( block, disk ); + disk->cur_track = Find_track( block, disk ); + } + return( Seek_time( disk->cur_cyl, disk->cur_track, + cur_cyl, cur_track, disk ) ); +} + +static RF_TICS_t Rotate(cur_time, block, disk, update) + RF_TICS_t cur_time; + RF_SectorNum_t block; + RF_DiskState_t *disk; + long update; +{ + /* + ** current location is derived from the time, + ** current track and current cylinder + ** + ** block the process until at the appropriate block + ** updating current location as you go + */ + + RF_ASSERT( block <= disk->last_block_index ); + return( Delay_to( cur_time, block, disk ) ); +} + +static RF_TICS_t Seek_Rotate(cur_time, block, disk, update) + RF_TICS_t cur_time; + RF_SectorNum_t block; + RF_DiskState_t *disk; + long update; +{ + RF_TICS_t seek, delay; + + RF_ASSERT( block <= disk->last_block_index ); + seek = Seek( cur_time, block, disk, update ); + delay = seek + Rotate( cur_time+seek, block, disk, update ); + return( delay ); +} + +static RF_TICS_t GAP(sec_per_track, disk) + long sec_per_track; + RF_DiskState_t *disk; +{ + RF_TICS_t tmp = (disk->geom->revolution_time/(100*sec_per_track)); + return (tmp); +} + +RF_TICS_t Block_access_time(cur_time, block, numblocks, disk, update) + RF_TICS_t cur_time; + RF_SectorNum_t block; + RF_SectorCount_t numblocks; + RF_DiskState_t *disk; + long update; +{ + RF_TICS_t delay = 0; + long cur = block, end = block + numblocks; + long sector, tmp; + RF_ZoneList_t * z; + /* + ** this is the same as Seek_Rotate by merit of the mapping + ** except that the access ends before the gap to the next block + */ + RF_ASSERT( numblocks > 0 && end-1 <= disk->last_block_index ); + + while( cur < end ) { + sector = cur * disk->sectors_per_block; + z = disk->geom->zbr_data; + /* This while-loop finds the zone to which sector belongs. 
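The a/b/c coefficients consumed by Seek_time() above are produced by rf_DiskParam() at the end of this file from a drive's single-cylinder, average, and full-stroke seek times. The sketch below applies the same fit to made-up numbers and checks the end points: a one-cylinder move costs c = minSeek, and a full stroke comes back out near maxSeek (Seek_time() adds a further track-skew term on top of this curve).

#include <math.h>
#include <stdio.h>

int main(void)
{
    /* Hypothetical drive: 1981 cylinders, 2ms single-cylinder, 12ms
     * average, 25ms full-stroke seek; illustrative numbers only. */
    double numCyls = 1981.0, minS = 2.0, avgS = 12.0, maxS = 25.0;

    /* Same fit rf_DiskParam() applies later in this file. */
    double a = (15.0 * avgS - 10.0 * minS - 5.0 * maxS) / (3.0 * sqrt(numCyls));
    double b = (7.0 * minS + 8.0 * maxS - 15.0 * avgS) / (3.0 * numCyls);
    double c = minS;

    /* Seek_time() evaluates c + b*cyls + a*sqrt(cyls) with
     * cyls = seek distance - 1, so the full stroke uses cyls = 1979. */
    double cyls = numCyls - 2.0;

    printf("a=%.4f b=%.5f c=%.1f\n", a, b, c);
    printf("full stroke = %.2f ms\n", c + b * cyls + a * sqrt(cyls));  /* ~25 */
    return 0;
}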
*/ + while (sector >= z->zone.num_sectors) { + sector = sector - z->zone.num_sectors; + z = z->next; + } + + tmp = RF_MIN( end - cur, z->zone.sec_per_track + - cur % z->zone.sec_per_track ); + delay += tmp * disk->geom->revolution_time / + z->zone.sec_per_track - + GAP(z->zone.sec_per_track, disk); + cur += tmp; + if( cur != end ) + delay += Seek_Rotate( cur_time+delay, cur, disk, update ); + } + return( delay ); +} + +static void Zero_stats(disk) + RF_DiskState_t *disk; +{ + char traceFileName[64]; + disk->stats.num_events = 0; + disk->stats.seek_sum = 0; + disk->stats.seekSq_sum = 0; + disk->stats.rotate_sum = 0; + disk->stats.rotateSq_sum = 0; + disk->stats.transfer_sum = 0; + disk->stats.transferSq_sum = 0; + disk->stats.access_sum = 0; + disk->stats.accessSq_sum = 0; + disk->stats.sleep_sum=0; + disk->stats.idle_sum=0; + disk->stats.rw_sum=0; + disk->stats.spinup_sum=0; + disk->stats.last_acc=0; + if (rf_diskTrace){ + sprintf (traceFileName,"rf_diskTracer%dc%d\0",disk->row,disk->col); + if ( (disk->traceFile= fopen(traceFileName, "w")) == NULL) { + perror(traceFileName); RF_PANIC();} + } +} + +static RF_TICS_t Update_stats(cur_time, seek, rotate, transfer, disk) + RF_TICS_t cur_time; + RF_TICS_t seek; + RF_TICS_t rotate; + RF_TICS_t transfer; + RF_DiskState_t *disk; +{ + RF_TICS_t spinup=0; + RF_TICS_t sleep=0; + RF_TICS_t idle=0; + + disk->stats.num_events++; + disk->stats.seek_sum += seek; + disk->stats.seekSq_sum += seek*seek; + disk->stats.rotate_sum += rotate; + disk->stats.rotateSq_sum += rotate*rotate; + disk->stats.transfer_sum += transfer; + disk->stats.transferSq_sum += transfer*transfer; + disk->stats.access_sum += seek+rotate+transfer; + disk->stats.accessSq_sum += + (seek+rotate+transfer)*(seek+rotate+transfer); + +/* ASSERT (cur_time - disk->stats.last_acc >= 0); */ + + if (cur_time-disk->stats.last_acc>disk->geom->time_to_sleep){ + idle=disk->geom->time_to_sleep; + + sleep = cur_time - disk->stats.last_acc - idle; + spinup=disk->geom->time_to_spinup; + rf_globalSpinup = spinup; + } + + else{ + idle=cur_time - disk->stats.last_acc; + } + + + disk->stats.sleep_sum+=sleep; + disk->stats.idle_sum+=idle; + disk->stats.rw_sum+=seek+rotate+transfer; + disk->stats.spinup_sum+=spinup; + + if (rf_diskTrace){ + fprintf(disk->traceFile,"%g %g\n",disk->stats.last_acc,2.0); + fprintf(disk->traceFile,"%g %g\n",(disk->stats.last_acc+idle),2.0); + if (sleep){ + fprintf(disk->traceFile,"%g %g\n",(disk->stats.last_acc+idle),1.0); + fprintf(disk->traceFile,"%g %g\n",(disk->stats.last_acc+idle+sleep),1.0); + } + + if (spinup){ + fprintf(disk->traceFile,"%g %g\n",(cur_time),4.0); + fprintf(disk->traceFile,"%g %g\n",(cur_time+spinup),4.0); + } + + fprintf(disk->traceFile,"%g %g\n",(cur_time+spinup),3.0); + fprintf(disk->traceFile,"%g %g\n",(cur_time+spinup+seek+rotate+transfer),3.0); + + + } + + disk->stats.last_acc=cur_time+spinup+seek+rotate+transfer; + + return(spinup); +} + + +void rf_StopStats(disk, cur_time) + RF_DiskState_t *disk; + RF_TICS_t cur_time; +{ + + RF_TICS_t sleep=0; + RF_TICS_t idle=0; + + if (cur_time - disk->stats.last_acc > disk->geom->time_to_sleep){ + + sleep = cur_time - disk->stats.last_acc-disk->geom->time_to_sleep; + idle = disk->geom->time_to_sleep; + + } + + + + else{ + idle=cur_time - disk->stats.last_acc; + } + + disk->stats.sleep_sum+=sleep; + disk->stats.idle_sum+=idle; + + if (rf_diskTrace){ + fprintf(disk->traceFile,"%g %g\n",disk->stats.last_acc,2.0); + fprintf(disk->traceFile,"%g %g\n",(disk->stats.last_acc+idle),2.0); + if (sleep){ + 
fprintf(disk->traceFile,"%g %g\n",(disk->stats.last_acc+idle),1.0); + fprintf(disk->traceFile,"%g %g\n",(disk->stats.last_acc+idle+sleep),1.0); + } + fclose(disk->traceFile); + } +} + +/* Sometimes num_events is zero because the disk was failed at the start + * of the simulation and never replaced. This causes a crash on some + * architectures, which is why we have the conditional. + */ +void rf_Report_stats( + RF_DiskState_t *disk, + long *numEventsPtr, + RF_TICS_t *avgSeekPtr, + RF_TICS_t *avgRotatePtr, + RF_TICS_t *avgTransferPtr, + RF_TICS_t *avgAccessPtr, + RF_TICS_t *SleepPtr, + RF_TICS_t *IdlePtr, + RF_TICS_t *RwPtr, + RF_TICS_t *SpinupPtr) +{ + *numEventsPtr = disk->stats.num_events; + if (disk->stats.num_events) { + *avgSeekPtr = disk->stats.seek_sum / disk->stats.num_events; + *avgRotatePtr = disk->stats.rotate_sum / disk->stats.num_events; + *avgTransferPtr = disk->stats.transfer_sum / disk->stats.num_events; + *avgAccessPtr = disk->stats.access_sum / disk->stats.num_events; + } else { + *avgSeekPtr = 0; + *avgRotatePtr = 0; + *avgTransferPtr = 0; + *avgAccessPtr = 0; + } + *SleepPtr = disk->stats.sleep_sum; + *IdlePtr = disk->stats.idle_sum; + *RwPtr = disk->stats.rw_sum ; + *SpinupPtr = disk->stats.spinup_sum ; +} + +int rf_Access_time( access_time, cur_time, block, numblocks, disk, media_done_time, update ) + RF_TICS_t *access_time; + RF_TICS_t cur_time; + RF_SectorNum_t block; + RF_SectorCount_t numblocks; + RF_DiskState_t *disk; + RF_TICS_t *media_done_time; + long update; /* 1 => update disk state, 0 => don't */ +{ + /* + * first move to the start of the data, then sweep to the end + */ + RF_TICS_t spinup=0; + RF_TICS_t seek = Seek( cur_time, block, disk, update ); + RF_TICS_t rotate = Rotate( cur_time+seek, block, disk, update ); + RF_TICS_t transfer = Block_access_time( cur_time+seek+rotate, block, + numblocks, disk, update ); + + if (update) spinup=Update_stats(cur_time, seek, rotate, transfer, disk ); + *media_done_time = seek+rotate+transfer; + *access_time =( seek+rotate+transfer+spinup); + return(0); +} + +/* added to take into account the fact that maping code acounts for the disk label */ + +void rf_GeometryDoReadCapacity(disk, numBlocks, blockSize) + RF_DiskState_t *disk; + RF_SectorCount_t *numBlocks; + int *blockSize; +{ + *numBlocks= (disk->last_block_index + 1 )-rf_protectedSectors; + + *blockSize= (disk->sectors_per_block*512 ); + + /* in bytes */ +} + + +/* END GEOMETRY ROUTINES **********************************************/ + + +static void rf_DiskParam(numCyls, minSeek, avgSeek, maxSeek, a, b, c) + long numCyls; + RF_TICS_t minSeek; + RF_TICS_t avgSeek; + RF_TICS_t maxSeek; + RF_TICS_t *a; + RF_TICS_t *b; + RF_TICS_t *c; +{ + if (minSeek == avgSeek && minSeek == maxSeek) { + *a = 0.0; *b = 0.0; *c = minSeek; + } else { + *a = ( 15 * avgSeek - 10 * minSeek - 5 * maxSeek ) / ( 3 * sqrt( (double) numCyls )); + *b = ( 7 * minSeek + 8 * maxSeek - 15 * avgSeek ) / ( 3 * numCyls ); + *c = minSeek; + } +} diff --git a/sys/dev/raidframe/rf_geometry.h b/sys/dev/raidframe/rf_geometry.h new file mode 100644 index 00000000000..3d77b1ea402 --- /dev/null +++ b/sys/dev/raidframe/rf_geometry.h @@ -0,0 +1,155 @@ +/* $OpenBSD: rf_geometry.h,v 1.1 1999/01/11 14:29:24 niklas Exp $ */ +/* $NetBSD: rf_geometry.h,v 1.1 1998/11/13 04:20:30 oster Exp $ */ +/* geometry.h + * code from raidSim to model disk behavior + */ +/* + * Changes: + * 8/18/92 Additional structures have been declared and existing + * structures have been modified in order to support zone- + * bit recording. 
+ * (AS) 1. The types 'Zone_data' and 'Zone_list' have been defined. + * (AS) 2. The type 'Geometry' has been modified. + */ + +/* : + * Log: rf_geometry.h,v + * Revision 1.10 1996/08/06 22:25:08 jimz + * include raidframe stuff before system stuff + * + * Revision 1.9 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.8 1996/05/31 10:16:14 jimz + * add raidsim note + * + * Revision 1.7 1996/05/30 23:22:16 jimz + * bugfixes of serialization, timing problems + * more cleanup + * + * Revision 1.6 1996/05/30 11:29:41 jimz + * Numerous bug fixes. Stripe lock release code disagreed with the taking code + * about when stripes should be locked (I made it consistent: no parity, no lock) + * There was a lot of extra serialization of I/Os which I've removed- a lot of + * it was to calculate values for the cache code, which is no longer with us. + * More types, function, macro cleanup. Added code to properly quiesce the array + * on shutdown. Made a lot of stuff array-specific which was (bogusly) general + * before. Fixed memory allocation, freeing bugs. + * + * Revision 1.5 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.4 1995/12/01 18:29:45 root + * added copyright info + * + */ + +#ifndef _RF__RF_GEOMETRY_H_ +#define _RF__RF_GEOMETRY_H_ + +#include "rf_types.h" +#include "rf_sys.h" +#ifndef _KERNEL +#include <string.h> +#include <math.h> +#if defined(__NetBSD__) || defined(__OpenBSD__) +#include <stdio.h> +#endif /* __NetBSD__ || __OpenBSD__ */ +#endif + +#define RF_MAX_DISKNAME_LEN 80 + +typedef struct RF_ZoneData_s { + long num_cylinders; /* Number of cylinders in zone */ + long sec_per_track; /* Sectors per track in zone */ + long track_skew; /* Skew of each track in zone */ + long num_sectors; /* Number of sectors in zone */ +} RF_ZoneData_t; + +/* + * Linked list containing zone data + */ +typedef struct RF_ZoneList_s RF_ZoneList_t; +struct RF_ZoneList_s { + RF_ZoneData_t zone; /* for each disk */ + RF_ZoneList_t *next; +}; + +typedef struct RF_Geometry_s { + char disk_name[RF_MAX_DISKNAME_LEN]; /* name for a type of disk */ + long tracks_per_cyl; /* tracks in a cylinder */ + /* assume 1 head per track, 1 set of read/write electronics */ + long num_zones; /* number of ZBR zones on disk */ + RF_TICS_t revolution_time; /* milliseconds per revolution */ + RF_TICS_t seek_one_cyl; /* adjacent cylinder seek time */ + RF_TICS_t seek_max_stroke; /* end to end seek time */ + RF_TICS_t seek_avg; /* random from/to average time */ + /* + * seek time = a * (x-1)^0.5 + b * (x-1) + c + * x >= 1 is the seek distance in cylinders + */ + RF_TICS_t seek_sqrt_coeff; /* a */ + RF_TICS_t seek_linear_coeff; /* b */ + RF_TICS_t seek_constant_coeff; /* c */ + RF_ZoneList_t *zbr_data; /* linked list with ZBR data */ + RF_TICS_t time_to_sleep; /* seconds of idle time before disks goes to sleep */ + RF_TICS_t time_to_spinup; /* seconds spin up takes */ +} RF_Geometry_t; + +typedef struct RF_GeometryList_s RF_GeometryList_t; +struct RF_GeometryList_s { + RF_Geometry_t *disk; + RF_GeometryList_t *next; +}; + +typedef struct RF_DiskStats_s { + long num_events; + RF_TICS_t seek_sum; + RF_TICS_t seekSq_sum; + RF_TICS_t rotate_sum; + RF_TICS_t rotateSq_sum; + RF_TICS_t transfer_sum; + RF_TICS_t transferSq_sum; + RF_TICS_t access_sum; + RF_TICS_t accessSq_sum; + RF_TICS_t sleep_sum; + RF_TICS_t idle_sum; + RF_TICS_t 
rw_sum; + RF_TICS_t spinup_sum; + RF_TICS_t last_acc; /* time the last acces was finished */ +} RF_DiskStats_t; + +struct RF_DiskState_s { + int row; + int col; + RF_Geometry_t *geom; + long sectors_per_block; /* formatted per disk */ + long last_block_index; /* format result for convenience */ + RF_TICS_t index_offset; /* powerup head offset to index mark */ + long cur_track; /* current track */ + long cur_cyl; /* current cylinder */ + RF_DiskStats_t stats; /* disk statistics */ + + RF_TICS_t queueFinishTime; /* used by shortest-seek code */ + long lastBlock; + FILE *traceFile; +}; +typedef struct RF_DiskState_s RF_DiskState_t; + +extern RF_TICS_t rf_globalSpinup; + +void rf_InitDisk(RF_DiskState_t *disk, char *disk_name, char *disk_db, long init_cyl, + long init_track, RF_TICS_t init_offset, int row, int col); +void rf_StopStats(RF_DiskState_t *disk, RF_TICS_t cur_time); +void rf_Report_stats(RF_DiskState_t *disk, long *numEventsPtr, RF_TICS_t *avgSeekPtr, + RF_TICS_t *avgRotatePtr, RF_TICS_t *avgTransferPtr, RF_TICS_t *avgAccessPtr, + RF_TICS_t *SleepPtr, RF_TICS_t *IdlePtr, RF_TICS_t *RwPtr, RF_TICS_t *SpinupPtr); +int rf_Access_time(RF_TICS_t *access_time, RF_TICS_t cur_time, + RF_SectorNum_t block, RF_SectorCount_t numblocks, RF_DiskState_t *disk, + RF_TICS_t *media_done_time, long update); +void rf_GeometryDoReadCapacity(RF_DiskState_t *disk, RF_SectorCount_t *numBlocks, + int *blockSize); + +#endif /* !_RF__RF_GEOMETRY_H_ */ diff --git a/sys/dev/raidframe/rf_heap.c b/sys/dev/raidframe/rf_heap.c new file mode 100644 index 00000000000..ecb7a14518d --- /dev/null +++ b/sys/dev/raidframe/rf_heap.c @@ -0,0 +1,274 @@ +/* $OpenBSD: rf_heap.c,v 1.1 1999/01/11 14:29:25 niklas Exp $ */ +/* $NetBSD: rf_heap.c,v 1.1 1998/11/13 04:20:30 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* We manage a heap of data,key pairs, where the key a simple data type + * and the data is any singular data type. We allow the caller to add + * pairs, remote pairs, peek at the top pair, and do delete/add combinations. + * The latter are efficient because we only reheap once. + * + * David Kotz 1990? and 1993 + * + * Modify the heap to work with events, with the smallest time on the top. 
+ * Song Bac Toh, 1994 + */ + +/* : + * Log: rf_heap.c,v + * Revision 1.8 1996/07/28 20:31:39 jimz + * i386netbsd port + * true/false fixup + * + * Revision 1.7 1996/06/09 02:36:46 jimz + * lots of little crufty cleanup- fixup whitespace + * issues, comment #ifdefs, improve typing in some + * places (esp size-related) + * + * Revision 1.6 1996/05/30 11:29:41 jimz + * Numerous bug fixes. Stripe lock release code disagreed with the taking code + * about when stripes should be locked (I made it consistent: no parity, no lock) + * There was a lot of extra serialization of I/Os which I've removed- a lot of + * it was to calculate values for the cache code, which is no longer with us. + * More types, function, macro cleanup. Added code to properly quiesce the array + * on shutdown. Made a lot of stuff array-specific which was (bogusly) general + * before. Fixed memory allocation, freeing bugs. + * + * Revision 1.5 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.4 1995/12/01 19:03:58 root + * added copyright info + * + */ + +#include "rf_types.h" +#include "rf_heap.h" +#include "rf_general.h" + +/* return RF_TRUE if the two requests in the heap match */ +#define Matching_REQUESTS(HeapData1, HeapData2) \ +((HeapData1->disk == HeapData2->disk) && \ + (HeapData1->req_code == HeapData2->req_code)) + +/* getting around in the heap */ +/* we don't use the 0th element of the array */ +#define ROOT 1 +#define LCHILD(p) (2 * (p)) +#define RCHILD(p) (2 * (p) + 1) +#define PARENT(c) ((c) / 2) + +/* @SUBTITLE "Debugging macros" */ +/* The following are used for debugging our callers + * as well as internal stuff + */ + +#define CHECK_INVARIANTS 1 + +#ifdef CHECK_INVARIANTS +#define INVARIANT2(x, y) \ +{ \ + if (!(x)) { \ + fprintf(stderr, "INVARIANT false: in \"%s\", line %d\n", \ + __FILE__, __LINE__); \ + fprintf(stderr, (y)); \ + exit(1); \ + } \ +} + +/* +#define INVARIANT3(x, y, z) \ + { \ + if (!(x)) { \ + fprintf(stderr, "INVARIANT false: in \"%s\", line %d\n", \ + __FILE__, __LINE__); \ + fprintf(stderr, (y), (z)); \ + exit(1); \ + } \ + } + */ +#else /* CHECK_INVARIANTS */ +/* #define INVARIANT2(x, y) */ +/* #define INVARIANT3(x, y, z) already defined in modularize.h */ +#endif /* CHECK_INVARIANTS */ + +/**** Rachad, must add to general debug structure */ + + +/* @SUBTITLE "InitHeap: Allocate a new heap" */ +/* might return NULL if no free memory */ +RF_Heap_t rf_InitHeap(int maxsize) +{ + RF_Heap_t hp; + + RF_ASSERT(maxsize > 0); + RF_Malloc(hp, sizeof(struct RF_Heap_s),(RF_Heap_t)); + if (hp == NULL) { + fprintf(stderr, "InitHeap: No memory for heap\n"); + return(NULL); + } + + RF_Malloc(hp->heap,sizeof(RF_HeapEntry_t)*(maxsize+1),(RF_HeapEntry_t *)); + if (hp->heap == NULL) { + fprintf(stderr, "InitHeap: No memory for heap of %d elements\n", + maxsize); + RF_Free(hp,-1); /* -1 means don't cause an error if the size does not match */ + return(NULL); + } + + hp->numheap = 0; + hp->maxsize = maxsize; + + return(hp); +} + +/* @SUBTITLE "FreeHeap: delete a heap" */ +void rf_FreeHeap(RF_Heap_t hp) +{ + if (hp != NULL) { + RF_Free(hp->heap,sizeof(RF_HeapEntry_t)*(hp->maxsize+1)); + RF_Free(hp,sizeof(struct RF_Heap_s)); + } +} + +/* @SUBTITLE "AddHeap: Add an element to the heap" */ +void rf_AddHeap(RF_Heap_t hp, RF_HeapData_t *data, RF_HeapKey_t key) +{ + int node; + + INVARIANT2(hp != NULL, "AddHeap: NULL heap\n"); + INVARIANT2((hp->numheap < RF_HEAP_MAX), "AddHeap: Heap overflowed\n"); + + /* use new space end of heap */ + node = 
++(hp->numheap); + + /* and reheap */ + while (node != ROOT && hp->heap[PARENT(node)].key > key) { + hp->heap[node] = hp->heap[PARENT(node)]; + node = PARENT(node); + } + + hp->heap[node].data = data; + hp->heap[node].key = key; +} + +/* @SUBTITLE "TopHeap: Return top element of heap" */ +int rf_TopHeap(RF_Heap_t hp, RF_HeapData_t **data, RF_HeapKey_t *key) +{ + INVARIANT2(hp != NULL, "TopHeap: NULL heap\n"); + + if (hp->numheap > 0) { + if (data) + *data = hp->heap[ROOT].data; + if (key) + *key = hp->heap[ROOT].key; + return(RF_HEAP_FOUND); + } + else { + return(RF_HEAP_NONE); + } +} + +/* @SUBTITLE "RepHeap: Replace top of heap with given element and reheap" */ +/* note that hp->numheap does not change, and should already be > 0 */ +void rf_RepHeap(RF_Heap_t hp, RF_HeapData_t *data, RF_HeapKey_t key) +{ + int node; /* node in heap */ + int lchild, rchild; /* left and right children of node */ + int left, right; /* left and right children exist? */ + int swapped; /* swap was made? */ + RF_HeapEntry_t *heap; /* pointer to the base of this heap array */ + + INVARIANT2(hp != NULL, "RepHeap: NULL heap\n"); + + /* If heap is empty just add this element */ + /* if used properly this case should never come up */ + if (hp->numheap == 0) { + rf_AddHeap(hp, data, key); + + return; + } + + heap = hp->heap; /* cache the heap base pointer */ + + node = ROOT; + + do { + lchild = LCHILD(node); + rchild = RCHILD(node); + left = (lchild <= hp->numheap); + right = (rchild <= hp->numheap); + + /* Both children exist: which is smaller? */ + if (left && right) + if (heap[lchild].key < heap[rchild].key) + right = RF_HEAP_NONE; + else + left = RF_HEAP_NONE; + + /* Now only one of left and right is true. compare it with us */ + if (left && heap[lchild].key < key) { + /* swap with left child */ + heap[node] = heap[lchild]; + node = lchild; + swapped = RF_HEAP_FOUND; + } else if (right && heap[rchild].key < key) { + /* swap with right child */ + heap[node] = heap[rchild]; + node = rchild; + swapped = RF_HEAP_FOUND; + } else + swapped = RF_HEAP_NONE; + } while (swapped); + + /* final resting place for new element */ + heap[node].key = key; + heap[node].data = data; +} + +/* @SUBTITLE "RemHeap: Remove top element and reheap" */ +int rf_RemHeap(RF_Heap_t hp, RF_HeapData_t **data, RF_HeapKey_t *key) +{ + int node; + + /* we don't check hp's validity because TopHeap will do it for us */ + + /* get the top element into data and key, if any */ + if (rf_TopHeap(hp, data, key)) { + /* there was something there, so replace top with last element */ + node = hp->numheap--; + if (hp->numheap > 0) + rf_RepHeap(hp, hp->heap[node].data, hp->heap[node].key); + + return(RF_HEAP_FOUND); + } else{ + return(RF_HEAP_NONE); + } +} + diff --git a/sys/dev/raidframe/rf_heap.h b/sys/dev/raidframe/rf_heap.h new file mode 100644 index 00000000000..bf8f8cfdaf9 --- /dev/null +++ b/sys/dev/raidframe/rf_heap.h @@ -0,0 +1,128 @@ +/* $OpenBSD: rf_heap.h,v 1.1 1999/01/11 14:29:25 niklas Exp $ */ +/* $NetBSD: rf_heap.h,v 1.1 1998/11/13 04:20:30 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. 
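The heap code above is a conventional 1-based array binary min-heap keyed on event time: rf_AddHeap() sifts the new entry up, rf_RepHeap() sifts a replacement for the root down, and rf_RemHeap() removes the root by moving the last entry to the top and reheaping once. The stand-alone toy below, with plain doubles in place of RF_HeapData_t, shows the same sift-up and sift-down loops; it illustrates the technique and is not RAIDframe code.

#include <stdio.h>

#define MAXN 64

static double heap[MAXN + 1];   /* 1-based, like rf_heap */
static int    n = 0;

static void add(double key)     /* cf. rf_AddHeap: sift the new key up */
{
    int node = ++n;
    while (node > 1 && heap[node / 2] > key) {
        heap[node] = heap[node / 2];
        node /= 2;
    }
    heap[node] = key;
}

static double rem(void)         /* cf. rf_RemHeap: root out, last entry in */
{
    double top = heap[1], key = heap[n--];
    int node = 1;

    for (;;) {                  /* cf. rf_RepHeap: sift the moved key down */
        int child = 2 * node;
        if (child > n)
            break;
        if (child + 1 <= n && heap[child + 1] < heap[child])
            child++;
        if (heap[child] >= key)
            break;
        heap[node] = heap[child];
        node = child;
    }
    if (n > 0)
        heap[node] = key;
    return top;
}

int main(void)
{
    add(3.5); add(1.25); add(7.0); add(0.5);
    while (n > 0)
        printf("%.2f ", rem()); /* 0.50 1.25 3.50 7.00 */
    printf("\n");
    return 0;
}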
+ * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* @TITLE "heap.h - interface to heap management implementation */ +/* We manage a heap of data,key pairs, where the key could be any + * simple data type + * and the data is any pointer data type. We allow the caller to add + * pairs, remote pairs, peek at the top pair, and do delete/add combinations. + * The latter are efficient because we only reheap once. + * + * David Kotz 1990? and 1993 + */ + +/* : + * Log: rf_heap.h,v + * Revision 1.8 1996/05/30 11:29:41 jimz + * Numerous bug fixes. Stripe lock release code disagreed with the taking code + * about when stripes should be locked (I made it consistent: no parity, no lock) + * There was a lot of extra serialization of I/Os which I've removed- a lot of + * it was to calculate values for the cache code, which is no longer with us. + * More types, function, macro cleanup. Added code to properly quiesce the array + * on shutdown. Made a lot of stuff array-specific which was (bogusly) general + * before. Fixed memory allocation, freeing bugs. + * + * Revision 1.7 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.6 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.5 1995/12/01 19:04:07 root + * added copyright info + * + */ + +#ifndef _RF__RF_HEAP_H_ +#define _RF__RF_HEAP_H_ + +#include "rf_types.h" +#include "rf_raid.h" +#include "rf_dag.h" +#include "rf_desc.h" + +#define RF_HEAP_MAX 10240 + +#define RF_HEAP_FOUND 1 +#define RF_HEAP_NONE 0 + +typedef RF_TICS_t RF_HeapKey_t; + +typedef struct RF_HeapData_s RF_HeapData_t; +typedef struct RF_Heap_s *RF_Heap_t; +typedef struct RF_HeapEntry_s RF_HeapEntry_t; + +/* heap data */ +struct RF_HeapData_s { + RF_TICS_t eventTime; + int disk; + int (*CompleteFunc)(); /* function to be called upon completion */ + void *argument; /* argument to be passed to CompleteFunc */ + int owner; /* which task is resposable for this request */ + int row; + int col; /* coordinates of disk */ + RF_Raid_t *raidPtr; + void *diskid; + /* Dag event */ + RF_RaidAccessDesc_t *desc; +}; + +struct RF_HeapEntry_s { + RF_HeapData_t *data; /* the arbitrary data */ + RF_HeapKey_t key; /* key for comparison */ +}; + +struct RF_Heap_s { + RF_HeapEntry_t *heap; /* the heap in use (an array) */ + int numheap; /* number of elements in heap */ + int maxsize; +}; + +/* set up heap to hold maxsize nodes */ +RF_Heap_t rf_InitHeap(int maxsize); + +/* delete a heap data structure */ +void rf_FreeHeap(RF_Heap_t hp); + +/* add the element to the heap */ +void rf_AddHeap(RF_Heap_t hp, RF_HeapData_t *data, RF_HeapKey_t key); + +/* return top of the heap, without removing it from heap (FALSE if empty) */ +int rf_TopHeap(RF_Heap_t hp, RF_HeapData_t **data, RF_HeapKey_t *key); + +/* replace the heap's top item with a new item, and reheap */ +void rf_RepHeap(RF_Heap_t hp, RF_HeapData_t *data, RF_HeapKey_t key); + +/* remove 
the heap's top item, if any (FALSE if empty heap) */ +int rf_RemHeap(RF_Heap_t hp, RF_HeapData_t **data, RF_HeapKey_t *key); + +#endif /* !_RF__RF_HEAP_H_ */ diff --git a/sys/dev/raidframe/rf_hist.h b/sys/dev/raidframe/rf_hist.h new file mode 100644 index 00000000000..371c544d316 --- /dev/null +++ b/sys/dev/raidframe/rf_hist.h @@ -0,0 +1,73 @@ +/* $OpenBSD: rf_hist.h,v 1.1 1999/01/11 14:29:25 niklas Exp $ */ +/* $NetBSD: rf_hist.h,v 1.1 1998/11/13 04:20:30 oster Exp $ */ +/* + * rf_hist.h + * + * Histgram operations for RAIDframe stats + */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Jim Zelenka + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ +/* : + * Log: rf_hist.h,v + * Revision 1.3 1996/06/09 02:36:46 jimz + * lots of little crufty cleanup- fixup whitespace + * issues, comment #ifdefs, improve typing in some + * places (esp size-related) + * + * Revision 1.2 1996/05/31 22:26:54 jimz + * fix a lot of mapping problems, memory allocation problems + * found some weird lock issues, fixed 'em + * more code cleanup + * + * Revision 1.1 1996/05/31 10:33:05 jimz + * Initial revision + * + */ + +#ifndef _RF__RF_HIST_H_ +#define _RF__RF_HIST_H_ + +#include "rf_types.h" + +#define RF_HIST_RESOLUTION 5 +#define RF_HIST_MIN_VAL 0 +#define RF_HIST_MAX_VAL 1000 +#define RF_HIST_RANGE (RF_HIST_MAX_VAL - RF_HIST_MIN_VAL) +#define RF_HIST_NUM_BUCKETS (RF_HIST_RANGE / RF_HIST_RESOLUTION + 1) + +typedef RF_uint32 RF_Hist_t; + +#define RF_HIST_ADD(_hist_,_val_) { \ + RF_Hist_t val; \ + val = ((RF_Hist_t)(_val_)) / 1000; \ + if (val >= RF_HIST_MAX_VAL) \ + _hist_[RF_HIST_NUM_BUCKETS-1]++; \ + else \ + _hist_[(val - RF_HIST_MIN_VAL) / RF_HIST_RESOLUTION]++; \ +} + +#endif /* !_RF__RF_HIST_H_ */ diff --git a/sys/dev/raidframe/rf_interdecluster.c b/sys/dev/raidframe/rf_interdecluster.c new file mode 100644 index 00000000000..3ce97d075ee --- /dev/null +++ b/sys/dev/raidframe/rf_interdecluster.c @@ -0,0 +1,361 @@ +/* $OpenBSD: rf_interdecluster.c,v 1.1 1999/01/11 14:29:26 niklas Exp $ */ +/* $NetBSD: rf_interdecluster.c,v 1.1 1998/11/13 04:20:30 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Khalil Amiri + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. 
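The RF_HIST_ADD() macro in rf_hist.h above buckets a sample by first dividing it by 1000 and then grouping by RF_HIST_RESOLUTION, with everything at or beyond RF_HIST_MAX_VAL landing in the final overflow bucket. The header does not state the raw units; assuming microseconds in and milliseconds after the division, the stand-alone copy below shows where a few samples land. The sample values are made up.

#include <stdio.h>

#define RESOLUTION 5
#define MIN_VAL    0
#define MAX_VAL    1000
#define NBUCKETS   ((MAX_VAL - MIN_VAL) / RESOLUTION + 1)   /* 201 */

int main(void)
{
    unsigned hist[NBUCKETS] = { 0 };
    unsigned samples[] = { 12340, 4999, 2000000 };
    unsigned i;

    for (i = 0; i < 3; i++) {
        unsigned val = samples[i] / 1000;
        if (val >= MAX_VAL)
            hist[NBUCKETS - 1]++;                    /* overflow bucket */
        else
            hist[(val - MIN_VAL) / RESOLUTION]++;
    }
    /* 12.34 -> bucket 2, 4.999 -> bucket 0, 2000 -> bucket 200 */
    printf("%u %u %u\n", hist[2], hist[0], hist[NBUCKETS - 1]);
    return 0;
}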
CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/************************************************************ + * + * rf_interdecluster.c -- implements interleaved declustering + * + ************************************************************/ + +/* : + * Log: rf_interdecluster.c,v + * Revision 1.24 1996/08/02 13:20:38 jimz + * get rid of bogus (long) casts + * + * Revision 1.23 1996/07/31 16:56:18 jimz + * dataBytesPerStripe, sectorsPerDisk init arch-indep. + * + * Revision 1.22 1996/07/29 14:05:12 jimz + * fix numPUs/numRUs confusion (everything is now numRUs) + * clean up some commenting, return values + * + * Revision 1.21 1996/07/22 19:52:16 jimz + * switched node params to RF_DagParam_t, a union of + * a 64-bit int and a void *, for better portability + * attempted hpux port, but failed partway through for + * lack of a single C compiler capable of compiling all + * source files + * + * Revision 1.20 1996/07/18 22:57:14 jimz + * port simulator to AIX + * + * Revision 1.19 1996/07/13 00:00:59 jimz + * sanitized generalized reconstruction architecture + * cleaned up head sep, rbuf problems + * + * Revision 1.18 1996/06/19 17:53:48 jimz + * move GetNumSparePUs, InstallSpareTable ops into layout switch + * + * Revision 1.17 1996/06/11 15:17:55 wvcii + * added include of rf_interdecluster.h + * fixed parameter list of rf_ConfigureInterDecluster + * fixed return type of rf_GetNumSparePUsInterDecluster + * removed include of rf_raid1.h + * + * Revision 1.16 1996/06/11 08:55:15 jimz + * improved error-checking at configuration time + * + * Revision 1.15 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). + * + * Revision 1.14 1996/06/07 22:26:27 jimz + * type-ify which_ru (RF_ReconUnitNum_t) + * + * Revision 1.13 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.12 1996/06/06 18:41:48 jimz + * add interleaved declustering dag selection + * + * Revision 1.11 1996/06/02 17:31:48 jimz + * Moved a lot of global stuff into array structure, where it belongs. + * Fixed up paritylogging, pss modules in this manner. Some general + * code cleanup. Removed lots of dead code, some dead files. + * + * Revision 1.10 1996/05/31 22:26:54 jimz + * fix a lot of mapping problems, memory allocation problems + * found some weird lock issues, fixed 'em + * more code cleanup + * + * Revision 1.9 1996/05/31 05:03:01 amiri + * fixed a bug related to sparing layout. 
+ * + * Revision 1.8 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.7 1996/05/24 01:59:45 jimz + * another checkpoint in code cleanup for release + * time to sync kernel tree + * + * Revision 1.6 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.5 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.4 1996/05/03 19:50:38 wvcii + * removed include of rf_redstripe.h + * fixed change log parameters in header + * + */ + +#include "rf_types.h" +#include "rf_raid.h" +#include "rf_interdecluster.h" +#include "rf_dag.h" +#include "rf_dagutils.h" +#include "rf_dagfuncs.h" +#include "rf_threadid.h" +#include "rf_general.h" +#include "rf_utils.h" +#include "rf_dagffrd.h" +#include "rf_dagdegrd.h" +#include "rf_dagffwr.h" +#include "rf_dagdegwr.h" + +typedef struct RF_InterdeclusterConfigInfo_s { + RF_RowCol_t **stripeIdentifier; /* filled in at config time + * and used by IdentifyStripe */ + RF_StripeCount_t numSparingRegions; + RF_StripeCount_t stripeUnitsPerSparingRegion; + RF_SectorNum_t mirrorStripeOffset; +} RF_InterdeclusterConfigInfo_t; + +int rf_ConfigureInterDecluster( + RF_ShutdownList_t **listp, + RF_Raid_t *raidPtr, + RF_Config_t *cfgPtr) +{ + RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; + RF_StripeCount_t num_used_stripeUnitsPerDisk; + RF_InterdeclusterConfigInfo_t *info; + RF_RowCol_t i, tmp, SUs_per_region; + + /* create an Interleaved Declustering configuration structure */ + RF_MallocAndAdd(info, sizeof(RF_InterdeclusterConfigInfo_t), (RF_InterdeclusterConfigInfo_t *), + raidPtr->cleanupList); + if (info == NULL) + return(ENOMEM); + layoutPtr->layoutSpecificInfo = (void *) info; + + /* fill in the config structure. */ + SUs_per_region = raidPtr->numCol * (raidPtr->numCol - 1); + info->stripeIdentifier = rf_make_2d_array(SUs_per_region, 2 , raidPtr->cleanupList); + if (info->stripeIdentifier == NULL) + return(ENOMEM); + for (i=0; i< SUs_per_region; i++) { + info->stripeIdentifier[i][0] = i / (raidPtr->numCol-1); + tmp = i / raidPtr->numCol; + info->stripeIdentifier[i][1] = (i+1+tmp) % raidPtr->numCol; + } + + /* no spare tables */ + RF_ASSERT(raidPtr->numRow == 1); + + /* fill in the remaining layout parameters */ + + /* total number of stripes should a multiple of 2*numCol: Each sparing region consists of + 2*numCol stripes: n-1 primary copy, n-1 secondary copy and 2 for spare .. 
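+	   (Editor's note, not part of the original RAIDframe source -- a worked example of the
+	   arithmetic below: with numCol = 5, each sparing region spans 2*5 = 10 stripe units on
+	   every disk (4 primary copies, 4 mirror copies, 2 spares), and stripeUnitsPerSparingRegion
+	   comes out to 5 * 4 = 20 primary stripe units per region.)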
 */ + num_used_stripeUnitsPerDisk = layoutPtr->stripeUnitsPerDisk - (layoutPtr->stripeUnitsPerDisk % + (2*raidPtr->numCol) ); + info->numSparingRegions = num_used_stripeUnitsPerDisk / (2*raidPtr->numCol); + /* this is in fact the number of stripe units (that are primary data copies) in the sparing region */ + info->stripeUnitsPerSparingRegion = raidPtr->numCol * (raidPtr->numCol - 1); + info->mirrorStripeOffset = info->numSparingRegions * (raidPtr->numCol+1); + layoutPtr->numStripe = info->numSparingRegions * info->stripeUnitsPerSparingRegion; + layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit << raidPtr->logBytesPerSector; + layoutPtr->numDataCol = 1; + layoutPtr->dataSectorsPerStripe = layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit; + layoutPtr->numParityCol = 1; + + layoutPtr->dataStripeUnitsPerDisk = num_used_stripeUnitsPerDisk; + + raidPtr->sectorsPerDisk = + num_used_stripeUnitsPerDisk * layoutPtr->sectorsPerStripeUnit; + + raidPtr->totalSectors = + (layoutPtr->numStripe) * layoutPtr->sectorsPerStripeUnit; + + layoutPtr->stripeUnitsPerDisk = raidPtr->sectorsPerDisk / layoutPtr->sectorsPerStripeUnit; + + return(0); +} + +int rf_GetDefaultNumFloatingReconBuffersInterDecluster(RF_Raid_t *raidPtr) +{ + return(30); +} + +RF_HeadSepLimit_t rf_GetDefaultHeadSepLimitInterDecluster(RF_Raid_t *raidPtr) +{ + return(raidPtr->sectorsPerDisk); +} + +RF_ReconUnitCount_t rf_GetNumSpareRUsInterDecluster( + RF_Raid_t *raidPtr) +{ + RF_InterdeclusterConfigInfo_t *info = (RF_InterdeclusterConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo; + + return ( 2 * ((RF_ReconUnitCount_t) info->numSparingRegions) ); + /* the layout uses two stripe units per disk as spare within each sparing region */ +} + +/* Maps to the primary copy of the data, i.e. the first mirror pair */ +void rf_MapSectorInterDecluster( + RF_Raid_t *raidPtr, + RF_RaidAddr_t raidSector, + RF_RowCol_t *row, + RF_RowCol_t *col, + RF_SectorNum_t *diskSector, + int remap) +{ + RF_InterdeclusterConfigInfo_t *info = (RF_InterdeclusterConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo; + RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit; + RF_StripeNum_t su_offset_into_disk, mirror_su_offset_into_disk; + RF_StripeNum_t sparing_region_id, index_within_region; + int col_before_remap; + + *row = 0; + sparing_region_id = SUID / info->stripeUnitsPerSparingRegion; + index_within_region = SUID % info->stripeUnitsPerSparingRegion; + su_offset_into_disk = index_within_region % (raidPtr->numCol-1); + mirror_su_offset_into_disk = index_within_region / raidPtr->numCol; + col_before_remap = index_within_region / (raidPtr->numCol-1); + + if (!remap) { + *col = col_before_remap; + *diskSector = ( su_offset_into_disk + ( (raidPtr->numCol-1) * sparing_region_id) ) * + raidPtr->Layout.sectorsPerStripeUnit; + *diskSector += (raidSector % raidPtr->Layout.sectorsPerStripeUnit); + } + else { + /* remap sector to spare space...*/ + *diskSector = sparing_region_id * (raidPtr->numCol+1) * raidPtr->Layout.sectorsPerStripeUnit; + *diskSector += (raidPtr->numCol-1) * raidPtr->Layout.sectorsPerStripeUnit; + *diskSector += (raidSector % raidPtr->Layout.sectorsPerStripeUnit); + *col = (index_within_region + 1 + mirror_su_offset_into_disk) % raidPtr->numCol; + *col = (*col + 1) % raidPtr->numCol; + if (*col == col_before_remap) *col = (*col + 1) % raidPtr->numCol; + } +} + +/* Maps to the second copy of the mirror pair. 
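+   (Editor's note, not part of the original RAIDframe source -- a worked example of the mapping
+   below: with numCol = 4 a sparing region holds 4*3 = 12 primary stripe units; the three units
+   whose primary copy lands on column 0 (indices 0..2) get their mirror copies on columns 1, 2
+   and 3 respectively, so each disk's data is mirrored one stripe unit at a time across all of
+   the other disks.)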
*/ +void rf_MapParityInterDecluster( + RF_Raid_t *raidPtr, + RF_RaidAddr_t raidSector, + RF_RowCol_t *row, + RF_RowCol_t *col, + RF_SectorNum_t *diskSector, + int remap) +{ + RF_InterdeclusterConfigInfo_t *info = (RF_InterdeclusterConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo; + RF_StripeNum_t sparing_region_id, index_within_region, mirror_su_offset_into_disk; + RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit; + int col_before_remap; + + sparing_region_id = SUID / info->stripeUnitsPerSparingRegion; + index_within_region = SUID % info->stripeUnitsPerSparingRegion; + mirror_su_offset_into_disk = index_within_region / raidPtr->numCol; + col_before_remap = (index_within_region + 1 + mirror_su_offset_into_disk) % raidPtr->numCol; + + *row = 0; + if (!remap) { + *col = col_before_remap; + *diskSector = info->mirrorStripeOffset * raidPtr->Layout.sectorsPerStripeUnit; + *diskSector += sparing_region_id * (raidPtr->numCol-1) * raidPtr->Layout.sectorsPerStripeUnit; + *diskSector += mirror_su_offset_into_disk * raidPtr->Layout.sectorsPerStripeUnit; + *diskSector += (raidSector % raidPtr->Layout.sectorsPerStripeUnit); + } + else { + /* remap parity to spare space ... */ + *diskSector = sparing_region_id * (raidPtr->numCol+1) * raidPtr->Layout.sectorsPerStripeUnit; + *diskSector += (raidPtr->numCol) * raidPtr->Layout.sectorsPerStripeUnit; + *diskSector += (raidSector % raidPtr->Layout.sectorsPerStripeUnit); + *col = index_within_region / (raidPtr->numCol-1); + *col = (*col + 1) % raidPtr->numCol; + if (*col == col_before_remap) *col = (*col + 1) % raidPtr->numCol; + } +} + +void rf_IdentifyStripeInterDecluster( + RF_Raid_t *raidPtr, + RF_RaidAddr_t addr, + RF_RowCol_t **diskids, + RF_RowCol_t *outRow) +{ + RF_InterdeclusterConfigInfo_t *info = (RF_InterdeclusterConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo; + RF_StripeNum_t SUID; + + SUID = addr / raidPtr->Layout.sectorsPerStripeUnit; + SUID = SUID % info->stripeUnitsPerSparingRegion; + + *outRow = 0; + *diskids = info->stripeIdentifier[ SUID ]; +} + +void rf_MapSIDToPSIDInterDecluster( + RF_RaidLayout_t *layoutPtr, + RF_StripeNum_t stripeID, + RF_StripeNum_t *psID, + RF_ReconUnitNum_t *which_ru) +{ + *which_ru = 0; + *psID = stripeID; +} + +/****************************************************************************** + * select a graph to perform a single-stripe access + * + * Parameters: raidPtr - description of the physical array + * type - type of operation (read or write) requested + * asmap - logical & physical addresses for this access + * createFunc - name of function to use to create the graph + *****************************************************************************/ + +void rf_RAIDIDagSelect( + RF_Raid_t *raidPtr, + RF_IoType_t type, + RF_AccessStripeMap_t *asmap, + RF_VoidFuncPtr *createFunc) +{ + RF_ASSERT(RF_IO_IS_R_OR_W(type)); + + if (asmap->numDataFailed + asmap->numParityFailed > 1) { + RF_ERRORMSG("Multiple disks failed in a single group! Aborting I/O operation.\n"); + *createFunc = NULL; + return; + } + + *createFunc = (type == RF_IO_TYPE_READ) ? 
(RF_VoidFuncPtr)rf_CreateFaultFreeReadDAG : (RF_VoidFuncPtr)rf_CreateRaidOneWriteDAG; + if (type == RF_IO_TYPE_READ) { + if (asmap->numDataFailed == 0) + *createFunc = (RF_VoidFuncPtr)rf_CreateMirrorPartitionReadDAG; + else + *createFunc = (RF_VoidFuncPtr)rf_CreateRaidOneDegradedReadDAG; + } + else + *createFunc = (RF_VoidFuncPtr)rf_CreateRaidOneWriteDAG; +} diff --git a/sys/dev/raidframe/rf_interdecluster.h b/sys/dev/raidframe/rf_interdecluster.h new file mode 100644 index 00000000000..a76ea9dcb46 --- /dev/null +++ b/sys/dev/raidframe/rf_interdecluster.h @@ -0,0 +1,112 @@ +/* $OpenBSD: rf_interdecluster.h,v 1.1 1999/01/11 14:29:26 niklas Exp $ */ +/* $NetBSD: rf_interdecluster.h,v 1.1 1998/11/13 04:20:30 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Khalil Amiri + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* rf_interdecluster.h + * header file for Interleaved Declustering + */ + +/* + * : + * Log: rf_interdecluster.h,v + * Revision 1.13 1996/07/29 14:05:12 jimz + * fix numPUs/numRUs confusion (everything is now numRUs) + * clean up some commenting, return values + * + * Revision 1.12 1996/07/22 19:52:16 jimz + * switched node params to RF_DagParam_t, a union of + * a 64-bit int and a void *, for better portability + * attempted hpux port, but failed partway through for + * lack of a single C compiler capable of compiling all + * source files + * + * Revision 1.11 1996/07/13 00:00:59 jimz + * sanitized generalized reconstruction architecture + * cleaned up head sep, rbuf problems + * + * Revision 1.10 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). 
+ * + * Revision 1.9 1996/06/07 22:26:27 jimz + * type-ify which_ru (RF_ReconUnitNum_t) + * + * Revision 1.8 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.7 1996/06/06 18:41:58 jimz + * add RAIDIDagSelect + * + * Revision 1.6 1996/05/31 22:26:54 jimz + * fix a lot of mapping problems, memory allocation problems + * found some weird lock issues, fixed 'em + * more code cleanup + * + * Revision 1.5 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.4 1996/05/24 01:59:45 jimz + * another checkpoint in code cleanup for release + * time to sync kernel tree + * + * Revision 1.3 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.2 1995/12/01 19:07:25 root + * added copyright info + * + * Revision 1.1 1995/11/28 21:38:27 amiri + * Initial revision + */ + +#ifndef _RF__RF_INTERDECLUSTER_H_ +#define _RF__RF_INTERDECLUSTER_H_ + +int rf_ConfigureInterDecluster(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr, + RF_Config_t *cfgPtr); +int rf_GetDefaultNumFloatingReconBuffersInterDecluster(RF_Raid_t *raidPtr); +RF_HeadSepLimit_t rf_GetDefaultHeadSepLimitInterDecluster(RF_Raid_t *raidPtr); +RF_ReconUnitCount_t rf_GetNumSpareRUsInterDecluster(RF_Raid_t *raidPtr); +void rf_MapSectorInterDecluster(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector, + RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap); +void rf_MapParityInterDecluster(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector, + RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap); +void rf_IdentifyStripeInterDecluster(RF_Raid_t *raidPtr, RF_RaidAddr_t addr, + RF_RowCol_t **diskids, RF_RowCol_t *outRow); +void rf_MapSIDToPSIDInterDecluster(RF_RaidLayout_t *layoutPtr, + RF_StripeNum_t stripeID, RF_StripeNum_t *psID, + RF_ReconUnitNum_t *which_ru); +void rf_RAIDIDagSelect(RF_Raid_t *raidPtr, RF_IoType_t type, + RF_AccessStripeMap_t *asmap, RF_VoidFuncPtr *createFunc); + +#endif /* !_RF__RF_INTERDECLUSTER_H_ */ diff --git a/sys/dev/raidframe/rf_invertq.c b/sys/dev/raidframe/rf_invertq.c new file mode 100644 index 00000000000..c1e07aa257f --- /dev/null +++ b/sys/dev/raidframe/rf_invertq.c @@ -0,0 +1,55 @@ +/* $OpenBSD: rf_invertq.c,v 1.1 1999/01/11 14:29:26 niklas Exp $ */ +/* $NetBSD: rf_invertq.c,v 1.1 1998/11/13 04:20:30 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Daniel Stodolsky + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 
+ * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* : + * Log: rf_invertq.c,v + * Revision 1.5 1996/07/29 16:36:36 jimz + * include rf_archs.h here, not rf_invertq.h, to avoid VPATH + * problems in OSF/1 kernel + * + * Revision 1.4 1995/11/30 15:57:27 wvcii + * added copyright info + * + */ + +#ifdef _KERNEL +#define KERNEL +#endif + +#include "rf_archs.h" +#include "rf_pqdeg.h" +#ifdef KERNEL +#if !defined(__NetBSD__) && !defined(__OpenBSD__) +#include <raidframe/du_data/rf_invertq.h> +#else +#include "rf_invertq.h" /* XXX this is a hack. */ +#endif /* !__NetBSD__ && !__OpenBSD__ */ +#else /* KERNEL */ +#include "rf_invertq.h" +#endif /* KERNEL */ diff --git a/sys/dev/raidframe/rf_invertq.h b/sys/dev/raidframe/rf_invertq.h new file mode 100644 index 00000000000..e9c1e69d768 --- /dev/null +++ b/sys/dev/raidframe/rf_invertq.h @@ -0,0 +1,73 @@ +/* $OpenBSD: rf_invertq.h,v 1.1 1999/01/11 14:29:27 niklas Exp $ */ +/* $NetBSD: rf_invertq.h,v 1.1 1998/11/13 04:20:30 oster Exp $ */ +/* + * rf_invertq.h + */ +/* + * This is normally a generated file. Not so for Net- and OpenBSD. + */ + +#ifndef _RF__RF_INVERTQ_H_ +#define _RF__RF_INVERTQ_H_ + +#ifdef _KERNEL +#define KERNEL +#endif + +/* + * rf_geniq.c must include rf_archs.h before including + * this file (to get VPATH magic right with the way we + * generate this file in kernel trees) + */ +/* #include "rf_archs.h" */ + +#if (RF_INCLUDE_PQ > 0) || (RF_INCLUDE_RAID6 > 0) + +#define RF_Q_COLS 32 +RF_ua32_t rf_rn = { +1, 2, 4, 8, 16, 5, 10, 20, 13, 26, 17, 7, 14, 28, 29, 31, 27, 19, 3, 6, 12, 24, 21, 15, 30, 25, 23, 11, 22, 9, 18, 1, }; +RF_ua32_t rf_qfor[32] = { +/* i = 0 */ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, }, +/* i = 1 */ { 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 5, 7, 1, 3, 13, 15, 9, 11, 21, 23, 17, 19, 29, 31, 25, 27, }, +/* i = 2 */ { 0, 4, 8, 12, 16, 20, 24, 28, 5, 1, 13, 9, 21, 17, 29, 25, 10, 14, 2, 6, 26, 30, 18, 22, 15, 11, 7, 3, 31, 27, 23, 19, }, +/* i = 3 */ { 0, 8, 16, 24, 5, 13, 21, 29, 10, 2, 26, 18, 15, 7, 31, 23, 20, 28, 4, 12, 17, 25, 1, 9, 30, 22, 14, 6, 27, 19, 11, 3, }, +/* i = 4 */ { 0, 16, 5, 21, 10, 26, 15, 31, 20, 4, 17, 1, 30, 14, 27, 11, 13, 29, 8, 24, 7, 23, 2, 18, 25, 9, 28, 12, 19, 3, 22, 6, }, +/* i = 5 */ { 0, 5, 10, 15, 20, 17, 30, 27, 13, 8, 7, 2, 25, 28, 19, 22, 26, 31, 16, 21, 14, 11, 4, 1, 23, 18, 29, 24, 3, 6, 9, 12, }, +/* i = 6 */ { 0, 10, 20, 30, 13, 7, 25, 19, 26, 16, 14, 4, 23, 29, 3, 9, 17, 27, 5, 15, 28, 22, 8, 2, 11, 1, 31, 21, 6, 12, 18, 24, }, +/* i = 7 */ { 0, 20, 13, 25, 26, 14, 23, 3, 17, 5, 28, 8, 11, 31, 6, 18, 7, 19, 10, 30, 29, 9, 16, 4, 22, 2, 27, 15, 12, 24, 1, 21, }, +/* i = 8 */ { 0, 13, 26, 23, 17, 28, 11, 6, 7, 10, 29, 16, 22, 27, 12, 1, 14, 3, 20, 25, 31, 18, 5, 8, 9, 4, 19, 30, 24, 21, 2, 15, }, +/* i = 9 */ { 0, 26, 17, 11, 7, 29, 22, 12, 14, 20, 31, 5, 9, 19, 24, 2, 28, 6, 13, 23, 27, 1, 10, 16, 18, 8, 3, 25, 21, 15, 4, 30, }, +/* i = 10 */ { 0, 17, 7, 22, 14, 31, 9, 24, 28, 13, 27, 10, 18, 3, 21, 4, 29, 12, 26, 11, 19, 2, 20, 5, 1, 16, 6, 23, 15, 30, 8, 25, }, +/* i = 11 */ { 0, 7, 14, 9, 28, 27, 18, 21, 29, 26, 19, 20, 1, 6, 15, 8, 31, 24, 17, 22, 3, 4, 13, 10, 2, 5, 
12, 11, 30, 25, 16, 23, }, +/* i = 12 */ { 0, 14, 28, 18, 29, 19, 1, 15, 31, 17, 3, 13, 2, 12, 30, 16, 27, 21, 7, 9, 6, 8, 26, 20, 4, 10, 24, 22, 25, 23, 5, 11, }, +/* i = 13 */ { 0, 28, 29, 1, 31, 3, 2, 30, 27, 7, 6, 26, 4, 24, 25, 5, 19, 15, 14, 18, 12, 16, 17, 13, 8, 20, 21, 9, 23, 11, 10, 22, }, +/* i = 14 */ { 0, 29, 31, 2, 27, 6, 4, 25, 19, 14, 12, 17, 8, 21, 23, 10, 3, 30, 28, 1, 24, 5, 7, 26, 16, 13, 15, 18, 11, 22, 20, 9, }, +/* i = 15 */ { 0, 31, 27, 4, 19, 12, 8, 23, 3, 28, 24, 7, 16, 15, 11, 20, 6, 25, 29, 2, 21, 10, 14, 17, 5, 26, 30, 1, 22, 9, 13, 18, }, +/* i = 16 */ { 0, 27, 19, 8, 3, 24, 16, 11, 6, 29, 21, 14, 5, 30, 22, 13, 12, 23, 31, 4, 15, 20, 28, 7, 10, 17, 25, 2, 9, 18, 26, 1, }, +/* i = 17 */ { 0, 19, 3, 16, 6, 21, 5, 22, 12, 31, 15, 28, 10, 25, 9, 26, 24, 11, 27, 8, 30, 13, 29, 14, 20, 7, 23, 4, 18, 1, 17, 2, }, +/* i = 18 */ { 0, 3, 6, 5, 12, 15, 10, 9, 24, 27, 30, 29, 20, 23, 18, 17, 21, 22, 19, 16, 25, 26, 31, 28, 13, 14, 11, 8, 1, 2, 7, 4, }, +/* i = 19 */ { 0, 6, 12, 10, 24, 30, 20, 18, 21, 19, 25, 31, 13, 11, 1, 7, 15, 9, 3, 5, 23, 17, 27, 29, 26, 28, 22, 16, 2, 4, 14, 8, }, +/* i = 20 */ { 0, 12, 24, 20, 21, 25, 13, 1, 15, 3, 23, 27, 26, 22, 2, 14, 30, 18, 6, 10, 11, 7, 19, 31, 17, 29, 9, 5, 4, 8, 28, 16, }, +/* i = 21 */ { 0, 24, 21, 13, 15, 23, 26, 2, 30, 6, 11, 19, 17, 9, 4, 28, 25, 1, 12, 20, 22, 14, 3, 27, 7, 31, 18, 10, 8, 16, 29, 5, }, +/* i = 22 */ { 0, 21, 15, 26, 30, 11, 17, 4, 25, 12, 22, 3, 7, 18, 8, 29, 23, 2, 24, 13, 9, 28, 6, 19, 14, 27, 1, 20, 16, 5, 31, 10, }, +/* i = 23 */ { 0, 15, 30, 17, 25, 22, 7, 8, 23, 24, 9, 6, 14, 1, 16, 31, 11, 4, 21, 26, 18, 29, 12, 3, 28, 19, 2, 13, 5, 10, 27, 20, }, +/* i = 24 */ { 0, 30, 25, 7, 23, 9, 14, 16, 11, 21, 18, 12, 28, 2, 5, 27, 22, 8, 15, 17, 1, 31, 24, 6, 29, 3, 4, 26, 10, 20, 19, 13, }, +/* i = 25 */ { 0, 25, 23, 14, 11, 18, 28, 5, 22, 15, 1, 24, 29, 4, 10, 19, 9, 16, 30, 7, 2, 27, 21, 12, 31, 6, 8, 17, 20, 13, 3, 26, }, +/* i = 26 */ { 0, 23, 11, 28, 22, 1, 29, 10, 9, 30, 2, 21, 31, 8, 20, 3, 18, 5, 25, 14, 4, 19, 15, 24, 27, 12, 16, 7, 13, 26, 6, 17, }, +/* i = 27 */ { 0, 11, 22, 29, 9, 2, 31, 20, 18, 25, 4, 15, 27, 16, 13, 6, 1, 10, 23, 28, 8, 3, 30, 21, 19, 24, 5, 14, 26, 17, 12, 7, }, +/* i = 28 */ { 0, 22, 9, 31, 18, 4, 27, 13, 1, 23, 8, 30, 19, 5, 26, 12, 2, 20, 11, 29, 16, 6, 25, 15, 3, 21, 10, 28, 17, 7, 24, 14, }, +/* i = 29 */ { 0, 9, 18, 27, 1, 8, 19, 26, 2, 11, 16, 25, 3, 10, 17, 24, 4, 13, 22, 31, 5, 12, 23, 30, 6, 15, 20, 29, 7, 14, 21, 28, }, +/* i = 30 */ { 0, 18, 1, 19, 2, 16, 3, 17, 4, 22, 5, 23, 6, 20, 7, 21, 8, 26, 9, 27, 10, 24, 11, 25, 12, 30, 13, 31, 14, 28, 15, 29, }, +/* i = 31 */ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, }, +}; +#define RF_Q_DATA_COL(col_num) rf_rn[col_num],rf_qfor[28-(col_num)] +#ifdef KERNEL +RF_ua1024_t rf_qinv[1]; /* don't compile monster table into kernel */ +#elif defined(NO_PQ) +RF_ua1024_t rf_qinv[29*29]; +#else /* !KERNEL && NO_PQ */ + +#endif /* !KERNEL && NO_PQ */ + +#endif /* (RF_INCLUDE_PQ > 0) || (RF_INCLUDE_RAID6 > 0) */ +#endif /* !_RF__RF_INVERTQ_H_ */ diff --git a/sys/dev/raidframe/rf_kintf.h b/sys/dev/raidframe/rf_kintf.h new file mode 100644 index 00000000000..e270aa0b933 --- /dev/null +++ b/sys/dev/raidframe/rf_kintf.h @@ -0,0 +1,71 @@ +/* $OpenBSD: rf_kintf.h,v 1.1 1999/01/11 14:29:27 niklas Exp $ */ +/* $NetBSD: rf_kintf.h,v 1.1 1998/11/13 04:20:30 oster Exp $ */ +/* + * rf_kintf.h + * + * RAIDframe exported kernel interface + */ +/* + * Copyright (c) 1995 
Carnegie-Mellon University. + * All rights reserved. + * + * Author: Jim Zelenka + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ +/* + * : + * Log: rf_kintf.h,v + * Revision 1.2 1996/06/03 23:28:26 jimz + * more bugfixes + * check in tree to sync for IPDS runs with current bugfixes + * there still may be a problem with threads in the script test + * getting I/Os stuck- not trivially reproducible (runs ~50 times + * in a row without getting stuck) + * + * Revision 1.1 1996/05/31 18:59:14 jimz + * Initial revision + * + */ + +#ifndef _RF__RF_KINTF_H_ +#define _RF__RF_KINTF_H_ + +#include "rf_types.h" + +int rf_boot(void); +int rf_open(dev_t dev, int flag, int fmt); +int rf_close(dev_t dev, int flag, int fmt); +void rf_strategy(struct buf *bp); +void rf_minphys(struct buf *bp); +int rf_read(dev_t dev, struct uio *uio); +int rf_write(dev_t dev, struct uio *uio); +int rf_size(dev_t dev); +int rf_ioctl(dev_t dev, int cmd, caddr_t data, int flag); +void rf_ReconKernelThread(void); +int rf_GetSpareTableFromDaemon(RF_SparetWait_t *req); +caddr_t rf_MapToKernelSpace(struct buf *bp, caddr_t addr); +int rf_BzeroWithRemap(struct buf *bp, char *databuf, int len); +int rf_DoAccessKernel(RF_Raid_t *raidPtr, struct buf *bp, + RF_RaidAccessFlags_t flags, void (*cbFunc)(struct buf *), void *cbArg); +int rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req); + +#endif /* _RF__RF_KINTF_H_ */ diff --git a/sys/dev/raidframe/rf_layout.c b/sys/dev/raidframe/rf_layout.c new file mode 100644 index 00000000000..a8a06e044ff --- /dev/null +++ b/sys/dev/raidframe/rf_layout.c @@ -0,0 +1,720 @@ +/* $OpenBSD: rf_layout.c,v 1.1 1999/01/11 14:29:27 niklas Exp $ */ +/* $NetBSD: rf_layout.c,v 1.1 1998/11/13 04:20:30 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 
+ * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* rf_layout.c -- driver code dealing with layout and mapping issues + */ + +/* + * : + * Log: rf_layout.c,v + * Revision 1.71 1996/08/20 22:41:30 jimz + * add declustered evenodd + * + * Revision 1.70 1996/07/31 16:56:18 jimz + * dataBytesPerStripe, sectorsPerDisk init arch-indep. + * + * Revision 1.69 1996/07/31 15:34:46 jimz + * add EvenOdd + * + * Revision 1.68 1996/07/29 14:05:12 jimz + * fix numPUs/numRUs confusion (everything is now numRUs) + * clean up some commenting, return values + * + * Revision 1.67 1996/07/27 23:36:08 jimz + * Solaris port of simulator + * + * Revision 1.66 1996/07/27 18:40:24 jimz + * cleanup sweep + * + * Revision 1.65 1996/07/18 22:57:14 jimz + * port simulator to AIX + * + * Revision 1.64 1996/07/15 17:22:18 jimz + * nit-pick code cleanup + * resolve stdlib problems on DEC OSF + * + * Revision 1.63 1996/07/13 00:00:59 jimz + * sanitized generalized reconstruction architecture + * cleaned up head sep, rbuf problems + * + * Revision 1.62 1996/07/11 19:08:00 jimz + * generalize reconstruction mechanism + * allow raid1 reconstructs via copyback (done with array + * quiesced, not online, therefore not disk-directed) + * + * Revision 1.61 1996/06/19 22:23:01 jimz + * parity verification is now a layout-configurable thing + * not all layouts currently support it (correctly, anyway) + * + * Revision 1.60 1996/06/19 17:53:48 jimz + * move GetNumSparePUs, InstallSpareTable ops into layout switch + * + * Revision 1.59 1996/06/19 14:57:58 jimz + * move layout-specific config parsing hooks into RF_LayoutSW_t + * table in rf_layout.c + * + * Revision 1.58 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). + * + * Revision 1.57 1996/06/07 22:26:27 jimz + * type-ify which_ru (RF_ReconUnitNum_t) + * + * Revision 1.56 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.55 1996/06/06 18:41:35 jimz + * change interleaved declustering dag selection to an + * interleaved-declustering-specific routine (so we can + * use the partitioned mirror node) + * + * Revision 1.54 1996/06/05 18:06:02 jimz + * Major code cleanup. The Great Renaming is now done. + * Better modularity. Better typing. Fixed a bunch of + * synchronization bugs. Made a lot of global stuff + * per-desc or per-array. Removed dead code. + * + * Revision 1.53 1996/06/03 23:28:26 jimz + * more bugfixes + * check in tree to sync for IPDS runs with current bugfixes + * there still may be a problem with threads in the script test + * getting I/Os stuck- not trivially reproducible (runs ~50 times + * in a row without getting stuck) + * + * Revision 1.52 1996/06/02 17:31:48 jimz + * Moved a lot of global stuff into array structure, where it belongs. + * Fixed up paritylogging, pss modules in this manner. Some general + * code cleanup. Removed lots of dead code, some dead files. 
+ * + * Revision 1.51 1996/05/31 22:26:54 jimz + * fix a lot of mapping problems, memory allocation problems + * found some weird lock issues, fixed 'em + * more code cleanup + * + * Revision 1.50 1996/05/30 23:22:16 jimz + * bugfixes of serialization, timing problems + * more cleanup + * + * Revision 1.49 1996/05/30 11:29:41 jimz + * Numerous bug fixes. Stripe lock release code disagreed with the taking code + * about when stripes should be locked (I made it consistent: no parity, no lock) + * There was a lot of extra serialization of I/Os which I've removed- a lot of + * it was to calculate values for the cache code, which is no longer with us. + * More types, function, macro cleanup. Added code to properly quiesce the array + * on shutdown. Made a lot of stuff array-specific which was (bogusly) general + * before. Fixed memory allocation, freeing bugs. + * + * Revision 1.48 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.47 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.46 1996/05/24 01:59:45 jimz + * another checkpoint in code cleanup for release + * time to sync kernel tree + * + * Revision 1.45 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.44 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.43 1996/02/22 16:46:35 amiri + * modified chained declustering to use a seperate DAG selection routine + * + * Revision 1.42 1995/12/01 19:16:11 root + * added copyright info + * + * Revision 1.41 1995/11/28 21:31:02 amiri + * added Interleaved Declustering to switch table + * + * Revision 1.40 1995/11/20 14:35:17 arw + * moved rf_StartThroughputStats in DefaultWrite and DefaultRead + * + * Revision 1.39 1995/11/19 16:28:46 wvcii + * replaced LaunchDAGState with CreateDAGState, ExecuteDAGState + * + * Revision 1.38 1995/11/17 19:00:41 wvcii + * added MapQ entries to switch table + * + * Revision 1.37 1995/11/17 16:58:13 amiri + * Added the Chained Declustering architecture ('C'), + * essentially a variant of mirroring. + * + * Revision 1.36 1995/11/16 16:16:10 amiri + * Added RAID5 with rotated sparing ('R' configuration) + * + * Revision 1.35 1995/11/07 15:41:17 wvcii + * modified state lists: DefaultStates, VSReadStates + * necessary to support new states (LaunchDAGState, ProcessDAGState) + * + * Revision 1.34 1995/10/18 01:23:20 amiri + * added ifndef SIMULATE wrapper around rf_StartThroughputStats() + * + * Revision 1.33 1995/10/13 15:05:46 arw + * added rf_StartThroughputStats to DefaultRead and DefaultWrite + * + * Revision 1.32 1995/10/12 16:04:23 jimz + * added config names to mapsw entires + * + * Revision 1.31 1995/10/04 03:57:48 wvcii + * added raid level 1 to mapsw + * + * Revision 1.30 1995/09/07 01:26:55 jimz + * Achive basic compilation in kernel. Kernel functionality + * is not guaranteed at all, but it'll compile. Mostly. I hope. + * + * Revision 1.29 1995/07/28 21:43:42 robby + * checkin after leaving for Rice. 
Bye + * + * Revision 1.28 1995/07/26 03:26:14 robby + * *** empty log message *** + * + * Revision 1.27 1995/07/21 19:47:52 rachad + * Added raid 0 /5 with caching architectures + * + * Revision 1.26 1995/07/21 19:29:27 robby + * added virtual striping states + * + * Revision 1.25 1995/07/10 21:41:47 robby + * switched to have my own virtual stripng write function from the cache + * + * Revision 1.24 1995/07/10 20:51:59 robby + * added virtual striping states + * + * Revision 1.23 1995/07/10 16:57:42 robby + * updated alloclistelem struct to the correct struct name + * + * Revision 1.22 1995/07/08 20:06:11 rachad + * *** empty log message *** + * + * Revision 1.21 1995/07/08 19:43:16 cfb + * *** empty log message *** + * + * Revision 1.20 1995/07/08 18:05:39 rachad + * Linked up Claudsons code with the real cache + * + * Revision 1.19 1995/07/06 14:29:36 robby + * added defaults states list to the layout switch + * + * Revision 1.18 1995/06/23 13:40:34 robby + * updeated to prototypes in rf_layout.h + * + */ + +#include "rf_types.h" +#include "rf_archs.h" +#include "rf_raid.h" +#include "rf_configure.h" +#include "rf_dag.h" +#include "rf_desc.h" +#include "rf_decluster.h" +#include "rf_pq.h" +#include "rf_declusterPQ.h" +#include "rf_raid0.h" +#include "rf_raid1.h" +#include "rf_raid4.h" +#include "rf_raid5.h" +#include "rf_states.h" +#if RF_INCLUDE_RAID5_RS > 0 +#include "rf_raid5_rotatedspare.h" +#endif /* RF_INCLUDE_RAID5_RS > 0 */ +#if RF_INCLUDE_CHAINDECLUSTER > 0 +#include "rf_chaindecluster.h" +#endif /* RF_INCLUDE_CHAINDECLUSTER > 0 */ +#if RF_INCLUDE_INTERDECLUSTER > 0 +#include "rf_interdecluster.h" +#endif /* RF_INCLUDE_INTERDECLUSTER > 0 */ +#if RF_INCLUDE_PARITYLOGGING > 0 +#include "rf_paritylogging.h" +#endif /* RF_INCLUDE_PARITYLOGGING > 0 */ +#if RF_INCLUDE_EVENODD > 0 +#include "rf_evenodd.h" +#endif /* RF_INCLUDE_EVENODD > 0 */ +#include "rf_general.h" +#include "rf_driver.h" +#include "rf_parityscan.h" +#include "rf_reconbuffer.h" +#include "rf_reconutil.h" + +/*********************************************************************** + * + * the layout switch defines all the layouts that are supported. + * fields are: layout ID, init routine, shutdown routine, map + * sector, map parity, identify stripe, dag selection, map stripeid + * to parity stripe id (optional), num faults tolerated, special + * flags. + * + ***********************************************************************/ + +static RF_AccessState_t DefaultStates[] = {rf_QuiesceState, + rf_IncrAccessesCountState, rf_MapState, rf_LockState, rf_CreateDAGState, + rf_ExecuteDAGState, rf_ProcessDAGState, rf_DecrAccessesCountState, + rf_CleanupState, rf_LastState}; + +#if (defined(__NetBSD__) || defined(__OpenBSD__)) && !defined(_KERNEL) +/* XXX Gross hack to shutup gcc -- it complains that DefaultStates is not +used when compiling this in userland.. I hate to burst it's bubble, but +DefaultStates is used all over the place here in the initialization of +lots of data structures. GO */ +RF_AccessState_t *NothingAtAll = DefaultStates; +#endif + +#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL) +/* XXX Remove static so GCC doesn't complain about these being unused! 
*/ +int distSpareYes = 1; +int distSpareNo = 0; +#else +static int distSpareYes = 1; +static int distSpareNo = 0; +#endif +#ifdef KERNEL +#define RF_NK2(a,b) +#else /* KERNEL */ +#define RF_NK2(a,b) a,b, +#endif /* KERNEL */ + +#if RF_UTILITY > 0 +#define RF_NU(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) +#else /* RF_UTILITY > 0 */ +#define RF_NU(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p +#endif /* RF_UTILITY > 0 */ + +static RF_LayoutSW_t mapsw[] = { + /* parity declustering */ + {'T', "Parity declustering", + RF_NK2(rf_MakeLayoutSpecificDeclustered, &distSpareNo) + RF_NU( + rf_ConfigureDeclustered, + rf_MapSectorDeclustered, rf_MapParityDeclustered, NULL, + rf_IdentifyStripeDeclustered, + rf_RaidFiveDagSelect, + rf_MapSIDToPSIDDeclustered, + rf_GetDefaultHeadSepLimitDeclustered, + rf_GetDefaultNumFloatingReconBuffersDeclustered, + NULL, NULL, + rf_SubmitReconBufferBasic, + rf_VerifyParityBasic, + 1, + DefaultStates, + 0) + }, + + /* parity declustering with distributed sparing */ + {'D', "Distributed sparing parity declustering", + RF_NK2(rf_MakeLayoutSpecificDeclustered, &distSpareYes) + RF_NU( + rf_ConfigureDeclusteredDS, + rf_MapSectorDeclustered, rf_MapParityDeclustered, NULL, + rf_IdentifyStripeDeclustered, + rf_RaidFiveDagSelect, + rf_MapSIDToPSIDDeclustered, + rf_GetDefaultHeadSepLimitDeclustered, + rf_GetDefaultNumFloatingReconBuffersDeclustered, + rf_GetNumSpareRUsDeclustered, rf_InstallSpareTable, + rf_SubmitReconBufferBasic, + rf_VerifyParityBasic, + 1, + DefaultStates, + RF_DISTRIBUTE_SPARE|RF_BD_DECLUSTERED) + }, + +#if RF_INCLUDE_DECL_PQ > 0 + /* declustered P+Q */ + {'Q', "Declustered P+Q", + RF_NK2(rf_MakeLayoutSpecificDeclustered, &distSpareNo) + RF_NU( + rf_ConfigureDeclusteredPQ, + rf_MapSectorDeclusteredPQ, rf_MapParityDeclusteredPQ, rf_MapQDeclusteredPQ, + rf_IdentifyStripeDeclusteredPQ, + rf_PQDagSelect, + rf_MapSIDToPSIDDeclustered, + rf_GetDefaultHeadSepLimitDeclustered, + rf_GetDefaultNumFloatingReconBuffersPQ, + NULL, NULL, + NULL, + rf_VerifyParityBasic, + 2, + DefaultStates, + 0) + }, +#endif /* RF_INCLUDE_DECL_PQ > 0 */ + +#if RF_INCLUDE_RAID5_RS > 0 + /* RAID 5 with rotated sparing */ + {'R', "RAID Level 5 rotated sparing", + RF_NK2(rf_MakeLayoutSpecificNULL, NULL) + RF_NU( + rf_ConfigureRAID5_RS, + rf_MapSectorRAID5_RS, rf_MapParityRAID5_RS, NULL, + rf_IdentifyStripeRAID5_RS, + rf_RaidFiveDagSelect, + rf_MapSIDToPSIDRAID5_RS, + rf_GetDefaultHeadSepLimitRAID5, + rf_GetDefaultNumFloatingReconBuffersRAID5, + rf_GetNumSpareRUsRAID5_RS, NULL, + rf_SubmitReconBufferBasic, + rf_VerifyParityBasic, + 1, + DefaultStates, + RF_DISTRIBUTE_SPARE) + }, +#endif /* RF_INCLUDE_RAID5_RS > 0 */ + +#if RF_INCLUDE_CHAINDECLUSTER > 0 + /* Chained Declustering */ + {'C', "Chained Declustering", + RF_NK2(rf_MakeLayoutSpecificNULL, NULL) + RF_NU( + rf_ConfigureChainDecluster, + rf_MapSectorChainDecluster, rf_MapParityChainDecluster, NULL, + rf_IdentifyStripeChainDecluster, + rf_RAIDCDagSelect, + rf_MapSIDToPSIDChainDecluster, + NULL, + NULL, + rf_GetNumSpareRUsChainDecluster, NULL, + rf_SubmitReconBufferBasic, + rf_VerifyParityBasic, + 1, + DefaultStates, + 0) + }, +#endif /* RF_INCLUDE_CHAINDECLUSTER > 0 */ + +#if RF_INCLUDE_INTERDECLUSTER > 0 + /* Interleaved Declustering */ + {'I', "Interleaved Declustering", + RF_NK2(rf_MakeLayoutSpecificNULL, NULL) + RF_NU( + rf_ConfigureInterDecluster, + rf_MapSectorInterDecluster, rf_MapParityInterDecluster, NULL, + rf_IdentifyStripeInterDecluster, + rf_RAIDIDagSelect, + rf_MapSIDToPSIDInterDecluster, + 
rf_GetDefaultHeadSepLimitInterDecluster, + rf_GetDefaultNumFloatingReconBuffersInterDecluster, + rf_GetNumSpareRUsInterDecluster, NULL, + rf_SubmitReconBufferBasic, + rf_VerifyParityBasic, + 1, + DefaultStates, + RF_DISTRIBUTE_SPARE) + }, +#endif /* RF_INCLUDE_INTERDECLUSTER > 0 */ + +#if RF_INCLUDE_RAID0 > 0 + /* RAID level 0 */ + {'0', "RAID Level 0", + RF_NK2(rf_MakeLayoutSpecificNULL, NULL) + RF_NU( + rf_ConfigureRAID0, + rf_MapSectorRAID0, rf_MapParityRAID0, NULL, + rf_IdentifyStripeRAID0, + rf_RAID0DagSelect, + rf_MapSIDToPSIDRAID0, + NULL, + NULL, + NULL, NULL, + NULL, + rf_VerifyParityRAID0, + 0, + DefaultStates, + 0) + }, +#endif /* RF_INCLUDE_RAID0 > 0 */ + +#if RF_INCLUDE_RAID1 > 0 + /* RAID level 1 */ + {'1', "RAID Level 1", + RF_NK2(rf_MakeLayoutSpecificNULL, NULL) + RF_NU( + rf_ConfigureRAID1, + rf_MapSectorRAID1, rf_MapParityRAID1, NULL, + rf_IdentifyStripeRAID1, + rf_RAID1DagSelect, + rf_MapSIDToPSIDRAID1, + NULL, + NULL, + NULL, NULL, + rf_SubmitReconBufferRAID1, + rf_VerifyParityRAID1, + 1, + DefaultStates, + 0) + }, +#endif /* RF_INCLUDE_RAID1 > 0 */ + +#if RF_INCLUDE_RAID4 > 0 + /* RAID level 4 */ + {'4', "RAID Level 4", + RF_NK2(rf_MakeLayoutSpecificNULL, NULL) + RF_NU( + rf_ConfigureRAID4, + rf_MapSectorRAID4, rf_MapParityRAID4, NULL, + rf_IdentifyStripeRAID4, + rf_RaidFiveDagSelect, + rf_MapSIDToPSIDRAID4, + rf_GetDefaultHeadSepLimitRAID4, + rf_GetDefaultNumFloatingReconBuffersRAID4, + NULL, NULL, + rf_SubmitReconBufferBasic, + rf_VerifyParityBasic, + 1, + DefaultStates, + 0) + }, +#endif /* RF_INCLUDE_RAID4 > 0 */ + +#if RF_INCLUDE_RAID5 > 0 + /* RAID level 5 */ + {'5', "RAID Level 5", + RF_NK2(rf_MakeLayoutSpecificNULL, NULL) + RF_NU( + rf_ConfigureRAID5, + rf_MapSectorRAID5, rf_MapParityRAID5, NULL, + rf_IdentifyStripeRAID5, + rf_RaidFiveDagSelect, + rf_MapSIDToPSIDRAID5, + rf_GetDefaultHeadSepLimitRAID5, + rf_GetDefaultNumFloatingReconBuffersRAID5, + NULL, NULL, + rf_SubmitReconBufferBasic, + rf_VerifyParityBasic, + 1, + DefaultStates, + 0) + }, +#endif /* RF_INCLUDE_RAID5 > 0 */ + +#if RF_INCLUDE_EVENODD > 0 + /* Evenodd */ + {'E', "EvenOdd", + RF_NK2(rf_MakeLayoutSpecificNULL, NULL) + RF_NU( + rf_ConfigureEvenOdd, + rf_MapSectorRAID5, rf_MapParityEvenOdd, rf_MapEEvenOdd, + rf_IdentifyStripeEvenOdd, + rf_EODagSelect, + rf_MapSIDToPSIDRAID5, + NULL, + NULL, + NULL, NULL, + NULL, /* no reconstruction, yet */ + rf_VerifyParityEvenOdd, + 2, + DefaultStates, + 0) + }, +#endif /* RF_INCLUDE_EVENODD > 0 */ + +#if RF_INCLUDE_EVENODD > 0 + /* Declustered Evenodd */ + {'e', "Declustered EvenOdd", + RF_NK2(rf_MakeLayoutSpecificDeclustered, &distSpareNo) + RF_NU( + rf_ConfigureDeclusteredPQ, + rf_MapSectorDeclusteredPQ, rf_MapParityDeclusteredPQ, rf_MapQDeclusteredPQ, + rf_IdentifyStripeDeclusteredPQ, + rf_EODagSelect, + rf_MapSIDToPSIDRAID5, + rf_GetDefaultHeadSepLimitDeclustered, + rf_GetDefaultNumFloatingReconBuffersPQ, + NULL, NULL, + NULL, /* no reconstruction, yet */ + rf_VerifyParityEvenOdd, + 2, + DefaultStates, + 0) + }, +#endif /* RF_INCLUDE_EVENODD > 0 */ + +#if RF_INCLUDE_PARITYLOGGING > 0 + /* parity logging */ + {'L', "Parity logging", + RF_NK2(rf_MakeLayoutSpecificNULL, NULL) + RF_NU( + rf_ConfigureParityLogging, + rf_MapSectorParityLogging, rf_MapParityParityLogging, NULL, + rf_IdentifyStripeParityLogging, + rf_ParityLoggingDagSelect, + rf_MapSIDToPSIDParityLogging, + rf_GetDefaultHeadSepLimitParityLogging, + rf_GetDefaultNumFloatingReconBuffersParityLogging, + NULL, NULL, + rf_SubmitReconBufferBasic, + NULL, + 1, + DefaultStates, + 0) + }, +#endif /* 
RF_INCLUDE_PARITYLOGGING > 0 */ + + /* end-of-list marker */ + { '\0', NULL, + RF_NK2(NULL, NULL) + RF_NU( + NULL, + NULL, NULL, NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, NULL, + NULL, + NULL, + 0, + NULL, + 0) + } +}; + +RF_LayoutSW_t *rf_GetLayout(RF_ParityConfig_t parityConfig) +{ + RF_LayoutSW_t *p; + + /* look up the specific layout */ + for (p=&mapsw[0]; p->parityConfig; p++) + if (p->parityConfig == parityConfig) + break; + if (!p->parityConfig) + return(NULL); + RF_ASSERT(p->parityConfig == parityConfig); + return(p); +} + +#if RF_UTILITY == 0 +/***************************************************************************************** + * + * ConfigureLayout -- + * + * read the configuration file and set up the RAID layout parameters. After reading + * common params, invokes the layout-specific configuration routine to finish + * the configuration. + * + ****************************************************************************************/ +int rf_ConfigureLayout( + RF_ShutdownList_t **listp, + RF_Raid_t *raidPtr, + RF_Config_t *cfgPtr) +{ + RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); + RF_ParityConfig_t parityConfig; + RF_LayoutSW_t *p; + int retval; + + layoutPtr->sectorsPerStripeUnit = cfgPtr->sectPerSU; + layoutPtr->SUsPerPU = cfgPtr->SUsPerPU; + layoutPtr->SUsPerRU = cfgPtr->SUsPerRU; + parityConfig = cfgPtr->parityConfig; + + layoutPtr->stripeUnitsPerDisk = raidPtr->sectorsPerDisk / layoutPtr->sectorsPerStripeUnit; + + p = rf_GetLayout(parityConfig); + if (p == NULL) { + RF_ERRORMSG1("Unknown parity configuration '%c'", parityConfig); + return(EINVAL); + } + RF_ASSERT(p->parityConfig == parityConfig); + layoutPtr->map = p; + + /* initialize the specific layout */ + + retval = (p->Configure)(listp, raidPtr, cfgPtr); + + if (retval) + return(retval); + + layoutPtr->dataBytesPerStripe = layoutPtr->dataSectorsPerStripe << raidPtr->logBytesPerSector; + raidPtr->sectorsPerDisk = layoutPtr->stripeUnitsPerDisk * layoutPtr->sectorsPerStripeUnit; + + if (rf_forceNumFloatingReconBufs >= 0) { + raidPtr->numFloatingReconBufs = rf_forceNumFloatingReconBufs; + } + else { + raidPtr->numFloatingReconBufs = rf_GetDefaultNumFloatingReconBuffers(raidPtr); + } + + if (rf_forceHeadSepLimit >= 0) { + raidPtr->headSepLimit = rf_forceHeadSepLimit; + } + else { + raidPtr->headSepLimit = rf_GetDefaultHeadSepLimit(raidPtr); + } + + printf("RAIDFRAME: Configure (%s): total number of sectors is %lu (%lu MB)\n", + layoutPtr->map->configName, + (unsigned long)raidPtr->totalSectors, + (unsigned long)(raidPtr->totalSectors / 1024 * (1<<raidPtr->logBytesPerSector) / 1024)); + if (raidPtr->headSepLimit >= 0) { + printf("RAIDFRAME(%s): Using %ld floating recon bufs with head sep limit %ld\n", + layoutPtr->map->configName, (long)raidPtr->numFloatingReconBufs, (long)raidPtr->headSepLimit); + } + else { + printf("RAIDFRAME(%s): Using %ld floating recon bufs with no head sep limit\n", + layoutPtr->map->configName, (long)raidPtr->numFloatingReconBufs); + } + + return(0); +} + +/* typically there is a 1-1 mapping between stripes and parity stripes. + * however, the declustering code supports packing multiple stripes into + * a single parity stripe, so as to increase the size of the reconstruction + * unit without affecting the size of the stripe unit. This routine finds + * the parity stripe identifier associated with a stripe ID. 
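+ * (Editor's note, not part of the original source -- illustration only: in the common
+ * SUsPerPU == 1 case the routine below simply returns stripeID with *which_ru = 0; a
+ * declustered layout configured with, say, SUsPerPU = 4 would instead pack stripes 0..3
+ * into parity stripe 0 as reconstruction units 0..3, stripes 4..7 into parity stripe 1,
+ * and so on.)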
There is also + * a RaidAddressToParityStripeID macro in layout.h + */ +RF_StripeNum_t rf_MapStripeIDToParityStripeID(layoutPtr, stripeID, which_ru) + RF_RaidLayout_t *layoutPtr; + RF_StripeNum_t stripeID; + RF_ReconUnitNum_t *which_ru; +{ + RF_StripeNum_t parityStripeID; + + /* quick exit in the common case of SUsPerPU==1 */ + if ((layoutPtr->SUsPerPU == 1) || !layoutPtr->map->MapSIDToPSID) { + *which_ru = 0; + return(stripeID); + } + else { + (layoutPtr->map->MapSIDToPSID)(layoutPtr, stripeID, &parityStripeID, which_ru); + } + return(parityStripeID); +} +#endif /* RF_UTILITY == 0 */ diff --git a/sys/dev/raidframe/rf_layout.h b/sys/dev/raidframe/rf_layout.h new file mode 100644 index 00000000000..4259947f67f --- /dev/null +++ b/sys/dev/raidframe/rf_layout.h @@ -0,0 +1,493 @@ +/* $OpenBSD: rf_layout.h,v 1.1 1999/01/11 14:29:28 niklas Exp $ */ +/* $NetBSD: rf_layout.h,v 1.1 1998/11/13 04:20:30 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. 
+ */ + +/* rf_layout.h -- header file defining layout data structures + */ + +/* + * : + * Log: rf_layout.h,v + * Revision 1.50 1996/11/05 21:10:40 jimz + * failed pda generalization + * + * Revision 1.49 1996/07/29 14:05:12 jimz + * fix numPUs/numRUs confusion (everything is now numRUs) + * clean up some commenting, return values + * + * Revision 1.48 1996/07/22 19:52:16 jimz + * switched node params to RF_DagParam_t, a union of + * a 64-bit int and a void *, for better portability + * attempted hpux port, but failed partway through for + * lack of a single C compiler capable of compiling all + * source files + * + * Revision 1.47 1996/07/18 22:57:14 jimz + * port simulator to AIX + * + * Revision 1.46 1996/07/13 00:00:59 jimz + * sanitized generalized reconstruction architecture + * cleaned up head sep, rbuf problems + * + * Revision 1.45 1996/07/11 19:08:00 jimz + * generalize reconstruction mechanism + * allow raid1 reconstructs via copyback (done with array + * quiesced, not online, therefore not disk-directed) + * + * Revision 1.44 1996/06/19 22:23:01 jimz + * parity verification is now a layout-configurable thing + * not all layouts currently support it (correctly, anyway) + * + * Revision 1.43 1996/06/19 17:53:48 jimz + * move GetNumSparePUs, InstallSpareTable ops into layout switch + * + * Revision 1.42 1996/06/19 14:56:48 jimz + * move layout-specific config parsing hooks into RF_LayoutSW_t + * table in rf_layout.c + * + * Revision 1.41 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). + * + * Revision 1.40 1996/06/07 22:26:27 jimz + * type-ify which_ru (RF_ReconUnitNum_t) + * + * Revision 1.39 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.38 1996/06/03 23:28:26 jimz + * more bugfixes + * check in tree to sync for IPDS runs with current bugfixes + * there still may be a problem with threads in the script test + * getting I/Os stuck- not trivially reproducible (runs ~50 times + * in a row without getting stuck) + * + * Revision 1.37 1996/05/31 22:26:54 jimz + * fix a lot of mapping problems, memory allocation problems + * found some weird lock issues, fixed 'em + * more code cleanup + * + * Revision 1.36 1996/05/30 11:29:41 jimz + * Numerous bug fixes. Stripe lock release code disagreed with the taking code + * about when stripes should be locked (I made it consistent: no parity, no lock) + * There was a lot of extra serialization of I/Os which I've removed- a lot of + * it was to calculate values for the cache code, which is no longer with us. + * More types, function, macro cleanup. Added code to properly quiesce the array + * on shutdown. Made a lot of stuff array-specific which was (bogusly) general + * before. Fixed memory allocation, freeing bugs. 
+ * + * Revision 1.35 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.34 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.33 1996/05/24 04:28:55 jimz + * release cleanup ckpt + * + * Revision 1.32 1996/05/24 01:59:45 jimz + * another checkpoint in code cleanup for release + * time to sync kernel tree + * + * Revision 1.31 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.30 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.29 1995/12/01 19:16:19 root + * added copyright info + * + * Revision 1.28 1995/11/28 21:26:49 amiri + * defined a declustering flag RF_BD_DECLUSTERED + * + * Revision 1.27 1995/11/17 19:00:59 wvcii + * created MapQ entry in switch table + * added prototyping to MapParity + * + * Revision 1.26 1995/11/07 15:40:27 wvcii + * changed prototype of SeclectionFunc in mapsw + * function no longer returns numHdrSucc, numTermAnt + * + * Revision 1.25 1995/10/12 20:57:08 arw + * added lots of comments + * + * Revision 1.24 1995/10/12 16:04:08 jimz + * added config name to mapsw + * + * Revision 1.23 1995/07/26 03:28:31 robby + * intermediary checkin + * + * Revision 1.22 1995/07/10 20:51:08 robby + * added to the asm info for the virtual striping locks + * + * Revision 1.21 1995/07/10 16:57:47 robby + * updated alloclistelem struct to the correct struct name + * + * Revision 1.20 1995/07/08 20:06:11 rachad + * *** empty log message *** + * + * Revision 1.19 1995/07/08 18:05:39 rachad + * Linked up Claudsons code with the real cache + * + * Revision 1.18 1995/07/06 14:29:36 robby + * added defaults states list to the layout switch + * + * Revision 1.17 1995/06/23 13:40:14 robby + * updeated to prototypes in rf_layout.h + * + * Revision 1.16 1995/06/08 22:11:03 holland + * bug fixes related to mutiple-row arrays + * + * Revision 1.15 1995/05/24 21:43:23 wvcii + * added field numParityLogCol to RaidLayout + * + * Revision 1.14 95/05/02 22:46:53 holland + * minor code cleanups. + * + * Revision 1.13 1995/05/02 12:48:01 holland + * eliminated some unused code. + * + * Revision 1.12 1995/05/01 13:28:00 holland + * parity range locks, locking disk requests, recon+parityscan in kernel, etc. + * + * Revision 1.11 1995/03/15 20:01:17 holland + * added REMAP and DONT_REMAP + * + * Revision 1.10 1995/03/09 19:54:11 rachad + * Added suport for threadless simulator + * + * Revision 1.9 1995/03/03 21:48:58 holland + * minor changes. + * + * Revision 1.8 1995/03/01 20:25:48 holland + * kernelization changes + * + * Revision 1.7 1995/02/03 22:31:36 holland + * many changes related to kernelization + * + * Revision 1.6 1995/01/30 14:53:46 holland + * extensive changes related to making DoIO non-blocking + * + * Revision 1.5 1995/01/24 23:58:46 holland + * multi-way recon XOR, plus various small changes + * + * Revision 1.4 1995/01/04 19:28:35 holland + * corrected comments around mapsw + * + * Revision 1.3 1994/11/28 22:15:45 danner + * Added type field to the physdiskaddr struct. 
+ * + */ + +#ifndef _RF__RF_LAYOUT_H_ +#define _RF__RF_LAYOUT_H_ + +#include "rf_types.h" +#include "rf_archs.h" +#include "rf_alloclist.h" + +/***************************************************************************************** + * + * This structure identifies all layout-specific operations and parameters. + * + ****************************************************************************************/ + +typedef struct RF_LayoutSW_s { + RF_ParityConfig_t parityConfig; + char *configName; + +#ifndef KERNEL + /* layout-specific parsing */ + int (*MakeLayoutSpecific)(FILE *fp, RF_Config_t *cfgPtr, void *arg); + void *makeLayoutSpecificArg; +#endif /* !KERNEL */ + +#if RF_UTILITY == 0 + /* initialization routine */ + int (*Configure)(RF_ShutdownList_t **shutdownListp, RF_Raid_t *raidPtr, RF_Config_t *cfgPtr); + + /* routine to map RAID sector address -> physical (row, col, offset) */ + void (*MapSector)(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector, + RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap); + + /* routine to map RAID sector address -> physical (r,c,o) of parity unit */ + void (*MapParity)(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector, + RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap); + + /* routine to map RAID sector address -> physical (r,c,o) of Q unit */ + void (*MapQ)(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector, RF_RowCol_t *row, + RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap); + + /* routine to identify the disks comprising a stripe */ + void (*IdentifyStripe)(RF_Raid_t *raidPtr, RF_RaidAddr_t addr, + RF_RowCol_t **diskids, RF_RowCol_t *outRow); + + /* routine to select a dag */ + void (*SelectionFunc)(RF_Raid_t *raidPtr, RF_IoType_t type, + RF_AccessStripeMap_t *asmap, + RF_VoidFuncPtr *); +#if 0 + void (**createFunc)(RF_Raid_t *, + RF_AccessStripeMap_t *, + RF_DagHeader_t *, void *, + RF_RaidAccessFlags_t, + RF_AllocListElem_t *)); + +#endif + + /* map a stripe ID to a parity stripe ID. This is typically the identity mapping */ + void (*MapSIDToPSID)(RF_RaidLayout_t *layoutPtr, RF_StripeNum_t stripeID, + RF_StripeNum_t *psID, RF_ReconUnitNum_t *which_ru); + + /* get default head separation limit (may be NULL) */ + RF_HeadSepLimit_t (*GetDefaultHeadSepLimit)(RF_Raid_t *raidPtr); + + /* get default num recon buffers (may be NULL) */ + int (*GetDefaultNumFloatingReconBuffers)(RF_Raid_t *raidPtr); + + /* get number of spare recon units (may be NULL) */ + RF_ReconUnitCount_t (*GetNumSpareRUs)(RF_Raid_t *raidPtr); + + /* spare table installation (may be NULL) */ + int (*InstallSpareTable)(RF_Raid_t *raidPtr, RF_RowCol_t frow, RF_RowCol_t fcol); + + /* recon buffer submission function */ + int (*SubmitReconBuffer)(RF_ReconBuffer_t *rbuf, int keep_it, + int use_committed); + + /* + * verify that parity information for a stripe is correct + * see rf_parityscan.h for return vals + */ + int (*VerifyParity)(RF_Raid_t *raidPtr, RF_RaidAddr_t raidAddr, + RF_PhysDiskAddr_t *parityPDA, int correct_it, RF_RaidAccessFlags_t flags); + + /* number of faults tolerated by this mapping */ + int faultsTolerated; + + /* states to step through in an access. Must end with "LastState". 
+ * The default is DefaultStates in rf_layout.c */ + RF_AccessState_t *states; + + RF_AccessStripeMapFlags_t flags; +#endif /* RF_UTILITY == 0 */ +} RF_LayoutSW_t; + +/* enables remapping to spare location under dist sparing */ +#define RF_REMAP 1 +#define RF_DONT_REMAP 0 + +/* + * Flags values for RF_AccessStripeMapFlags_t + */ +#define RF_NO_STRIPE_LOCKS 0x0001 /* suppress stripe locks */ +#define RF_DISTRIBUTE_SPARE 0x0002 /* distribute spare space in archs that support it */ +#define RF_BD_DECLUSTERED 0x0004 /* declustering uses block designs */ + +/************************************************************************* + * + * this structure forms the layout component of the main Raid + * structure. It describes everything needed to define and perform + * the mapping of logical RAID addresses <-> physical disk addresses. + * + *************************************************************************/ +struct RF_RaidLayout_s { + /* configuration parameters */ + RF_SectorCount_t sectorsPerStripeUnit; /* number of sectors in one stripe unit */ + RF_StripeCount_t SUsPerPU; /* stripe units per parity unit */ + RF_StripeCount_t SUsPerRU; /* stripe units per reconstruction unit */ + + /* redundant-but-useful info computed from the above, used in all layouts */ + RF_StripeCount_t numStripe; /* total number of stripes in the array */ + RF_SectorCount_t dataSectorsPerStripe; + RF_StripeCount_t dataStripeUnitsPerDisk; + u_int bytesPerStripeUnit; + u_int dataBytesPerStripe; + RF_StripeCount_t numDataCol; /* number of SUs of data per stripe (name here is a la RAID4) */ + RF_StripeCount_t numParityCol; /* number of SUs of parity per stripe. Always 1 for now */ + RF_StripeCount_t numParityLogCol; /* number of SUs of parity log per stripe. Always 1 for now */ + RF_StripeCount_t stripeUnitsPerDisk; + + RF_LayoutSW_t *map; /* ptr to struct holding mapping fns and information */ + void *layoutSpecificInfo; /* ptr to a structure holding layout-specific params */ +}; + +/***************************************************************************************** + * + * The mapping code returns a pointer to a list of AccessStripeMap structures, which + * describes all the mapping information about an access. The list contains one + * AccessStripeMap structure per stripe touched by the access. Each element in the list + * contains a stripe identifier and a pointer to a list of PhysDiskAddr structuress. Each + * element in this latter list describes the physical location of a stripe unit accessed + * within the corresponding stripe. 
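+ *
+ * Purely as an illustration (the geometry below is hypothetical, not something the
+ * code requires): with 4 data columns and 64-sector stripe units a stripe holds 256
+ * data sectors, so a call such as
+ *
+ *     asm_h = rf_MapAccess(raidPtr, 224, 160, bufPtr, RF_DONT_REMAP);
+ *
+ * spans stripes 0 and 1 and returns a header whose stripeMap list has two elements:
+ * the first holds a PhysDiskAddr for sectors 224-255 (the tail of stripe unit 3), the
+ * second holds PhysDiskAddrs for stripe units 4 and 5 (sectors 256-383), and each
+ * element also carries the PhysDiskAddr(s) describing that stripe's parity unit.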
+ * + ****************************************************************************************/ + +#define RF_PDA_TYPE_DATA 0 +#define RF_PDA_TYPE_PARITY 1 +#define RF_PDA_TYPE_Q 2 + +struct RF_PhysDiskAddr_s { + RF_RowCol_t row,col; /* disk identifier */ + RF_SectorNum_t startSector; /* sector offset into the disk */ + RF_SectorCount_t numSector; /* number of sectors accessed */ + int type; /* used by higher levels: currently, data, parity, or q */ + caddr_t bufPtr; /* pointer to buffer supplying/receiving data */ + RF_RaidAddr_t raidAddress; /* raid address corresponding to this physical disk address */ + RF_PhysDiskAddr_t *next; +}; + +#define RF_MAX_FAILED_PDA RF_MAXCOL + +struct RF_AccessStripeMap_s { + RF_StripeNum_t stripeID; /* the stripe index */ + RF_RaidAddr_t raidAddress; /* the starting raid address within this stripe */ + RF_RaidAddr_t endRaidAddress; /* raid address one sector past the end of the access */ + RF_SectorCount_t totalSectorsAccessed; /* total num sectors identified in physInfo list */ + RF_StripeCount_t numStripeUnitsAccessed; /* total num elements in physInfo list */ + int numDataFailed; /* number of failed data disks accessed */ + int numParityFailed; /* number of failed parity disks accessed (0 or 1) */ + int numQFailed; /* number of failed Q units accessed (0 or 1) */ + RF_AccessStripeMapFlags_t flags; /* various flags */ +#if 0 + RF_PhysDiskAddr_t *failedPDA; /* points to the PDA that has failed */ + RF_PhysDiskAddr_t *failedPDAtwo; /* points to the second PDA that has failed, if any */ +#else + int numFailedPDAs; /* number of failed phys addrs */ + RF_PhysDiskAddr_t *failedPDAs[RF_MAX_FAILED_PDA]; /* array of failed phys addrs */ +#endif + RF_PhysDiskAddr_t *physInfo; /* a list of PhysDiskAddr structs */ + RF_PhysDiskAddr_t *parityInfo; /* list of physical addrs for the parity (P of P + Q ) */ + RF_PhysDiskAddr_t *qInfo; /* list of physical addrs for the Q of P + Q */ + RF_LockReqDesc_t lockReqDesc; /* used for stripe locking */ + RF_RowCol_t origRow; /* the original row: we may redirect the acc to a different row */ + RF_AccessStripeMap_t *next; +}; + +/* flag values */ +#define RF_ASM_REDIR_LARGE_WRITE 0x00000001 /* allows large-write creation code to redirect failed accs */ +#define RF_ASM_BAILOUT_DAG_USED 0x00000002 /* allows us to detect recursive calls to the bailout write dag */ +#define RF_ASM_FLAGS_LOCK_TRIED 0x00000004 /* we've acquired the lock on the first parity range in this parity stripe */ +#define RF_ASM_FLAGS_LOCK_TRIED2 0x00000008 /* we've acquired the lock on the 2nd parity range in this parity stripe */ +#define RF_ASM_FLAGS_FORCE_TRIED 0x00000010 /* we've done the force-recon call on this parity stripe */ +#define RF_ASM_FLAGS_RECON_BLOCKED 0x00000020 /* we blocked recon => we must unblock it later */ + +struct RF_AccessStripeMapHeader_s { + RF_StripeCount_t numStripes; /* total number of stripes touched by this acc */ + RF_AccessStripeMap_t *stripeMap; /* pointer to the actual map. Also used for making lists */ + RF_AccessStripeMapHeader_t *next; +}; + +/***************************************************************************************** + * + * various routines mapping addresses in the RAID address space. These work across + * all layouts. DON'T PUT ANY LAYOUT-SPECIFIC CODE HERE. 
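+ *
+ * A small worked example may help (the numbers are hypothetical: 64 sectors per
+ * stripe unit, 4 data columns, so dataSectorsPerStripe == 256).  For RAID address 300:
+ *
+ *     rf_RaidAddressToStripeUnitID(layoutPtr, 300)           == 300/64          == 4
+ *     rf_RaidAddressToStripeID(layoutPtr, 300)               == (300/64)/4      == 1
+ *     rf_StripeUnitOffset(layoutPtr, 300)                    == 300 % 64        == 44
+ *     rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, 300) == (300/64)*64     == 256
+ *     rf_RaidAddressOfNextStripeUnitBoundary(layoutPtr, 300) == ((300/64)+1)*64 == 320
+ *     rf_RaidAddressOfPrevStripeBoundary(layoutPtr, 300)     == 256
+ *     rf_RaidAddressOfNextStripeBoundary(layoutPtr, 300)     == 512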
+ * + ****************************************************************************************/ + +/* return the identifier of the stripe containing the given address */ +#define rf_RaidAddressToStripeID(_layoutPtr_, _addr_) \ + ( ((_addr_) / (_layoutPtr_)->sectorsPerStripeUnit) / (_layoutPtr_)->numDataCol ) + +/* return the raid address of the start of the indicates stripe ID */ +#define rf_StripeIDToRaidAddress(_layoutPtr_, _sid_) \ + ( ((_sid_) * (_layoutPtr_)->sectorsPerStripeUnit) * (_layoutPtr_)->numDataCol ) + +/* return the identifier of the stripe containing the given stripe unit id */ +#define rf_StripeUnitIDToStripeID(_layoutPtr_, _addr_) \ + ( (_addr_) / (_layoutPtr_)->numDataCol ) + +/* return the identifier of the stripe unit containing the given address */ +#define rf_RaidAddressToStripeUnitID(_layoutPtr_, _addr_) \ + ( ((_addr_) / (_layoutPtr_)->sectorsPerStripeUnit) ) + +/* return the RAID address of next stripe boundary beyond the given address */ +#define rf_RaidAddressOfNextStripeBoundary(_layoutPtr_, _addr_) \ + ( (((_addr_)/(_layoutPtr_)->dataSectorsPerStripe)+1) * (_layoutPtr_)->dataSectorsPerStripe ) + +/* return the RAID address of the start of the stripe containing the given address */ +#define rf_RaidAddressOfPrevStripeBoundary(_layoutPtr_, _addr_) \ + ( (((_addr_)/(_layoutPtr_)->dataSectorsPerStripe)+0) * (_layoutPtr_)->dataSectorsPerStripe ) + +/* return the RAID address of next stripe unit boundary beyond the given address */ +#define rf_RaidAddressOfNextStripeUnitBoundary(_layoutPtr_, _addr_) \ + ( (((_addr_)/(_layoutPtr_)->sectorsPerStripeUnit)+1L)*(_layoutPtr_)->sectorsPerStripeUnit ) + +/* return the RAID address of the start of the stripe unit containing RAID address _addr_ */ +#define rf_RaidAddressOfPrevStripeUnitBoundary(_layoutPtr_, _addr_) \ + ( (((_addr_)/(_layoutPtr_)->sectorsPerStripeUnit)+0)*(_layoutPtr_)->sectorsPerStripeUnit ) + +/* returns the offset into the stripe. used by RaidAddressStripeAligned */ +#define rf_RaidAddressStripeOffset(_layoutPtr_, _addr_) \ + ( (_addr_) % ((_layoutPtr_)->dataSectorsPerStripe) ) + +/* returns the offset into the stripe unit. */ +#define rf_StripeUnitOffset(_layoutPtr_, _addr_) \ + ( (_addr_) % ((_layoutPtr_)->sectorsPerStripeUnit) ) + +/* returns nonzero if the given RAID address is stripe-aligned */ +#define rf_RaidAddressStripeAligned( __layoutPtr__, __addr__ ) \ + ( rf_RaidAddressStripeOffset(__layoutPtr__, __addr__) == 0 ) + +/* returns nonzero if the given address is stripe-unit aligned */ +#define rf_StripeUnitAligned( __layoutPtr__, __addr__ ) \ + ( rf_StripeUnitOffset(__layoutPtr__, __addr__) == 0 ) + +/* convert an address expressed in RAID blocks to/from an addr expressed in bytes */ +#define rf_RaidAddressToByte(_raidPtr_, _addr_) \ + ( (_addr_) << ( (_raidPtr_)->logBytesPerSector ) ) + +#define rf_ByteToRaidAddress(_raidPtr_, _addr_) \ + ( (_addr_) >> ( (_raidPtr_)->logBytesPerSector ) ) + +/* convert a raid address to/from a parity stripe ID. Conversion to raid address is easy, + * since we're asking for the address of the first sector in the parity stripe. Conversion to a + * parity stripe ID is more complex, since stripes are not contiguously allocated in + * parity stripes. 
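+ *
+ * For instance (hypothetical numbers again: SUsPerPU == 2, numDataCol == 4,
+ * sectorsPerStripeUnit == 64), rf_ParityStripeIDToRaidAddress(layoutPtr, 3) yields
+ * 3 * 2 * 4 * 64 == 1536, the first sector of parity stripe 3, whereas the opposite
+ * direction goes through rf_MapStripeIDToParityStripeID() to recover both the parity
+ * stripe ID and the reconstruction unit number within it.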
+ */ +#define rf_RaidAddressToParityStripeID(_layoutPtr_, _addr_, _ru_num_) \ + rf_MapStripeIDToParityStripeID( (_layoutPtr_), rf_RaidAddressToStripeID( (_layoutPtr_), (_addr_) ), (_ru_num_) ) + +#define rf_ParityStripeIDToRaidAddress(_layoutPtr_, _psid_) \ + ( (_psid_) * (_layoutPtr_)->SUsPerPU * (_layoutPtr_)->numDataCol * (_layoutPtr_)->sectorsPerStripeUnit ) + +RF_LayoutSW_t *rf_GetLayout(RF_ParityConfig_t parityConfig); +int rf_ConfigureLayout(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr, + RF_Config_t *cfgPtr); +RF_StripeNum_t rf_MapStripeIDToParityStripeID(RF_RaidLayout_t *layoutPtr, + RF_StripeNum_t stripeID, RF_ReconUnitNum_t *which_ru); + +#endif /* !_RF__RF_LAYOUT_H_ */ diff --git a/sys/dev/raidframe/rf_map.c b/sys/dev/raidframe/rf_map.c new file mode 100644 index 00000000000..11a3262a3a8 --- /dev/null +++ b/sys/dev/raidframe/rf_map.c @@ -0,0 +1,976 @@ +/* $OpenBSD: rf_map.c,v 1.1 1999/01/11 14:29:28 niklas Exp $ */ +/* $NetBSD: rf_map.c,v 1.1 1998/11/13 04:20:31 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/************************************************************************** + * + * map.c -- main code for mapping RAID addresses to physical disk addresses + * + **************************************************************************/ + +/* + * : + * Log: rf_map.c,v + * Revision 1.53 1996/11/05 21:10:40 jimz + * failed pda generalization + * + * Revision 1.52 1996/08/20 19:58:39 jimz + * initialize numParityFailed and numQFailed to 0 in MarkFailuresInASMList + * + * Revision 1.51 1996/08/19 22:26:31 jimz + * add Chang's bugfixes for double-disk failures in MarkFailuresInASMList + * + * Revision 1.50 1996/08/19 21:38:06 jimz + * stripeOffset was uninitialized in CheckStripeForFailures + * + * Revision 1.49 1996/07/31 15:34:56 jimz + * evenodd changes; bugfixes for double-degraded archs, generalize + * some formerly PQ-only functions + * + * Revision 1.48 1996/07/27 23:36:08 jimz + * Solaris port of simulator + * + * Revision 1.47 1996/07/22 19:52:16 jimz + * switched node params to RF_DagParam_t, a union of + * a 64-bit int and a void *, for better portability + * attempted hpux port, but failed partway through for + * lack of a single C compiler capable of compiling all + * source files + * + * Revision 1.46 1996/06/10 12:50:57 jimz + * Add counters to freelists to track number of allocations, frees, + * grows, max size, etc. Adjust a couple sets of PRIME params based + * on the results. 
+ * + * Revision 1.45 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). + * + * Revision 1.44 1996/06/09 02:36:46 jimz + * lots of little crufty cleanup- fixup whitespace + * issues, comment #ifdefs, improve typing in some + * places (esp size-related) + * + * Revision 1.43 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.42 1996/06/05 18:06:02 jimz + * Major code cleanup. The Great Renaming is now done. + * Better modularity. Better typing. Fixed a bunch of + * synchronization bugs. Made a lot of global stuff + * per-desc or per-array. Removed dead code. + * + * Revision 1.41 1996/06/03 23:28:26 jimz + * more bugfixes + * check in tree to sync for IPDS runs with current bugfixes + * there still may be a problem with threads in the script test + * getting I/Os stuck- not trivially reproducible (runs ~50 times + * in a row without getting stuck) + * + * Revision 1.40 1996/05/31 22:26:54 jimz + * fix a lot of mapping problems, memory allocation problems + * found some weird lock issues, fixed 'em + * more code cleanup + * + * Revision 1.39 1996/05/30 23:22:16 jimz + * bugfixes of serialization, timing problems + * more cleanup + * + * Revision 1.38 1996/05/30 11:29:41 jimz + * Numerous bug fixes. Stripe lock release code disagreed with the taking code + * about when stripes should be locked (I made it consistent: no parity, no lock) + * There was a lot of extra serialization of I/Os which I've removed- a lot of + * it was to calculate values for the cache code, which is no longer with us. + * More types, function, macro cleanup. Added code to properly quiesce the array + * on shutdown. Made a lot of stuff array-specific which was (bogusly) general + * before. Fixed memory allocation, freeing bugs. 
+ * + * Revision 1.37 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.36 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.35 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.34 1996/05/20 16:14:45 jimz + * switch to rf_{mutex,cond}_{init,destroy} + * + * Revision 1.33 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.32 1996/05/17 00:51:47 jimz + * reformat for readability + * + * Revision 1.31 1996/05/16 23:06:26 jimz + * convert asmhdr to use RF_FREELIST stuff + * + * Revision 1.30 1996/05/16 19:09:42 jimz + * grow init asm freelist to 32 + * + * Revision 1.29 1996/05/16 15:27:55 jimz + * prime freelist pumps for asm and pda lists + * + * Revision 1.28 1996/05/02 14:58:35 jimz + * legibility cleanup + * + * Revision 1.27 1995/12/12 18:10:06 jimz + * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT + * fix 80-column brain damage in comments + * + * Revision 1.26 1995/12/01 19:25:06 root + * added copyright info + * + * Revision 1.25 1995/11/17 19:01:57 wvcii + * added call to MapQ in two fault tolerant case + * + * Revision 1.24 1995/11/17 15:10:53 wvcii + * fixed bug in ASMCheckStatus - ASSERT was using disk sector addresses + * rather than raidAddress + * + * Revision 1.23 1995/07/26 03:26:51 robby + * map the allocation and freeing routines for some stuff non-static + * + * Revision 1.22 1995/06/28 09:33:45 holland + * bug fixes related to dist sparing and multiple-row arrays + * + * Revision 1.21 1995/06/28 04:51:08 holland + * added some asserts against zero-length accesses + * + * Revision 1.20 1995/06/23 13:40:06 robby + * updeated to prototypes in rf_layout.h + * + */ + +#include "rf_types.h" +#include "rf_threadstuff.h" +#include "rf_raid.h" +#include "rf_general.h" +#include "rf_map.h" +#include "rf_freelist.h" +#include "rf_shutdown.h" +#include "rf_sys.h" + +static void rf_FreePDAList(RF_PhysDiskAddr_t *start, RF_PhysDiskAddr_t *end, int count); +static void rf_FreeASMList(RF_AccessStripeMap_t *start, RF_AccessStripeMap_t *end, + int count); + +/***************************************************************************************** + * + * MapAccess -- main 1st order mapping routine. + * + * Maps an access in the RAID address space to the corresponding set of physical disk + * addresses. The result is returned as a list of AccessStripeMap structures, one per + * stripe accessed. Each ASM structure contains a pointer to a list of PhysDiskAddr + * structures, which describe the physical locations touched by the user access. Note + * that this routine returns only static mapping information, i.e. the list of physical + * addresses returned does not necessarily identify the set of physical locations that + * will actually be read or written. + * + * The routine also maps the parity. The physical disk location returned always + * indicates the entire parity unit, even when only a subset of it is being accessed. + * This is because an access that is not stripe unit aligned but that spans a stripe + * unit boundary may require access two distinct portions of the parity unit, and we + * can't yet tell which portion(s) we'll actually need. 
We leave it up to the algorithm + * selection code to decide what subset of the parity unit to access. + * + * Note that addresses in the RAID address space must always be maintained as + * longs, instead of ints. + * + * This routine returns NULL if numBlocks is 0 + * + ****************************************************************************************/ + +RF_AccessStripeMapHeader_t *rf_MapAccess(raidPtr, raidAddress, numBlocks, buffer, remap) + RF_Raid_t *raidPtr; + RF_RaidAddr_t raidAddress; /* starting address in RAID address space */ + RF_SectorCount_t numBlocks; /* number of blocks in RAID address space to access */ + caddr_t buffer; /* buffer to supply/receive data */ + int remap; /* 1 => remap addresses to spare space */ +{ + RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); + RF_AccessStripeMapHeader_t *asm_hdr = NULL; + RF_AccessStripeMap_t *asm_list = NULL, *asm_p = NULL; + int faultsTolerated = layoutPtr->map->faultsTolerated; + RF_RaidAddr_t startAddress = raidAddress; /* we'll change raidAddress along the way */ + RF_RaidAddr_t endAddress = raidAddress + numBlocks; + RF_RaidDisk_t **disks = raidPtr->Disks; + + RF_PhysDiskAddr_t *pda_p, *pda_q; + RF_StripeCount_t numStripes = 0; + RF_RaidAddr_t stripeRealEndAddress, stripeEndAddress, nextStripeUnitAddress; + RF_RaidAddr_t startAddrWithinStripe, lastRaidAddr; + RF_StripeCount_t totStripes; + RF_StripeNum_t stripeID, lastSID, SUID, lastSUID; + RF_AccessStripeMap_t *asmList, *t_asm; + RF_PhysDiskAddr_t *pdaList, *t_pda; + + /* allocate all the ASMs and PDAs up front */ + lastRaidAddr = raidAddress + numBlocks - 1 ; + stripeID = rf_RaidAddressToStripeID(layoutPtr, raidAddress); + lastSID = rf_RaidAddressToStripeID(layoutPtr, lastRaidAddr); + totStripes = lastSID - stripeID + 1; + SUID = rf_RaidAddressToStripeUnitID(layoutPtr, raidAddress); + lastSUID = rf_RaidAddressToStripeUnitID(layoutPtr, lastRaidAddr); + + asmList = rf_AllocASMList(totStripes); + pdaList = rf_AllocPDAList(lastSUID - SUID + 1 + faultsTolerated * totStripes); /* may also need pda(s) per stripe for parity */ + + if (raidAddress+numBlocks > raidPtr->totalSectors) { + RF_ERRORMSG1("Unable to map access because offset (%d) was invalid\n", + (int)raidAddress); + return(NULL); + } + + if (rf_mapDebug) + rf_PrintRaidAddressInfo(raidPtr, raidAddress, numBlocks); + for (; raidAddress < endAddress; ) { + /* make the next stripe structure */ + RF_ASSERT(asmList); + t_asm = asmList; + asmList = asmList->next; + bzero((char *)t_asm, sizeof(RF_AccessStripeMap_t)); + if (!asm_p) + asm_list = asm_p = t_asm; + else { + asm_p->next = t_asm; + asm_p = asm_p->next; + } + numStripes++; + + /* map SUs from current location to the end of the stripe */ + asm_p->stripeID = /*rf_RaidAddressToStripeID(layoutPtr, raidAddress)*/ stripeID++; + stripeRealEndAddress = rf_RaidAddressOfNextStripeBoundary(layoutPtr, raidAddress); + stripeEndAddress = RF_MIN(endAddress,stripeRealEndAddress ); + asm_p->raidAddress = raidAddress; + asm_p->endRaidAddress = stripeEndAddress; + + /* map each stripe unit in the stripe */ + pda_p = NULL; + startAddrWithinStripe = raidAddress; /* Raid addr of start of portion of access that is within this stripe */ + for (; raidAddress < stripeEndAddress; ) { + RF_ASSERT(pdaList); + t_pda = pdaList; + pdaList = pdaList->next; + bzero((char *)t_pda, sizeof(RF_PhysDiskAddr_t)); + if (!pda_p) + asm_p->physInfo = pda_p = t_pda; + else { + pda_p->next = t_pda; + pda_p = pda_p->next; + } + + pda_p->type = RF_PDA_TYPE_DATA; + (layoutPtr->map->MapSector)(raidPtr, raidAddress, 
&(pda_p->row), &(pda_p->col), &(pda_p->startSector), remap); + + /* mark any failures we find. failedPDA is don't-care if there is more than one failure */ + pda_p->raidAddress = raidAddress; /* the RAID address corresponding to this physical disk address */ + nextStripeUnitAddress = rf_RaidAddressOfNextStripeUnitBoundary(layoutPtr, raidAddress); + pda_p->numSector = RF_MIN(endAddress, nextStripeUnitAddress) - raidAddress; + RF_ASSERT(pda_p->numSector != 0); + rf_ASMCheckStatus(raidPtr,pda_p,asm_p,disks,0); + pda_p->bufPtr = buffer + rf_RaidAddressToByte(raidPtr, (raidAddress - startAddress)); + asm_p->totalSectorsAccessed += pda_p->numSector; + asm_p->numStripeUnitsAccessed++; + asm_p->origRow = pda_p->row; /* redundant but harmless to do this in every loop iteration */ + + raidAddress = RF_MIN(endAddress, nextStripeUnitAddress); + } + + /* Map the parity. At this stage, the startSector and numSector fields + * for the parity unit are always set to indicate the entire parity unit. + * We may modify this after mapping the data portion. + */ + switch (faultsTolerated) + { + case 0: + break; + case 1: /* single fault tolerant */ + RF_ASSERT(pdaList); + t_pda = pdaList; + pdaList = pdaList->next; + bzero((char *)t_pda, sizeof(RF_PhysDiskAddr_t)); + pda_p = asm_p->parityInfo = t_pda; + pda_p->type = RF_PDA_TYPE_PARITY; + (layoutPtr->map->MapParity)(raidPtr, rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, startAddrWithinStripe), + &(pda_p->row), &(pda_p->col), &(pda_p->startSector), remap); + pda_p->numSector = layoutPtr->sectorsPerStripeUnit; + /* raidAddr may be needed to find unit to redirect to */ + pda_p->raidAddress = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, startAddrWithinStripe); + rf_ASMCheckStatus(raidPtr,pda_p,asm_p,disks,1); + rf_ASMParityAdjust(asm_p->parityInfo,startAddrWithinStripe,endAddress,layoutPtr,asm_p); + + break; + case 2: /* two fault tolerant */ + RF_ASSERT(pdaList && pdaList->next); + t_pda = pdaList; + pdaList = pdaList->next; + bzero((char *)t_pda, sizeof(RF_PhysDiskAddr_t)); + pda_p = asm_p->parityInfo = t_pda; + pda_p->type = RF_PDA_TYPE_PARITY; + t_pda = pdaList; + pdaList = pdaList->next; + bzero((char *)t_pda, sizeof(RF_PhysDiskAddr_t)); + pda_q = asm_p->qInfo = t_pda; + pda_q->type = RF_PDA_TYPE_Q; + (layoutPtr->map->MapParity)(raidPtr, rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, startAddrWithinStripe), + &(pda_p->row), &(pda_p->col), &(pda_p->startSector), remap); + (layoutPtr->map->MapQ)(raidPtr, rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, startAddrWithinStripe), + &(pda_q->row), &(pda_q->col), &(pda_q->startSector), remap); + pda_q->numSector = pda_p->numSector = layoutPtr->sectorsPerStripeUnit; + /* raidAddr may be needed to find unit to redirect to */ + pda_p->raidAddress = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, startAddrWithinStripe); + pda_q->raidAddress = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, startAddrWithinStripe); + /* failure mode stuff */ + rf_ASMCheckStatus(raidPtr,pda_p,asm_p,disks,1); + rf_ASMCheckStatus(raidPtr,pda_q,asm_p,disks,1); + rf_ASMParityAdjust(asm_p->parityInfo,startAddrWithinStripe,endAddress,layoutPtr,asm_p); + rf_ASMParityAdjust(asm_p->qInfo,startAddrWithinStripe,endAddress,layoutPtr,asm_p); + break; + } + } + RF_ASSERT(asmList == NULL && pdaList == NULL); + /* make the header structure */ + asm_hdr = rf_AllocAccessStripeMapHeader(); + RF_ASSERT(numStripes == totStripes); + asm_hdr->numStripes = numStripes; + asm_hdr->stripeMap = asm_list; + + if (rf_mapDebug) + 
rf_PrintAccessStripeMap(asm_hdr); + return(asm_hdr); +} + +/***************************************************************************************** + * This routine walks through an ASM list and marks the PDAs that have failed. + * It's called only when a disk failure causes an in-flight DAG to fail. + * The parity may consist of two components, but we want to use only one failedPDA + * pointer. Thus we set failedPDA to point to the first parity component, and rely + * on the rest of the code to do the right thing with this. + ****************************************************************************************/ + +void rf_MarkFailuresInASMList(raidPtr, asm_h) + RF_Raid_t *raidPtr; + RF_AccessStripeMapHeader_t *asm_h; +{ + RF_RaidDisk_t **disks = raidPtr->Disks; + RF_AccessStripeMap_t *asmap; + RF_PhysDiskAddr_t *pda; + + for (asmap = asm_h->stripeMap; asmap; asmap = asmap->next) { + asmap->numDataFailed = asmap->numParityFailed = asmap->numQFailed = 0; + asmap->numFailedPDAs = 0; + bzero((char *)asmap->failedPDAs, + RF_MAX_FAILED_PDA*sizeof(RF_PhysDiskAddr_t *)); + for (pda = asmap->physInfo; pda; pda=pda->next) { + if (RF_DEAD_DISK(disks[pda->row][pda->col].status)) { + printf("DEAD DISK BOGUSLY DETECTED!!\n"); + asmap->numDataFailed++; + asmap->failedPDAs[asmap->numFailedPDAs] = pda; + asmap->numFailedPDAs++; + } + } + pda = asmap->parityInfo; + if (pda && RF_DEAD_DISK(disks[pda->row][pda->col].status)) { + asmap->numParityFailed++; + asmap->failedPDAs[asmap->numFailedPDAs] = pda; + asmap->numFailedPDAs++; + } + pda = asmap->qInfo; + if (pda && RF_DEAD_DISK(disks[pda->row][pda->col].status)) { + asmap->numQFailed++; + asmap->failedPDAs[asmap->numFailedPDAs] = pda; + asmap->numFailedPDAs++; + } + } +} + +/***************************************************************************************** + * + * DuplicateASM -- duplicates an ASM and returns the new one + * + ****************************************************************************************/ +RF_AccessStripeMap_t *rf_DuplicateASM(asmap) + RF_AccessStripeMap_t *asmap; +{ + RF_AccessStripeMap_t *new_asm; + RF_PhysDiskAddr_t *pda, *new_pda, *t_pda; + + new_pda = NULL; + new_asm = rf_AllocAccessStripeMapComponent(); + bcopy((char *)asmap, (char *)new_asm, sizeof(RF_AccessStripeMap_t)); + new_asm->numFailedPDAs = 0; /* ??? 
*/ + new_asm->failedPDAs[0] = NULL; + new_asm->physInfo = NULL; + new_asm->parityInfo = NULL; + new_asm->next = NULL; + + for (pda = asmap->physInfo; pda; pda=pda->next) { /* copy the physInfo list */ + t_pda = rf_AllocPhysDiskAddr(); + bcopy((char *)pda, (char *)t_pda, sizeof(RF_PhysDiskAddr_t)); + t_pda->next = NULL; + if (!new_asm->physInfo) {new_asm->physInfo = t_pda; new_pda = t_pda;} + else {new_pda->next = t_pda; new_pda = new_pda->next;} + if (pda == asmap->failedPDAs[0]) + new_asm->failedPDAs[0] = t_pda; + } + for (pda = asmap->parityInfo; pda; pda=pda->next) { /* copy the parityInfo list */ + t_pda = rf_AllocPhysDiskAddr(); + bcopy((char *)pda, (char *)t_pda, sizeof(RF_PhysDiskAddr_t)); + t_pda->next = NULL; + if (!new_asm->parityInfo) {new_asm->parityInfo = t_pda; new_pda = t_pda;} + else {new_pda->next = t_pda; new_pda = new_pda->next;} + if (pda == asmap->failedPDAs[0]) + new_asm->failedPDAs[0] = t_pda; + } + return(new_asm); +} + +/***************************************************************************************** + * + * DuplicatePDA -- duplicates a PDA and returns the new one + * + ****************************************************************************************/ +RF_PhysDiskAddr_t *rf_DuplicatePDA(pda) + RF_PhysDiskAddr_t *pda; +{ + RF_PhysDiskAddr_t *new; + + new = rf_AllocPhysDiskAddr(); + bcopy((char *)pda, (char *)new, sizeof(RF_PhysDiskAddr_t)); + return(new); +} + +/***************************************************************************************** + * + * routines to allocate and free list elements. All allocation routines zero the + * structure before returning it. + * + * FreePhysDiskAddr is static. It should never be called directly, because + * FreeAccessStripeMap takes care of freeing the PhysDiskAddr list. + * + ****************************************************************************************/ + +static RF_FreeList_t *rf_asmhdr_freelist; +#define RF_MAX_FREE_ASMHDR 128 +#define RF_ASMHDR_INC 16 +#define RF_ASMHDR_INITIAL 32 + +static RF_FreeList_t *rf_asm_freelist; +#define RF_MAX_FREE_ASM 192 +#define RF_ASM_INC 24 +#define RF_ASM_INITIAL 64 + +static RF_FreeList_t *rf_pda_freelist; +#define RF_MAX_FREE_PDA 192 +#define RF_PDA_INC 24 +#define RF_PDA_INITIAL 64 + +/* called at shutdown time. 
So far, all that is necessary is to release all the free lists */ +static void rf_ShutdownMapModule(void *); +static void rf_ShutdownMapModule(ignored) + void *ignored; +{ + RF_FREELIST_DESTROY(rf_asmhdr_freelist,next,(RF_AccessStripeMapHeader_t *)); + RF_FREELIST_DESTROY(rf_pda_freelist,next,(RF_PhysDiskAddr_t *)); + RF_FREELIST_DESTROY(rf_asm_freelist,next,(RF_AccessStripeMap_t *)); +} + +int rf_ConfigureMapModule(listp) + RF_ShutdownList_t **listp; +{ + int rc; + + RF_FREELIST_CREATE(rf_asmhdr_freelist, RF_MAX_FREE_ASMHDR, + RF_ASMHDR_INC, sizeof(RF_AccessStripeMapHeader_t)); + if (rf_asmhdr_freelist == NULL) { + return(ENOMEM); + } + RF_FREELIST_CREATE(rf_asm_freelist, RF_MAX_FREE_ASM, + RF_ASM_INC, sizeof(RF_AccessStripeMap_t)); + if (rf_asm_freelist == NULL) { + RF_FREELIST_DESTROY(rf_asmhdr_freelist,next,(RF_AccessStripeMapHeader_t *)); + return(ENOMEM); + } + RF_FREELIST_CREATE(rf_pda_freelist, RF_MAX_FREE_PDA, + RF_PDA_INC, sizeof(RF_PhysDiskAddr_t)); + if (rf_pda_freelist == NULL) { + RF_FREELIST_DESTROY(rf_asmhdr_freelist,next,(RF_AccessStripeMapHeader_t *)); + RF_FREELIST_DESTROY(rf_pda_freelist,next,(RF_PhysDiskAddr_t *)); + return(ENOMEM); + } + + rc = rf_ShutdownCreate(listp, rf_ShutdownMapModule, NULL); + if (rc) { + RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n", __FILE__, + __LINE__, rc); + rf_ShutdownMapModule(NULL); + return(rc); + } + + RF_FREELIST_PRIME(rf_asmhdr_freelist, RF_ASMHDR_INITIAL,next, + (RF_AccessStripeMapHeader_t *)); + RF_FREELIST_PRIME(rf_asm_freelist, RF_ASM_INITIAL,next, + (RF_AccessStripeMap_t *)); + RF_FREELIST_PRIME(rf_pda_freelist, RF_PDA_INITIAL,next, + (RF_PhysDiskAddr_t *)); + + return(0); +} + +RF_AccessStripeMapHeader_t *rf_AllocAccessStripeMapHeader() +{ + RF_AccessStripeMapHeader_t *p; + + RF_FREELIST_GET(rf_asmhdr_freelist,p,next,(RF_AccessStripeMapHeader_t *)); + bzero((char *)p, sizeof(RF_AccessStripeMapHeader_t)); + + return(p); +} + + +void rf_FreeAccessStripeMapHeader(p) + RF_AccessStripeMapHeader_t *p; +{ + RF_FREELIST_FREE(rf_asmhdr_freelist,p,next); +} + +RF_PhysDiskAddr_t *rf_AllocPhysDiskAddr() +{ + RF_PhysDiskAddr_t *p; + + RF_FREELIST_GET(rf_pda_freelist,p,next,(RF_PhysDiskAddr_t *)); + bzero((char *)p, sizeof(RF_PhysDiskAddr_t)); + + return(p); +} + +/* allocates a list of PDAs, locking the free list only once + * when we have to call calloc, we do it one component at a time to simplify + * the process of freeing the list at program shutdown. This should not be + * much of a performance hit, because it should be very infrequently executed. + */ +RF_PhysDiskAddr_t *rf_AllocPDAList(count) + int count; +{ + RF_PhysDiskAddr_t *p = NULL; + + RF_FREELIST_GET_N(rf_pda_freelist,p,next,(RF_PhysDiskAddr_t *),count); + return(p); +} + +void rf_FreePhysDiskAddr(p) + RF_PhysDiskAddr_t *p; +{ + RF_FREELIST_FREE(rf_pda_freelist,p,next); +} + +static void rf_FreePDAList(l_start, l_end, count) + RF_PhysDiskAddr_t *l_start, *l_end; /* pointers to start and end of list */ + int count; /* number of elements in list */ +{ + RF_FREELIST_FREE_N(rf_pda_freelist,l_start,next,(RF_PhysDiskAddr_t *),count); +} + +RF_AccessStripeMap_t *rf_AllocAccessStripeMapComponent() +{ + RF_AccessStripeMap_t *p; + + RF_FREELIST_GET(rf_asm_freelist,p,next,(RF_AccessStripeMap_t *)); + bzero((char *)p, sizeof(RF_AccessStripeMap_t)); + + return(p); +} + +/* this is essentially identical to AllocPDAList. I should combine the two. 
+ * when we have to call calloc, we do it one component at a time to simplify + * the process of freeing the list at program shutdown. This should not be + * much of a performance hit, because it should be very infrequently executed. + */ +RF_AccessStripeMap_t *rf_AllocASMList(count) + int count; +{ + RF_AccessStripeMap_t *p = NULL; + + RF_FREELIST_GET_N(rf_asm_freelist,p,next,(RF_AccessStripeMap_t *),count); + return(p); +} + +void rf_FreeAccessStripeMapComponent(p) + RF_AccessStripeMap_t *p; +{ + RF_FREELIST_FREE(rf_asm_freelist,p,next); +} + +static void rf_FreeASMList(l_start, l_end, count) + RF_AccessStripeMap_t *l_start, *l_end; + int count; +{ + RF_FREELIST_FREE_N(rf_asm_freelist,l_start,next,(RF_AccessStripeMap_t *),count); +} + +void rf_FreeAccessStripeMap(hdr) + RF_AccessStripeMapHeader_t *hdr; +{ + RF_AccessStripeMap_t *p, *pt = NULL; + RF_PhysDiskAddr_t *pdp, *trailer, *pdaList = NULL, *pdaEnd = NULL; + int count = 0, t, asm_count = 0; + + for (p = hdr->stripeMap; p; p=p->next) { + + /* link the 3 pda lists into the accumulating pda list */ + + if (!pdaList) pdaList = p->qInfo; else pdaEnd->next = p->qInfo; + for (trailer=NULL,pdp=p->qInfo; pdp; ) {trailer = pdp; pdp=pdp->next; count++;} + if (trailer) pdaEnd = trailer; + + if (!pdaList) pdaList = p->parityInfo; else pdaEnd->next = p->parityInfo; + for (trailer=NULL,pdp=p->parityInfo; pdp; ) {trailer = pdp; pdp=pdp->next; count++;} + if (trailer) pdaEnd = trailer; + + if (!pdaList) pdaList = p->physInfo; else pdaEnd->next = p->physInfo; + for (trailer=NULL,pdp=p->physInfo; pdp; ) {trailer = pdp; pdp=pdp->next; count++;} + if (trailer) pdaEnd = trailer; + + pt = p; + asm_count++; + } + + /* debug only */ + for (t=0,pdp=pdaList; pdp; pdp=pdp->next) + t++; + RF_ASSERT(t == count); + + if (pdaList) + rf_FreePDAList(pdaList, pdaEnd, count); + rf_FreeASMList(hdr->stripeMap, pt, asm_count); + rf_FreeAccessStripeMapHeader(hdr); +} + +/* We can't use the large write optimization if there are any failures in the stripe. + * In the declustered layout, there is no way to immediately determine what disks + * constitute a stripe, so we actually have to hunt through the stripe looking for failures. + * The reason we map the parity instead of just using asm->parityInfo->col is because + * the latter may have been already redirected to a spare drive, which would + * mess up the computation of the stripe offset. + * + * ASSUMES AT MOST ONE FAILURE IN THE STRIPE. + */ +int rf_CheckStripeForFailures(raidPtr, asmap) + RF_Raid_t *raidPtr; + RF_AccessStripeMap_t *asmap; +{ + RF_RowCol_t trow, tcol, prow, pcol, *diskids, row, i; + RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; + RF_StripeCount_t stripeOffset; + int numFailures; + RF_RaidAddr_t sosAddr; + RF_SectorNum_t diskOffset, poffset; + RF_RowCol_t testrow; + + /* quick out in the fault-free case. 
*/ + RF_LOCK_MUTEX(raidPtr->mutex); + numFailures = raidPtr->numFailures; + RF_UNLOCK_MUTEX(raidPtr->mutex); + if (numFailures == 0) return(0); + + sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress); + row = asmap->physInfo->row; + (layoutPtr->map->IdentifyStripe)(raidPtr, asmap->raidAddress, &diskids, &testrow); + (layoutPtr->map->MapParity)(raidPtr, asmap->raidAddress, &prow, &pcol, &poffset, 0); /* get pcol */ + + /* this need not be true if we've redirected the access to a spare in another row + RF_ASSERT(row == testrow); + */ + stripeOffset = 0; + for (i=0; i<layoutPtr->numDataCol+layoutPtr->numParityCol; i++) { + if (diskids[i] != pcol) { + if (RF_DEAD_DISK(raidPtr->Disks[testrow][diskids[i]].status)) { + if (raidPtr->status[testrow] != rf_rs_reconstructing) + return(1); + RF_ASSERT(raidPtr->reconControl[testrow]->fcol == diskids[i]); + layoutPtr->map->MapSector(raidPtr, + sosAddr + stripeOffset * layoutPtr->sectorsPerStripeUnit, + &trow, &tcol, &diskOffset, 0); + RF_ASSERT( (trow == testrow) && (tcol == diskids[i]) ); + if (!rf_CheckRUReconstructed(raidPtr->reconControl[testrow]->reconMap, diskOffset)) + return(1); + asmap->flags |= RF_ASM_REDIR_LARGE_WRITE; + return(0); + } + stripeOffset++; + } + } + return(0); +} + +/* + return the number of failed data units in the stripe. +*/ + +int rf_NumFailedDataUnitsInStripe(raidPtr, asmap) + RF_Raid_t *raidPtr; + RF_AccessStripeMap_t *asmap; +{ + RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; + RF_RowCol_t trow, tcol, row, i; + RF_SectorNum_t diskOffset; + RF_RaidAddr_t sosAddr; + int numFailures; + + /* quick out in the fault-free case. */ + RF_LOCK_MUTEX(raidPtr->mutex); + numFailures = raidPtr->numFailures; + RF_UNLOCK_MUTEX(raidPtr->mutex); + if (numFailures == 0) return(0); + numFailures = 0; + + sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress); + row = asmap->physInfo->row; + for (i=0; i<layoutPtr->numDataCol; i++) + { + (layoutPtr->map->MapSector)(raidPtr, sosAddr + i * layoutPtr->sectorsPerStripeUnit, + &trow, &tcol, &diskOffset, 0); + if (RF_DEAD_DISK(raidPtr->Disks[trow][tcol].status)) + numFailures++; + } + + return numFailures; +} + + +/***************************************************************************************** + * + * debug routines + * + ****************************************************************************************/ + +void rf_PrintAccessStripeMap(asm_h) + RF_AccessStripeMapHeader_t *asm_h; +{ + rf_PrintFullAccessStripeMap(asm_h, 0); +} + +void rf_PrintFullAccessStripeMap(asm_h, prbuf) + RF_AccessStripeMapHeader_t *asm_h; + int prbuf; /* flag to print buffer pointers */ +{ + int i; + RF_AccessStripeMap_t *asmap = asm_h->stripeMap; + RF_PhysDiskAddr_t *p; + printf("%d stripes total\n", (int)asm_h->numStripes); + for (; asmap; asmap = asmap->next) { + /* printf("Num failures: %d\n",asmap->numDataFailed); */ + /* printf("Num sectors: %d\n",(int)asmap->totalSectorsAccessed); */ + printf("Stripe %d (%d sectors), failures: %d data, %d parity: ", + (int) asmap->stripeID, + (int) asmap->totalSectorsAccessed, + (int) asmap->numDataFailed, + (int) asmap->numParityFailed); + if (asmap->parityInfo) { + printf("Parity [r%d c%d s%d-%d", asmap->parityInfo->row, asmap->parityInfo->col, + (int)asmap->parityInfo->startSector, + (int)(asmap->parityInfo->startSector + + asmap->parityInfo->numSector - 1)); + if (prbuf) printf(" b0x%lx",(unsigned long) asmap->parityInfo->bufPtr); + if (asmap->parityInfo->next) { + printf(", r%d c%d s%d-%d", asmap->parityInfo->next->row, + 
asmap->parityInfo->next->col, + (int) asmap->parityInfo->next->startSector, + (int)(asmap->parityInfo->next->startSector + + asmap->parityInfo->next->numSector - 1)); + if (prbuf) printf(" b0x%lx",(unsigned long) asmap->parityInfo->next->bufPtr); + RF_ASSERT(asmap->parityInfo->next->next == NULL); + } + printf("]\n\t"); + } + for (i=0,p=asmap->physInfo; p; p=p->next,i++) { + printf("SU r%d c%d s%d-%d ", p->row, p->col, (int)p->startSector, + (int)(p->startSector + p->numSector - 1)); + if (prbuf) printf("b0x%lx ", (unsigned long) p->bufPtr); + if (i && !(i&1)) printf("\n\t"); + } + printf("\n"); + p = asm_h->stripeMap->failedPDAs[0]; + if (asm_h->stripeMap->numDataFailed + asm_h->stripeMap->numParityFailed > 1) printf("[multiple failures]\n"); + else if (asm_h->stripeMap->numDataFailed + asm_h->stripeMap->numParityFailed > 0) + printf("\t[Failed PDA: r%d c%d s%d-%d]\n",p->row, p->col, + (int)p->startSector, (int)(p->startSector + p->numSector-1)); + } +} + +void rf_PrintRaidAddressInfo(raidPtr, raidAddr, numBlocks) + RF_Raid_t *raidPtr; + RF_RaidAddr_t raidAddr; + RF_SectorCount_t numBlocks; +{ + RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; + RF_RaidAddr_t ra, sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, raidAddr); + + printf("Raid addrs of SU boundaries from start of stripe to end of access:\n\t"); + for (ra = sosAddr; ra <= raidAddr + numBlocks; ra += layoutPtr->sectorsPerStripeUnit) { + printf("%d (0x%x), ",(int)ra, (int)ra); + } + printf("\n"); + printf("Offset into stripe unit: %d (0x%x)\n", + (int)(raidAddr % layoutPtr->sectorsPerStripeUnit), + (int)(raidAddr % layoutPtr->sectorsPerStripeUnit)); +} + +/* + given a parity descriptor and the starting address within a stripe, + range restrict the parity descriptor to touch only the correct stuff. +*/ +void rf_ASMParityAdjust( + RF_PhysDiskAddr_t *toAdjust, + RF_StripeNum_t startAddrWithinStripe, + RF_SectorNum_t endAddress, + RF_RaidLayout_t *layoutPtr, + RF_AccessStripeMap_t *asm_p) +{ + RF_PhysDiskAddr_t *new_pda; + + /* when we're accessing only a portion of one stripe unit, we want the parity descriptor + * to identify only the chunk of parity associated with the data. When the access spans + * exactly one stripe unit boundary and is less than a stripe unit in size, it uses two disjoint + * regions of the parity unit. When an access spans more than one stripe unit boundary, it + * uses all of the parity unit. + * + * To better handle the case where stripe units are small, we may eventually want to change + * the 2nd case so that if the SU size is below some threshold, we just read/write the whole + * thing instead of breaking it up into two accesses. 
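+ *
+ * As a concrete (and purely hypothetical) illustration with 64-sector stripe units:
+ * an access with startAddrWithinStripe == 40 and endAddress == 80 touches two stripe
+ * units but only 40 sectors in total, so the second case below splits the parity
+ * descriptor: the first piece gets startSector += 40 and numSector == 64 - 40 == 24,
+ * and the newly allocated second piece gets numSector == 80 - 64 == 16.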
+ */ + if (asm_p->numStripeUnitsAccessed == 1) + { + int x = (startAddrWithinStripe % layoutPtr->sectorsPerStripeUnit); + toAdjust->startSector += x; + toAdjust->raidAddress += x; + toAdjust->numSector = asm_p->physInfo->numSector; + RF_ASSERT(toAdjust->numSector != 0); + } + else + if (asm_p->numStripeUnitsAccessed == 2 && asm_p->totalSectorsAccessed < layoutPtr->sectorsPerStripeUnit) + { + int x = (startAddrWithinStripe % layoutPtr->sectorsPerStripeUnit); + + /* create a second pda and copy the parity map info into it */ + RF_ASSERT(toAdjust->next == NULL); + new_pda = toAdjust->next = rf_AllocPhysDiskAddr(); + *new_pda = *toAdjust; /* structure assignment */ + new_pda->next = NULL; + + /* adjust the start sector & number of blocks for the first parity pda */ + toAdjust->startSector += x; + toAdjust->raidAddress += x; + toAdjust->numSector = rf_RaidAddressOfNextStripeUnitBoundary(layoutPtr, startAddrWithinStripe) - startAddrWithinStripe; + RF_ASSERT(toAdjust->numSector != 0); + + /* adjust the second pda */ + new_pda->numSector = endAddress - rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, endAddress); + /*new_pda->raidAddress = rf_RaidAddressOfNextStripeUnitBoundary(layoutPtr, toAdjust->raidAddress);*/ + RF_ASSERT(new_pda->numSector != 0); + } +} + +/* + Check if a disk has been spared or failed. If spared, + redirect the I/O. + If it has been failed, record it in the asm pointer. + Fourth arg is whether data or parity. +*/ +void rf_ASMCheckStatus( + RF_Raid_t *raidPtr, + RF_PhysDiskAddr_t *pda_p, + RF_AccessStripeMap_t *asm_p, + RF_RaidDisk_t **disks, + int parity) +{ + RF_DiskStatus_t dstatus; + RF_RowCol_t frow, fcol; + + dstatus = disks[pda_p->row][pda_p->col].status; + + if (dstatus == rf_ds_spared) { + /* if the disk has been spared, redirect access to the spare */ + frow = pda_p->row; fcol = pda_p->col; + pda_p->row = disks[frow][fcol].spareRow; + pda_p->col = disks[frow][fcol].spareCol; + } + else if (dstatus == rf_ds_dist_spared) { + /* ditto if disk has been spared to dist spare space */ + RF_RowCol_t or = pda_p->row, oc=pda_p->col; + RF_SectorNum_t oo = pda_p->startSector; + + if (pda_p -> type == RF_PDA_TYPE_DATA) + raidPtr->Layout.map->MapSector(raidPtr, pda_p->raidAddress, &pda_p->row, &pda_p->col, &pda_p->startSector, RF_REMAP); + else + raidPtr->Layout.map->MapParity(raidPtr, pda_p->raidAddress, &pda_p->row, &pda_p->col, &pda_p->startSector, RF_REMAP); + + if (rf_mapDebug) { + printf("Redirected r %d c %d o %d -> r%d c %d o %d\n",or,oc,(int)oo, + pda_p->row,pda_p->col,(int)pda_p->startSector); + } + } else if (RF_DEAD_DISK(dstatus)) { + /* if the disk is inaccessible, mark the failure */ + if (parity) + asm_p->numParityFailed++; + else { + asm_p->numDataFailed++; +#if 0 + /* XXX Do we really want this spewing out on the console? 
GO */ + printf("DATA_FAILED!\n"); +#endif + } + asm_p->failedPDAs[asm_p->numFailedPDAs] = pda_p; + asm_p->numFailedPDAs++; +#if 0 + switch (asm_p->numParityFailed + asm_p->numDataFailed) + { + case 1: + asm_p->failedPDAs[0] = pda_p; + break; + case 2: + asm_p->failedPDAs[1] = pda_p; + default: + break; + } +#endif + } + /* the redirected access should never span a stripe unit boundary */ + RF_ASSERT(rf_RaidAddressToStripeUnitID(&raidPtr->Layout,pda_p->raidAddress) == + rf_RaidAddressToStripeUnitID(&raidPtr->Layout,pda_p->raidAddress + pda_p->numSector -1)); + RF_ASSERT(pda_p->col != -1); +} diff --git a/sys/dev/raidframe/rf_map.h b/sys/dev/raidframe/rf_map.h new file mode 100644 index 00000000000..827de180b51 --- /dev/null +++ b/sys/dev/raidframe/rf_map.h @@ -0,0 +1,134 @@ +/* $OpenBSD: rf_map.h,v 1.1 1999/01/11 14:29:29 niklas Exp $ */ +/* $NetBSD: rf_map.h,v 1.1 1998/11/13 04:20:31 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* rf_map.h */ + +/* : + * Log: rf_map.h,v + * Revision 1.9 1996/07/22 19:52:16 jimz + * switched node params to RF_DagParam_t, a union of + * a 64-bit int and a void *, for better portability + * attempted hpux port, but failed partway through for + * lack of a single C compiler capable of compiling all + * source files + * + * Revision 1.8 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). + * + * Revision 1.7 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.6 1996/05/31 22:26:54 jimz + * fix a lot of mapping problems, memory allocation problems + * found some weird lock issues, fixed 'em + * more code cleanup + * + * Revision 1.5 1996/05/30 11:29:41 jimz + * Numerous bug fixes. Stripe lock release code disagreed with the taking code + * about when stripes should be locked (I made it consistent: no parity, no lock) + * There was a lot of extra serialization of I/Os which I've removed- a lot of + * it was to calculate values for the cache code, which is no longer with us. + * More types, function, macro cleanup. Added code to properly quiesce the array + * on shutdown. Made a lot of stuff array-specific which was (bogusly) general + * before. Fixed memory allocation, freeing bugs. 
+ * + * Revision 1.4 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.3 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.2 1995/12/01 19:25:14 root + * added copyright info + * + */ + +#ifndef _RF__RF_MAP_H_ +#define _RF__RF_MAP_H_ + +#include "rf_types.h" +#include "rf_alloclist.h" +#include "rf_raid.h" + +/* mapping structure allocation and free routines */ +RF_AccessStripeMapHeader_t *rf_MapAccess(RF_Raid_t *raidPtr, + RF_RaidAddr_t raidAddress, RF_SectorCount_t numBlocks, + caddr_t buffer, int remap); + +void rf_MarkFailuresInASMList(RF_Raid_t *raidPtr, + RF_AccessStripeMapHeader_t *asm_h); + +RF_AccessStripeMap_t *rf_DuplicateASM(RF_AccessStripeMap_t *asmap); + +RF_PhysDiskAddr_t *rf_DuplicatePDA(RF_PhysDiskAddr_t *pda); + +int rf_ConfigureMapModule(RF_ShutdownList_t **listp); + +RF_AccessStripeMapHeader_t *rf_AllocAccessStripeMapHeader(void); + +void rf_FreeAccessStripeMapHeader(RF_AccessStripeMapHeader_t *p); + +RF_PhysDiskAddr_t *rf_AllocPhysDiskAddr(void); + +RF_PhysDiskAddr_t *rf_AllocPDAList(int count); + +void rf_FreePhysDiskAddr(RF_PhysDiskAddr_t *p); + +RF_AccessStripeMap_t *rf_AllocAccessStripeMapComponent(void); + +RF_AccessStripeMap_t *rf_AllocASMList(int count); + +void rf_FreeAccessStripeMapComponent(RF_AccessStripeMap_t *p); + +void rf_FreeAccessStripeMap(RF_AccessStripeMapHeader_t *hdr); + +int rf_CheckStripeForFailures(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap); + +int rf_NumFailedDataUnitsInStripe(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap); + +void rf_PrintAccessStripeMap(RF_AccessStripeMapHeader_t *asm_h); + +void rf_PrintFullAccessStripeMap(RF_AccessStripeMapHeader_t *asm_h, int prbuf); + +void rf_PrintRaidAddressInfo(RF_Raid_t *raidPtr, RF_RaidAddr_t raidAddr, + RF_SectorCount_t numBlocks); + +void rf_ASMParityAdjust(RF_PhysDiskAddr_t *toAdjust, + RF_StripeNum_t startAddrWithinStripe, RF_SectorNum_t endAddress, + RF_RaidLayout_t *layoutPtr, RF_AccessStripeMap_t *asm_p); + +void rf_ASMCheckStatus(RF_Raid_t *raidPtr, RF_PhysDiskAddr_t *pda_p, + RF_AccessStripeMap_t *asm_p, RF_RaidDisk_t **disks, int parity); + +#endif /* !_RF__RF_MAP_H_ */ diff --git a/sys/dev/raidframe/rf_mcpair.c b/sys/dev/raidframe/rf_mcpair.c new file mode 100644 index 00000000000..4ed3a187b1c --- /dev/null +++ b/sys/dev/raidframe/rf_mcpair.c @@ -0,0 +1,200 @@ +/* $OpenBSD: rf_mcpair.c,v 1.1 1999/01/11 14:29:29 niklas Exp $ */ +/* $NetBSD: rf_mcpair.c,v 1.1 1998/11/13 04:20:31 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Jim Zelenka + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 
+ * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* rf_mcpair.c + * an mcpair is a structure containing a mutex and a condition variable. + * it's used to block the current thread until some event occurs. + */ + +/* : + * Log: rf_mcpair.c,v + * Revision 1.16 1996/06/19 22:23:01 jimz + * parity verification is now a layout-configurable thing + * not all layouts currently support it (correctly, anyway) + * + * Revision 1.15 1996/06/17 03:18:04 jimz + * include shutdown.h for macroized ShutdownCreate + * + * Revision 1.14 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). + * + * Revision 1.13 1996/06/05 18:06:02 jimz + * Major code cleanup. The Great Renaming is now done. + * Better modularity. Better typing. Fixed a bunch of + * synchronization bugs. Made a lot of global stuff + * per-desc or per-array. Removed dead code. + * + * Revision 1.12 1996/06/02 17:31:48 jimz + * Moved a lot of global stuff into array structure, where it belongs. + * Fixed up paritylogging, pss modules in this manner. Some general + * code cleanup. Removed lots of dead code, some dead files. + * + * Revision 1.11 1996/05/30 23:22:16 jimz + * bugfixes of serialization, timing problems + * more cleanup + * + * Revision 1.10 1996/05/20 16:15:22 jimz + * switch to rf_{mutex,cond}_{init,destroy} + * + * Revision 1.9 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.8 1996/05/16 16:04:42 jimz + * convert to return-val on FREELIST init + * + * Revision 1.7 1996/05/16 14:47:21 jimz + * rewrote to use RF_FREELIST + * + * Revision 1.6 1995/12/01 19:25:43 root + * added copyright info + * + */ + +#include "rf_types.h" +#include "rf_threadstuff.h" +#include "rf_mcpair.h" +#include "rf_debugMem.h" +#include "rf_freelist.h" +#include "rf_shutdown.h" + +#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL) +#include <sys/proc.h> +#endif + +static RF_FreeList_t *rf_mcpair_freelist; + +#define RF_MAX_FREE_MCPAIR 128 +#define RF_MCPAIR_INC 16 +#define RF_MCPAIR_INITIAL 24 + +static int init_mcpair(RF_MCPair_t *); +static void clean_mcpair(RF_MCPair_t *); +static void rf_ShutdownMCPair(void *); + + + +static int init_mcpair(t) + RF_MCPair_t *t; +{ + int rc; + + rc = rf_mutex_init(&t->mutex); + if (rc) { + RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__, + __LINE__, rc); + return(rc); + } + rc = rf_cond_init(&t->cond); + if (rc) { + RF_ERRORMSG3("Unable to init cond file %s line %d rc=%d\n", __FILE__, + __LINE__, rc); + rf_mutex_destroy(&t->mutex); + return(rc); + } + return(0); +} + +static void clean_mcpair(t) + RF_MCPair_t *t; +{ + rf_mutex_destroy(&t->mutex); + rf_cond_destroy(&t->cond); +} + +static void rf_ShutdownMCPair(ignored) + void *ignored; +{ + RF_FREELIST_DESTROY_CLEAN(rf_mcpair_freelist,next,(RF_MCPair_t *),clean_mcpair); +} + +int rf_ConfigureMCPair(listp) + RF_ShutdownList_t **listp; +{ + int rc; + + RF_FREELIST_CREATE(rf_mcpair_freelist, RF_MAX_FREE_MCPAIR, + RF_MCPAIR_INC, 
sizeof(RF_MCPair_t)); + rc = rf_ShutdownCreate(listp, rf_ShutdownMCPair, NULL); + if (rc) { + RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n", + __FILE__, __LINE__, rc); + rf_ShutdownMCPair(NULL); + return(rc); + } + RF_FREELIST_PRIME_INIT(rf_mcpair_freelist, RF_MCPAIR_INITIAL,next, + (RF_MCPair_t *),init_mcpair); + return(0); +} + +RF_MCPair_t *rf_AllocMCPair() +{ + RF_MCPair_t *t; + + RF_FREELIST_GET_INIT(rf_mcpair_freelist,t,next,(RF_MCPair_t *),init_mcpair); + if (t) { + t->flag = 0; + t->next = NULL; + } + return(t); +} + +void rf_FreeMCPair(t) + RF_MCPair_t *t; +{ + RF_FREELIST_FREE_CLEAN(rf_mcpair_freelist,t,next,clean_mcpair); +} + +/* the callback function used to wake you up when you use an mcpair to wait for something */ +void rf_MCPairWakeupFunc(mcpair) + RF_MCPair_t *mcpair; +{ + RF_LOCK_MUTEX(mcpair->mutex); + mcpair->flag = 1; +#if 0 +printf("MCPairWakeupFunc called!\n"); +#endif +#ifdef KERNEL + wakeup(&(mcpair->flag)); /* XXX Does this do anything useful!! GO */ + /* + * XXX Looks like the following is needed to truly get the + * functionality they were looking for here... This could be a + * side-effect of my using a tsleep in the Net- and OpenBSD port + * though... XXX + */ +#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL) + wakeup(&(mcpair->cond)); /* XXX XXX XXX GO */ +#endif +#else /* KERNEL */ + RF_SIGNAL_COND(mcpair->cond); +#endif /* KERNEL */ + RF_UNLOCK_MUTEX(mcpair->mutex); +} diff --git a/sys/dev/raidframe/rf_mcpair.h b/sys/dev/raidframe/rf_mcpair.h new file mode 100644 index 00000000000..852b85ad041 --- /dev/null +++ b/sys/dev/raidframe/rf_mcpair.h @@ -0,0 +1,62 @@ +/* $OpenBSD: rf_mcpair.h,v 1.1 1999/01/11 14:29:29 niklas Exp $ */ +/* $NetBSD: rf_mcpair.h,v 1.1 1998/11/13 04:20:31 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. 
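For reference, the mcpair idiom above (a flag guarded by a mutex, plus a condition variable used to block until the flag is set) can be sketched in user space with POSIX threads. This is only an analogue of rf_AllocMCPair/RF_WAIT_MCPAIR/rf_MCPairWakeupFunc, not RAIDframe code: the names mcpair_wait and mcpair_wakeup are hypothetical, and pthread_cond_wait/pthread_cond_signal stand in for the kernel's tsleep()/wakeup().

	/*
	 * Illustrative analogue only.  One thread blocks on the pair until
	 * another sets the flag and signals the condition variable.
	 */
	#include <pthread.h>
	#include <stdio.h>

	struct mcpair {
		pthread_mutex_t mutex;
		pthread_cond_t  cond;
		int             flag;	/* 0 = event not yet signalled */
	};

	static void
	mcpair_wait(struct mcpair *mcp)
	{
		pthread_mutex_lock(&mcp->mutex);
		while (mcp->flag == 0)	/* re-check guards against spurious wakeups */
			pthread_cond_wait(&mcp->cond, &mcp->mutex);
		pthread_mutex_unlock(&mcp->mutex);
	}

	static void
	mcpair_wakeup(struct mcpair *mcp)
	{
		pthread_mutex_lock(&mcp->mutex);
		mcp->flag = 1;			/* record the event... */
		pthread_cond_signal(&mcp->cond); /* ...then wake the sleeper */
		pthread_mutex_unlock(&mcp->mutex);
	}

	static void *
	worker(void *arg)
	{
		mcpair_wakeup(arg);		/* pretend the I/O just completed */
		return NULL;
	}

	int
	main(void)
	{
		struct mcpair mcp = { PTHREAD_MUTEX_INITIALIZER,
		    PTHREAD_COND_INITIALIZER, 0 };
		pthread_t t;

		pthread_create(&t, NULL, worker, &mcp);
		mcpair_wait(&mcp);		/* blocks until worker signals */
		pthread_join(t, NULL);
		printf("event received\n");
		return 0;
	}

The kernel code above differs mainly in that the waiter sleeps with tsleep() on the address of the flag, the waker sets the flag and calls wakeup() on that same address, and callers are expected to re-check the flag themselves after RF_WAIT_MCPAIR returns.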
+ */ + +/* rf_mcpair.h + * see comments in rf_mcpair.c + */ + +#ifndef _RF__RF_MCPAIR_H_ +#define _RF__RF_MCPAIR_H_ + +#include "rf_types.h" +#include "rf_threadstuff.h" + +struct RF_MCPair_s { + RF_DECLARE_MUTEX(mutex) + RF_DECLARE_COND(cond) + int flag; + RF_MCPair_t *next; +}; + +#ifdef KERNEL +#if !defined(__NetBSD__) && !defined(__OpenBSD__) +#define RF_WAIT_MCPAIR(_mcp) mpsleep(&((_mcp)->flag), PZERO, "mcpair", 0, (void *) simple_lock_addr((_mcp)->mutex), MS_LOCK_SIMPLE) +#else +#define RF_WAIT_MCPAIR(_mcp) tsleep(&((_mcp)->flag), PRIBIO | PCATCH, "mcpair", 0) +#endif +#else /* KERNEL */ +#define RF_WAIT_MCPAIR(_mcp) RF_WAIT_COND((_mcp)->cond, (_mcp)->mutex) +#endif /* KERNEL */ + +int rf_ConfigureMCPair(RF_ShutdownList_t **listp); +RF_MCPair_t *rf_AllocMCPair(void); +void rf_FreeMCPair(RF_MCPair_t *t); +void rf_MCPairWakeupFunc(RF_MCPair_t *t); + +#endif /* !_RF__RF_MCPAIR_H_ */ diff --git a/sys/dev/raidframe/rf_memchunk.c b/sys/dev/raidframe/rf_memchunk.c new file mode 100644 index 00000000000..568eb90e12d --- /dev/null +++ b/sys/dev/raidframe/rf_memchunk.c @@ -0,0 +1,256 @@ +/* $OpenBSD: rf_memchunk.c,v 1.1 1999/01/11 14:29:30 niklas Exp $ */ +/* $NetBSD: rf_memchunk.c,v 1.1 1998/11/13 04:20:31 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/********************************************************************************* + * rf_memchunk.c + * + * experimental code. I've found that the malloc and free calls in the DAG + * creation code are very expensive. Since for any given workload the DAGs + * created for different accesses are likely to be similar to each other, the + * amount of memory used for any given DAG data structure is likely to be one + * of a small number of values. For example, in UNIX, all reads and writes will + * be less than 8k and will not span stripe unit boundaries. Thus in the absence + * of failure, the only DAGs that will ever get created are single-node reads + * and single-stripe-unit atomic read-modify-writes. So, I'm very likely to + * be continually asking for chunks of memory equal to the sizes of these two + * DAGs. + * + * This leads to the idea of holding on to these chunks of memory when the DAG is + * freed and then, when a new DAG is created, trying to find such a chunk before + * calling malloc. + * + * the "chunk list" is a list of lists. Each header node contains a size value + * and a pointer to a list of chunk descriptors, each of which holds a pointer + * to a chunk of memory of the indicated size. 
+ * + * There is currently no way to purge memory out of the chunk list. My + * initial thought on this is to have a low-priority thread that wakes up every + * 1 or 2 seconds, purges all the chunks with low reuse counts, and sets all + * the reuse counts to zero. + * + * This whole idea may be bad, since malloc may be able to do this more efficiently. + * It's worth a try, though, and it can be turned off by setting useMemChunks to 0. + * + ********************************************************************************/ + +/* : + * Log: rf_memchunk.c,v + * Revision 1.17 1996/07/27 23:36:08 jimz + * Solaris port of simulator + * + * Revision 1.16 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). + * + * Revision 1.15 1996/06/09 02:36:46 jimz + * lots of little crufty cleanup- fixup whitespace + * issues, comment #ifdefs, improve typing in some + * places (esp size-related) + * + * Revision 1.14 1996/06/05 18:06:02 jimz + * Major code cleanup. The Great Renaming is now done. + * Better modularity. Better typing. Fixed a bunch of + * synchronization bugs. Made a lot of global stuff + * per-desc or per-array. Removed dead code. + * + * Revision 1.13 1996/06/02 17:31:48 jimz + * Moved a lot of global stuff into array structure, where it belongs. + * Fixed up paritylogging, pss modules in this manner. Some general + * code cleanup. Removed lots of dead code, some dead files. + * + * Revision 1.12 1996/05/30 23:22:16 jimz + * bugfixes of serialization, timing problems + * more cleanup + * + * Revision 1.11 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.10 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.9 1996/05/20 16:15:45 jimz + * switch to rf_{mutex,cond}_{init,destroy} + * + * Revision 1.8 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.7 1995/12/01 19:26:07 root + * added copyright info + * + */ + +#include "rf_types.h" +#include "rf_threadstuff.h" +#include "rf_debugMem.h" +#include "rf_memchunk.h" +#include "rf_general.h" +#include "rf_options.h" +#include "rf_shutdown.h" +#include "rf_sys.h" + +typedef struct RF_ChunkHdr_s RF_ChunkHdr_t; +struct RF_ChunkHdr_s { + int size; + RF_ChunkDesc_t *list; + RF_ChunkHdr_t *next; +}; + +static RF_ChunkHdr_t *chunklist, *chunk_hdr_free_list; +static RF_ChunkDesc_t *chunk_desc_free_list; +RF_DECLARE_STATIC_MUTEX(chunkmutex) + +static void rf_ShutdownMemChunk(void *); +static RF_ChunkDesc_t *NewMemChunk(int, char *); + + +static void rf_ShutdownMemChunk(ignored) + void *ignored; +{ + RF_ChunkDesc_t *pt, *p; + RF_ChunkHdr_t *hdr, *ht; + + if (rf_memChunkDebug) + printf("Chunklist:\n"); + for (hdr = chunklist; hdr;) { + for (p = hdr->list; p; ) { + if (rf_memChunkDebug) + printf("Size %d reuse count %d\n",p->size, p->reuse_count); + pt = p; p=p->next; + RF_Free(pt->buf, pt->size); + RF_Free(pt, sizeof(*pt)); + } + ht = hdr; hdr=hdr->next; + RF_Free(ht, sizeof(*ht)); + } + + rf_mutex_destroy(&chunkmutex); +} + +int rf_ConfigureMemChunk(listp) + RF_ShutdownList_t **listp; +{ + int rc; + + chunklist = NULL; + chunk_hdr_free_list = NULL; + chunk_desc_free_list = NULL; + rc = 
rf_mutex_init(&chunkmutex); + if (rc) { + RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__, + __LINE__, rc); + } + rc = rf_ShutdownCreate(listp, rf_ShutdownMemChunk, NULL); + if (rc) { + RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n", __FILE__, + __LINE__, rc); + rf_mutex_destroy(&chunkmutex); + } + return(rc); +} + +/* called to get a chunk descriptor for a newly-allocated chunk of memory + * MUTEX MUST BE LOCKED + * + * free list is not currently used + */ +static RF_ChunkDesc_t *NewMemChunk(size, buf) + int size; + char *buf; +{ + RF_ChunkDesc_t *p; + + if (chunk_desc_free_list) {p = chunk_desc_free_list; chunk_desc_free_list = p->next;} + else RF_Malloc(p, sizeof(RF_ChunkDesc_t), (RF_ChunkDesc_t *)); + p->size = size; + p->buf = buf; + p->next = NULL; + p->reuse_count = 0; + return(p); +} + +/* looks for a chunk of memory of acceptable size. If none, allocates one and returns + * a chunk descriptor for it, but does not install anything in the list. This is done + * when the chunk is released. + */ +RF_ChunkDesc_t *rf_GetMemChunk(size) + int size; +{ + RF_ChunkHdr_t *hdr = chunklist; + RF_ChunkDesc_t *p = NULL; + char *buf; + + RF_LOCK_MUTEX(chunkmutex); + for (hdr = chunklist; hdr; hdr = hdr->next) if (hdr->size >= size) { + p = hdr->list; + if (p) { + hdr->list = p->next; + p->next = NULL; + p->reuse_count++; + } + break; + } + if (!p) { + RF_Malloc(buf, size, (char *)); + p = NewMemChunk(size, buf); + } + RF_UNLOCK_MUTEX(chunkmutex); + (void) bzero(p->buf, size); + return(p); +} + +void rf_ReleaseMemChunk(chunk) + RF_ChunkDesc_t *chunk; +{ + RF_ChunkHdr_t *hdr, *ht = NULL, *new; + + RF_LOCK_MUTEX(chunkmutex); + for (hdr = chunklist; hdr && hdr->size < chunk->size; ht=hdr,hdr=hdr->next); + if (hdr && hdr->size == chunk->size) { + chunk->next = hdr->list; + hdr->list = chunk; + } + else { + RF_Malloc(new, sizeof(RF_ChunkHdr_t), (RF_ChunkHdr_t *)); + new->size = chunk->size; new->list = chunk; chunk->next = NULL; + if (ht) { + new->next = ht->next; + ht->next = new; + } + else { + new->next = hdr; + chunklist = new; + } + } + RF_UNLOCK_MUTEX(chunkmutex); +} diff --git a/sys/dev/raidframe/rf_memchunk.h b/sys/dev/raidframe/rf_memchunk.h new file mode 100644 index 00000000000..7d41f57eae5 --- /dev/null +++ b/sys/dev/raidframe/rf_memchunk.h @@ -0,0 +1,80 @@ +/* $OpenBSD: rf_memchunk.h,v 1.1 1999/01/11 14:29:30 niklas Exp $ */ +/* $NetBSD: rf_memchunk.h,v 1.1 1998/11/13 04:20:31 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* header file for rf_memchunk.c. 
See comments there */ + +/* : + * Log: rf_memchunk.h,v + * Revision 1.8 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). + * + * Revision 1.7 1996/06/02 17:31:48 jimz + * Moved a lot of global stuff into array structure, where it belongs. + * Fixed up paritylogging, pss modules in this manner. Some general + * code cleanup. Removed lots of dead code, some dead files. + * + * Revision 1.6 1996/05/24 04:28:55 jimz + * release cleanup ckpt + * + * Revision 1.5 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.4 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.3 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.2 1995/12/01 19:25:56 root + * added copyright info + * + */ + +#ifndef _RF__RF_MEMCHUNK_H_ +#define _RF__RF_MEMCHUNK_H_ + +#include "rf_types.h" + +struct RF_ChunkDesc_s { + int size; + int reuse_count; + char *buf; + RF_ChunkDesc_t *next; +}; + +int rf_ConfigureMemChunk(RF_ShutdownList_t **listp); +RF_ChunkDesc_t *rf_GetMemChunk(int size); +void rf_ReleaseMemChunk(RF_ChunkDesc_t *chunk); + +#endif /* !_RF__RF_MEMCHUNK_H_ */ diff --git a/sys/dev/raidframe/rf_netbsd.h b/sys/dev/raidframe/rf_netbsd.h new file mode 100644 index 00000000000..6d66769112b --- /dev/null +++ b/sys/dev/raidframe/rf_netbsd.h @@ -0,0 +1,98 @@ +/* $OpenBSD: rf_netbsd.h,v 1.1 1999/01/11 14:29:30 niklas Exp $ */ +/* $NetBSD: rf_netbsd.h,v 1.1 1998/11/13 04:20:31 oster Exp $ */ +/*- + * Copyright (c) 1996, 1997, 1998 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Greg Oster + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the NetBSD + * Foundation, Inc. and its contributors. + * 4. Neither the name of The NetBSD Foundation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
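As a companion to the memchunk code above: the core idea is a cache of freed buffers, indexed by size, that is consulted before calling the allocator again. The sketch below is a stripped-down user-space analogue with hypothetical names, a crude fixed-bucket policy instead of the ordered RF_ChunkHdr_t list used by rf_GetMemChunk/rf_ReleaseMemChunk, and no locking; it is included only to make the mechanism concrete, not as a rendering of the actual implementation.

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	#define NBUCKETS	64
	#define BUCKETSZ	64	/* each bucket covers a 64-byte size range */

	struct chunk {
		size_t        size;	/* usable payload size */
		struct chunk *next;
		char          buf[];	/* payload follows the header */
	};

	static struct chunk *cache[NBUCKETS];	/* freed chunks, by size bucket */

	static struct chunk *
	chunk_get(size_t size)
	{
		size_t b = size / BUCKETSZ;
		struct chunk *c;

		if (b < NBUCKETS && (c = cache[b]) != NULL && c->size >= size) {
			cache[b] = c->next;		/* reuse a cached chunk */
		} else {
			c = malloc(sizeof(*c) + size);	/* fall back to the allocator */
			if (c == NULL)
				return NULL;
			c->size = size;
		}
		memset(c->buf, 0, size);		/* like the bzero() in rf_GetMemChunk */
		return c;
	}

	static void
	chunk_release(struct chunk *c)
	{
		size_t b = c->size / BUCKETSZ;

		if (b < NBUCKETS) {			/* keep it for the next caller */
			c->next = cache[b];
			cache[b] = c;
		} else {
			free(c);			/* too big to be worth caching */
		}
	}

	int
	main(void)
	{
		struct chunk *c1 = chunk_get(128);
		struct chunk *c2;

		chunk_release(c1);
		c2 = chunk_get(128);	/* same size, so it comes back from the cache */
		printf("reused: %s\n", c1 == c2 ? "yes" : "no");
		chunk_release(c2);
		return 0;
	}

The real code instead keeps a singly linked list of RF_ChunkHdr_t headers ordered by size, protects it with chunkmutex, and counts reuses per chunk, which is what the proposed (but not implemented) purging thread would have keyed off.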
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/*- + * Copyright (c) 1996, 1997, 1998 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Jason R. Thorpe. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the NetBSD + * Foundation, Inc. and its contributors. + * 4. Neither the name of The NetBSD Foundation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + + +#ifndef _RF__RF_NETBSDSTUFF_H_ +#define _RF__RF_NETBSDSTUFF_H_ + +#include <sys/fcntl.h> +#include <sys/systm.h> +#include <sys/namei.h> +#include <sys/vnode.h> + + + +#if defined(__NetBSD__) && defined(_KERNEL) +struct raidcinfo { + struct vnode *ci_vp; /* device's vnode */ + dev_t ci_dev; /* XXX: device's dev_t */ +#if 0 + size_t ci_size; /* size */ + char *ci_path; /* path to component */ + size_t ci_pathlen; /* length of component path */ +#endif +}; +#endif + +#endif /* _RF__RF_NETBSDSTUFF_H_ */ diff --git a/sys/dev/raidframe/rf_netbsdkintf.c b/sys/dev/raidframe/rf_netbsdkintf.c new file mode 100644 index 00000000000..ad6673541cc --- /dev/null +++ b/sys/dev/raidframe/rf_netbsdkintf.c @@ -0,0 +1,2048 @@ +/* $OpenBSD: rf_netbsdkintf.c,v 1.1 1999/01/11 14:29:30 niklas Exp $ */ +/* $NetBSD: rf_netbsdkintf.c,v 1.5 1998/12/22 20:03:14 oster Exp $ */ +/*- + * Copyright (c) 1996, 1997, 1998 The NetBSD Foundation, Inc. + * All rights reserved. 
+ * + * This code is derived from software contributed to The NetBSD Foundation + * by Greg Oster; Jason R. Thorpe. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the NetBSD + * Foundation, Inc. and its contributors. + * 4. Neither the name of The NetBSD Foundation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright (c) 1988 University of Utah. + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Systems Programming Group of the University of Utah Computer + * Science Department. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: Utah $Hdr: cd.c 1.6 90/11/28$ + * + * @(#)cd.c 8.2 (Berkeley) 11/16/93 + */ + + + + +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Authors: Mark Holland, Jim Zelenka + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/*********************************************************** + * + * rf_kintf.c -- the kernel interface routines for RAIDframe + * + ***********************************************************/ +/* + * : + * Log: rf_kintf.c,v + * Revision 1.57 1996/07/19 16:12:20 jimz + * remove addition of protectedSectors in InitBP- it's already + * done in the diskqueue code + * + * Revision 1.56 1996/07/17 21:00:58 jimz + * clean up timer interface, tracing + * + * Revision 1.55 1996/06/17 03:00:54 jimz + * Change RAIDFRAME_GET_INFO interface to do its own copyout() + * (because size of device config structure now exceeds 8k) + * + * Revision 1.54 1996/06/09 02:36:46 jimz + * lots of little crufty cleanup- fixup whitespace + * issues, comment #ifdefs, improve typing in some + * places (esp size-related) + * + * Revision 1.53 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.52 1996/06/06 17:28:08 jimz + * track sector number of last I/O dequeued + * + * Revision 1.51 1996/06/05 18:06:02 jimz + * Major code cleanup. The Great Renaming is now done. + * Better modularity. Better typing. Fixed a bunch of + * synchronization bugs. Made a lot of global stuff + * per-desc or per-array. Removed dead code. + * + * Revision 1.50 1996/06/03 23:28:26 jimz + * more bugfixes + * check in tree to sync for IPDS runs with current bugfixes + * there still may be a problem with threads in the script test + * getting I/Os stuck- not trivially reproducible (runs ~50 times + * in a row without getting stuck) + * + * Revision 1.49 1996/06/02 17:31:48 jimz + * Moved a lot of global stuff into array structure, where it belongs. + * Fixed up paritylogging, pss modules in this manner. Some general + * code cleanup. Removed lots of dead code, some dead files. 
+ * + * Revision 1.48 1996/05/31 22:26:54 jimz + * fix a lot of mapping problems, memory allocation problems + * found some weird lock issues, fixed 'em + * more code cleanup + * + * Revision 1.47 1996/05/30 12:59:18 jimz + * make etimer happier, more portable + * + * Revision 1.46 1996/05/30 11:29:41 jimz + * Numerous bug fixes. Stripe lock release code disagreed with the taking code + * about when stripes should be locked (I made it consistent: no parity, no lock) + * There was a lot of extra serialization of I/Os which I've removed- a lot of + * it was to calculate values for the cache code, which is no longer with us. + * More types, function, macro cleanup. Added code to properly quiesce the array + * on shutdown. Made a lot of stuff array-specific which was (bogusly) general + * before. Fixed memory allocation, freeing bugs. + * + * Revision 1.45 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.44 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.43 1996/05/24 01:59:45 jimz + * another checkpoint in code cleanup for release + * time to sync kernel tree + * + * Revision 1.42 1996/05/23 22:17:54 jimz + * fix sector size hardcoding problems + * + * Revision 1.41 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.40 1996/05/23 13:18:07 jimz + * tracing_mutex -> rf_tracing_mutex + * + * Revision 1.39 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.38 1996/05/20 16:15:32 jimz + * switch to rf_{mutex,cond}_{init,destroy} + * + * Revision 1.37 1996/05/10 16:23:47 jimz + * RF_offset -> RF_Offset + * + * Revision 1.36 1996/05/08 21:01:24 jimz + * fixed up enum type names that were conflicting with other + * enums and function names (ie, "panic") + * future naming trends will be towards RF_ and rf_ for + * everything raidframe-related + * + * Revision 1.35 1996/05/03 19:10:48 jimz + * change sanity checking for bogus I/Os to return more appropriate + * values (to make some user-level utilities happer with RAIDframe) + * + * Revision 1.34 1996/05/02 22:17:00 jimz + * When using DKUSAGE, send a bogus IO after configuring to let DKUSAGE know + * that we exist. This will let user-level programs doing group stats on the + * RF device function without error before RF gets its first IO + * + * Changed rf_device_config devs and spares fields to RF_RaidDisk_t + * + * Inc numOutstanding for the disk queue in rf_DispatchKernelIO if + * type is IO_TYPE_NOP. I'm not sure this is right, but it seems to be, + * because the disk IO completion routine wants to dec it, and doesn't + * care if there was no such IO. + * + * Revision 1.33 1996/05/02 15:05:44 jimz + * for now, rf_DoAccessKernel will reject non-sector-sized I/Os + * eventually, it should do something more clever... + * (and do it in DoAccess(), not just DoAccessKernel()) + * + * Revision 1.32 1996/05/01 16:28:39 jimz + * get rid of uses of ccmn_ functions + * + * Revision 1.31 1996/05/01 15:42:17 jimz + * ccmn_* memory management is on the way out. This is an archival checkpoint- + * both the old and new code are in place (all the ccmn_ calls are #if 0). After + * this, the ccmn_ code will no longer appear. 
+ * + * Revision 1.30 1996/04/22 15:53:13 jimz + * MAX_RAIDS -> NRAIDFRAME + * + * Revision 1.29 1995/12/12 18:10:06 jimz + * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT + * fix 80-column brain damage in comments + * + * Revision 1.28 1995/12/01 19:11:01 root + * added copyright info + * + * Revision 1.27 1995/11/28 18:56:40 wvcii + * disabled buffer copy in rf_write + * + * Revision 1.26 1995/10/06 16:37:08 jimz + * get struct bufs from ubc, not cam + * copy all write data, and operate on copy + * (temporary hack to get around dags in PQ that want + * to Xor into user write buffers) + * + * Revision 1.25 1995/09/30 22:23:08 jimz + * do not require raid to be active to perform ACCTOTAL ioctl + * + * Revision 1.24 1995/09/30 20:39:08 jimz + * added new ioctls: + * RAIDFRAME_RESET_ACCTOTALS + * RAIDFRAME_GET_ACCTOTALS + * RAIDFRAME_KEEP_ACCTOTALS + * + * Revision 1.23 1995/09/20 21:11:59 jimz + * include dfstrace.h in KERNEL block + * (even though it's a kernel-only file, this makes the depend process + * at user-level happy. Why the user-level Makefile wants to depend + * kintf.c is less clear, but this is a workaround). + * + * Revision 1.22 1995/09/19 23:19:03 jimz + * added DKUSAGE support + * + */ + + + + +#ifdef _KERNEL +#define KERNEL +#endif + + + +#ifdef KERNEL + +#include <sys/errno.h> + +#ifdef __NetBSD__ +#include "raid.h" +#include <sys/param.h> +#include <sys/pool.h> +#include <sys/queue.h> +#include <sys/disk.h> +#include <sys/device.h> +#include <sys/stat.h> +#include <sys/ioctl.h> +#include <sys/fcntl.h> +#include <sys/systm.h> +#include <sys/namei.h> +#include <sys/vnode.h> +#endif + +#include <sys/param.h> +#include <sys/types.h> + +#include <machine/types.h> + +#include <sys/disklabel.h> + +#include <sys/conf.h> + + +#ifdef __NetBSD__ +#include <sys/lock.h> +#endif /* __NetBSD__ */ + + +#include <sys/buf.h> +#include <sys/user.h> +#include "rf_raid.h" +#include "rf_raidframe.h" +#include "rf_dag.h" +#include "rf_dagflags.h" +#include "rf_diskqueue.h" +#include "rf_acctrace.h" +#include "rf_etimer.h" +#include "rf_general.h" +#include "rf_debugMem.h" +#include "rf_kintf.h" +#include "rf_options.h" +#include "rf_driver.h" +#include "rf_parityscan.h" +#include "rf_debugprint.h" +#include "rf_threadstuff.h" + +int rf_kdebug_level = 0; + +#define RFK_BOOT_NONE 0 +#define RFK_BOOT_GOOD 1 +#define RFK_BOOT_BAD 2 +static int rf_kbooted = RFK_BOOT_NONE; + +#ifdef DEBUG +#define db0_printf(a) printf a +#define db_printf(a) if (rf_kdebug_level > 0) printf a +#define db1_printf(a) if (rf_kdebug_level > 0) printf a +#define db2_printf(a) if (rf_kdebug_level > 1) printf a +#define db3_printf(a) if (rf_kdebug_level > 2) printf a +#define db4_printf(a) if (rf_kdebug_level > 3) printf a +#define db5_printf(a) if (rf_kdebug_level > 4) printf a +#else /* DEBUG */ +#define db0_printf(a) printf a +#define db1_printf(a) { } +#define db2_printf(a) { } +#define db3_printf(a) { } +#define db4_printf(a) { } +#define db5_printf(a) { } +#endif /* DEBUG */ + +static RF_Raid_t **raidPtrs; /* global raid device descriptors */ + +static int rf_pending_testaccs; + +RF_DECLARE_STATIC_MUTEX(rf_sparet_wait_mutex) +RF_DECLARE_STATIC_MUTEX(rf_async_done_q_mutex) +static RF_SparetWait_t *rf_sparet_wait_queue; /* requests to install a spare table */ +static RF_SparetWait_t *rf_sparet_resp_queue; /* responses from installation process */ +static struct rf_test_acc *rf_async_done_qh, *rf_async_done_qt; + +static struct rf_recon_req *recon_queue = NULL; /* used to communicate reconstruction requests */ + + 
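The recon_queue declared above is the hand-off point between the ioctl path, which pushes struct rf_recon_req entries and calls wakeup() on the queue (see RAIDFRAME_FAIL_DISK below), and rf_ReconKernelThread, which sleeps until work arrives. A user-space sketch of that hand-off, with hypothetical names and pthreads standing in for tsleep()/wakeup(), might look like this; it is an analogue for illustration, not driver code.

	#include <pthread.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>

	struct recon_req {
		int               row, col;	/* which disk failed */
		struct recon_req *next;
	};

	static struct recon_req *queue_head;	/* LIFO push, as in the driver */
	static pthread_mutex_t   queue_mutex = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t    queue_cond  = PTHREAD_COND_INITIALIZER;

	static void
	fail_disk(int row, int col)		/* producer side (the ioctl path) */
	{
		struct recon_req *r = malloc(sizeof(*r));

		if (r == NULL)
			return;
		r->row = row;
		r->col = col;
		pthread_mutex_lock(&queue_mutex);
		r->next = queue_head;
		queue_head = r;
		pthread_cond_signal(&queue_cond); /* kernel: wakeup(&recon_queue) */
		pthread_mutex_unlock(&queue_mutex);
	}

	static void *
	recon_thread(void *arg)			/* consumer; never exits in the driver */
	{
		struct recon_req *r;

		for (;;) {
			pthread_mutex_lock(&queue_mutex);
			while (queue_head == NULL)	/* kernel: tsleep(&recon_queue, ...) */
				pthread_cond_wait(&queue_cond, &queue_mutex);
			r = queue_head;
			queue_head = r->next;
			pthread_mutex_unlock(&queue_mutex);

			printf("reconstructing row %d col %d\n", r->row, r->col);
			free(r);
		}
		return NULL;
	}

	int
	main(void)
	{
		pthread_t t;

		pthread_create(&t, NULL, recon_thread, NULL);
		fail_disk(0, 3);
		sleep(1);		/* give the worker a moment, then exit */
		return 0;
	}

In the driver itself the enqueue happens inside raidioctl under LOCK_RECON_Q_MUTEX(), and the dequeued request is handed to rf_FailDisk(), which optionally starts reconstruction and does not return until it completes, fails, or is aborted.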
+decl_simple_lock_data(,recon_queue_mutex) + + +#define LOCK_RECON_Q_MUTEX() simple_lock(&recon_queue_mutex) +#define UNLOCK_RECON_Q_MUTEX() simple_unlock(&recon_queue_mutex) + +/* prototypes */ +static void KernelWakeupFunc(struct buf *bp); +static void InitBP(struct buf *bp, struct vnode *, unsigned rw_flag, dev_t dev, + RF_SectorNum_t startSect, RF_SectorCount_t numSect, caddr_t buf, + void (*cbFunc)(struct buf *), void *cbArg, int logBytesPerSector, + struct proc *b_proc); + +#define Dprintf0(s) if (rf_queueDebug) rf_debug_printf(s,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL) +#define Dprintf1(s,a) if (rf_queueDebug) rf_debug_printf(s,a,NULL,NULL,NULL,NULL,NULL,NULL,NULL) +#define Dprintf2(s,a,b) if (rf_queueDebug) rf_debug_printf(s,a,b,NULL,NULL,NULL,NULL,NULL,NULL) +#define Dprintf3(s,a,b,c) if (rf_queueDebug) rf_debug_printf(s,a,b,c,NULL,NULL,NULL,NULL,NULL) + + +/* this is so that we can compile under 2.0 as well as 3.2 */ +#ifndef proc_to_task +#define proc_to_task(x) ((x)->task) +#endif /* !proc_to_task */ + +void raidattach __P((int)); +int raidsize __P((dev_t)); + +void rf_DiskIOComplete(RF_DiskQueue_t *, RF_DiskQueueData_t *, int); +void rf_CopybackReconstructedData(RF_Raid_t *raidPtr); +static int raidinit __P((dev_t,RF_Raid_t *,int)); + +int raidopen __P((dev_t, int, int, struct proc *)); +int raidclose __P((dev_t, int, int, struct proc *)); +int raidioctl __P((dev_t, u_long, caddr_t, int, struct proc *)); +int raidwrite __P((dev_t, struct uio *, int)); +int raidread __P((dev_t, struct uio *, int)); +void raidstrategy __P((struct buf *)); +int raiddump __P((dev_t, daddr_t, caddr_t, size_t)); + +/* + * Pilfered from ccd.c + */ + +struct raidbuf { + struct buf rf_buf; /* new I/O buf. MUST BE FIRST!!! */ + struct buf *rf_obp; /* ptr. to original I/O buf */ + int rf_flags; /* misc. flags */ + RF_DiskQueueData_t *req; /* the request that this was part of.. */ +}; + + +#define RAIDGETBUF(rs) pool_get(&(rs)->sc_cbufpool, PR_NOWAIT) +#define RAIDPUTBUF(rs, cbp) pool_put(&(rs)->sc_cbufpool, cbp) + +/* XXX Not sure if the following should be replacing the raidPtrs above, +or if it should be used in conjunction with that... */ + +struct raid_softc { + int sc_unit; /* logical unit number */ + int sc_flags; /* flags */ + int sc_cflags; /* configuration flags */ + size_t sc_size; /* size of the raid device */ + dev_t sc_dev; /* our device..*/ + char sc_xname[20]; /* XXX external name */ + struct disk sc_dkdev; /* generic disk device info */ + struct pool sc_cbufpool; /* component buffer pool */ +}; + +/* sc_flags */ +#define RAIDF_INITED 0x01 /* unit has been initialized */ +#define RAIDF_WLABEL 0x02 /* label area is writable */ +#define RAIDF_LABELLING 0x04 /* unit is currently being labelled */ +#define RAIDF_WANTED 0x40 /* someone is waiting to obtain a lock */ +#define RAIDF_LOCKED 0x80 /* unit is locked */ + +#define raidunit(x) DISKUNIT(x) +static int numraid=0; + +#define RAIDLABELDEV(dev) \ + (MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART)) + +/* declared here, and made public, for the benefit of KVM stuff.. 
*/ +struct raid_softc *raid_softc; + +static void raidgetdefaultlabel __P((RF_Raid_t *, struct raid_softc *, struct disklabel *)); +static void raidgetdisklabel __P((dev_t)); +static void raidmakedisklabel __P((struct raid_softc *)); + +static int raidlock __P((struct raid_softc *)); +static void raidunlock __P((struct raid_softc *)); +int raidlookup __P((char *, struct proc *p, struct vnode **)); + + +void +raidattach(num) + int num; +{ + int raidID; + +#ifdef DEBUG + printf("raidattach: Asked for %d units\n",num); +#endif + + if (num <= 0) { +#ifdef DIAGNOSTIC + panic("raidattach: count <= 0"); +#endif + return; + } + /* + This is where all the initialization stuff gets done. + */ + + /* Make some space for requested number of units... */ + + RF_Calloc(raidPtrs, num, sizeof(RF_Raid_t *), (RF_Raid_t **)); + if (raidPtrs == NULL) { + panic("raidPtrs is NULL!!\n"); + } + + + + rf_kbooted = rf_boot(); + if (rf_kbooted) { + panic("Serious error booting RAID!!\n"); + } + + rf_kbooted = RFK_BOOT_GOOD; + + /* + put together some datastructures like the CCD device does.. + This lets us lock the device and what-not when it gets opened. + */ + + raid_softc = (struct raid_softc *) + malloc(num * sizeof(struct raid_softc), + M_DEVBUF, M_NOWAIT); + if (raid_softc == NULL) { + printf("WARNING: no memory for RAIDframe driver\n"); + return; + } + numraid = num; + bzero(raid_softc, num * sizeof(struct raid_softc)); + + for(raidID=0;raidID < num;raidID++) { + RF_Calloc(raidPtrs[raidID], 1, sizeof(RF_Raid_t), + (RF_Raid_t *)); + if (raidPtrs[raidID]==NULL) { + printf("raidPtrs[%d] is NULL\n",raidID); + } + } +} + + +int +raidsize(dev) + dev_t dev; +{ + struct raid_softc *rs; + struct disklabel *lp; + int part, unit, omask, size; + + unit = raidunit(dev); + if (unit >= numraid) + return (-1); + rs = &raid_softc[unit]; + + if ((rs->sc_flags & RAIDF_INITED) == 0) + return (-1); + + part = DISKPART(dev); + omask = rs->sc_dkdev.dk_openmask & (1 << part); + lp = rs->sc_dkdev.dk_label; + + if (omask == 0 && raidopen(dev, 0, S_IFBLK, curproc)) + return (-1); + + if (lp->d_partitions[part].p_fstype != FS_SWAP) + size = -1; + else + size = lp->d_partitions[part].p_size * + (lp->d_secsize / DEV_BSIZE); + + if (omask == 0 && raidclose(dev, 0, S_IFBLK, curproc)) + return (-1); + + return (size); + +} + +int +raiddump(dev, blkno, va, size) + dev_t dev; + daddr_t blkno; + caddr_t va; + size_t size; +{ + /* Not implemented. */ + return ENXIO; +} + +/* ARGSUSED */ +int +raidopen(dev, flags, fmt, p) + dev_t dev; + int flags, fmt; + struct proc *p; +{ + int unit = raidunit(dev); + struct raid_softc *rs; + struct disklabel *lp; + int part,pmask; + unsigned int raidID; + int rc; + int error = 0; + + /* This whole next chunk of code is somewhat suspect... Not sure + it's needed here at all... 
XXX */ + + if (rf_kbooted == RFK_BOOT_NONE) { + printf("Doing restart on raidopen.\n"); + rf_kbooted = RFK_BOOT_GOOD; + rc = rf_boot(); + if (rc) { + rf_kbooted = RFK_BOOT_BAD; + printf("Someone is unhappy...\n"); + return(rc); + } + } + + if (unit >= numraid) + return (ENXIO); + rs = &raid_softc[unit]; + + if ((error = raidlock(rs)) != 0) + return(error); + lp = rs->sc_dkdev.dk_label; + + raidID = raidunit(dev); + + part = DISKPART(dev); + pmask = (1 << part); + + db1_printf(("Opening raid device number: %d partition: %d\n", + raidID,part)); + + + if ((rs->sc_flags & RAIDF_INITED) && + (rs->sc_dkdev.dk_openmask == 0)) + raidgetdisklabel(dev); + + /* make sure that this partition exists */ + + if (part != RAW_PART) { + db1_printf(("Not a raw partition..\n")); + if (((rs->sc_flags & RAIDF_INITED) == 0) || + ((part >= lp->d_npartitions) || + (lp->d_partitions[part].p_fstype == FS_UNUSED))) { + error = ENXIO; + raidunlock(rs); + db1_printf(("Bailing out...\n")); + return(error); + } + } + + /* Prevent this unit from being unconfigured while open. */ + switch (fmt) { + case S_IFCHR: + rs->sc_dkdev.dk_copenmask |= pmask; + break; + + case S_IFBLK: + rs->sc_dkdev.dk_bopenmask |= pmask; + break; + } + rs->sc_dkdev.dk_openmask = + rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask; + + raidunlock(rs); + + return(error); + + +} + +/* ARGSUSED */ +int +raidclose(dev, flags, fmt, p) + dev_t dev; + int flags, fmt; + struct proc *p; +{ + int unit = raidunit(dev); + struct raid_softc *rs; + int error = 0; + int part; + + if (unit >= numraid) + return (ENXIO); + rs = &raid_softc[unit]; + + if ((error = raidlock(rs)) != 0) + return (error); + + part = DISKPART(dev); + + /* ...that much closer to allowing unconfiguration... */ + switch (fmt) { + case S_IFCHR: + rs->sc_dkdev.dk_copenmask &= ~(1 << part); + break; + + case S_IFBLK: + rs->sc_dkdev.dk_bopenmask &= ~(1 << part); + break; + } + rs->sc_dkdev.dk_openmask = + rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask; + + raidunlock(rs); + return (0); + +} + +void +raidstrategy(bp) + register struct buf *bp; +{ + register int s; + + unsigned int raidID = raidunit(bp->b_dev); + RF_Raid_t *raidPtr; + struct raid_softc *rs = &raid_softc[raidID]; + struct disklabel *lp; + int wlabel; + +#if 0 + db1_printf(("Strategy: 0x%x 0x%x\n",bp,bp->b_data)); + db1_printf(("Strategy(2): bp->b_bufsize%d\n", (int)bp->b_bufsize)); + db1_printf(("bp->b_count=%d\n",(int)bp->b_bcount)); + db1_printf(("bp->b_resid=%d\n",(int)bp->b_resid)); + db1_printf(("bp->b_blkno=%d\n",(int)bp->b_blkno)); + + if (bp->b_flags&B_READ) + db1_printf(("READ\n")); + else + db1_printf(("WRITE\n")); +#endif + if (rf_kbooted != RFK_BOOT_GOOD) + return; + if (raidID >= numraid || !raidPtrs[raidID]) { + bp->b_error = ENODEV; + bp->b_flags |= B_ERROR; + bp->b_resid = bp->b_bcount; + biodone(bp); + return; + } + raidPtr = raidPtrs[raidID]; + if (!raidPtr->valid) { + bp->b_error = ENODEV; + bp->b_flags |= B_ERROR; + bp->b_resid = bp->b_bcount; + biodone(bp); + return; + } + if (bp->b_bcount == 0) { + db1_printf(("b_bcount is zero..\n")); + biodone(bp); + return; + } + lp = rs->sc_dkdev.dk_label; + + /* + * Do bounds checking and adjust transfer. If there's an + * error, the bounds check will flag that for us. 
+ */ + + wlabel = rs->sc_flags & (RAIDF_WLABEL|RAIDF_LABELLING); + if (DISKPART(bp->b_dev) != RAW_PART) + if (bounds_check_with_label(bp, lp, wlabel) <= 0) { + db1_printf(("Bounds check failed!!:%d %d\n", + (int)bp->b_blkno,(int)wlabel)); + biodone(bp); + return; + } + + s = splbio(); /* XXX Needed? */ + db1_printf(("Beginning strategy...\n")); + + bp->b_resid = 0; + bp->b_error = rf_DoAccessKernel(raidPtrs[raidID], bp, + NULL, NULL, NULL); + if (bp->b_error) { + bp->b_flags |= B_ERROR; + db1_printf(("bp->b_flags HAS B_ERROR SET!!!: %d\n", + bp->b_error)); + } + splx(s); +#if 0 + db1_printf(("Strategy exiting: 0x%x 0x%x %d %d\n", + bp,bp->b_data, + (int)bp->b_bcount,(int)bp->b_resid)); +#endif +} + +/* ARGSUSED */ +int +raidread(dev, uio, flags) + dev_t dev; + struct uio *uio; + int flags; +{ + int unit = raidunit(dev); + struct raid_softc *rs; + int result; + int part; + + if (unit >= numraid) + return (ENXIO); + rs = &raid_softc[unit]; + + if ((rs->sc_flags & RAIDF_INITED) == 0) + return (ENXIO); + part = DISKPART(dev); + + db1_printf(("raidread: unit: %d partition: %d\n",unit,part)); + +#if 0 + return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio)); +#endif + result=physio(raidstrategy, NULL, dev, B_READ, minphys, uio); + db1_printf(("raidread done. Result is %d %d\n", + result,uio->uio_resid)); + return(result); + +} + +/* ARGSUSED */ +int +raidwrite(dev, uio, flags) + dev_t dev; + struct uio *uio; + int flags; +{ + int unit = raidunit(dev); + struct raid_softc *rs; + + if (unit >= numraid) + return (ENXIO); + rs = &raid_softc[unit]; + + if ((rs->sc_flags & RAIDF_INITED) == 0) + return (ENXIO); + db1_printf(("raidwrite\n")); + return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio)); + + +} + +int +raidioctl(dev, cmd, data, flag, p) + dev_t dev; + u_long cmd; + caddr_t data; + int flag; + struct proc *p; +{ + int unit = raidunit(dev); + int error = 0; + int part, pmask; + struct raid_softc *rs; +#if 0 + int r,c; +#endif + /* struct raid_ioctl *ccio = (struct ccd_ioctl *)data; */ + + /* struct ccdbuf *cbp; */ + /* struct raidbuf *raidbp; */ + RF_Config_t *k_cfg, *u_cfg; + u_char *specific_buf; + int retcode = 0; + + int row; + struct rf_recon_req *rrcopy, *rr; +#if 0 + int nbytes, spl, rw, row; + struct rf_test_acc *ta; + struct buf *bp; + RF_SparetWait_t *waitreq; + struct rf_test_acc *ta_p, *ta_copy; +#endif + + if (unit >= numraid) + return (ENXIO); + rs = &raid_softc[unit]; + + db1_printf(("raidioctl: %d %d %d %d\n",(int)dev, + (int)DISKPART(dev),(int)unit,(int)cmd)); + + /* Must be open for writes for these commands... */ + switch (cmd) { + case DIOCSDINFO: + case DIOCWDINFO: + case DIOCWLABEL: + if ((flag & FWRITE) == 0) + return (EBADF); + } + + /* Must be initialized for these... 
*/ + switch (cmd) { + case DIOCGDINFO: + case DIOCSDINFO: + case DIOCWDINFO: + case DIOCGPART: + case DIOCWLABEL: + case DIOCGDEFLABEL: + case RAIDFRAME_SHUTDOWN: + case RAIDFRAME_REWRITEPARITY: + case RAIDFRAME_GET_INFO: + case RAIDFRAME_RESET_ACCTOTALS: + case RAIDFRAME_GET_ACCTOTALS: + case RAIDFRAME_KEEP_ACCTOTALS: + case RAIDFRAME_GET_SIZE: + case RAIDFRAME_FAIL_DISK: + case RAIDFRAME_COPYBACK: + case RAIDFRAME_CHECKRECON: + if ((rs->sc_flags & RAIDF_INITED) == 0) + return (ENXIO); + } + + switch (cmd) { + + + /* configure the system */ + case RAIDFRAME_CONFIGURE: + + db3_printf(("rf_ioctl: RAIDFRAME_CONFIGURE\n")); + /* copy-in the configuration information */ + /* data points to a pointer to the configuration structure */ + u_cfg = *((RF_Config_t **) data); + RF_Malloc(k_cfg,sizeof(RF_Config_t),(RF_Config_t *)); + if (k_cfg == NULL) { + db3_printf(("rf_ioctl: ENOMEM for config. Code is %d\n", retcode)); + return(ENOMEM); + } + retcode = copyin((caddr_t) u_cfg, (caddr_t) k_cfg, + sizeof(RF_Config_t)); + if (retcode) { + db3_printf(("rf_ioctl: retcode=%d copyin.1\n", + retcode)); + return(retcode); + } + + /* allocate a buffer for the layout-specific data, + and copy it in */ + if (k_cfg->layoutSpecificSize) { + if (k_cfg->layoutSpecificSize > 10000) { + /* sanity check */ + db3_printf(("rf_ioctl: EINVAL %d\n", retcode)); + return(EINVAL); + } + RF_Malloc(specific_buf,k_cfg->layoutSpecificSize, + (u_char *)); + if (specific_buf == NULL) { + RF_Free(k_cfg,sizeof(RF_Config_t)); + db3_printf(("rf_ioctl: ENOMEM %d\n", retcode)); + return(ENOMEM); + } + retcode = copyin(k_cfg->layoutSpecific, + (caddr_t) specific_buf, + k_cfg->layoutSpecificSize); + if (retcode) { + db3_printf(("rf_ioctl: retcode=%d copyin.2\n", + retcode)); + return(retcode); + } + } else specific_buf = NULL; + k_cfg->layoutSpecific = specific_buf; + + /* should do some kind of sanity check on the configuration. + Store the sum of all the bytes in the last byte? + */ + +#if 0 + db1_printf(("Considering configuring the system.:%d 0x%x\n", + unit,p)); +#endif + + /* We need the pointer to this a little deeper, so + stash it here... */ + + raidPtrs[unit]->proc = p; + + /* configure the system */ + rf_pending_testaccs = 0; + + + raidPtrs[unit]->raidid = unit; + retcode = rf_Configure(raidPtrs[unit], k_cfg); + + + if (retcode == 0) { + retcode = raidinit(dev, raidPtrs[unit],unit); + } + + /* free the buffers. No return code here. */ + if (k_cfg->layoutSpecificSize) { + RF_Free(specific_buf,k_cfg->layoutSpecificSize); + } + RF_Free(k_cfg,sizeof(RF_Config_t)); + + db3_printf(("rf_ioctl: retcode=%d RAIDFRAME_CONFIGURE\n", + retcode)); + return(retcode); + + /* shutdown the system */ + case RAIDFRAME_SHUTDOWN: + + if ((error = raidlock(rs)) != 0) + return(error); + + /* + * If somebody has a partition mounted, we shouldn't + * shutdown. 
+ */ + + part = DISKPART(dev); + pmask = (1 << part); + if ((rs->sc_dkdev.dk_openmask & ~pmask) || + ((rs->sc_dkdev.dk_bopenmask & pmask) && + (rs->sc_dkdev.dk_copenmask & pmask))) { + raidunlock(rs); + return (EBUSY); + } + + /* the intention here was to disallow shutdowns while + raidframe is mounted, but it doesn't work because the + shutdown ioctl calls rf_open + */ + if (rf_pending_testaccs > 0) { + printf("RAIDFRAME: Can't shutdown because there are %d pending test accs\n", + rf_pending_testaccs); + return(EINVAL); + } + if (rf_debugKernelAccess) { + printf("call shutdown\n"); + } + raidPtrs[unit]->proc = p; /* XXX necessary evil */ + retcode = rf_Shutdown(raidPtrs[unit]); + + db1_printf(("Done main shutdown\n")); + + pool_destroy(&rs->sc_cbufpool); + db1_printf(("Done freeing component buffer freelist\n")); + + /* It's no longer initialized... */ + rs->sc_flags &= ~RAIDF_INITED; + + /* Detach the disk. */ + disk_detach(&rs->sc_dkdev); + + raidunlock(rs); + + return(retcode); + + /* initialize all parity */ + case RAIDFRAME_REWRITEPARITY: + + if (raidPtrs[unit]->Layout.map->faultsTolerated == 0) + return(EINVAL); + /* borrow the thread of the requesting process */ + raidPtrs[unit]->proc = p; /* Blah... :-p GO */ + retcode = rf_RewriteParity(raidPtrs[unit]); + /* return I/O Error if the parity rewrite fails */ + + if (retcode) + retcode = EIO; + return(retcode); + + /* issue a test-unit-ready through raidframe to the + indicated device */ +#if 0 /* XXX not supported yet (ever?) */ + case RAIDFRAME_TUR: + /* debug only */ + retcode = rf_SCSI_DoTUR(0, 0, 0, 0, *(dev_t *) data); + return(retcode); +#endif + case RAIDFRAME_GET_INFO: + { + RF_Raid_t *raid = raidPtrs[unit]; + RF_DeviceConfig_t *cfg, **ucfgp; + int i, j, d; + + if (!raid->valid) + return(ENODEV); + ucfgp = (RF_DeviceConfig_t **)data; + RF_Malloc(cfg,sizeof(RF_DeviceConfig_t), + (RF_DeviceConfig_t *)); + if (cfg == NULL) + return(ENOMEM); + bzero((char *)cfg, sizeof(RF_DeviceConfig_t)); + cfg->rows = raid->numRow; + cfg->cols = raid->numCol; + cfg->ndevs = raid->numRow * raid->numCol; + if (cfg->ndevs >= RF_MAX_DISKS) { + cfg->ndevs = 0; + return(ENOMEM); + } + cfg->nspares = raid->numSpare; + if (cfg->nspares >= RF_MAX_DISKS) { + cfg->nspares = 0; + return(ENOMEM); + } + cfg->maxqdepth = raid->maxQueueDepth; + d = 0; + for(i=0;i<cfg->rows;i++) { + for(j=0;j<cfg->cols;j++) { + cfg->devs[d] = raid->Disks[i][j]; + d++; + } + } + for(j=cfg->cols,i=0;i<cfg->nspares;i++,j++) { + cfg->spares[i] = raid->Disks[0][j]; + } + retcode = copyout((caddr_t)cfg, (caddr_t)*ucfgp, + sizeof(RF_DeviceConfig_t)); + RF_Free(cfg,sizeof(RF_DeviceConfig_t)); + + return(retcode); + } + break; + + case RAIDFRAME_RESET_ACCTOTALS: + { + RF_Raid_t *raid = raidPtrs[unit]; + + bzero(&raid->acc_totals, sizeof(raid->acc_totals)); + return(0); + } + break; + + case RAIDFRAME_GET_ACCTOTALS: + { + RF_AccTotals_t *totals = (RF_AccTotals_t *)data; + RF_Raid_t *raid = raidPtrs[unit]; + + *totals = raid->acc_totals; + return(0); + } + break; + + case RAIDFRAME_KEEP_ACCTOTALS: + { + RF_Raid_t *raid = raidPtrs[unit]; + int *keep = (int *)data; + + raid->keep_acc_totals = *keep; + return(0); + } + break; + + case RAIDFRAME_GET_SIZE: + *(int *) data = raidPtrs[unit]->totalSectors; + return(0); + +#define RAIDFRAME_RECON 1 + /* XXX The above should probably be set somewhere else!! 
GO */ +#if RAIDFRAME_RECON > 0 + + /* fail a disk & optionally start reconstruction */ + case RAIDFRAME_FAIL_DISK: + rr = (struct rf_recon_req *) data; + + if (rr->row < 0 || rr->row >= raidPtrs[unit]->numRow + || rr->col < 0 || rr->col >= raidPtrs[unit]->numCol) + return(EINVAL); + + printf("Failing the disk: row: %d col: %d\n",rr->row,rr->col); + + /* make a copy of the recon request so that we don't + rely on the user's buffer */ + RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *)); + bcopy(rr, rrcopy, sizeof(*rr)); + rrcopy->raidPtr = (void *) raidPtrs[unit]; + + LOCK_RECON_Q_MUTEX(); + rrcopy->next = recon_queue; + recon_queue = rrcopy; + wakeup(&recon_queue); + UNLOCK_RECON_Q_MUTEX(); + + return(0); + + /* invoke a copyback operation after recon on whatever + disk needs it, if any */ + case RAIDFRAME_COPYBACK: + /* borrow the current thread to get this done */ + raidPtrs[unit]->proc = p; /* ICK.. but needed :-p GO */ + rf_CopybackReconstructedData(raidPtrs[unit]); + return(0); + + /* return the percentage completion of reconstruction */ + case RAIDFRAME_CHECKRECON: + row = *(int *) data; + if (row < 0 || row >= raidPtrs[unit]->numRow) + return(EINVAL); + if (raidPtrs[unit]->status[row] != rf_rs_reconstructing) + *(int *) data = 100; + else + *(int *) data = raidPtrs[unit]->reconControl[row]->percentComplete; + return(0); + + /* the sparetable daemon calls this to wait for the + kernel to need a spare table. + * this ioctl does not return until a spare table is needed. + * XXX -- calling mpsleep here in the ioctl code is almost + certainly wrong and evil. -- XXX + * XXX -- I should either compute the spare table in the + kernel, or have a different -- XXX + * XXX -- interface (a different character device) for + delivering the table -- XXX + */ +#if 0 + case RAIDFRAME_SPARET_WAIT: + RF_LOCK_MUTEX(rf_sparet_wait_mutex); + while (!rf_sparet_wait_queue) mpsleep(&rf_sparet_wait_queue, (PZERO+1)|PCATCH, "sparet wait", 0, (void *) simple_lock_addr(rf_sparet_wait_mutex), MS_LOCK_SIMPLE); + waitreq = rf_sparet_wait_queue; + rf_sparet_wait_queue = rf_sparet_wait_queue->next; + RF_UNLOCK_MUTEX(rf_sparet_wait_mutex); + + *((RF_SparetWait_t *) data) = *waitreq; /* structure assignment */ + + RF_Free(waitreq, sizeof(*waitreq)); + return(0); + + + /* wakes up a process waiting on SPARET_WAIT and puts an + error code in it that will cause the dameon to exit */ + case RAIDFRAME_ABORT_SPARET_WAIT: + RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *)); + waitreq->fcol = -1; + RF_LOCK_MUTEX(rf_sparet_wait_mutex); + waitreq->next = rf_sparet_wait_queue; + rf_sparet_wait_queue = waitreq; + RF_UNLOCK_MUTEX(rf_sparet_wait_mutex); + wakeup(&rf_sparet_wait_queue); + return(0); + + /* used by the spare table daemon to deliver a spare table + into the kernel */ + case RAIDFRAME_SEND_SPARET: + + /* install the spare table */ + retcode = rf_SetSpareTable(raidPtrs[unit],*(void **) data); + + /* respond to the requestor. 
the return status of the + spare table installation is passed in the "fcol" field */ + RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *)); + waitreq->fcol = retcode; + RF_LOCK_MUTEX(rf_sparet_wait_mutex); + waitreq->next = rf_sparet_resp_queue; + rf_sparet_resp_queue = waitreq; + wakeup(&rf_sparet_resp_queue); + RF_UNLOCK_MUTEX(rf_sparet_wait_mutex); + + return(retcode); +#endif + + +#endif /* RAIDFRAME_RECON > 0 */ + + default: break; /* fall through to the os-specific code below */ + + } + + if (!raidPtrs[unit]->valid) + return(EINVAL); + + /* + * Add support for "regular" device ioctls here. + */ + + switch (cmd) { + case DIOCGDINFO: + db1_printf(("DIOCGDINFO %d %d\n",(int)dev,(int)DISKPART(dev))); + *(struct disklabel *)data = *(rs->sc_dkdev.dk_label); + break; + + case DIOCGPART: + db1_printf(("DIOCGPART: %d %d\n",(int)dev,(int)DISKPART(dev))); + ((struct partinfo *)data)->disklab = rs->sc_dkdev.dk_label; + ((struct partinfo *)data)->part = + &rs->sc_dkdev.dk_label->d_partitions[DISKPART(dev)]; + break; + + case DIOCWDINFO: + db1_printf(("DIOCWDINFO\n")); + case DIOCSDINFO: + db1_printf(("DIOCSDINFO\n")); + if ((error = raidlock(rs)) != 0) + return (error); + + rs->sc_flags |= RAIDF_LABELLING; + + error = setdisklabel(rs->sc_dkdev.dk_label, + (struct disklabel *)data, 0, rs->sc_dkdev.dk_cpulabel); + if (error == 0) { + if (cmd == DIOCWDINFO) + error = writedisklabel(RAIDLABELDEV(dev), + raidstrategy, rs->sc_dkdev.dk_label, + rs->sc_dkdev.dk_cpulabel); + } + + rs->sc_flags &= ~RAIDF_LABELLING; + + raidunlock(rs); + + if (error) + return (error); + break; + + case DIOCWLABEL: + db1_printf(("DIOCWLABEL\n")); + if (*(int *)data != 0) + rs->sc_flags |= RAIDF_WLABEL; + else + rs->sc_flags &= ~RAIDF_WLABEL; + break; + + case DIOCGDEFLABEL: + db1_printf(("DIOCGDEFLABEL\n")); + raidgetdefaultlabel(raidPtrs[unit], rs, + (struct disklabel *)data); + break; + + default: + retcode = ENOTTY; /* XXXX ?? OR EINVAL ? */ + } + return(retcode); + +} + + +/* raidinit -- complete the rest of the initialization for the + RAIDframe device. */ + + +static int +raidinit(dev, raidPtr,unit) + dev_t dev; + RF_Raid_t *raidPtr; + int unit; +{ + int retcode; + /* int ix; */ + /* struct raidbuf *raidbp; */ + struct raid_softc *rs; + + retcode = 0; + + rs = &raid_softc[unit]; + pool_init(&rs->sc_cbufpool, sizeof(struct raidbuf), 0, + 0, 0, "raidpl", 0, NULL, NULL, M_DEVBUF); + + + /* XXX should check return code first... */ + rs->sc_flags |= RAIDF_INITED; + + sprintf(rs->sc_xname, "raid%d", unit); /* XXX doesn't check bounds.*/ + + rs->sc_dkdev.dk_name = rs->sc_xname; + /* disk_attach actually creates space for the CPU disklabel, among + other things, so it's critical to call this *BEFORE* we + try putzing with disklabels. */ + disk_attach(&rs->sc_dkdev); + + /* XXX There may be a weird interaction here between this, and + protectedSectors, as used in RAIDframe. 
*/ + rs->sc_size = raidPtr->totalSectors; + rs->sc_dev = dev; + return(retcode); +} + + +/********************************************************* + * + * initialization code called at boot time (startup.c) + * + ********************************************************/ +int rf_boot() +{ + int i, rc; + + rc = rf_mutex_init(&rf_sparet_wait_mutex); + if (rc) { + RF_PANIC(); + } + rc = rf_mutex_init(&rf_async_done_q_mutex); + if (rc) { + RF_PANIC(); + } + rf_sparet_wait_queue = rf_sparet_resp_queue = NULL; + recon_queue = NULL; + rf_async_done_qh = rf_async_done_qt = NULL; + for (i=0; i<numraid; i++) + raidPtrs[i] = NULL; + rc = rf_BootRaidframe(); + if (rc == 0) + printf("Kernelized RAIDframe activated\n"); + else + rf_kbooted = RFK_BOOT_BAD; + return(rc); +} + +/* + * This kernel thread never exits. It is created once, and persists + * until the system reboots. + */ +void rf_ReconKernelThread() +{ + struct rf_recon_req *req; + int s; + + /* XXX not sure what spl() level we should be at here... probably splbio() */ + s=splbio(); + + while (1) { + /* grab the next reconstruction request from the queue */ + LOCK_RECON_Q_MUTEX(); + while (!recon_queue) { + UNLOCK_RECON_Q_MUTEX(); + tsleep(&recon_queue, PRIBIO | PCATCH, "raidframe recon", 0); + LOCK_RECON_Q_MUTEX(); + } + req = recon_queue; + recon_queue = recon_queue->next; + UNLOCK_RECON_Q_MUTEX(); + + /* + * If flags specifies that we should start recon, this call + * will not return until reconstruction completes, fails, or is aborted. + */ + rf_FailDisk((RF_Raid_t *) req->raidPtr, req->row, req->col, + ((req->flags&RF_FDFLAGS_RECON) ? 1 : 0)); + + RF_Free(req, sizeof(*req)); + } +} +/* wake up the daemon & tell it to get us a spare table + * XXX + * the entries in the queues should be tagged with the raidPtr + * so that in the extremely rare case that two recons happen at once, we know for + * which device were requesting a spare table + * XXX + */ +int rf_GetSpareTableFromDaemon(req) + RF_SparetWait_t *req; +{ + int retcode; + + RF_LOCK_MUTEX(rf_sparet_wait_mutex); + req->next = rf_sparet_wait_queue; + rf_sparet_wait_queue = req; + wakeup(&rf_sparet_wait_queue); + + /* mpsleep unlocks the mutex */ + while (!rf_sparet_resp_queue) { + tsleep(&rf_sparet_resp_queue, PRIBIO | PCATCH, + "raidframe getsparetable", 0); +#if 0 + mpsleep(&rf_sparet_resp_queue, PZERO, "sparet resp", 0, (void *) simple_lock_addr(rf_sparet_wait_mutex), MS_LOCK_SIMPLE); +#endif + } + req = rf_sparet_resp_queue; + rf_sparet_resp_queue = req->next; + RF_UNLOCK_MUTEX(rf_sparet_wait_mutex); + + retcode = req->fcol; + RF_Free(req, sizeof(*req)); /* this is not the same req as we alloc'd */ + return(retcode); +} + +/* a wrapper around rf_DoAccess that extracts appropriate info from the bp & passes it down. + * any calls originating in the kernel must use non-blocking I/O + * do some extra sanity checking to return "appropriate" error values for + * certain conditions (to make some standard utilities work) + */ +int rf_DoAccessKernel(raidPtr, bp, flags, cbFunc, cbArg) + RF_Raid_t *raidPtr; + struct buf *bp; + RF_RaidAccessFlags_t flags; + void (*cbFunc)(struct buf *); + void *cbArg; +{ + RF_SectorCount_t num_blocks, pb, sum; + RF_RaidAddr_t raid_addr; + int retcode; + struct partition *pp; + daddr_t blocknum; + int unit; + struct raid_softc *rs; + + /* XXX The dev_t used here should be for /dev/[r]raid* !!! */ + + unit = raidPtr->raidid; + rs = &raid_softc[unit]; + + /* Ok, for the bp we have here, bp->b_blkno is relative to the + partition.. 
Need to make it absolute to the underlying + device.. */ + + blocknum = bp->b_blkno; + if (DISKPART(bp->b_dev) != RAW_PART) { + pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)]; + blocknum += pp->p_offset; + db1_printf(("updated: %d %d\n",DISKPART(bp->b_dev), + pp->p_offset)); + } else { + db1_printf(("Is raw..\n")); + } + db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno, (int) blocknum)); + + db1_printf(("bp->b_bcount = %d\n",(int)bp->b_bcount)); + db1_printf(("bp->b_resid = %d\n",(int)bp->b_resid)); + + /* *THIS* is where we adjust what block we're going to... but + DO NOT TOUCH bp->b_blkno!!! */ + raid_addr = blocknum; + + num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector; + pb = (bp->b_bcount&raidPtr->sectorMask) ? 1 : 0; + sum = raid_addr + num_blocks + pb; + if (1 || rf_debugKernelAccess) { + db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n", + (int)raid_addr, (int)sum,(int)num_blocks, + (int)pb,(int)bp->b_resid)); + } + + + if ((sum > raidPtr->totalSectors) || (sum < raid_addr) + || (sum < num_blocks) || (sum < pb)) + { + bp->b_error = ENOSPC; + bp->b_flags |= B_ERROR; + bp->b_resid = bp->b_bcount; + biodone(bp); + return(bp->b_error); + } + + /* + * XXX rf_DoAccess() should do this, not just DoAccessKernel() + */ + + if (bp->b_bcount & raidPtr->sectorMask) { + bp->b_error = EINVAL; + bp->b_flags |= B_ERROR; + bp->b_resid = bp->b_bcount; + biodone(bp); + return(bp->b_error); + } + db1_printf(("Calling DoAccess..\n")); + + /* don't ever condition on bp->b_flags & B_WRITE. + always condition on B_READ instead */ + retcode = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ? + RF_IO_TYPE_READ : RF_IO_TYPE_WRITE, + 0, raid_addr, num_blocks, bp->b_un.b_addr, + bp, NULL, NULL, RF_DAG_NONBLOCKING_IO|flags, + NULL, cbFunc, cbArg); +#if 0 + db1_printf(("After call to DoAccess: 0x%x 0x%x %d\n",bp, + bp->b_data,(int)bp->b_resid)); +#endif + return(retcode); +} + +/* invoke an I/O from kernel mode. Disk queue should be locked upon entry */ + +int rf_DispatchKernelIO(queue, req) + RF_DiskQueue_t *queue; + RF_DiskQueueData_t *req; +{ + int op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE; + struct buf *bp; + struct raidbuf *raidbp=NULL; + struct raid_softc *rs; + int unit; + + /* XXX along with the vnode, we also need the softc associated with + this device.. */ + + req->queue = queue; + + unit = queue->raidPtr->raidid; + + db1_printf(("DispatchKernelIO unit: %d\n",unit)); + + if (unit >= numraid) { + printf("Invalid unit number: %d %d\n",unit,numraid); + panic("Invalid Unit number in rf_DispatchKernelIO\n"); + } + + rs = &raid_softc[unit]; + + /* XXX is this the right place? */ + disk_busy(&rs->sc_dkdev); + + bp = req->bp; + + /* + XXX when there is a physical disk failure, someone is passing + us a buffer that contains old stuff!! Attempt to deal with + this problem without taking a performance hit... + (not sure where the real bug is. It's buried in RAIDframe + somewhere) :-( GO ) + */ + + if (bp->b_flags & B_ERROR) { + bp->b_flags &= ~B_ERROR; + } + if (bp->b_error!=0) { + bp->b_error = 0; + } + + raidbp = RAIDGETBUF(rs); + + raidbp->rf_flags = 0; /* XXX not really used anywhere... */ + + /* + * context for raidiodone + */ + raidbp->rf_obp = bp; + raidbp->req = req; + + switch (req->type) { + case RF_IO_TYPE_NOP: /* used primarily to unlock a locked queue */ + /* + Dprintf2("rf_DispatchKernelIO: NOP to r %d c %d\n", + queue->row, queue->col); + */ + /* XXX need to do something extra here.. 
*/ + /* I'm leaving this in, as I've never actually seen it + used, and I'd like folks to report it... GO */ + printf(("WAKEUP CALLED\n")); + queue->numOutstanding++; + + /* XXX need to glue the original buffer into this?? */ + + KernelWakeupFunc(&raidbp->rf_buf); + break; + + case RF_IO_TYPE_READ: + case RF_IO_TYPE_WRITE: + + if (req->tracerec) { + RF_ETIMER_START(req->tracerec->timer); + } + + + InitBP(&raidbp->rf_buf, queue->rf_cinfo->ci_vp, + op | bp->b_flags, queue->rf_cinfo->ci_dev, + req->sectorOffset, req->numSector, + req->buf, KernelWakeupFunc, (void *) req, + queue->raidPtr->logBytesPerSector, req->b_proc); + + if (rf_debugKernelAccess) { + db1_printf(("dispatch: bp->b_blkno = %ld\n", + (long) bp->b_blkno)); + } + queue->numOutstanding++; + queue->last_deq_sector = req->sectorOffset; + /* acc wouldn't have been let in if there were any + pending reqs at any other priority */ + queue->curPriority = req->priority; + /* + Dprintf3("rf_DispatchKernelIO: %c to row %d col %d\n", + req->type, queue->row, queue->col); + */ + + db1_printf(("Going for %c to unit %d row %d col %d\n", + req->type, unit, queue->row, queue->col)); + db1_printf(("sector %d count %d (%d bytes) %d\n", + (int) req->sectorOffset, (int) req->numSector, + (int) (req->numSector << + queue->raidPtr->logBytesPerSector), + (int) queue->raidPtr->logBytesPerSector)); + if ((raidbp->rf_buf.b_flags & B_READ) == 0) { + raidbp->rf_buf.b_vp->v_numoutput++; + } + + VOP_STRATEGY(&raidbp->rf_buf); + + break; + + default: + panic("bad req->type in rf_DispatchKernelIO"); + } + db1_printf(("Exiting from DispatchKernelIO\n")); + return(0); +} + +/* this is the callback function associated with a I/O invoked from + kernel code. + */ +static void KernelWakeupFunc(vbp) + struct buf *vbp; +{ + RF_DiskQueueData_t *req = NULL; + RF_DiskQueue_t *queue; + struct raidbuf *raidbp = (struct raidbuf *)vbp; + struct buf *bp; + struct raid_softc *rs; + int unit; + register int s; + + s=splbio(); /* XXX */ + db1_printf(("recovering the request queue:\n")); + req = raidbp->req; + + bp = raidbp->rf_obp; +#if 0 + db1_printf(("bp=0x%x\n",bp)); +#endif + + queue = (RF_DiskQueue_t *) req->queue; + + if (raidbp->rf_buf.b_flags & B_ERROR) { +#if 0 + printf("Setting bp->b_flags!!! %d\n",raidbp->rf_buf.b_error); +#endif + bp->b_flags |= B_ERROR; + bp->b_error = raidbp->rf_buf.b_error ? + raidbp->rf_buf.b_error : EIO; + } + +#if 0 + db1_printf(("raidbp->rf_buf.b_bcount=%d\n",(int)raidbp->rf_buf.b_bcount)); + db1_printf(("raidbp->rf_buf.b_bufsize=%d\n",(int)raidbp->rf_buf.b_bufsize)); + db1_printf(("raidbp->rf_buf.b_resid=%d\n",(int)raidbp->rf_buf.b_resid)); + db1_printf(("raidbp->rf_buf.b_data=0x%x\n",raidbp->rf_buf.b_data)); +#endif + + /* XXX methinks this could be wrong... */ +#if 1 + bp->b_resid = raidbp->rf_buf.b_resid; +#endif + + if (req->tracerec) { + RF_ETIMER_STOP(req->tracerec->timer); + RF_ETIMER_EVAL(req->tracerec->timer); + RF_LOCK_MUTEX(rf_tracing_mutex); + req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer); + req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer); + req->tracerec->num_phys_ios++; + RF_UNLOCK_MUTEX(rf_tracing_mutex); + } + + bp->b_bcount = raidbp->rf_buf.b_bcount;/* XXXX ?? */ + + unit = queue->raidPtr->raidid; /* *Much* simpler :-> */ + + + /* XXX Ok, let's get aggressive... If B_ERROR is set, let's go ballistic, + and mark the component as hosed... */ +#if 1 + if (bp->b_flags&B_ERROR) { + /* Mark the disk as dead */ + /* but only mark it once... 
*/ + if (queue->raidPtr->Disks[queue->row][queue->col].status == + rf_ds_optimal) { + printf("raid%d: IO Error. Marking %s as failed.\n", + unit, queue->raidPtr->Disks[queue->row][queue->col].devname ); + queue->raidPtr->Disks[queue->row][queue->col].status = + rf_ds_failed; + queue->raidPtr->status[queue->row] = rf_rs_degraded; + queue->raidPtr->numFailures++; + } else { /* Disk is already dead... */ + /* printf("Disk already marked as dead!\n"); */ + } + + } +#endif + + rs = &raid_softc[unit]; + RAIDPUTBUF(rs,raidbp); + + + if (bp->b_resid==0) { + db1_printf(("Disk is no longer busy for this buffer... %d %ld %ld\n", + unit, bp->b_resid, bp->b_bcount)); + /* XXX is this the right place for a disk_unbusy()??!??!?!? */ + disk_unbusy(&rs->sc_dkdev, (bp->b_bcount - bp->b_resid)); + } else { + db1_printf(("b_resid is still %ld\n",bp->b_resid)); + } + + rf_DiskIOComplete(queue, req, (bp->b_flags & B_ERROR) ? 1 : 0); + (req->CompleteFunc)(req->argument, (bp->b_flags & B_ERROR) ? 1 : 0); + /* printf("Exiting KernelWakeupFunc\n"); */ + + splx(s); /* XXX */ +} + + + +/* + * initialize a buf structure for doing an I/O in the kernel. + */ +static void InitBP( + struct buf *bp, + struct vnode *b_vp, + unsigned rw_flag, + dev_t dev, + RF_SectorNum_t startSect, + RF_SectorCount_t numSect, + caddr_t buf, + void (*cbFunc)(struct buf *), + void *cbArg, + int logBytesPerSector, + struct proc *b_proc) +{ + /* bp->b_flags = B_PHYS | rw_flag; */ + bp->b_flags = B_CALL | rw_flag; /* XXX need B_PHYS here too??? */ + bp->b_bcount = numSect << logBytesPerSector; + bp->b_bufsize = bp->b_bcount; + bp->b_error = 0; + bp->b_dev = dev; + db1_printf(("bp->b_dev is %d\n", dev)); + bp->b_un.b_addr = buf; +#if 0 + db1_printf(("bp->b_data=0x%x\n",bp->b_data)); +#endif + + bp->b_blkno = startSect; + bp->b_resid = bp->b_bcount; /* XXX is this right!??!?!! */ + db1_printf(("b_bcount is: %d\n",(int)bp->b_bcount)); + if (bp->b_bcount == 0) { + panic("bp->b_bcount is zero in InitBP!!\n"); + } + bp->b_proc = b_proc; + bp->b_iodone = cbFunc; + bp->b_vp = b_vp; + +} +#endif /* KERNEL */ + +/* Extras... */ + +unsigned int rpcc() +{ + /* XXX no clue what this is supposed to do.. my guess is + that it's supposed to read the CPU cycle counter... */ + /* db1_printf("this is supposed to do something useful too!??\n"); */ + return(0); +} + +#if 0 +int rf_GetSpareTableFromDaemon(req) + RF_SparetWait_t *req; +{ + int retcode=1; + printf("This is supposed to do something useful!!\n"); /* XXX */ + + return(retcode); + +} +#endif + +static void +raidgetdefaultlabel(raidPtr, rs, lp) + RF_Raid_t *raidPtr; + struct raid_softc *rs; + struct disklabel *lp; +{ + db1_printf(("Building a default label...\n")); + bzero(lp, sizeof(*lp)); + + /* fabricate a label... 
*/ + lp->d_secperunit = raidPtr->totalSectors; + lp->d_secsize = raidPtr->bytesPerSector; + lp->d_nsectors = 1024 * (1024 / raidPtr->bytesPerSector); + lp->d_ntracks = 1; + lp->d_ncylinders = raidPtr->totalSectors / lp->d_nsectors; + lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors; + + strncpy(lp->d_typename, "raid", sizeof(lp->d_typename)); + lp->d_type = DTYPE_RAID; + strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname)); + lp->d_rpm = 3600; + lp->d_interleave = 1; + lp->d_flags = 0; + + lp->d_partitions[RAW_PART].p_offset = 0; + lp->d_partitions[RAW_PART].p_size = raidPtr->totalSectors; + lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED; + lp->d_npartitions = RAW_PART + 1; + + lp->d_magic = DISKMAGIC; + lp->d_magic2 = DISKMAGIC; + lp->d_checksum = dkcksum(rs->sc_dkdev.dk_label); + +} + +/* + * Read the disklabel from the raid device. If one is not present, fake one + * up. + */ +static void +raidgetdisklabel(dev) + dev_t dev; +{ + int unit = raidunit(dev); + struct raid_softc *rs = &raid_softc[unit]; + char *errstring; + struct disklabel *lp = rs->sc_dkdev.dk_label; + struct cpu_disklabel *clp = rs->sc_dkdev.dk_cpulabel; + RF_Raid_t *raidPtr; + + db1_printf(("Getting the disklabel...\n")); + + bzero(clp, sizeof(*clp)); + + raidPtr = raidPtrs[unit]; + + raidgetdefaultlabel(raidPtr, rs, lp); + + /* + * Call the generic disklabel extraction routine. + */ + errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy, + rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel); + if (errstring) + raidmakedisklabel(rs); + else { + int i; + struct partition *pp; + + /* + * Sanity check whether the found disklabel is valid. + * + * This is necessary since total size of the raid device + * may vary when an interleave is changed even though exactly + * same componets are used, and old disklabel may used + * if that is found. + */ + if (lp->d_secperunit != rs->sc_size) + printf("WARNING: %s: " + "total sector size in disklabel (%d) != " + "the size of raid (%d)\n", rs->sc_xname, + lp->d_secperunit, rs->sc_size); + for (i = 0; i < lp->d_npartitions; i++) { + pp = &lp->d_partitions[i]; + if (pp->p_offset + pp->p_size > rs->sc_size) + printf("WARNING: %s: end of partition `%c' " + "exceeds the size of raid (%d)\n", + rs->sc_xname, 'a' + i, rs->sc_size); + } + } + +} + +/* + * Take care of things one might want to take care of in the event + * that a disklabel isn't present. + */ +static void +raidmakedisklabel(rs) + struct raid_softc *rs; +{ + struct disklabel *lp = rs->sc_dkdev.dk_label; + db1_printf(("Making a label..\n")); + + /* + * For historical reasons, if there's no disklabel present + * the raw partition must be marked FS_BSDFFS. + */ + + lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS; + + strncpy(lp->d_packname, "default label", sizeof(lp->d_packname)); + + lp->d_checksum = dkcksum(lp); +} + +/* + * Lookup the provided name in the filesystem. If the file exists, + * is a valid block device, and isn't being used by anyone else, + * set *vpp to the file's vnode. 
+ * You'll find the original of this in ccd.c + */ +int +raidlookup(path, p, vpp) + char *path; + struct proc *p; + struct vnode **vpp; /* result */ +{ + struct nameidata nd; + struct vnode *vp; + struct vattr va; + int error; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, path, p); + if ((error = vn_open(&nd, FREAD|FWRITE, 0)) != 0) { +#ifdef DEBUG + printf("RAIDframe: vn_open returned %d\n",error); +#endif + return (error); + } + vp = nd.ni_vp; + if (vp->v_usecount > 1) { + VOP_UNLOCK(vp, 0); + (void)vn_close(vp, FREAD|FWRITE, p->p_ucred, p); + return (EBUSY); + } + if ((error = VOP_GETATTR(vp, &va, p->p_ucred, p)) != 0) { + VOP_UNLOCK(vp, 0); + (void)vn_close(vp, FREAD|FWRITE, p->p_ucred, p); + return (error); + } + /* XXX: eventually we should handle VREG, too. */ + if (va.va_type != VBLK) { + VOP_UNLOCK(vp, 0); + (void)vn_close(vp, FREAD|FWRITE, p->p_ucred, p); + return (ENOTBLK); + } + VOP_UNLOCK(vp, 0); + *vpp = vp; + return (0); +} + +/* + * Wait interruptibly for an exclusive lock. + * + * XXX + * Several drivers do this; it should be abstracted and made MP-safe. + * (Hmm... where have we seen this warning before :-> GO ) + */ +static int +raidlock(rs) + struct raid_softc *rs; +{ + int error; + + while ((rs->sc_flags & RAIDF_LOCKED) != 0) { + rs->sc_flags |= RAIDF_WANTED; + if ((error = + tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0) + return (error); + } + rs->sc_flags |= RAIDF_LOCKED; + return (0); +} + +/* + * Unlock and wake up any waiters. + */ +static void +raidunlock(rs) + struct raid_softc *rs; +{ + + rs->sc_flags &= ~RAIDF_LOCKED; + if ((rs->sc_flags & RAIDF_WANTED) != 0) { + rs->sc_flags &= ~RAIDF_WANTED; + wakeup(rs); + } +} diff --git a/sys/dev/raidframe/rf_nwayxor.c b/sys/dev/raidframe/rf_nwayxor.c new file mode 100644 index 00000000000..c319aa04c52 --- /dev/null +++ b/sys/dev/raidframe/rf_nwayxor.c @@ -0,0 +1,454 @@ +/* $OpenBSD: rf_nwayxor.c,v 1.1 1999/01/11 14:29:31 niklas Exp $ */ +/* $NetBSD: rf_nwayxor.c,v 1.1 1998/11/13 04:20:31 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland, Daniel Stodolsky + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/************************************************************ + * + * nwayxor.c -- code to do N-way xors for reconstruction + * + * nWayXorN xors N input buffers into the destination buffer. + * adapted from danner's longword_bxor code. 
+ * + ************************************************************/ + +/* : + * Log: rf_nwayxor.c,v + * Revision 1.6 1996/06/12 03:31:18 jimz + * only print call counts if rf_showXorCallCounts != 0 + * + * Revision 1.5 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). + * + * Revision 1.4 1996/06/02 17:31:48 jimz + * Moved a lot of global stuff into array structure, where it belongs. + * Fixed up paritylogging, pss modules in this manner. Some general + * code cleanup. Removed lots of dead code, some dead files. + * + * Revision 1.3 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.2 1995/12/01 19:29:14 root + * added copyright info + * + */ + +#include "rf_nwayxor.h" +#include "rf_shutdown.h" + +static int callcount[10]; +static void rf_ShutdownNWayXor(void *); + +static void rf_ShutdownNWayXor(ignored) + void *ignored; +{ + int i; + + if (rf_showXorCallCounts == 0) + return; + printf("Call counts for n-way xor routines: "); + for (i=0; i<10; i++) + printf("%d ",callcount[i]); + printf("\n"); +} + +int rf_ConfigureNWayXor(listp) + RF_ShutdownList_t **listp; +{ + int i, rc; + + for (i=0; i<10; i++) + callcount[i] = 0; + rc = rf_ShutdownCreate(listp, rf_ShutdownNWayXor, NULL); + return(rc); +} + +void rf_nWayXor1(src_rbs, dest_rb, len) + RF_ReconBuffer_t **src_rbs; + RF_ReconBuffer_t *dest_rb; + int len; +{ + register unsigned long *src = (unsigned long *) src_rbs[0]->buffer; + register unsigned long *dest= (unsigned long *) dest_rb->buffer; + register unsigned long *end = src+len; + register unsigned long d0, d1, d2, d3, s0, s1, s2, s3; + + callcount[1]++; + while (len >= 4 ) + { + d0 = dest[0]; + d1 = dest[1]; + d2 = dest[2]; + d3 = dest[3]; + s0 = src[0]; + s1 = src[1]; + s2 = src[2]; + s3 = src[3]; + dest[0] = d0 ^ s0; + dest[1] = d1 ^ s1; + dest[2] = d2 ^ s2; + dest[3] = d3 ^ s3; + src += 4; + dest += 4; + len -= 4; + } + while (src < end) {*dest++ ^= *src++;} +} + +void rf_nWayXor2(src_rbs, dest_rb, len) + RF_ReconBuffer_t **src_rbs; + RF_ReconBuffer_t *dest_rb; + int len; +{ + register unsigned long *dst = (unsigned long *) dest_rb->buffer; + register unsigned long *a = dst; + register unsigned long *b = (unsigned long *) src_rbs[0]->buffer; + register unsigned long *c = (unsigned long *) src_rbs[1]->buffer; + unsigned long a0,a1,a2,a3, b0,b1,b2,b3; + + callcount[2]++; + /* align dest to cache line */ + while ((((unsigned long) dst) & 0x1f)) + { + *dst++ = *a++ ^ *b++ ^ *c++; + len--; + } + while (len > 4 ) + { + a0 = a[0]; len -= 4; + + a1 = a[1]; + a2 = a[2]; + + a3 = a[3]; a += 4; + + b0 = b[0]; + b1 = b[1]; + + b2 = b[2]; + b3 = b[3]; + /* start dual issue */ + a0 ^= b0; b0 = c[0]; + + b += 4; a1 ^= b1; + + a2 ^= b2; a3 ^= b3; + + b1 = c[1]; a0 ^= b0; + + b2 = c[2]; a1 ^= b1; + + b3 = c[3]; a2 ^= b2; + + dst[0] = a0; a3 ^= b3; + dst[1] = a1; c += 4; + dst[2] = a2; + dst[3] = a3; dst += 4; + } + while (len) + { + *dst++ = *a++ ^ *b++ ^ *c++; + len--; + } +} + +/* note that first arg is not incremented but 2nd arg is */ +#define LOAD_FIRST(_dst,_b) \ + a0 = _dst[0]; len -= 4; \ + a1 = _dst[1]; \ + a2 = _dst[2]; \ + a3 = _dst[3]; \ + b0 = _b[0]; \ + b1 = _b[1]; \ + b2 = _b[2]; \ + b3 = _b[3]; _b += 4; + +/* note: arg is incremented */ +#define XOR_AND_LOAD_NEXT(_n) \ + a0 ^= b0; b0 = _n[0]; \ + a1 ^= 
b1; b1 = _n[1]; \ + a2 ^= b2; b2 = _n[2]; \ + a3 ^= b3; b3 = _n[3]; \ + _n += 4; + +/* arg is incremented */ +#define XOR_AND_STORE(_dst) \ + a0 ^= b0; _dst[0] = a0; \ + a1 ^= b1; _dst[1] = a1; \ + a2 ^= b2; _dst[2] = a2; \ + a3 ^= b3; _dst[3] = a3; \ + _dst += 4; + + +void rf_nWayXor3(src_rbs, dest_rb, len) + RF_ReconBuffer_t **src_rbs; + RF_ReconBuffer_t *dest_rb; + int len; +{ + register unsigned long *dst = (unsigned long *) dest_rb->buffer; + register unsigned long *b = (unsigned long *) src_rbs[0]->buffer; + register unsigned long *c = (unsigned long *) src_rbs[1]->buffer; + register unsigned long *d = (unsigned long *) src_rbs[2]->buffer; + unsigned long a0,a1,a2,a3, b0,b1,b2,b3; + + callcount[3]++; + /* align dest to cache line */ + while ((((unsigned long) dst) & 0x1f)) { + *dst++ ^= *b++ ^ *c++ ^ *d++; + len--; + } + while (len > 4 ) { + LOAD_FIRST(dst,b); + XOR_AND_LOAD_NEXT(c); + XOR_AND_LOAD_NEXT(d); + XOR_AND_STORE(dst); + } + while (len) { + *dst++ ^= *b++ ^ *c++ ^ *d++; + len--; + } +} + +void rf_nWayXor4(src_rbs, dest_rb, len) + RF_ReconBuffer_t **src_rbs; + RF_ReconBuffer_t *dest_rb; + int len; +{ + register unsigned long *dst = (unsigned long *) dest_rb->buffer; + register unsigned long *b = (unsigned long *) src_rbs[0]->buffer; + register unsigned long *c = (unsigned long *) src_rbs[1]->buffer; + register unsigned long *d = (unsigned long *) src_rbs[2]->buffer; + register unsigned long *e = (unsigned long *) src_rbs[3]->buffer; + unsigned long a0,a1,a2,a3, b0,b1,b2,b3; + + callcount[4]++; + /* align dest to cache line */ + while ((((unsigned long) dst) & 0x1f)) { + *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++; + len--; + } + while (len > 4 ) { + LOAD_FIRST(dst,b); + XOR_AND_LOAD_NEXT(c); + XOR_AND_LOAD_NEXT(d); + XOR_AND_LOAD_NEXT(e); + XOR_AND_STORE(dst); + } + while (len) { + *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++; + len--; + } +} + +void rf_nWayXor5(src_rbs, dest_rb, len) + RF_ReconBuffer_t **src_rbs; + RF_ReconBuffer_t *dest_rb; + int len; +{ + register unsigned long *dst = (unsigned long *) dest_rb->buffer; + register unsigned long *b = (unsigned long *) src_rbs[0]->buffer; + register unsigned long *c = (unsigned long *) src_rbs[1]->buffer; + register unsigned long *d = (unsigned long *) src_rbs[2]->buffer; + register unsigned long *e = (unsigned long *) src_rbs[3]->buffer; + register unsigned long *f = (unsigned long *) src_rbs[4]->buffer; + unsigned long a0,a1,a2,a3, b0,b1,b2,b3; + + callcount[5]++; + /* align dest to cache line */ + while ((((unsigned long) dst) & 0x1f)) { + *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++; + len--; + } + while (len > 4 ) { + LOAD_FIRST(dst,b); + XOR_AND_LOAD_NEXT(c); + XOR_AND_LOAD_NEXT(d); + XOR_AND_LOAD_NEXT(e); + XOR_AND_LOAD_NEXT(f); + XOR_AND_STORE(dst); + } + while (len) { + *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++; + len--; + } +} + +void rf_nWayXor6(src_rbs, dest_rb, len) + RF_ReconBuffer_t **src_rbs; + RF_ReconBuffer_t *dest_rb; + int len; +{ + register unsigned long *dst = (unsigned long *) dest_rb->buffer; + register unsigned long *b = (unsigned long *) src_rbs[0]->buffer; + register unsigned long *c = (unsigned long *) src_rbs[1]->buffer; + register unsigned long *d = (unsigned long *) src_rbs[2]->buffer; + register unsigned long *e = (unsigned long *) src_rbs[3]->buffer; + register unsigned long *f = (unsigned long *) src_rbs[4]->buffer; + register unsigned long *g = (unsigned long *) src_rbs[5]->buffer; + unsigned long a0,a1,a2,a3, b0,b1,b2,b3; + + callcount[6]++; + /* align dest to cache line */ + while ((((unsigned long) dst) & 
0x1f)) { + *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++ ^ *g++; + len--; + } + while (len > 4 ) { + LOAD_FIRST(dst,b); + XOR_AND_LOAD_NEXT(c); + XOR_AND_LOAD_NEXT(d); + XOR_AND_LOAD_NEXT(e); + XOR_AND_LOAD_NEXT(f); + XOR_AND_LOAD_NEXT(g); + XOR_AND_STORE(dst); + } + while (len) { + *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++ ^ *g++; + len--; + } +} + +void rf_nWayXor7(src_rbs, dest_rb, len) + RF_ReconBuffer_t **src_rbs; + RF_ReconBuffer_t *dest_rb; + int len; +{ + register unsigned long *dst = (unsigned long *) dest_rb->buffer; + register unsigned long *b = (unsigned long *) src_rbs[0]->buffer; + register unsigned long *c = (unsigned long *) src_rbs[1]->buffer; + register unsigned long *d = (unsigned long *) src_rbs[2]->buffer; + register unsigned long *e = (unsigned long *) src_rbs[3]->buffer; + register unsigned long *f = (unsigned long *) src_rbs[4]->buffer; + register unsigned long *g = (unsigned long *) src_rbs[5]->buffer; + register unsigned long *h = (unsigned long *) src_rbs[6]->buffer; + unsigned long a0,a1,a2,a3, b0,b1,b2,b3; + + callcount[7]++; + /* align dest to cache line */ + while ((((unsigned long) dst) & 0x1f)) { + *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++ ^ *g++ ^ *h++; + len--; + } + while (len > 4 ) { + LOAD_FIRST(dst,b); + XOR_AND_LOAD_NEXT(c); + XOR_AND_LOAD_NEXT(d); + XOR_AND_LOAD_NEXT(e); + XOR_AND_LOAD_NEXT(f); + XOR_AND_LOAD_NEXT(g); + XOR_AND_LOAD_NEXT(h); + XOR_AND_STORE(dst); + } + while (len) { + *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++ ^ *g++ ^ *h++; + len--; + } +} + +void rf_nWayXor8(src_rbs, dest_rb, len) + RF_ReconBuffer_t **src_rbs; + RF_ReconBuffer_t *dest_rb; + int len; +{ + register unsigned long *dst = (unsigned long *) dest_rb->buffer; + register unsigned long *b = (unsigned long *) src_rbs[0]->buffer; + register unsigned long *c = (unsigned long *) src_rbs[1]->buffer; + register unsigned long *d = (unsigned long *) src_rbs[2]->buffer; + register unsigned long *e = (unsigned long *) src_rbs[3]->buffer; + register unsigned long *f = (unsigned long *) src_rbs[4]->buffer; + register unsigned long *g = (unsigned long *) src_rbs[5]->buffer; + register unsigned long *h = (unsigned long *) src_rbs[6]->buffer; + register unsigned long *i = (unsigned long *) src_rbs[7]->buffer; + unsigned long a0,a1,a2,a3, b0,b1,b2,b3; + + callcount[8]++; + /* align dest to cache line */ + while ((((unsigned long) dst) & 0x1f)) { + *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++ ^ *g++ ^ *h++ ^ *i++; + len--; + } + while (len > 4 ) { + LOAD_FIRST(dst,b); + XOR_AND_LOAD_NEXT(c); + XOR_AND_LOAD_NEXT(d); + XOR_AND_LOAD_NEXT(e); + XOR_AND_LOAD_NEXT(f); + XOR_AND_LOAD_NEXT(g); + XOR_AND_LOAD_NEXT(h); + XOR_AND_LOAD_NEXT(i); + XOR_AND_STORE(dst); + } + while (len) { + *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++ ^ *g++ ^ *h++ ^ *i++; + len--; + } +} + + +void rf_nWayXor9(src_rbs, dest_rb, len) + RF_ReconBuffer_t **src_rbs; + RF_ReconBuffer_t *dest_rb; + int len; +{ + register unsigned long *dst = (unsigned long *) dest_rb->buffer; + register unsigned long *b = (unsigned long *) src_rbs[0]->buffer; + register unsigned long *c = (unsigned long *) src_rbs[1]->buffer; + register unsigned long *d = (unsigned long *) src_rbs[2]->buffer; + register unsigned long *e = (unsigned long *) src_rbs[3]->buffer; + register unsigned long *f = (unsigned long *) src_rbs[4]->buffer; + register unsigned long *g = (unsigned long *) src_rbs[5]->buffer; + register unsigned long *h = (unsigned long *) src_rbs[6]->buffer; + register unsigned long *i = (unsigned long *) src_rbs[7]->buffer; + register unsigned 
long *j = (unsigned long *) src_rbs[8]->buffer; + unsigned long a0,a1,a2,a3, b0,b1,b2,b3; + + callcount[9]++; + /* align dest to cache line */ + while ((((unsigned long) dst) & 0x1f)) { + *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++ ^ *g++ ^ *h++ ^ *i++ ^ *j++; + len--; + } + while (len > 4 ) { + LOAD_FIRST(dst,b); + XOR_AND_LOAD_NEXT(c); + XOR_AND_LOAD_NEXT(d); + XOR_AND_LOAD_NEXT(e); + XOR_AND_LOAD_NEXT(f); + XOR_AND_LOAD_NEXT(g); + XOR_AND_LOAD_NEXT(h); + XOR_AND_LOAD_NEXT(i); + XOR_AND_LOAD_NEXT(j); + XOR_AND_STORE(dst); + } + while (len) { + *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++ ^ *g++ ^ *h++ ^ *i++ ^ *j++; + len--; + } +} diff --git a/sys/dev/raidframe/rf_nwayxor.h b/sys/dev/raidframe/rf_nwayxor.h new file mode 100644 index 00000000000..f474dff9908 --- /dev/null +++ b/sys/dev/raidframe/rf_nwayxor.h @@ -0,0 +1,75 @@ +/* $OpenBSD: rf_nwayxor.h,v 1.1 1999/01/11 14:29:31 niklas Exp $ */ +/* $NetBSD: rf_nwayxor.h,v 1.1 1998/11/13 04:20:31 oster Exp $ */ +/* + * rf_nwayxor.h + */ +/* + * Copyright (c) 1996 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Jim Zelenka + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ +/* + * rf_nwayxor.h -- types and prototypes for nwayxor module + */ +/* + * : + * Log: rf_nwayxor.h,v + * Revision 1.4 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). + * + * Revision 1.3 1996/06/02 17:31:48 jimz + * Moved a lot of global stuff into array structure, where it belongs. + * Fixed up paritylogging, pss modules in this manner. Some general + * code cleanup. Removed lots of dead code, some dead files. 
+ * + * Revision 1.2 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.1 1996/05/18 19:56:47 jimz + * Initial revision + * + */ + +#ifndef _RF__RF_NWAYXOR_H_ +#define _RF__RF_NWAYXOR_H_ + +#include "rf_types.h" +#include "rf_raid.h" +#include "rf_reconstruct.h" + +int rf_ConfigureNWayXor(RF_ShutdownList_t **listp); +void rf_nWayXor1(RF_ReconBuffer_t **src_rbs, RF_ReconBuffer_t *dest_rb, int len); +void rf_nWayXor2(RF_ReconBuffer_t **src_rbs, RF_ReconBuffer_t *dest_rb, int len); +void rf_nWayXor3(RF_ReconBuffer_t **src_rbs, RF_ReconBuffer_t *dest_rb, int len); +void rf_nWayXor4(RF_ReconBuffer_t **src_rbs, RF_ReconBuffer_t *dest_rb, int len); +void rf_nWayXor5(RF_ReconBuffer_t **src_rbs, RF_ReconBuffer_t *dest_rb, int len); +void rf_nWayXor6(RF_ReconBuffer_t **src_rbs, RF_ReconBuffer_t *dest_rb, int len); +void rf_nWayXor7(RF_ReconBuffer_t **src_rbs, RF_ReconBuffer_t *dest_rb, int len); +void rf_nWayXor8(RF_ReconBuffer_t **src_rbs, RF_ReconBuffer_t *dest_rb, int len); +void rf_nWayXor9(RF_ReconBuffer_t **src_rbs, RF_ReconBuffer_t *dest_rb, int len); + +#endif /* !_RF__RF_NWAYXOR_H_ */ diff --git a/sys/dev/raidframe/rf_openbsd.h b/sys/dev/raidframe/rf_openbsd.h new file mode 100644 index 00000000000..5e34e977c91 --- /dev/null +++ b/sys/dev/raidframe/rf_openbsd.h @@ -0,0 +1,94 @@ +/* $OpenBSD: rf_openbsd.h,v 1.1 1999/01/11 14:29:32 niklas Exp $ */ + +/*- + * Copyright (c) 1996, 1997, 1998 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Greg Oster + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the NetBSD + * Foundation, Inc. and its contributors. + * 4. Neither the name of The NetBSD Foundation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/*- + * Copyright (c) 1996, 1997, 1998 The NetBSD Foundation, Inc. + * All rights reserved. 
+ * + * This code is derived from software contributed to The NetBSD Foundation + * by Jason R. Thorpe. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the NetBSD + * Foundation, Inc. and its contributors. + * 4. Neither the name of The NetBSD Foundation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + + +#ifndef _RF__RF_OPENBSD_H_ +#define _RF__RF_OPENBSD_H_ + +#include <sys/fcntl.h> +#include <sys/systm.h> +#include <sys/namei.h> +#include <sys/vnode.h> + +struct raidcinfo { + struct vnode *ci_vp; /* device's vnode */ + dev_t ci_dev; /* XXX: device's dev_t */ +#if 0 + size_t ci_size; /* size */ + char *ci_path; /* path to component */ + size_t ci_pathlen; /* length of component path */ +#endif +}; + +#endif /* _RF__RF_OPENBSD_H_ */ diff --git a/sys/dev/raidframe/rf_openbsdkintf.c b/sys/dev/raidframe/rf_openbsdkintf.c new file mode 100644 index 00000000000..55b7cfbcca4 --- /dev/null +++ b/sys/dev/raidframe/rf_openbsdkintf.c @@ -0,0 +1,2033 @@ +/* $OpenBSD: rf_openbsdkintf.c,v 1.1 1999/01/11 14:29:32 niklas Exp $ */ + +/*- + * Copyright (c) 1996, 1997, 1998 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Greg Oster; Jason R. Thorpe. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the NetBSD + * Foundation, Inc. and its contributors. + * 4. 
Neither the name of The NetBSD Foundation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright (c) 1988 University of Utah. + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Systems Programming Group of the University of Utah Computer + * Science Department. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: Utah $Hdr: cd.c 1.6 90/11/28$ + * + * @(#)cd.c 8.2 (Berkeley) 11/16/93 + */ + + + + +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. 
+ * + * Authors: Mark Holland, Jim Zelenka + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/*********************************************************** + * + * rf_kintf.c -- the kernel interface routines for RAIDframe + * + ***********************************************************/ +/* + * : + * Log: rf_kintf.c,v + * Revision 1.57 1996/07/19 16:12:20 jimz + * remove addition of protectedSectors in InitBP- it's already + * done in the diskqueue code + * + * Revision 1.56 1996/07/17 21:00:58 jimz + * clean up timer interface, tracing + * + * Revision 1.55 1996/06/17 03:00:54 jimz + * Change RAIDFRAME_GET_INFO interface to do its own copyout() + * (because size of device config structure now exceeds 8k) + * + * Revision 1.54 1996/06/09 02:36:46 jimz + * lots of little crufty cleanup- fixup whitespace + * issues, comment #ifdefs, improve typing in some + * places (esp size-related) + * + * Revision 1.53 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.52 1996/06/06 17:28:08 jimz + * track sector number of last I/O dequeued + * + * Revision 1.51 1996/06/05 18:06:02 jimz + * Major code cleanup. The Great Renaming is now done. + * Better modularity. Better typing. Fixed a bunch of + * synchronization bugs. Made a lot of global stuff + * per-desc or per-array. Removed dead code. + * + * Revision 1.50 1996/06/03 23:28:26 jimz + * more bugfixes + * check in tree to sync for IPDS runs with current bugfixes + * there still may be a problem with threads in the script test + * getting I/Os stuck- not trivially reproducible (runs ~50 times + * in a row without getting stuck) + * + * Revision 1.49 1996/06/02 17:31:48 jimz + * Moved a lot of global stuff into array structure, where it belongs. + * Fixed up paritylogging, pss modules in this manner. Some general + * code cleanup. Removed lots of dead code, some dead files. + * + * Revision 1.48 1996/05/31 22:26:54 jimz + * fix a lot of mapping problems, memory allocation problems + * found some weird lock issues, fixed 'em + * more code cleanup + * + * Revision 1.47 1996/05/30 12:59:18 jimz + * make etimer happier, more portable + * + * Revision 1.46 1996/05/30 11:29:41 jimz + * Numerous bug fixes. Stripe lock release code disagreed with the taking code + * about when stripes should be locked (I made it consistent: no parity, no lock) + * There was a lot of extra serialization of I/Os which I've removed- a lot of + * it was to calculate values for the cache code, which is no longer with us. + * More types, function, macro cleanup. Added code to properly quiesce the array + * on shutdown. 
Made a lot of stuff array-specific which was (bogusly) general + * before. Fixed memory allocation, freeing bugs. + * + * Revision 1.45 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.44 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.43 1996/05/24 01:59:45 jimz + * another checkpoint in code cleanup for release + * time to sync kernel tree + * + * Revision 1.42 1996/05/23 22:17:54 jimz + * fix sector size hardcoding problems + * + * Revision 1.41 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.40 1996/05/23 13:18:07 jimz + * tracing_mutex -> rf_tracing_mutex + * + * Revision 1.39 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.38 1996/05/20 16:15:32 jimz + * switch to rf_{mutex,cond}_{init,destroy} + * + * Revision 1.37 1996/05/10 16:23:47 jimz + * RF_offset -> RF_Offset + * + * Revision 1.36 1996/05/08 21:01:24 jimz + * fixed up enum type names that were conflicting with other + * enums and function names (ie, "panic") + * future naming trends will be towards RF_ and rf_ for + * everything raidframe-related + * + * Revision 1.35 1996/05/03 19:10:48 jimz + * change sanity checking for bogus I/Os to return more appropriate + * values (to make some user-level utilities happer with RAIDframe) + * + * Revision 1.34 1996/05/02 22:17:00 jimz + * When using DKUSAGE, send a bogus IO after configuring to let DKUSAGE know + * that we exist. This will let user-level programs doing group stats on the + * RF device function without error before RF gets its first IO + * + * Changed rf_device_config devs and spares fields to RF_RaidDisk_t + * + * Inc numOutstanding for the disk queue in rf_DispatchKernelIO if + * type is IO_TYPE_NOP. I'm not sure this is right, but it seems to be, + * because the disk IO completion routine wants to dec it, and doesn't + * care if there was no such IO. + * + * Revision 1.33 1996/05/02 15:05:44 jimz + * for now, rf_DoAccessKernel will reject non-sector-sized I/Os + * eventually, it should do something more clever... + * (and do it in DoAccess(), not just DoAccessKernel()) + * + * Revision 1.32 1996/05/01 16:28:39 jimz + * get rid of uses of ccmn_ functions + * + * Revision 1.31 1996/05/01 15:42:17 jimz + * ccmn_* memory management is on the way out. This is an archival checkpoint- + * both the old and new code are in place (all the ccmn_ calls are #if 0). After + * this, the ccmn_ code will no longer appear. 
+ * + * Revision 1.30 1996/04/22 15:53:13 jimz + * MAX_RAIDS -> NRAIDFRAME + * + * Revision 1.29 1995/12/12 18:10:06 jimz + * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT + * fix 80-column brain damage in comments + * + * Revision 1.28 1995/12/01 19:11:01 root + * added copyright info + * + * Revision 1.27 1995/11/28 18:56:40 wvcii + * disabled buffer copy in rf_write + * + * Revision 1.26 1995/10/06 16:37:08 jimz + * get struct bufs from ubc, not cam + * copy all write data, and operate on copy + * (temporary hack to get around dags in PQ that want + * to Xor into user write buffers) + * + * Revision 1.25 1995/09/30 22:23:08 jimz + * do not require raid to be active to perform ACCTOTAL ioctl + * + * Revision 1.24 1995/09/30 20:39:08 jimz + * added new ioctls: + * RAIDFRAME_RESET_ACCTOTALS + * RAIDFRAME_GET_ACCTOTALS + * RAIDFRAME_KEEP_ACCTOTALS + * + * Revision 1.23 1995/09/20 21:11:59 jimz + * include dfstrace.h in KERNEL block + * (even though it's a kernel-only file, this makes the depend process + * at user-level happy. Why the user-level Makefile wants to depend + * kintf.c is less clear, but this is a workaround). + * + * Revision 1.22 1995/09/19 23:19:03 jimz + * added DKUSAGE support + * + */ + +#ifdef _KERNEL +#define KERNEL +#endif + +#ifdef KERNEL + +#include <sys/errno.h> + +#include "raid.h" +#include <sys/param.h> +#include <sys/malloc.h> +#include <sys/queue.h> +#include <sys/disk.h> +#include <sys/device.h> +#include <sys/stat.h> +#include <sys/ioctl.h> +#include <sys/fcntl.h> +#include <sys/systm.h> +#include <sys/namei.h> +#include <sys/conf.h> +#include <sys/lock.h> +#include <sys/buf.h> +#include <sys/user.h> + +#include "rf_raid.h" +#include "rf_raidframe.h" +#include "rf_dag.h" +#include "rf_dagflags.h" +#include "rf_diskqueue.h" +#include "rf_acctrace.h" +#include "rf_etimer.h" +#include "rf_general.h" +#include "rf_debugMem.h" +#include "rf_kintf.h" +#include "rf_options.h" +#include "rf_driver.h" +#include "rf_parityscan.h" +#include "rf_debugprint.h" +#include "rf_threadstuff.h" + +int rf_kdebug_level = 0; + +#define RFK_BOOT_NONE 0 +#define RFK_BOOT_GOOD 1 +#define RFK_BOOT_BAD 2 +static int rf_kbooted = RFK_BOOT_NONE; + +#ifdef RAIDDEBUG +#define db0_printf(a) printf a +#define db_printf(a) do if (rf_kdebug_level > 0) printf a; while(0) +#define db1_printf(a) do if (rf_kdebug_level > 0) printf a; while(0) +#define db2_printf(a) do if (rf_kdebug_level > 1) printf a; while(0) +#define db3_printf(a) do if (rf_kdebug_level > 2) printf a; while(0) +#define db4_printf(a) do if (rf_kdebug_level > 3) printf a; while(0) +#define db5_printf(a) do if (rf_kdebug_level > 4) printf a; while(0) +#else /* RAIDDEBUG */ +#define db0_printf(a) printf a +#define db1_printf(a) (void)0 +#define db2_printf(a) (void)0 +#define db3_printf(a) (void)0 +#define db4_printf(a) (void)0 +#define db5_printf(a) (void)0 +#endif /* RAIDDEBUG */ + +static RF_Raid_t **raidPtrs; /* global raid device descriptors */ + +static int rf_pending_testaccs; + +RF_DECLARE_STATIC_MUTEX(rf_sparet_wait_mutex) +RF_DECLARE_STATIC_MUTEX(rf_async_done_q_mutex) + +/* requests to install a spare table */ +static RF_SparetWait_t *rf_sparet_wait_queue; + +/* responses from installation process */ +static RF_SparetWait_t *rf_sparet_resp_queue; +static struct rf_test_acc *rf_async_done_qh, *rf_async_done_qt; + +/* used to communicate reconstruction requests */ +static struct rf_recon_req *recon_queue = NULL; + +decl_simple_lock_data(,recon_queue_mutex) + +#define LOCK_RECON_Q_MUTEX() simple_lock(&recon_queue_mutex) 
+#define UNLOCK_RECON_Q_MUTEX() simple_unlock(&recon_queue_mutex) + +/* prototypes */ +void rf_KernelWakeupFunc __P((struct buf *)); +void rf_InitBP __P((struct buf *, struct vnode *, unsigned, dev_t, + RF_SectorNum_t, RF_SectorCount_t, caddr_t, void (*)(struct buf *), + void *, int, struct proc *)); + +/* this is so that we can compile under 2.0 as well as 3.2 */ +#ifndef proc_to_task +#define proc_to_task(x) ((x)->task) +#endif /* !proc_to_task */ + +void raidattach __P((int)); +int raidsize __P((dev_t)); + +void rf_DiskIOComplete(RF_DiskQueue_t *, RF_DiskQueueData_t *, int); +void rf_CopybackReconstructedData(RF_Raid_t *raidPtr); +int raidinit __P((dev_t,RF_Raid_t *,int)); + +int raidopen __P((dev_t, int, int, struct proc *)); +int raidclose __P((dev_t, int, int, struct proc *)); +int raidioctl __P((dev_t, u_long, caddr_t, int, struct proc *)); +int raidwrite __P((dev_t, struct uio *, int)); +int raidread __P((dev_t, struct uio *, int)); +void raidstrategy __P((struct buf *)); +int raiddump __P((dev_t, daddr_t, caddr_t, size_t)); + +/* + * Pilfered from ccd.c + */ +struct raidbuf { + struct buf rf_buf; /* new I/O buf. MUST BE FIRST!!! */ + struct buf *rf_obp; /* ptr. to original I/O buf */ + int rf_flags; /* misc. flags */ + RF_DiskQueueData_t *req; /* the request that this was part of.. */ +}; + +#define RAIDGETBUF() malloc(sizeof (struct raidbuf), M_DEVBUF, M_NOWAIT) +#define RAIDPUTBUF(buf) free(buf, M_DEVBUF) + +/* + * XXX Not sure if the following should be replacing the raidPtrs above, + * or if it should be used in conjunction with that... + */ +struct raid_softc { + int sc_unit; /* logical unit number */ + int sc_flags; /* flags */ + int sc_cflags; /* configuration flags */ + size_t sc_size; /* size of the raid device */ + dev_t sc_dev; /* our device..*/ + char sc_xname[20]; /* XXX external name */ + struct disk sc_dkdev; /* generic disk device info */ +}; + +/* sc_flags */ +#define RAIDF_INITED 0x01 /* unit has been initialized */ +#define RAIDF_WLABEL 0x02 /* label area is writable */ +#define RAIDF_LABELLING 0x04 /* unit is currently being labelled */ +#define RAIDF_WANTED 0x40 /* someone is waiting to obtain a lock */ +#define RAIDF_LOCKED 0x80 /* unit is locked */ + +#define raidunit(x) DISKUNIT(x) +static int numraid=0; + +#define RAIDLABELDEV(dev) \ + (MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART)) + +/* declared here, and made public, for the benefit of KVM stuff.. */ +struct raid_softc *raid_softc; + +void raidgetdefaultlabel + __P((RF_Raid_t *, struct raid_softc *, struct disklabel *)); +void raidgetdisklabel __P((dev_t)); +void raidmakedisklabel __P((struct raid_softc *)); + +int raidlock __P((struct raid_softc *)); +void raidunlock __P((struct raid_softc *)); +int raidlookup __P((char *, struct proc *p, struct vnode **)); + + +void +raidattach(num) + int num; +{ + int raidID; + + db1_printf(("raidattach: Asked for %d units\n", num)); + + if (num <= 0) { +#ifdef DIAGNOSTIC + panic("raidattach: count <= 0"); +#endif + return; + } + + /* + * This is where all the initialization stuff gets done. + */ + + /* Make some space for requested number of units... */ + RF_Calloc(raidPtrs, num, sizeof(RF_Raid_t *), (RF_Raid_t **)); + if (raidPtrs == NULL) { + panic("raidPtrs is NULL!!\n"); + } + + rf_kbooted = rf_boot(); + if (rf_kbooted) { + panic("Serious error booting RAID!!\n"); + } + + rf_kbooted = RFK_BOOT_GOOD; + + /* + put together some datastructures like the CCD device does.. + This lets us lock the device and what-not when it gets opened. 
+ */ + + raid_softc = (struct raid_softc *) + malloc(num * sizeof(struct raid_softc), + M_DEVBUF, M_NOWAIT); + if (raid_softc == NULL) { + printf("WARNING: no memory for RAIDframe driver\n"); + return; + } + numraid = num; + bzero(raid_softc, num * sizeof(struct raid_softc)); + + for(raidID=0;raidID < num;raidID++) { + RF_Calloc(raidPtrs[raidID], 1, sizeof(RF_Raid_t), + (RF_Raid_t *)); + if (raidPtrs[raidID]==NULL) { + printf("raidPtrs[%d] is NULL\n",raidID); + } + } +} + +int +raidsize(dev) + dev_t dev; +{ + struct raid_softc *rs; + struct disklabel *lp; + int part, unit, omask, size; + + unit = raidunit(dev); + if (unit >= numraid) + return (-1); + rs = &raid_softc[unit]; + + if ((rs->sc_flags & RAIDF_INITED) == 0) + return (-1); + + part = DISKPART(dev); + omask = rs->sc_dkdev.dk_openmask & (1 << part); + lp = rs->sc_dkdev.dk_label; + + if (omask == 0 && raidopen(dev, 0, S_IFBLK, curproc)) + return (-1); + + if (lp->d_partitions[part].p_fstype != FS_SWAP) + size = -1; + else + size = lp->d_partitions[part].p_size * + (lp->d_secsize / DEV_BSIZE); + + if (omask == 0 && raidclose(dev, 0, S_IFBLK, curproc)) + return (-1); + + return (size); + +} + +int +raiddump(dev, blkno, va, size) + dev_t dev; + daddr_t blkno; + caddr_t va; + size_t size; +{ + /* Not implemented. */ + return (ENXIO); +} + +/* ARGSUSED */ +int +raidopen(dev, flags, fmt, p) + dev_t dev; + int flags, fmt; + struct proc *p; +{ + int unit = raidunit(dev); + struct raid_softc *rs; + struct disklabel *lp; + int part,pmask; + unsigned int raidID; + int rc; + int error = 0; + + /* + * XXX This whole next chunk of code is somewhat suspect... Not sure + * it's needed here at all. + */ + if (rf_kbooted == RFK_BOOT_NONE) { + printf("Doing restart on raidopen.\n"); + rf_kbooted = RFK_BOOT_GOOD; + rc = rf_boot(); + if (rc) { + rf_kbooted = RFK_BOOT_BAD; + printf("Someone is unhappy...\n"); + return (rc); + } + } + + if (unit >= numraid) + return (ENXIO); + rs = &raid_softc[unit]; + + if ((error = raidlock(rs)) != 0) + return (error); + lp = rs->sc_dkdev.dk_label; + + raidID = raidunit(dev); + + part = DISKPART(dev); + pmask = (1 << part); + + db1_printf( + ("Opening raid device number: %d partition: %d\n", raidID, part)); + + + if ((rs->sc_flags & RAIDF_INITED) && (rs->sc_dkdev.dk_openmask == 0)) + raidgetdisklabel(dev); + + /* make sure that this partition exists */ + + if (part != RAW_PART) { + db1_printf(("Not a raw partition..\n")); + if (((rs->sc_flags & RAIDF_INITED) == 0) || + ((part >= lp->d_npartitions) || + (lp->d_partitions[part].p_fstype == FS_UNUSED))) { + error = ENXIO; + raidunlock(rs); + db1_printf(("Bailing out...\n")); + return (error); + } + } + + /* Prevent this unit from being unconfigured while open. */ + switch (fmt) { + case S_IFCHR: + rs->sc_dkdev.dk_copenmask |= pmask; + break; + + case S_IFBLK: + rs->sc_dkdev.dk_bopenmask |= pmask; + break; + } + rs->sc_dkdev.dk_openmask = + rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask; + + raidunlock(rs); + + return (error); +} + +/* ARGSUSED */ +int +raidclose(dev, flags, fmt, p) + dev_t dev; + int flags, fmt; + struct proc *p; +{ + int unit = raidunit(dev); + struct raid_softc *rs; + int error = 0; + int part; + + if (unit >= numraid) + return (ENXIO); + rs = &raid_softc[unit]; + + if ((error = raidlock(rs)) != 0) + return (error); + + part = DISKPART(dev); + + /* ...that much closer to allowing unconfiguration... 
*/ + switch (fmt) { + case S_IFCHR: + rs->sc_dkdev.dk_copenmask &= ~(1 << part); + break; + + case S_IFBLK: + rs->sc_dkdev.dk_bopenmask &= ~(1 << part); + break; + } + rs->sc_dkdev.dk_openmask = + rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask; + + raidunlock(rs); + return (0); +} + +void +raidstrategy(bp) + struct buf *bp; +{ + int s; + + unsigned int raidID = raidunit(bp->b_dev); + RF_Raid_t *raidPtr; + struct raid_softc *rs = &raid_softc[raidID]; + struct disklabel *lp; + int wlabel; + + db1_printf(("Strategy: 0x%x 0x%x\n", bp, bp->b_data)); + db1_printf(("Strategy(2): bp->b_bufsize %d\n", (int)bp->b_bufsize)); + db1_printf(("bp->b_count=%d\n", (int)bp->b_bcount)); + db1_printf(("bp->b_resid=%d\n", (int)bp->b_resid)); + db1_printf(("bp->b_blkno=%d\n", (int)bp->b_blkno)); + + if (bp->b_flags & B_READ) + db1_printf(("READ\n")); + else + db1_printf(("WRITE\n")); + + if (rf_kbooted != RFK_BOOT_GOOD) + return; + if (raidID >= numraid || !raidPtrs[raidID]) { + bp->b_error = ENODEV; + bp->b_flags |= B_ERROR; + bp->b_resid = bp->b_bcount; + biodone(bp); + return; + } + raidPtr = raidPtrs[raidID]; + if (!raidPtr->valid) { + bp->b_error = ENODEV; + bp->b_flags |= B_ERROR; + bp->b_resid = bp->b_bcount; + biodone(bp); + return; + } + if (bp->b_bcount == 0) { + db1_printf(("b_bcount is zero..\n")); + biodone(bp); + return; + } + lp = rs->sc_dkdev.dk_label; + + /* + * Do bounds checking and adjust transfer. If there's an + * error, the bounds check will flag that for us. + */ + wlabel = rs->sc_flags & (RAIDF_WLABEL|RAIDF_LABELLING); + if (DISKPART(bp->b_dev) != RAW_PART) + if (bounds_check_with_label(bp, lp, rs->sc_dkdev.dk_cpulabel, + wlabel) <= 0) { + db1_printf(("Bounds check failed!!:%d %d\n", + (int)bp->b_blkno, (int)wlabel)); + biodone(bp); + return; + } + + /* XXX splbio() needed? */ + s = splbio(); + db1_printf(("Beginning strategy...\n")); + + bp->b_resid = 0; + bp->b_error = + rf_DoAccessKernel(raidPtrs[raidID], bp, NULL, NULL, NULL); + if (bp->b_error) { + bp->b_flags |= B_ERROR; + db1_printf( + ("bp->b_flags HAS B_ERROR SET!!!: %d\n", bp->b_error)); + } + splx(s); + db1_printf(("Strategy exiting: 0x%x 0x%x %d %d\n", bp, bp->b_data, + (int)bp->b_bcount, (int)bp->b_resid)); +} + +/* ARGSUSED */ +int +raidread(dev, uio, flags) + dev_t dev; + struct uio *uio; + int flags; +{ + int unit = raidunit(dev); + struct raid_softc *rs; + int result; + int part; + + if (unit >= numraid) + return (ENXIO); + rs = &raid_softc[unit]; + + if ((rs->sc_flags & RAIDF_INITED) == 0) + return (ENXIO); + part = DISKPART(dev); + + db1_printf(("raidread: unit: %d partition: %d\n", unit, part)); + +#if 0 + return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio)); +#endif + result=physio(raidstrategy, NULL, dev, B_READ, minphys, uio); + db1_printf(("raidread done. 
Result is %d %d\n", result, + uio->uio_resid)); + return (result); +} + +/* ARGSUSED */ +int +raidwrite(dev, uio, flags) + dev_t dev; + struct uio *uio; + int flags; +{ + int unit = raidunit(dev); + struct raid_softc *rs; + + if (unit >= numraid) + return (ENXIO); + rs = &raid_softc[unit]; + + if ((rs->sc_flags & RAIDF_INITED) == 0) + return (ENXIO); + db1_printf(("raidwrite\n")); + return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio)); +} + +int +raidioctl(dev, cmd, data, flag, p) + dev_t dev; + u_long cmd; + caddr_t data; + int flag; + struct proc *p; +{ + int unit = raidunit(dev); + int error = 0; + int part, pmask; + struct raid_softc *rs; +#if 0 + int r, c; +#endif + /* struct raid_ioctl *ccio = (struct ccd_ioctl *)data; */ + + /* struct ccdbuf *cbp; */ + /* struct raidbuf *raidbp; */ + RF_Config_t *k_cfg, *u_cfg; + u_char *specific_buf; + int retcode = 0; + + int row; + struct rf_recon_req *rrcopy, *rr; +#if 0 + int nbytes, spl, rw, row; + struct rf_test_acc *ta; + struct buf *bp; + RF_SparetWait_t *waitreq; + struct rf_test_acc *ta_p, *ta_copy; +#endif + + if (unit >= numraid) + return (ENXIO); + rs = &raid_softc[unit]; + + db1_printf(("raidioctl: %d %d %d %d\n", (int)dev, (int)DISKPART(dev), + (int)unit, (int)cmd)); + + /* Must be open for writes for these commands... */ + switch (cmd) { + case DIOCSDINFO: + case DIOCWDINFO: + case DIOCWLABEL: + if ((flag & FWRITE) == 0) + return (EBADF); + } + + /* Must be initialized for these... */ + switch (cmd) { + case DIOCGDINFO: + case DIOCSDINFO: + case DIOCWDINFO: + case DIOCGPART: + case DIOCWLABEL: + case RAIDFRAME_SHUTDOWN: + case RAIDFRAME_REWRITEPARITY: + case RAIDFRAME_GET_INFO: + case RAIDFRAME_RESET_ACCTOTALS: + case RAIDFRAME_GET_ACCTOTALS: + case RAIDFRAME_KEEP_ACCTOTALS: + case RAIDFRAME_GET_SIZE: + case RAIDFRAME_FAIL_DISK: + case RAIDFRAME_COPYBACK: + case RAIDFRAME_CHECKRECON: + if ((rs->sc_flags & RAIDF_INITED) == 0) + return (ENXIO); + } + + switch (cmd) { + case RAIDFRAME_CONFIGURE: + /* Configure the system */ + + db3_printf(("rf_ioctl: RAIDFRAME_CONFIGURE\n")); + + /* + * Copy-in the configuration information + * data points to a pointer to the configuration structure. + */ + u_cfg = *((RF_Config_t **)data); + RF_Malloc(k_cfg, sizeof (RF_Config_t), (RF_Config_t *)); + if (k_cfg == NULL) { + db3_printf(( + "rf_ioctl: ENOMEM for config. Code is %d\n", + retcode)); + return (ENOMEM); + } + retcode = copyin((caddr_t)u_cfg, (caddr_t)k_cfg, + sizeof (RF_Config_t)); + if (retcode) { + db3_printf(("rf_ioctl: retcode=%d copyin.1\n", + retcode)); + return (retcode); + } + + /* + * Allocate a buffer for the layout-specific data, + * and copy it in. + */ + if (k_cfg->layoutSpecificSize) { + if (k_cfg->layoutSpecificSize > 10000) { + /* sanity check */ + db3_printf(("rf_ioctl: EINVAL %d\n", retcode)); + return (EINVAL); + } + RF_Malloc(specific_buf, k_cfg->layoutSpecificSize, + (u_char *)); + if (specific_buf == NULL) { + RF_Free(k_cfg, sizeof (RF_Config_t)); + db3_printf(("rf_ioctl: ENOMEM %d\n", retcode)); + return (ENOMEM); + } + retcode = copyin(k_cfg->layoutSpecific, + (caddr_t)specific_buf, k_cfg->layoutSpecificSize); + if (retcode) { + db3_printf(("rf_ioctl: retcode=%d copyin.2\n", + retcode)); + return (retcode); + } + } else + specific_buf = NULL; + k_cfg->layoutSpecific = specific_buf; + + /* + * We should do some kind of sanity check on the + * configuration. + * Store the sum of all the bytes in the last byte? 
+ */ + + db1_printf(("Considering configuring the system.:%d 0x%x\n", + unit, p)); + + /* + * We need the pointer to this a little deeper, + * so stash it here... + */ + raidPtrs[unit]->proc = p; + + /* configure the system */ + rf_pending_testaccs = 0; + + raidPtrs[unit]->raidid = unit; + retcode = rf_Configure(raidPtrs[unit], k_cfg); + + if (retcode == 0) { + retcode = raidinit(dev, raidPtrs[unit],unit); + } + + /* Free the buffers. No return code here. */ + if (k_cfg->layoutSpecificSize) { + RF_Free(specific_buf, k_cfg->layoutSpecificSize); + } + RF_Free(k_cfg, sizeof (RF_Config_t)); + + db3_printf(("rf_ioctl: retcode=%d RAIDFRAME_CONFIGURE\n", + retcode)); + return (retcode); + + case RAIDFRAME_SHUTDOWN: + /* Shutdown the system */ + + if ((error = raidlock(rs)) != 0) + return (error); + + /* + * If somebody has a partition mounted, we shouldn't + * shutdown. + */ + + part = DISKPART(dev); + pmask = (1 << part); + if ((rs->sc_dkdev.dk_openmask & ~pmask) || + ((rs->sc_dkdev.dk_bopenmask & pmask) && + (rs->sc_dkdev.dk_copenmask & pmask))) { + raidunlock(rs); + return (EBUSY); + } + + /* + * The intention here was to disallow shutdowns while + * raidframe is mounted, but it doesn't work because the + * shutdown ioctl calls rf_open. + */ + if (rf_pending_testaccs > 0) { + printf("RAIDFRAME: Can't shutdown because there are " + "%d pending test accs\n", + rf_pending_testaccs); + return (EINVAL); + } + if (rf_debugKernelAccess) { + printf("call shutdown\n"); + } + raidPtrs[unit]->proc = p; /* XXX Necessary evil */ + retcode = rf_Shutdown(raidPtrs[unit]); + + db1_printf(("Done main shutdown\n")); + + /* It's no longer initialized... */ + rs->sc_flags &= ~RAIDF_INITED; + + /* Detach the disk. */ + disk_detach(&rs->sc_dkdev); + + raidunlock(rs); + + return (retcode); + + case RAIDFRAME_REWRITEPARITY: + /* initialize all parity */ + + if (raidPtrs[unit]->Layout.map->faultsTolerated == 0) + return (EINVAL); + /* borrow the thread of the requesting process */ + raidPtrs[unit]->proc = p; /* Blah... :-p GO */ + retcode = rf_RewriteParity(raidPtrs[unit]); + /* return I/O Error if the parity rewrite fails */ + + if (retcode) + retcode = EIO; + return (retcode); + +#if 0 /* XXX not supported yet (ever?) */ + case RAIDFRAME_TUR: + /* + * Issue a test-unit-ready through raidframe to the + * indicated device. 
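As the RAIDFRAME_CONFIGURE case above notes, the ioctl argument is a pointer to a pointer: the kernel copies in the RF_Config_t that the user pointer refers to, then does a second copyin for the layout-specific blob that k_cfg->layoutSpecific points at. A hedged sketch of what the matching user-side call might look like (the helper name, device path handling and error handling are illustrative, not taken from this import; it assumes the RAIDframe ioctl and RF_Config_t declarations are in scope):

    #include <sys/ioctl.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int configure_raid(const char *devpath, RF_Config_t *cfg)
    {
            RF_Config_t *cfgp = cfg;        /* the ioctl wants the address of this pointer */
            int fd = open(devpath, O_RDWR);

            if (fd < 0) {
                    perror(devpath);
                    return (-1);
            }
            /* cfg->layoutSpecific / layoutSpecificSize describe the extra user
               buffer that the kernel fetches with its second copyin() */
            if (ioctl(fd, RAIDFRAME_CONFIGURE, &cfgp) < 0) {
                    perror("RAIDFRAME_CONFIGURE");
                    close(fd);
                    return (-1);
            }
            close(fd);
            return (0);
    }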
+ */ + + /* debug only */ + retcode = rf_SCSI_DoTUR(0, 0, 0, 0, *(dev_t *)data); + return (retcode); +#endif + + case RAIDFRAME_GET_INFO: + { + RF_Raid_t *raid = raidPtrs[unit]; + RF_DeviceConfig_t *cfg, **ucfgp; + int i, j, d; + + if (!raid->valid) + return (ENODEV); + ucfgp = (RF_DeviceConfig_t **)data; + RF_Malloc(cfg,sizeof(RF_DeviceConfig_t), + (RF_DeviceConfig_t *)); + if (cfg == NULL) + return (ENOMEM); + bzero((char *)cfg, sizeof(RF_DeviceConfig_t)); + cfg->rows = raid->numRow; + cfg->cols = raid->numCol; + cfg->ndevs = raid->numRow * raid->numCol; + if (cfg->ndevs >= RF_MAX_DISKS) { + cfg->ndevs = 0; + return (ENOMEM); + } + cfg->nspares = raid->numSpare; + if (cfg->nspares >= RF_MAX_DISKS) { + cfg->nspares = 0; + return (ENOMEM); + } + cfg->maxqdepth = raid->maxQueueDepth; + d = 0; + for(i=0;i<cfg->rows;i++) { + for(j=0;j<cfg->cols;j++) { + cfg->devs[d] = raid->Disks[i][j]; + d++; + } + } + for(j=cfg->cols,i=0;i<cfg->nspares;i++,j++) { + cfg->spares[i] = raid->Disks[0][j]; + } + retcode = copyout((caddr_t)cfg, (caddr_t)*ucfgp, + sizeof(RF_DeviceConfig_t)); + RF_Free(cfg,sizeof(RF_DeviceConfig_t)); + + return (retcode); + } + break; + + case RAIDFRAME_RESET_ACCTOTALS: + { + RF_Raid_t *raid = raidPtrs[unit]; + + bzero(&raid->acc_totals, sizeof(raid->acc_totals)); + return (0); + } + break; + + case RAIDFRAME_GET_ACCTOTALS: + { + RF_AccTotals_t *totals = (RF_AccTotals_t *)data; + RF_Raid_t *raid = raidPtrs[unit]; + + *totals = raid->acc_totals; + return (0); + } + break; + + case RAIDFRAME_KEEP_ACCTOTALS: + { + RF_Raid_t *raid = raidPtrs[unit]; + int *keep = (int *)data; + + raid->keep_acc_totals = *keep; + return (0); + } + break; + + case RAIDFRAME_GET_SIZE: + *(int *) data = raidPtrs[unit]->totalSectors; + return (0); + +#define RAIDFRAME_RECON 1 + /* XXX The above should probably be set somewhere else!! GO */ +#if RAIDFRAME_RECON > 0 + + /* fail a disk & optionally start reconstruction */ + case RAIDFRAME_FAIL_DISK: + rr = (struct rf_recon_req *) data; + + if (rr->row < 0 || rr->row >= raidPtrs[unit]->numRow + || rr->col < 0 || rr->col >= raidPtrs[unit]->numCol) + return (EINVAL); + + printf("Failing the disk: row: %d col: %d\n",rr->row,rr->col); + + /* + * Make a copy of the recon request so that we don't + * rely on the user's buffer + */ + RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *)); + bcopy(rr, rrcopy, sizeof(*rr)); + rrcopy->raidPtr = (void *) raidPtrs[unit]; + + LOCK_RECON_Q_MUTEX(); + rrcopy->next = recon_queue; + recon_queue = rrcopy; + wakeup(&recon_queue); + UNLOCK_RECON_Q_MUTEX(); + + return (0); + + /* + * Invoke a copyback operation after recon on whatever + * disk needs it, if any. + */ + case RAIDFRAME_COPYBACK: + /* Borrow the current thread to get this done */ + raidPtrs[unit]->proc = p; /* ICK.. but needed :-p GO */ + rf_CopybackReconstructedData(raidPtrs[unit]); + return (0); + + /* Return the percentage completion of reconstruction */ + case RAIDFRAME_CHECKRECON: + row = *(int *)data; + if (row < 0 || row >= raidPtrs[unit]->numRow) + return (EINVAL); + if (raidPtrs[unit]->status[row] != rf_rs_reconstructing) + *(int *)data = 100; + else + *(int *)data = + raidPtrs[unit]->reconControl[row]->percentComplete; + return (0); + +#if 0 + case RAIDFRAME_SPARET_WAIT: + /* + * The sparetable daemon calls this to wait for the + * kernel to need a spare table. + * This ioctl does not return until a spare table is needed. + * XXX -- Calling mpsleep here in the ioctl code is almost + * certainly wrong and evil. 
-- XXX + * XXX -- I should either compute the spare table in the + * kernel, or have a different. -- XXX + * XXX -- Interface (a different character device) for + * delivering the table. -- XXX + */ + RF_LOCK_MUTEX(rf_sparet_wait_mutex); + while (!rf_sparet_wait_queue) + mpsleep(&rf_sparet_wait_queue, (PZERO+1)|PCATCH, + "sparet wait", 0, + (void *)simple_lock_addr(rf_sparet_wait_mutex), + MS_LOCK_SIMPLE); + waitreq = rf_sparet_wait_queue; + rf_sparet_wait_queue = rf_sparet_wait_queue->next; + RF_UNLOCK_MUTEX(rf_sparet_wait_mutex); + + *((RF_SparetWait_t *)data) = *waitreq; + + RF_Free(waitreq, sizeof *waitreq); + return (0); + + case RAIDFRAME_ABORT_SPARET_WAIT: + /* + * Wakes up a process waiting on SPARET_WAIT and puts an + * error code in it that will cause the dameon to exit. + */ + RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *)); + waitreq->fcol = -1; + RF_LOCK_MUTEX(rf_sparet_wait_mutex); + waitreq->next = rf_sparet_wait_queue; + rf_sparet_wait_queue = waitreq; + RF_UNLOCK_MUTEX(rf_sparet_wait_mutex); + wakeup(&rf_sparet_wait_queue); + return (0); + + case RAIDFRAME_SEND_SPARET: + /* + * Used by the spare table daemon to deliver a spare table + * into the kernel + */ + + /* Install the spare table */ + retcode = rf_SetSpareTable(raidPtrs[unit],*(void **) data); + + /* + * Respond to the requestor. the return status of the + * spare table installation is passed in the "fcol" field + */ + RF_Malloc(waitreq, sizeof *waitreq, (RF_SparetWait_t *)); + waitreq->fcol = retcode; + RF_LOCK_MUTEX(rf_sparet_wait_mutex); + waitreq->next = rf_sparet_resp_queue; + rf_sparet_resp_queue = waitreq; + wakeup(&rf_sparet_resp_queue); + RF_UNLOCK_MUTEX(rf_sparet_wait_mutex); + + return (retcode); +#endif +#endif /* RAIDFRAME_RECON > 0 */ + + default: + /* fall through to the os-specific code below */ + break; + } + + if (!raidPtrs[unit]->valid) + return (EINVAL); + + /* + * Add support for "regular" device ioctls here. + */ + switch (cmd) { + case DIOCGDINFO: + db1_printf( + ("DIOCGDINFO %d %d\n", (int)dev, (int)DISKPART(dev))); + *(struct disklabel *)data = *(rs->sc_dkdev.dk_label); + break; + + case DIOCGPART: + db1_printf( + ("DIOCGPART: %d %d\n", (int)dev, (int)DISKPART(dev))); + ((struct partinfo *)data)->disklab = rs->sc_dkdev.dk_label; + ((struct partinfo *)data)->part = + &rs->sc_dkdev.dk_label->d_partitions[DISKPART(dev)]; + break; + + case DIOCWDINFO: + db1_printf(("DIOCWDINFO\n")); + case DIOCSDINFO: + db1_printf(("DIOCSDINFO\n")); + if ((error = raidlock(rs)) != 0) + return (error); + + rs->sc_flags |= RAIDF_LABELLING; + + error = setdisklabel(rs->sc_dkdev.dk_label, + (struct disklabel *)data, 0, rs->sc_dkdev.dk_cpulabel); + if (error == 0) { + if (cmd == DIOCWDINFO) + error = writedisklabel(RAIDLABELDEV(dev), + raidstrategy, rs->sc_dkdev.dk_label, + rs->sc_dkdev.dk_cpulabel); + } + + rs->sc_flags &= ~RAIDF_LABELLING; + + raidunlock(rs); + + if (error) + return (error); + break; + + case DIOCWLABEL: + db1_printf(("DIOCWLABEL\n")); + if (*(int *)data != 0) + rs->sc_flags |= RAIDF_WLABEL; + else + rs->sc_flags &= ~RAIDF_WLABEL; + break; + + default: + retcode = ENOTTY; /* XXXX ?? OR EINVAL ? */ + } + return (retcode); +} + +/* + * raidinit -- complete the rest of the initialization for the + * RAIDframe device. + */ +int +raidinit(dev, raidPtr, unit) + dev_t dev; + RF_Raid_t *raidPtr; + int unit; +{ + int retcode; + /* int ix; */ + /* struct raidbuf *raidbp; */ + struct raid_softc *rs; + + retcode = 0; + + rs = &raid_softc[unit]; + + /* XXX should check return code first... 
*/ + rs->sc_flags |= RAIDF_INITED; + + /* XXX doesn't check bounds.*/ + sprintf(rs->sc_xname, "raid%d", unit); + + rs->sc_dkdev.dk_name = rs->sc_xname; + + /* + * disk_attach actually creates space for the CPU disklabel, among + * other things, so it's critical to call this *BEFORE* we + * try putzing with disklabels. + */ + disk_attach(&rs->sc_dkdev); + + /* + * XXX There may be a weird interaction here between this, and + * protectedSectors, as used in RAIDframe. + */ + rs->sc_size = raidPtr->totalSectors; + rs->sc_dev = dev; + return (retcode); +} + +/********************************************************* + * + * initialization code called at boot time (startup.c) + * + ********************************************************/ +int +rf_boot() +{ + int i, rc; + + rc = rf_mutex_init(&rf_sparet_wait_mutex); + if (rc) { + RF_PANIC(); + } + rc = rf_mutex_init(&rf_async_done_q_mutex); + if (rc) { + RF_PANIC(); + } + rf_sparet_wait_queue = rf_sparet_resp_queue = NULL; + recon_queue = NULL; + rf_async_done_qh = rf_async_done_qt = NULL; + for (i=0; i<numraid; i++) + raidPtrs[i] = NULL; + rc = rf_BootRaidframe(); + if (rc == 0) + printf("Kernelized RAIDframe activated\n"); + else + rf_kbooted = RFK_BOOT_BAD; + return (rc); +} + +/* + * This kernel thread never exits. It is created once, and persists + * until the system reboots. + */ +void +rf_ReconKernelThread() +{ + struct rf_recon_req *req; + int s; + + /* + * XXX not sure what spl() level we should be at here... + * probably splbio() + */ + s = splbio(); + + while (1) { + /* grab the next reconstruction request from the queue */ + LOCK_RECON_Q_MUTEX(); + while (!recon_queue) { + UNLOCK_RECON_Q_MUTEX(); + tsleep(&recon_queue, PRIBIO | PCATCH, "raidframe recon", 0); + LOCK_RECON_Q_MUTEX(); + } + req = recon_queue; + recon_queue = recon_queue->next; + UNLOCK_RECON_Q_MUTEX(); + + /* + * If flags specifies that we should start recon, this call + * will not return until reconstruction completes, fails, or + * is aborted. + */ + rf_FailDisk((RF_Raid_t *) req->raidPtr, req->row, req->col, + ((req->flags&RF_FDFLAGS_RECON) ? 1 : 0)); + + RF_Free(req, sizeof(*req)); + } +} + +/* + * Wake up the daemon & tell it to get us a spare table + * XXX + * The entries in the queues should be tagged with the raidPtr so that in the + * extremely rare case that two recons happen at once, we know for + * which device were requesting a spare table. + * XXX + */ +int +rf_GetSpareTableFromDaemon(req) + RF_SparetWait_t *req; +{ + int retcode; + + RF_LOCK_MUTEX(rf_sparet_wait_mutex); + req->next = rf_sparet_wait_queue; + rf_sparet_wait_queue = req; + wakeup(&rf_sparet_wait_queue); + + /* mpsleep unlocks the mutex */ + while (!rf_sparet_resp_queue) { + tsleep(&rf_sparet_resp_queue, PRIBIO | PCATCH, + "raidframe getsparetable", 0); +#if 0 + mpsleep(&rf_sparet_resp_queue, PZERO, "sparet resp", 0, + (void *) simple_lock_addr(rf_sparet_wait_mutex), + MS_LOCK_SIMPLE); +#endif + } + req = rf_sparet_resp_queue; + rf_sparet_resp_queue = req->next; + RF_UNLOCK_MUTEX(rf_sparet_wait_mutex); + + retcode = req->fcol; + /* this is not the same req as we alloc'd */ + RF_Free(req, sizeof(*req)); + return (retcode); +} + +/* + * A wrapper around rf_DoAccess that extracts appropriate info from the + * bp & passes it down. 
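rf_ReconKernelThread() above and the RAIDFRAME_FAIL_DISK ioctl earlier form a simple producer/consumer pair around recon_queue: the ioctl copies the request, pushes it onto the head of the list and calls wakeup(), while the thread tsleep()s until the list is non-empty and then pops work off. A userland analogue of that handoff, using a pthread mutex and condition variable in place of simple_lock/tsleep/wakeup (names and the printf stand in for the real reconstruction call):

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct recon_req {                      /* stand-in for struct rf_recon_req */
            int row, col;
            struct recon_req *next;
    };

    static struct recon_req *recon_queue;
    static pthread_mutex_t q_mtx = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t q_cv = PTHREAD_COND_INITIALIZER;

    /* ioctl side: queue a copy of the request and wake the worker */
    static void fail_disk(int row, int col)
    {
            struct recon_req *rr = malloc(sizeof(*rr));

            if (rr == NULL)
                    return;
            rr->row = row;
            rr->col = col;
            pthread_mutex_lock(&q_mtx);
            rr->next = recon_queue;         /* insert at the head, as the driver does */
            recon_queue = rr;
            pthread_cond_signal(&q_cv);     /* wakeup(&recon_queue) analogue */
            pthread_mutex_unlock(&q_mtx);
    }

    /* worker side: sleep until work shows up, then "reconstruct" */
    static void *recon_thread(void *arg)
    {
            for (;;) {
                    struct recon_req *req;

                    pthread_mutex_lock(&q_mtx);
                    while (recon_queue == NULL)
                            pthread_cond_wait(&q_cv, &q_mtx);       /* tsleep analogue */
                    req = recon_queue;
                    recon_queue = req->next;
                    pthread_mutex_unlock(&q_mtx);

                    printf("reconstructing row %d col %d\n", req->row, req->col);
                    free(req);
            }
            return (arg);
    }

    int main(void)
    {
            pthread_t t;

            pthread_create(&t, NULL, recon_thread, NULL);
            fail_disk(0, 2);
            pthread_join(t, NULL);          /* never returns; the worker loops forever */
            return (0);
    }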
+ * Any calls originating in the kernel must use non-blocking I/O + * do some extra sanity checking to return "appropriate" error values for + * certain conditions (to make some standard utilities work) + */ +int +rf_DoAccessKernel(raidPtr, bp, flags, cbFunc, cbArg) + RF_Raid_t *raidPtr; + struct buf *bp; + RF_RaidAccessFlags_t flags; + void (*cbFunc)(struct buf *); + void *cbArg; +{ + RF_SectorCount_t num_blocks, pb, sum; + RF_RaidAddr_t raid_addr; + int retcode; + struct partition *pp; + daddr_t blocknum; + int unit; + struct raid_softc *rs; + + /* XXX The dev_t used here should be for /dev/[r]raid* !!! */ + + unit = raidPtr->raidid; + rs = &raid_softc[unit]; + + /* + * Ok, for the bp we have here, bp->b_blkno is relative to the + * partition.. Need to make it absolute to the underlying device.. + */ + blocknum = bp->b_blkno; + if (DISKPART(bp->b_dev) != RAW_PART) { + pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)]; + blocknum += pp->p_offset; + db1_printf( + ("updated: %d %d\n", DISKPART(bp->b_dev), pp->p_offset)); + } else { + db1_printf(("Is raw..\n")); + } + db1_printf(("Blocks: %d, %d\n", (int)bp->b_blkno, (int)blocknum)); + db1_printf(("bp->b_bcount = %d\n", (int)bp->b_bcount)); + db1_printf(("bp->b_resid = %d\n", (int)bp->b_resid)); + + /* + * *THIS* is where we adjust what block we're going to... but + * DO NOT TOUCH bp->b_blkno!!! + */ + raid_addr = blocknum; + + num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector; + pb = (bp->b_bcount&raidPtr->sectorMask) ? 1 : 0; + sum = raid_addr + num_blocks + pb; + if (1 || rf_debugKernelAccess) { + db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n", + (int)raid_addr, (int)sum, (int)num_blocks, (int)pb, + (int)bp->b_resid)); + } + + + if ((sum > raidPtr->totalSectors) || (sum < raid_addr) || + (sum < num_blocks) || (sum < pb)) { + bp->b_error = ENOSPC; + bp->b_flags |= B_ERROR; + bp->b_resid = bp->b_bcount; + biodone(bp); + return (bp->b_error); + } + + /* + * XXX rf_DoAccess() should do this, not just DoAccessKernel() + */ + if (bp->b_bcount & raidPtr->sectorMask) { + bp->b_error = EINVAL; + bp->b_flags |= B_ERROR; + bp->b_resid = bp->b_bcount; + biodone(bp); + return (bp->b_error); + } + db1_printf(("Calling DoAccess..\n")); + + /* + * Don't ever condition on bp->b_flags & B_WRITE. + * always condition on B_READ instead. + */ + retcode = rf_DoAccess(raidPtr, + (bp->b_flags & B_READ) ? RF_IO_TYPE_READ : RF_IO_TYPE_WRITE, + 0, raid_addr, num_blocks, bp->b_un.b_addr, bp, NULL, NULL, + RF_DAG_NONBLOCKING_IO|flags, NULL, cbFunc, cbArg); + db1_printf(("After call to DoAccess: 0x%x 0x%x %d\n", bp, bp->b_data, + (int)bp->b_resid)); + return (retcode); +} + +/* Invoke an I/O from kernel mode. Disk queue should be locked upon entry */ + +int +rf_DispatchKernelIO(queue, req) + RF_DiskQueue_t *queue; + RF_DiskQueueData_t *req; +{ + int op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE; + struct buf *bp; + struct raidbuf *raidbp = NULL; + struct raid_softc *rs; + int unit; + + /* + * XXX along with the vnode, we also need the softc associated with + * this device.. + */ + req->queue = queue; + + unit = queue->raidPtr->raidid; + + db1_printf(("DispatchKernelIO unit: %d\n", unit)); + + if (unit >= numraid) { + printf("Invalid unit number: %d %d\n", unit, numraid); + panic("Invalid Unit number in rf_DispatchKernelIO\n"); + } + + rs = &raid_softc[unit]; + + /* XXX is this the right place? 
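A quick worked instance of the address arithmetic in rf_DoAccessKernel() above, assuming 512-byte sectors (so logBytesPerSector = 9 and sectorMask = 0x1ff); all numbers are made up for illustration:

    #include <stdio.h>

    int main(void)
    {
            long b_blkno = 100;             /* partition-relative block from the buf */
            long p_offset = 1000;           /* partition offset from the disklabel */
            long b_bcount = 65536;          /* 64 KB transfer */
            int logBytesPerSector = 9;
            long sectorMask = (1L << logBytesPerSector) - 1;
            long totalSectors = 2000000;

            long raid_addr = b_blkno + p_offset;                    /* 1100 */
            long num_blocks = b_bcount >> logBytesPerSector;        /* 128 sectors */
            long pb = (b_bcount & sectorMask) ? 1 : 0;              /* 0: exact multiple */
            long sum = raid_addr + num_blocks + pb;                 /* 1228 */

            /* the driver fails such an access with ENOSPC or EINVAL respectively */
            printf("out of range: %d\n", sum > totalSectors);
            printf("not sector-aligned: %d\n", (b_bcount & sectorMask) != 0);
            return (0);
    }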
*/ + disk_busy(&rs->sc_dkdev); + + bp = req->bp; + + /* + * XXX When there is a physical disk failure, someone is passing + * us a buffer that contains old stuff!! Attempt to deal with + * this problem without taking a performance hit... + * (not sure where the real bug is. It's buried in RAIDframe + * somewhere) :-( GO ) + */ + if (bp->b_flags & B_ERROR) { + bp->b_flags &= ~B_ERROR; + } + if (bp->b_error!=0) { + bp->b_error = 0; + } + + raidbp = RAIDGETBUF(); + + raidbp->rf_flags = 0; /* XXX not really used anywhere... */ + + /* + * context for raidiodone + */ + raidbp->rf_obp = bp; + raidbp->req = req; + + switch (req->type) { + case RF_IO_TYPE_NOP: + /* Used primarily to unlock a locked queue. */ + + db1_printf(("rf_DispatchKernelIO: NOP to r %d c %d\n", + queue->row, queue->col)); + + /* XXX need to do something extra here.. */ + + /* + * I'm leaving this in, as I've never actually seen it + * used, and I'd like folks to report it... GO + */ + printf(("WAKEUP CALLED\n")); + queue->numOutstanding++; + + /* XXX need to glue the original buffer into this?? */ + + rf_KernelWakeupFunc(&raidbp->rf_buf); + break; + + case RF_IO_TYPE_READ: + case RF_IO_TYPE_WRITE: + if (req->tracerec) { + RF_ETIMER_START(req->tracerec->timer); + } + + rf_InitBP(&raidbp->rf_buf, queue->rf_cinfo->ci_vp, + op | bp->b_flags, queue->rf_cinfo->ci_dev, + req->sectorOffset, req->numSector, + req->buf, rf_KernelWakeupFunc, (void *)req, + queue->raidPtr->logBytesPerSector, req->b_proc); + + if (rf_debugKernelAccess) { + db1_printf(("dispatch: bp->b_blkno = %ld\n", + (long)bp->b_blkno)); + } + queue->numOutstanding++; + queue->last_deq_sector = req->sectorOffset; + + /* + * Acc wouldn't have been let in if there were any + * pending reqs at any other priority. + */ + queue->curPriority = req->priority; + + db1_printf(("Going for %c to unit %d row %d col %d\n", + req->type, unit, queue->row, queue->col)); + db1_printf(("sector %d count %d (%d bytes) %d\n", + (int)req->sectorOffset, (int)req->numSector, + (int)(req->numSector << queue->raidPtr->logBytesPerSector), + (int)queue->raidPtr->logBytesPerSector)); + if ((raidbp->rf_buf.b_flags & B_READ) == 0) { + raidbp->rf_buf.b_vp->v_numoutput++; + } + + VOP_STRATEGY(&raidbp->rf_buf); + break; + + default: + panic("bad req->type in rf_DispatchKernelIO"); + } + db1_printf(("Exiting from DispatchKernelIO\n")); + return (0); +} + +/* + * This is the callback function associated with a I/O invoked from + * kernel code. + */ +void +rf_KernelWakeupFunc(vbp) + struct buf *vbp; +{ + RF_DiskQueueData_t *req = NULL; + RF_DiskQueue_t *queue; + struct raidbuf *raidbp = (struct raidbuf *)vbp; + struct buf *bp; + struct raid_softc *rs; + int unit; + int s; + + s = splbio(); /* XXX */ + db1_printf(("recovering the request queue:\n")); + req = raidbp->req; + + bp = raidbp->rf_obp; + db1_printf(("bp=0x%x\n", bp)); + + queue = (RF_DiskQueue_t *)req->queue; + + if (raidbp->rf_buf.b_flags & B_ERROR) { + db1_printf( + ("Setting bp->b_flags!!! %d\n", raidbp->rf_buf.b_error)); + bp->b_flags |= B_ERROR; + bp->b_error = + raidbp->rf_buf.b_error ? raidbp->rf_buf.b_error : EIO; + } + + db1_printf(("raidbp->rf_buf.b_bcount=%d\n", + (int)raidbp->rf_buf.b_bcount)); + db1_printf(("raidbp->rf_buf.b_bufsize=%d\n", + (int)raidbp->rf_buf.b_bufsize)); + db1_printf(("raidbp->rf_buf.b_resid=%d\n", + (int)raidbp->rf_buf.b_resid)); + db1_printf(("raidbp->rf_buf.b_data=0x%x\n", raidbp->rf_buf.b_data)); + +#if 1 + /* XXX Methinks this could be wrong... 
*/ + bp->b_resid = raidbp->rf_buf.b_resid; +#endif + + if (req->tracerec) { + RF_ETIMER_STOP(req->tracerec->timer); + RF_ETIMER_EVAL(req->tracerec->timer); + RF_LOCK_MUTEX(rf_tracing_mutex); + req->tracerec->diskwait_us += + RF_ETIMER_VAL_US(req->tracerec->timer); + req->tracerec->phys_io_us += + RF_ETIMER_VAL_US(req->tracerec->timer); + req->tracerec->num_phys_ios++; + RF_UNLOCK_MUTEX(rf_tracing_mutex); + } + + bp->b_bcount = raidbp->rf_buf.b_bcount;/* XXXX ?? */ + + unit = queue->raidPtr->raidid; /* *Much* simpler :-> */ + +#if 1 + /* + * XXX Ok, let's get aggressive... If B_ERROR is set, let's go + * ballistic, and mark the component as hosed... + */ + if (bp->b_flags & B_ERROR) { + /* Mark the disk as dead but only mark it once... */ + if (queue->raidPtr->Disks[queue->row][queue->col].status == + rf_ds_optimal) { + printf("raid%d: IO Error. Marking %s as failed.\n", + unit, + queue->raidPtr-> + Disks[queue->row][queue->col].devname); + queue->raidPtr->Disks[queue->row][queue->col].status = + rf_ds_failed; + queue->raidPtr->status[queue->row] = rf_rs_degraded; + queue->raidPtr->numFailures++; + } else { + /* Disk is already dead... */ + /* printf("Disk already marked as dead!\n"); */ + } + } +#endif + + rs = &raid_softc[unit]; + RAIDPUTBUF(raidbp); + + if (bp->b_resid==0) { + db1_printf(( + "Disk is no longer busy for this buffer... %d %ld %ld\n", + unit, bp->b_resid, bp->b_bcount)); + /* XXX is this the right place for a disk_unbusy()??!??!?!? */ + disk_unbusy(&rs->sc_dkdev, (bp->b_bcount - bp->b_resid)); + } else { + db1_printf(("b_resid is still %ld\n", bp->b_resid)); + } + + rf_DiskIOComplete(queue, req, (bp->b_flags & B_ERROR) ? 1 : 0); + (req->CompleteFunc)(req->argument, (bp->b_flags & B_ERROR) ? 1 : 0); + /* printf("Exiting rf_KernelWakeupFunc\n"); */ + + splx(s); /* XXX */ +} + +/* + * Initialize a buf structure for doing an I/O in the kernel. + */ +void +rf_InitBP(bp, b_vp, rw_flag, dev, startSect, numSect, buf, cbFunc, cbArg, + logBytesPerSector, b_proc) + struct buf *bp; + struct vnode *b_vp; + unsigned rw_flag; + dev_t dev; + RF_SectorNum_t startSect; + RF_SectorCount_t numSect; + caddr_t buf; + void (*cbFunc)(struct buf *); + void *cbArg; + int logBytesPerSector; + struct proc *b_proc; +{ + /* bp->b_flags = B_PHYS | rw_flag; */ + bp->b_flags = B_CALL | rw_flag; /* XXX need B_PHYS here too??? */ + bp->b_bcount = numSect << logBytesPerSector; + bp->b_bufsize = bp->b_bcount; + bp->b_error = 0; + bp->b_dev = dev; + db1_printf(("bp->b_dev is %d\n", dev)); + bp->b_un.b_addr = buf; + db1_printf(("bp->b_data=0x%x\n", bp->b_data)); + + bp->b_blkno = startSect; + bp->b_resid = bp->b_bcount; /* XXX is this right!??!?!! */ + db1_printf(("b_bcount is: %d\n", (int)bp->b_bcount)); + if (bp->b_bcount == 0) { + panic("bp->b_bcount is zero in rf_InitBP!!\n"); + } + bp->b_proc = b_proc; + bp->b_iodone = cbFunc; + bp->b_vp = b_vp; +} +#endif /* KERNEL */ + +/* Extras... */ + +unsigned int +rpcc() +{ + /* XXX no clue what this is supposed to do.. my guess is + that it's supposed to read the CPU cycle counter... 
*/ + /* db1_printf("this is supposed to do something useful too!??\n"); */ + return (0); +} + +#if 0 +int +rf_GetSpareTableFromDaemon(req) + RF_SparetWait_t *req; +{ + int retcode=1; + printf("This is supposed to do something useful!!\n"); /* XXX */ + + return (retcode); +} +#endif + +void +raidgetdefaultlabel(raidPtr, rs, lp) + RF_Raid_t *raidPtr; + struct raid_softc *rs; + struct disklabel *lp; +{ + db1_printf(("Building a default label...\n")); + bzero(lp, sizeof(*lp)); + + /* fabricate a label... */ + lp->d_secperunit = raidPtr->totalSectors; + lp->d_secsize = raidPtr->bytesPerSector; + lp->d_nsectors = 1024 * (1024 / raidPtr->bytesPerSector); + lp->d_ntracks = 1; + lp->d_ncylinders = raidPtr->totalSectors / lp->d_nsectors; + lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors; + + strncpy(lp->d_typename, "raid", sizeof(lp->d_typename)); + lp->d_type = DTYPE_RAID; + strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname)); + lp->d_rpm = 3600; + lp->d_interleave = 1; + lp->d_flags = 0; + + lp->d_partitions[RAW_PART].p_offset = 0; + lp->d_partitions[RAW_PART].p_size = raidPtr->totalSectors; + lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED; + lp->d_npartitions = RAW_PART + 1; + + lp->d_magic = DISKMAGIC; + lp->d_magic2 = DISKMAGIC; + lp->d_checksum = dkcksum(rs->sc_dkdev.dk_label); + +} + +/* + * Read the disklabel from the raid device. If one is not present, fake one + * up. + */ +void +raidgetdisklabel(dev) + dev_t dev; +{ + int unit = raidunit(dev); + struct raid_softc *rs = &raid_softc[unit]; + char *errstring; + struct disklabel *lp = rs->sc_dkdev.dk_label; + struct cpu_disklabel *clp = rs->sc_dkdev.dk_cpulabel; + RF_Raid_t *raidPtr; + + db1_printf(("Getting the disklabel...\n")); + + bzero(clp, sizeof(*clp)); + + raidPtr = raidPtrs[unit]; + + raidgetdefaultlabel(raidPtr, rs, lp); + + /* + * Call the generic disklabel extraction routine. + */ + errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy, + rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel, 0); + if (errstring) + raidmakedisklabel(rs); + else { + int i; + struct partition *pp; + + /* + * Sanity check whether the found disklabel is valid. + * + * This is necessary since total size of the raid device + * may vary when an interleave is changed even though exactly + * same componets are used, and old disklabel may used + * if that is found. + */ + if (lp->d_secperunit != rs->sc_size) + printf("WARNING: %s: " + "total sector size in disklabel (%d) != " + "the size of raid (%d)\n", rs->sc_xname, + lp->d_secperunit, rs->sc_size); + for (i = 0; i < lp->d_npartitions; i++) { + pp = &lp->d_partitions[i]; + if (pp->p_offset + pp->p_size > rs->sc_size) + printf("WARNING: %s: end of partition `%c' " + "exceeds the size of raid (%d)\n", + rs->sc_xname, 'a' + i, rs->sc_size); + } + } +} + +/* + * Take care of things one might want to take care of in the event + * that a disklabel isn't present. + */ +void +raidmakedisklabel(rs) + struct raid_softc *rs; +{ + struct disklabel *lp = rs->sc_dkdev.dk_label; + db1_printf(("Making a label..\n")); + + /* + * For historical reasons, if there's no disklabel present + * the raw partition must be marked FS_BSDFFS. + */ + + lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS; + + strncpy(lp->d_packname, "default label", sizeof(lp->d_packname)); + + lp->d_checksum = dkcksum(lp); +} + +/* + * Lookup the provided name in the filesystem. If the file exists, + * is a valid block device, and isn't being used by anyone else, + * set *vpp to the file's vnode. 
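The fabricated geometry in raidgetdefaultlabel() above works out to one megabyte per fake "cylinder": with 512-byte sectors, d_nsectors = 1024 * (1024 / 512) = 2048 and d_ntracks = 1, so d_ncylinders is simply the array size in megabytes, rounded down. A tiny check of that arithmetic with an example array size:

    #include <stdio.h>

    int main(void)
    {
            unsigned totalSectors = 2000000;        /* example array size */
            unsigned bytesPerSector = 512;
            unsigned nsectors = 1024 * (1024 / bytesPerSector);     /* 2048 sectors = 1 MB */
            unsigned ncylinders = totalSectors / nsectors;          /* 976 */

            printf("%u sectors/cyl, %u cylinders\n", nsectors, ncylinders);
            return (0);
    }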
+ * You'll find the original of this in ccd.c + */ +int +raidlookup(path, p, vpp) + char *path; + struct proc *p; + struct vnode **vpp; /* result */ +{ + struct nameidata nd; + struct vnode *vp; + struct vattr va; + int error; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, path, p); + if ((error = vn_open(&nd, FREAD|FWRITE, 0)) != 0) { + db1_printf(("RAIDframe: vn_open returned %d\n", error)); + return (error); + } + vp = nd.ni_vp; + if (vp->v_usecount > 1) { + VOP_UNLOCK(vp, 0, p); + (void)vn_close(vp, FREAD|FWRITE, p->p_ucred, p); + return (EBUSY); + } + if ((error = VOP_GETATTR(vp, &va, p->p_ucred, p)) != 0) { + VOP_UNLOCK(vp, 0, p); + (void)vn_close(vp, FREAD|FWRITE, p->p_ucred, p); + return (error); + } + /* XXX: eventually we should handle VREG, too. */ + if (va.va_type != VBLK) { + VOP_UNLOCK(vp, 0, p); + (void)vn_close(vp, FREAD|FWRITE, p->p_ucred, p); + return (ENOTBLK); + } + VOP_UNLOCK(vp, 0, p); + *vpp = vp; + return (0); +} + +/* + * Wait interruptibly for an exclusive lock. + * + * XXX + * Several drivers do this; it should be abstracted and made MP-safe. + * (Hmm... where have we seen this warning before :-> GO ) + */ +int +raidlock(rs) + struct raid_softc *rs; +{ + int error; + + while ((rs->sc_flags & RAIDF_LOCKED) != 0) { + rs->sc_flags |= RAIDF_WANTED; + if ((error = tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0) + return (error); + } + rs->sc_flags |= RAIDF_LOCKED; + return (0); +} + +/* + * Unlock and wake up any waiters. + */ +void +raidunlock(rs) + struct raid_softc *rs; +{ + rs->sc_flags &= ~RAIDF_LOCKED; + if ((rs->sc_flags & RAIDF_WANTED) != 0) { + rs->sc_flags &= ~RAIDF_WANTED; + wakeup(rs); + } +} diff --git a/sys/dev/raidframe/rf_options.c b/sys/dev/raidframe/rf_options.c new file mode 100644 index 00000000000..c9af8105ba7 --- /dev/null +++ b/sys/dev/raidframe/rf_options.c @@ -0,0 +1,85 @@ +/* $OpenBSD: rf_options.c,v 1.1 1999/01/11 14:29:33 niklas Exp $ */ +/* $NetBSD: rf_options.c,v 1.1 1998/11/13 04:20:31 oster Exp $ */ +/* + * rf_options.c + */ +/* + * Copyright (c) 1996 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Jim Zelenka + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. 
+ */ + +#ifdef _KERNEL +#define KERNEL +#endif + +#ifdef KERNEL +#if !defined(__NetBSD__) && !defined(__OpenBSD__) +#include <dfstrace.h> +#endif /* !__NetBSD__ && !__OpenBSD__ */ +#endif /* KERNEL */ + +#include "rf_threadstuff.h" +#include "rf_types.h" +#include "rf_archs.h" +#include "rf_general.h" +#include "rf_options.h" + +#ifdef RF_DBG_OPTION +#undef RF_DBG_OPTION +#endif /* RF_DBG_OPTION */ + +#ifdef __STDC__ +#define RF_DBG_OPTION(_option_,_defval_) long rf_##_option_ = _defval_; +#else /* __STDC__ */ +#define RF_DBG_OPTION(_option_,_defval_) long rf_/**/_option_ = _defval_; +#endif /* __STDC__ */ + +#include "rf_optnames.h" + +#undef RF_DBG_OPTION + +#ifdef __STDC__ +#define RF_DBG_OPTION(_option_,_defval_) { RF_STRING(_option_), &rf_##_option_ }, +#else /* __STDC__ */ +#define RF_DBG_OPTION(_option_,_defval_) { RF_STRING(_option_), &rf_/**/_option_ }, +#endif /* __STDC__ */ + +RF_DebugName_t rf_debugNames[] = { +#include "rf_optnames.h" + {NULL, NULL} +}; + +#undef RF_DBG_OPTION + +#ifdef __STDC__ +#define RF_DBG_OPTION(_option_,_defval_) rf_##_option_ = _defval_ ; +#else /* __STDC__ */ +#define RF_DBG_OPTION(_option_,_defval_) rf_/**/_option_ = _defval_ ; +#endif /* __STDC__ */ + +void rf_ResetDebugOptions() +{ +#include "rf_optnames.h" +} diff --git a/sys/dev/raidframe/rf_options.h b/sys/dev/raidframe/rf_options.h new file mode 100644 index 00000000000..2b5499cb672 --- /dev/null +++ b/sys/dev/raidframe/rf_options.h @@ -0,0 +1,68 @@ +/* $OpenBSD: rf_options.h,v 1.1 1999/01/11 14:29:33 niklas Exp $ */ +/* $NetBSD: rf_options.h,v 1.1 1998/11/13 04:20:31 oster Exp $ */ +/* + * rf_options.h + */ +/* + * Copyright (c) 1996 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Jim Zelenka + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. 
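rf_options.c above is a small X-macro generator: rf_optnames.h is included three times with RF_DBG_OPTION redefined before each inclusion, producing the rf_* option variables, the rf_debugNames[] name-to-pointer table, and the body of rf_ResetDebugOptions() from one option list. A stripped-down sketch of the same trick (using a list macro instead of re-including a header, and a made-up two-entry list):

    #include <stdio.h>

    /* plays the role of rf_optnames.h: one list, expanded several times */
    #define OPTION_LIST \
            DBG_OPTION(engineDebug, 0) \
            DBG_OPTION(mapDebug, 1)

    /* pass 1: define the option variables */
    #define DBG_OPTION(name, defval) long rf_##name = defval;
    OPTION_LIST
    #undef DBG_OPTION

    /* pass 2: build the name -> pointer table used to set options by string */
    struct debug_name { const char *name; long *ptr; };
    #define DBG_OPTION(name, defval) { #name, &rf_##name },
    struct debug_name debug_names[] = { OPTION_LIST { NULL, NULL } };
    #undef DBG_OPTION

    /* pass 3: reset every option to its default */
    #define DBG_OPTION(name, defval) rf_##name = defval;
    void reset_debug_options(void) { OPTION_LIST }
    #undef DBG_OPTION

    int main(void)
    {
            rf_engineDebug = 5;
            reset_debug_options();
            printf("%s = %ld\n", debug_names[0].name, *debug_names[0].ptr); /* engineDebug = 0 */
            return (0);
    }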
+ */ + +#ifndef _RF__RF_OPTIONS_H_ +#define _RF__RF_OPTIONS_H_ + +#ifdef _KERNEL +#define KERNEL +#endif + +#ifdef KERNEL +#if !defined(__NetBSD__) && !defined(__OpenBSD__) +#include <dfstrace.h> +#endif /* !__NetBSD__ && !__OpenBSD__ */ +#endif /* KERNEL */ + +#define RF_DEFAULT_LOCK_TABLE_SIZE 256 + +typedef struct RF_DebugNames_s { + char *name; + long *ptr; +} RF_DebugName_t; + +extern RF_DebugName_t rf_debugNames[]; + +#ifdef RF_DBG_OPTION +#undef RF_DBG_OPTION +#endif /* RF_DBG_OPTION */ + +#ifdef __STDC__ +#define RF_DBG_OPTION(_option_,_defval_) extern long rf_##_option_; +#else /* __STDC__ */ +#define RF_DBG_OPTION(_option_,_defval_) extern long rf_/**/_option_; +#endif /* __STDC__ */ +#include "rf_optnames.h" + +void rf_ResetDebugOptions(void); + +#endif /* !_RF__RF_OPTIONS_H_ */ diff --git a/sys/dev/raidframe/rf_optnames.h b/sys/dev/raidframe/rf_optnames.h new file mode 100644 index 00000000000..064b2da76f2 --- /dev/null +++ b/sys/dev/raidframe/rf_optnames.h @@ -0,0 +1,144 @@ +/* $OpenBSD: rf_optnames.h,v 1.1 1999/01/11 14:29:33 niklas Exp $ */ +/* $NetBSD: rf_optnames.h,v 1.1 1998/11/13 04:20:31 oster Exp $ */ +/* + * rf_optnames.h + */ +/* + * Copyright (c) 1996 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Jim Zelenka + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* + * Don't protect against multiple inclusion here- we actually want this. + */ + +#ifdef _KERNEL +#define KERNEL +#endif + +RF_DBG_OPTION(accSizeKB,0) /* if nonzero, the fixed access size to run */ +RF_DBG_OPTION(accessDebug,0) +RF_DBG_OPTION(accessTraceBufSize,0) +RF_DBG_OPTION(alignAccesses,0) /* whether accs should be aligned to their size */ +RF_DBG_OPTION(camlayerIOs,0) +RF_DBG_OPTION(camlayerDebug,0) /* debug CAM activity */ +RF_DBG_OPTION(cscanDebug,0) /* debug CSCAN sorting */ +RF_DBG_OPTION(dagDebug,0) +RF_DBG_OPTION(debugPrintUseBuffer,0) +RF_DBG_OPTION(degDagDebug,0) +RF_DBG_OPTION(disableAsyncAccs,0) +RF_DBG_OPTION(diskDebug,0) +RF_DBG_OPTION(doDebug,0) +RF_DBG_OPTION(dtDebug,0) +RF_DBG_OPTION(enableAtomicRMW,0) /* this debug var enables locking of the disk + * arm during small-write operations. Setting + * this variable to anything other than 0 will + * result in deadlock. 
(wvcii) + */ +RF_DBG_OPTION(engineDebug,0) +RF_DBG_OPTION(fifoDebug,0) /* debug fifo queueing */ +RF_DBG_OPTION(floatingRbufDebug,0) +RF_DBG_OPTION(forceHeadSepLimit,-1) +RF_DBG_OPTION(forceNumFloatingReconBufs,-1) /* wire down number of extra recon buffers to use */ +RF_DBG_OPTION(keepAccTotals,0) /* turn on keep_acc_totals */ +RF_DBG_OPTION(lockTableSize,RF_DEFAULT_LOCK_TABLE_SIZE) +RF_DBG_OPTION(mapDebug,0) +RF_DBG_OPTION(maxNumTraces,-1) +RF_DBG_OPTION(maxRandomSizeKB,128) /* if rf_accSizeKB==0, acc sizes are uniform in [ (1/2)..maxRandomSizeKB ] */ +RF_DBG_OPTION(maxTraceRunTimeSec,0) +RF_DBG_OPTION(memAmtDebug,0) /* trace amount of memory allocated */ +RF_DBG_OPTION(memChunkDebug,0) +RF_DBG_OPTION(memDebug,0) +RF_DBG_OPTION(memDebugAddress,0) +RF_DBG_OPTION(numBufsToAccumulate,1) /* number of buffers to accumulate before doing XOR */ +RF_DBG_OPTION(prReconSched,0) +RF_DBG_OPTION(printDAGsDebug,0) +RF_DBG_OPTION(printStatesDebug,0) +RF_DBG_OPTION(protectedSectors,64L) /* # of sectors at start of disk to + exclude from RAID address space */ +RF_DBG_OPTION(pssDebug,0) +RF_DBG_OPTION(queueDebug,0) +RF_DBG_OPTION(quiesceDebug,0) +RF_DBG_OPTION(raidSectorOffset,0) /* added to all incoming sectors to + debug alignment problems */ +RF_DBG_OPTION(reconDebug,0) +RF_DBG_OPTION(reconbufferDebug,0) +RF_DBG_OPTION(rewriteParityStripes,0) /* debug flag that causes parity rewrite at startup */ +RF_DBG_OPTION(scanDebug,0) /* debug SCAN sorting */ +RF_DBG_OPTION(showXorCallCounts,0) /* show n-way Xor call counts */ +RF_DBG_OPTION(shutdownDebug,0) /* show shutdown calls */ +RF_DBG_OPTION(sizePercentage,100) +RF_DBG_OPTION(sstfDebug,0) /* turn on debugging info for sstf queueing */ +RF_DBG_OPTION(stripeLockDebug,0) +RF_DBG_OPTION(suppressLocksAndLargeWrites,0) +RF_DBG_OPTION(suppressTraceDelays,0) +RF_DBG_OPTION(testDebug,0) +RF_DBG_OPTION(useMemChunks,1) +RF_DBG_OPTION(validateDAGDebug,0) +RF_DBG_OPTION(validateVisitedDebug,1) /* XXX turn to zero by default? 
*/ +RF_DBG_OPTION(verifyParityDebug,0) +RF_DBG_OPTION(warnLongIOs,0) + +#ifdef KERNEL +RF_DBG_OPTION(debugKernelAccess,0) /* DoAccessKernel debugging */ +#endif /* KERNEL */ + +#ifndef KERNEL +RF_DBG_OPTION(disableParityVerify,0) /* supress verification of parity */ +RF_DBG_OPTION(interactiveScript,0) /* set as a debug option for now */ +RF_DBG_OPTION(looptestShowWrites,0) /* user-level loop test write debugging */ +RF_DBG_OPTION(traceDebug,0) +#endif /* !KERNEL */ + +#ifdef SIMULATE +RF_DBG_OPTION(addrSizePercentage,100) +RF_DBG_OPTION(diskTrace,0) /* ised to turn the timing traces on and of */ +RF_DBG_OPTION(eventDebug,0) +RF_DBG_OPTION(mWactive,1500) +RF_DBG_OPTION(mWidle,625) +RF_DBG_OPTION(mWsleep,15) +RF_DBG_OPTION(mWspinup,3500) +#endif /* SIMULATE */ + +#if RF_INCLUDE_PARITYLOGGING > 0 +RF_DBG_OPTION(forceParityLogReint,0) +RF_DBG_OPTION(numParityRegions,0) /* number of regions in the array */ +RF_DBG_OPTION(numReintegrationThreads,1) +RF_DBG_OPTION(parityLogDebug,0) /* if nonzero, enables debugging of parity logging */ +RF_DBG_OPTION(totalInCoreLogCapacity,1024*1024) /* target bytes available for in-core logs */ +#endif /* RF_INCLUDE_PARITYLOGGING > 0 */ + +#if DFSTRACE > 0 +RF_DBG_OPTION(DFSTraceAccesses,0) +#endif /* DFSTRACE > 0 */ + +#if RF_DEMO > 0 +RF_DBG_OPTION(demoMeterHpos,0) /* horizontal position of meters for demo mode */ +RF_DBG_OPTION(demoMeterTag,0) +RF_DBG_OPTION(demoMeterVpos,0) /* vertical position of meters for demo mode */ +RF_DBG_OPTION(demoMode,0) +RF_DBG_OPTION(demoSMM,0) +RF_DBG_OPTION(demoSuppressReconInitVerify,0) /* supress initialization & verify for recon */ +#endif /* RF_DEMO > 0 */ diff --git a/sys/dev/raidframe/rf_owner.h b/sys/dev/raidframe/rf_owner.h new file mode 100644 index 00000000000..5b741bf3a5d --- /dev/null +++ b/sys/dev/raidframe/rf_owner.h @@ -0,0 +1,75 @@ +/* $OpenBSD: rf_owner.h,v 1.1 1999/01/11 14:29:33 niklas Exp $ */ +/* $NetBSD: rf_owner.h,v 1.1 1998/11/13 04:20:31 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* : + * Log: rf_owner.h,v + * Revision 1.8 1996/08/20 14:36:51 jimz + * add bufLen to RF_EventCreate_t to be able to include buffer length + * when freeing buffer + * + * Revision 1.7 1996/06/02 17:31:48 jimz + * Moved a lot of global stuff into array structure, where it belongs. + * Fixed up paritylogging, pss modules in this manner. Some general + * code cleanup. Removed lots of dead code, some dead files. 
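The rf_debugNames[] table declared in rf_options.h exists so that a debug option can be located and set by its string name; nothing in this import shows the lookup itself, but a plausible helper (hypothetical, assuming the RF_DebugName_t declaration from rf_options.h is in scope) would be:

    #include <string.h>

    /* hypothetical: find an option by name and set it; returns 0 on success */
    int rf_set_debug_option(RF_DebugName_t *names, const char *name, long val)
    {
            int i;

            for (i = 0; names[i].name != NULL; i++) {
                    if (strcmp(names[i].name, name) == 0) {
                            *names[i].ptr = val;
                            return (0);
                    }
            }
            return (-1);                    /* unknown option */
    }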
+ * + * Revision 1.6 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.5 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.4 1995/12/01 19:44:30 root + * added copyright info + * + */ + +#ifndef _RF__RF_OWNER_H_ +#define _RF__RF_OWNER_H_ + +#include "rf_types.h" + +struct RF_OwnerInfo_s { + RF_RaidAccessDesc_t *desc; + int owner; + double last_start; + int done; + int notFirst; +}; + +struct RF_EventCreate_s { + RF_Raid_t *raidPtr; + RF_Script_t *script; + RF_OwnerInfo_t *ownerInfo; + char *bufPtr; + int bufLen; +}; + +#endif /* !_RF__RF_OWNER_H_ */ diff --git a/sys/dev/raidframe/rf_paritylog.c b/sys/dev/raidframe/rf_paritylog.c new file mode 100644 index 00000000000..84bf2107d99 --- /dev/null +++ b/sys/dev/raidframe/rf_paritylog.c @@ -0,0 +1,1022 @@ +/* $OpenBSD: rf_paritylog.c,v 1.1 1999/01/11 14:29:34 niklas Exp $ */ +/* $NetBSD: rf_paritylog.c,v 1.1 1998/11/13 04:20:31 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: William V. Courtright II + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* Code for manipulating in-core parity logs + * + * : + * Log: rf_paritylog.c,v + * Revision 1.27 1996/07/28 20:31:39 jimz + * i386netbsd port + * true/false fixup + * + * Revision 1.26 1996/07/27 23:36:08 jimz + * Solaris port of simulator + * + * Revision 1.25 1996/07/17 21:00:58 jimz + * clean up timer interface, tracing + * + * Revision 1.24 1996/06/11 10:18:59 jimz + * AllocParityLogCommonData() was freeing the common pointer immediately + * after allocating this. It appeared that this free really belonged + * inside one of the failure cases (for backing out), so I moved it + * in there. + * + * Revision 1.23 1996/06/05 18:06:02 jimz + * Major code cleanup. The Great Renaming is now done. + * Better modularity. Better typing. Fixed a bunch of + * synchronization bugs. Made a lot of global stuff + * per-desc or per-array. Removed dead code. + * + * Revision 1.22 1996/06/02 17:31:48 jimz + * Moved a lot of global stuff into array structure, where it belongs. + * Fixed up paritylogging, pss modules in this manner. Some general + * code cleanup. Removed lots of dead code, some dead files. 
+ * + * Revision 1.21 1996/05/31 22:26:54 jimz + * fix a lot of mapping problems, memory allocation problems + * found some weird lock issues, fixed 'em + * more code cleanup + * + * Revision 1.20 1996/05/30 23:22:16 jimz + * bugfixes of serialization, timing problems + * more cleanup + * + * Revision 1.19 1996/05/30 12:59:18 jimz + * make etimer happier, more portable + * + * Revision 1.18 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.17 1996/05/24 04:28:55 jimz + * release cleanup ckpt + * + * Revision 1.16 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.15 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.14 1996/05/20 16:16:59 jimz + * switch to rf_{mutex,cond}_{init,destroy} + * + * Revision 1.13 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.12 1995/12/12 18:10:06 jimz + * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT + * fix 80-column brain damage in comments + * + * Revision 1.11 1995/12/06 20:54:44 wvcii + * added prototyping + * + * Revision 1.10 1995/11/30 16:05:37 wvcii + * added copyright info + * + * Revision 1.9 1995/10/08 20:41:28 wvcii + * fixed bug in allocation of CommonLogData (was allocating incorrect size) + * + * Revision 1.8 1995/09/07 15:52:12 jimz + * noop compile when INCLUDE_PARITYLOGGING not defined + * + * Revision 1.7 1995/09/06 19:17:36 wvcii + * moved code for reintegration to rf_paritylogDiskMgr.c + * + * Revision 1.6 95/07/07 00:16:06 wvcii + * this version free from deadlock, fails parity verification + * + * Revision 1.5 1995/06/09 13:14:24 wvcii + * code is now nonblocking + * + * Revision 1.4 95/06/01 17:01:59 wvcii + * code debug + * + * Revision 1.3 95/05/31 13:08:23 wvcii + * code debug + * + * Revision 1.2 95/05/21 15:42:15 wvcii + * code debug + * + * Revision 1.1 95/05/18 10:43:54 wvcii + * Initial revision + * + */ + +#include "rf_archs.h" + +#if RF_INCLUDE_PARITYLOGGING > 0 + +/* + * Append-only log for recording parity "update" and "overwrite" records + */ + +#include "rf_types.h" +#include "rf_threadstuff.h" +#include "rf_mcpair.h" +#include "rf_raid.h" +#include "rf_dag.h" +#include "rf_dagfuncs.h" +#include "rf_desc.h" +#include "rf_layout.h" +#include "rf_diskqueue.h" +#include "rf_etimer.h" +#include "rf_paritylog.h" +#include "rf_general.h" +#include "rf_threadid.h" +#include "rf_map.h" +#include "rf_paritylogging.h" +#include "rf_paritylogDiskMgr.h" +#include "rf_sys.h" + +static RF_CommonLogData_t *AllocParityLogCommonData(RF_Raid_t *raidPtr) +{ + RF_CommonLogData_t *common = NULL; + int rc; + + /* Return a struct for holding common parity log information from the free + list (rf_parityLogDiskQueue.freeCommonList). If the free list is empty, call + RF_Malloc to create a new structure. 
+ NON-BLOCKING */ + + RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); + if (raidPtr->parityLogDiskQueue.freeCommonList) + { + common = raidPtr->parityLogDiskQueue.freeCommonList; + raidPtr->parityLogDiskQueue.freeCommonList = raidPtr->parityLogDiskQueue.freeCommonList->next; + RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); + } + else + { + RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); + RF_Malloc(common, sizeof(RF_CommonLogData_t), (RF_CommonLogData_t *)); + rc = rf_mutex_init(&common->mutex); + if (rc) { + RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__, + __LINE__, rc); + RF_Free(common, sizeof(RF_CommonLogData_t)); + common = NULL; + } + } + common->next = NULL; + return(common); +} + +static void FreeParityLogCommonData(RF_CommonLogData_t *common) +{ + RF_Raid_t *raidPtr; + + /* Insert a single struct for holding parity log information + (data) into the free list (rf_parityLogDiskQueue.freeCommonList). + NON-BLOCKING */ + + raidPtr = common->raidPtr; + RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); + common->next = raidPtr->parityLogDiskQueue.freeCommonList; + raidPtr->parityLogDiskQueue.freeCommonList = common; + RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); +} + +static RF_ParityLogData_t *AllocParityLogData(RF_Raid_t *raidPtr) +{ + RF_ParityLogData_t *data = NULL; + + /* Return a struct for holding parity log information from the free + list (rf_parityLogDiskQueue.freeList). If the free list is empty, call + RF_Malloc to create a new structure. + NON-BLOCKING */ + + RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); + if (raidPtr->parityLogDiskQueue.freeDataList) + { + data = raidPtr->parityLogDiskQueue.freeDataList; + raidPtr->parityLogDiskQueue.freeDataList = raidPtr->parityLogDiskQueue.freeDataList->next; + RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); + } + else + { + RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); + RF_Malloc(data, sizeof(RF_ParityLogData_t), (RF_ParityLogData_t *)); + } + data->next = NULL; + data->prev = NULL; + return(data); +} + + +static void FreeParityLogData(RF_ParityLogData_t *data) +{ + RF_ParityLogData_t *nextItem; + RF_Raid_t *raidPtr; + + /* Insert a linked list of structs for holding parity log + information (data) into the free list (parityLogDiskQueue.freeList). + NON-BLOCKING */ + + raidPtr = data->common->raidPtr; + RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); + while (data) + { + nextItem = data->next; + data->next = raidPtr->parityLogDiskQueue.freeDataList; + raidPtr->parityLogDiskQueue.freeDataList = data; + data = nextItem; + } + RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); +} + + +static void EnqueueParityLogData( + RF_ParityLogData_t *data, + RF_ParityLogData_t **head, + RF_ParityLogData_t **tail) +{ + RF_Raid_t *raidPtr; + + /* Insert an in-core parity log (*data) into the head of + a disk queue (*head, *tail). 
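AllocParityLogData() and FreeParityLogData() above implement a classic free-list-or-malloc allocator: the mutex only protects the list itself, the slow malloc path runs with the lock dropped, and "freeing" just pushes the node back for reuse. A compact userland sketch of the pattern, with a pthread mutex standing in for the parityLogDiskQueue mutex:

    #include <pthread.h>
    #include <stdlib.h>

    struct log_data {
            struct log_data *next;
            /* ... payload ... */
    };

    static struct log_data *free_list;
    static pthread_mutex_t free_mtx = PTHREAD_MUTEX_INITIALIZER;

    /* pop from the free list if possible, otherwise fall back to malloc */
    static struct log_data *alloc_log_data(void)
    {
            struct log_data *d;

            pthread_mutex_lock(&free_mtx);
            if ((d = free_list) != NULL) {
                    free_list = d->next;
                    pthread_mutex_unlock(&free_mtx);
            } else {
                    pthread_mutex_unlock(&free_mtx);
                    d = malloc(sizeof(*d)); /* slow path, outside the lock */
            }
            if (d != NULL)
                    d->next = NULL;
            return (d);
    }

    /* push back onto the free list instead of calling free() */
    static void release_log_data(struct log_data *d)
    {
            pthread_mutex_lock(&free_mtx);
            d->next = free_list;
            free_list = d;
            pthread_mutex_unlock(&free_mtx);
    }

    int main(void)
    {
            struct log_data *d = alloc_log_data();

            release_log_data(d);            /* the next alloc reuses this node */
            return (alloc_log_data() == d ? 0 : 1);
    }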
+ NON-BLOCKING */ + + raidPtr = data->common->raidPtr; + if (rf_parityLogDebug) + printf("[enqueueing parity log data, region %d, raidAddress %d, numSector %d]\n",data->regionID,(int)data->diskAddress.raidAddress, (int)data->diskAddress.numSector); + RF_ASSERT(data->prev == NULL); + RF_ASSERT(data->next == NULL); + RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); + if (*head) + { + /* insert into head of queue */ + RF_ASSERT((*head)->prev == NULL); + RF_ASSERT((*tail)->next == NULL); + data->next = *head; + (*head)->prev = data; + *head = data; + } + else + { + /* insert into empty list */ + RF_ASSERT(*head == NULL); + RF_ASSERT(*tail == NULL); + *head = data; + *tail = data; + } + RF_ASSERT((*head)->prev == NULL); + RF_ASSERT((*tail)->next == NULL); + RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); +} + +static RF_ParityLogData_t *DequeueParityLogData( + RF_Raid_t *raidPtr, + RF_ParityLogData_t **head, + RF_ParityLogData_t **tail, + int ignoreLocks) +{ + RF_ParityLogData_t *data; + + /* Remove and return an in-core parity log from the tail of + a disk queue (*head, *tail). + NON-BLOCKING */ + + /* remove from tail, preserving FIFO order */ + if (!ignoreLocks) + RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); + data = *tail; + if (data) + { + if (*head == *tail) + { + /* removing last item from queue */ + *head = NULL; + *tail = NULL; + } + else + { + *tail = (*tail)->prev; + (*tail)->next = NULL; + RF_ASSERT((*head)->prev == NULL); + RF_ASSERT((*tail)->next == NULL); + } + data->next = NULL; + data->prev = NULL; + if (rf_parityLogDebug) + printf("[dequeueing parity log data, region %d, raidAddress %d, numSector %d]\n",data->regionID,(int)data->diskAddress.raidAddress, (int)data->diskAddress.numSector); + } + if (*head) + { + RF_ASSERT((*head)->prev == NULL); + RF_ASSERT((*tail)->next == NULL); + } + if (!ignoreLocks) + RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); + return(data); +} + + +static void RequeueParityLogData( + RF_ParityLogData_t *data, + RF_ParityLogData_t **head, + RF_ParityLogData_t **tail) +{ + RF_Raid_t *raidPtr; + + /* Insert an in-core parity log (*data) into the tail of + a disk queue (*head, *tail). + NON-BLOCKING */ + + raidPtr = data->common->raidPtr; + RF_ASSERT(data); + if (rf_parityLogDebug) + printf("[requeueing parity log data, region %d, raidAddress %d, numSector %d]\n",data->regionID,(int)data->diskAddress.raidAddress, (int) data->diskAddress.numSector); + RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); + if (*tail) + { + /* append to tail of list */ + data->prev = *tail; + data->next = NULL; + (*tail)->next = data; + *tail = data; + } + else + { + /* inserting into an empty list */ + *head = data; + *tail = data; + (*head)->prev = NULL; + (*tail)->next = NULL; + } + RF_ASSERT((*head)->prev == NULL); + RF_ASSERT((*tail)->next == NULL); + RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); +} + +RF_ParityLogData_t *rf_CreateParityLogData( + RF_ParityRecordType_t operation, + RF_PhysDiskAddr_t *pda, + caddr_t bufPtr, + RF_Raid_t *raidPtr, + int (*wakeFunc)(RF_DagNode_t *node, int status), + void *wakeArg, + RF_AccTraceEntry_t *tracerec, + RF_Etimer_t startTime) +{ + RF_ParityLogData_t *data, *resultHead = NULL, *resultTail = NULL; + RF_CommonLogData_t *common; + RF_PhysDiskAddr_t *diskAddress; + int boundary, offset = 0; + + /* Return an initialized struct of info to be logged. + Build one item per physical disk address, one item per region. 
+ + NON-BLOCKING */ + + diskAddress = pda; + common = AllocParityLogCommonData(raidPtr); + RF_ASSERT(common); + + common->operation = operation; + common->bufPtr = bufPtr; + common->raidPtr = raidPtr; + common->wakeFunc = wakeFunc; + common->wakeArg = wakeArg; + common->tracerec = tracerec; + common->startTime = startTime; + common->cnt = 0; + + if (rf_parityLogDebug) + printf("[entering CreateParityLogData]\n"); + while (diskAddress) + { + common->cnt++; + data = AllocParityLogData(raidPtr); + RF_ASSERT(data); + data->common = common; + data->next = NULL; + data->prev = NULL; + data->regionID = rf_MapRegionIDParityLogging(raidPtr, diskAddress->startSector); + if (data->regionID == rf_MapRegionIDParityLogging(raidPtr, diskAddress->startSector + diskAddress->numSector - 1)) + { + /* disk address does not cross a region boundary */ + data->diskAddress = *diskAddress; + data->bufOffset = offset; + offset = offset + diskAddress->numSector; + EnqueueParityLogData(data, &resultHead, &resultTail); + /* adjust disk address */ + diskAddress = diskAddress->next; + } + else + { + /* disk address crosses a region boundary */ + /* find address where region is crossed */ + boundary = 0; + while (data->regionID == rf_MapRegionIDParityLogging(raidPtr, diskAddress->startSector + boundary)) + boundary++; + + /* enter data before the boundary */ + data->diskAddress = *diskAddress; + data->diskAddress.numSector = boundary; + data->bufOffset = offset; + offset += boundary; + EnqueueParityLogData(data, &resultHead, &resultTail); + /* adjust disk address */ + diskAddress->startSector += boundary; + diskAddress->numSector -= boundary; + } + } + if (rf_parityLogDebug) + printf("[leaving CreateParityLogData]\n"); + return(resultHead); +} + + +RF_ParityLogData_t *rf_SearchAndDequeueParityLogData( + RF_Raid_t *raidPtr, + int regionID, + RF_ParityLogData_t **head, + RF_ParityLogData_t **tail, + int ignoreLocks) +{ + RF_ParityLogData_t *w; + + /* Remove and return an in-core parity log from a specified region (regionID). + If a matching log is not found, return NULL. + + NON-BLOCKING. 
+ */ + + /* walk backward through a list, looking for an entry with a matching region ID */ + if (!ignoreLocks) + RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); + w = (*tail); + while (w) + { + if (w->regionID == regionID) + { + /* remove an element from the list */ + if (w == *tail) + { + if (*head == *tail) + { + /* removing only element in the list */ + *head = NULL; + *tail = NULL; + } + else + { + /* removing last item in the list */ + *tail = (*tail)->prev; + (*tail)->next = NULL; + RF_ASSERT((*head)->prev == NULL); + RF_ASSERT((*tail)->next == NULL); + } + } + else + { + if (w == *head) + { + /* removing first item in the list */ + *head = (*head)->next; + (*head)->prev = NULL; + RF_ASSERT((*head)->prev == NULL); + RF_ASSERT((*tail)->next == NULL); + } + else + { + /* removing an item from the middle of the list */ + w->prev->next = w->next; + w->next->prev = w->prev; + RF_ASSERT((*head)->prev == NULL); + RF_ASSERT((*tail)->next == NULL); + } + } + w->prev = NULL; + w->next = NULL; + if (rf_parityLogDebug) + printf("[dequeueing parity log data, region %d, raidAddress %d, numSector %d]\n",w->regionID,(int)w->diskAddress.raidAddress,(int) w->diskAddress.numSector); + return(w); + } + else + w = w->prev; + } + if (!ignoreLocks) + RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); + return(NULL); +} + +static RF_ParityLogData_t *DequeueMatchingLogData( + RF_Raid_t *raidPtr, + RF_ParityLogData_t **head, + RF_ParityLogData_t **tail) +{ + RF_ParityLogData_t *logDataList, *logData; + int regionID; + + /* Remove and return an in-core parity log from the tail of + a disk queue (*head, *tail). Then remove all matching + (identical regionIDs) logData and return as a linked list. + + NON-BLOCKING + */ + + logDataList = DequeueParityLogData(raidPtr, head, tail, RF_TRUE); + if (logDataList) + { + regionID = logDataList->regionID; + logData = logDataList; + logData->next = rf_SearchAndDequeueParityLogData(raidPtr, regionID, head, tail, RF_TRUE); + while (logData->next) + { + logData = logData->next; + logData->next = rf_SearchAndDequeueParityLogData(raidPtr, regionID, head, tail, RF_TRUE); + } + } + return(logDataList); +} + + +static RF_ParityLog_t *AcquireParityLog( + RF_ParityLogData_t *logData, + int finish) +{ + RF_ParityLog_t *log = NULL; + RF_Raid_t *raidPtr; + + /* Grab a log buffer from the pool and return it. + If no buffers are available, return NULL. + NON-BLOCKING + */ + raidPtr = logData->common->raidPtr; + RF_LOCK_MUTEX(raidPtr->parityLogPool.mutex); + if (raidPtr->parityLogPool.parityLogs) + { + log = raidPtr->parityLogPool.parityLogs; + raidPtr->parityLogPool.parityLogs = raidPtr->parityLogPool.parityLogs->next; + log->regionID = logData->regionID; + log->numRecords = 0; + log->next = NULL; + raidPtr->logsInUse++; + RF_ASSERT(raidPtr->logsInUse >= 0 && raidPtr->logsInUse <= raidPtr->numParityLogs); + } + else + { + /* no logs available, so place ourselves on the queue of work waiting on log buffers + this is done while parityLogPool.mutex is held, to ensure synchronization + with ReleaseParityLogs. 
+ */ + if (rf_parityLogDebug) + printf("[blocked on log, region %d, finish %d]\n", logData->regionID, finish); + if (finish) + RequeueParityLogData(logData, &raidPtr->parityLogDiskQueue.logBlockHead, &raidPtr->parityLogDiskQueue.logBlockTail); + else + EnqueueParityLogData(logData, &raidPtr->parityLogDiskQueue.logBlockHead, &raidPtr->parityLogDiskQueue.logBlockTail); + } + RF_UNLOCK_MUTEX(raidPtr->parityLogPool.mutex); + return(log); +} + +void rf_ReleaseParityLogs( + RF_Raid_t *raidPtr, + RF_ParityLog_t *firstLog) +{ + RF_ParityLogData_t *logDataList; + RF_ParityLog_t *log, *lastLog; + int cnt; + + /* Insert a linked list of parity logs (firstLog) to + the free list (parityLogPool.parityLogPool) + + NON-BLOCKING. + */ + + RF_ASSERT(firstLog); + + /* Before returning logs to global free list, service all + requests which are blocked on logs. Holding mutexes for parityLogPool and parityLogDiskQueue + forces synchronization with AcquireParityLog(). + */ + RF_LOCK_MUTEX(raidPtr->parityLogPool.mutex); + RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); + logDataList = DequeueMatchingLogData(raidPtr, &raidPtr->parityLogDiskQueue.logBlockHead, &raidPtr->parityLogDiskQueue.logBlockTail); + log = firstLog; + if (firstLog) + firstLog = firstLog->next; + log->numRecords = 0; + log->next = NULL; + while (logDataList && log) + { + RF_UNLOCK_MUTEX(raidPtr->parityLogPool.mutex); + RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); + rf_ParityLogAppend(logDataList, RF_TRUE, &log, RF_FALSE); + if (rf_parityLogDebug) + printf("[finishing up buf-blocked log data, region %d]\n", logDataList->regionID); + if (log == NULL) + { + log = firstLog; + if (firstLog) + { + firstLog = firstLog->next; + log->numRecords = 0; + log->next = NULL; + } + } + RF_LOCK_MUTEX(raidPtr->parityLogPool.mutex); + RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); + if (log) + logDataList = DequeueMatchingLogData(raidPtr, &raidPtr->parityLogDiskQueue.logBlockHead, &raidPtr->parityLogDiskQueue.logBlockTail); + } + /* return remaining logs to pool */ + if (log) + { + log->next = firstLog; + firstLog = log; + } + if (firstLog) + { + lastLog = firstLog; + raidPtr->logsInUse--; + RF_ASSERT(raidPtr->logsInUse >= 0 && raidPtr->logsInUse <= raidPtr->numParityLogs); + while (lastLog->next) + { + lastLog = lastLog->next; + raidPtr->logsInUse--; + RF_ASSERT(raidPtr->logsInUse >= 0 && raidPtr->logsInUse <= raidPtr->numParityLogs); + } + lastLog->next = raidPtr->parityLogPool.parityLogs; + raidPtr->parityLogPool.parityLogs = firstLog; + cnt = 0; + log = raidPtr->parityLogPool.parityLogs; + while (log) + { + cnt++; + log = log->next; + } + RF_ASSERT(cnt + raidPtr->logsInUse == raidPtr->numParityLogs); + } + RF_UNLOCK_MUTEX(raidPtr->parityLogPool.mutex); + RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); +} + +static void ReintLog( + RF_Raid_t *raidPtr, + int regionID, + RF_ParityLog_t *log) +{ + RF_ASSERT(log); + + /* Insert an in-core parity log (log) into the disk queue of reintegration + work. Set the flag (reintInProgress) for the specified region (regionID) + to indicate that reintegration is in progress for this region. 
+ NON-BLOCKING + */ + + RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex); + raidPtr->regionInfo[regionID].reintInProgress = RF_TRUE; /* cleared when reint complete */ + + if (rf_parityLogDebug) + printf("[requesting reintegration of region %d]\n", log->regionID); + /* move record to reintegration queue */ + RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); + log->next = raidPtr->parityLogDiskQueue.reintQueue; + raidPtr->parityLogDiskQueue.reintQueue = log; + RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex); + RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); + RF_SIGNAL_COND(raidPtr->parityLogDiskQueue.cond); +} + +static void FlushLog( + RF_Raid_t *raidPtr, + RF_ParityLog_t *log) +{ + /* insert a core log (log) into a list of logs (parityLogDiskQueue.flushQueue) + waiting to be written to disk. + NON-BLOCKING + */ + + RF_ASSERT(log); + RF_ASSERT(log->numRecords == raidPtr->numSectorsPerLog); + RF_ASSERT(log->next == NULL); + /* move log to flush queue */ + RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); + log->next = raidPtr->parityLogDiskQueue.flushQueue; + raidPtr->parityLogDiskQueue.flushQueue = log; + RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); + RF_SIGNAL_COND(raidPtr->parityLogDiskQueue.cond); +} + +static int DumpParityLogToDisk( + int finish, + RF_ParityLogData_t *logData) +{ + int i, diskCount, regionID = logData->regionID; + RF_ParityLog_t *log; + RF_Raid_t *raidPtr; + + raidPtr = logData->common->raidPtr; + + /* Move a core log to disk. If the log disk is full, initiate + reintegration. + + Return (0) if we can enqueue the dump immediately, otherwise + return (1) to indicate we are blocked on reintegration and + control of the thread should be relinquished. + + Caller must hold regionInfo[regionID].mutex + + NON-BLOCKING + */ + + if (rf_parityLogDebug) + printf("[dumping parity log to disk, region %d]\n", regionID); + log = raidPtr->regionInfo[regionID].coreLog; + RF_ASSERT(log->numRecords == raidPtr->numSectorsPerLog); + RF_ASSERT(log->next == NULL); + + /* if reintegration is in progress, must queue work */ + RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex); + if (raidPtr->regionInfo[regionID].reintInProgress) + { + /* Can not proceed since this region is currently being reintegrated. + We can not block, so queue remaining work and return */ + if (rf_parityLogDebug) + printf("[region %d waiting on reintegration]\n",regionID); + /* XXX not sure about the use of finish - shouldn't this always be "Enqueue"? */ + if (finish) + RequeueParityLogData(logData, &raidPtr->parityLogDiskQueue.reintBlockHead, &raidPtr->parityLogDiskQueue.reintBlockTail); + else + EnqueueParityLogData(logData, &raidPtr->parityLogDiskQueue.reintBlockHead, &raidPtr->parityLogDiskQueue.reintBlockTail); + RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex); + return(1); /* relinquish control of this thread */ + } + RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex); + raidPtr->regionInfo[regionID].coreLog = NULL; + if ((raidPtr->regionInfo[regionID].diskCount) < raidPtr->regionInfo[regionID].capacity) + /* IMPORTANT!! 
this loop bound assumes region disk holds an integral number of core logs */ + { + /* update disk map for this region */ + diskCount = raidPtr->regionInfo[regionID].diskCount; + for (i = 0; i < raidPtr->numSectorsPerLog; i++) + { + raidPtr->regionInfo[regionID].diskMap[i + diskCount].operation = log->records[i].operation; + raidPtr->regionInfo[regionID].diskMap[i + diskCount].parityAddr = log->records[i].parityAddr; + } + log->diskOffset = diskCount; + raidPtr->regionInfo[regionID].diskCount += raidPtr->numSectorsPerLog; + FlushLog(raidPtr, log); + } + else + { + /* no room for log on disk, send it to disk manager and request reintegration */ + RF_ASSERT(raidPtr->regionInfo[regionID].diskCount == raidPtr->regionInfo[regionID].capacity); + ReintLog(raidPtr, regionID, log); + } + if (rf_parityLogDebug) + printf("[finished dumping parity log to disk, region %d]\n", regionID); + return(0); +} + +int rf_ParityLogAppend( + RF_ParityLogData_t *logData, + int finish, + RF_ParityLog_t **incomingLog, + int clearReintFlag) +{ + int regionID, logItem, itemDone; + RF_ParityLogData_t *item; + int punt, done = RF_FALSE; + RF_ParityLog_t *log; + RF_Raid_t *raidPtr; + RF_Etimer_t timer; + int (*wakeFunc)(RF_DagNode_t *node, int status); + void *wakeArg; + + /* Add parity to the appropriate log, one sector at a time. + This routine is called by dag functions ParityLogUpdateFunc + and ParityLogOverwriteFunc and therefore MUST BE NONBLOCKING. + + Parity to be logged is contained in a linked-list (logData). When + this routine returns, every sector in the list will be in one of + three places: + 1) entered into the parity log + 2) queued, waiting on reintegration + 3) queued, waiting on a core log + + Blocked work is passed to the ParityLoggingDiskManager for completion. + Later, as conditions which required the block are removed, the work + reenters this routine with the "finish" parameter set to "RF_TRUE." + + NON-BLOCKING + */ + + raidPtr = logData->common->raidPtr; + /* lock the region for the first item in logData */ + RF_ASSERT(logData != NULL); + regionID = logData->regionID; + RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].mutex); + RF_ASSERT(raidPtr->regionInfo[regionID].loggingEnabled); + + if (clearReintFlag) + { + /* Enable flushing for this region. Holding both locks provides + a synchronization barrier with DumpParityLogToDisk + */ + RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex); + RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); + RF_ASSERT(raidPtr->regionInfo[regionID].reintInProgress == RF_TRUE); + raidPtr->regionInfo[regionID].diskCount = 0; + raidPtr->regionInfo[regionID].reintInProgress = RF_FALSE; + RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex); /* flushing is now enabled */ + RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); + } + + /* process each item in logData */ + while (logData) + { + /* remove an item from logData */ + item = logData; + logData = logData->next; + item->next = NULL; + item->prev = NULL; + + if (rf_parityLogDebug) + printf("[appending parity log data, region %d, raidAddress %d, numSector %d]\n",item->regionID,(int)item->diskAddress.raidAddress, (int)item->diskAddress.numSector); + + /* see if we moved to a new region */ + if (regionID != item->regionID) + { + RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].mutex); + regionID = item->regionID; + RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].mutex); + RF_ASSERT(raidPtr->regionInfo[regionID].loggingEnabled); + } + + punt = RF_FALSE; /* Set to RF_TRUE if work is blocked. 
This can happen in one of two ways: + 1) no core log (AcquireParityLog) + 2) waiting on reintegration (DumpParityLogToDisk) + If punt is RF_TRUE, the dataItem was queued, so skip to next item. + */ + + /* process item, one sector at a time, until all sectors processed or we punt */ + if (item->diskAddress.numSector > 0) + done = RF_FALSE; + else + RF_ASSERT(0); + while (!punt && !done) + { + /* verify that a core log exists for this region */ + if (!raidPtr->regionInfo[regionID].coreLog) + { + /* Attempt to acquire a parity log. + If acquisition fails, queue remaining work in data item and move to nextItem. + */ + if (incomingLog) { + if (*incomingLog) + { + RF_ASSERT((*incomingLog)->next == NULL); + raidPtr->regionInfo[regionID].coreLog = *incomingLog; + raidPtr->regionInfo[regionID].coreLog->regionID = regionID; + *incomingLog = NULL; + } + else + raidPtr->regionInfo[regionID].coreLog = AcquireParityLog(item, finish); + } else + raidPtr->regionInfo[regionID].coreLog = AcquireParityLog(item, finish); + /* Note: AcquireParityLog either returns a log or enqueues currentItem */ + } + if (!raidPtr->regionInfo[regionID].coreLog) + punt = RF_TRUE; /* failed to find a core log */ + else + { + RF_ASSERT(raidPtr->regionInfo[regionID].coreLog->next == NULL); + /* verify that the log has room for new entries */ + /* if log is full, dump it to disk and grab a new log */ + if (raidPtr->regionInfo[regionID].coreLog->numRecords == raidPtr->numSectorsPerLog) + { + /* log is full, dump it to disk */ + if (DumpParityLogToDisk(finish, item)) + punt = RF_TRUE; /* dump unsuccessful, blocked on reintegration */ + else + { + /* dump was successful */ + if (incomingLog) { + if (*incomingLog) + { + RF_ASSERT((*incomingLog)->next == NULL); + raidPtr->regionInfo[regionID].coreLog = *incomingLog; + raidPtr->regionInfo[regionID].coreLog->regionID = regionID; + *incomingLog = NULL; + } + else + raidPtr->regionInfo[regionID].coreLog = AcquireParityLog(item, finish); + } else + raidPtr->regionInfo[regionID].coreLog = AcquireParityLog(item, finish); + /* if a core log is not available, must queue work and return */ + if (!raidPtr->regionInfo[regionID].coreLog) + punt = RF_TRUE; /* blocked on log availability */ + } + } + } + /* if we didn't punt on this item, attempt to add a sector to the core log */ + if (!punt) + { + RF_ASSERT(raidPtr->regionInfo[regionID].coreLog->next == NULL); + /* at this point, we have a core log with enough room for a sector */ + /* copy a sector into the log */ + log = raidPtr->regionInfo[regionID].coreLog; + RF_ASSERT(log->numRecords < raidPtr->numSectorsPerLog); + logItem = log->numRecords++; + log->records[logItem].parityAddr = item->diskAddress; + RF_ASSERT(log->records[logItem].parityAddr.startSector >= raidPtr->regionInfo[regionID].parityStartAddr); + RF_ASSERT(log->records[logItem].parityAddr.startSector < raidPtr->regionInfo[regionID].parityStartAddr + raidPtr->regionInfo[regionID].numSectorsParity); + log->records[logItem].parityAddr.numSector = 1; + log->records[logItem].operation = item->common->operation; + bcopy((item->common->bufPtr + (item->bufOffset++ * (1<<item->common->raidPtr->logBytesPerSector))), log->bufPtr + (logItem * (1<<item->common->raidPtr->logBytesPerSector)), (1<<item->common->raidPtr->logBytesPerSector)); + item->diskAddress.numSector--; + item->diskAddress.startSector++; + if (item->diskAddress.numSector == 0) + done = RF_TRUE; + } + } + + if (!punt) + { + /* Processed this item completely, decrement count of items + to be processed. 
+ */ + RF_ASSERT(item->diskAddress.numSector == 0); + RF_LOCK_MUTEX(item->common->mutex); + item->common->cnt--; + if (item->common->cnt == 0) + itemDone = RF_TRUE; + else + itemDone = RF_FALSE; + RF_UNLOCK_MUTEX(item->common->mutex); + if (itemDone) + { + /* Finished processing all log data for this IO + Return structs to free list and invoke wakeup function. + */ + timer = item->common->startTime; /* grab initial value of timer */ + RF_ETIMER_STOP(timer); + RF_ETIMER_EVAL(timer); + item->common->tracerec->plog_us += RF_ETIMER_VAL_US(timer); + if (rf_parityLogDebug) + printf("[waking process for region %d]\n", item->regionID); + wakeFunc = item->common->wakeFunc; + wakeArg = item->common->wakeArg; + FreeParityLogCommonData(item->common); + FreeParityLogData(item); + (wakeFunc)(wakeArg, 0); + } + else + FreeParityLogData(item); + } + } + RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].mutex); + if (rf_parityLogDebug) + printf("[exiting ParityLogAppend]\n"); + return(0); +} + + +void rf_EnableParityLogging(RF_Raid_t *raidPtr) +{ + int regionID; + + for (regionID = 0; regionID < rf_numParityRegions; regionID++) { + RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].mutex); + raidPtr->regionInfo[regionID].loggingEnabled = RF_TRUE; + RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].mutex); + } + if (rf_parityLogDebug) + printf("[parity logging enabled]\n"); +} + +#endif /* RF_INCLUDE_PARITYLOGGING > 0 */ diff --git a/sys/dev/raidframe/rf_paritylog.h b/sys/dev/raidframe/rf_paritylog.h new file mode 100644 index 00000000000..fd6128174e1 --- /dev/null +++ b/sys/dev/raidframe/rf_paritylog.h @@ -0,0 +1,225 @@ +/* $OpenBSD: rf_paritylog.h,v 1.1 1999/01/11 14:29:34 niklas Exp $ */ +/* $NetBSD: rf_paritylog.h,v 1.1 1998/11/13 04:20:31 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: William V. Courtright II + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* header file for parity log + * + * : + * Log: rf_paritylog.h,v + * Revision 1.21 1996/07/17 21:00:58 jimz + * clean up timer interface, tracing + * + * Revision 1.20 1996/07/15 17:22:18 jimz + * nit-pick code cleanup + * resolve stdlib problems on DEC OSF + * + * Revision 1.19 1996/06/11 10:17:57 jimz + * definitions and run state for parity logging thread + * + * Revision 1.18 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.17 1996/06/05 18:06:02 jimz + * Major code cleanup. The Great Renaming is now done. + * Better modularity. Better typing. Fixed a bunch of + * synchronization bugs. 
Made a lot of global stuff + * per-desc or per-array. Removed dead code. + * + * Revision 1.16 1996/06/02 17:31:48 jimz + * Moved a lot of global stuff into array structure, where it belongs. + * Fixed up paritylogging, pss modules in this manner. Some general + * code cleanup. Removed lots of dead code, some dead files. + * + * Revision 1.15 1996/05/31 22:26:54 jimz + * fix a lot of mapping problems, memory allocation problems + * found some weird lock issues, fixed 'em + * more code cleanup + * + * Revision 1.14 1996/05/30 11:29:41 jimz + * Numerous bug fixes. Stripe lock release code disagreed with the taking code + * about when stripes should be locked (I made it consistent: no parity, no lock) + * There was a lot of extra serialization of I/Os which I've removed- a lot of + * it was to calculate values for the cache code, which is no longer with us. + * More types, function, macro cleanup. Added code to properly quiesce the array + * on shutdown. Made a lot of stuff array-specific which was (bogusly) general + * before. Fixed memory allocation, freeing bugs. + * + * Revision 1.13 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.12 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.11 1995/12/06 20:54:58 wvcii + * added prototyping + * + * Revision 1.10 1995/11/30 16:05:50 wvcii + * added copyright info + * + * Revision 1.9 1995/10/07 05:09:27 wvcii + * removed #define BYTESPERSECTOR 512 + * + * Revision 1.8 1995/09/06 19:27:52 wvcii + * added startTime to commonLogData + * + * Revision 1.7 1995/07/07 00:13:42 wvcii + * this version free from deadlock, fails parity verification + * + */ + +#ifndef _RF__RF_PARITYLOG_H_ +#define _RF__RF_PARITYLOG_H_ + +#include "rf_types.h" + +#define RF_DEFAULT_NUM_SECTORS_PER_LOG 64 + +typedef int RF_RegionId_t; + +typedef enum RF_ParityRecordType_e { + RF_STOP, + RF_UPDATE, + RF_OVERWRITE +} RF_ParityRecordType_t; + +struct RF_CommonLogData_s { + RF_DECLARE_MUTEX(mutex) /* protects cnt */ + int cnt; /* when 0, time to call wakeFunc */ + RF_Raid_t *raidPtr; +/* int (*wakeFunc)(struct buf *); */ + int (*wakeFunc)(RF_DagNode_t *node, int status); + void *wakeArg; + RF_AccTraceEntry_t *tracerec; + RF_Etimer_t startTime; + caddr_t bufPtr; + RF_ParityRecordType_t operation; + RF_CommonLogData_t *next; +}; + +struct RF_ParityLogData_s { + RF_RegionId_t regionID; /* this struct guaranteed to span a single region */ + int bufOffset; /* offset from common->bufPtr */ + RF_PhysDiskAddr_t diskAddress; + RF_CommonLogData_t *common; /* info shared by one or more parityLogData structs */ + RF_ParityLogData_t *next; + RF_ParityLogData_t *prev; +}; + +struct RF_ParityLogAppendQueue_s { + RF_DECLARE_MUTEX(mutex) +}; + +struct RF_ParityLogRecord_s { + RF_PhysDiskAddr_t parityAddr; + RF_ParityRecordType_t operation; +}; + +struct RF_ParityLog_s { + RF_RegionId_t regionID; + int numRecords; + int diskOffset; + RF_ParityLogRecord_t *records; + caddr_t bufPtr; + RF_ParityLog_t *next; +}; + +struct RF_ParityLogQueue_s { + RF_DECLARE_MUTEX(mutex) + RF_ParityLog_t *parityLogs; +}; + +struct RF_RegionBufferQueue_s { + RF_DECLARE_MUTEX(mutex) + RF_DECLARE_COND(cond) + int bufferSize; + int totalBuffers; /* size of array 'buffers' */ + int availableBuffers; /* num available 'buffers' */ + int emptyBuffersIndex; /* stick next freed buffer here */ + int 
availBuffersIndex; /* grab next buffer from here */ + caddr_t *buffers; /* array buffers used to hold parity */ +}; + +#define RF_PLOG_CREATED (1<<0) /* thread is created */ +#define RF_PLOG_RUNNING (1<<1) /* thread is running */ +#define RF_PLOG_TERMINATE (1<<2) /* thread is terminated (should exit) */ +#define RF_PLOG_SHUTDOWN (1<<3) /* thread is aware and exiting/exited */ + +struct RF_ParityLogDiskQueue_s { + RF_DECLARE_MUTEX(mutex) /* protects all vars in this struct */ + RF_DECLARE_COND(cond) + int threadState; /* is thread running, should it shutdown (see above) */ + RF_ParityLog_t *flushQueue; /* list of parity logs to be flushed to log disk */ + RF_ParityLog_t *reintQueue; /* list of parity logs waiting to be reintegrated */ + RF_ParityLogData_t *bufHead; /* head of FIFO list of log data, waiting on a buffer */ + RF_ParityLogData_t *bufTail; /* tail of FIFO list of log data, waiting on a buffer */ + RF_ParityLogData_t *reintHead; /* head of FIFO list of log data, waiting on reintegration */ + RF_ParityLogData_t *reintTail; /* tail of FIFO list of log data, waiting on reintegration */ + RF_ParityLogData_t *logBlockHead; /* queue of work, blocked until a log is available */ + RF_ParityLogData_t *logBlockTail; + RF_ParityLogData_t *reintBlockHead; /* queue of work, blocked until reintegration is complete */ + RF_ParityLogData_t *reintBlockTail; + RF_CommonLogData_t *freeCommonList; /* list of unused common data structs */ + RF_ParityLogData_t *freeDataList; /* list of unused log data structs */ +}; + +struct RF_DiskMap_s { + RF_PhysDiskAddr_t parityAddr; + RF_ParityRecordType_t operation; +}; + +struct RF_RegionInfo_s { + RF_DECLARE_MUTEX(mutex) /* protects: diskCount, diskMap, loggingEnabled, coreLog */ + RF_DECLARE_MUTEX(reintMutex) /* protects: reintInProgress */ + int reintInProgress; /* flag used to suspend flushing operations */ + RF_SectorCount_t capacity; /* capacity of this region in sectors */ + RF_SectorNum_t regionStartAddr; /* starting disk address for this region */ + RF_SectorNum_t parityStartAddr; /* starting disk address for this region */ + RF_SectorCount_t numSectorsParity; /* number of parity sectors protected by this region */ + RF_SectorCount_t diskCount; /* num of sectors written to this region's disk log */ + RF_DiskMap_t *diskMap; /* in-core map of what's in this region's disk log */ + int loggingEnabled; /* logging enable for this region */ + RF_ParityLog_t *coreLog; /* in-core log for this region */ +}; + +RF_ParityLogData_t *rf_CreateParityLogData(RF_ParityRecordType_t operation, + RF_PhysDiskAddr_t *pda, caddr_t bufPtr, RF_Raid_t *raidPtr, + int (*wakeFunc)(RF_DagNode_t *node, int status), + void *wakeArg, RF_AccTraceEntry_t *tracerec, + RF_Etimer_t startTime); +RF_ParityLogData_t *rf_SearchAndDequeueParityLogData(RF_Raid_t *raidPtr, + RF_RegionId_t regionID, RF_ParityLogData_t **head, + RF_ParityLogData_t **tail, int ignoreLocks); +void rf_ReleaseParityLogs(RF_Raid_t *raidPtr, RF_ParityLog_t *firstLog); +int rf_ParityLogAppend(RF_ParityLogData_t *logData, int finish, + RF_ParityLog_t **incomingLog, int clearReintFlag); +void rf_EnableParityLogging(RF_Raid_t *raidPtr); + +#endif /* !_RF__RF_PARITYLOG_H_ */ diff --git a/sys/dev/raidframe/rf_paritylogDiskMgr.c b/sys/dev/raidframe/rf_paritylogDiskMgr.c new file mode 100644 index 00000000000..92079d5ec26 --- /dev/null +++ b/sys/dev/raidframe/rf_paritylogDiskMgr.c @@ -0,0 +1,790 @@ +/* $OpenBSD: rf_paritylogDiskMgr.c,v 1.1 1999/01/11 14:29:34 niklas Exp $ */ +/* $NetBSD: rf_paritylogDiskMgr.c,v 1.1 1998/11/13 
04:20:31 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: William V. Courtright II + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ +/* Code for flushing and reintegration operations related to parity logging. + * + * : + * Log: rf_paritylogDiskMgr.c,v + * Revision 1.25 1996/07/28 20:31:39 jimz + * i386netbsd port + * true/false fixup + * + * Revision 1.24 1996/07/27 23:36:08 jimz + * Solaris port of simulator + * + * Revision 1.23 1996/07/22 19:52:16 jimz + * switched node params to RF_DagParam_t, a union of + * a 64-bit int and a void *, for better portability + * attempted hpux port, but failed partway through for + * lack of a single C compiler capable of compiling all + * source files + * + * Revision 1.22 1996/06/11 10:17:33 jimz + * Put in thread startup/shutdown mechanism for proper synchronization + * with start and end of day routines. + * + * Revision 1.21 1996/06/09 02:36:46 jimz + * lots of little crufty cleanup- fixup whitespace + * issues, comment #ifdefs, improve typing in some + * places (esp size-related) + * + * Revision 1.20 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.19 1996/06/05 18:06:02 jimz + * Major code cleanup. The Great Renaming is now done. + * Better modularity. Better typing. Fixed a bunch of + * synchronization bugs. Made a lot of global stuff + * per-desc or per-array. Removed dead code. + * + * Revision 1.18 1996/06/02 17:31:48 jimz + * Moved a lot of global stuff into array structure, where it belongs. + * Fixed up paritylogging, pss modules in this manner. Some general + * code cleanup. Removed lots of dead code, some dead files. + * + * Revision 1.17 1996/05/31 22:26:54 jimz + * fix a lot of mapping problems, memory allocation problems + * found some weird lock issues, fixed 'em + * more code cleanup + * + * Revision 1.16 1996/05/30 23:22:16 jimz + * bugfixes of serialization, timing problems + * more cleanup + * + * Revision 1.15 1996/05/30 12:59:18 jimz + * make etimer happier, more portable + * + * Revision 1.14 1996/05/30 11:29:41 jimz + * Numerous bug fixes. Stripe lock release code disagreed with the taking code + * about when stripes should be locked (I made it consistent: no parity, no lock) + * There was a lot of extra serialization of I/Os which I've removed- a lot of + * it was to calculate values for the cache code, which is no longer with us. + * More types, function, macro cleanup. Added code to properly quiesce the array + * on shutdown. 
Made a lot of stuff array-specific which was (bogusly) general + * before. Fixed memory allocation, freeing bugs. + * + * Revision 1.13 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.12 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.11 1996/05/24 04:28:55 jimz + * release cleanup ckpt + * + * Revision 1.10 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.9 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.8 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.7 1995/12/12 18:10:06 jimz + * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT + * fix 80-column brain damage in comments + * + * Revision 1.6 1995/12/06 20:58:27 wvcii + * added prototypes + * + * Revision 1.5 1995/11/30 16:06:05 wvcii + * added copyright info + * + * Revision 1.4 1995/10/09 22:41:10 wvcii + * minor bug fix + * + * Revision 1.3 1995/10/08 20:43:47 wvcii + * lots of random debugging - debugging still incomplete + * + * Revision 1.2 1995/09/07 15:52:19 jimz + * noop compile when INCLUDE_PARITYLOGGING not defined + * + * Revision 1.1 1995/09/06 19:24:44 wvcii + * Initial revision + * + */ + +#include "rf_archs.h" + +#if RF_INCLUDE_PARITYLOGGING > 0 + +#include "rf_types.h" +#include "rf_threadstuff.h" +#include "rf_mcpair.h" +#include "rf_raid.h" +#include "rf_dag.h" +#include "rf_dagfuncs.h" +#include "rf_desc.h" +#include "rf_layout.h" +#include "rf_diskqueue.h" +#include "rf_paritylog.h" +#include "rf_general.h" +#include "rf_threadid.h" +#include "rf_etimer.h" +#include "rf_paritylogging.h" +#include "rf_engine.h" +#include "rf_dagutils.h" +#include "rf_map.h" +#include "rf_parityscan.h" +#include "rf_sys.h" + +#include "rf_paritylogDiskMgr.h" + +static caddr_t AcquireReintBuffer(RF_RegionBufferQueue_t *); + +static caddr_t AcquireReintBuffer(pool) + RF_RegionBufferQueue_t *pool; +{ + caddr_t bufPtr = NULL; + + /* Return a region buffer from the free list (pool). + If the free list is empty, WAIT. + BLOCKING */ + + RF_LOCK_MUTEX(pool->mutex); + if (pool->availableBuffers > 0) { + bufPtr = pool->buffers[pool->availBuffersIndex]; + pool->availableBuffers--; + pool->availBuffersIndex++; + if (pool->availBuffersIndex == pool->totalBuffers) + pool->availBuffersIndex = 0; + RF_UNLOCK_MUTEX(pool->mutex); + } + else { + RF_PANIC(); /* should never happen in currect config, single reint */ + RF_WAIT_COND(pool->cond, pool->mutex); + } + return(bufPtr); +} + +static void ReleaseReintBuffer( + RF_RegionBufferQueue_t *pool, + caddr_t bufPtr) +{ + /* Insert a region buffer (bufPtr) into the free list (pool). 
+ NON-BLOCKING */ + + RF_LOCK_MUTEX(pool->mutex); + pool->availableBuffers++; + pool->buffers[pool->emptyBuffersIndex] = bufPtr; + pool->emptyBuffersIndex++; + if (pool->emptyBuffersIndex == pool->totalBuffers) + pool->emptyBuffersIndex = 0; + RF_ASSERT(pool->availableBuffers <= pool->totalBuffers); + RF_UNLOCK_MUTEX(pool->mutex); + RF_SIGNAL_COND(pool->cond); +} + + + +static void ReadRegionLog( + RF_RegionId_t regionID, + RF_MCPair_t *rrd_mcpair, + caddr_t regionBuffer, + RF_Raid_t *raidPtr, + RF_DagHeader_t **rrd_dag_h, + RF_AllocListElem_t **rrd_alloclist, + RF_PhysDiskAddr_t **rrd_pda) +{ + /* Initiate the read a region log from disk. Once initiated, return + to the calling routine. + + NON-BLOCKING + */ + + RF_AccTraceEntry_t tracerec; + RF_DagNode_t *rrd_rdNode; + + /* create DAG to read region log from disk */ + rf_MakeAllocList(*rrd_alloclist); + *rrd_dag_h = rf_MakeSimpleDAG(raidPtr, 1, 0, regionBuffer, rf_DiskReadFunc, rf_DiskReadUndoFunc, + "Rrl", *rrd_alloclist, RF_DAG_FLAGS_NONE, RF_IO_NORMAL_PRIORITY); + + /* create and initialize PDA for the core log */ + /* RF_Malloc(*rrd_pda, sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *)); */ + *rrd_pda = rf_AllocPDAList(1); + rf_MapLogParityLogging(raidPtr, regionID, 0, &((*rrd_pda)->row), &((*rrd_pda)->col), &((*rrd_pda)->startSector)); + (*rrd_pda)->numSector = raidPtr->regionInfo[regionID].capacity; + + if ((*rrd_pda)->next) { + (*rrd_pda)->next = NULL; + printf("set rrd_pda->next to NULL\n"); + } + + /* initialize DAG parameters */ + bzero((char *)&tracerec,sizeof(tracerec)); + (*rrd_dag_h)->tracerec = &tracerec; + rrd_rdNode = (*rrd_dag_h)->succedents[0]->succedents[0]; + rrd_rdNode->params[0].p = *rrd_pda; +/* rrd_rdNode->params[1] = regionBuffer; */ + rrd_rdNode->params[2].v = 0; + rrd_rdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, 0); + + /* launch region log read dag */ + rf_DispatchDAG(*rrd_dag_h, (void (*)(void *))rf_MCPairWakeupFunc, + (void *) rrd_mcpair); +} + + + +static void WriteCoreLog( + RF_ParityLog_t *log, + RF_MCPair_t *fwr_mcpair, + RF_Raid_t *raidPtr, + RF_DagHeader_t **fwr_dag_h, + RF_AllocListElem_t **fwr_alloclist, + RF_PhysDiskAddr_t **fwr_pda) +{ + RF_RegionId_t regionID = log->regionID; + RF_AccTraceEntry_t tracerec; + RF_SectorNum_t regionOffset; + RF_DagNode_t *fwr_wrNode; + + /* Initiate the write of a core log to a region log disk. + Once initiated, return to the calling routine. 
+ + NON-BLOCKING + */ + + /* create DAG to write a core log to a region log disk */ + rf_MakeAllocList(*fwr_alloclist); + *fwr_dag_h = rf_MakeSimpleDAG(raidPtr, 1, 0, log->bufPtr, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, + "Wcl", *fwr_alloclist, RF_DAG_FLAGS_NONE, RF_IO_NORMAL_PRIORITY); + + /* create and initialize PDA for the region log */ + /* RF_Malloc(*fwr_pda, sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *)); */ + *fwr_pda = rf_AllocPDAList(1); + regionOffset = log->diskOffset; + rf_MapLogParityLogging(raidPtr, regionID, regionOffset, &((*fwr_pda)->row), &((*fwr_pda)->col), &((*fwr_pda)->startSector)); + (*fwr_pda)->numSector = raidPtr->numSectorsPerLog; + + /* initialize DAG parameters */ + bzero((char *)&tracerec,sizeof(tracerec)); + (*fwr_dag_h)->tracerec = &tracerec; + fwr_wrNode = (*fwr_dag_h)->succedents[0]->succedents[0]; + fwr_wrNode->params[0].p = *fwr_pda; +/* fwr_wrNode->params[1] = log->bufPtr; */ + fwr_wrNode->params[2].v = 0; + fwr_wrNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, 0); + + /* launch the dag to write the core log to disk */ + rf_DispatchDAG(*fwr_dag_h, (void (*)(void *)) rf_MCPairWakeupFunc, + (void *) fwr_mcpair); +} + + +static void ReadRegionParity( + RF_RegionId_t regionID, + RF_MCPair_t *prd_mcpair, + caddr_t parityBuffer, + RF_Raid_t *raidPtr, + RF_DagHeader_t **prd_dag_h, + RF_AllocListElem_t **prd_alloclist, + RF_PhysDiskAddr_t **prd_pda) +{ + /* Initiate the read region parity from disk. + Once initiated, return to the calling routine. + + NON-BLOCKING + */ + + RF_AccTraceEntry_t tracerec; + RF_DagNode_t *prd_rdNode; + + /* create DAG to read region parity from disk */ + rf_MakeAllocList(*prd_alloclist); + *prd_dag_h = rf_MakeSimpleDAG(raidPtr, 1, 0, NULL, rf_DiskReadFunc, rf_DiskReadUndoFunc, + "Rrp", *prd_alloclist, RF_DAG_FLAGS_NONE, RF_IO_NORMAL_PRIORITY); + + /* create and initialize PDA for region parity */ + /* RF_Malloc(*prd_pda, sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *)); */ + *prd_pda = rf_AllocPDAList(1); + rf_MapRegionParity(raidPtr, regionID, &((*prd_pda)->row), &((*prd_pda)->col), &((*prd_pda)->startSector), &((*prd_pda)->numSector)); + if (rf_parityLogDebug) + printf("[reading %d sectors of parity from region %d]\n", + (int)(*prd_pda)->numSector, regionID); + if ((*prd_pda)->next) { + (*prd_pda)->next = NULL; + printf("set prd_pda->next to NULL\n"); + } + + /* initialize DAG parameters */ + bzero((char *)&tracerec,sizeof(tracerec)); + (*prd_dag_h)->tracerec = &tracerec; + prd_rdNode = (*prd_dag_h)->succedents[0]->succedents[0]; + prd_rdNode->params[0].p = *prd_pda; + prd_rdNode->params[1].p = parityBuffer; + prd_rdNode->params[2].v = 0; + prd_rdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, 0); + if (rf_validateDAGDebug) + rf_ValidateDAG(*prd_dag_h); + /* launch region parity read dag */ + rf_DispatchDAG(*prd_dag_h, (void (*)(void *)) rf_MCPairWakeupFunc, + (void *) prd_mcpair); +} + +static void WriteRegionParity( + RF_RegionId_t regionID, + RF_MCPair_t *pwr_mcpair, + caddr_t parityBuffer, + RF_Raid_t *raidPtr, + RF_DagHeader_t **pwr_dag_h, + RF_AllocListElem_t **pwr_alloclist, + RF_PhysDiskAddr_t **pwr_pda) +{ + /* Initiate the write of region parity to disk. + Once initiated, return to the calling routine. 
+ + NON-BLOCKING + */ + + RF_AccTraceEntry_t tracerec; + RF_DagNode_t *pwr_wrNode; + + /* create DAG to write region parity to disk */ + rf_MakeAllocList(*pwr_alloclist); + *pwr_dag_h = rf_MakeSimpleDAG(raidPtr, 1, 0, parityBuffer, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, + "Wrp", *pwr_alloclist, RF_DAG_FLAGS_NONE, RF_IO_NORMAL_PRIORITY); + + /* create and initialize PDA for region parity */ + /* RF_Malloc(*pwr_pda, sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *)); */ + *pwr_pda = rf_AllocPDAList(1); + rf_MapRegionParity(raidPtr, regionID, &((*pwr_pda)->row), &((*pwr_pda)->col), &((*pwr_pda)->startSector), &((*pwr_pda)->numSector)); + + /* initialize DAG parameters */ + bzero((char *)&tracerec,sizeof(tracerec)); + (*pwr_dag_h)->tracerec = &tracerec; + pwr_wrNode = (*pwr_dag_h)->succedents[0]->succedents[0]; + pwr_wrNode->params[0].p = *pwr_pda; +/* pwr_wrNode->params[1] = parityBuffer; */ + pwr_wrNode->params[2].v = 0; + pwr_wrNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, 0); + + /* launch the dag to write region parity to disk */ + rf_DispatchDAG(*pwr_dag_h, (void (*)(void *))rf_MCPairWakeupFunc, + (void *) pwr_mcpair); +} + +static void FlushLogsToDisk( + RF_Raid_t *raidPtr, + RF_ParityLog_t *logList) +{ + /* Flush a linked list of core logs to the log disk. + Logs contain the disk location where they should be + written. Logs were written in FIFO order and that + order must be preserved. + + Recommended optimizations: + 1) allow multiple flushes to occur simultaneously + 2) coalesce contiguous flush operations + + BLOCKING + */ + + RF_ParityLog_t *log; + RF_RegionId_t regionID; + RF_MCPair_t *fwr_mcpair; + RF_DagHeader_t *fwr_dag_h; + RF_AllocListElem_t *fwr_alloclist; + RF_PhysDiskAddr_t *fwr_pda; + + fwr_mcpair = rf_AllocMCPair(); + RF_LOCK_MUTEX(fwr_mcpair->mutex); + + RF_ASSERT(logList); + log = logList; + while (log) + { + regionID = log->regionID; + + /* create and launch a DAG to write the core log */ + if (rf_parityLogDebug) + printf("[initiating write of core log for region %d]\n", regionID); + fwr_mcpair->flag = RF_FALSE; + WriteCoreLog(log, fwr_mcpair, raidPtr, &fwr_dag_h, &fwr_alloclist, &fwr_pda); + + /* wait for the DAG to complete */ +#ifndef SIMULATE + while (!fwr_mcpair->flag) + RF_WAIT_COND(fwr_mcpair->cond, fwr_mcpair->mutex); +#endif /* !SIMULATE */ + if (fwr_dag_h->status != rf_enable) + { + RF_ERRORMSG1("Unable to write core log to disk (region %d)\n", regionID); + RF_ASSERT(0); + } + + /* RF_Free(fwr_pda, sizeof(RF_PhysDiskAddr_t)); */ + rf_FreePhysDiskAddr(fwr_pda); + rf_FreeDAG(fwr_dag_h); + rf_FreeAllocList(fwr_alloclist); + + log = log->next; + } + RF_UNLOCK_MUTEX(fwr_mcpair->mutex); + rf_FreeMCPair(fwr_mcpair); + rf_ReleaseParityLogs(raidPtr, logList); +} + +static void ReintegrateRegion( + RF_Raid_t *raidPtr, + RF_RegionId_t regionID, + RF_ParityLog_t *coreLog) +{ + RF_MCPair_t *rrd_mcpair=NULL, *prd_mcpair, *pwr_mcpair; + RF_DagHeader_t *rrd_dag_h, *prd_dag_h, *pwr_dag_h; + RF_AllocListElem_t *rrd_alloclist, *prd_alloclist, *pwr_alloclist; + RF_PhysDiskAddr_t *rrd_pda, *prd_pda, *pwr_pda; + caddr_t parityBuffer, regionBuffer=NULL; + + /* Reintegrate a region (regionID). + 1. acquire region and parity buffers + 2. read log from disk + 3. read parity from disk + 4. apply log to parity + 5. apply core log to parity + 6. 
write new parity to disk + + BLOCKING + */ + + if (rf_parityLogDebug) + printf("[reintegrating region %d]\n", regionID); + + /* initiate read of region parity */ + if (rf_parityLogDebug) + printf("[initiating read of parity for region %d]\n", regionID); + parityBuffer = AcquireReintBuffer(&raidPtr->parityBufferPool); + prd_mcpair = rf_AllocMCPair(); + RF_LOCK_MUTEX(prd_mcpair->mutex); + prd_mcpair->flag = RF_FALSE; + ReadRegionParity(regionID, prd_mcpair, parityBuffer, raidPtr, &prd_dag_h, &prd_alloclist, &prd_pda); + + /* if region log nonempty, initiate read */ + if (raidPtr->regionInfo[regionID].diskCount > 0) + { + if (rf_parityLogDebug) + printf("[initiating read of disk log for region %d]\n", regionID); + regionBuffer = AcquireReintBuffer(&raidPtr->regionBufferPool); + rrd_mcpair = rf_AllocMCPair(); + RF_LOCK_MUTEX(rrd_mcpair->mutex); + rrd_mcpair->flag = RF_FALSE; + ReadRegionLog(regionID, rrd_mcpair, regionBuffer, raidPtr, &rrd_dag_h, &rrd_alloclist, &rrd_pda); + } + + /* wait on read of region parity to complete */ +#ifndef SIMULATE + while (!prd_mcpair->flag) { + RF_WAIT_COND(prd_mcpair->cond, prd_mcpair->mutex); + } +#endif /* !SIMULATE */ + RF_UNLOCK_MUTEX(prd_mcpair->mutex); + if (prd_dag_h->status != rf_enable) + { + RF_ERRORMSG("Unable to read parity from disk\n"); + /* add code to fail the parity disk */ + RF_ASSERT(0); + } + + /* apply core log to parity */ + /* if (coreLog) + ApplyLogsToParity(coreLog, parityBuffer); */ + + if (raidPtr->regionInfo[regionID].diskCount > 0) + { + /* wait on read of region log to complete */ +#ifndef SIMULATE + while (!rrd_mcpair->flag) + RF_WAIT_COND(rrd_mcpair->cond, rrd_mcpair->mutex); +#endif /* !SIMULATE */ + RF_UNLOCK_MUTEX(rrd_mcpair->mutex); + if (rrd_dag_h->status != rf_enable) + { + RF_ERRORMSG("Unable to read region log from disk\n"); + /* add code to fail the log disk */ + RF_ASSERT(0); + } + /* apply region log to parity */ + /* ApplyRegionToParity(regionID, regionBuffer, parityBuffer); */ + /* release resources associated with region log */ + /* RF_Free(rrd_pda, sizeof(RF_PhysDiskAddr_t)); */ + rf_FreePhysDiskAddr(rrd_pda); + rf_FreeDAG(rrd_dag_h); + rf_FreeAllocList(rrd_alloclist); + rf_FreeMCPair(rrd_mcpair); + ReleaseReintBuffer(&raidPtr->regionBufferPool, regionBuffer); + } + + /* write reintegrated parity to disk */ + if (rf_parityLogDebug) + printf("[initiating write of parity for region %d]\n", regionID); + pwr_mcpair = rf_AllocMCPair(); + RF_LOCK_MUTEX(pwr_mcpair->mutex); + pwr_mcpair->flag = RF_FALSE; + WriteRegionParity(regionID, pwr_mcpair, parityBuffer, raidPtr, &pwr_dag_h, &pwr_alloclist, &pwr_pda); +#ifndef SIMULATE + while (!pwr_mcpair->flag) + RF_WAIT_COND(pwr_mcpair->cond, pwr_mcpair->mutex); +#endif /* !SIMULATE */ + RF_UNLOCK_MUTEX(pwr_mcpair->mutex); + if (pwr_dag_h->status != rf_enable) + { + RF_ERRORMSG("Unable to write parity to disk\n"); + /* add code to fail the parity disk */ + RF_ASSERT(0); + } + + /* release resources associated with read of old parity */ + /* RF_Free(prd_pda, sizeof(RF_PhysDiskAddr_t)); */ + rf_FreePhysDiskAddr(prd_pda); + rf_FreeDAG(prd_dag_h); + rf_FreeAllocList(prd_alloclist); + rf_FreeMCPair(prd_mcpair); + + /* release resources associated with write of new parity */ + ReleaseReintBuffer(&raidPtr->parityBufferPool, parityBuffer); + /* RF_Free(pwr_pda, sizeof(RF_PhysDiskAddr_t)); */ + rf_FreePhysDiskAddr(pwr_pda); + rf_FreeDAG(pwr_dag_h); + rf_FreeAllocList(pwr_alloclist); + rf_FreeMCPair(pwr_mcpair); + + if (rf_parityLogDebug) + printf("[finished reintegrating region %d]\n", 
regionID); +} + + + +static void ReintegrateLogs( + RF_Raid_t *raidPtr, + RF_ParityLog_t *logList) +{ + RF_ParityLog_t *log, *freeLogList = NULL; + RF_ParityLogData_t *logData, *logDataList; + RF_RegionId_t regionID; + + RF_ASSERT(logList); + while (logList) + { + log = logList; + logList = logList->next; + log->next = NULL; + regionID = log->regionID; + ReintegrateRegion(raidPtr, regionID, log); + log->numRecords = 0; + + /* remove all items which are blocked on reintegration of this region */ + RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); + logData = rf_SearchAndDequeueParityLogData(raidPtr, regionID, &raidPtr->parityLogDiskQueue.reintBlockHead, &raidPtr->parityLogDiskQueue.reintBlockTail, RF_TRUE); + logDataList = logData; + while (logData) + { + logData->next = rf_SearchAndDequeueParityLogData(raidPtr, regionID, &raidPtr->parityLogDiskQueue.reintBlockHead, &raidPtr->parityLogDiskQueue.reintBlockTail, RF_TRUE); + logData = logData->next; + } + RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); + + /* process blocked log data and clear reintInProgress flag for this region */ + if (logDataList) + rf_ParityLogAppend(logDataList, RF_TRUE, &log, RF_TRUE); + else + { + /* Enable flushing for this region. Holding both locks provides + a synchronization barrier with DumpParityLogToDisk + */ + RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].mutex); + RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex); + RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); + raidPtr->regionInfo[regionID].diskCount = 0; + raidPtr->regionInfo[regionID].reintInProgress = RF_FALSE; + RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].mutex); + RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex); /* flushing is now enabled */ + RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); + } + /* if log wasn't used, attach it to the list of logs to be returned */ + if (log) + { + log->next = freeLogList; + freeLogList = log; + } + } + if (freeLogList) + rf_ReleaseParityLogs(raidPtr, freeLogList); +} + +int rf_ShutdownLogging(RF_Raid_t *raidPtr) +{ + /* shutdown parity logging + 1) disable parity logging in all regions + 2) reintegrate all regions + */ + + RF_SectorCount_t diskCount; + RF_RegionId_t regionID; + RF_ParityLog_t *log; + + if (rf_parityLogDebug) + printf("[shutting down parity logging]\n"); + /* Since parity log maps are volatile, we must reintegrate all regions. */ + if (rf_forceParityLogReint) { + for (regionID = 0; regionID < rf_numParityRegions; regionID++) + { + RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].mutex); + raidPtr->regionInfo[regionID].loggingEnabled = RF_FALSE; + log = raidPtr->regionInfo[regionID].coreLog; + raidPtr->regionInfo[regionID].coreLog = NULL; + diskCount = raidPtr->regionInfo[regionID].diskCount; + RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].mutex); + if (diskCount > 0 || log != NULL) + ReintegrateRegion(raidPtr, regionID, log); + if (log != NULL) + rf_ReleaseParityLogs(raidPtr, log); + } + } + if (rf_parityLogDebug) + { + printf("[parity logging disabled]\n"); + printf("[should be done!]\n"); + } + return(0); +} + +int rf_ParityLoggingDiskManager(RF_Raid_t *raidPtr) +{ + RF_ParityLog_t *reintQueue, *flushQueue; + int workNeeded, done = RF_FALSE; + + rf_assign_threadid(); /* don't remove this line */ + + /* Main program for parity logging disk thread. This routine waits + for work to appear in either the flush or reintegration queues + and is responsible for flushing core logs to the log disk as + well as reintegrating parity regions. 
+ + BLOCKING + */ + + RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); + + /* + * Inform our creator that we're running. Don't bother doing the + * mutex lock/unlock dance- we locked above, and we'll unlock + * below with nothing to do, yet. + */ + raidPtr->parityLogDiskQueue.threadState |= RF_PLOG_RUNNING; + RF_SIGNAL_COND(raidPtr->parityLogDiskQueue.cond); + + /* empty the work queues */ + flushQueue = raidPtr->parityLogDiskQueue.flushQueue; raidPtr->parityLogDiskQueue.flushQueue = NULL; + reintQueue = raidPtr->parityLogDiskQueue.reintQueue; raidPtr->parityLogDiskQueue.reintQueue = NULL; + workNeeded = (flushQueue || reintQueue); + + while (!done) + { + while (workNeeded) + { + /* First, flush all logs in the flush queue, freeing buffers + Second, reintegrate all regions which are reported as full. + Third, append queued log data until blocked. + + Note: Incoming appends (ParityLogAppend) can block on either + 1. empty buffer pool + 2. region under reintegration + To preserve a global FIFO ordering of appends, buffers are not + released to the world until those appends blocked on buffers are + removed from the append queue. Similarly, regions which are + reintegrated are not opened for general use until the append + queue has been emptied. + */ + + RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); + + /* empty flushQueue, using free'd log buffers to process bufTail */ + if (flushQueue) + FlushLogsToDisk(raidPtr, flushQueue); + + /* empty reintQueue, flushing from reintTail as we go */ + if (reintQueue) + ReintegrateLogs(raidPtr, reintQueue); + + RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); + flushQueue = raidPtr->parityLogDiskQueue.flushQueue; raidPtr->parityLogDiskQueue.flushQueue = NULL; + reintQueue = raidPtr->parityLogDiskQueue.reintQueue; raidPtr->parityLogDiskQueue.reintQueue = NULL; + workNeeded = (flushQueue || reintQueue); + } + /* no work is needed at this point */ + if (raidPtr->parityLogDiskQueue.threadState&RF_PLOG_TERMINATE) + { + /* shutdown parity logging + 1. disable parity logging in all regions + 2. reintegrate all regions + */ + done = RF_TRUE; /* thread disabled, no work needed */ + RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); + rf_ShutdownLogging(raidPtr); + } + if (!done) + { + /* thread enabled, no work needed, so sleep */ + if (rf_parityLogDebug) + printf("[parity logging disk manager sleeping]\n"); + RF_WAIT_COND(raidPtr->parityLogDiskQueue.cond, raidPtr->parityLogDiskQueue.mutex); + if (rf_parityLogDebug) + printf("[parity logging disk manager just woke up]\n"); + flushQueue = raidPtr->parityLogDiskQueue.flushQueue; raidPtr->parityLogDiskQueue.flushQueue = NULL; + reintQueue = raidPtr->parityLogDiskQueue.reintQueue; raidPtr->parityLogDiskQueue.reintQueue = NULL; + workNeeded = (flushQueue || reintQueue); + } + } + /* + * Announce that we're done. + */ + RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); + raidPtr->parityLogDiskQueue.threadState |= RF_PLOG_SHUTDOWN; + RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); + RF_SIGNAL_COND(raidPtr->parityLogDiskQueue.cond); +#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL) + /* + * In the Net- and OpenBSD kernels, the thread must exit; returning would + * cause the proc trampoline to attempt to return to userspace. 
+ */ + kthread_exit(0); /* does not return */ +#else + return(0); +#endif +} + +#endif /* RF_INCLUDE_PARITYLOGGING > 0 */ diff --git a/sys/dev/raidframe/rf_paritylogDiskMgr.h b/sys/dev/raidframe/rf_paritylogDiskMgr.h new file mode 100644 index 00000000000..c20558d9897 --- /dev/null +++ b/sys/dev/raidframe/rf_paritylogDiskMgr.h @@ -0,0 +1,63 @@ +/* $OpenBSD: rf_paritylogDiskMgr.h,v 1.1 1999/01/11 14:29:35 niklas Exp $ */ +/* $NetBSD: rf_paritylogDiskMgr.h,v 1.1 1998/11/13 04:20:32 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: William V. Courtright II + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* header file for parity log disk mgr code + * + * : + * Log: rf_paritylogDiskMgr.h,v + * Revision 1.5 1996/06/02 17:31:48 jimz + * Moved a lot of global stuff into array structure, where it belongs. + * Fixed up paritylogging, pss modules in this manner. Some general + * code cleanup. Removed lots of dead code, some dead files. + * + * Revision 1.4 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.3 1995/12/06 20:56:39 wvcii + * added prototypes + * + * Revision 1.2 1995/11/30 16:06:21 wvcii + * added copyright info + * + * Revision 1.1 1995/09/06 19:25:29 wvcii + * Initial revision + * + * + */ + +#ifndef _RF__RF_PARITYLOGDISKMGR_H_ +#define _RF__RF_PARITYLOGDISKMGR_H_ + +#include "rf_types.h" + +int rf_ShutdownLogging(RF_Raid_t *raidPtr); +int rf_ParityLoggingDiskManager(RF_Raid_t *raidPtr); + +#endif /* !_RF__RF_PARITYLOGDISKMGR_H_ */ diff --git a/sys/dev/raidframe/rf_paritylogging.c b/sys/dev/raidframe/rf_paritylogging.c new file mode 100644 index 00000000000..595612b3718 --- /dev/null +++ b/sys/dev/raidframe/rf_paritylogging.c @@ -0,0 +1,1088 @@ +/* $OpenBSD: rf_paritylogging.c,v 1.1 1999/01/11 14:29:35 niklas Exp $ */ +/* $NetBSD: rf_paritylogging.c,v 1.1 1998/11/13 04:20:32 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: William V. Courtright II + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 
+ * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* : + * Log: rf_paritylogging.c,v + * Revision 1.42 1996/11/05 21:10:40 jimz + * failed pda generalization + * + * Revision 1.41 1996/07/31 16:56:18 jimz + * dataBytesPerStripe, sectorsPerDisk init arch-indep. + * + * Revision 1.40 1996/07/28 20:31:39 jimz + * i386netbsd port + * true/false fixup + * + * Revision 1.39 1996/07/18 22:57:14 jimz + * port simulator to AIX + * + * Revision 1.38 1996/07/13 00:00:59 jimz + * sanitized generalized reconstruction architecture + * cleaned up head sep, rbuf problems + * + * Revision 1.37 1996/06/17 03:24:14 jimz + * switch to new shutdown function typing + * + * Revision 1.36 1996/06/14 23:15:38 jimz + * attempt to deal with thread GC problem + * + * Revision 1.35 1996/06/11 13:48:30 jimz + * get it to compile in-kernel + * + * Revision 1.34 1996/06/11 10:16:35 jimz + * Check return values on array configuration- back out if failed. + * Reorder shutdown to avoid using deallocated resources. + * Get rid of bogus join op in shutdown. + * + * Revision 1.33 1996/06/10 18:29:17 wvcii + * fixed bug in rf_IdentifyStripeParityLogging + * - added array initialization + * + * Revision 1.32 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). + * + * Revision 1.31 1996/06/07 22:26:27 jimz + * type-ify which_ru (RF_ReconUnitNum_t) + * + * Revision 1.30 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.29 1996/06/05 18:06:02 jimz + * Major code cleanup. The Great Renaming is now done. + * Better modularity. Better typing. Fixed a bunch of + * synchronization bugs. Made a lot of global stuff + * per-desc or per-array. Removed dead code. + * + * Revision 1.28 1996/06/03 23:28:26 jimz + * more bugfixes + * check in tree to sync for IPDS runs with current bugfixes + * there still may be a problem with threads in the script test + * getting I/Os stuck- not trivially reproducible (runs ~50 times + * in a row without getting stuck) + * + * Revision 1.27 1996/06/02 17:31:48 jimz + * Moved a lot of global stuff into array structure, where it belongs. + * Fixed up paritylogging, pss modules in this manner. Some general + * code cleanup. Removed lots of dead code, some dead files. 
+ * + * Revision 1.26 1996/05/31 22:26:54 jimz + * fix a lot of mapping problems, memory allocation problems + * found some weird lock issues, fixed 'em + * more code cleanup + * + * Revision 1.25 1996/05/30 23:22:16 jimz + * bugfixes of serialization, timing problems + * more cleanup + * + * Revision 1.24 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.23 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.22 1996/05/24 01:59:45 jimz + * another checkpoint in code cleanup for release + * time to sync kernel tree + * + * Revision 1.21 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.20 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.19 1996/05/20 16:16:30 jimz + * switch to rf_{mutex,cond}_{init,destroy} + * + * Revision 1.18 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.17 1996/05/03 19:47:11 wvcii + * added includes of new dag library + * + * Revision 1.16 1995/12/12 18:10:06 jimz + * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT + * fix 80-column brain damage in comments + * + * Revision 1.15 1995/12/06 20:57:43 wvcii + * added prototypes + * reintegration of logs on shutdown now conditional on forceParityLogReint + * + * Revision 1.14 1995/11/30 16:06:42 wvcii + * added copyright info + * + * Revision 1.13 1995/11/17 19:01:29 wvcii + * added prototyping to MapParity + * + * Revision 1.12 1995/11/07 15:36:03 wvcii + * changed ParityLoggingDagSelect prototype + * function no longer returns numHdrSucc, numTermAnt + * + * Revision 1.11 1995/10/08 20:42:54 wvcii + * lots of random debugging - debugging incomplete + * + * Revision 1.10 1995/09/07 01:26:55 jimz + * Achive basic compilation in kernel. Kernel functionality + * is not guaranteed at all, but it'll compile. Mostly. I hope. + * + * Revision 1.9 1995/09/06 19:21:17 wvcii + * explicit shutdown (forced reintegration) for simulator version + * + * Revision 1.8 1995/07/08 18:19:16 rachad + * Parity verifies can not be done in the simulator. 
+ * + * Revision 1.7 1995/07/07 00:17:20 wvcii + * this version free from deadlock, fails parity verification + * + * Revision 1.6 1995/06/23 13:39:59 robby + * updeated to prototypes in rf_layout.h + * + * Revision 1.5 1995/06/09 13:14:56 wvcii + * code is now nonblocking + * + * Revision 1.4 95/06/01 17:02:23 wvcii + * code debug + * + * Revision 1.3 95/05/31 13:08:57 wvcii + * code debug + * + * Revision 1.2 95/05/21 15:35:00 wvcii + * code debug + * + * + * + */ + +/* + parity logging configuration, dag selection, and mapping is implemented here + */ + +#include "rf_archs.h" + +#if RF_INCLUDE_PARITYLOGGING > 0 + +#include "rf_types.h" +#include "rf_raid.h" +#include "rf_dag.h" +#include "rf_dagutils.h" +#include "rf_dagfuncs.h" +#include "rf_dagffrd.h" +#include "rf_dagffwr.h" +#include "rf_dagdegrd.h" +#include "rf_dagdegwr.h" +#include "rf_threadid.h" +#include "rf_paritylog.h" +#include "rf_paritylogDiskMgr.h" +#include "rf_paritylogging.h" +#include "rf_parityloggingdags.h" +#include "rf_general.h" +#include "rf_map.h" +#include "rf_utils.h" +#include "rf_shutdown.h" + +typedef struct RF_ParityLoggingConfigInfo_s { + RF_RowCol_t **stripeIdentifier; /* filled in at config time & used by IdentifyStripe */ +} RF_ParityLoggingConfigInfo_t; + +static void FreeRegionInfo(RF_Raid_t *raidPtr, RF_RegionId_t regionID); +static void rf_ShutdownParityLogging(RF_ThreadArg_t arg); +static void rf_ShutdownParityLoggingRegionInfo(RF_ThreadArg_t arg); +static void rf_ShutdownParityLoggingPool(RF_ThreadArg_t arg); +static void rf_ShutdownParityLoggingRegionBufferPool(RF_ThreadArg_t arg); +static void rf_ShutdownParityLoggingParityBufferPool(RF_ThreadArg_t arg); +static void rf_ShutdownParityLoggingDiskQueue(RF_ThreadArg_t arg); + +int rf_ConfigureParityLogging( + RF_ShutdownList_t **listp, + RF_Raid_t *raidPtr, + RF_Config_t *cfgPtr) +{ + int i, j, startdisk, rc; + RF_SectorCount_t totalLogCapacity, fragmentation, lastRegionCapacity; + RF_SectorCount_t parityBufferCapacity, maxRegionParityRange; + RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; + RF_ParityLoggingConfigInfo_t *info; + RF_ParityLog_t *l=NULL, *next; + caddr_t lHeapPtr; + + /* + * We create multiple entries on the shutdown list here, since + * this configuration routine is fairly complicated in and of + * itself, and this makes backing out of a failed configuration + * much simpler. + */ + + raidPtr->numSectorsPerLog = RF_DEFAULT_NUM_SECTORS_PER_LOG; + + /* create a parity logging configuration structure */ + RF_MallocAndAdd(info, sizeof(RF_ParityLoggingConfigInfo_t), (RF_ParityLoggingConfigInfo_t *), raidPtr->cleanupList); + if (info == NULL) + return(ENOMEM); + layoutPtr->layoutSpecificInfo = (void *) info; + + RF_ASSERT(raidPtr->numRow == 1); + + /* the stripe identifier must identify the disks in each stripe, + * IN THE ORDER THAT THEY APPEAR IN THE STRIPE. 
+ */ + info->stripeIdentifier = rf_make_2d_array((raidPtr->numCol), (raidPtr->numCol), raidPtr->cleanupList); + if (info->stripeIdentifier == NULL) + return(ENOMEM); + + startdisk = 0; + for (i=0; i<(raidPtr->numCol); i++) + { + for (j=0; j<(raidPtr->numCol); j++) + { + info->stripeIdentifier[i][j] = (startdisk + j) % (raidPtr->numCol - 1); + } + if ((--startdisk) < 0) + startdisk = raidPtr->numCol-1-1; + } + + /* fill in the remaining layout parameters */ + layoutPtr->numStripe = layoutPtr->stripeUnitsPerDisk; + layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit << raidPtr->logBytesPerSector; + layoutPtr->numParityCol = 1; + layoutPtr->numParityLogCol = 1; + layoutPtr->numDataCol = raidPtr->numCol - layoutPtr->numParityCol - layoutPtr->numParityLogCol; + layoutPtr->dataSectorsPerStripe = layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit; + layoutPtr->dataStripeUnitsPerDisk = layoutPtr->stripeUnitsPerDisk; + raidPtr->sectorsPerDisk = layoutPtr->stripeUnitsPerDisk * layoutPtr->sectorsPerStripeUnit; + + raidPtr->totalSectors = layoutPtr->stripeUnitsPerDisk * layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit; + + /* configure parity log parameters + + parameter comment/constraints + ---------------- ------------------- + * numParityRegions all regions (except possibly last) of equal size + * totalInCoreLogCapacity amount of memory in bytes available for in-core logs (default 1 MB) + # numSectorsPerLog capacity of an in-core log in sectors (1 disk track) + numParityLogs total number of in-core logs, should be at least numParityRegions + regionLogCapacity size of a region log (except possibly last one) in sectors + totalLogCapacity total amount of log space in sectors + + * denotes a user settable parameter. + # logs are fixed to be the size of a disk track, value #defined in rf_paritylog.h + + */ + + totalLogCapacity = layoutPtr->stripeUnitsPerDisk * layoutPtr->sectorsPerStripeUnit * layoutPtr->numParityLogCol; + raidPtr->regionLogCapacity = totalLogCapacity / rf_numParityRegions; + if (rf_parityLogDebug) + printf("bytes per sector %d\n", raidPtr->bytesPerSector); + + /* reduce fragmentation within a disk region by adjusting the number of regions + in an attempt to allow an integral number of logs to fit into a disk region */ + fragmentation = raidPtr->regionLogCapacity % raidPtr->numSectorsPerLog; + if (fragmentation > 0) + for (i = 1; i < (raidPtr->numSectorsPerLog / 2); i++) + { + if (((totalLogCapacity / (rf_numParityRegions + i)) % raidPtr->numSectorsPerLog) < fragmentation) + { + rf_numParityRegions++; + raidPtr->regionLogCapacity = totalLogCapacity / rf_numParityRegions; + fragmentation = raidPtr->regionLogCapacity % raidPtr->numSectorsPerLog; + } + if (((totalLogCapacity / (rf_numParityRegions - i)) % raidPtr->numSectorsPerLog) < fragmentation) + { + rf_numParityRegions--; + raidPtr->regionLogCapacity = totalLogCapacity / rf_numParityRegions; + fragmentation = raidPtr->regionLogCapacity % raidPtr->numSectorsPerLog; + } + } + /* ensure integral number of regions per log */ + raidPtr->regionLogCapacity = (raidPtr->regionLogCapacity / raidPtr->numSectorsPerLog) * raidPtr->numSectorsPerLog; + + raidPtr->numParityLogs = rf_totalInCoreLogCapacity / (raidPtr->bytesPerSector * raidPtr->numSectorsPerLog); + /* to avoid deadlock, must ensure that enough logs exist for each region to have one simultaneously */ + if (raidPtr->numParityLogs < rf_numParityRegions) + raidPtr->numParityLogs = rf_numParityRegions; + + /* create region information structs */ + 
RF_Malloc(raidPtr->regionInfo, (rf_numParityRegions * sizeof(RF_RegionInfo_t)), (RF_RegionInfo_t *)); + if (raidPtr->regionInfo == NULL) + return(ENOMEM); + + /* last region may not be full capacity */ + lastRegionCapacity = raidPtr->regionLogCapacity; + while ((rf_numParityRegions - 1) * raidPtr->regionLogCapacity + lastRegionCapacity > totalLogCapacity) + lastRegionCapacity = lastRegionCapacity - raidPtr->numSectorsPerLog; + + raidPtr->regionParityRange = raidPtr->sectorsPerDisk / rf_numParityRegions; + maxRegionParityRange = raidPtr->regionParityRange; + +/* i can't remember why this line is in the code -wvcii 6/30/95 */ +/* if (raidPtr->sectorsPerDisk % rf_numParityRegions > 0) + regionParityRange++; */ + + /* build pool of unused parity logs */ + RF_Malloc(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs * raidPtr->numSectorsPerLog * raidPtr->bytesPerSector, (caddr_t)); + if (raidPtr->parityLogBufferHeap == NULL) + return(ENOMEM); + lHeapPtr = raidPtr->parityLogBufferHeap; + rc = rf_mutex_init(&raidPtr->parityLogPool.mutex); + if (rc) { + RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__, + __LINE__, rc); + RF_Free(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs * raidPtr->numSectorsPerLog * raidPtr->bytesPerSector); + return(ENOMEM); + } + for (i = 0; i < raidPtr->numParityLogs; i++) + { + if (i == 0) + { + RF_Calloc(raidPtr->parityLogPool.parityLogs, 1, sizeof(RF_ParityLog_t), (RF_ParityLog_t *)); + if (raidPtr->parityLogPool.parityLogs == NULL) { + RF_Free(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs * raidPtr->numSectorsPerLog * raidPtr->bytesPerSector); + return(ENOMEM); + } + l = raidPtr->parityLogPool.parityLogs; + } + else + { + RF_Calloc(l->next, 1, sizeof(RF_ParityLog_t), (RF_ParityLog_t *)); + if (l->next == NULL) { + RF_Free(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs * raidPtr->numSectorsPerLog * raidPtr->bytesPerSector); + for(l=raidPtr->parityLogPool.parityLogs;l;l=next) { + next = l->next; + if (l->records) + RF_Free(l->records, (raidPtr->numSectorsPerLog * sizeof(RF_ParityLogRecord_t))); + RF_Free(l, sizeof(RF_ParityLog_t)); + } + return(ENOMEM); + } + l = l->next; + } + l->bufPtr = lHeapPtr; + lHeapPtr += raidPtr->numSectorsPerLog * raidPtr->bytesPerSector; + RF_Malloc(l->records, (raidPtr->numSectorsPerLog * sizeof(RF_ParityLogRecord_t)), (RF_ParityLogRecord_t *)); + if (l->records == NULL) { + RF_Free(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs * raidPtr->numSectorsPerLog * raidPtr->bytesPerSector); + for(l=raidPtr->parityLogPool.parityLogs;l;l=next) { + next = l->next; + if (l->records) + RF_Free(l->records, (raidPtr->numSectorsPerLog * sizeof(RF_ParityLogRecord_t))); + RF_Free(l, sizeof(RF_ParityLog_t)); + } + return(ENOMEM); + } + } + rc = rf_ShutdownCreate(listp, rf_ShutdownParityLoggingPool, raidPtr); + if (rc) { + RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__, + __LINE__, rc); + rf_ShutdownParityLoggingPool(raidPtr); + return(rc); + } + + /* build pool of region buffers */ + rc = rf_mutex_init(&raidPtr->regionBufferPool.mutex); + if (rc) { + RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__, + __LINE__, rc); + return(ENOMEM); + } + rc = rf_cond_init(&raidPtr->regionBufferPool.cond); + if (rc) { + RF_ERRORMSG3("Unable to init cond file %s line %d rc=%d\n", __FILE__, + __LINE__, rc); + rf_mutex_destroy(&raidPtr->regionBufferPool.mutex); + return(ENOMEM); + } + raidPtr->regionBufferPool.bufferSize = raidPtr->regionLogCapacity * raidPtr->bytesPerSector; + 
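  /*
   * A rough numeric sketch of the sizing above, using purely hypothetical
   * values (the real ones come from the layout, rf_numParityRegions and
   * rf_totalInCoreLogCapacity), and ignoring the fragmentation-reducing
   * search over rf_numParityRegions:
   *
   *   numSectorsPerLog    = 64 sectors       bytesPerSector      = 512
   *   totalLogCapacity    = 100000 sectors   rf_numParityRegions = 150
   *
   *   regionLogCapacity   = 100000 / 150                   = 666 sectors
   *   rounded to a whole number of logs: (666 / 64) * 64   = 640 sectors
   *   numParityLogs       = 1 MB / (512 * 64)              = 32, raised to
   *                         150 so each region can hold a log at once
   *   regionBufferPool.bufferSize = 640 * 512              = 327680 bytes,
   *                         i.e. one full region log for reintegration
   */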
printf("regionBufferPool.bufferSize %d\n",raidPtr->regionBufferPool.bufferSize); + raidPtr->regionBufferPool.totalBuffers = 1; /* for now, only one region at a time may be reintegrated */ + raidPtr->regionBufferPool.availableBuffers = raidPtr->regionBufferPool.totalBuffers; + raidPtr->regionBufferPool.availBuffersIndex = 0; + raidPtr->regionBufferPool.emptyBuffersIndex = 0; + RF_Malloc(raidPtr->regionBufferPool.buffers, raidPtr->regionBufferPool.totalBuffers * sizeof(caddr_t), (caddr_t *)); + if (raidPtr->regionBufferPool.buffers == NULL) { + rf_mutex_destroy(&raidPtr->regionBufferPool.mutex); + rf_cond_destroy(&raidPtr->regionBufferPool.cond); + return(ENOMEM); + } + for (i = 0; i < raidPtr->regionBufferPool.totalBuffers; i++) { + RF_Malloc(raidPtr->regionBufferPool.buffers[i], raidPtr->regionBufferPool.bufferSize * sizeof(char), (caddr_t)); + if (raidPtr->regionBufferPool.buffers == NULL) { + rf_mutex_destroy(&raidPtr->regionBufferPool.mutex); + rf_cond_destroy(&raidPtr->regionBufferPool.cond); + for(j=0;j<i;j++) { + RF_Free(raidPtr->regionBufferPool.buffers[i], raidPtr->regionBufferPool.bufferSize * sizeof(char)); + } + RF_Free(raidPtr->regionBufferPool.buffers, raidPtr->regionBufferPool.totalBuffers * sizeof(caddr_t)); + return(ENOMEM); + } + printf("raidPtr->regionBufferPool.buffers[%d] = %lx\n", i, + (long)raidPtr->regionBufferPool.buffers[i]); + } + rc = rf_ShutdownCreate(listp, rf_ShutdownParityLoggingRegionBufferPool, raidPtr); + if (rc) { + RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__, + __LINE__, rc); + rf_ShutdownParityLoggingRegionBufferPool(raidPtr); + return(rc); + } + + /* build pool of parity buffers */ + parityBufferCapacity = maxRegionParityRange; + rc = rf_mutex_init(&raidPtr->parityBufferPool.mutex); + if (rc) { + RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__, + __LINE__, rc); + return(rc); + } + rc = rf_cond_init(&raidPtr->parityBufferPool.cond); + if (rc) { + RF_ERRORMSG3("Unable to init cond file %s line %d rc=%d\n", __FILE__, + __LINE__, rc); + rf_mutex_destroy(&raidPtr->parityBufferPool.mutex); + return(ENOMEM); + } + raidPtr->parityBufferPool.bufferSize = parityBufferCapacity * raidPtr->bytesPerSector; + printf("parityBufferPool.bufferSize %d\n",raidPtr->parityBufferPool.bufferSize); + raidPtr->parityBufferPool.totalBuffers = 1; /* for now, only one region at a time may be reintegrated */ + raidPtr->parityBufferPool.availableBuffers = raidPtr->parityBufferPool.totalBuffers; + raidPtr->parityBufferPool.availBuffersIndex = 0; + raidPtr->parityBufferPool.emptyBuffersIndex = 0; + RF_Malloc(raidPtr->parityBufferPool.buffers, raidPtr->parityBufferPool.totalBuffers * sizeof(caddr_t), (caddr_t *)); + if (raidPtr->parityBufferPool.buffers == NULL) { + rf_mutex_destroy(&raidPtr->parityBufferPool.mutex); + rf_cond_destroy(&raidPtr->parityBufferPool.cond); + return(ENOMEM); + } + for (i = 0; i < raidPtr->parityBufferPool.totalBuffers; i++) { + RF_Malloc(raidPtr->parityBufferPool.buffers[i], raidPtr->parityBufferPool.bufferSize * sizeof(char), (caddr_t)); + if (raidPtr->parityBufferPool.buffers == NULL) { + rf_mutex_destroy(&raidPtr->parityBufferPool.mutex); + rf_cond_destroy(&raidPtr->parityBufferPool.cond); + for(j=0;j<i;j++) { + RF_Free(raidPtr->parityBufferPool.buffers[i], raidPtr->regionBufferPool.bufferSize * sizeof(char)); + } + RF_Free(raidPtr->parityBufferPool.buffers, raidPtr->regionBufferPool.totalBuffers * sizeof(caddr_t)); + return(ENOMEM); + } + printf("parityBufferPool.buffers[%d] = %lx\n", i, + 
(long)raidPtr->parityBufferPool.buffers[i]); + } + rc = rf_ShutdownCreate(listp, rf_ShutdownParityLoggingParityBufferPool, raidPtr); + if (rc) { + RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__, + __LINE__, rc); + rf_ShutdownParityLoggingParityBufferPool(raidPtr); + return(rc); + } + + /* initialize parityLogDiskQueue */ + rc = rf_create_managed_mutex(listp, &raidPtr->parityLogDiskQueue.mutex); + if (rc) { + RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__, + __LINE__, rc); + return(rc); + } + rc = rf_create_managed_cond(listp, &raidPtr->parityLogDiskQueue.cond); + if (rc) { + RF_ERRORMSG3("Unable to init cond file %s line %d rc=%d\n", __FILE__, + __LINE__, rc); + return(rc); + } + raidPtr->parityLogDiskQueue.flushQueue = NULL; + raidPtr->parityLogDiskQueue.reintQueue = NULL; + raidPtr->parityLogDiskQueue.bufHead = NULL; + raidPtr->parityLogDiskQueue.bufTail = NULL; + raidPtr->parityLogDiskQueue.reintHead = NULL; + raidPtr->parityLogDiskQueue.reintTail = NULL; + raidPtr->parityLogDiskQueue.logBlockHead = NULL; + raidPtr->parityLogDiskQueue.logBlockTail = NULL; + raidPtr->parityLogDiskQueue.reintBlockHead = NULL; + raidPtr->parityLogDiskQueue.reintBlockTail = NULL; + raidPtr->parityLogDiskQueue.freeDataList = NULL; + raidPtr->parityLogDiskQueue.freeCommonList = NULL; + + rc = rf_ShutdownCreate(listp, rf_ShutdownParityLoggingDiskQueue, raidPtr); + if (rc) { + RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__, + __LINE__, rc); + return(rc); + } + + for (i = 0; i < rf_numParityRegions; i++) + { + rc = rf_mutex_init(&raidPtr->regionInfo[i].mutex); + if (rc) { + RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__, + __LINE__, rc); + for(j=0;j<i;j++) + FreeRegionInfo(raidPtr, j); + RF_Free(raidPtr->regionInfo, (rf_numParityRegions * sizeof(RF_RegionInfo_t))); + return(ENOMEM); + } + rc = rf_mutex_init(&raidPtr->regionInfo[i].reintMutex); + if (rc) { + RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__, + __LINE__, rc); + rf_mutex_destroy(&raidPtr->regionInfo[i].mutex); + for(j=0;j<i;j++) + FreeRegionInfo(raidPtr, j); + RF_Free(raidPtr->regionInfo, (rf_numParityRegions * sizeof(RF_RegionInfo_t))); + return(ENOMEM); + } + raidPtr->regionInfo[i].reintInProgress = RF_FALSE; + raidPtr->regionInfo[i].regionStartAddr = raidPtr->regionLogCapacity * i; + raidPtr->regionInfo[i].parityStartAddr = raidPtr->regionParityRange * i; + if (i < rf_numParityRegions - 1) + { + raidPtr->regionInfo[i].capacity = raidPtr->regionLogCapacity; + raidPtr->regionInfo[i].numSectorsParity = raidPtr->regionParityRange; + } + else + { + raidPtr->regionInfo[i].capacity = lastRegionCapacity; + raidPtr->regionInfo[i].numSectorsParity = raidPtr->sectorsPerDisk - raidPtr->regionParityRange * i; + if (raidPtr->regionInfo[i].numSectorsParity > maxRegionParityRange) + maxRegionParityRange = raidPtr->regionInfo[i].numSectorsParity; + } + raidPtr->regionInfo[i].diskCount = 0; + RF_ASSERT(raidPtr->regionInfo[i].capacity + raidPtr->regionInfo[i].regionStartAddr <= totalLogCapacity); + RF_ASSERT(raidPtr->regionInfo[i].parityStartAddr + raidPtr->regionInfo[i].numSectorsParity <= raidPtr->sectorsPerDisk); + RF_Malloc(raidPtr->regionInfo[i].diskMap, (raidPtr->regionInfo[i].capacity * sizeof(RF_DiskMap_t)), (RF_DiskMap_t *)); + if (raidPtr->regionInfo[i].diskMap == NULL) { + rf_mutex_destroy(&raidPtr->regionInfo[i].mutex); + rf_mutex_destroy(&raidPtr->regionInfo[i].reintMutex); + for(j=0;j<i;j++) + FreeRegionInfo(raidPtr, 
j); + RF_Free(raidPtr->regionInfo, (rf_numParityRegions * sizeof(RF_RegionInfo_t))); + return(ENOMEM); + } + raidPtr->regionInfo[i].loggingEnabled = RF_FALSE; + raidPtr->regionInfo[i].coreLog = NULL; + } + rc = rf_ShutdownCreate(listp, rf_ShutdownParityLoggingRegionInfo, raidPtr); + if (rc) { + RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__, + __LINE__, rc); + rf_ShutdownParityLoggingRegionInfo(raidPtr); + return(rc); + } + + RF_ASSERT(raidPtr->parityLogDiskQueue.threadState == 0); + raidPtr->parityLogDiskQueue.threadState = RF_PLOG_CREATED; + rc = RF_CREATE_THREAD(raidPtr->pLogDiskThreadHandle, rf_ParityLoggingDiskManager, raidPtr); + if (rc) { + raidPtr->parityLogDiskQueue.threadState = 0; + RF_ERRORMSG3("Unable to create parity logging disk thread file %s line %d rc=%d\n", + __FILE__, __LINE__, rc); + return(ENOMEM); + } + /* wait for thread to start */ + RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); + while(!(raidPtr->parityLogDiskQueue.threadState&RF_PLOG_RUNNING)) { + RF_WAIT_COND(raidPtr->parityLogDiskQueue.cond, raidPtr->parityLogDiskQueue.mutex); + } + RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); + + rc = rf_ShutdownCreate(listp, rf_ShutdownParityLogging, raidPtr); + if (rc) { + RF_ERRORMSG1("Got rc=%d adding parity logging shutdown event\n", rc); + rf_ShutdownParityLogging(raidPtr); + return(rc); + } + + if (rf_parityLogDebug) + { + printf(" size of disk log in sectors: %d\n", + (int)totalLogCapacity); + printf(" total number of parity regions is %d\n", (int)rf_numParityRegions); + printf(" nominal sectors of log per parity region is %d\n", (int)raidPtr->regionLogCapacity); + printf(" nominal region fragmentation is %d sectors\n",(int)fragmentation); + printf(" total number of parity logs is %d\n", raidPtr->numParityLogs); + printf(" parity log size is %d sectors\n", raidPtr->numSectorsPerLog); + printf(" total in-core log space is %d bytes\n", (int) rf_totalInCoreLogCapacity); + } + + rf_EnableParityLogging(raidPtr); + + return(0); +} + +static void FreeRegionInfo( + RF_Raid_t *raidPtr, + RF_RegionId_t regionID) +{ + RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].mutex); + RF_Free(raidPtr->regionInfo[regionID].diskMap, (raidPtr->regionInfo[regionID].capacity * sizeof(RF_DiskMap_t))); + if (!rf_forceParityLogReint && raidPtr->regionInfo[regionID].coreLog) { + rf_ReleaseParityLogs(raidPtr, raidPtr->regionInfo[regionID].coreLog); + raidPtr->regionInfo[regionID].coreLog = NULL; + } + else { + RF_ASSERT(raidPtr->regionInfo[regionID].coreLog == NULL); + RF_ASSERT(raidPtr->regionInfo[regionID].diskCount == 0); + } + RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].mutex); + rf_mutex_destroy(&raidPtr->regionInfo[regionID].mutex); + rf_mutex_destroy(&raidPtr->regionInfo[regionID].reintMutex); +} + + +static void FreeParityLogQueue( + RF_Raid_t *raidPtr, + RF_ParityLogQueue_t *queue) +{ + RF_ParityLog_t *l1, *l2; + + RF_LOCK_MUTEX(queue->mutex); + l1 = queue->parityLogs; + while (l1) + { + l2 = l1; + l1 = l2->next; + RF_Free(l2->records, (raidPtr->numSectorsPerLog * sizeof(RF_ParityLogRecord_t))); + RF_Free(l2, sizeof(RF_ParityLog_t)); + } + RF_UNLOCK_MUTEX(queue->mutex); + rf_mutex_destroy(&queue->mutex); +} + + +static void FreeRegionBufferQueue(RF_RegionBufferQueue_t *queue) +{ + int i; + + RF_LOCK_MUTEX(queue->mutex); + if (queue->availableBuffers != queue->totalBuffers) + { + printf("Attempt to free region queue which is still in use!\n"); + RF_ASSERT(0); + } + for (i = 0; i < queue->totalBuffers; i++) + RF_Free(queue->buffers[i], 
queue->bufferSize); + RF_Free(queue->buffers, queue->totalBuffers * sizeof(caddr_t)); + RF_UNLOCK_MUTEX(queue->mutex); + rf_mutex_destroy(&queue->mutex); +} + +static void rf_ShutdownParityLoggingRegionInfo(RF_ThreadArg_t arg) +{ + RF_Raid_t *raidPtr; + RF_RegionId_t i; + + raidPtr = (RF_Raid_t *)arg; + if (rf_parityLogDebug) { + int tid; + rf_get_threadid(tid); + printf("[%d] ShutdownParityLoggingRegionInfo\n", tid); + } + /* free region information structs */ + for (i = 0; i < rf_numParityRegions; i++) + FreeRegionInfo(raidPtr, i); + RF_Free(raidPtr->regionInfo, (rf_numParityRegions * sizeof(raidPtr->regionInfo))); + raidPtr->regionInfo = NULL; +} + +static void rf_ShutdownParityLoggingPool(RF_ThreadArg_t arg) +{ + RF_Raid_t *raidPtr; + + raidPtr = (RF_Raid_t *)arg; + if (rf_parityLogDebug) { + int tid; + rf_get_threadid(tid); + printf("[%d] ShutdownParityLoggingPool\n", tid); + } + /* free contents of parityLogPool */ + FreeParityLogQueue(raidPtr, &raidPtr->parityLogPool); + RF_Free(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs * raidPtr->numSectorsPerLog * raidPtr->bytesPerSector); +} + +static void rf_ShutdownParityLoggingRegionBufferPool(RF_ThreadArg_t arg) +{ + RF_Raid_t *raidPtr; + + raidPtr = (RF_Raid_t *)arg; + if (rf_parityLogDebug) { + int tid; + rf_get_threadid(tid); + printf("[%d] ShutdownParityLoggingRegionBufferPool\n", tid); + } + FreeRegionBufferQueue(&raidPtr->regionBufferPool); +} + +static void rf_ShutdownParityLoggingParityBufferPool(RF_ThreadArg_t arg) +{ + RF_Raid_t *raidPtr; + + raidPtr = (RF_Raid_t *)arg; + if (rf_parityLogDebug) { + int tid; + rf_get_threadid(tid); + printf("[%d] ShutdownParityLoggingParityBufferPool\n", tid); + } + FreeRegionBufferQueue(&raidPtr->parityBufferPool); +} + +static void rf_ShutdownParityLoggingDiskQueue(RF_ThreadArg_t arg) +{ + RF_ParityLogData_t *d; + RF_CommonLogData_t *c; + RF_Raid_t *raidPtr; + + raidPtr = (RF_Raid_t *)arg; + if (rf_parityLogDebug) { + int tid; + rf_get_threadid(tid); + printf("[%d] ShutdownParityLoggingDiskQueue\n", tid); + } + /* free disk manager stuff */ + RF_ASSERT(raidPtr->parityLogDiskQueue.bufHead == NULL); + RF_ASSERT(raidPtr->parityLogDiskQueue.bufTail == NULL); + RF_ASSERT(raidPtr->parityLogDiskQueue.reintHead == NULL); + RF_ASSERT(raidPtr->parityLogDiskQueue.reintTail == NULL); + while (raidPtr->parityLogDiskQueue.freeDataList) + { + d = raidPtr->parityLogDiskQueue.freeDataList; + raidPtr->parityLogDiskQueue.freeDataList = raidPtr->parityLogDiskQueue.freeDataList->next; + RF_Free(d, sizeof(RF_ParityLogData_t)); + } + while (raidPtr->parityLogDiskQueue.freeCommonList) + { + c = raidPtr->parityLogDiskQueue.freeCommonList; + rf_mutex_destroy(&c->mutex); + raidPtr->parityLogDiskQueue.freeCommonList = raidPtr->parityLogDiskQueue.freeCommonList->next; + RF_Free(c, sizeof(RF_CommonLogData_t)); + } +} + +static void rf_ShutdownParityLogging(RF_ThreadArg_t arg) +{ + RF_Raid_t *raidPtr; + + raidPtr = (RF_Raid_t *)arg; + if (rf_parityLogDebug) { + int tid; + rf_get_threadid(tid); + printf("[%d] ShutdownParityLogging\n", tid); + } +#ifndef SIMULATE + /* shutdown disk thread */ + /* This has the desirable side-effect of forcing all regions to be + reintegrated. This is necessary since all parity log maps are + currently held in volatile memory. 
*/ + + RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); + raidPtr->parityLogDiskQueue.threadState |= RF_PLOG_TERMINATE; + RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); + RF_SIGNAL_COND(raidPtr->parityLogDiskQueue.cond); + /* + * pLogDiskThread will now terminate when queues are cleared + * now wait for it to be done + */ + RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); + while(!(raidPtr->parityLogDiskQueue.threadState&RF_PLOG_SHUTDOWN)) { + RF_WAIT_COND(raidPtr->parityLogDiskQueue.cond, raidPtr->parityLogDiskQueue.mutex); + } + RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex); +#else /* !SIMULATE */ + /* explicitly call shutdown routines which force reintegration */ + rf_ShutdownLogging(raidPtr); +#endif /* !SIMULATE */ + if (rf_parityLogDebug) { + int tid; + rf_get_threadid(tid); + printf("[%d] ShutdownParityLogging done (thread completed)\n", tid); + } +} + +int rf_GetDefaultNumFloatingReconBuffersParityLogging(RF_Raid_t *raidPtr) +{ + return(20); +} + +RF_HeadSepLimit_t rf_GetDefaultHeadSepLimitParityLogging(RF_Raid_t *raidPtr) +{ + return(10); +} + +/* return the region ID for a given RAID address */ +RF_RegionId_t rf_MapRegionIDParityLogging( + RF_Raid_t *raidPtr, + RF_SectorNum_t address) +{ + RF_RegionId_t regionID; + +/* regionID = address / (raidPtr->regionParityRange * raidPtr->Layout.numDataCol); */ + regionID = address / raidPtr->regionParityRange; + if (regionID == rf_numParityRegions) + { + /* last region may be larger than other regions */ + regionID--; + } + RF_ASSERT(address >= raidPtr->regionInfo[regionID].parityStartAddr); + RF_ASSERT(address < raidPtr->regionInfo[regionID].parityStartAddr + raidPtr->regionInfo[regionID].numSectorsParity); + RF_ASSERT(regionID < rf_numParityRegions); + return(regionID); +} + + +/* given a logical RAID sector, determine physical disk address of data */ +void rf_MapSectorParityLogging( + RF_Raid_t *raidPtr, + RF_RaidAddr_t raidSector, + RF_RowCol_t *row, + RF_RowCol_t *col, + RF_SectorNum_t *diskSector, + int remap) +{ + RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit; + *row = 0; + /* *col = (SUID % (raidPtr->numCol - raidPtr->Layout.numParityLogCol)); */ + *col = SUID % raidPtr->Layout.numDataCol; + *diskSector = (SUID / (raidPtr->Layout.numDataCol)) * raidPtr->Layout.sectorsPerStripeUnit + + (raidSector % raidPtr->Layout.sectorsPerStripeUnit); +} + + +/* given a logical RAID sector, determine physical disk address of parity */ +void rf_MapParityParityLogging( + RF_Raid_t *raidPtr, + RF_RaidAddr_t raidSector, + RF_RowCol_t *row, + RF_RowCol_t *col, + RF_SectorNum_t *diskSector, + int remap) +{ + RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit; + + *row = 0; + /* *col = raidPtr->Layout.numDataCol-(SUID/raidPtr->Layout.numDataCol)%(raidPtr->numCol - raidPtr->Layout.numParityLogCol); */ + *col = raidPtr->Layout.numDataCol; + *diskSector =(SUID / (raidPtr->Layout.numDataCol)) * raidPtr->Layout.sectorsPerStripeUnit + + (raidSector % raidPtr->Layout.sectorsPerStripeUnit); +} + + +/* given a regionID and sector offset, determine the physical disk address of the parity log */ +void rf_MapLogParityLogging( + RF_Raid_t *raidPtr, + RF_RegionId_t regionID, + RF_SectorNum_t regionOffset, + RF_RowCol_t *row, + RF_RowCol_t *col, + RF_SectorNum_t *startSector) +{ + *row = 0; + *col = raidPtr->numCol - 1; + *startSector = raidPtr->regionInfo[regionID].regionStartAddr + regionOffset; +} + + +/* given a regionID, determine the physical disk address of the logged parity for that region */ +void 
rf_MapRegionParity( + RF_Raid_t *raidPtr, + RF_RegionId_t regionID, + RF_RowCol_t *row, + RF_RowCol_t *col, + RF_SectorNum_t *startSector, + RF_SectorCount_t *numSector) +{ + *row = 0; + *col = raidPtr->numCol - 2; + *startSector = raidPtr->regionInfo[regionID].parityStartAddr; + *numSector = raidPtr->regionInfo[regionID].numSectorsParity; +} + + +/* given a logical RAID address, determine the participating disks in the stripe */ +void rf_IdentifyStripeParityLogging( + RF_Raid_t *raidPtr, + RF_RaidAddr_t addr, + RF_RowCol_t **diskids, + RF_RowCol_t *outRow) +{ + RF_StripeNum_t stripeID = rf_RaidAddressToStripeID(&raidPtr->Layout, addr); + RF_ParityLoggingConfigInfo_t *info = (RF_ParityLoggingConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo; + *outRow = 0; + *diskids = info->stripeIdentifier[ stripeID % raidPtr->numCol ]; +} + + +void rf_MapSIDToPSIDParityLogging( + RF_RaidLayout_t *layoutPtr, + RF_StripeNum_t stripeID, + RF_StripeNum_t *psID, + RF_ReconUnitNum_t *which_ru) +{ + *which_ru = 0; + *psID = stripeID; +} + + +/* select an algorithm for performing an access. Returns two pointers, + * one to a function that will return information about the DAG, and + * another to a function that will create the dag. + */ +void rf_ParityLoggingDagSelect( + RF_Raid_t *raidPtr, + RF_IoType_t type, + RF_AccessStripeMap_t *asmp, + RF_VoidFuncPtr *createFunc) +{ + RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); + RF_PhysDiskAddr_t *failedPDA=NULL; + RF_RowCol_t frow, fcol; + RF_RowStatus_t rstat; + int prior_recon; + int tid; + + RF_ASSERT(RF_IO_IS_R_OR_W(type)); + + if (asmp->numDataFailed + asmp->numParityFailed > 1) { + RF_ERRORMSG("Multiple disks failed in a single group! Aborting I/O operation.\n"); + /* *infoFunc = */ *createFunc = NULL; + return; + } else if (asmp->numDataFailed + asmp->numParityFailed == 1) { + + /* if under recon & already reconstructed, redirect the access to the spare drive + * and eliminate the failure indication + */ + failedPDA = asmp->failedPDAs[0]; + frow = failedPDA->row; fcol = failedPDA->col; + rstat = raidPtr->status[failedPDA->row]; + prior_recon = (rstat == rf_rs_reconfigured) || ( + (rstat == rf_rs_reconstructing) ? 
+ rf_CheckRUReconstructed(raidPtr->reconControl[frow]->reconMap, failedPDA->startSector) : 0 + ); + if (prior_recon) { + RF_RowCol_t or = failedPDA->row,oc=failedPDA->col; + RF_SectorNum_t oo=failedPDA->startSector; + if (layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) { /* redirect to dist spare space */ + + if (failedPDA == asmp->parityInfo) { + + /* parity has failed */ + (layoutPtr->map->MapParity)(raidPtr, failedPDA->raidAddress, &failedPDA->row, + &failedPDA->col, &failedPDA->startSector, RF_REMAP); + + if (asmp->parityInfo->next) { /* redir 2nd component, if any */ + RF_PhysDiskAddr_t *p = asmp->parityInfo->next; + RF_SectorNum_t SUoffs = p->startSector % layoutPtr->sectorsPerStripeUnit; + p->row = failedPDA->row; + p->col = failedPDA->col; + p->startSector = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, failedPDA->startSector) + + SUoffs; /* cheating: startSector is not really a RAID address */ + } + + } else if (asmp->parityInfo->next && failedPDA == asmp->parityInfo->next) { + RF_ASSERT(0); /* should not ever happen */ + } else { + + /* data has failed */ + (layoutPtr->map->MapSector)(raidPtr, failedPDA->raidAddress, &failedPDA->row, + &failedPDA->col, &failedPDA->startSector, RF_REMAP); + + } + + } else { /* redirect to dedicated spare space */ + + failedPDA->row = raidPtr->Disks[frow][fcol].spareRow; + failedPDA->col = raidPtr->Disks[frow][fcol].spareCol; + + /* the parity may have two distinct components, both of which may need to be redirected */ + if (asmp->parityInfo->next) { + if (failedPDA == asmp->parityInfo) { + failedPDA->next->row = failedPDA->row; + failedPDA->next->col = failedPDA->col; + } else if (failedPDA == asmp->parityInfo->next) { /* paranoid: should never occur */ + asmp->parityInfo->row = failedPDA->row; + asmp->parityInfo->col = failedPDA->col; + } + } + } + + RF_ASSERT(failedPDA->col != -1); + + if (rf_dagDebug || rf_mapDebug) { + rf_get_threadid(tid); + printf("[%d] Redirected type '%c' r %d c %d o %ld -> r %d c %d o %ld\n", + tid,type,or,oc,(long)oo,failedPDA->row,failedPDA->col,(long)failedPDA->startSector); + } + + asmp->numDataFailed = asmp->numParityFailed = 0; + } + + } + + + if (type == RF_IO_TYPE_READ) { + + if (asmp->numDataFailed == 0) + *createFunc = (RF_VoidFuncPtr)rf_CreateFaultFreeReadDAG; + else + *createFunc = (RF_VoidFuncPtr)rf_CreateRaidFiveDegradedReadDAG; + + } + else { + + + /* if mirroring, always use large writes. If the access requires two distinct parity updates, + * always do a small write. If the stripe contains a failure but the access does not, do a + * small write. + * The first conditional (numStripeUnitsAccessed <= numDataCol/2) uses a less-than-or-equal + * rather than just a less-than because when G is 3 or 4, numDataCol/2 is 1, and I want + * single-stripe-unit updates to use just one disk. 
+ */ + if ( (asmp->numDataFailed + asmp->numParityFailed) == 0) { + if (((asmp->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) && (layoutPtr->numDataCol!=1)) || + (asmp->parityInfo->next!=NULL) || rf_CheckStripeForFailures(raidPtr, asmp)) { + *createFunc = (RF_VoidFuncPtr)rf_CreateParityLoggingSmallWriteDAG; + } + else + *createFunc = (RF_VoidFuncPtr)rf_CreateParityLoggingLargeWriteDAG; + } + else + if (asmp->numParityFailed == 1) + *createFunc = (RF_VoidFuncPtr)rf_CreateNonRedundantWriteDAG; + else + if (asmp->numStripeUnitsAccessed != 1 && failedPDA->numSector != layoutPtr->sectorsPerStripeUnit) + *createFunc = NULL; + else + *createFunc = (RF_VoidFuncPtr)rf_CreateDegradedWriteDAG; + } +} + +#endif /* RF_INCLUDE_PARITYLOGGING > 0 */ diff --git a/sys/dev/raidframe/rf_paritylogging.h b/sys/dev/raidframe/rf_paritylogging.h new file mode 100644 index 00000000000..3a2db063c28 --- /dev/null +++ b/sys/dev/raidframe/rf_paritylogging.h @@ -0,0 +1,137 @@ +/* $OpenBSD: rf_paritylogging.h,v 1.1 1999/01/11 14:29:36 niklas Exp $ */ +/* $NetBSD: rf_paritylogging.h,v 1.1 1998/11/13 04:20:32 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: William V. Courtright II + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* header file for Parity Logging */ + +/* + * : + * Log: rf_paritylogging.h,v + * Revision 1.22 1996/07/27 23:36:08 jimz + * Solaris port of simulator + * + * Revision 1.21 1996/07/13 00:00:59 jimz + * sanitized generalized reconstruction architecture + * cleaned up head sep, rbuf problems + * + * Revision 1.20 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). + * + * Revision 1.19 1996/06/07 22:26:27 jimz + * type-ify which_ru (RF_ReconUnitNum_t) + * + * Revision 1.18 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.17 1996/06/03 23:28:26 jimz + * more bugfixes + * check in tree to sync for IPDS runs with current bugfixes + * there still may be a problem with threads in the script test + * getting I/Os stuck- not trivially reproducible (runs ~50 times + * in a row without getting stuck) + * + * Revision 1.16 1996/06/02 17:31:48 jimz + * Moved a lot of global stuff into array structure, where it belongs. + * Fixed up paritylogging, pss modules in this manner. Some general + * code cleanup. Removed lots of dead code, some dead files. 
+ * + * Revision 1.15 1996/05/31 22:26:54 jimz + * fix a lot of mapping problems, memory allocation problems + * found some weird lock issues, fixed 'em + * more code cleanup + * + * Revision 1.14 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.13 1996/05/24 01:59:45 jimz + * another checkpoint in code cleanup for release + * time to sync kernel tree + * + * Revision 1.12 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.11 1995/12/06 20:56:25 wvcii + * added prototypes + * + * Revision 1.10 1995/11/30 16:06:58 wvcii + * added copyright info + * + * Revision 1.9 1995/11/17 19:53:08 wvcii + * fixed bug in MapParityRegion prototype + * + * Revision 1.8 1995/11/17 19:09:24 wvcii + * added prototypint to MapParity + * + * Revision 1.7 1995/11/07 15:28:17 wvcii + * changed ParityLoggingDagSelect prototype + * function no longer generates numHdrSucc, numTermAnt + * + * Revision 1.6 1995/07/07 00:16:50 wvcii + * this version free from deadlock, fails parity verification + * + * Revision 1.5 1995/06/23 13:39:44 robby + * updeated to prototypes in rf_layout.h + * + */ + +#ifndef _RF__RF_PARITYLOGGING_H_ +#define _RF__RF_PARITYLOGGING_H_ + +int rf_ConfigureParityLogging(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr, + RF_Config_t *cfgPtr); +int rf_GetDefaultNumFloatingReconBuffersParityLogging(RF_Raid_t *raidPtr); +RF_HeadSepLimit_t rf_GetDefaultHeadSepLimitParityLogging(RF_Raid_t *raidPtr); +RF_RegionId_t rf_MapRegionIDParityLogging(RF_Raid_t *raidPtr, + RF_SectorNum_t address); +void rf_MapSectorParityLogging(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector, + RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, + int remap); +void rf_MapParityParityLogging(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector, + RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, + int remap); +void rf_MapLogParityLogging(RF_Raid_t *raidPtr, RF_RegionId_t regionID, + RF_SectorNum_t regionOffset, RF_RowCol_t *row, RF_RowCol_t *col, + RF_SectorNum_t *startSector); +void rf_MapRegionParity(RF_Raid_t *raidPtr, RF_RegionId_t regionID, + RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *startSector, + RF_SectorCount_t *numSector); +void rf_IdentifyStripeParityLogging(RF_Raid_t *raidPtr, RF_RaidAddr_t addr, + RF_RowCol_t **diskids, RF_RowCol_t *outRow); +void rf_MapSIDToPSIDParityLogging(RF_RaidLayout_t *layoutPtr, + RF_StripeNum_t stripeID, RF_StripeNum_t *psID, + RF_ReconUnitNum_t *which_ru); +void rf_ParityLoggingDagSelect(RF_Raid_t *raidPtr, RF_IoType_t type, + RF_AccessStripeMap_t *asmap, RF_VoidFuncPtr *createFunc); + +#endif /* !_RF__RF_PARITYLOGGING_H_ */ diff --git a/sys/dev/raidframe/rf_parityloggingdags.c b/sys/dev/raidframe/rf_parityloggingdags.c new file mode 100644 index 00000000000..1cc51d0a7e3 --- /dev/null +++ b/sys/dev/raidframe/rf_parityloggingdags.c @@ -0,0 +1,752 @@ +/* $OpenBSD: rf_parityloggingdags.c,v 1.1 1999/01/11 14:29:37 niklas Exp $ */ +/* $NetBSD: rf_parityloggingdags.c,v 1.1 1998/11/13 04:20:32 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: William V. 
Courtright II + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* + * Log: rf_parityloggingdags.c,v + * Revision 1.27 1996/07/28 20:31:39 jimz + * i386netbsd port + * true/false fixup + * + * Revision 1.26 1996/07/27 23:36:08 jimz + * Solaris port of simulator + * + * Revision 1.25 1996/07/22 19:52:16 jimz + * switched node params to RF_DagParam_t, a union of + * a 64-bit int and a void *, for better portability + * attempted hpux port, but failed partway through for + * lack of a single C compiler capable of compiling all + * source files + * + * Revision 1.24 1996/06/11 13:47:21 jimz + * fix up for in-kernel compilation + * + * Revision 1.23 1996/06/07 22:26:27 jimz + * type-ify which_ru (RF_ReconUnitNum_t) + * + * Revision 1.22 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.21 1996/06/02 17:31:48 jimz + * Moved a lot of global stuff into array structure, where it belongs. + * Fixed up paritylogging, pss modules in this manner. Some general + * code cleanup. Removed lots of dead code, some dead files. + * + * Revision 1.20 1996/05/31 22:26:54 jimz + * fix a lot of mapping problems, memory allocation problems + * found some weird lock issues, fixed 'em + * more code cleanup + * + * Revision 1.19 1996/05/30 11:29:41 jimz + * Numerous bug fixes. Stripe lock release code disagreed with the taking code + * about when stripes should be locked (I made it consistent: no parity, no lock) + * There was a lot of extra serialization of I/Os which I've removed- a lot of + * it was to calculate values for the cache code, which is no longer with us. + * More types, function, macro cleanup. Added code to properly quiesce the array + * on shutdown. Made a lot of stuff array-specific which was (bogusly) general + * before. Fixed memory allocation, freeing bugs. 
+ * + * Revision 1.18 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.17 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.16 1996/05/24 04:28:55 jimz + * release cleanup ckpt + * + * Revision 1.15 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.14 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.13 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.12 1996/05/08 21:01:24 jimz + * fixed up enum type names that were conflicting with other + * enums and function names (ie, "panic") + * future naming trends will be towards RF_ and rf_ for + * everything raidframe-related + * + * Revision 1.11 1996/05/03 19:42:02 wvcii + * added includes for dag library + * + * Revision 1.10 1995/12/12 18:10:06 jimz + * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT + * fix 80-column brain damage in comments + * + * Revision 1.9 1995/12/06 20:55:24 wvcii + * added prototyping + * fixed bug in dag header numSuccedents count for both small and large dags + * + * Revision 1.8 1995/11/30 16:08:01 wvcii + * added copyright info + * + * Revision 1.7 1995/11/07 15:29:05 wvcii + * reorganized code, adding comments and asserts + * dag creation routines now generate term node + * encoded commit point, barrier, and antecedence types into dags + * + * Revision 1.6 1995/09/07 15:52:06 jimz + * noop compile when INCLUDE_PARITYLOGGING not defined + * + * Revision 1.5 1995/06/15 13:51:53 robby + * updated some wrong prototypes (after prototyping rf_dagutils.h) + * + * Revision 1.4 1995/06/09 13:15:05 wvcii + * code is now nonblocking + * + * Revision 1.3 95/05/31 13:09:14 wvcii + * code debug + * + * Revision 1.2 1995/05/21 15:34:14 wvcii + * code debug + * + * Revision 1.1 95/05/16 14:36:53 wvcii + * Initial revision + * + * + */ + +#include "rf_archs.h" + +#if RF_INCLUDE_PARITYLOGGING > 0 + +/* + DAGs specific to parity logging are created here + */ + +#include "rf_types.h" +#include "rf_raid.h" +#include "rf_dag.h" +#include "rf_dagutils.h" +#include "rf_dagfuncs.h" +#include "rf_threadid.h" +#include "rf_debugMem.h" +#include "rf_paritylog.h" +#include "rf_memchunk.h" +#include "rf_general.h" + +#include "rf_parityloggingdags.h" + +/****************************************************************************** + * + * creates a DAG to perform a large-write operation: + * + * / Rod \ / Wnd \ + * H -- NIL- Rod - NIL - Wnd ------ NIL - T + * \ Rod / \ Xor - Lpo / + * + * The writes are not done until the reads complete because if they were done in + * parallel, a failure on one of the reads could leave the parity in an inconsistent + * state, so that the retry with a new DAG would produce erroneous parity. + * + * Note: this DAG has the nasty property that none of the buffers allocated for reading + * old data can be freed until the XOR node fires. Need to fix this. + * + * The last two arguments are the number of faults tolerated, and function for the + * redundancy calculation. 
The undo for the redundancy calc is assumed to be null + * + *****************************************************************************/ + +void rf_CommonCreateParityLoggingLargeWriteDAG( + RF_Raid_t *raidPtr, + RF_AccessStripeMap_t *asmap, + RF_DagHeader_t *dag_h, + void *bp, + RF_RaidAccessFlags_t flags, + RF_AllocListElem_t *allocList, + int nfaults, + int (*redFunc)(RF_DagNode_t *)) +{ + RF_DagNode_t *nodes, *wndNodes, *rodNodes=NULL, *syncNode, *xorNode, *lpoNode, *blockNode, *unblockNode, *termNode; + int nWndNodes, nRodNodes, i; + RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); + RF_AccessStripeMapHeader_t *new_asm_h[2]; + int nodeNum, asmNum; + RF_ReconUnitNum_t which_ru; + char *sosBuffer, *eosBuffer; + RF_PhysDiskAddr_t *pda; + RF_StripeNum_t parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout), asmap->raidAddress, &which_ru); + + if (rf_dagDebug) + printf("[Creating parity-logging large-write DAG]\n"); + RF_ASSERT(nfaults == 1); /* this arch only single fault tolerant */ + dag_h->creator = "ParityLoggingLargeWriteDAG"; + + /* alloc the Wnd nodes, the xor node, and the Lpo node */ + nWndNodes = asmap->numStripeUnitsAccessed; + RF_CallocAndAdd(nodes, nWndNodes + 6, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList); + i = 0; + wndNodes = &nodes[i]; i += nWndNodes; + xorNode = &nodes[i]; i += 1; + lpoNode = &nodes[i]; i += 1; + blockNode = &nodes[i]; i += 1; + syncNode = &nodes[i]; i += 1; + unblockNode = &nodes[i]; i += 1; + termNode = &nodes[i]; i += 1; + + dag_h->numCommitNodes = nWndNodes + 1; + dag_h->numCommits = 0; + dag_h->numSuccedents = 1; + + rf_MapUnaccessedPortionOfStripe(raidPtr, layoutPtr, asmap, dag_h, new_asm_h, &nRodNodes, &sosBuffer, &eosBuffer, allocList); + if (nRodNodes > 0) + RF_CallocAndAdd(rodNodes, nRodNodes, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList); + + /* begin node initialization */ + rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nRodNodes + 1, 0, 0, 0, dag_h, "Nil", allocList); + rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, 1, nWndNodes + 1, 0, 0, dag_h, "Nil", allocList); + rf_InitNode(syncNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nWndNodes + 1, nRodNodes + 1, 0, 0, dag_h, "Nil", allocList); + rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm", allocList); + + /* initialize the Rod nodes */ + for (nodeNum = asmNum = 0; asmNum < 2; asmNum++) { + if (new_asm_h[asmNum]) { + pda = new_asm_h[asmNum]->stripeMap->physInfo; + while (pda) { + rf_InitNode(&rodNodes[nodeNum], rf_wait, RF_FALSE, rf_DiskReadFunc,rf_DiskReadUndoFunc,rf_GenericWakeupFunc,1,1,4,0, dag_h, "Rod", allocList); + rodNodes[nodeNum].params[0].p = pda; + rodNodes[nodeNum].params[1].p = pda->bufPtr; + rodNodes[nodeNum].params[2].v = parityStripeID; + rodNodes[nodeNum].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); + nodeNum++; + pda=pda->next; + } + } + } + RF_ASSERT(nodeNum == nRodNodes); + + /* initialize the wnd nodes */ + pda = asmap->physInfo; + for (i=0; i < nWndNodes; i++) { + rf_InitNode(&wndNodes[i], rf_wait, RF_TRUE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnd", allocList); + RF_ASSERT(pda != NULL); + wndNodes[i].params[0].p = pda; + wndNodes[i].params[1].p = pda->bufPtr; + wndNodes[i].params[2].v = parityStripeID; + wndNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); + pda = pda->next; + } + 
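  /*
   * The "connect nodes to form graph" section below records every edge in
   * both directions: the parent stores the child in succedents[], and the
   * child stores the parent in antecedents[] together with an antType[]
   * (rf_control, rf_trueData, ...). A hypothetical helper, not part of
   * RAIDframe, showing the idiom each connection expands to by hand:
   *
   *   static void LinkNodes(RF_DagNode_t *parent, int pslot,
   *                         RF_DagNode_t *child, int cslot, int type)
   *   {
   *       RF_ASSERT(pslot < parent->numSuccedents);
   *       RF_ASSERT(cslot < child->numAntecedents);
   *       parent->succedents[pslot] = child;
   *       child->antecedents[cslot] = parent;
   *       child->antType[cslot] = type;
   *   }
   *
   * The slot counts were fixed earlier, when rf_InitNode was given each
   * node's succedent and antecedent counts.
   */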
+ /* initialize the redundancy node */ + rf_InitNode(xorNode, rf_wait, RF_TRUE, redFunc, rf_NullNodeUndoFunc, NULL, 1, 1, 2*(nWndNodes+nRodNodes)+1, 1, dag_h, "Xr ", allocList); + xorNode->flags |= RF_DAGNODE_FLAG_YIELD; + for (i=0; i < nWndNodes; i++) { + xorNode->params[2*i+0] = wndNodes[i].params[0]; /* pda */ + xorNode->params[2*i+1] = wndNodes[i].params[1]; /* buf ptr */ + } + for (i=0; i < nRodNodes; i++) { + xorNode->params[2*(nWndNodes+i)+0] = rodNodes[i].params[0]; /* pda */ + xorNode->params[2*(nWndNodes+i)+1] = rodNodes[i].params[1]; /* buf ptr */ + } + xorNode->params[2*(nWndNodes+nRodNodes)].p = raidPtr; /* xor node needs to get at RAID information */ + + /* look for an Rod node that reads a complete SU. If none, alloc a buffer to receive the parity info. + * Note that we can't use a new data buffer because it will not have gotten written when the xor occurs. + */ + for (i = 0; i < nRodNodes; i++) + if (((RF_PhysDiskAddr_t *) rodNodes[i].params[0].p)->numSector == raidPtr->Layout.sectorsPerStripeUnit) + break; + if (i == nRodNodes) { + RF_CallocAndAdd(xorNode->results[0], 1, rf_RaidAddressToByte(raidPtr, raidPtr->Layout.sectorsPerStripeUnit), (void *), allocList); + } + else { + xorNode->results[0] = rodNodes[i].params[1].p; + } + + /* initialize the Lpo node */ + rf_InitNode(lpoNode, rf_wait, RF_FALSE, rf_ParityLogOverwriteFunc, rf_ParityLogOverwriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h, "Lpo", allocList); + + lpoNode->params[0].p = asmap->parityInfo; + lpoNode->params[1].p = xorNode->results[0]; + RF_ASSERT(asmap->parityInfo->next == NULL); /* parityInfo must describe entire parity unit */ + + /* connect nodes to form graph */ + + /* connect dag header to block node */ + RF_ASSERT(dag_h->numSuccedents == 1); + RF_ASSERT(blockNode->numAntecedents == 0); + dag_h->succedents[0] = blockNode; + + /* connect the block node to the Rod nodes */ + RF_ASSERT(blockNode->numSuccedents == nRodNodes + 1); + for (i = 0; i < nRodNodes; i++) { + RF_ASSERT(rodNodes[i].numAntecedents == 1); + blockNode->succedents[i] = &rodNodes[i]; + rodNodes[i].antecedents[0] = blockNode; + rodNodes[i].antType[0] = rf_control; + } + + /* connect the block node to the sync node */ + /* necessary if nRodNodes == 0 */ + RF_ASSERT(syncNode->numAntecedents == nRodNodes + 1); + blockNode->succedents[nRodNodes] = syncNode; + syncNode->antecedents[0] = blockNode; + syncNode->antType[0] = rf_control; + + /* connect the Rod nodes to the syncNode */ + for (i = 0; i < nRodNodes; i++) { + rodNodes[i].succedents[0] = syncNode; + syncNode->antecedents[1 + i] = &rodNodes[i]; + syncNode->antType[1 + i] = rf_control; + } + + /* connect the sync node to the xor node */ + RF_ASSERT(syncNode->numSuccedents == nWndNodes + 1); + RF_ASSERT(xorNode->numAntecedents == 1); + syncNode->succedents[0] = xorNode; + xorNode->antecedents[0] = syncNode; + xorNode->antType[0] = rf_trueData; /* carry forward from sync */ + + /* connect the sync node to the Wnd nodes */ + for (i = 0; i < nWndNodes; i++) { + RF_ASSERT(wndNodes->numAntecedents == 1); + syncNode->succedents[1 + i] = &wndNodes[i]; + wndNodes[i].antecedents[0] = syncNode; + wndNodes[i].antType[0] = rf_control; + } + + /* connect the xor node to the Lpo node */ + RF_ASSERT(xorNode->numSuccedents == 1); + RF_ASSERT(lpoNode->numAntecedents == 1); + xorNode->succedents[0] = lpoNode; + lpoNode->antecedents[0]= xorNode; + lpoNode->antType[0] = rf_trueData; + + /* connect the Wnd nodes to the unblock node */ + RF_ASSERT(unblockNode->numAntecedents == nWndNodes + 1); + for (i = 
0; i < nWndNodes; i++) { + RF_ASSERT(wndNodes->numSuccedents == 1); + wndNodes[i].succedents[0] = unblockNode; + unblockNode->antecedents[i] = &wndNodes[i]; + unblockNode->antType[i] = rf_control; + } + + /* connect the Lpo node to the unblock node */ + RF_ASSERT(lpoNode->numSuccedents == 1); + lpoNode->succedents[0] = unblockNode; + unblockNode->antecedents[nWndNodes] = lpoNode; + unblockNode->antType[nWndNodes] = rf_control; + + /* connect unblock node to terminator */ + RF_ASSERT(unblockNode->numSuccedents == 1); + RF_ASSERT(termNode->numAntecedents == 1); + RF_ASSERT(termNode->numSuccedents == 0); + unblockNode->succedents[0] = termNode; + termNode->antecedents[0] = unblockNode; + termNode->antType[0] = rf_control; +} + + + + +/****************************************************************************** + * + * creates a DAG to perform a small-write operation (either raid 5 or pq), which is as follows: + * + * Header + * | + * Block + * / | ... \ \ + * / | \ \ + * Rod Rod Rod Rop + * | \ /| \ / | \/ | + * | | | /\ | + * Wnd Wnd Wnd X + * | \ / | + * | \ / | + * \ \ / Lpo + * \ \ / / + * +-> Unblock <-+ + * | + * T + * + * + * R = Read, W = Write, X = Xor, o = old, n = new, d = data, p = parity. + * When the access spans a stripe unit boundary and is less than one SU in size, there will + * be two Rop -- X -- Wnp branches. I call this the "double-XOR" case. + * The second output from each Rod node goes to the X node. In the double-XOR + * case, there are exactly 2 Rod nodes, and each sends one output to one X node. + * There is one Rod -- Wnd -- T branch for each stripe unit being updated. + * + * The block and unblock nodes are unused. See comment above CreateFaultFreeReadDAG. + * + * Note: this DAG ignores all the optimizations related to making the RMWs atomic. + * it also has the nasty property that none of the buffers allocated for reading + * old data & parity can be freed until the XOR node fires. Need to fix this. + * + * A null qfuncs indicates single fault tolerant + *****************************************************************************/ + +void rf_CommonCreateParityLoggingSmallWriteDAG( + RF_Raid_t *raidPtr, + RF_AccessStripeMap_t *asmap, + RF_DagHeader_t *dag_h, + void *bp, + RF_RaidAccessFlags_t flags, + RF_AllocListElem_t *allocList, + RF_RedFuncs_t *pfuncs, + RF_RedFuncs_t *qfuncs) +{ + RF_DagNode_t *xorNodes, *blockNode, *unblockNode, *nodes; + RF_DagNode_t *readDataNodes, *readParityNodes; + RF_DagNode_t *writeDataNodes, *lpuNodes; + RF_DagNode_t *unlockDataNodes=NULL, *termNode; + RF_PhysDiskAddr_t *pda = asmap->physInfo; + int numDataNodes = asmap->numStripeUnitsAccessed; + int numParityNodes = (asmap->parityInfo->next) ? 2 : 1; + int i, j, nNodes, totalNumNodes; + RF_ReconUnitNum_t which_ru; + int (*func)(RF_DagNode_t *node), (*undoFunc)(RF_DagNode_t *node); + int (*qfunc)(RF_DagNode_t *node); + char *name, *qname; + RF_StripeNum_t parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout), asmap->raidAddress, &which_ru); + long nfaults = qfuncs ? 2 : 1; + int lu_flag = (rf_enableAtomicRMW) ? 1 : 0; /* lock/unlock flag */ + + if (rf_dagDebug) printf("[Creating parity-logging small-write DAG]\n"); + RF_ASSERT(numDataNodes > 0); + RF_ASSERT(nfaults == 1); + dag_h->creator = "ParityLoggingSmallWriteDAG"; + + /* DAG creation occurs in three steps: + 1. count the number of nodes in the DAG + 2. create the nodes + 3. initialize the nodes + 4. connect the nodes + */ + + /* Step 1. 
compute number of nodes in the graph */ + + /* number of nodes: + a read and write for each data unit + a redundancy computation node for each parity node + a read and Lpu for each parity unit + a block and unblock node (2) + a terminator node + if atomic RMW + an unlock node for each data unit, redundancy unit + */ + totalNumNodes = (2 * numDataNodes) + numParityNodes + (2 * numParityNodes) + 3; + if (lu_flag) + totalNumNodes += numDataNodes; + + nNodes = numDataNodes + numParityNodes; + + dag_h->numCommitNodes = numDataNodes + numParityNodes; + dag_h->numCommits = 0; + dag_h->numSuccedents = 1; + + /* Step 2. create the nodes */ + RF_CallocAndAdd(nodes, totalNumNodes, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList); + i = 0; + blockNode = &nodes[i]; i += 1; + unblockNode = &nodes[i]; i += 1; + readDataNodes = &nodes[i]; i += numDataNodes; + readParityNodes = &nodes[i]; i += numParityNodes; + writeDataNodes = &nodes[i]; i += numDataNodes; + lpuNodes = &nodes[i]; i += numParityNodes; + xorNodes = &nodes[i]; i += numParityNodes; + termNode = &nodes[i]; i += 1; + if (lu_flag) { + unlockDataNodes = &nodes[i]; i += numDataNodes; + } + RF_ASSERT(i == totalNumNodes); + + /* Step 3. initialize the nodes */ + /* initialize block node (Nil) */ + rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nNodes, 0, 0, 0, dag_h, "Nil", allocList); + + /* initialize unblock node (Nil) */ + rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, 1, nNodes, 0, 0, dag_h, "Nil", allocList); + + /* initialize terminatory node (Trm) */ + rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm", allocList); + + /* initialize nodes which read old data (Rod) */ + for (i = 0; i < numDataNodes; i++) { + rf_InitNode(&readDataNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, nNodes, 1, 4, 0, dag_h, "Rod", allocList); + RF_ASSERT(pda != NULL); + readDataNodes[i].params[0].p = pda; /* physical disk addr desc */ + readDataNodes[i].params[1].p = rf_AllocBuffer(raidPtr, dag_h, pda, allocList); /* buffer to hold old data */ + readDataNodes[i].params[2].v = parityStripeID; + readDataNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, lu_flag, 0, which_ru); + pda=pda->next; + readDataNodes[i].propList[0] = NULL; + readDataNodes[i].propList[1] = NULL; + } + + /* initialize nodes which read old parity (Rop) */ + pda = asmap->parityInfo; i = 0; + for (i = 0; i < numParityNodes; i++) { + RF_ASSERT(pda != NULL); + rf_InitNode(&readParityNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, nNodes, 1, 4, 0, dag_h, "Rop", allocList); + readParityNodes[i].params[0].p = pda; + readParityNodes[i].params[1].p = rf_AllocBuffer(raidPtr, dag_h, pda, allocList); /* buffer to hold old parity */ + readParityNodes[i].params[2].v = parityStripeID; + readParityNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); + readParityNodes[i].propList[0] = NULL; + pda=pda->next; + } + + /* initialize nodes which write new data (Wnd) */ + pda = asmap->physInfo; + for (i=0; i < numDataNodes; i++) { + RF_ASSERT(pda != NULL); + rf_InitNode(&writeDataNodes[i], rf_wait, RF_TRUE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, nNodes, 4, 0, dag_h, "Wnd", allocList); + writeDataNodes[i].params[0].p = pda; /* physical disk addr desc */ + writeDataNodes[i].params[1].p = pda->bufPtr; /* buffer holding new data to be 
written */ + writeDataNodes[i].params[2].v = parityStripeID; + writeDataNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); + + if (lu_flag) { + /* initialize node to unlock the disk queue */ + rf_InitNode(&unlockDataNodes[i], rf_wait, RF_FALSE, rf_DiskUnlockFunc, rf_DiskUnlockUndoFunc, rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h, "Und", allocList); + unlockDataNodes[i].params[0].p = pda; /* physical disk addr desc */ + unlockDataNodes[i].params[1].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, lu_flag, which_ru); + } + pda = pda->next; + } + + + /* initialize nodes which compute new parity */ + /* we use the simple XOR func in the double-XOR case, and when we're accessing only a portion of one stripe unit. + * the distinction between the two is that the regular XOR func assumes that the targbuf is a full SU in size, + * and examines the pda associated with the buffer to decide where within the buffer to XOR the data, whereas + * the simple XOR func just XORs the data into the start of the buffer. + */ + if ((numParityNodes==2) || ((numDataNodes == 1) && (asmap->totalSectorsAccessed < raidPtr->Layout.sectorsPerStripeUnit))) { + func = pfuncs->simple; undoFunc = rf_NullNodeUndoFunc; name = pfuncs->SimpleName; + if (qfuncs) + { qfunc = qfuncs->simple; qname = qfuncs->SimpleName;} + } else { + func = pfuncs->regular; undoFunc = rf_NullNodeUndoFunc; name = pfuncs->RegularName; + if (qfuncs) { qfunc = qfuncs->regular; qname = qfuncs->RegularName;} + } + /* initialize the xor nodes: params are {pda,buf} from {Rod,Wnd,Rop} nodes, and raidPtr */ + if (numParityNodes==2) { /* double-xor case */ + for (i=0; i < numParityNodes; i++) { + rf_InitNode(&xorNodes[i], rf_wait, RF_TRUE, func, undoFunc, NULL, 1, nNodes, 7, 1, dag_h, name, allocList); /* no wakeup func for xor */ + xorNodes[i].flags |= RF_DAGNODE_FLAG_YIELD; + xorNodes[i].params[0] = readDataNodes[i].params[0]; + xorNodes[i].params[1] = readDataNodes[i].params[1]; + xorNodes[i].params[2] = readParityNodes[i].params[0]; + xorNodes[i].params[3] = readParityNodes[i].params[1]; + xorNodes[i].params[4] = writeDataNodes[i].params[0]; + xorNodes[i].params[5] = writeDataNodes[i].params[1]; + xorNodes[i].params[6].p = raidPtr; + xorNodes[i].results[0] = readParityNodes[i].params[1].p; /* use old parity buf as target buf */ + } + } + else { + /* there is only one xor node in this case */ + rf_InitNode(&xorNodes[0], rf_wait, RF_TRUE, func, undoFunc, NULL, 1, nNodes, (2 * (numDataNodes + numDataNodes + 1) + 1), 1, dag_h, name, allocList); + xorNodes[0].flags |= RF_DAGNODE_FLAG_YIELD; + for (i=0; i < numDataNodes + 1; i++) { + /* set up params related to Rod and Rop nodes */ + xorNodes[0].params[2*i+0] = readDataNodes[i].params[0]; /* pda */ + xorNodes[0].params[2*i+1] = readDataNodes[i].params[1]; /* buffer pointer */ + } + for (i=0; i < numDataNodes; i++) { + /* set up params related to Wnd and Wnp nodes */ + xorNodes[0].params[2*(numDataNodes+1+i)+0] = writeDataNodes[i].params[0]; /* pda */ + xorNodes[0].params[2*(numDataNodes+1+i)+1] = writeDataNodes[i].params[1]; /* buffer pointer */ + } + xorNodes[0].params[2*(numDataNodes+numDataNodes+1)].p = raidPtr; /* xor node needs to get at RAID information */ + xorNodes[0].results[0] = readParityNodes[0].params[1].p; + } + + /* initialize the log node(s) */ + pda = asmap->parityInfo; + for (i = 0; i < numParityNodes; i++) { + RF_ASSERT(pda); + rf_InitNode(&lpuNodes[i], rf_wait, RF_FALSE, rf_ParityLogUpdateFunc, rf_ParityLogUpdateUndoFunc, rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h, "Lpu", 
allocList); + lpuNodes[i].params[0].p = pda; /* PhysDiskAddr of parity */ + lpuNodes[i].params[1].p = xorNodes[i].results[0]; /* buffer pointer to parity */ + pda = pda->next; + } + + + /* Step 4. connect the nodes */ + + /* connect header to block node */ + RF_ASSERT(dag_h->numSuccedents == 1); + RF_ASSERT(blockNode->numAntecedents == 0); + dag_h->succedents[0] = blockNode; + + /* connect block node to read old data nodes */ + RF_ASSERT(blockNode->numSuccedents == (numDataNodes + numParityNodes)); + for (i = 0; i < numDataNodes; i++) { + blockNode->succedents[i] = &readDataNodes[i]; + RF_ASSERT(readDataNodes[i].numAntecedents == 1); + readDataNodes[i].antecedents[0]= blockNode; + readDataNodes[i].antType[0] = rf_control; + } + + /* connect block node to read old parity nodes */ + for (i = 0; i < numParityNodes; i++) { + blockNode->succedents[numDataNodes + i] = &readParityNodes[i]; + RF_ASSERT(readParityNodes[i].numAntecedents == 1); + readParityNodes[i].antecedents[0] = blockNode; + readParityNodes[i].antType[0] = rf_control; + } + + /* connect read old data nodes to write new data nodes */ + for (i = 0; i < numDataNodes; i++) { + RF_ASSERT(readDataNodes[i].numSuccedents == numDataNodes + numParityNodes); + for (j = 0; j < numDataNodes; j++) { + RF_ASSERT(writeDataNodes[j].numAntecedents == numDataNodes + numParityNodes); + readDataNodes[i].succedents[j] = &writeDataNodes[j]; + writeDataNodes[j].antecedents[i] = &readDataNodes[i]; + if (i == j) + writeDataNodes[j].antType[i] = rf_antiData; + else + writeDataNodes[j].antType[i] = rf_control; + } + } + + /* connect read old data nodes to xor nodes */ + for (i = 0; i < numDataNodes; i++) + for (j = 0; j < numParityNodes; j++){ + RF_ASSERT(xorNodes[j].numAntecedents == numDataNodes + numParityNodes); + readDataNodes[i].succedents[numDataNodes + j] = &xorNodes[j]; + xorNodes[j].antecedents[i] = &readDataNodes[i]; + xorNodes[j].antType[i] = rf_trueData; + } + + /* connect read old parity nodes to write new data nodes */ + for (i = 0; i < numParityNodes; i++) { + RF_ASSERT(readParityNodes[i].numSuccedents == numDataNodes + numParityNodes); + for (j = 0; j < numDataNodes; j++) { + readParityNodes[i].succedents[j] = &writeDataNodes[j]; + writeDataNodes[j].antecedents[numDataNodes + i] = &readParityNodes[i]; + writeDataNodes[j].antType[numDataNodes + i] = rf_control; + } + } + + /* connect read old parity nodes to xor nodes */ + for (i = 0; i < numParityNodes; i++) + for (j = 0; j < numParityNodes; j++) { + readParityNodes[i].succedents[numDataNodes + j] = &xorNodes[j]; + xorNodes[j].antecedents[numDataNodes + i] = &readParityNodes[i]; + xorNodes[j].antType[numDataNodes + i] = rf_trueData; + } + + /* connect xor nodes to write new parity nodes */ + for (i = 0; i < numParityNodes; i++) { + RF_ASSERT(xorNodes[i].numSuccedents == 1); + RF_ASSERT(lpuNodes[i].numAntecedents == 1); + xorNodes[i].succedents[0] = &lpuNodes[i]; + lpuNodes[i].antecedents[0] = &xorNodes[i]; + lpuNodes[i].antType[0] = rf_trueData; + } + + for (i = 0; i < numDataNodes; i++) { + if (lu_flag) { + /* connect write new data nodes to unlock nodes */ + RF_ASSERT(writeDataNodes[i].numSuccedents == 1); + RF_ASSERT(unlockDataNodes[i].numAntecedents == 1); + writeDataNodes[i].succedents[0] = &unlockDataNodes[i]; + unlockDataNodes[i].antecedents[0] = &writeDataNodes[i]; + unlockDataNodes[i].antType[0] = rf_control; + + /* connect unlock nodes to unblock node */ + RF_ASSERT(unlockDataNodes[i].numSuccedents == 1); + RF_ASSERT(unblockNode->numAntecedents == (numDataNodes + (nfaults * 
numParityNodes))); + unlockDataNodes[i].succedents[0] = unblockNode; + unblockNode->antecedents[i] = &unlockDataNodes[i]; + unblockNode->antType[i] = rf_control; + } + else { + /* connect write new data nodes to unblock node */ + RF_ASSERT(writeDataNodes[i].numSuccedents == 1); + RF_ASSERT(unblockNode->numAntecedents == (numDataNodes + (nfaults * numParityNodes))); + writeDataNodes[i].succedents[0] = unblockNode; + unblockNode->antecedents[i] = &writeDataNodes[i]; + unblockNode->antType[i] = rf_control; + } + } + + /* connect write new parity nodes to unblock node */ + for (i = 0; i < numParityNodes; i++) { + RF_ASSERT(lpuNodes[i].numSuccedents == 1); + lpuNodes[i].succedents[0] = unblockNode; + unblockNode->antecedents[numDataNodes + i] = &lpuNodes[i]; + unblockNode->antType[numDataNodes + i] = rf_control; + } + + /* connect unblock node to terminator */ + RF_ASSERT(unblockNode->numSuccedents == 1); + RF_ASSERT(termNode->numAntecedents == 1); + RF_ASSERT(termNode->numSuccedents == 0); + unblockNode->succedents[0] = termNode; + termNode->antecedents[0] = unblockNode; + termNode->antType[0] = rf_control; +} + + +void rf_CreateParityLoggingSmallWriteDAG( + RF_Raid_t *raidPtr, + RF_AccessStripeMap_t *asmap, + RF_DagHeader_t *dag_h, + void *bp, + RF_RaidAccessFlags_t flags, + RF_AllocListElem_t *allocList, + RF_RedFuncs_t *pfuncs, + RF_RedFuncs_t *qfuncs) +{ + dag_h->creator = "ParityLoggingSmallWriteDAG"; + rf_CommonCreateParityLoggingSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_xorFuncs, NULL); +} + + +void rf_CreateParityLoggingLargeWriteDAG( + RF_Raid_t *raidPtr, + RF_AccessStripeMap_t *asmap, + RF_DagHeader_t *dag_h, + void *bp, + RF_RaidAccessFlags_t flags, + RF_AllocListElem_t *allocList, + int nfaults, + int (*redFunc)(RF_DagNode_t *)) +{ + dag_h->creator = "ParityLoggingSmallWriteDAG"; + rf_CommonCreateParityLoggingLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 1, rf_RegularXorFunc); +} + +#endif /* RF_INCLUDE_PARITYLOGGING > 0 */ diff --git a/sys/dev/raidframe/rf_parityloggingdags.h b/sys/dev/raidframe/rf_parityloggingdags.h new file mode 100644 index 00000000000..1eecfc7fe08 --- /dev/null +++ b/sys/dev/raidframe/rf_parityloggingdags.h @@ -0,0 +1,94 @@ +/* $OpenBSD: rf_parityloggingdags.h,v 1.1 1999/01/11 14:29:37 niklas Exp $ */ +/* $NetBSD: rf_parityloggingdags.h,v 1.1 1998/11/13 04:20:32 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: William V. Courtright II + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. 
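For readers following the graph wiring above: once the Rod and Rop reads complete, each XOR node in the parity-logging small-write DAG performs the standard RAID-5 read-modify-write update, new parity = old parity ^ old data ^ new data, and its result buffer is what the Lpu node then hands to rf_ParityLogUpdateFunc. A minimal standalone sketch of that arithmetic, outside the DAG machinery (the function and buffer names below are illustrative, not RAIDframe identifiers):

#include <stddef.h>

/*
 * Read-modify-write parity update for one stripe unit:
 * parity becomes old_parity ^ old_data ^ new_data, byte by byte.
 * "parity" holds the old parity on entry and the new parity on return,
 * mirroring how the simple XOR node reuses the Rop buffer as its target.
 */
static void
rmw_parity_update(unsigned char *parity, const unsigned char *old_data,
    const unsigned char *new_data, size_t nbytes)
{
	size_t i;

	for (i = 0; i < nbytes; i++)
		parity[i] ^= old_data[i] ^ new_data[i];
}

Because this update only needs the stripe units being overwritten (Rod) and the old parity (Rop), the small-write DAG is the cheaper choice when only a minority of the data columns is touched; the large-write DAG instead recomputes parity from every column's buffer in a single XOR node.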
+ */ + +/**************************************************************************** + * * + * rf_parityloggingdags.h -- header file for parity logging dags * + * * + ****************************************************************************/ + +/* : + * Log: rf_parityloggingdags.h,v + * Revision 1.10 1996/07/27 23:36:08 jimz + * Solaris port of simulator + * + * Revision 1.9 1996/06/02 17:31:48 jimz + * Moved a lot of global stuff into array structure, where it belongs. + * Fixed up paritylogging, pss modules in this manner. Some general + * code cleanup. Removed lots of dead code, some dead files. + * + * Revision 1.8 1996/05/31 22:26:54 jimz + * fix a lot of mapping problems, memory allocation problems + * found some weird lock issues, fixed 'em + * more code cleanup + * + * Revision 1.7 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.6 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.5 1996/05/24 04:28:55 jimz + * release cleanup ckpt + * + * Revision 1.4 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.3 1995/12/06 20:55:08 wvcii + * added prototyping + * + */ + +#ifndef _RF__RF_PARITYLOGGINGDAGS_H_ +#define _RF__RF_PARITYLOGGINGDAGS_H_ + +/* routines that create DAGs */ +void rf_CommonCreateParityLoggingLargeWriteDAG(RF_Raid_t *raidPtr, + RF_AccessStripeMap_t *asmap, RF_DagHeader_t *dag_h, + void *bp, RF_RaidAccessFlags_t flags, RF_AllocListElem_t *allocList, + int nfaults, int (*redFunc)(RF_DagNode_t *)); +void rf_CommonCreateParityLoggingSmallWriteDAG(RF_Raid_t *raidPtr, + RF_AccessStripeMap_t *asmap, RF_DagHeader_t *dag_h, + void *bp, RF_RaidAccessFlags_t flags, RF_AllocListElem_t *allocList, + RF_RedFuncs_t *pfuncs, RF_RedFuncs_t *qfuncs); + +void rf_CreateParityLoggingLargeWriteDAG(RF_Raid_t *raidPtr, + RF_AccessStripeMap_t *asmap, RF_DagHeader_t *dag_h, + void *bp, RF_RaidAccessFlags_t flags, RF_AllocListElem_t *allocList, + int nfaults, int (*redFunc)(RF_DagNode_t *)); +void rf_CreateParityLoggingSmallWriteDAG(RF_Raid_t *raidPtr, + RF_AccessStripeMap_t *asmap, RF_DagHeader_t *dag_h, + void *bp, RF_RaidAccessFlags_t flags, RF_AllocListElem_t *allocList, + RF_RedFuncs_t *pfuncs, RF_RedFuncs_t *qfuncs); + +#endif /* !_RF__RF_PARITYLOGGINGDAGS_H_ */ diff --git a/sys/dev/raidframe/rf_parityscan.c b/sys/dev/raidframe/rf_parityscan.c new file mode 100644 index 00000000000..3e6086873be --- /dev/null +++ b/sys/dev/raidframe/rf_parityscan.c @@ -0,0 +1,553 @@ +/* $OpenBSD: rf_parityscan.c,v 1.1 1999/01/11 14:29:37 niklas Exp $ */ +/* $NetBSD: rf_parityscan.c,v 1.1 1998/11/13 04:20:32 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 
+ * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/***************************************************************************** + * + * rf_parityscan.c -- misc utilities related to parity verification + * + *****************************************************************************/ + +/* + * : + * Log: rf_parityscan.c,v + * Revision 1.47 1996/08/20 20:35:01 jimz + * change diagnostic string in rewrite + * + * Revision 1.46 1996/08/20 20:03:19 jimz + * fixed parity rewrite to actually use arch-specific parity stuff + * (this ever worked... how?) + * + * Revision 1.45 1996/08/16 17:41:25 jimz + * allow rewrite parity on any fault-tolerant arch + * + * Revision 1.44 1996/07/28 20:31:39 jimz + * i386netbsd port + * true/false fixup + * + * Revision 1.43 1996/07/27 23:36:08 jimz + * Solaris port of simulator + * + * Revision 1.42 1996/07/22 21:12:01 jimz + * clean up parity scan status printing + * + * Revision 1.41 1996/07/22 19:52:16 jimz + * switched node params to RF_DagParam_t, a union of + * a 64-bit int and a void *, for better portability + * attempted hpux port, but failed partway through for + * lack of a single C compiler capable of compiling all + * source files + * + * Revision 1.40 1996/07/13 00:00:59 jimz + * sanitized generalized reconstruction architecture + * cleaned up head sep, rbuf problems + * + * Revision 1.39 1996/07/09 21:44:26 jimz + * fix bogus return code in VerifyParityBasic when a stripe can't be corrected + * + * Revision 1.38 1996/06/20 17:56:57 jimz + * update VerifyParity to check complete AccessStripeMaps + * + * Revision 1.37 1996/06/19 22:23:01 jimz + * parity verification is now a layout-configurable thing + * not all layouts currently support it (correctly, anyway) + * + * Revision 1.36 1996/06/09 02:36:46 jimz + * lots of little crufty cleanup- fixup whitespace + * issues, comment #ifdefs, improve typing in some + * places (esp size-related) + * + * Revision 1.35 1996/06/07 22:26:27 jimz + * type-ify which_ru (RF_ReconUnitNum_t) + * + * Revision 1.34 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.33 1996/06/05 18:06:02 jimz + * Major code cleanup. The Great Renaming is now done. + * Better modularity. Better typing. Fixed a bunch of + * synchronization bugs. Made a lot of global stuff + * per-desc or per-array. Removed dead code. + * + * Revision 1.32 1996/06/02 17:31:48 jimz + * Moved a lot of global stuff into array structure, where it belongs. + * Fixed up paritylogging, pss modules in this manner. Some general + * code cleanup. Removed lots of dead code, some dead files. + * + * Revision 1.31 1996/05/31 22:26:54 jimz + * fix a lot of mapping problems, memory allocation problems + * found some weird lock issues, fixed 'em + * more code cleanup + * + * Revision 1.30 1996/05/30 23:22:16 jimz + * bugfixes of serialization, timing problems + * more cleanup + * + * Revision 1.29 1996/05/30 12:59:18 jimz + * make etimer happier, more portable + * + * Revision 1.28 1996/05/30 11:29:41 jimz + * Numerous bug fixes. 
Stripe lock release code disagreed with the taking code + * about when stripes should be locked (I made it consistent: no parity, no lock) + * There was a lot of extra serialization of I/Os which I've removed- a lot of + * it was to calculate values for the cache code, which is no longer with us. + * More types, function, macro cleanup. Added code to properly quiesce the array + * on shutdown. Made a lot of stuff array-specific which was (bogusly) general + * before. Fixed memory allocation, freeing bugs. + * + * Revision 1.27 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.26 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.25 1996/05/24 04:28:55 jimz + * release cleanup ckpt + * + * Revision 1.24 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.23 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.22 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.21 1996/05/08 21:01:24 jimz + * fixed up enum type names that were conflicting with other + * enums and function names (ie, "panic") + * future naming trends will be towards RF_ and rf_ for + * everything raidframe-related + * + * Revision 1.20 1995/12/12 18:10:06 jimz + * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT + * fix 80-column brain damage in comments + * + * Revision 1.19 1995/11/30 16:16:49 wvcii + * added copyright info + * + * Revision 1.18 1995/11/19 16:32:19 wvcii + * eliminated initialization of dag header fields which no longer exist + * (numDags, numDagsDone, firstHdr) + * + * Revision 1.17 1995/11/07 16:23:36 wvcii + * added comments, asserts, and prototypes + * encoded commit point nodes, barrier, and antecedents types into dags + * + */ + +#include "rf_types.h" +#include "rf_raid.h" +#include "rf_dag.h" +#include "rf_dagfuncs.h" +#include "rf_dagutils.h" +#include "rf_mcpair.h" +#include "rf_general.h" +#include "rf_engine.h" +#include "rf_parityscan.h" +#include "rf_map.h" +#include "rf_sys.h" + +/***************************************************************************************** + * + * walk through the entire arry and write new parity. + * This works by creating two DAGs, one to read a stripe of data and one to + * write new parity. The first is executed, the data is xored together, and + * then the second is executed. To avoid constantly building and tearing down + * the DAGs, we create them a priori and fill them in with the mapping + * information as we go along. + * + * there should never be more than one thread running this. 
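The verification step this rewrite loop relies on (rf_VerifyParityBasic, below) boils down to: read every data unit of the stripe plus the parity unit, XOR the data units into a zero-filled scratch buffer, and byte-compare the result with the parity that was read. A compact sketch of that check with the DAG machinery stripped away (the names here are illustrative, not RAIDframe identifiers):

#include <stddef.h>
#include <string.h>

/*
 * Recompute parity from the data columns of one stripe and compare it
 * with the parity column read from disk.  Returns nonzero when the
 * stored parity matches the data.  "data" holds numDataCol pointers,
 * each to one stripe unit of nbytes bytes.
 */
static int
stripe_parity_ok(unsigned char **data, int numDataCol,
    const unsigned char *parity, unsigned char *scratch, size_t nbytes)
{
	int col;
	size_t i;

	memset(scratch, 0, nbytes);		/* like the calloc'd pbuf */
	for (col = 0; col < numDataCol; col++)
		for (i = 0; i < nbytes; i++)
			scratch[i] ^= data[col][i];
	return (memcmp(scratch, parity, nbytes) == 0);
}

rf_VerifyParityBasic does the same work with rf_bxor() over the buffers filled in by its read DAG and, when correct_it is set, pushes the recomputed buffer back to disk through a one-node "Wnp" write DAG.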
+ * + ****************************************************************************************/ + +int rf_RewriteParity(raidPtr) + RF_Raid_t *raidPtr; +{ + RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; + RF_AccessStripeMapHeader_t *asm_h; + int old_pctg, new_pctg, rc; + RF_PhysDiskAddr_t pda; + RF_SectorNum_t i; + + pda.startSector = 0; + pda.numSector = raidPtr->Layout.sectorsPerStripeUnit; + old_pctg = -1; + +/* rf_verifyParityDebug=1; */ + for (i=0; i<raidPtr->totalSectors; i+=layoutPtr->dataSectorsPerStripe) { + asm_h = rf_MapAccess(raidPtr, i, layoutPtr->dataSectorsPerStripe, NULL, RF_DONT_REMAP); + rc = rf_VerifyParity(raidPtr, asm_h->stripeMap, 1, 0); + /* printf("Parity verified: rc=%d\n",rc); */ + switch (rc) { + case RF_PARITY_OKAY: + case RF_PARITY_CORRECTED: + break; + case RF_PARITY_BAD: + printf("Parity bad during correction\n"); + RF_PANIC(); + break; + case RF_PARITY_COULD_NOT_CORRECT: + printf("Could not correct bad parity\n"); + RF_PANIC(); + break; + case RF_PARITY_COULD_NOT_VERIFY: + printf("Could not verify parity\n"); + RF_PANIC(); + break; + default: + printf("Bad rc=%d from VerifyParity in RewriteParity\n", rc); + RF_PANIC(); + } + rf_FreeAccessStripeMap(asm_h); + new_pctg = i*1000/raidPtr->totalSectors; + if (new_pctg != old_pctg) { +#ifndef KERNEL + fprintf(stderr,"\rParity rewrite: %d.%d%% complete", + new_pctg/10, new_pctg%10); + fflush(stderr); +#endif /* !KERNEL */ + } + old_pctg = new_pctg; + } +#ifndef KERNEL + fprintf(stderr,"\rParity rewrite: 100.0%% complete\n"); +#endif /* !KERNEL */ +#if 1 + return(0); /* XXX nothing was here.. GO */ +#endif +} + +/***************************************************************************************** + * + * verify that the parity in a particular stripe is correct. + * we validate only the range of parity defined by parityPDA, since + * this is all we have locked. The way we do this is to create an asm + * that maps the whole stripe and then range-restrict it to the parity + * region defined by the parityPDA. + * + ****************************************************************************************/ +int rf_VerifyParity(raidPtr, aasm, correct_it, flags) + RF_Raid_t *raidPtr; + RF_AccessStripeMap_t *aasm; + int correct_it; + RF_RaidAccessFlags_t flags; +{ + RF_PhysDiskAddr_t *parityPDA; + RF_AccessStripeMap_t *doasm; + RF_LayoutSW_t *lp; + int lrc, rc; + + lp = raidPtr->Layout.map; + if (lp->faultsTolerated == 0) { + /* + * There isn't any parity. Call it "okay." 
+ */ + return(RF_PARITY_OKAY); + } + rc = RF_PARITY_OKAY; + if (lp->VerifyParity) { + for(doasm=aasm;doasm;doasm=doasm->next) { + for(parityPDA=doasm->parityInfo;parityPDA;parityPDA=parityPDA->next) { + lrc = lp->VerifyParity(raidPtr, doasm->raidAddress, parityPDA, + correct_it, flags); + if (lrc > rc) { + /* see rf_parityscan.h for why this works */ + rc = lrc; + } + } + } + } + else { + rc = RF_PARITY_COULD_NOT_VERIFY; + } + return(rc); +} + +int rf_VerifyParityBasic(raidPtr, raidAddr, parityPDA, correct_it, flags) + RF_Raid_t *raidPtr; + RF_RaidAddr_t raidAddr; + RF_PhysDiskAddr_t *parityPDA; + int correct_it; + RF_RaidAccessFlags_t flags; +{ + RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); + RF_RaidAddr_t startAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, raidAddr); + RF_SectorCount_t numsector = parityPDA->numSector; + int numbytes = rf_RaidAddressToByte(raidPtr, numsector); + int bytesPerStripe = numbytes * layoutPtr->numDataCol; + RF_DagHeader_t *rd_dag_h, *wr_dag_h; /* read, write dag */ + RF_DagNode_t *blockNode, *unblockNode, *wrBlock, *wrUnblock; + RF_AccessStripeMapHeader_t *asm_h; + RF_AccessStripeMap_t *asmap; + RF_AllocListElem_t *alloclist; + RF_PhysDiskAddr_t *pda; + char *pbuf, *buf, *end_p, *p; + int i, retcode; + RF_ReconUnitNum_t which_ru; + RF_StripeNum_t psID = rf_RaidAddressToParityStripeID(layoutPtr, raidAddr, &which_ru); + int stripeWidth = layoutPtr->numDataCol + layoutPtr->numParityCol; + RF_AccTraceEntry_t tracerec; + RF_MCPair_t *mcpair; + + retcode = RF_PARITY_OKAY; + + mcpair = rf_AllocMCPair(); + rf_MakeAllocList(alloclist); + RF_MallocAndAdd(buf, numbytes * (layoutPtr->numDataCol + layoutPtr->numParityCol), (char *), alloclist); + RF_CallocAndAdd(pbuf, 1, numbytes, (char *), alloclist); /* use calloc to make sure buffer is zeroed */ + end_p = buf + bytesPerStripe; + + rd_dag_h = rf_MakeSimpleDAG(raidPtr, stripeWidth, numbytes, buf, rf_DiskReadFunc, rf_DiskReadUndoFunc, + "Rod", alloclist, flags, RF_IO_NORMAL_PRIORITY); + blockNode = rd_dag_h->succedents[0]; + unblockNode = blockNode->succedents[0]->succedents[0]; + + /* map the stripe and fill in the PDAs in the dag */ + asm_h = rf_MapAccess(raidPtr, startAddr, layoutPtr->dataSectorsPerStripe, buf, RF_DONT_REMAP); + asmap = asm_h->stripeMap; + + for (pda=asmap->physInfo,i=0; i<layoutPtr->numDataCol; i++,pda=pda->next) { + RF_ASSERT(pda); + rf_RangeRestrictPDA(raidPtr, parityPDA, pda, 0, 1); + RF_ASSERT(pda->numSector != 0); + if (rf_TryToRedirectPDA(raidPtr, pda, 0)) goto out; /* no way to verify parity if disk is dead. 
return w/ good status */ + blockNode->succedents[i]->params[0].p = pda; + blockNode->succedents[i]->params[2].v = psID; + blockNode->succedents[i]->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); + } + + RF_ASSERT(!asmap->parityInfo->next); + rf_RangeRestrictPDA(raidPtr, parityPDA, asmap->parityInfo, 0, 1); + RF_ASSERT(asmap->parityInfo->numSector != 0); + if (rf_TryToRedirectPDA(raidPtr, asmap->parityInfo, 1)) + goto out; + blockNode->succedents[layoutPtr->numDataCol]->params[0].p = asmap->parityInfo; + + /* fire off the DAG */ + bzero((char *)&tracerec,sizeof(tracerec)); + rd_dag_h->tracerec = &tracerec; + + if (rf_verifyParityDebug) { + printf("Parity verify read dag:\n"); + rf_PrintDAGList(rd_dag_h); + } + + RF_LOCK_MUTEX(mcpair->mutex); + mcpair->flag = 0; + rf_DispatchDAG(rd_dag_h, (void (*)(void *))rf_MCPairWakeupFunc, + (void *) mcpair); + while (!mcpair->flag) + RF_WAIT_COND(mcpair->cond, mcpair->mutex); + RF_UNLOCK_MUTEX(mcpair->mutex); + if (rd_dag_h->status != rf_enable) { + RF_ERRORMSG("Unable to verify parity: can't read the stripe\n"); + retcode = RF_PARITY_COULD_NOT_VERIFY; + goto out; + } + + for (p=buf; p<end_p; p+=numbytes) { + rf_bxor(p, pbuf, numbytes, NULL); + } + for (i=0; i<numbytes; i++) { +#if 0 + if (pbuf[i]!=0 || buf[bytesPerStripe+i]!=0) { + printf("Bytes: %d %d %d\n",i,pbuf[i],buf[bytesPerStripe+i]); + } +#endif + if (pbuf[i] != buf[bytesPerStripe+i]) { + if (!correct_it) + RF_ERRORMSG3("Parity verify error: byte %d of parity is 0x%x should be 0x%x\n", + i,(u_char) buf[bytesPerStripe+i],(u_char) pbuf[i]); + retcode = RF_PARITY_BAD; + break; + } + } + + if (retcode && correct_it) { + wr_dag_h = rf_MakeSimpleDAG(raidPtr, 1, numbytes, pbuf, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, + "Wnp", alloclist, flags, RF_IO_NORMAL_PRIORITY); + wrBlock = wr_dag_h->succedents[0]; wrUnblock = wrBlock->succedents[0]->succedents[0]; + wrBlock->succedents[0]->params[0].p = asmap->parityInfo; + wrBlock->succedents[0]->params[2].v = psID; + wrBlock->succedents[0]->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); + bzero((char *)&tracerec,sizeof(tracerec)); + wr_dag_h->tracerec = &tracerec; + if (rf_verifyParityDebug) { + printf("Parity verify write dag:\n"); + rf_PrintDAGList(wr_dag_h); + } + RF_LOCK_MUTEX(mcpair->mutex); + mcpair->flag = 0; + rf_DispatchDAG(wr_dag_h, (void (*)(void *))rf_MCPairWakeupFunc, + (void *) mcpair); + while (!mcpair->flag) + RF_WAIT_COND(mcpair->cond, mcpair->mutex); + RF_UNLOCK_MUTEX(mcpair->mutex); + if (wr_dag_h->status != rf_enable) { + RF_ERRORMSG("Unable to correct parity in VerifyParity: can't write the stripe\n"); + retcode = RF_PARITY_COULD_NOT_CORRECT; + } + rf_FreeDAG(wr_dag_h); + if (retcode == RF_PARITY_BAD) + retcode = RF_PARITY_CORRECTED; + } + +out: + rf_FreeAccessStripeMap(asm_h); + rf_FreeAllocList(alloclist); + rf_FreeDAG(rd_dag_h); + rf_FreeMCPair(mcpair); + return(retcode); +} + +int rf_TryToRedirectPDA(raidPtr, pda, parity) + RF_Raid_t *raidPtr; + RF_PhysDiskAddr_t *pda; + int parity; +{ + if (raidPtr->Disks[pda->row][pda->col].status == rf_ds_reconstructing) { + if (rf_CheckRUReconstructed(raidPtr->reconControl[pda->row]->reconMap, pda->startSector)) { + if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) { + RF_RowCol_t or = pda->row, oc = pda->col; + RF_SectorNum_t os = pda->startSector; + if (parity) { + (raidPtr->Layout.map->MapParity)(raidPtr, pda->raidAddress, &pda->row, &pda->col, &pda->startSector, RF_REMAP); + if (rf_verifyParityDebug) printf("VerifyParity: Redir P r %d c %d 
sect %ld -> r %d c %d sect %ld\n", + or,oc,(long)os,pda->row,pda->col,(long)pda->startSector); + } else { + (raidPtr->Layout.map->MapSector)(raidPtr, pda->raidAddress, &pda->row, &pda->col, &pda->startSector, RF_REMAP); + if (rf_verifyParityDebug) printf("VerifyParity: Redir D r %d c %d sect %ld -> r %d c %d sect %ld\n", + or,oc,(long)os,pda->row,pda->col,(long)pda->startSector); + } + } else { + RF_RowCol_t spRow = raidPtr->Disks[pda->row][pda->col].spareRow; + RF_RowCol_t spCol = raidPtr->Disks[pda->row][pda->col].spareCol; + pda->row = spRow; + pda->col = spCol; + } + } + } + if (RF_DEAD_DISK(raidPtr->Disks[pda->row][pda->col].status)) return(1); + return(0); +} + +/***************************************************************************************** + * + * currently a stub. + * + * takes as input an ASM describing a write operation and containing one failure, and + * verifies that the parity was correctly updated to reflect the write. + * + * if it's a data unit that's failed, we read the other data units in the stripe and + * the parity unit, XOR them together, and verify that we get the data intended for + * the failed disk. Since it's easy, we also validate that the right data got written + * to the surviving data disks. + * + * If it's the parity that failed, there's really no validation we can do except the + * above verification that the right data got written to all disks. This is because + * the new data intended for the failed disk is supplied in the ASM, but this is of + * course not the case for the new parity. + * + ****************************************************************************************/ +int rf_VerifyDegrModeWrite(raidPtr, asmh) + RF_Raid_t *raidPtr; + RF_AccessStripeMapHeader_t *asmh; +{ + return(0); +} + +/* creates a simple DAG with a header, a block-recon node at level 1, + * nNodes nodes at level 2, an unblock-recon node at level 3, and + * a terminator node at level 4. The stripe address field in + * the block and unblock nodes are not touched, nor are the pda + * fields in the second-level nodes, so they must be filled in later. 
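The graph rf_MakeSimpleDAG produces is the smallest useful shape in this file: one block node fanning out to nNodes worker nodes, all of which fan back into one unblock node, followed by a terminator. The toy version below shows only that succedent/antecedent wiring, with the RAIDframe node initialization, params, and antType bookkeeping left out (struct and function names are illustrative, and allocation failures are ignored for brevity):

#include <stdlib.h>

struct toy_node {
	struct toy_node **succ;	/* succedents */
	struct toy_node **ante;	/* antecedents */
	int nsucc, nante;
};

/* Build block -> nNodes workers -> unblock -> term, as a bare skeleton. */
static struct toy_node *
make_simple_shape(int nNodes)
{
	struct toy_node *nodes = calloc(nNodes + 3, sizeof(*nodes));
	struct toy_node *block = &nodes[nNodes];
	struct toy_node *unblock = &nodes[nNodes + 1];
	struct toy_node *term = &nodes[nNodes + 2];
	int i;

	block->nsucc = unblock->nante = nNodes;
	block->succ = calloc(nNodes, sizeof(*block->succ));
	unblock->ante = calloc(nNodes, sizeof(*unblock->ante));
	unblock->nsucc = term->nante = 1;
	unblock->succ = calloc(1, sizeof(*unblock->succ));
	term->ante = calloc(1, sizeof(*term->ante));
	unblock->succ[0] = term;
	term->ante[0] = unblock;

	for (i = 0; i < nNodes; i++) {
		nodes[i].nsucc = nodes[i].nante = 1;
		nodes[i].succ = calloc(1, sizeof(*nodes[i].succ));
		nodes[i].ante = calloc(1, sizeof(*nodes[i].ante));
		block->succ[i] = unblock->ante[i] = &nodes[i];
		nodes[i].succ[0] = unblock;
		nodes[i].ante[0] = block;
	}
	return nodes;
}

In the real function the worker nodes additionally get their disk-I/O do/undo functions and a data buffer at params[1]; the caller (rf_VerifyParityBasic above) is the one that later fills in params[0] with each PDA.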
+ * + * commit point is established at unblock node - this means that any + * failure during dag execution causes the dag to fail + */ +RF_DagHeader_t *rf_MakeSimpleDAG(raidPtr, nNodes, bytesPerSU, databuf, doFunc, undoFunc, name, alloclist, flags, priority) + RF_Raid_t *raidPtr; + int nNodes; + int bytesPerSU; + char *databuf; + int (*doFunc)(RF_DagNode_t *node); + int (*undoFunc)(RF_DagNode_t *node); + char *name; /* node names at the second level */ + RF_AllocListElem_t *alloclist; + RF_RaidAccessFlags_t flags; + int priority; +{ + RF_DagHeader_t *dag_h; + RF_DagNode_t *nodes, *termNode, *blockNode, *unblockNode; + int i; + + /* create the nodes, the block & unblock nodes, and the terminator node */ + RF_CallocAndAdd(nodes, nNodes+3, sizeof(RF_DagNode_t), (RF_DagNode_t *), alloclist); + blockNode = &nodes[nNodes]; + unblockNode = blockNode+1; + termNode = unblockNode+1; + + dag_h = rf_AllocDAGHeader(); + dag_h->raidPtr = (void *) raidPtr; + dag_h->allocList = NULL; /* we won't use this alloc list */ + dag_h->status = rf_enable; + dag_h->numSuccedents = 1; + dag_h->creator = "SimpleDAG"; + + /* this dag can not commit until the unblock node is reached + * errors prior to the commit point imply the dag has failed + */ + dag_h->numCommitNodes = 1; + dag_h->numCommits = 0; + + dag_h->succedents[0] = blockNode; + rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nNodes, 0, 0, 0, dag_h, "Nil", alloclist); + rf_InitNode(unblockNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, 1, nNodes, 0, 0, dag_h, "Nil", alloclist); + unblockNode->succedents[0] = termNode; + for (i=0; i<nNodes; i++) { + blockNode->succedents[i] = unblockNode->antecedents[i] = &nodes[i]; + unblockNode->antType[i] = rf_control; + rf_InitNode(&nodes[i], rf_wait, RF_FALSE, doFunc, undoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, name, alloclist); + nodes[i].succedents[0] = unblockNode; + nodes[i].antecedents[0] = blockNode; + nodes[i].antType[0] = rf_control; + nodes[i].params[1].p = (databuf + (i*bytesPerSU)); + } + rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm", alloclist); + termNode->antecedents[0] = unblockNode; + termNode->antType[0] = rf_control; + return(dag_h); +} diff --git a/sys/dev/raidframe/rf_parityscan.h b/sys/dev/raidframe/rf_parityscan.h new file mode 100644 index 00000000000..44aec7e2ca6 --- /dev/null +++ b/sys/dev/raidframe/rf_parityscan.h @@ -0,0 +1,118 @@ +/* $OpenBSD: rf_parityscan.h,v 1.1 1999/01/11 14:29:38 niklas Exp $ */ +/* $NetBSD: rf_parityscan.h,v 1.1 1998/11/13 04:20:32 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 
+ * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* : + * Log: rf_parityscan.h,v + * Revision 1.14 1996/07/05 18:01:12 jimz + * don't make parity protos ndef KERNEL + * + * Revision 1.13 1996/06/20 17:41:43 jimz + * change decl for VerifyParity + * + * Revision 1.12 1996/06/20 15:38:39 jimz + * renumber parityscan return codes + * + * Revision 1.11 1996/06/19 22:23:01 jimz + * parity verification is now a layout-configurable thing + * not all layouts currently support it (correctly, anyway) + * + * Revision 1.10 1996/06/09 02:36:46 jimz + * lots of little crufty cleanup- fixup whitespace + * issues, comment #ifdefs, improve typing in some + * places (esp size-related) + * + * Revision 1.9 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.8 1996/06/02 17:31:48 jimz + * Moved a lot of global stuff into array structure, where it belongs. + * Fixed up paritylogging, pss modules in this manner. Some general + * code cleanup. Removed lots of dead code, some dead files. + * + * Revision 1.7 1996/05/31 22:26:54 jimz + * fix a lot of mapping problems, memory allocation problems + * found some weird lock issues, fixed 'em + * more code cleanup + * + * Revision 1.6 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.5 1996/05/24 04:28:55 jimz + * release cleanup ckpt + * + * Revision 1.4 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.3 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.2 1995/11/30 16:20:46 wvcii + * added copyright info + * + */ + +#ifndef _RF__RF_PARITYSCAN_H_ +#define _RF__RF_PARITYSCAN_H_ + +#include "rf_types.h" +#include "rf_alloclist.h" + +int rf_RewriteParity(RF_Raid_t *raidPtr); +int rf_VerifyParityBasic(RF_Raid_t *raidPtr, RF_RaidAddr_t raidAddr, + RF_PhysDiskAddr_t *parityPDA, int correct_it, RF_RaidAccessFlags_t flags); +int rf_VerifyParity(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *stripeMap, + int correct_it, RF_RaidAccessFlags_t flags); +int rf_TryToRedirectPDA(RF_Raid_t *raidPtr, RF_PhysDiskAddr_t *pda, int parity); +int rf_VerifyDegrModeWrite(RF_Raid_t *raidPtr, RF_AccessStripeMapHeader_t *asmh); +RF_DagHeader_t *rf_MakeSimpleDAG(RF_Raid_t *raidPtr, int nNodes, + int bytesPerSU, char *databuf, + int (*doFunc)(RF_DagNode_t *), + int (*undoFunc)(RF_DagNode_t *), + char *name, RF_AllocListElem_t *alloclist, + RF_RaidAccessFlags_t flags, int priority); + +#define RF_DO_CORRECT_PARITY 1 +#define RF_DONT_CORRECT_PARITY 0 + +/* + * Return vals for VerifyParity operation + * + * Ordering is important here. 
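The reason ordering matters: rf_VerifyParity (in rf_parityscan.c above) walks every parity range of the access and keeps the numerically largest status it sees, so the codes defined just below must run from least to most severe for that fold to report the worst outcome. A self-contained illustration of the rule (the enum and function names here are made up for the example):

#include <stdio.h>

/* Same ordering idea as the RF_PARITY_* codes: larger means worse. */
enum {
	P_OKAY,			/* 0 */
	P_CORRECTED,		/* 1 */
	P_BAD,			/* 2 */
	P_COULD_NOT_CORRECT,	/* 3 */
	P_COULD_NOT_VERIFY	/* 4 */
};

/* Fold per-range results the way rf_VerifyParity's "if (lrc > rc)" does. */
static int
fold_status(const int *range_status, int n)
{
	int i, rc = P_OKAY;

	for (i = 0; i < n; i++)
		if (range_status[i] > rc)
			rc = range_status[i];
	return rc;
}

int
main(void)
{
	int results[] = { P_OKAY, P_CORRECTED, P_OKAY };

	/* One corrected range makes the whole access read as "corrected". */
	printf("%d\n", fold_status(results, 3));	/* prints 1 */
	return 0;
}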
+ */ +#define RF_PARITY_OKAY 0 /* or no parity information */ +#define RF_PARITY_CORRECTED 1 +#define RF_PARITY_BAD 2 +#define RF_PARITY_COULD_NOT_CORRECT 3 +#define RF_PARITY_COULD_NOT_VERIFY 4 + +#endif /* !_RF__RF_PARITYSCAN_H_ */ diff --git a/sys/dev/raidframe/rf_pq.c b/sys/dev/raidframe/rf_pq.c new file mode 100644 index 00000000000..ebbc7917b26 --- /dev/null +++ b/sys/dev/raidframe/rf_pq.c @@ -0,0 +1,1026 @@ +/* $OpenBSD: rf_pq.c,v 1.1 1999/01/11 14:29:38 niklas Exp $ */ +/* $NetBSD: rf_pq.c,v 1.1 1998/11/13 04:20:32 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Daniel Stodolsky + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* + * Code for RAID level 6 (P + Q) disk array architecture. + * + * : + * Log: rf_pq.c,v + * Revision 1.33 1996/11/05 21:10:40 jimz + * failed pda generalization + * + * Revision 1.32 1996/07/31 16:29:50 jimz + * "fix" math on 32-bit machines using RF_LONGSHIFT + * (may be incorrect) + * + * Revision 1.31 1996/07/31 15:35:01 jimz + * evenodd changes; bugfixes for double-degraded archs, generalize + * some formerly PQ-only functions + * + * Revision 1.30 1996/07/27 23:36:08 jimz + * Solaris port of simulator + * + * Revision 1.29 1996/07/22 19:52:16 jimz + * switched node params to RF_DagParam_t, a union of + * a 64-bit int and a void *, for better portability + * attempted hpux port, but failed partway through for + * lack of a single C compiler capable of compiling all + * source files + * + * Revision 1.28 1996/06/09 02:36:46 jimz + * lots of little crufty cleanup- fixup whitespace + * issues, comment #ifdefs, improve typing in some + * places (esp size-related) + * + * Revision 1.27 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.26 1996/06/02 17:31:48 jimz + * Moved a lot of global stuff into array structure, where it belongs. + * Fixed up paritylogging, pss modules in this manner. Some general + * code cleanup. Removed lots of dead code, some dead files. 
+ * + * Revision 1.25 1996/05/31 22:26:54 jimz + * fix a lot of mapping problems, memory allocation problems + * found some weird lock issues, fixed 'em + * more code cleanup + * + * Revision 1.24 1996/05/30 23:22:16 jimz + * bugfixes of serialization, timing problems + * more cleanup + * + * Revision 1.23 1996/05/30 12:59:18 jimz + * make etimer happier, more portable + * + * Revision 1.22 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.21 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.20 1996/05/24 04:28:55 jimz + * release cleanup ckpt + * + * Revision 1.19 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.18 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.17 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.16 1996/05/17 14:52:04 wvcii + * added prototyping to QDelta() + * - changed buf params from volatile unsigned long * to char * + * changed QDelta for kernel + * - just bzero the buf since kernel doesn't include pq decode table + * + * Revision 1.15 1996/05/03 19:40:20 wvcii + * added includes for dag library + * + * Revision 1.14 1995/12/12 18:10:06 jimz + * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT + * fix 80-column brain damage in comments + * + * Revision 1.13 1995/11/30 16:19:55 wvcii + * added copyright info + * + * Revision 1.12 1995/11/07 16:13:47 wvcii + * changed PQDagSelect prototype + * function no longer returns numHdrSucc, numTermAnt + * note: this file contains node functions which should be + * moved to rf_dagfuncs.c so that all node funcs are bundled together + * + * Revision 1.11 1995/10/04 03:50:33 wvcii + * removed panics, minor code cleanup in dag selection + * + * + */ + +#include "rf_archs.h" +#include "rf_types.h" +#include "rf_raid.h" +#include "rf_dag.h" +#include "rf_dagffrd.h" +#include "rf_dagffwr.h" +#include "rf_dagdegrd.h" +#include "rf_dagdegwr.h" +#include "rf_dagutils.h" +#include "rf_dagfuncs.h" +#include "rf_threadid.h" +#include "rf_etimer.h" +#include "rf_pqdeg.h" +#include "rf_general.h" +#include "rf_map.h" +#include "rf_pq.h" +#include "rf_sys.h" + +RF_RedFuncs_t rf_pFuncs = { rf_RegularONPFunc, "Regular Old-New P", rf_SimpleONPFunc, "Simple Old-New P" }; +RF_RedFuncs_t rf_pRecoveryFuncs = { rf_RecoveryPFunc, "Recovery P Func", rf_RecoveryPFunc, "Recovery P Func" }; + +int rf_RegularONPFunc(node) + RF_DagNode_t *node; +{ + return(rf_RegularXorFunc(node)); +} + +/* + same as simpleONQ func, but the coefficient is always 1 +*/ + +int rf_SimpleONPFunc(node) + RF_DagNode_t *node; +{ + return(rf_SimpleXorFunc(node)); +} + +int rf_RecoveryPFunc(node) +RF_DagNode_t *node; +{ + return(rf_RecoveryXorFunc(node)); +} + +int rf_RegularPFunc(node) + RF_DagNode_t *node; +{ + return(rf_RegularXorFunc(node)); +} + +#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) + +static void QDelta(char *dest, char *obuf, char *nbuf, unsigned length, + unsigned char coeff); +static void rf_InvertQ(unsigned long *qbuf, unsigned long *abuf, + unsigned length, unsigned coeff); + +RF_RedFuncs_t rf_qFuncs = { rf_RegularONQFunc, "Regular Old-New Q", rf_SimpleONQFunc, "Simple Old-New Q" }; +RF_RedFuncs_t 
rf_qRecoveryFuncs = { rf_RecoveryQFunc, "Recovery Q Func", rf_RecoveryQFunc, "Recovery Q Func" }; +RF_RedFuncs_t rf_pqRecoveryFuncs = { rf_RecoveryPQFunc, "Recovery PQ Func", rf_RecoveryPQFunc, "Recovery PQ Func" }; + +void rf_PQDagSelect( + RF_Raid_t *raidPtr, + RF_IoType_t type, + RF_AccessStripeMap_t *asmap, + RF_VoidFuncPtr *createFunc) +{ + RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); + unsigned ndfail = asmap->numDataFailed; + unsigned npfail = asmap->numParityFailed; + unsigned ntfail = npfail + ndfail; + + RF_ASSERT(RF_IO_IS_R_OR_W(type)); + if (ntfail > 2) + { + RF_ERRORMSG("more than two disks failed in a single group! Aborting I/O operation.\n"); + /* *infoFunc = */ *createFunc = NULL; + return; + } + + /* ok, we can do this I/O */ + if (type == RF_IO_TYPE_READ) + { + switch (ndfail) + { + case 0: + /* fault free read */ + *createFunc = rf_CreateFaultFreeReadDAG; /* same as raid 5 */ + break; + case 1: + /* lost a single data unit */ + /* two cases: + (1) parity is not lost. + do a normal raid 5 reconstruct read. + (2) parity is lost. + do a reconstruct read using "q". + */ + if (ntfail == 2) /* also lost redundancy */ + { + if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_PARITY) + *createFunc = rf_PQ_110_CreateReadDAG; + else + *createFunc = rf_PQ_101_CreateReadDAG; + } + else + { + /* P and Q are ok. But is there a failure + in some unaccessed data unit? + */ + if (rf_NumFailedDataUnitsInStripe(raidPtr,asmap)==2) + *createFunc = rf_PQ_200_CreateReadDAG; + else + *createFunc = rf_PQ_100_CreateReadDAG; + } + break; + case 2: + /* lost two data units */ + /* *infoFunc = PQOneTwo; */ + *createFunc = rf_PQ_200_CreateReadDAG; + break; + } + return; + } + + /* a write */ + switch (ntfail) + { + case 0: /* fault free */ + if (rf_suppressLocksAndLargeWrites || + (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) && (layoutPtr->numDataCol != 1)) || + (asmap->parityInfo->next != NULL) || (asmap->qInfo->next != NULL) || rf_CheckStripeForFailures(raidPtr, asmap))) { + + *createFunc = rf_PQCreateSmallWriteDAG; + } + else { + *createFunc = rf_PQCreateLargeWriteDAG; + } + break; + + case 1: /* single disk fault */ + if (npfail==1) + { + RF_ASSERT ((asmap->failedPDAs[0]->type == RF_PDA_TYPE_PARITY) || (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q)); + if (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q) + { /* q died, treat like normal mode raid5 write.*/ + if (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) || (asmap->numStripeUnitsAccessed == 1)) + || rf_NumFailedDataUnitsInStripe(raidPtr,asmap)) + *createFunc = rf_PQ_001_CreateSmallWriteDAG; + else + *createFunc = rf_PQ_001_CreateLargeWriteDAG; + } + else + { /* parity died, small write only updating Q */ + if (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) || (asmap->numStripeUnitsAccessed == 1)) + || rf_NumFailedDataUnitsInStripe(raidPtr,asmap)) + *createFunc = rf_PQ_010_CreateSmallWriteDAG; + else + *createFunc = rf_PQ_010_CreateLargeWriteDAG; + } + } + else + { /* data missing. + Do a P reconstruct write if only a single data unit + is lost in the stripe, otherwise a PQ reconstruct + write. 
*/ + if (rf_NumFailedDataUnitsInStripe(raidPtr,asmap)==2) + *createFunc = rf_PQ_200_CreateWriteDAG; + else + *createFunc = rf_PQ_100_CreateWriteDAG; + } + break; + + case 2: /* two disk faults */ + switch (npfail) + { + case 2: /* both p and q dead */ + *createFunc = rf_PQ_011_CreateWriteDAG; + break; + case 1: /* either p or q and dead data */ + RF_ASSERT(asmap->failedPDAs[0]->type == RF_PDA_TYPE_DATA); + RF_ASSERT ((asmap->failedPDAs[1]->type == RF_PDA_TYPE_PARITY) || (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q)); + if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q) + *createFunc = rf_PQ_101_CreateWriteDAG; + else + *createFunc = rf_PQ_110_CreateWriteDAG; + break; + case 0: /* double data loss */ + *createFunc = rf_PQ_200_CreateWriteDAG; + break; + } + break; + + default: /* more than 2 disk faults */ + *createFunc = NULL; + RF_PANIC(); + } + return; +} + +/* + Used as a stop gap info function +*/ +static void PQOne(raidPtr, nSucc, nAnte, asmap) + RF_Raid_t *raidPtr; + int *nSucc; + int *nAnte; + RF_AccessStripeMap_t *asmap; +{ + *nSucc = *nAnte = 1; +} + +static void PQOneTwo(raidPtr, nSucc, nAnte, asmap) + RF_Raid_t *raidPtr; + int *nSucc; + int *nAnte; + RF_AccessStripeMap_t *asmap; +{ + *nSucc = 1; + *nAnte = 2; +} + +RF_CREATE_DAG_FUNC_DECL(rf_PQCreateLargeWriteDAG) +{ + rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 2, + rf_RegularPQFunc, RF_FALSE); +} + +int rf_RegularONQFunc(node) + RF_DagNode_t *node; +{ + int np = node->numParams; + int d; + RF_Raid_t *raidPtr = (RF_Raid_t *)node->params[np-1].p; + int i; + RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; + RF_Etimer_t timer; + char *qbuf, *qpbuf; + char *obuf, *nbuf; + RF_PhysDiskAddr_t *old, *new; + unsigned long coeff; + unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit; + + RF_ETIMER_START(timer); + + d = (np-3)/4; + RF_ASSERT (4*d+3 == np); + qbuf = (char *) node->params[2*d+1].p; /* q buffer*/ + for (i=0; i < d; i++) + { + old = (RF_PhysDiskAddr_t *) node->params[2*i].p; + obuf = (char *) node->params[2*i+1].p; + new = (RF_PhysDiskAddr_t *) node->params[2*(d+1+i)].p; + nbuf = (char *) node->params[2*(d+1+i)+1].p; + RF_ASSERT (new->numSector == old->numSector); + RF_ASSERT (new->raidAddress == old->raidAddress); + /* the stripe unit within the stripe tells us the coefficient to use + for the multiply. */ + coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),new->raidAddress); + /* compute the data unit offset within the column, then add one */ + coeff = (coeff % raidPtr->Layout.numDataCol); + qpbuf = qbuf + rf_RaidAddressToByte(raidPtr,old->startSector % secPerSU); + QDelta(qpbuf,obuf,nbuf, rf_RaidAddressToByte(raidPtr, old->numSector),coeff); + } + + RF_ETIMER_STOP(timer); + RF_ETIMER_EVAL(timer); + tracerec->q_us += RF_ETIMER_VAL_US(timer); + rf_GenericWakeupFunc(node, 0); /* call wake func explicitly since no I/O in this node */ + return(0); +} + +/* + See the SimpleXORFunc for the difference between a simple and regular func. + These Q functions should be used for + + new q = Q(data,old data,old q) + + style updates and not for + + q = ( new data, new data, .... ) + + computations. + + The simple q takes 2(2d+1)+1 params, where d is the number + of stripes written. The order of params is + old data pda_0, old data buffer_0, old data pda_1, old data buffer_1, ... old data pda_d, old data buffer_d + [2d] old q pda_0, old q buffer + [2d_2] new data pda_0, new data buffer_0, ... 
new data pda_d, new data buffer_d + raidPtr +*/ + +int rf_SimpleONQFunc(node) + RF_DagNode_t *node; +{ + int np = node->numParams; + int d; + RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np-1].p; + int i; + RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; + RF_Etimer_t timer; + char *qbuf; + char *obuf, *nbuf; + RF_PhysDiskAddr_t *old, *new; + unsigned long coeff; + + RF_ETIMER_START(timer); + + d = (np-3)/4; + RF_ASSERT (4*d+3 == np); + qbuf = (char *) node->params[2*d+1].p; /* q buffer*/ + for (i=0; i < d; i++) + { + old = (RF_PhysDiskAddr_t *) node->params[2*i].p; + obuf = (char *) node->params[2*i+1].p; + new = (RF_PhysDiskAddr_t *) node->params[2*(d+1+i)].p; + nbuf = (char *) node->params[2*(d+1+i)+1].p; + RF_ASSERT (new->numSector == old->numSector); + RF_ASSERT (new->raidAddress == old->raidAddress); + /* the stripe unit within the stripe tells us the coefficient to use + for the multiply. */ + coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),new->raidAddress); + /* compute the data unit offset within the column, then add one */ + coeff = (coeff % raidPtr->Layout.numDataCol); + QDelta(qbuf,obuf,nbuf, rf_RaidAddressToByte(raidPtr, old->numSector),coeff); + } + + RF_ETIMER_STOP(timer); + RF_ETIMER_EVAL(timer); + tracerec->q_us += RF_ETIMER_VAL_US(timer); + rf_GenericWakeupFunc(node, 0); /* call wake func explicitly since no I/O in this node */ + return(0); +} + +RF_CREATE_DAG_FUNC_DECL(rf_PQCreateSmallWriteDAG) +{ + rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_pFuncs, &rf_qFuncs); +} + +static void RegularQSubr(node,qbuf) + RF_DagNode_t *node; + char *qbuf; +{ + int np = node->numParams; + int d; + RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np-1].p; + unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit; + int i; + RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; + RF_Etimer_t timer; + char *obuf, *qpbuf; + RF_PhysDiskAddr_t *old; + unsigned long coeff; + + RF_ETIMER_START(timer); + + d = (np-1)/2; + RF_ASSERT (2*d+1 == np); + for (i=0; i < d; i++) + { + old = (RF_PhysDiskAddr_t *) node->params[2*i].p; + obuf = (char *) node->params[2*i+1].p; + coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),old->raidAddress); + /* compute the data unit offset within the column, then add one */ + coeff = (coeff % raidPtr->Layout.numDataCol); + /* the input buffers may not all be aligned with the start of the + stripe. so shift by their sector offset within the stripe unit */ + qpbuf = qbuf + rf_RaidAddressToByte(raidPtr,old->startSector % secPerSU); + rf_IncQ((unsigned long *)qpbuf,(unsigned long *)obuf,rf_RaidAddressToByte(raidPtr, old->numSector),coeff); + } + + RF_ETIMER_STOP(timer); + RF_ETIMER_EVAL(timer); + tracerec->q_us += RF_ETIMER_VAL_US(timer); +} + +/* + used in degraded writes. 
+*/ + +static void DegrQSubr(node) + RF_DagNode_t *node; +{ + int np = node->numParams; + int d; + RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np-1].p; + unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit; + int i; + RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; + RF_Etimer_t timer; + char *qbuf = node->results[1]; + char *obuf, *qpbuf; + RF_PhysDiskAddr_t *old; + unsigned long coeff; + unsigned fail_start; + int j; + + old = (RF_PhysDiskAddr_t *)node->params[np-2].p; + fail_start = old->startSector % secPerSU; + + RF_ETIMER_START(timer); + + d = (np-2)/2; + RF_ASSERT (2*d+2 == np); + for (i=0; i < d; i++) + { + old = (RF_PhysDiskAddr_t *) node->params[2*i].p; + obuf = (char *) node->params[2*i+1].p; + coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),old->raidAddress); + /* compute the data unit offset within the column, then add one */ + coeff = (coeff % raidPtr->Layout.numDataCol); + /* the input buffers may not all be aligned with the start of the + stripe. so shift by their sector offset within the stripe unit */ + j = old->startSector % secPerSU; + RF_ASSERT(j >= fail_start); + qpbuf = qbuf + rf_RaidAddressToByte(raidPtr,j - fail_start); + rf_IncQ((unsigned long *)qpbuf,(unsigned long *)obuf,rf_RaidAddressToByte(raidPtr, old->numSector),coeff); + } + + RF_ETIMER_STOP(timer); + RF_ETIMER_EVAL(timer); + tracerec->q_us += RF_ETIMER_VAL_US(timer); +} + +/* + Called by large write code to compute the new parity and the new q. + + structure of the params: + + pda_0, buffer_0, pda_1 , buffer_1, ... , pda_d, buffer_d ( d = numDataCol + raidPtr + + for a total of 2d+1 arguments. + The result buffers results[0], results[1] are the buffers for the p and q, + respectively. + + We compute Q first, then compute P. The P calculation may try to reuse + one of the input buffers for its output, so if we computed P first, we would + corrupt the input for the q calculation. +*/ + +int rf_RegularPQFunc(node) + RF_DagNode_t *node; +{ + RegularQSubr(node,node->results[1]); + return(rf_RegularXorFunc(node)); /* does the wakeup */ +} + +int rf_RegularQFunc(node) + RF_DagNode_t *node; +{ + /* Almost ... adjust Qsubr args */ + RegularQSubr(node, node->results[0]); + rf_GenericWakeupFunc(node, 0); /* call wake func explicitly since no I/O in this node */ + return(0); +} + +/* + Called by singly degraded write code to compute the new parity and the new q. + + structure of the params: + + pda_0, buffer_0, pda_1 , buffer_1, ... , pda_d, buffer_d + failedPDA raidPtr + + for a total of 2d+2 arguments. + The result buffers results[0], results[1] are the buffers for the parity and q, + respectively. + + We compute Q first, then compute parity. The parity calculation may try to reuse + one of the input buffers for its output, so if we computed parity first, we would + corrupt the input for the q calculation. + + We treat this identically to the regularPQ case, ignoring the failedPDA extra argument. +*/ + +void rf_Degraded_100_PQFunc(node) + RF_DagNode_t *node; +{ + int np = node->numParams; + + RF_ASSERT (np >= 2); + DegrQSubr(node); + rf_RecoveryXorFunc(node); +} + + +/* + The two below are used when reading a stripe with a single lost data unit. + The parameters are + + pda_0, buffer_0, .... pda_n, buffer_n, P pda, P buffer, failedPDA, raidPtr + + and results[0] contains the data buffer. Which is originally zero-filled. + +*/ + +/* this Q func is used by the degraded-mode dag functions to recover lost data. + * the second-to-last parameter is the PDA for the failed portion of the access. 
+ * the code here looks at this PDA and assumes that the xor target buffer is + * equal in size to the number of sectors in the failed PDA. It then uses + * the other PDAs in the parameter list to determine where within the target + * buffer the corresponding data should be xored. + * + * Recall the basic equation is + * + * Q = ( data_1 + 2 * data_2 ... + k * data_k ) mod 256 + * + * so to recover data_j we need + * + * J data_j = (Q - data_1 - 2 data_2 ....- k* data_k) mod 256 + * + * So the coefficient for each buffer is (255 - data_col), and j should be initialized by + * copying Q into it. Then we need to do a table lookup to convert to solve + * data_j /= J + * + * + */ +int rf_RecoveryQFunc(node) + RF_DagNode_t *node; +{ + RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams-1].p; + RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &raidPtr->Layout; + RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams-2].p; + int i; + RF_PhysDiskAddr_t *pda; + RF_RaidAddr_t suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr,failedPDA->startSector); + char *srcbuf, *destbuf; + RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; + RF_Etimer_t timer; + unsigned long coeff; + + RF_ETIMER_START(timer); + /* start by copying Q into the buffer */ + bcopy(node->params[node->numParams-3].p,node->results[0], + rf_RaidAddressToByte(raidPtr, failedPDA->numSector)); + for (i=0; i<node->numParams-4; i+=2) + { + RF_ASSERT (node->params[i+1].p != node->results[0]); + pda = (RF_PhysDiskAddr_t *) node->params[i].p; + srcbuf = (char *) node->params[i+1].p; + suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector); + destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr,suoffset-failedSUOffset); + coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),pda->raidAddress); + /* compute the data unit offset within the column */ + coeff = (coeff % raidPtr->Layout.numDataCol); + rf_IncQ((unsigned long *)destbuf, (unsigned long *)srcbuf, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff); + } + /* Do the nasty inversion now */ + coeff = (rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),failedPDA->startSector) % raidPtr->Layout.numDataCol); + rf_InvertQ(node->results[0],node->results[0],rf_RaidAddressToByte(raidPtr,pda->numSector),coeff); + RF_ETIMER_STOP(timer); + RF_ETIMER_EVAL(timer); + tracerec->q_us += RF_ETIMER_VAL_US(timer); + rf_GenericWakeupFunc(node, 0); + return(0); +} + +int rf_RecoveryPQFunc(node) + RF_DagNode_t *node; +{ + RF_PANIC(); + return(1); +} + +/* + Degraded write Q subroutine. + Used when P is dead. + Large-write style Q computation. + Parameters + + (pda,buf),(pda,buf),.....,(failedPDA,bufPtr),failedPDA,raidPtr. + + We ignore failedPDA. + + This is a "simple style" recovery func. 
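+
+   Purely as an illustration of the per-column accumulation the rf_IncQ calls
+   below perform (each column's data is folded into the Q buffer, weighted by
+   that column's coefficient), here is a minimal stand-alone sketch. It uses
+   an ordinary GF(2^8) multiply instead of the 5-bit symbol tables
+   (rf_qfor/rf_rn) this driver actually uses, so gf256_mul and q_accumulate
+   are hypothetical helpers, not RAIDframe code:
+
+       #include <stddef.h>
+
+       // multiply in GF(2^8), reduction polynomial x^8+x^4+x^3+x+1
+       static unsigned char gf256_mul(unsigned char a, unsigned char b)
+       {
+           unsigned char p = 0;
+           while (b) {
+               if (b & 1)
+                   p ^= a;
+               b >>= 1;
+               a = (unsigned char)((a << 1) ^ ((a & 0x80) ? 0x1b : 0));
+           }
+           return p;
+       }
+
+       // q[] starts zero-filled; call once per data column, passing
+       // that column's coefficient
+       static void q_accumulate(unsigned char *q, const unsigned char *data,
+                                size_t len, unsigned char coeff)
+       {
+           size_t i;
+           for (i = 0; i < len; i++)
+               q[i] ^= gf256_mul(coeff, data[i]);
+       }
+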
+*/ + +void rf_PQ_DegradedWriteQFunc(node) + RF_DagNode_t *node; +{ + int np = node->numParams; + int d; + RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np-1].p; + unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit; + int i; + RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; + RF_Etimer_t timer; + char *qbuf = node->results[0]; + char *obuf, *qpbuf; + RF_PhysDiskAddr_t *old; + unsigned long coeff; + int fail_start,j; + + old = (RF_PhysDiskAddr_t *) node->params[np-2].p; + fail_start = old->startSector % secPerSU; + + RF_ETIMER_START(timer); + + d = (np-2)/2; + RF_ASSERT (2*d+2 == np); + + for (i=0; i < d; i++) + { + old = (RF_PhysDiskAddr_t *) node->params[2*i].p; + obuf = (char *) node->params[2*i+1].p; + coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),old->raidAddress); + /* compute the data unit offset within the column, then add one */ + coeff = (coeff % raidPtr->Layout.numDataCol); + j = old->startSector % secPerSU; + RF_ASSERT(j >= fail_start); + qpbuf = qbuf + rf_RaidAddressToByte(raidPtr,j - fail_start); + rf_IncQ((unsigned long *)qpbuf,(unsigned long *)obuf,rf_RaidAddressToByte(raidPtr, old->numSector),coeff); + } + + RF_ETIMER_STOP(timer); + RF_ETIMER_EVAL(timer); + tracerec->q_us += RF_ETIMER_VAL_US(timer); + rf_GenericWakeupFunc(node, 0); +} + + + + +/* Q computations */ + +/* + coeff - colummn; + + compute dest ^= qfor[28-coeff][rn[coeff+1] a] + + on 5-bit basis; + length in bytes; +*/ + +void rf_IncQ(dest,buf,length,coeff) + unsigned long *dest; + unsigned long *buf; + unsigned length; + unsigned coeff; +{ + unsigned long a, d, new; + unsigned long a1, a2; + unsigned int *q = &(rf_qfor[28-coeff][0]); + unsigned r = rf_rn[coeff+1]; + +#define EXTRACT(a,i) ((a >> (5L*i)) & 0x1f) +#define INSERT(a,i) (a << (5L*i)) + + length /= 8; + /* 13 5 bit quants in a 64 bit word */ + while (length) + { + a = *buf++; + d = *dest; + a1 = EXTRACT(a,0) ^ r; + a2 = EXTRACT(a,1) ^ r; + new = INSERT(a2,1) | a1 ; + a1 = EXTRACT(a,2) ^ r; + a2 = EXTRACT(a,3) ^ r; + a1 = q[a1]; + a2 = q[a2]; + new = new | INSERT(a1,2) | INSERT (a2,3); + a1 = EXTRACT(a,4) ^ r; + a2 = EXTRACT(a,5) ^ r; + a1 = q[a1]; + a2 = q[a2]; + new = new | INSERT(a1,4) | INSERT (a2,5); + a1 = EXTRACT(a,5) ^ r; + a2 = EXTRACT(a,6) ^ r; + a1 = q[a1]; + a2 = q[a2]; + new = new | INSERT(a1,5) | INSERT (a2,6); +#if RF_LONGSHIFT > 2 + a1 = EXTRACT(a,7) ^ r; + a2 = EXTRACT(a,8) ^ r; + a1 = q[a1]; + a2 = q[a2]; + new = new | INSERT(a1,7) | INSERT (a2,8); + a1 = EXTRACT(a,9) ^ r; + a2 = EXTRACT(a,10) ^ r; + a1 = q[a1]; + a2 = q[a2]; + new = new | INSERT(a1,9) | INSERT (a2,10); + a1 = EXTRACT(a,11) ^ r; + a2 = EXTRACT(a,12) ^ r; + a1 = q[a1]; + a2 = q[a2]; + new = new | INSERT(a1,11) | INSERT (a2,12); +#endif /* RF_LONGSHIFT > 2 */ + d ^= new; + *dest++ = d; + length--; + } +} + +/* + compute + + dest ^= rf_qfor[28-coeff][rf_rn[coeff+1] (old^new) ] + + on a five bit basis. + optimization: compute old ^ new on 64 bit basis. + + length in bytes. 
+*/ + +static void QDelta( + char *dest, + char *obuf, + char *nbuf, + unsigned length, + unsigned char coeff) +{ + unsigned long a, d, new; + unsigned long a1, a2; + unsigned int *q = &(rf_qfor[28-coeff][0]); + unsigned r = rf_rn[coeff+1]; + +#ifdef KERNEL + /* PQ in kernel currently not supported because the encoding/decoding table is not present */ + bzero(dest, length); +#else /* KERNEL */ + /* this code probably doesn't work and should be rewritten -wvcii */ + /* 13 5 bit quants in a 64 bit word */ + length /= 8; + while (length) + { + a = *obuf++; /* XXX need to reorg to avoid cache conflicts */ + a ^= *nbuf++; + d = *dest; + a1 = EXTRACT(a,0) ^ r; + a2 = EXTRACT(a,1) ^ r; + a1 = q[a1]; + a2 = q[a2]; + new = INSERT(a2,1) | a1 ; + a1 = EXTRACT(a,2) ^ r; + a2 = EXTRACT(a,3) ^ r; + a1 = q[a1]; + a2 = q[a2]; + new = new | INSERT(a1,2) | INSERT (a2,3); + a1 = EXTRACT(a,4) ^ r; + a2 = EXTRACT(a,5) ^ r; + a1 = q[a1]; + a2 = q[a2]; + new = new | INSERT(a1,4) | INSERT (a2,5); + a1 = EXTRACT(a,5) ^ r; + a2 = EXTRACT(a,6) ^ r; + a1 = q[a1]; + a2 = q[a2]; + new = new | INSERT(a1,5) | INSERT (a2,6); +#if RF_LONGSHIFT > 2 + a1 = EXTRACT(a,7) ^ r; + a2 = EXTRACT(a,8) ^ r; + a1 = q[a1]; + a2 = q[a2]; + new = new | INSERT(a1,7) | INSERT (a2,8); + a1 = EXTRACT(a,9) ^ r; + a2 = EXTRACT(a,10) ^ r; + a1 = q[a1]; + a2 = q[a2]; + new = new | INSERT(a1,9) | INSERT (a2,10); + a1 = EXTRACT(a,11) ^ r; + a2 = EXTRACT(a,12) ^ r; + a1 = q[a1]; + a2 = q[a2]; + new = new | INSERT(a1,11) | INSERT (a2,12); +#endif /* RF_LONGSHIFT > 2 */ + d ^= new; + *dest++ = d; + length--; + } +#endif /* KERNEL */ +} + +/* + recover columns a and b from the given p and q into + bufs abuf and bbuf. All bufs are word aligned. + Length is in bytes. +*/ + + +/* + * XXX + * + * Everything about this seems wrong. + */ +void rf_PQ_recover(pbuf,qbuf,abuf,bbuf,length,coeff_a,coeff_b) + unsigned long *pbuf; + unsigned long *qbuf; + unsigned long *abuf; + unsigned long *bbuf; + unsigned length; + unsigned coeff_a; + unsigned coeff_b; +{ + unsigned long p, q, a, a0, a1; + int col = (29 * coeff_a) + coeff_b; + unsigned char *q0 = & (rf_qinv[col][0]); + + length /= 8; + while (length) + { + p = *pbuf++; + q = *qbuf++; + a0 = EXTRACT(p,0); + a1 = EXTRACT(q,0); + a = q0[a0<<5 | a1]; +#define MF(i) \ + a0 = EXTRACT(p,i); \ + a1 = EXTRACT(q,i); \ + a = a | INSERT(q0[a0<<5 | a1],i) + + MF(1); + MF(2); + MF(3); + MF(4); + MF(5); + MF(6); +#if 0 + MF(7); + MF(8); + MF(9); + MF(10); + MF(11); + MF(12); +#endif /* 0 */ + *abuf++ = a; + *bbuf++ = a ^ p; + length--; + } +} + +/* + Lost parity and a data column. Recover that data column. + Assume col coeff is lost. Let q the contents of Q after + all surviving data columns have been q-xored out of it. + Then we have the equation + + q[28-coeff][a_i ^ r_i+1] = q + + but q is cyclic with period 31. + So q[3+coeff][q[28-coeff][a_i ^ r_{i+1}]] = + q[31][a_i ^ r_{i+1}] = a_i ^ r_{i+1} . + + so a_i = r_{coeff+1} ^ q[3+coeff][q] + + The routine is passed q buffer and the buffer + the data is to be recoverd into. They can be the same. 
+*/ + + + +static void rf_InvertQ( + unsigned long *qbuf, + unsigned long *abuf, + unsigned length, + unsigned coeff) +{ + unsigned long a, new; + unsigned long a1, a2; + unsigned int *q = &(rf_qfor[3+coeff][0]); + unsigned r = rf_rn[coeff+1]; + + /* 13 5 bit quants in a 64 bit word */ + length /= 8; + while (length) + { + a = *qbuf++; + a1 = EXTRACT(a,0); + a2 = EXTRACT(a,1); + a1 = r ^ q[a1]; + a2 = r ^ q[a2]; + new = INSERT(a2,1) | a1; +#define M(i,j) \ + a1 = EXTRACT(a,i); \ + a2 = EXTRACT(a,j); \ + a1 = r ^ q[a1]; \ + a2 = r ^ q[a2]; \ + new = new | INSERT(a1,i) | INSERT(a2,j) + + M(2,3); + M(4,5); + M(5,6); +#if RF_LONGSHIFT > 2 + M(7,8); + M(9,10); + M(11,12); +#endif /* RF_LONGSHIFT > 2 */ + *abuf++ = new; + length--; + } +} + +#endif /* (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) */ diff --git a/sys/dev/raidframe/rf_pq.h b/sys/dev/raidframe/rf_pq.h new file mode 100644 index 00000000000..52f816354fa --- /dev/null +++ b/sys/dev/raidframe/rf_pq.h @@ -0,0 +1,115 @@ +/* $OpenBSD: rf_pq.h,v 1.1 1999/01/11 14:29:39 niklas Exp $ */ +/* $NetBSD: rf_pq.h,v 1.1 1998/11/13 04:20:32 oster Exp $ */ +/* + * rf_pq.h + */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Daniel Stodolsky + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ +/* + * : + * Log: rf_pq.h,v + * Revision 1.9 1996/07/31 15:35:05 jimz + * evenodd changes; bugfixes for double-degraded archs, generalize + * some formerly PQ-only functions + * + * Revision 1.8 1996/07/27 23:36:08 jimz + * Solaris port of simulator + * + * Revision 1.7 1996/06/03 23:28:26 jimz + * more bugfixes + * check in tree to sync for IPDS runs with current bugfixes + * there still may be a problem with threads in the script test + * getting I/Os stuck- not trivially reproducible (runs ~50 times + * in a row without getting stuck) + * + * Revision 1.6 1996/06/02 17:31:48 jimz + * Moved a lot of global stuff into array structure, where it belongs. + * Fixed up paritylogging, pss modules in this manner. Some general + * code cleanup. Removed lots of dead code, some dead files. 
+ * + * Revision 1.5 1996/05/31 22:26:54 jimz + * fix a lot of mapping problems, memory allocation problems + * found some weird lock issues, fixed 'em + * more code cleanup + * + * Revision 1.4 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.3 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.2 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.1 1996/05/18 19:56:21 jimz + * Initial revision + * + */ + +#ifndef _RF__RF_PQ_H_ +#define _RF__RF_PQ_H_ + +#include "rf_archs.h" + +extern RF_RedFuncs_t rf_pFuncs; +extern RF_RedFuncs_t rf_pRecoveryFuncs; + +int rf_RegularONPFunc(RF_DagNode_t *node); +int rf_SimpleONPFunc(RF_DagNode_t *node); +int rf_RecoveryPFunc(RF_DagNode_t *node); +int rf_RegularPFunc(RF_DagNode_t *node); + +#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) + +extern RF_RedFuncs_t rf_qFuncs; +extern RF_RedFuncs_t rf_qRecoveryFuncs; +extern RF_RedFuncs_t rf_pqRecoveryFuncs; + +void rf_PQDagSelect(RF_Raid_t *raidPtr, RF_IoType_t type, + RF_AccessStripeMap_t *asmap, RF_VoidFuncPtr *createFunc); +RF_CREATE_DAG_FUNC_DECL(rf_PQCreateLargeWriteDAG); +int rf_RegularONQFunc(RF_DagNode_t *node); +int rf_SimpleONQFunc(RF_DagNode_t *node); +RF_CREATE_DAG_FUNC_DECL(rf_PQCreateSmallWriteDAG); +int rf_RegularPQFunc(RF_DagNode_t *node); +int rf_RegularQFunc(RF_DagNode_t *node); +void rf_Degraded_100_PQFunc(RF_DagNode_t *node); +int rf_RecoveryQFunc(RF_DagNode_t *node); +int rf_RecoveryPQFunc(RF_DagNode_t *node); +void rf_PQ_DegradedWriteQFunc(RF_DagNode_t *node); +void rf_IncQ(unsigned long *dest, unsigned long *buf, unsigned length, + unsigned coeff); +void rf_PQ_recover(unsigned long *pbuf, unsigned long *qbuf, unsigned long *abuf, + unsigned long *bbuf, unsigned length, unsigned coeff_a, unsigned coeff_b); + +#endif /* (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) */ + +#endif /* !_RF__RF_PQ_H_ */ diff --git a/sys/dev/raidframe/rf_pqdeg.c b/sys/dev/raidframe/rf_pqdeg.c new file mode 100644 index 00000000000..6376201b6c3 --- /dev/null +++ b/sys/dev/raidframe/rf_pqdeg.c @@ -0,0 +1,286 @@ +/* $OpenBSD: rf_pqdeg.c,v 1.1 1999/01/11 14:29:39 niklas Exp $ */ +/* $NetBSD: rf_pqdeg.c,v 1.1 1998/11/13 04:20:32 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Daniel Stodolsky + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. 
+ */ + +/* + * Log: rf_pqdeg.c,v + * Revision 1.19 1996/11/05 21:10:40 jimz + * failed pda generalization + * + * Revision 1.18 1996/07/31 16:30:01 jimz + * asm/asmap fix + * + * Revision 1.17 1996/07/31 15:35:09 jimz + * evenodd changes; bugfixes for double-degraded archs, generalize + * some formerly PQ-only functions + * + * Revision 1.16 1996/07/27 23:36:08 jimz + * Solaris port of simulator + * + * Revision 1.15 1996/07/22 19:52:16 jimz + * switched node params to RF_DagParam_t, a union of + * a 64-bit int and a void *, for better portability + * attempted hpux port, but failed partway through for + * lack of a single C compiler capable of compiling all + * source files + * + * Revision 1.14 1996/06/02 17:31:48 jimz + * Moved a lot of global stuff into array structure, where it belongs. + * Fixed up paritylogging, pss modules in this manner. Some general + * code cleanup. Removed lots of dead code, some dead files. + * + * Revision 1.13 1996/05/31 22:26:54 jimz + * fix a lot of mapping problems, memory allocation problems + * found some weird lock issues, fixed 'em + * more code cleanup + * + * Revision 1.12 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.11 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.10 1996/05/24 04:28:55 jimz + * release cleanup ckpt + * + * Revision 1.9 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.8 1996/05/03 19:41:07 wvcii + * added includes for dag library + * + * Revision 1.7 1995/11/30 16:19:36 wvcii + * added copyright info + * + * Revision 1.6 1995/11/07 16:15:08 wvcii + * updated/added prototyping for dag creation + * + * Revision 1.5 1995/03/01 20:25:48 holland + * kernelization changes + * + * Revision 1.4 1995/02/03 22:31:36 holland + * many changes related to kernelization + * + * Revision 1.3 1995/02/01 15:13:05 holland + * moved #include of general.h out of raid.h and into each file + * + * Revision 1.2 1994/12/05 04:50:26 danner + * additional pq support + * + * Revision 1.1 1994/11/29 20:36:02 danner + * Initial revision + * + */ + +#include "rf_archs.h" + +#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) + +#include "rf_types.h" +#include "rf_raid.h" +#include "rf_dag.h" +#include "rf_dagutils.h" +#include "rf_dagfuncs.h" +#include "rf_dagffrd.h" +#include "rf_dagffwr.h" +#include "rf_dagdegrd.h" +#include "rf_dagdegwr.h" +#include "rf_threadid.h" +#include "rf_etimer.h" +#include "rf_pqdeg.h" +#include "rf_general.h" +#include "rf_pqdegdags.h" +#include "rf_pq.h" + +/* + Degraded mode dag functions for P+Q calculations. + + The following nomenclature is used. + + PQ_<D><P><Q>_Create{Large,Small}<Write|Read>DAG + + where <D><P><Q> are single digits representing the number of failed + data units <D> (0,1,2), parity units <P> (0,1), and Q units <Q>, effecting + the I/O. The reads have only PQ_<D><P><Q>_CreateReadDAG variants, while + the single fault writes have both large and small write versions. (Single fault + PQ is equivalent to normal mode raid 5 in many aspects. + + Some versions degenerate into the same case, and are grouped together below. +*/ + +/* Reads, single failure + + we have parity, so we can do a raid 5 + reconstruct read. 
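+
+   The raid 5 style reconstruct read referred to here recovers the missing
+   column as the xor of parity with every surviving data column. A minimal
+   stand-alone sketch of that idea (xor_reconstruct is an illustrative
+   helper, not part of this driver, which builds the equivalent computation
+   as a DAG):
+
+       #include <assert.h>
+       #include <stddef.h>
+
+       // missing = parity ^ survivor_0 ^ ... ^ survivor_{n-1}
+       static void xor_reconstruct(unsigned char *missing,
+                                   const unsigned char *parity,
+                                   const unsigned char **survivors,
+                                   int nsurvivors, size_t len)
+       {
+           size_t i;
+           int c;
+
+           for (i = 0; i < len; i++) {
+               unsigned char v = parity[i];
+               for (c = 0; c < nsurvivors; c++)
+                   v ^= survivors[c][i];
+               missing[i] = v;
+           }
+       }
+
+       int main(void)
+       {
+           unsigned char d0[4] = { 1, 2, 3, 4 }, d1[4] = { 5, 6, 7, 8 };
+           unsigned char p[4], lost[4];
+           const unsigned char *surv[1] = { d1 };
+           size_t i;
+
+           for (i = 0; i < 4; i++)
+               p[i] = d0[i] ^ d1[i];       // parity over the two columns
+           xor_reconstruct(lost, p, surv, 1, 4);
+           for (i = 0; i < 4; i++)
+               assert(lost[i] == d0[i]);   // d0 recovered from p and d1
+           return 0;
+       }
+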
+*/ + +RF_CREATE_DAG_FUNC_DECL(rf_PQ_100_CreateReadDAG) +{ + rf_CreateDegradedReadDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_pRecoveryFuncs); +} + +/* Reads double failure */ + +/* + Q is lost, but not parity + so we can a raid 5 reconstruct read. +*/ + +RF_CREATE_DAG_FUNC_DECL(rf_PQ_101_CreateReadDAG) +{ + rf_CreateDegradedReadDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_pRecoveryFuncs); +} + +/* + parity is lost, so we need to + do a reconstruct read and recompute + the data with Q. +*/ + +RF_CREATE_DAG_FUNC_DECL(rf_PQ_110_CreateReadDAG) +{ + RF_PhysDiskAddr_t *temp; + /* swap P and Q pointers to fake out the DegradedReadDAG code */ + temp = asmap->parityInfo; asmap->parityInfo = asmap->qInfo; asmap->qInfo = temp; + rf_CreateDegradedReadDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_qRecoveryFuncs); +} + +/* + Two data units are dead in this stripe, so we will need read + both P and Q to reconstruct the data. Note that only + one data unit we are reading may actually be missing. +*/ +RF_CREATE_DAG_FUNC_DECL(rf_CreateDoubleDegradedReadDAG) +{ + rf_PQ_DoubleDegRead(raidPtr, asmap, dag_h, bp, flags, allocList); +} + +RF_CREATE_DAG_FUNC_DECL(rf_PQ_200_CreateReadDAG) +{ + rf_CreateDoubleDegradedReadDAG(raidPtr, asmap, dag_h, bp, flags, allocList); +} + +/* Writes, single failure */ + +RF_CREATE_DAG_FUNC_DECL(rf_PQ_100_CreateWriteDAG) +{ + if (asmap->numStripeUnitsAccessed != 1 && + asmap->failedPDAs[0]->numSector != raidPtr->Layout.sectorsPerStripeUnit) + RF_PANIC(); + rf_CommonCreateSimpleDegradedWriteDAG(raidPtr, asmap, dag_h, bp, flags, + allocList, 2, (int (*)())rf_Degraded_100_PQFunc, RF_FALSE); +} + +/* Dead P - act like a RAID 5 small write with parity = Q */ +RF_CREATE_DAG_FUNC_DECL(rf_PQ_010_CreateSmallWriteDAG) +{ + RF_PhysDiskAddr_t *temp; + /* swap P and Q pointers to fake out the DegradedReadDAG code */ + temp = asmap->parityInfo; asmap->parityInfo = asmap->qInfo; asmap->qInfo = temp; + rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_qFuncs, NULL); +} + +/* Dead Q - act like a RAID 5 small write */ +RF_CREATE_DAG_FUNC_DECL(rf_PQ_001_CreateSmallWriteDAG) +{ + rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_pFuncs, NULL); +} + +/* Dead P - act like a RAID 5 large write but for Q */ +RF_CREATE_DAG_FUNC_DECL(rf_PQ_010_CreateLargeWriteDAG) +{ + RF_PhysDiskAddr_t *temp; + /* swap P and Q pointers to fake out the code */ + temp = asmap->parityInfo; asmap->parityInfo = asmap->qInfo; asmap->qInfo = temp; + rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 1, rf_RegularQFunc, RF_FALSE); +} + +/* Dead Q - act like a RAID 5 large write */ +RF_CREATE_DAG_FUNC_DECL(rf_PQ_001_CreateLargeWriteDAG) +{ + rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 1, rf_RegularPFunc, RF_FALSE); +} + + +/* + * writes, double failure + */ + +/* + * Lost P & Q - do a nonredundant write + */ +RF_CREATE_DAG_FUNC_DECL(rf_PQ_011_CreateWriteDAG) +{ + rf_CreateNonRedundantWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, + RF_IO_TYPE_WRITE); +} + +/* + In the two cases below, + A nasty case arises when the write a (strict) portion of a failed stripe unit + and parts of another su. For now, we do not support this. +*/ + +/* + Lost Data and P - do a Q write. 
+*/ +RF_CREATE_DAG_FUNC_DECL(rf_PQ_110_CreateWriteDAG) +{ + RF_PhysDiskAddr_t *temp; + + if (asmap->numStripeUnitsAccessed != 1 && + asmap->failedPDAs[0]->numSector != raidPtr->Layout.sectorsPerStripeUnit) + { + RF_PANIC(); + } + /* swap P and Q to fake out parity code */ + temp = asmap->parityInfo; + asmap->parityInfo = asmap->qInfo; + asmap->qInfo = temp; + rf_CommonCreateSimpleDegradedWriteDAG(raidPtr, asmap, dag_h, bp, flags, + allocList,1, (int (*)())rf_PQ_DegradedWriteQFunc, RF_FALSE); + /* is the regular Q func the right one to call? */ +} + +/* + Lost Data and Q - do degraded mode P write +*/ +RF_CREATE_DAG_FUNC_DECL(rf_PQ_101_CreateWriteDAG) +{ + if (asmap->numStripeUnitsAccessed != 1 && + asmap->failedPDAs[0]->numSector != raidPtr->Layout.sectorsPerStripeUnit) + RF_PANIC(); + rf_CommonCreateSimpleDegradedWriteDAG(raidPtr, asmap, dag_h, bp, flags, + allocList,1, rf_RecoveryXorFunc, RF_FALSE); +} + +#endif /* (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) */ diff --git a/sys/dev/raidframe/rf_pqdeg.h b/sys/dev/raidframe/rf_pqdeg.h new file mode 100644 index 00000000000..dc34a7970f7 --- /dev/null +++ b/sys/dev/raidframe/rf_pqdeg.h @@ -0,0 +1,93 @@ +/* $OpenBSD: rf_pqdeg.h,v 1.1 1999/01/11 14:29:39 niklas Exp $ */ +/* $NetBSD: rf_pqdeg.h,v 1.1 1998/11/13 04:20:32 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Daniel Stodolsky + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* : + * Log: rf_pqdeg.h,v + * Revision 1.7 1996/07/18 22:57:14 jimz + * port simulator to AIX + * + * Revision 1.6 1996/06/02 17:31:48 jimz + * Moved a lot of global stuff into array structure, where it belongs. + * Fixed up paritylogging, pss modules in this manner. Some general + * code cleanup. Removed lots of dead code, some dead files. + * + * Revision 1.5 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.4 1995/11/30 16:19:11 wvcii + * added copyright info + * + */ + +#ifndef _RF__RF_PQDEG_H_ +#define _RF__RF_PQDEG_H_ + +#include "rf_types.h" + +#if RF_UTILITY == 0 +#include "rf_dag.h" + +/* extern decl's of the failure mode PQ functions. + * See pddeg.c for nomenclature discussion. 
+ */ + +/* reads, single failure */ +RF_CREATE_DAG_FUNC_DECL(rf_PQ_100_CreateReadDAG); +/* reads, two failure */ +RF_CREATE_DAG_FUNC_DECL(rf_PQ_110_CreateReadDAG); +RF_CREATE_DAG_FUNC_DECL(rf_PQ_101_CreateReadDAG); +RF_CREATE_DAG_FUNC_DECL(rf_PQ_200_CreateReadDAG); + +/* writes, single failure */ +RF_CREATE_DAG_FUNC_DECL(rf_PQ_100_CreateWriteDAG); +RF_CREATE_DAG_FUNC_DECL(rf_PQ_010_CreateSmallWriteDAG); +RF_CREATE_DAG_FUNC_DECL(rf_PQ_010_CreateLargeWriteDAG); +RF_CREATE_DAG_FUNC_DECL(rf_PQ_001_CreateSmallWriteDAG); +RF_CREATE_DAG_FUNC_DECL(rf_PQ_001_CreateLargeWriteDAG); + +/* writes, double failure */ +RF_CREATE_DAG_FUNC_DECL(rf_PQ_011_CreateWriteDAG); +RF_CREATE_DAG_FUNC_DECL(rf_PQ_110_CreateWriteDAG); +RF_CREATE_DAG_FUNC_DECL(rf_PQ_101_CreateWriteDAG); +RF_CREATE_DAG_FUNC_DECL(rf_PQ_200_CreateWriteDAG); +#endif /* RF_UTILITY == 0 */ + +typedef RF_uint32 RF_ua32_t[32]; +typedef RF_uint8 RF_ua1024_t[1024]; + +extern RF_ua32_t rf_rn; +extern RF_ua32_t rf_qfor[32]; +#ifndef KERNEL /* we don't support PQ in the kernel yet, so don't link in this monster table */ +extern RF_ua1024_t rf_qinv[29*29]; +#else /* !KERNEL */ +extern RF_ua1024_t rf_qinv[1]; +#endif /* !KERNEL */ + +#endif /* !_RF__RF_PQDEG_H_ */ diff --git a/sys/dev/raidframe/rf_pqdegdags.c b/sys/dev/raidframe/rf_pqdegdags.c new file mode 100644 index 00000000000..e8346b4f941 --- /dev/null +++ b/sys/dev/raidframe/rf_pqdegdags.c @@ -0,0 +1,554 @@ +/* $OpenBSD: rf_pqdegdags.c,v 1.1 1999/01/11 14:29:40 niklas Exp $ */ +/* $NetBSD: rf_pqdegdags.c,v 1.1 1998/11/13 04:20:32 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Daniel Stodolsky + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* + * rf_pqdegdags.c + * Degraded mode dags for double fault cases. 
+*/ + +/* + * : + * Log: rf_pqdegdags.c,v + * Revision 1.31 1996/11/05 21:10:40 jimz + * failed pda generalization + * + * Revision 1.30 1996/07/31 16:30:05 jimz + * asm/asmap fix + * + * Revision 1.29 1996/07/31 15:35:15 jimz + * evenodd changes; bugfixes for double-degraded archs, generalize + * some formerly PQ-only functions + * + * Revision 1.28 1996/07/28 20:31:39 jimz + * i386netbsd port + * true/false fixup + * + * Revision 1.27 1996/07/27 23:36:08 jimz + * Solaris port of simulator + * + * Revision 1.26 1996/07/22 19:52:16 jimz + * switched node params to RF_DagParam_t, a union of + * a 64-bit int and a void *, for better portability + * attempted hpux port, but failed partway through for + * lack of a single C compiler capable of compiling all + * source files + * + * Revision 1.25 1996/06/09 02:36:46 jimz + * lots of little crufty cleanup- fixup whitespace + * issues, comment #ifdefs, improve typing in some + * places (esp size-related) + * + * Revision 1.24 1996/06/07 22:26:27 jimz + * type-ify which_ru (RF_ReconUnitNum_t) + * + * Revision 1.23 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.22 1996/06/02 17:31:48 jimz + * Moved a lot of global stuff into array structure, where it belongs. + * Fixed up paritylogging, pss modules in this manner. Some general + * code cleanup. Removed lots of dead code, some dead files. + * + * Revision 1.21 1996/05/31 22:26:54 jimz + * fix a lot of mapping problems, memory allocation problems + * found some weird lock issues, fixed 'em + * more code cleanup + * + * Revision 1.20 1996/05/30 12:59:18 jimz + * make etimer happier, more portable + * + * Revision 1.19 1996/05/30 11:29:41 jimz + * Numerous bug fixes. Stripe lock release code disagreed with the taking code + * about when stripes should be locked (I made it consistent: no parity, no lock) + * There was a lot of extra serialization of I/Os which I've removed- a lot of + * it was to calculate values for the cache code, which is no longer with us. + * More types, function, macro cleanup. Added code to properly quiesce the array + * on shutdown. Made a lot of stuff array-specific which was (bogusly) general + * before. Fixed memory allocation, freeing bugs. 
+ * + * Revision 1.18 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.17 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.16 1996/05/24 04:28:55 jimz + * release cleanup ckpt + * + * Revision 1.15 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.14 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.13 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.12 1996/05/08 21:01:24 jimz + * fixed up enum type names that were conflicting with other + * enums and function names (ie, "panic") + * future naming trends will be towards RF_ and rf_ for + * everything raidframe-related + * + * Revision 1.11 1996/05/03 19:47:50 wvcii + * removed include of rf_redstripe.h + * + * Revision 1.10 1995/12/12 18:10:06 jimz + * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT + * fix 80-column brain damage in comments + * + * Revision 1.9 1995/11/30 16:17:57 wvcii + * added copyright info + * + * Revision 1.8 1995/11/07 15:33:25 wvcii + * dag creation routines now generate term node + * added asserts + * encoded commit point nodes, antecedence types into dags + * didn't add commit barrier - the code is a mess and needs to + * be cleand up first + * + */ + +#include "rf_archs.h" + +#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) + +#include "rf_types.h" +#include "rf_raid.h" +#include "rf_dag.h" +#include "rf_dagfuncs.h" +#include "rf_dagutils.h" +#include "rf_etimer.h" +#include "rf_acctrace.h" +#include "rf_general.h" +#include "rf_pqdegdags.h" +#include "rf_pq.h" +#include "rf_sys.h" + +static void applyPDA(RF_Raid_t *raidPtr, RF_PhysDiskAddr_t *pda, RF_PhysDiskAddr_t *ppda, + RF_PhysDiskAddr_t *qpda, void *bp); + +/* + Two data drives have failed, and we are doing a read that covers one of them. + We may also be reading some of the surviving drives. + + + ***************************************************************************************** + * + * creates a DAG to perform a degraded-mode read of data within one stripe. + * This DAG is as follows: + * + * Hdr + * | + * Block + * / / \ \ \ \ + * Rud ... Rud Rrd ... Rrd Rp Rq + * | \ | \ | \ | \ | \ | \ + * + * | | + * Unblock X + * \ / + * ------ T ------ + * + * Each R node is a successor of the L node + * One successor arc from each R node goes to U, and the other to X + * There is one Rud for each chunk of surviving user data requested by the user, + * and one Rrd for each chunk of surviving user data _not_ being read by the user + * R = read, ud = user data, rd = recovery (surviving) data, p = P data, q = Qdata + * X = pq recovery node, T = terminate + * + * The block & unblock nodes are leftovers from a previous version. They + * do nothing, but I haven't deleted them because it would be a tremendous + * effort to put them back in. + * + * Note: The target buffer for the XOR node is set to the actual user buffer where the + * failed data is supposed to end up. This buffer is zero'd by the code here. Thus, + * if you create a degraded read dag, use it, and then re-use, you have to be sure to + * zero the target buffer prior to the re-use. 
+ * + * Every buffer read is passed to the pq recovery node, whose job it is to sort out whats + * needs and what's not. + ****************************************************************************************/ +/* init a disk node with 2 successors and one predecessor */ +#define INIT_DISK_NODE(node,name) \ +rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 2,1,4,0, dag_h, name, allocList); \ +(node)->succedents[0] = unblockNode; \ +(node)->succedents[1] = recoveryNode; \ +(node)->antecedents[0] = blockNode; \ +(node)->antType[0] = rf_control + +#define DISK_NODE_PARAMS(_node_,_p_) \ + (_node_).params[0].p = _p_ ; \ + (_node_).params[1].p = (_p_)->bufPtr; \ + (_node_).params[2].v = parityStripeID; \ + (_node_).params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru) + +#define DISK_NODE_PDA(node) ((node)->params[0].p) + +RF_CREATE_DAG_FUNC_DECL(rf_PQ_DoubleDegRead) +{ + rf_DoubleDegRead(raidPtr, asmap, dag_h, bp, flags, allocList, + "Rq", "PQ Recovery", rf_PQDoubleRecoveryFunc); +} + +static void applyPDA(raidPtr,pda,ppda,qpda, bp) + RF_Raid_t *raidPtr; + RF_PhysDiskAddr_t *pda; + RF_PhysDiskAddr_t *ppda; + RF_PhysDiskAddr_t *qpda; + void *bp; +{ + RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); + RF_RaidAddr_t s0off = rf_StripeUnitOffset(layoutPtr, ppda->startSector); + RF_SectorCount_t s0len = ppda->numSector, len; + RF_SectorNum_t suoffset; + unsigned coeff; + char *pbuf = ppda->bufPtr; + char *qbuf = qpda->bufPtr; + char *buf; + int delta; + + suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector); + len = pda->numSector; + /* see if pda intersects a recovery pda */ + if ((suoffset < s0off+s0len) && ( suoffset+len > s0off)) + { + buf = pda->bufPtr; + coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),pda->raidAddress); + coeff = (coeff % raidPtr->Layout.numDataCol); + + if (suoffset < s0off) + { + delta = s0off - suoffset; + buf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),delta); + suoffset = s0off; + len -= delta; + } + if (suoffset > s0off) + { + delta = suoffset - s0off; + pbuf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),delta); + qbuf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),delta); + } + if ((suoffset + len) > (s0len + s0off)) + len = s0len + s0off - suoffset; + + /* src, dest, len */ + rf_bxor(buf,pbuf,rf_RaidAddressToByte(raidPtr,len), bp); + + /* dest, src, len, coeff */ + rf_IncQ((unsigned long *)qbuf,(unsigned long *)buf,rf_RaidAddressToByte(raidPtr,len),coeff); + } +} +/* + Recover data in the case of a double failure. There can be two + result buffers, one for each chunk of data trying to be recovered. + The params are pda's that have not been range restricted or otherwise + politely massaged - this should be done here. The last params are the + pdas of P and Q, followed by the raidPtr. The list can look like + + pda, pda, ... , p pda, q pda, raidptr, asm + + or + + pda, pda, ... , p_1 pda, p_2 pda, q_1 pda, q_2 pda, raidptr, asm + + depending on wether two chunks of recovery data were required. + + The second condition only arises if there are two failed buffers + whose lengths do not add up a stripe unit. 
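+
+   The range restriction mentioned above is the clipping applyPDA does:
+   intersect each data PDA's stripe-unit range with the recovery PDA's
+   range and only apply the overlap. The same arithmetic in isolation
+   (clip_to_recovery_range is an illustrative helper, not driver code):
+
+       #include <assert.h>
+
+       // clip [off, off+len) against [s0off, s0off+s0len);
+       // returns 0 when the two ranges do not intersect
+       static int clip_to_recovery_range(unsigned long off, unsigned long len,
+                                         unsigned long s0off, unsigned long s0len,
+                                         unsigned long *clippedOff,
+                                         unsigned long *clippedLen)
+       {
+           unsigned long end = off + len, s0end = s0off + s0len;
+
+           if (off >= s0end || end <= s0off)
+               return 0;               // disjoint
+           if (off < s0off)
+               off = s0off;            // trim the front
+           if (end > s0end)
+               end = s0end;            // trim the tail
+           *clippedOff = off;
+           *clippedLen = end - off;
+           return 1;
+       }
+
+       int main(void)
+       {
+           unsigned long o, l;
+
+           // sectors 4..11 of a data unit against a failed range 8..15
+           assert(clip_to_recovery_range(4, 8, 8, 8, &o, &l) == 1);
+           assert(o == 8 && l == 4);
+           // disjoint ranges contribute nothing
+           assert(clip_to_recovery_range(0, 4, 8, 8, &o, &l) == 0);
+           return 0;
+       }
+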
+*/ + + +int rf_PQDoubleRecoveryFunc(node) + RF_DagNode_t *node; +{ + int np = node->numParams; + RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np-1].p; + RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np-2].p; + RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &(raidPtr->Layout); + int d, i; + unsigned coeff; + RF_RaidAddr_t sosAddr, suoffset; + RF_SectorCount_t len, secPerSU = layoutPtr->sectorsPerStripeUnit; + int two = 0; + RF_PhysDiskAddr_t *ppda,*ppda2,*qpda,*qpda2,*pda,npda; + char *buf; + int numDataCol = layoutPtr->numDataCol; + RF_Etimer_t timer; + RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; + + RF_ETIMER_START(timer); + + if (asmap->failedPDAs[1] && + (asmap->failedPDAs[1]->numSector + asmap->failedPDAs[0]->numSector < secPerSU)) + { + RF_ASSERT(0); + ppda = node->params[np-6].p; + ppda2 = node->params[np-5].p; + qpda = node->params[np-4].p; + qpda2 = node->params[np-3].p; + d = (np-6); + two = 1; + } + else + { + ppda = node->params[np-4].p; + qpda = node->params[np-3].p; + d = (np-4); + } + + for (i=0; i < d; i++) + { + pda = node->params[i].p; + buf = pda->bufPtr; + suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector); + len = pda->numSector; + coeff = rf_RaidAddressToStripeUnitID(layoutPtr,pda->raidAddress); + /* compute the data unit offset within the column */ + coeff = (coeff % raidPtr->Layout.numDataCol); + /* see if pda intersects a recovery pda */ + applyPDA(raidPtr,pda,ppda,qpda,node->dagHdr->bp); + if (two) + applyPDA(raidPtr,pda,ppda,qpda,node->dagHdr->bp); + } + + /* ok, we got the parity back to the point where we can recover. + We now need to determine the coeff of the columns that need to be + recovered. We can also only need to recover a single stripe unit. + */ + + if (asmap->failedPDAs[1] == NULL) + { /* only a single stripe unit to recover. */ + pda = asmap->failedPDAs[0]; + sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress); + /* need to determine the column of the other failed disk */ + coeff = rf_RaidAddressToStripeUnitID(layoutPtr,pda->raidAddress); + /* compute the data unit offset within the column */ + coeff = (coeff % raidPtr->Layout.numDataCol); + for (i=0; i < numDataCol; i++) + { + npda.raidAddress = sosAddr + (i * secPerSU); + (raidPtr->Layout.map->MapSector)(raidPtr,npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0); + /* skip over dead disks */ + if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status)) + if (i != coeff) break; + } + RF_ASSERT (i < numDataCol); + RF_ASSERT (two==0); + /* recover the data. Since we need only want to recover one column, we overwrite the + parity with the other one. */ + if (coeff < i) /* recovering 'a' */ + rf_PQ_recover((unsigned long *)ppda->bufPtr,(unsigned long *)qpda->bufPtr,(unsigned long *)pda->bufPtr,(unsigned long *)ppda->bufPtr,rf_RaidAddressToByte(raidPtr,pda->numSector), coeff, i); + else /* recovering 'b' */ + rf_PQ_recover((unsigned long *)ppda->bufPtr,(unsigned long *)qpda->bufPtr,(unsigned long *)ppda->bufPtr,(unsigned long *)pda->bufPtr,rf_RaidAddressToByte(raidPtr,pda->numSector), i, coeff); + } + else + RF_PANIC(); + + RF_ETIMER_STOP(timer); + RF_ETIMER_EVAL(timer); + if (tracerec) + tracerec->q_us += RF_ETIMER_VAL_US(timer); + rf_GenericWakeupFunc(node,0); + return(0); +} + +int rf_PQWriteDoubleRecoveryFunc(node) + RF_DagNode_t *node; +{ + /* The situation: + + We are doing a write that hits only one + failed data unit. + The other failed data unit is not being overwritten, so + we need to generate it. 
+ + For the moment, we assume all the nonfailed data being + written is in the shadow of the failed data unit. + (i.e,, either a single data unit write or the entire + failed stripe unit is being overwritten. ) + + Recovery strategy: + apply the recovery data to the parity and q. + Use P & Q to recover the second failed data unit in P. + Zero fill Q, then apply the recovered data to p. + Then apply the data being written to the failed drive. + Then walk through the surviving drives, applying new data + when it exists, othewise the recovery data. Quite a mess. + + + The params + + read pda0, read pda1, ... read pda (numDataCol-3), + write pda0, ... , write pda (numStripeUnitAccess - numDataFailed), + failed pda, raidPtr, asmap + */ + + int np = node->numParams; + RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np-1].p; + RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np-2].p; + RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &(raidPtr->Layout); + int i; + RF_RaidAddr_t sosAddr; + unsigned coeff; + RF_StripeCount_t secPerSU = layoutPtr->sectorsPerStripeUnit; + RF_PhysDiskAddr_t *ppda,*qpda,*pda,npda; + int numDataCol = layoutPtr->numDataCol; + RF_Etimer_t timer; + RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; + + RF_ASSERT(node->numResults == 2); + RF_ASSERT(asmap->failedPDAs[1] == NULL); + RF_ETIMER_START(timer); + ppda = node->results[0]; + qpda = node->results[1]; + /* apply the recovery data */ + for (i=0; i < numDataCol-2; i++) + applyPDA(raidPtr,node->params[i].p,ppda,qpda, node->dagHdr->bp); + + /* determine the other failed data unit */ + pda = asmap->failedPDAs[0]; + sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress); + /* need to determine the column of the other failed disk */ + coeff = rf_RaidAddressToStripeUnitID(layoutPtr,pda->raidAddress); + /* compute the data unit offset within the column */ + coeff = (coeff % raidPtr->Layout.numDataCol); + for (i=0; i < numDataCol; i++) + { + npda.raidAddress = sosAddr + (i * secPerSU); + (raidPtr->Layout.map->MapSector)(raidPtr,npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0); + /* skip over dead disks */ + if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status)) + if (i != coeff) break; + } + RF_ASSERT (i < numDataCol); + /* recover the data. The column we want to recover we write over the parity. + The column we don't care about we dump in q. */ + if (coeff < i) /* recovering 'a' */ + rf_PQ_recover((unsigned long *)ppda->bufPtr,(unsigned long *)qpda->bufPtr,(unsigned long *)ppda->bufPtr,(unsigned long *)qpda->bufPtr,rf_RaidAddressToByte(raidPtr,pda->numSector), coeff, i); + else /* recovering 'b' */ + rf_PQ_recover((unsigned long *)ppda->bufPtr,(unsigned long *)qpda->bufPtr,(unsigned long *)qpda->bufPtr,(unsigned long *)ppda->bufPtr,rf_RaidAddressToByte(raidPtr,pda->numSector), i, coeff); + + /* OK. The valid data is in P. Zero fill Q, then inc it into it. */ + bzero(qpda->bufPtr,rf_RaidAddressToByte(raidPtr,qpda->numSector)); + rf_IncQ((unsigned long *)qpda->bufPtr,(unsigned long *)ppda->bufPtr,rf_RaidAddressToByte(raidPtr,qpda->numSector),i); + + /* now apply all the write data to the buffer */ + /* single stripe unit write case: the failed data is only thing we are writing. 
*/ + RF_ASSERT(asmap->numStripeUnitsAccessed == 1); + /* dest, src, len, coeff */ + rf_IncQ((unsigned long *)qpda->bufPtr,(unsigned long *)asmap->failedPDAs[0]->bufPtr,rf_RaidAddressToByte(raidPtr,qpda->numSector),coeff); + rf_bxor(asmap->failedPDAs[0]->bufPtr,ppda->bufPtr,rf_RaidAddressToByte(raidPtr,ppda->numSector),node->dagHdr->bp); + + /* now apply all the recovery data */ + for (i=0; i < numDataCol-2; i++) + applyPDA(raidPtr,node->params[i].p,ppda,qpda, node->dagHdr->bp); + + RF_ETIMER_STOP(timer); + RF_ETIMER_EVAL(timer); + if (tracerec) + tracerec->q_us += RF_ETIMER_VAL_US(timer); + + rf_GenericWakeupFunc(node,0); + return(0); +} + +RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDLargeWrite) +{ + RF_PANIC(); +} + +/* + Two lost data unit write case. + + There are really two cases here: + + (1) The write completely covers the two lost data units. + In that case, a reconstruct write that doesn't write the + failed data units will do the correct thing. So in this case, + the dag looks like + + full stripe read of surviving data units (not being overwriten) + write new data (ignoring failed units) compute P&Q + write P&Q + + + (2) The write does not completely cover both failed data units + (but touches at least one of them). Then we need to do the + equivalent of a reconstruct read to recover the missing data + unit from the other stripe. + + For any data we are writing that is not in the "shadow" + of the failed units, we need to do a four cycle update. + PANIC on this case. for now + +*/ + +RF_CREATE_DAG_FUNC_DECL(rf_PQ_200_CreateWriteDAG) +{ + RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); + RF_SectorCount_t sectorsPerSU = layoutPtr->sectorsPerStripeUnit; + int sum; + int nf = asmap->numDataFailed; + + sum = asmap->failedPDAs[0]->numSector; + if (nf == 2) + sum += asmap->failedPDAs[1]->numSector; + + if ((nf == 2) && ( sum == (2*sectorsPerSU))) + { + /* large write case */ + rf_PQ_DDLargeWrite(raidPtr, asmap, dag_h, bp, flags, allocList); + return; + } + + + if ((nf == asmap->numStripeUnitsAccessed) || (sum >= sectorsPerSU)) + { + /* small write case, no user data not in shadow */ + rf_PQ_DDSimpleSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList); + return; + } + RF_PANIC(); +} + +RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDSimpleSmallWrite) +{ + rf_DoubleDegSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList, "Rq", "Wq", "PQ Recovery", rf_PQWriteDoubleRecoveryFunc); +} + +#endif /* (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) */ diff --git a/sys/dev/raidframe/rf_pqdegdags.h b/sys/dev/raidframe/rf_pqdegdags.h new file mode 100644 index 00000000000..e860ffe0183 --- /dev/null +++ b/sys/dev/raidframe/rf_pqdegdags.h @@ -0,0 +1,77 @@ +/* $OpenBSD: rf_pqdegdags.h,v 1.1 1999/01/11 14:29:40 niklas Exp $ */ +/* $NetBSD: rf_pqdegdags.h,v 1.1 1998/11/13 04:20:32 oster Exp $ */ +/* + * rf_pqdegdags.h + */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Daniel Stodolsky + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 
+ * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ +/* + * rf_pqdegdags.c + * Degraded mode dags for double fault cases. + */ +/* + * : + * Log: rf_pqdegdags.h,v + * Revision 1.6 1996/07/31 15:35:20 jimz + * evenodd changes; bugfixes for double-degraded archs, generalize + * some formerly PQ-only functions + * + * Revision 1.5 1996/06/02 17:31:48 jimz + * Moved a lot of global stuff into array structure, where it belongs. + * Fixed up paritylogging, pss modules in this manner. Some general + * code cleanup. Removed lots of dead code, some dead files. + * + * Revision 1.4 1996/05/31 22:26:54 jimz + * fix a lot of mapping problems, memory allocation problems + * found some weird lock issues, fixed 'em + * more code cleanup + * + * Revision 1.3 1996/05/24 04:28:55 jimz + * release cleanup ckpt + * + * Revision 1.2 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.1 1996/05/18 19:56:30 jimz + * Initial revision + * + */ + +#ifndef _RF__RF_PQDEGDAGS_H_ +#define _RF__RF_PQDEGDAGS_H_ + +#include "rf_dag.h" + +RF_CREATE_DAG_FUNC_DECL(rf_PQ_DoubleDegRead); +int rf_PQDoubleRecoveryFunc(RF_DagNode_t *node); +int rf_PQWriteDoubleRecoveryFunc(RF_DagNode_t *node); +RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDLargeWrite); +RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDSimpleSmallWrite); +RF_CREATE_DAG_FUNC_DECL(rf_PQ_200_CreateWriteDAG); + +#endif /* !_RF__RF_PQDEGDAGS_H_ */ diff --git a/sys/dev/raidframe/rf_psstatus.c b/sys/dev/raidframe/rf_psstatus.c new file mode 100644 index 00000000000..a1396d150bd --- /dev/null +++ b/sys/dev/raidframe/rf_psstatus.c @@ -0,0 +1,417 @@ +/* $OpenBSD: rf_psstatus.c,v 1.1 1999/01/11 14:29:40 niklas Exp $ */ +/* $NetBSD: rf_psstatus.c,v 1.1 1998/11/13 04:20:32 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/***************************************************************************** + * + * psstatus.c + * + * The reconstruction code maintains a bunch of status related to the parity + * stripes that are currently under reconstruction. This header file defines + * the status structures. 
+ * + *****************************************************************************/ + +/* : + * Log: rf_psstatus.c,v + * Revision 1.29 1996/07/27 23:36:08 jimz + * Solaris port of simulator + * + * Revision 1.28 1996/07/15 17:22:18 jimz + * nit-pick code cleanup + * resolve stdlib problems on DEC OSF + * + * Revision 1.27 1996/07/13 00:00:59 jimz + * sanitized generalized reconstruction architecture + * cleaned up head sep, rbuf problems + * + * Revision 1.26 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). + * + * Revision 1.25 1996/06/07 22:26:27 jimz + * type-ify which_ru (RF_ReconUnitNum_t) + * + * Revision 1.24 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.23 1996/06/05 18:06:02 jimz + * Major code cleanup. The Great Renaming is now done. + * Better modularity. Better typing. Fixed a bunch of + * synchronization bugs. Made a lot of global stuff + * per-desc or per-array. Removed dead code. + * + * Revision 1.22 1996/06/02 17:31:48 jimz + * Moved a lot of global stuff into array structure, where it belongs. + * Fixed up paritylogging, pss modules in this manner. Some general + * code cleanup. Removed lots of dead code, some dead files. + * + * Revision 1.21 1996/05/30 23:22:16 jimz + * bugfixes of serialization, timing problems + * more cleanup + * + * Revision 1.20 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.19 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.18 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.17 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.16 1996/05/20 16:15:27 jimz + * switch to rf_{mutex,cond}_{init,destroy} + * + * Revision 1.15 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.14 1995/12/12 18:10:06 jimz + * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT + * fix 80-column brain damage in comments + * + * Revision 1.13 1995/11/30 16:17:18 wvcii + * added copyright info + * + */ + +#include "rf_types.h" +#include "rf_raid.h" +#include "rf_threadid.h" +#include "rf_general.h" +#include "rf_debugprint.h" +#include "rf_freelist.h" +#include "rf_psstatus.h" +#include "rf_shutdown.h" +#include "rf_sys.h" + +#define Dprintf1(s,a) if (rf_pssDebug) rf_debug_printf(s,(void *)((unsigned long)a),NULL,NULL,NULL,NULL,NULL,NULL,NULL) +#define Dprintf2(s,a,b) if (rf_pssDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),NULL,NULL,NULL,NULL,NULL,NULL) +#define Dprintf3(s,a,b,c) if (rf_pssDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),NULL,NULL,NULL,NULL,NULL) + +static void RealPrintPSStatusTable(RF_Raid_t *raidPtr, + RF_PSStatusHeader_t *pssTable); + +#define RF_MAX_FREE_PSS 32 +#define RF_PSS_INC 8 +#define RF_PSS_INITIAL 4 + +static int init_pss( RF_ReconParityStripeStatus_t *, RF_Raid_t *); +static void clean_pss(RF_ReconParityStripeStatus_t *, 
RF_Raid_t *); +static void rf_ShutdownPSStatus(void *); + +static int init_pss(p, raidPtr) + RF_ReconParityStripeStatus_t *p; + RF_Raid_t *raidPtr; +{ + RF_Calloc(p->issued, raidPtr->numCol, sizeof(char), (char *)); + if (p->issued == NULL) + return(ENOMEM); + return(0); +} + +static void clean_pss(p, raidPtr) + RF_ReconParityStripeStatus_t *p; + RF_Raid_t *raidPtr; +{ + RF_Free(p->issued, raidPtr->numCol*sizeof(char)); +} + +static void rf_ShutdownPSStatus(arg) + void *arg; +{ + RF_Raid_t *raidPtr = (RF_Raid_t *)arg; + + RF_FREELIST_DESTROY_CLEAN_ARG(raidPtr->pss_freelist,next,(RF_ReconParityStripeStatus_t *),clean_pss,raidPtr); +} + +int rf_ConfigurePSStatus( + RF_ShutdownList_t **listp, + RF_Raid_t *raidPtr, + RF_Config_t *cfgPtr) +{ + int rc; + + raidPtr->pssTableSize = RF_PSS_DEFAULT_TABLESIZE; + RF_FREELIST_CREATE(raidPtr->pss_freelist, RF_MAX_FREE_PSS, + RF_PSS_INC, sizeof(RF_ReconParityStripeStatus_t)); + if (raidPtr->pss_freelist == NULL) + return(ENOMEM); + rc = rf_ShutdownCreate(listp, rf_ShutdownPSStatus, raidPtr); + if (rc) { + RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n", + __FILE__, __LINE__, rc); + rf_ShutdownPSStatus(raidPtr); + return(rc); + } + RF_FREELIST_PRIME_INIT_ARG(raidPtr->pss_freelist, RF_PSS_INITIAL,next, + (RF_ReconParityStripeStatus_t *),init_pss,raidPtr); + return(0); +} + +/***************************************************************************************** + * sets up the pss table + * We pre-allocate a bunch of entries to avoid as much as possible having to + * malloc up hash chain entries. + ****************************************************************************************/ +RF_PSStatusHeader_t *rf_MakeParityStripeStatusTable(raidPtr) + RF_Raid_t *raidPtr; +{ + RF_PSStatusHeader_t *pssTable; + int i, j, rc; + + RF_Calloc(pssTable, raidPtr->pssTableSize, sizeof(RF_PSStatusHeader_t), (RF_PSStatusHeader_t *)); + for (i=0; i<raidPtr->pssTableSize; i++) { + rc = rf_mutex_init(&pssTable[i].mutex); + if (rc) { + RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__, + __LINE__, rc); + /* fail and deallocate */ + for(j=0;j<i;j++) { + rf_mutex_destroy(&pssTable[i].mutex); + } + RF_Free(pssTable, raidPtr->pssTableSize*sizeof(RF_PSStatusHeader_t)); + return(NULL); + } + } + return(pssTable); +} + +void rf_FreeParityStripeStatusTable(raidPtr, pssTable) + RF_Raid_t *raidPtr; + RF_PSStatusHeader_t *pssTable; +{ + int i; + + if (rf_pssDebug) + RealPrintPSStatusTable(raidPtr, pssTable); + for (i=0; i<raidPtr->pssTableSize; i++) { + if (pssTable[i].chain) { + printf("ERROR: pss hash chain not null at recon shutdown\n"); + } + rf_mutex_destroy(&pssTable[i].mutex); + } + RF_Free(pssTable, raidPtr->pssTableSize * sizeof(RF_PSStatusHeader_t)); +} + + +/* looks up the status structure for a parity stripe. 
+ * + * if the create_flag is on, creates and returns the status structure if it doesn't exist + * otherwise returns NULL if the status structure does not exist + * + * ASSUMES THE PSS DESCRIPTOR IS LOCKED UPON ENTRY + */ +RF_ReconParityStripeStatus_t *rf_LookupRUStatus( + RF_Raid_t *raidPtr, + RF_PSStatusHeader_t *pssTable, + RF_StripeNum_t psID, + RF_ReconUnitNum_t which_ru, + RF_PSSFlags_t flags, /* whether or not to create it if it doesn't exist + what flags to set initially */ + int *created) +{ + RF_PSStatusHeader_t *hdr = &pssTable[ RF_HASH_PSID(raidPtr,psID) ]; + RF_ReconParityStripeStatus_t *p, *pssPtr = hdr->chain; + + *created = 0; + for (p = pssPtr; p; p=p->next) { + if (p->parityStripeID == psID && p->which_ru == which_ru) + break; + } + + if (!p && (flags&RF_PSS_CREATE)) { + Dprintf2("PSS: creating pss for psid %ld ru %d\n",psID,which_ru); + p = rf_AllocPSStatus(raidPtr); + p->next = hdr->chain; hdr->chain = p; + + p->parityStripeID = psID; + p->which_ru = which_ru; + p->flags = flags; + p->rbuf = NULL; + p->writeRbuf = NULL; + p->blockCount = 0; + p->procWaitList = NULL; + p->blockWaitList = NULL; + p->bufWaitList = NULL; + *created = 1; + } else if (p) { /* we didn't create, but we want to specify some new status */ + p->flags |= flags; /* add in whatever flags we're specifying */ + } + if (p && (flags & RF_PSS_RECON_BLOCKED)) { + int tid; + rf_get_threadid(tid); + p->blockCount++; /* if we're asking to block recon, bump the count */ + Dprintf3("[%d] Blocked recon on psid %ld. count now %d\n",tid,psID,p->blockCount); + } + return(p); +} + +/* deletes an entry from the parity stripe status table. typically used + * when an entry has been allocated solely to block reconstruction, and + * no recon was requested while recon was blocked. Assumes the hash + * chain is ALREADY LOCKED. 
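The lookup/create protocol above is easiest to see from a caller's point of view. The fragment below is a sketch only, not part of the imported sources: example_block_recon is a hypothetical helper that holds reconstruction off one parity stripe and then releases it, using only rf_LookupRUStatus, rf_PSStatusDelete, and the RF_LOCK_PSS_MUTEX/RF_UNLOCK_PSS_MUTEX macros and RF_PSS_* flags declared in rf_psstatus.h further down in this commit. Error handling and interaction with the real reconstruction paths are omitted.

#include "rf_raid.h"
#include "rf_psstatus.h"

/*
 * Sketch (illustrative, not from the RAIDframe sources): block
 * reconstruction on the parity stripe containing psid, do some work,
 * then release the block again.
 */
static void
example_block_recon(RF_Raid_t *raidPtr, RF_RowCol_t row,
    RF_StripeNum_t psid, RF_ReconUnitNum_t which_ru)
{
        RF_PSStatusHeader_t *pssTable = raidPtr->reconControl[row]->pssTable;
        RF_ReconParityStripeStatus_t *pssPtr;
        int created;

        /* one mutex covers the whole hash chain this psid lands on */
        RF_LOCK_PSS_MUTEX(raidPtr, row, psid);
        /* create the entry if needed; RF_PSS_RECON_BLOCKED bumps blockCount */
        pssPtr = rf_LookupRUStatus(raidPtr, pssTable, psid, which_ru,
            RF_PSS_CREATE | RF_PSS_RECON_BLOCKED, &created);
        RF_UNLOCK_PSS_MUTEX(raidPtr, row, psid);

        /* ... user I/O proceeds with recon held off this stripe ... */

        RF_LOCK_PSS_MUTEX(raidPtr, row, psid);
        pssPtr->blockCount--;
        /* if the entry existed only to block recon, discard it again */
        if (created && pssPtr->blockCount == 0 &&
            !(pssPtr->flags & RF_PSS_UNDER_RECON))
                rf_PSStatusDelete(raidPtr, pssTable, pssPtr);
        RF_UNLOCK_PSS_MUTEX(raidPtr, row, psid);
}

Using one mutex per hash chain rather than per entry matches the note in rf_psstatus.h below and keeps the common lookup path to a single lock/unlock pair.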
+ */ +void rf_PSStatusDelete(raidPtr, pssTable, pssPtr) + RF_Raid_t *raidPtr; + RF_PSStatusHeader_t *pssTable; + RF_ReconParityStripeStatus_t *pssPtr; +{ + RF_PSStatusHeader_t *hdr = &(pssTable[ RF_HASH_PSID(raidPtr,pssPtr->parityStripeID) ] ); + RF_ReconParityStripeStatus_t *p = hdr->chain, *pt = NULL; + + while (p) { + if (p == pssPtr) { + if (pt) pt->next = p->next; else hdr->chain = p->next; + p->next = NULL; + rf_FreePSStatus(raidPtr, p); + return; + } + pt = p; p=p->next; + } + RF_ASSERT(0); /* we must find it here */ +} + +/* deletes an entry from the ps status table after reconstruction has completed */ +void rf_RemoveFromActiveReconTable(raidPtr, row, psid, which_ru) + RF_Raid_t *raidPtr; + RF_RowCol_t row; + RF_ReconUnitNum_t which_ru; + RF_StripeNum_t psid; +{ + RF_PSStatusHeader_t *hdr = &(raidPtr->reconControl[row]->pssTable[ RF_HASH_PSID(raidPtr,psid) ]); + RF_ReconParityStripeStatus_t *p, *pt; + RF_CallbackDesc_t *cb, *cb1; + + RF_LOCK_MUTEX( hdr->mutex ); + for (pt=NULL, p = hdr->chain; p; pt=p,p=p->next) { + if ((p->parityStripeID == psid) && (p->which_ru == which_ru)) + break; + } + if (p == NULL) { + rf_PrintPSStatusTable(raidPtr, row); + } + RF_ASSERT(p); /* it must be there */ + + Dprintf2("PSS: deleting pss for psid %ld ru %d\n",psid,which_ru); + + /* delete this entry from the hash chain */ + if (pt) pt->next = p->next; + else hdr->chain = p->next; + p->next = NULL; + + RF_UNLOCK_MUTEX( hdr->mutex ); + + /* wakup anyone waiting on the parity stripe ID */ + cb = p->procWaitList; + p->procWaitList = NULL; + while (cb) { + Dprintf1("Waking up access waiting on parity stripe ID %ld\n",p->parityStripeID); + cb1 = cb->next; + (cb->callbackFunc)(cb->callbackArg); + + /* THIS IS WHAT THE ORIGINAL CODE HAD... the extra 0 is bogus, IMHO */ + /* (cb->callbackFunc)(cb->callbackArg, 0); */ + rf_FreeCallbackDesc(cb); + cb = cb1; + } + + rf_FreePSStatus(raidPtr, p); +} + +RF_ReconParityStripeStatus_t *rf_AllocPSStatus(raidPtr) + RF_Raid_t *raidPtr; +{ + RF_ReconParityStripeStatus_t *p; + + RF_FREELIST_GET_INIT_ARG(raidPtr->pss_freelist,p,next,(RF_ReconParityStripeStatus_t *),init_pss,raidPtr); + if (p) { + bzero(p->issued, raidPtr->numCol); + } + p->next = NULL; + /* no need to initialize here b/c the only place we're called from is the above Lookup */ + return(p); +} + +void rf_FreePSStatus(raidPtr, p) + RF_Raid_t *raidPtr; + RF_ReconParityStripeStatus_t *p; +{ + RF_ASSERT(p->procWaitList == NULL); + RF_ASSERT(p->blockWaitList == NULL); + RF_ASSERT(p->bufWaitList == NULL); + + RF_FREELIST_FREE_CLEAN_ARG(raidPtr->pss_freelist,p,next,clean_pss,raidPtr); +} + +static void RealPrintPSStatusTable(raidPtr, pssTable) + RF_Raid_t *raidPtr; + RF_PSStatusHeader_t *pssTable; +{ + int i, j, procsWaiting, blocksWaiting, bufsWaiting; + RF_ReconParityStripeStatus_t *p; + RF_CallbackDesc_t *cb; + + printf("\nParity Stripe Status Table\n"); + for (i=0; i< raidPtr->pssTableSize; i++) { + for (p = pssTable[i].chain; p; p=p->next) { + procsWaiting = blocksWaiting = bufsWaiting = 0; + for (cb = p->procWaitList; cb; cb=cb->next) procsWaiting++; + for (cb = p->blockWaitList; cb; cb=cb->next) blocksWaiting++; + for (cb = p->bufWaitList; cb; cb=cb->next) bufsWaiting++; + printf("PSID %ld RU %d : blockCount %d %d/%d/%d proc/block/buf waiting, issued ", + (long)p->parityStripeID, p->which_ru, p->blockCount, procsWaiting, blocksWaiting, bufsWaiting); + for (j=0;j<raidPtr->numCol; j++) printf("%c", (p->issued[j]) ? 
'1' : '0'); + if (!p->flags) printf(" flags: (none)"); + else { + if (p->flags & RF_PSS_UNDER_RECON) printf(" under-recon"); + if (p->flags & RF_PSS_FORCED_ON_WRITE) printf(" forced-w"); + if (p->flags & RF_PSS_FORCED_ON_READ) printf(" forced-r"); + if (p->flags & RF_PSS_RECON_BLOCKED) printf(" blocked"); + if (p->flags & RF_PSS_BUFFERWAIT) printf(" bufwait"); + } + printf("\n"); + } + } +} + +void rf_PrintPSStatusTable(raidPtr, row) + RF_Raid_t *raidPtr; + RF_RowCol_t row; +{ + RF_PSStatusHeader_t *pssTable = raidPtr->reconControl[row]->pssTable; + RealPrintPSStatusTable(raidPtr, pssTable); +} diff --git a/sys/dev/raidframe/rf_psstatus.h b/sys/dev/raidframe/rf_psstatus.h new file mode 100644 index 00000000000..eaca5822094 --- /dev/null +++ b/sys/dev/raidframe/rf_psstatus.h @@ -0,0 +1,154 @@ +/* $OpenBSD: rf_psstatus.h,v 1.1 1999/01/11 14:29:41 niklas Exp $ */ +/* $NetBSD: rf_psstatus.h,v 1.1 1998/11/13 04:20:32 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/***************************************************************************** + * + * psstatus.h + * + * The reconstruction code maintains a bunch of status related to the parity + * stripes that are currently under reconstruction. This header file defines + * the status structures. + * + *****************************************************************************/ + +/* : + * Log: rf_psstatus.h,v + * Revision 1.16 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). + * + * Revision 1.15 1996/06/07 22:26:27 jimz + * type-ify which_ru (RF_ReconUnitNum_t) + * + * Revision 1.14 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.13 1996/06/05 18:06:02 jimz + * Major code cleanup. The Great Renaming is now done. + * Better modularity. Better typing. Fixed a bunch of + * synchronization bugs. Made a lot of global stuff + * per-desc or per-array. Removed dead code. + * + * Revision 1.12 1996/06/02 17:31:48 jimz + * Moved a lot of global stuff into array structure, where it belongs. + * Fixed up paritylogging, pss modules in this manner. Some general + * code cleanup. Removed lots of dead code, some dead files. 
+ * + * Revision 1.11 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.10 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.9 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.8 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.7 1995/11/30 16:17:28 wvcii + * added copyright info + * + */ + +#ifndef _RF__RF_PSSTATUS_H_ +#define _RF__RF_PSSTATUS_H_ + +#include "rf_types.h" +#include "rf_threadstuff.h" +#include "rf_callback.h" + +#define RF_PS_MAX_BUFS 10 /* max number of bufs we'll accumulate before we do an XOR */ + +#define RF_PSS_DEFAULT_TABLESIZE 200 + +/* + * Macros to acquire/release the mutex lock on a parity stripe status + * descriptor. Note that we use just one lock for the whole hash chain. + */ +#define RF_HASH_PSID(_raid_,_psid_) ( (_psid_) % ((_raid_)->pssTableSize) ) /* simple hash function */ +#define RF_LOCK_PSS_MUTEX(_raidPtr, _row, _psid) \ + RF_LOCK_MUTEX((_raidPtr)->reconControl[_row]->pssTable[ RF_HASH_PSID(_raidPtr,_psid) ].mutex) +#define RF_UNLOCK_PSS_MUTEX(_raidPtr, _row, _psid) \ + RF_UNLOCK_MUTEX((_raidPtr)->reconControl[_row]->pssTable[ RF_HASH_PSID(_raidPtr,_psid) ].mutex) + +struct RF_ReconParityStripeStatus_s { + RF_StripeNum_t parityStripeID; /* the parity stripe ID */ + RF_ReconUnitNum_t which_ru; /* which reconstruction unit with the indicated parity stripe */ + RF_PSSFlags_t flags; /* flags indicating various conditions */ + void *rbuf; /* this is the accumulating xor sum */ + void *writeRbuf; /* DEBUG ONLY: a pointer to the rbuf after it has filled & been sent to disk */ + void *rbufsForXor[RF_PS_MAX_BUFS]; /* these are buffers still to be xored into the accumulating sum */ + int xorBufCount; /* num buffers waiting to be xored */ + int blockCount; /* count of # proc that have blocked recon on this parity stripe */ + char *issued; /* issued[i]==1 <=> column i has already issued a read request for the indicated RU */ + RF_CallbackDesc_t *procWaitList; /* list of user procs waiting for recon to be done */ + RF_CallbackDesc_t *blockWaitList;/* list of disks blocked waiting for user write to complete */ + RF_CallbackDesc_t *bufWaitList; /* list of disks blocked waiting to acquire a buffer for this RU */ + RF_ReconParityStripeStatus_t *next; +}; + +struct RF_PSStatusHeader_s { + RF_DECLARE_MUTEX(mutex) /* mutex for this hash chain */ + RF_ReconParityStripeStatus_t *chain; /* the hash chain */ +}; + +/* masks for the "flags" field above */ +#define RF_PSS_NONE 0x00000000 /* no flags */ +#define RF_PSS_UNDER_RECON 0x00000001 /* this parity stripe is currently under reconstruction */ +#define RF_PSS_FORCED_ON_WRITE 0x00000002 /* indicates a recon was forced due to a user-write operation */ +#define RF_PSS_FORCED_ON_READ 0x00000004 /* ditto for read, but not currently implemented */ +#define RF_PSS_RECON_BLOCKED 0x00000008 /* reconstruction is currently blocked due to a pending user I/O */ +#define RF_PSS_CREATE 0x00000010 /* tells LookupRUStatus to create the entry */ +#define RF_PSS_BUFFERWAIT 0x00000020 /* someone is waiting for a buffer for this RU */ + +int rf_ConfigurePSStatus(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr, + RF_Config_t *cfgPtr); + +RF_PSStatusHeader_t 
*rf_MakeParityStripeStatusTable(RF_Raid_t *raidPtr); +void rf_FreeParityStripeStatusTable(RF_Raid_t *raidPtr, + RF_PSStatusHeader_t *pssTable); +RF_ReconParityStripeStatus_t *rf_LookupRUStatus(RF_Raid_t *raidPtr, + RF_PSStatusHeader_t *pssTable, RF_StripeNum_t psID, + RF_ReconUnitNum_t which_ru, RF_PSSFlags_t flags, int *created); +void rf_PSStatusDelete(RF_Raid_t *raidPtr, RF_PSStatusHeader_t *pssTable, + RF_ReconParityStripeStatus_t *pssPtr); +void rf_RemoveFromActiveReconTable(RF_Raid_t *raidPtr, RF_RowCol_t row, + RF_StripeNum_t psid, RF_ReconUnitNum_t which_ru); +RF_ReconParityStripeStatus_t *rf_AllocPSStatus(RF_Raid_t *raidPtr); +void rf_FreePSStatus(RF_Raid_t *raidPtr, RF_ReconParityStripeStatus_t *p); +void rf_PrintPSStatusTable(RF_Raid_t *raidPtr, RF_RowCol_t row); + +#endif /* !_RF__RF_PSSTATUS_H_ */ diff --git a/sys/dev/raidframe/rf_raid.h b/sys/dev/raidframe/rf_raid.h new file mode 100644 index 00000000000..278cc9f507a --- /dev/null +++ b/sys/dev/raidframe/rf_raid.h @@ -0,0 +1,437 @@ +/* $OpenBSD: rf_raid.h,v 1.1 1999/01/11 14:29:41 niklas Exp $ */ +/* $NetBSD: rf_raid.h,v 1.1 1998/11/13 04:20:32 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/********************************************** + * rf_raid.h -- main header file for RAID driver + **********************************************/ + +/* + * : + * Log: rf_raid.h,v + * Revision 1.48 1996/08/20 22:33:54 jimz + * make hist_diskreq a doubly-indexed array + * + * Revision 1.47 1996/07/15 05:40:41 jimz + * some recon datastructure cleanup + * better handling of multiple failures + * added undocumented double-recon test + * + * Revision 1.46 1996/07/10 22:28:51 jimz + * get rid of obsolete row statuses (dead,degraded2) + * + * Revision 1.45 1996/06/14 14:56:29 jimz + * make engine threading stuff ifndef SIMULATE + * + * Revision 1.44 1996/06/14 14:16:54 jimz + * move in engine node queue, atomicity control + * + * Revision 1.43 1996/06/12 04:41:26 jimz + * tweaks to make genplot work with user-level driver + * (mainly change stat collection) + * + * Revision 1.42 1996/06/11 10:57:17 jimz + * add recon_done_procs, recon_done_proc_mutex + * + * Revision 1.41 1996/06/11 01:26:48 jimz + * added mechanism for user-level to sync diskthread startup, + * shutdown + * + * Revision 1.40 1996/06/10 14:18:58 jimz + * move user, throughput stats into per-array structure + * + * Revision 1.39 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. 
Removed + * layout shutdown function (now subsumed by shutdown lists). + * + * Revision 1.38 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.37 1996/06/05 19:38:32 jimz + * fixed up disk queueing types config + * added sstf disk queueing + * fixed exit bug on diskthreads (ref-ing bad mem) + * + * Revision 1.36 1996/06/05 18:06:02 jimz + * Major code cleanup. The Great Renaming is now done. + * Better modularity. Better typing. Fixed a bunch of + * synchronization bugs. Made a lot of global stuff + * per-desc or per-array. Removed dead code. + * + * Revision 1.35 1996/06/03 23:28:26 jimz + * more bugfixes + * check in tree to sync for IPDS runs with current bugfixes + * there still may be a problem with threads in the script test + * getting I/Os stuck- not trivially reproducible (runs ~50 times + * in a row without getting stuck) + * + * Revision 1.34 1996/06/02 17:31:48 jimz + * Moved a lot of global stuff into array structure, where it belongs. + * Fixed up paritylogging, pss modules in this manner. Some general + * code cleanup. Removed lots of dead code, some dead files. + * + * Revision 1.33 1996/05/30 23:22:16 jimz + * bugfixes of serialization, timing problems + * more cleanup + * + * Revision 1.32 1996/05/30 11:29:41 jimz + * Numerous bug fixes. Stripe lock release code disagreed with the taking code + * about when stripes should be locked (I made it consistent: no parity, no lock) + * There was a lot of extra serialization of I/Os which I've removed- a lot of + * it was to calculate values for the cache code, which is no longer with us. + * More types, function, macro cleanup. Added code to properly quiesce the array + * on shutdown. Made a lot of stuff array-specific which was (bogusly) general + * before. Fixed memory allocation, freeing bugs. 
+ * + * Revision 1.31 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.30 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.29 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.28 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.27 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.26 1996/05/08 21:01:24 jimz + * fixed up enum type names that were conflicting with other + * enums and function names (ie, "panic") + * future naming trends will be towards RF_ and rf_ for + * everything raidframe-related + * + * Revision 1.25 1996/05/02 14:57:55 jimz + * add sectorMask + * + * Revision 1.24 1996/04/22 15:53:13 jimz + * MAX_RAIDS -> NRAIDFRAME + * + * Revision 1.23 1995/12/14 18:39:46 jimz + * convert to rf_types.h types + * + * Revision 1.22 1995/12/06 15:02:26 root + * added copyright info + * + * Revision 1.21 1995/10/09 17:39:24 jimz + * added info for tracking number of outstanding accesses + * at user-level + * + * Revision 1.20 1995/09/30 20:37:46 jimz + * added acc_totals to Raid for kernel + * + * Revision 1.19 1995/09/19 22:57:14 jimz + * add cache of raidid for kernel + * + * Revision 1.18 1995/09/18 16:50:04 jimz + * added RF_MAX_DISKS (for config ioctls) + * + * Revision 1.17 1995/09/07 19:02:31 jimz + * mods to get raidframe to compile and link + * in kernel environment + * + * Revision 1.16 1995/07/21 19:29:51 robby + * added some info for the idler to the Raid + * + * Revision 1.15 1995/07/16 03:19:14 cfb + * added cachePtr to *raidPtr + * + * Revision 1.14 1995/06/23 13:39:36 robby + * updeated to prototypes in rf_layout.h + * + */ + +#ifndef _RF__RF_RAID_H_ +#define _RF__RF_RAID_H_ + +#ifdef _KERNEL +#define KERNEL +#endif + +#include "rf_archs.h" +#include "rf_types.h" +#include "rf_threadstuff.h" + +#ifdef _KERNEL +#if defined(__NetBSD__) +#include "rf_netbsd.h" +#elif defined(__OpenBSD__) +#include "rf_openbsd.h" +#endif +#endif + +#ifdef KERNEL +/* XXX Needs to be added. GO +#include <raidframe.h> +*/ +#include <sys/disklabel.h> +#else /* KERNEL */ +#include <stdio.h> +#include <assert.h> +#endif /* KERNEL */ +#include <sys/types.h> + +#include "rf_alloclist.h" +#include "rf_stripelocks.h" +#include "rf_layout.h" +#include "rf_disks.h" +#include "rf_debugMem.h" +#include "rf_diskqueue.h" +#include "rf_reconstruct.h" +#include "rf_acctrace.h" + +#if RF_INCLUDE_PARITYLOGGING > 0 +#include "rf_paritylog.h" +#endif /* RF_INCLUDE_PARITYLOGGING > 0 */ + +#define RF_MAX_DISKS 128 /* max disks per array */ +#if defined(__NetBSD__) || defined(__OpenBSD__) +#define RF_DEV2RAIDID(_dev) (DISKUNIT(_dev)) +#else +#define RF_DEV2RAIDID(_dev) (minor(_dev)>>6) /* convert dev_t to raid id */ +#endif + +/* + * Each row in the array is a distinct parity group, so + * each has it's own status, which is one of the following. 
+ */ +typedef enum RF_RowStatus_e { + rf_rs_optimal, + rf_rs_degraded, + rf_rs_reconstructing, + rf_rs_reconfigured +} RF_RowStatus_t; + +struct RF_CumulativeStats_s { + struct timeval start; /* the time when the stats were last started*/ + struct timeval stop; /* the time when the stats were last stopped */ + long sum_io_us; /* sum of all user response times (us) */ + long num_ios; /* total number of I/Os serviced */ + long num_sect_moved; /* total number of sectors read or written */ +}; + +struct RF_ThroughputStats_s { + RF_DECLARE_MUTEX(mutex)/* a mutex used to lock the configuration stuff */ + struct timeval start; /* timer started when numOutstandingRequests moves from 0 to 1 */ + struct timeval stop; /* timer stopped when numOutstandingRequests moves from 1 to 0 */ + RF_uint64 sum_io_us; /* total time timer is enabled */ + RF_uint64 num_ios; /* total number of ios processed by RAIDframe */ + long num_out_ios; /* number of outstanding ios */ +}; + +#ifdef SIMULATE +typedef struct RF_PendingRecon_s RF_PendingRecon_t; +struct RF_PendingRecon_s { + RF_RowCol_t row; + RF_RowCol_t col; + RF_PendingRecon_t *next; +}; +#endif /* SIMULATE */ + +struct RF_Raid_s { + /* This portion never changes, and can be accessed without locking */ + /* an exception is Disks[][].status, which requires locking when it is changed */ + u_int numRow; /* number of rows of disks, typically == # of ranks */ + u_int numCol; /* number of columns of disks, typically == # of disks/rank */ + u_int numSpare; /* number of spare disks */ + int maxQueueDepth; /* max disk queue depth */ + RF_SectorCount_t totalSectors; /* total number of sectors in the array */ + RF_SectorCount_t sectorsPerDisk; /* number of sectors on each disk */ + u_int logBytesPerSector; /* base-2 log of the number of bytes in a sector */ + u_int bytesPerSector; /* bytes in a sector */ + RF_int32 sectorMask; /* mask of bytes-per-sector */ + + RF_RaidLayout_t Layout; /* all information related to layout */ + RF_RaidDisk_t **Disks; /* all information related to physical disks */ + RF_DiskQueue_t **Queues; /* all information related to disk queues */ + /* NOTE: This is an anchor point via which the queues can be accessed, + * but the enqueue/dequeue routines in diskqueue.c use a local copy of + * this pointer for the actual accesses. 
+ */ + /* The remainder of the structure can change, and therefore requires locking on reads and updates */ + RF_DECLARE_MUTEX(mutex) /* mutex used to serialize access to the fields below */ + RF_RowStatus_t *status; /* the status of each row in the array */ + int valid; /* indicates successful configuration */ + RF_LockTableEntry_t *lockTable; /* stripe-lock table */ + RF_LockTableEntry_t *quiesceLock; /* quiesnce table */ + int numFailures; /* total number of failures in the array */ + + /* + * Cleanup stuff + */ + RF_ShutdownList_t *shutdownList; /* shutdown activities */ + RF_AllocListElem_t *cleanupList; /* memory to be freed at shutdown time */ + + /* + * Recon stuff + */ + RF_HeadSepLimit_t headSepLimit; + int numFloatingReconBufs; + int reconInProgress; +#ifdef SIMULATE + RF_PendingRecon_t *pendingRecon; +#endif /* SIMULATE */ + RF_DECLARE_COND(waitForReconCond) + RF_RaidReconDesc_t *reconDesc; /* reconstruction descriptor */ + RF_ReconCtrl_t **reconControl; /* reconstruction control structure pointers for each row in the array */ + +#if !defined(KERNEL) && !defined(SIMULATE) + /* + * Disk thread stuff + */ + int diskthreads_created; + int diskthreads_running; + int diskthreads_shutdown; + RF_DECLARE_MUTEX(diskthread_count_mutex) + RF_DECLARE_COND(diskthread_count_cond) +#endif /* !KERNEL && !SIMULATE */ + + /* + * Array-quiescence stuff + */ + RF_DECLARE_MUTEX(access_suspend_mutex) + RF_DECLARE_COND(quiescent_cond) + RF_IoCount_t accesses_suspended; + RF_IoCount_t accs_in_flight; + int access_suspend_release; + int waiting_for_quiescence; + RF_CallbackDesc_t *quiesce_wait_list; + + /* + * Statistics + */ +#if !defined(KERNEL) && !defined(SIMULATE) + RF_ThroughputStats_t throughputstats; +#endif /* !KERNEL && !SIMULATE */ + RF_CumulativeStats_t userstats; + + /* + * Engine thread control + */ + RF_DECLARE_MUTEX(node_queue_mutex) + RF_DECLARE_COND(node_queue_cond) + RF_DagNode_t *node_queue; +#ifndef SIMULATE + RF_Thread_t engine_thread; + RF_ThreadGroup_t engine_tg; +#endif /* !SIMULATE */ + int shutdown_engine; + int dags_in_flight; /* debug */ + + /* + * PSS (Parity Stripe Status) stuff + */ + RF_FreeList_t *pss_freelist; + long pssTableSize; + + /* + * Reconstruction stuff + */ + int procsInBufWait; + int numFullReconBuffers; + RF_AccTraceEntry_t *recon_tracerecs; + unsigned long accumXorTimeUs; + RF_ReconDoneProc_t *recon_done_procs; + RF_DECLARE_MUTEX(recon_done_proc_mutex) + +#if !defined(KERNEL) && !defined(SIMULATE) + RF_Thread_t **diskthreads, *sparediskthreads; /* thread descriptors for disk threads in user-level version */ +#endif /* !KERNEL && !SIMULATE */ + + /* + * nAccOutstanding, waitShutdown protected by desc freelist lock + * (This may seem strange, since that's a central serialization point + * for a per-array piece of data, but otherwise, it'd be an extra + * per-array lock, and that'd only be less efficient...) + */ + RF_DECLARE_COND(outstandingCond) + int waitShutdown; + int nAccOutstanding; + + RF_DiskId_t **diskids; + RF_DiskId_t *sparediskids; + +#ifdef KERNEL + int raidid; +#endif /* KERNEL */ + RF_AccTotals_t acc_totals; + int keep_acc_totals; + +#ifdef _KERNEL + struct raidcinfo **raid_cinfo; /* array of component info */ + struct proc *proc; /* XXX shouldn't be needed here.. 
:-p */ +#endif + + int terminate_disk_queues; + + /* + * XXX + * + * config-specific information should be moved + * somewhere else, or at least hung off this + * in some generic way + */ + + /* used by rf_compute_workload_shift */ + RF_RowCol_t hist_diskreq[RF_MAXROW][RF_MAXCOL]; + + /* used by declustering */ + int noRotate; + +#if RF_INCLUDE_PARITYLOGGING > 0 + /* used by parity logging */ + RF_SectorCount_t regionLogCapacity; + RF_ParityLogQueue_t parityLogPool; /* pool of unused parity logs */ + RF_RegionInfo_t *regionInfo; /* array of region state */ + int numParityLogs; + int numSectorsPerLog; + int regionParityRange; + int logsInUse; /* debugging */ + RF_ParityLogDiskQueue_t parityLogDiskQueue; /* state of parity logging disk work */ + RF_RegionBufferQueue_t regionBufferPool; /* buffers for holding region log */ + RF_RegionBufferQueue_t parityBufferPool; /* buffers for holding parity */ + caddr_t parityLogBufferHeap; /* pool of unused parity logs */ +#ifndef SIMULATE + RF_Thread_t pLogDiskThreadHandle; +#endif /* !SIMULATE */ + +#endif /* RF_INCLUDE_PARITYLOGGING > 0 */ +}; + +#endif /* !_RF__RF_RAID_H_ */ diff --git a/sys/dev/raidframe/rf_raid0.c b/sys/dev/raidframe/rf_raid0.c new file mode 100644 index 00000000000..c81068affd9 --- /dev/null +++ b/sys/dev/raidframe/rf_raid0.c @@ -0,0 +1,242 @@ +/* $OpenBSD: rf_raid0.c,v 1.1 1999/01/11 14:29:41 niklas Exp $ */ +/* $NetBSD: rf_raid0.c,v 1.1 1998/11/13 04:20:33 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/*************************************** + * + * rf_raid0.c -- implements RAID Level 0 + * + ***************************************/ + +/* + * : + * Log: rf_raid0.c,v + * Revision 1.24 1996/07/31 16:56:18 jimz + * dataBytesPerStripe, sectorsPerDisk init arch-indep. + * + * Revision 1.23 1996/07/27 23:36:08 jimz + * Solaris port of simulator + * + * Revision 1.22 1996/07/18 22:57:14 jimz + * port simulator to AIX + * + * Revision 1.21 1996/06/19 22:07:34 jimz + * added parity verify + * + * Revision 1.20 1996/06/17 14:38:33 jimz + * properly #if out RF_DEMO code + * fix bug in MakeConfig that was causing weird behavior + * in configuration routines (config was not zeroed at start) + * clean up genplot handling of stacks + * + * Revision 1.19 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). 
+ * + * Revision 1.18 1996/06/07 22:26:27 jimz + * type-ify which_ru (RF_ReconUnitNum_t) + * + * Revision 1.17 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.16 1996/06/03 23:28:26 jimz + * more bugfixes + * check in tree to sync for IPDS runs with current bugfixes + * there still may be a problem with threads in the script test + * getting I/Os stuck- not trivially reproducible (runs ~50 times + * in a row without getting stuck) + * + * Revision 1.15 1996/05/31 22:26:54 jimz + * fix a lot of mapping problems, memory allocation problems + * found some weird lock issues, fixed 'em + * more code cleanup + * + * Revision 1.14 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.13 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.12 1996/05/24 01:59:45 jimz + * another checkpoint in code cleanup for release + * time to sync kernel tree + * + * Revision 1.11 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.10 1996/05/03 19:37:32 wvcii + * moved dag creation routines to dag library + * + * Revision 1.9 1995/12/12 18:10:06 jimz + * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT + * fix 80-column brain damage in comments + * + * Revision 1.8 1995/12/06 15:06:36 root + * added copyright info + * + * Revision 1.7 1995/11/17 18:57:15 wvcii + * added prototypint to MapParity + * + * Revision 1.6 1995/11/16 13:53:51 wvcii + * fixed bug in CreateRAID0WriteDAG prototype + * + * Revision 1.5 1995/11/07 15:22:01 wvcii + * changed RAID0DagSelect prototype + * function no longer generates numHdrSucc, numTermAnt + * + * Revision 1.4 1995/06/23 13:39:17 robby + * updeated to prototypes in rf_layout.h + * + */ + +#include "rf_types.h" +#include "rf_raid.h" +#include "rf_raid0.h" +#include "rf_dag.h" +#include "rf_dagffrd.h" +#include "rf_dagffwr.h" +#include "rf_dagutils.h" +#include "rf_dagfuncs.h" +#include "rf_threadid.h" +#include "rf_general.h" +#include "rf_configure.h" +#include "rf_parityscan.h" + +typedef struct RF_Raid0ConfigInfo_s { + RF_RowCol_t *stripeIdentifier; +} RF_Raid0ConfigInfo_t; + +int rf_ConfigureRAID0( + RF_ShutdownList_t **listp, + RF_Raid_t *raidPtr, + RF_Config_t *cfgPtr) +{ + RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; + RF_Raid0ConfigInfo_t *info; + RF_RowCol_t i; + + /* create a RAID level 0 configuration structure */ + RF_MallocAndAdd(info, sizeof(RF_Raid0ConfigInfo_t), (RF_Raid0ConfigInfo_t *), raidPtr->cleanupList); + if (info == NULL) + return(ENOMEM); + layoutPtr->layoutSpecificInfo = (void *)info; + + RF_MallocAndAdd(info->stripeIdentifier, raidPtr->numCol * sizeof(RF_RowCol_t), (RF_RowCol_t *), raidPtr->cleanupList); + if (info->stripeIdentifier == NULL) + return(ENOMEM); + for (i=0; i<raidPtr->numCol; i++) + info->stripeIdentifier[i] = i; + + RF_ASSERT(raidPtr->numRow == 1); + raidPtr->totalSectors = layoutPtr->stripeUnitsPerDisk * raidPtr->numCol * layoutPtr->sectorsPerStripeUnit; + layoutPtr->numStripe = layoutPtr->stripeUnitsPerDisk; + layoutPtr->dataSectorsPerStripe = raidPtr->numCol * layoutPtr->sectorsPerStripeUnit; + layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit << raidPtr->logBytesPerSector; + layoutPtr->numDataCol = raidPtr->numCol; + layoutPtr->numParityCol = 0; + return(0); +} + +void rf_MapSectorRAID0( 
+ RF_Raid_t *raidPtr, + RF_RaidAddr_t raidSector, + RF_RowCol_t *row, + RF_RowCol_t *col, + RF_SectorNum_t *diskSector, + int remap) +{ + RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit; + *row = 0; + *col = SUID % raidPtr->numCol; + *diskSector = (SUID / raidPtr->numCol) * raidPtr->Layout.sectorsPerStripeUnit + + (raidSector % raidPtr->Layout.sectorsPerStripeUnit); +} + +void rf_MapParityRAID0( + RF_Raid_t *raidPtr, + RF_RaidAddr_t raidSector, + RF_RowCol_t *row, + RF_RowCol_t *col, + RF_SectorNum_t *diskSector, + int remap) +{ + *row = *col = 0; + *diskSector = 0; +} + +void rf_IdentifyStripeRAID0( + RF_Raid_t *raidPtr, + RF_RaidAddr_t addr, + RF_RowCol_t **diskids, + RF_RowCol_t *outRow) +{ + RF_Raid0ConfigInfo_t *info; + + info = raidPtr->Layout.layoutSpecificInfo; + *diskids = info->stripeIdentifier; + *outRow = 0; +} + +void rf_MapSIDToPSIDRAID0( + RF_RaidLayout_t *layoutPtr, + RF_StripeNum_t stripeID, + RF_StripeNum_t *psID, + RF_ReconUnitNum_t *which_ru) +{ + *which_ru = 0; + *psID = stripeID; +} + +void rf_RAID0DagSelect( + RF_Raid_t *raidPtr, + RF_IoType_t type, + RF_AccessStripeMap_t *asmap, + RF_VoidFuncPtr *createFunc) +{ + *createFunc = ((type == RF_IO_TYPE_READ) ? + (RF_VoidFuncPtr)rf_CreateFaultFreeReadDAG : (RF_VoidFuncPtr)rf_CreateRAID0WriteDAG); +} + +int rf_VerifyParityRAID0( + RF_Raid_t *raidPtr, + RF_RaidAddr_t raidAddr, + RF_PhysDiskAddr_t *parityPDA, + int correct_it, + RF_RaidAccessFlags_t flags) +{ + /* + * No parity is always okay. + */ + return(RF_PARITY_OKAY); +} diff --git a/sys/dev/raidframe/rf_raid0.h b/sys/dev/raidframe/rf_raid0.h new file mode 100644 index 00000000000..fe90ff49c73 --- /dev/null +++ b/sys/dev/raidframe/rf_raid0.h @@ -0,0 +1,111 @@ +/* $OpenBSD: rf_raid0.h,v 1.1 1999/01/11 14:29:41 niklas Exp $ */ +/* $NetBSD: rf_raid0.h,v 1.1 1998/11/13 04:20:33 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* rf_raid0.h - header file for RAID Level 0 */ + +/* + * : + * Log: rf_raid0.h,v + * Revision 1.15 1996/07/27 23:36:08 jimz + * Solaris port of simulator + * + * Revision 1.14 1996/06/19 22:07:42 jimz + * added parity verify + * + * Revision 1.13 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). 
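The RAID 0 address arithmetic in rf_MapSectorRAID0 above is small enough to check by hand; since RAID 0 keeps no redundancy, rf_MapParityRAID0 and rf_VerifyParityRAID0 are effectively no-ops. The standalone program below is an illustration only, not part of the imported sources, and the 4-column, 32-sectors-per-stripe-unit geometry is an invented example.

#include <stdio.h>

int
main(void)
{
        unsigned long sectorsPerStripeUnit = 32, numCol = 4;
        unsigned long raidSector = 1000;

        /* same arithmetic as rf_MapSectorRAID0 */
        unsigned long SUID = raidSector / sectorsPerStripeUnit;          /*  31 */
        unsigned long col = SUID % numCol;                               /*   3 */
        unsigned long diskSector = (SUID / numCol) * sectorsPerStripeUnit +
            (raidSector % sectorsPerStripeUnit);                         /* 232 */

        printf("raid sector %lu -> column %lu, disk sector %lu\n",
            raidSector, col, diskSector);
        return 0;
}

That is, stripe units rotate across the columns, and each column sees every numCol-th stripe unit at consecutive disk offsets.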
+ * + * Revision 1.12 1996/06/07 22:26:27 jimz + * type-ify which_ru (RF_ReconUnitNum_t) + * + * Revision 1.11 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.10 1996/06/03 23:28:26 jimz + * more bugfixes + * check in tree to sync for IPDS runs with current bugfixes + * there still may be a problem with threads in the script test + * getting I/Os stuck- not trivially reproducible (runs ~50 times + * in a row without getting stuck) + * + * Revision 1.9 1996/05/31 22:26:54 jimz + * fix a lot of mapping problems, memory allocation problems + * found some weird lock issues, fixed 'em + * more code cleanup + * + * Revision 1.8 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.7 1996/05/24 01:59:45 jimz + * another checkpoint in code cleanup for release + * time to sync kernel tree + * + * Revision 1.6 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.5 1995/12/06 15:02:36 root + * added copyright info + * + * Revision 1.4 1995/11/17 18:58:33 wvcii + * added prototyping to MapParity + * + * Revision 1.3 1995/11/07 15:21:00 wvcii + * changed RAID0DagSelect prototype + * + * Revision 1.2 1995/06/23 13:39:10 robby + * updeated to prototypes in rf_layout.h + * + */ + +#ifndef _RF__RF_RAID0_H_ +#define _RF__RF_RAID0_H_ + +int rf_ConfigureRAID0(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr, + RF_Config_t *cfgPtr); +void rf_MapSectorRAID0(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector, + RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap); +void rf_MapParityRAID0(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector, + RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap); +void rf_IdentifyStripeRAID0(RF_Raid_t *raidPtr, RF_RaidAddr_t addr, + RF_RowCol_t **diskids, RF_RowCol_t *outRow); +void rf_MapSIDToPSIDRAID0(RF_RaidLayout_t *layoutPtr, + RF_StripeNum_t stripeID, RF_StripeNum_t *psID, + RF_ReconUnitNum_t *which_ru); +void rf_RAID0DagSelect(RF_Raid_t *raidPtr, RF_IoType_t type, + RF_AccessStripeMap_t *asmap, RF_VoidFuncPtr *createFunc); +int rf_VerifyParityRAID0(RF_Raid_t *raidPtr, RF_RaidAddr_t raidAddr, + RF_PhysDiskAddr_t *parityPDA, int correct_it, RF_RaidAccessFlags_t flags); + +#endif /* !_RF__RF_RAID0_H_ */ diff --git a/sys/dev/raidframe/rf_raid1.c b/sys/dev/raidframe/rf_raid1.c new file mode 100644 index 00000000000..e941bf384b2 --- /dev/null +++ b/sys/dev/raidframe/rf_raid1.c @@ -0,0 +1,881 @@ +/* $OpenBSD: rf_raid1.c,v 1.1 1999/01/11 14:29:42 niklas Exp $ */ +/* $NetBSD: rf_raid1.c,v 1.1 1998/11/13 04:20:33 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: William V. Courtright II + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 
+ * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/***************************************************************************** + * + * rf_raid1.c -- implements RAID Level 1 + * + *****************************************************************************/ + +/* + * : + * Log: rf_raid1.c,v + * Revision 1.46 1996/11/05 21:10:40 jimz + * failed pda generalization + * + * Revision 1.45 1996/07/31 16:56:18 jimz + * dataBytesPerStripe, sectorsPerDisk init arch-indep. + * + * Revision 1.44 1996/07/30 03:06:43 jimz + * get rid of extra rf_threadid.h include + * + * Revision 1.43 1996/07/27 23:36:08 jimz + * Solaris port of simulator + * + * Revision 1.42 1996/07/22 19:52:16 jimz + * switched node params to RF_DagParam_t, a union of + * a 64-bit int and a void *, for better portability + * attempted hpux port, but failed partway through for + * lack of a single C compiler capable of compiling all + * source files + * + * Revision 1.41 1996/07/18 22:57:14 jimz + * port simulator to AIX + * + * Revision 1.40 1996/07/17 14:31:19 jimz + * minor cleanup for readability + * + * Revision 1.39 1996/07/15 17:22:18 jimz + * nit-pick code cleanup + * resolve stdlib problems on DEC OSF + * + * Revision 1.38 1996/07/15 02:56:31 jimz + * fixed dag selection to deal with failed + recon to spare disks + * enhanced recon, parity check debugging + * + * Revision 1.37 1996/07/13 00:00:59 jimz + * sanitized generalized reconstruction architecture + * cleaned up head sep, rbuf problems + * + * Revision 1.36 1996/07/11 19:08:00 jimz + * generalize reconstruction mechanism + * allow raid1 reconstructs via copyback (done with array + * quiesced, not online, therefore not disk-directed) + * + * Revision 1.35 1996/07/10 23:01:24 jimz + * Better commenting of VerifyParity (for posterity) + * + * Revision 1.34 1996/07/10 22:29:45 jimz + * VerifyParityRAID1: corrected return values for stripes in degraded mode + * + * Revision 1.33 1996/07/10 16:05:39 jimz + * fixed a couple minor bugs in VerifyParityRAID1 + * added code to correct bad RAID1 parity + * + * Revision 1.32 1996/06/20 18:47:04 jimz + * fix up verification bugs + * + * Revision 1.31 1996/06/20 15:38:59 jimz + * added parity verification + * can't correct bad parity yet, but can return pass/fail + * + * Revision 1.30 1996/06/19 22:23:01 jimz + * parity verification is now a layout-configurable thing + * not all layouts currently support it (correctly, anyway) + * + * Revision 1.29 1996/06/11 08:54:27 jimz + * improved error-checking at configuration time + * + * Revision 1.28 1996/06/10 18:25:24 wvcii + * fixed bug in rf_IdentifyStripeRAID1 - added array initialization + * + * Revision 1.27 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). 
+ * + * Revision 1.26 1996/06/07 22:26:27 jimz + * type-ify which_ru (RF_ReconUnitNum_t) + * + * Revision 1.25 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.24 1996/06/06 17:29:43 jimz + * use CreateMirrorIdleReadDAG for mirrored read + * + * Revision 1.23 1996/06/03 23:28:26 jimz + * more bugfixes + * check in tree to sync for IPDS runs with current bugfixes + * there still may be a problem with threads in the script test + * getting I/Os stuck- not trivially reproducible (runs ~50 times + * in a row without getting stuck) + * + * Revision 1.22 1996/06/02 17:31:48 jimz + * Moved a lot of global stuff into array structure, where it belongs. + * Fixed up paritylogging, pss modules in this manner. Some general + * code cleanup. Removed lots of dead code, some dead files. + * + * Revision 1.21 1996/05/31 22:26:54 jimz + * fix a lot of mapping problems, memory allocation problems + * found some weird lock issues, fixed 'em + * more code cleanup + * + * Revision 1.20 1996/05/30 23:22:16 jimz + * bugfixes of serialization, timing problems + * more cleanup + * + * Revision 1.19 1996/05/30 11:29:41 jimz + * Numerous bug fixes. Stripe lock release code disagreed with the taking code + * about when stripes should be locked (I made it consistent: no parity, no lock) + * There was a lot of extra serialization of I/Os which I've removed- a lot of + * it was to calculate values for the cache code, which is no longer with us. + * More types, function, macro cleanup. Added code to properly quiesce the array + * on shutdown. Made a lot of stuff array-specific which was (bogusly) general + * before. Fixed memory allocation, freeing bugs. + * + * Revision 1.18 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.17 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.16 1996/05/24 04:28:55 jimz + * release cleanup ckpt + * + * Revision 1.15 1996/05/24 01:59:45 jimz + * another checkpoint in code cleanup for release + * time to sync kernel tree + * + * Revision 1.14 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.13 1996/05/03 19:36:22 wvcii + * moved dag creation routines to dag library + * + * Revision 1.12 1996/02/23 01:38:16 amiri + * removed chained declustering special case in SelectIdleDisk + * + * Revision 1.11 1996/02/22 16:47:18 amiri + * disabled shortest queue optimization for chained declustering + * + * Revision 1.10 1995/12/12 18:10:06 jimz + * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT + * fix 80-column brain damage in comments + * + * Revision 1.9 1995/12/04 19:21:28 wvcii + * modified SelectIdleDisk to take a mirror node as a parameter and + * conditionally swap params 0 (data pda) and 4 (mirror pda). + * modified CreateRaidOneReadDAG so that it creates the DAG itself + * as opposed to reusing code in CreateNonredundantDAG. 
+ * + * Revision 1.8 1995/11/30 16:07:45 wvcii + * added copyright info + * + * Revision 1.7 1995/11/16 14:46:18 wvcii + * fixed bugs in mapping and degraded dag creation, added comments + * + * Revision 1.6 1995/11/14 22:29:16 wvcii + * fixed bugs in dag creation + * + * Revision 1.5 1995/11/07 15:23:33 wvcii + * changed RAID1DagSelect prototype + * function no longer generates numHdrSucc, numTermAnt + * changed dag creation routines: + * term node generated during dag creation + * encoded commit nodes, barrier, antecedent types + * + * Revision 1.4 1995/10/10 19:09:21 wvcii + * write dag now handles non-aligned accesses + * + * Revision 1.3 1995/10/05 02:32:56 jimz + * ifdef'd out queue locking for load balancing + * + * Revision 1.2 1995/10/04 07:04:40 wvcii + * reads are now scheduled according to disk queue length. + * queue length is the sum of number of ios queued in raidframe as well as those at the disk. + * reads are sent to the disk with the shortest queue. + * testing against user disks successful, sim & kernel untested. + * + * Revision 1.1 1995/10/04 03:53:23 wvcii + * Initial revision + * + * + */ + +#include "rf_raid.h" +#include "rf_raid1.h" +#include "rf_dag.h" +#include "rf_dagffrd.h" +#include "rf_dagffwr.h" +#include "rf_dagdegrd.h" +#include "rf_dagutils.h" +#include "rf_dagfuncs.h" +#include "rf_threadid.h" +#include "rf_diskqueue.h" +#include "rf_general.h" +#include "rf_utils.h" +#include "rf_parityscan.h" +#include "rf_mcpair.h" +#include "rf_layout.h" +#include "rf_map.h" +#include "rf_engine.h" +#include "rf_reconbuffer.h" +#include "rf_sys.h" + +typedef struct RF_Raid1ConfigInfo_s { + RF_RowCol_t **stripeIdentifier; +} RF_Raid1ConfigInfo_t; + +/* start of day code specific to RAID level 1 */ +int rf_ConfigureRAID1( + RF_ShutdownList_t **listp, + RF_Raid_t *raidPtr, + RF_Config_t *cfgPtr) +{ + RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; + RF_Raid1ConfigInfo_t *info; + RF_RowCol_t i; + + /* create a RAID level 1 configuration structure */ + RF_MallocAndAdd(info, sizeof(RF_Raid1ConfigInfo_t), (RF_Raid1ConfigInfo_t *), raidPtr->cleanupList); + if (info == NULL) + return(ENOMEM); + layoutPtr->layoutSpecificInfo = (void *) info; + + /* ... and fill it in. */ + info->stripeIdentifier = rf_make_2d_array(raidPtr->numCol / 2, 2, raidPtr->cleanupList); + if (info->stripeIdentifier == NULL) + return(ENOMEM); + for (i = 0; i < (raidPtr->numCol / 2); i ++) { + info->stripeIdentifier[i][0] = (2 * i); + info->stripeIdentifier[i][1] = (2 * i) + 1; + } + + RF_ASSERT(raidPtr->numRow == 1); + + /* this implementation of RAID level 1 uses one row of numCol disks and allows multiple (numCol / 2) + * stripes per row. A stripe consists of a single data unit and a single parity (mirror) unit. 
+ * stripe id = raidAddr / stripeUnitSize + */ + raidPtr->totalSectors = layoutPtr->stripeUnitsPerDisk * (raidPtr->numCol / 2) * layoutPtr->sectorsPerStripeUnit; + layoutPtr->numStripe = layoutPtr->stripeUnitsPerDisk * (raidPtr->numCol / 2); + layoutPtr->dataSectorsPerStripe = layoutPtr->sectorsPerStripeUnit; + layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit << raidPtr->logBytesPerSector; + layoutPtr->numDataCol = 1; + layoutPtr->numParityCol = 1; + return(0); +} + + +/* returns the physical disk location of the primary copy in the mirror pair */ +void rf_MapSectorRAID1( + RF_Raid_t *raidPtr, + RF_RaidAddr_t raidSector, + RF_RowCol_t *row, + RF_RowCol_t *col, + RF_SectorNum_t *diskSector, + int remap) +{ + RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit; + RF_RowCol_t mirrorPair = SUID % (raidPtr->numCol / 2); + + *row = 0; + *col = 2 * mirrorPair; + *diskSector = ((SUID / (raidPtr->numCol / 2)) * raidPtr->Layout.sectorsPerStripeUnit) + (raidSector % raidPtr->Layout.sectorsPerStripeUnit); +} + + +/* Map Parity + * + * returns the physical disk location of the secondary copy in the mirror + * pair + */ +void rf_MapParityRAID1( + RF_Raid_t *raidPtr, + RF_RaidAddr_t raidSector, + RF_RowCol_t *row, + RF_RowCol_t *col, + RF_SectorNum_t *diskSector, + int remap) +{ + RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit; + RF_RowCol_t mirrorPair = SUID % (raidPtr->numCol / 2); + + *row = 0; + *col = (2 * mirrorPair) + 1; + + *diskSector = ((SUID / (raidPtr->numCol / 2)) * raidPtr->Layout.sectorsPerStripeUnit) + (raidSector % raidPtr->Layout.sectorsPerStripeUnit); +} + + +/* IdentifyStripeRAID1 + * + * returns a list of disks for a given redundancy group + */ +void rf_IdentifyStripeRAID1( + RF_Raid_t *raidPtr, + RF_RaidAddr_t addr, + RF_RowCol_t **diskids, + RF_RowCol_t *outRow) +{ + RF_StripeNum_t stripeID = rf_RaidAddressToStripeID(&raidPtr->Layout, addr); + RF_Raid1ConfigInfo_t *info = raidPtr->Layout.layoutSpecificInfo; + RF_ASSERT(stripeID >= 0); + RF_ASSERT(addr >= 0); + *outRow = 0; + *diskids = info->stripeIdentifier[ stripeID % (raidPtr->numCol/2)]; + RF_ASSERT(*diskids); +} + + +/* MapSIDToPSIDRAID1 + * + * maps a logical stripe to a stripe in the redundant array + */ +void rf_MapSIDToPSIDRAID1( + RF_RaidLayout_t *layoutPtr, + RF_StripeNum_t stripeID, + RF_StripeNum_t *psID, + RF_ReconUnitNum_t *which_ru) +{ + *which_ru = 0; + *psID = stripeID; +} + + + +/****************************************************************************** + * select a graph to perform a single-stripe access + * + * Parameters: raidPtr - description of the physical array + * type - type of operation (read or write) requested + * asmap - logical & physical addresses for this access + * createFunc - name of function to use to create the graph + *****************************************************************************/ + +void rf_RAID1DagSelect( + RF_Raid_t *raidPtr, + RF_IoType_t type, + RF_AccessStripeMap_t *asmap, + RF_VoidFuncPtr *createFunc) +{ + RF_RowCol_t frow, fcol, or, oc; + RF_PhysDiskAddr_t *failedPDA; + int prior_recon, tid; + RF_RowStatus_t rstat; + RF_SectorNum_t oo; + + + RF_ASSERT(RF_IO_IS_R_OR_W(type)); + + if (asmap->numDataFailed + asmap->numParityFailed > 1) { + RF_ERRORMSG("Multiple disks failed in a single group! Aborting I/O operation.\n"); + *createFunc = NULL; + return; + } + + if (asmap->numDataFailed + asmap->numParityFailed) { + /* + * We've got a fault. Re-map to spare space, iff applicable. 
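rf_ConfigureRAID1 above pairs adjacent columns, so column 2k carries the primary copy and column 2k+1 its mirror at the same disk offset; rf_MapSectorRAID1 and rf_MapParityRAID1 simply pick the even or odd member of the pair. The standalone sketch below is an illustration only, not part of the imported sources; six columns (three mirror pairs) and a 32-sector stripe unit are invented values.

#include <stdio.h>

int
main(void)
{
        unsigned long sectorsPerStripeUnit = 32, numCol = 6;
        unsigned long raidSector = 1000;

        /* same arithmetic as rf_MapSectorRAID1 / rf_MapParityRAID1 */
        unsigned long SUID = raidSector / sectorsPerStripeUnit;          /*  31 */
        unsigned long mirrorPair = SUID % (numCol / 2);                  /*   1 */
        unsigned long dataCol = 2 * mirrorPair;                          /*   2 */
        unsigned long mirrorCol = 2 * mirrorPair + 1;                    /*   3 */
        unsigned long diskSector = (SUID / (numCol / 2)) * sectorsPerStripeUnit +
            (raidSector % sectorsPerStripeUnit);                         /* 328 */

        printf("raid sector %lu -> primary col %lu, mirror col %lu, "
            "disk sector %lu on each\n",
            raidSector, dataCol, mirrorCol, diskSector);
        return 0;
}

Because both copies live at the same offset, a read can be served from either member of the pair (the fault-free read case below selects the idle-disk mirror read DAG), while a write must update both.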
+ * Shouldn't the arch-independent code do this for us? + * Anyway, it turns out if we don't do this here, then when + * we're reconstructing, writes go only to the surviving + * original disk, and aren't reflected on the reconstructed + * spare. Oops. --jimz + */ + failedPDA = asmap->failedPDAs[0]; + frow = failedPDA->row; + fcol = failedPDA->col; + rstat = raidPtr->status[frow]; + prior_recon = (rstat == rf_rs_reconfigured) || ( + (rstat == rf_rs_reconstructing) ? + rf_CheckRUReconstructed(raidPtr->reconControl[frow]->reconMap, failedPDA->startSector) : 0 + ); + if (prior_recon) { + or = frow; + oc = fcol; + oo = failedPDA->startSector; + /* + * If we did distributed sparing, we'd monkey with that here. + * But we don't, so we'll + */ + failedPDA->row = raidPtr->Disks[frow][fcol].spareRow; + failedPDA->col = raidPtr->Disks[frow][fcol].spareCol; + /* + * Redirect other components, iff necessary. This looks + * pretty suspicious to me, but it's what the raid5 + * DAG select does. + */ + if (asmap->parityInfo->next) { + if (failedPDA == asmap->parityInfo) { + failedPDA->next->row = failedPDA->row; + failedPDA->next->col = failedPDA->col; + } + else { + if (failedPDA == asmap->parityInfo->next) { + asmap->parityInfo->row = failedPDA->row; + asmap->parityInfo->col = failedPDA->col; + } + } + } + if (rf_dagDebug || rf_mapDebug) { + rf_get_threadid(tid); + printf("[%d] Redirected type '%c' r %d c %d o %ld -> r %d c %d o %ld\n", + tid, type, or, oc, (long)oo, failedPDA->row, failedPDA->col, + (long)failedPDA->startSector); + } + asmap->numDataFailed = asmap->numParityFailed = 0; + } + } + if (type == RF_IO_TYPE_READ) { + if (asmap->numDataFailed == 0) + *createFunc = (RF_VoidFuncPtr)rf_CreateMirrorIdleReadDAG; + else + *createFunc = (RF_VoidFuncPtr)rf_CreateRaidOneDegradedReadDAG; + } + else { + *createFunc = (RF_VoidFuncPtr)rf_CreateRaidOneWriteDAG; + } +} + +int rf_VerifyParityRAID1( + RF_Raid_t *raidPtr, + RF_RaidAddr_t raidAddr, + RF_PhysDiskAddr_t *parityPDA, + int correct_it, + RF_RaidAccessFlags_t flags) +{ + int nbytes, bcount, stripeWidth, ret, i, j, tid=0, nbad, *bbufs; + RF_DagNode_t *blockNode, *unblockNode, *wrBlock; + RF_DagHeader_t *rd_dag_h, *wr_dag_h; + RF_AccessStripeMapHeader_t *asm_h; + RF_AllocListElem_t *allocList; + RF_AccTraceEntry_t tracerec; + RF_ReconUnitNum_t which_ru; + RF_RaidLayout_t *layoutPtr; + RF_AccessStripeMap_t *aasm; + RF_SectorCount_t nsector; + RF_RaidAddr_t startAddr; + char *buf, *buf1, *buf2; + RF_PhysDiskAddr_t *pda; + RF_StripeNum_t psID; + RF_MCPair_t *mcpair; + + if (rf_verifyParityDebug) { + rf_get_threadid(tid); + } + + layoutPtr = &raidPtr->Layout; + startAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, raidAddr); + nsector = parityPDA->numSector; + nbytes = rf_RaidAddressToByte(raidPtr, nsector); + psID = rf_RaidAddressToParityStripeID(layoutPtr, raidAddr, &which_ru); + + asm_h = NULL; + rd_dag_h = wr_dag_h = NULL; + mcpair = NULL; + + ret = RF_PARITY_COULD_NOT_VERIFY; + + rf_MakeAllocList(allocList); + if (allocList == NULL) + return(RF_PARITY_COULD_NOT_VERIFY); + mcpair = rf_AllocMCPair(); + if (mcpair == NULL) + goto done; + RF_ASSERT(layoutPtr->numDataCol == layoutPtr->numParityCol); + stripeWidth = layoutPtr->numDataCol + layoutPtr->numParityCol; + bcount = nbytes*(layoutPtr->numDataCol + layoutPtr->numParityCol); + RF_MallocAndAdd(buf, bcount, (char *), allocList); + if (buf == NULL) + goto done; + if (rf_verifyParityDebug) { + printf("[%d] RAID1 parity verify: buf=%lx bcount=%d (%lx - %lx)\n", + tid, (long)buf, bcount, (long)buf, 
(long)buf+bcount); + } + + /* + * Generate a DAG which will read the entire stripe- then we can + * just compare data chunks versus "parity" chunks. + */ + + rd_dag_h = rf_MakeSimpleDAG(raidPtr, stripeWidth, nbytes, buf, + rf_DiskReadFunc, rf_DiskReadUndoFunc, "Rod", allocList, flags, + RF_IO_NORMAL_PRIORITY); + if (rd_dag_h == NULL) + goto done; + blockNode = rd_dag_h->succedents[0]; + unblockNode = blockNode->succedents[0]->succedents[0]; + + /* + * Map the access to physical disk addresses (PDAs)- this will + * get us both a list of data addresses, and "parity" addresses + * (which are really mirror copies). + */ + asm_h = rf_MapAccess(raidPtr, startAddr, layoutPtr->dataSectorsPerStripe, + buf, RF_DONT_REMAP); + aasm = asm_h->stripeMap; + + buf1 = buf; + /* + * Loop through the data blocks, setting up read nodes for each. + */ + for(pda=aasm->physInfo,i=0;i<layoutPtr->numDataCol;i++,pda=pda->next) + { + RF_ASSERT(pda); + + rf_RangeRestrictPDA(raidPtr, parityPDA, pda, 0, 1); + + RF_ASSERT(pda->numSector != 0); + if (rf_TryToRedirectPDA(raidPtr, pda, 0)) { + /* cannot verify parity with dead disk */ + goto done; + } + pda->bufPtr = buf1; + blockNode->succedents[i]->params[0].p = pda; + blockNode->succedents[i]->params[1].p = buf1; + blockNode->succedents[i]->params[2].v = psID; + blockNode->succedents[i]->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); + buf1 += nbytes; + } + RF_ASSERT(pda == NULL); + /* + * keep i, buf1 running + * + * Loop through parity blocks, setting up read nodes for each. + */ + for(pda=aasm->parityInfo;i<layoutPtr->numDataCol+layoutPtr->numParityCol;i++,pda=pda->next) + { + RF_ASSERT(pda); + rf_RangeRestrictPDA(raidPtr, parityPDA, pda, 0, 1); + RF_ASSERT(pda->numSector != 0); + if (rf_TryToRedirectPDA(raidPtr, pda, 0)) { + /* cannot verify parity with dead disk */ + goto done; + } + pda->bufPtr = buf1; + blockNode->succedents[i]->params[0].p = pda; + blockNode->succedents[i]->params[1].p = buf1; + blockNode->succedents[i]->params[2].v = psID; + blockNode->succedents[i]->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru); + buf1 += nbytes; + } + RF_ASSERT(pda == NULL); + + bzero((char *)&tracerec, sizeof(tracerec)); + rd_dag_h->tracerec = &tracerec; + + if (rf_verifyParityDebug > 1) { + printf("[%d] RAID1 parity verify read dag:\n", tid); + rf_PrintDAGList(rd_dag_h); + } + + RF_LOCK_MUTEX(mcpair->mutex); + mcpair->flag = 0; + rf_DispatchDAG(rd_dag_h, (void (*)(void *))rf_MCPairWakeupFunc, + (void *)mcpair); + while (mcpair->flag == 0) { + RF_WAIT_MCPAIR(mcpair); + } + RF_UNLOCK_MUTEX(mcpair->mutex); + + if (rd_dag_h->status != rf_enable) { + RF_ERRORMSG("Unable to verify raid1 parity: can't read stripe\n"); + ret = RF_PARITY_COULD_NOT_VERIFY; + goto done; + } + + /* + * buf1 is the beginning of the data blocks chunk + * buf2 is the beginning of the parity blocks chunk + */ + buf1 = buf; + buf2 = buf + (nbytes * layoutPtr->numDataCol); + ret = RF_PARITY_OKAY; + /* + * bbufs is "bad bufs"- an array whose entries are the data + * column numbers where we had miscompares. (That is, column 0 + * and column 1 of the array are mirror copies, and are considered + * "data column 0" for this purpose). + */ + RF_MallocAndAdd(bbufs, layoutPtr->numParityCol*sizeof(int), (int *), + allocList); + nbad = 0; + /* + * Check data vs "parity" (mirror copy). 
+ */
+  for(i=0;i<layoutPtr->numDataCol;i++) {
+    if (rf_verifyParityDebug) {
+      printf("[%d] RAID1 parity verify %d bytes: i=%d buf1=%lx buf2=%lx buf=%lx\n",
+        tid, nbytes, i, (long)buf1, (long)buf2, (long)buf);
+    }
+    ret = bcmp(buf1, buf2, nbytes);
+    if (ret) {
+      if (rf_verifyParityDebug > 1) {
+        for(j=0;j<nbytes;j++) {
+          if (buf1[j] != buf2[j])
+            break;
+        }
+        printf("psid=%ld j=%d\n", (long)psID, j);
+        printf("buf1 %02x %02x %02x %02x %02x\n", buf1[0]&0xff,
+          buf1[1]&0xff, buf1[2]&0xff, buf1[3]&0xff, buf1[4]&0xff);
+        printf("buf2 %02x %02x %02x %02x %02x\n", buf2[0]&0xff,
+          buf2[1]&0xff, buf2[2]&0xff, buf2[3]&0xff, buf2[4]&0xff);
+      }
+      if (rf_verifyParityDebug) {
+        printf("[%d] RAID1: found bad parity, i=%d\n", tid, i);
+      }
+      /*
+       * Parity is bad. Keep track of which columns were bad.
+       */
+      if (bbufs)
+        bbufs[nbad] = i;
+      nbad++;
+      ret = RF_PARITY_BAD;
+    }
+    buf1 += nbytes;
+    buf2 += nbytes;
+  }
+
+  if ((ret != RF_PARITY_OKAY) && correct_it) {
+    ret = RF_PARITY_COULD_NOT_CORRECT;
+    if (rf_verifyParityDebug) {
+      printf("[%d] RAID1 parity verify: parity not correct\n", tid);
+    }
+    if (bbufs == NULL)
+      goto done;
+    /*
+     * Make a DAG with one write node for each bad unit. We'll simply
+     * write the contents of the data unit onto the parity unit for
+     * correction. (It's possible that the mirror copy was the correct
+     * copy, and that we're spooging good data by writing bad over it,
+     * but there's no way we can know that.)
+     */
+    wr_dag_h = rf_MakeSimpleDAG(raidPtr, nbad, nbytes, buf,
+      rf_DiskWriteFunc, rf_DiskWriteUndoFunc, "Wnp", allocList, flags,
+      RF_IO_NORMAL_PRIORITY);
+    if (wr_dag_h == NULL)
+      goto done;
+    wrBlock = wr_dag_h->succedents[0];
+    /*
+     * Fill in a write node for each bad compare.
+     */
+    for(i=0;i<nbad;i++) {
+      j = i+layoutPtr->numDataCol;
+      pda = blockNode->succedents[j]->params[0].p;
+      pda->bufPtr = blockNode->succedents[i]->params[1].p;
+      wrBlock->succedents[i]->params[0].p = pda;
+      wrBlock->succedents[i]->params[1].p = pda->bufPtr;
+      wrBlock->succedents[i]->params[2].v = psID;
+      wrBlock->succedents[i]->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+    }
+    bzero((char *)&tracerec, sizeof(tracerec));
+    wr_dag_h->tracerec = &tracerec;
+    if (rf_verifyParityDebug > 1) {
+      printf("Parity verify write dag:\n");
+      rf_PrintDAGList(wr_dag_h);
+    }
+    RF_LOCK_MUTEX(mcpair->mutex);
+    mcpair->flag = 0;
+    /* fire off the write DAG */
+    rf_DispatchDAG(wr_dag_h, (void (*)(void *))rf_MCPairWakeupFunc,
+      (void *)mcpair);
+    while (!mcpair->flag) {
+      RF_WAIT_COND(mcpair->cond, mcpair->mutex);
+    }
+    RF_UNLOCK_MUTEX(mcpair->mutex);
+    if (wr_dag_h->status != rf_enable) {
+      RF_ERRORMSG("Unable to correct RAID1 parity in VerifyParity\n");
+      goto done;
+    }
+    ret = RF_PARITY_CORRECTED;
+  }
+
+done:
+  /*
+   * All done. We might've gotten here without doing part of the function,
+   * so clean up what we have to and return our running status.
+ */ + if (asm_h) + rf_FreeAccessStripeMap(asm_h); + if (rd_dag_h) + rf_FreeDAG(rd_dag_h); + if (wr_dag_h) + rf_FreeDAG(wr_dag_h); + if (mcpair) + rf_FreeMCPair(mcpair); + rf_FreeAllocList(allocList); + if (rf_verifyParityDebug) { + printf("[%d] RAID1 parity verify, returning %d\n", tid, ret); + } + return(ret); +} + +int rf_SubmitReconBufferRAID1(rbuf, keep_it, use_committed) + RF_ReconBuffer_t *rbuf; /* the recon buffer to submit */ + int keep_it; /* whether we can keep this buffer or we have to return it */ + int use_committed; /* whether to use a committed or an available recon buffer */ +{ + RF_ReconParityStripeStatus_t *pssPtr; + RF_ReconCtrl_t *reconCtrlPtr; + RF_RaidLayout_t *layoutPtr; + int tid=0, retcode, created; + RF_CallbackDesc_t *cb, *p; + RF_ReconBuffer_t *t; + RF_Raid_t *raidPtr; + caddr_t ta; + + retcode = 0; + created = 0; + + raidPtr = rbuf->raidPtr; + layoutPtr = &raidPtr->Layout; + reconCtrlPtr = raidPtr->reconControl[rbuf->row]; + + RF_ASSERT(rbuf); + RF_ASSERT(rbuf->col != reconCtrlPtr->fcol); + + if (rf_reconbufferDebug) { + rf_get_threadid(tid); + printf("[%d] RAID1 reconbuffer submission r%d c%d psid %ld ru%d (failed offset %ld)\n", + tid, rbuf->row, rbuf->col, (long)rbuf->parityStripeID, rbuf->which_ru, + (long)rbuf->failedDiskSectorOffset); + } + + if (rf_reconDebug) { + printf("RAID1 reconbuffer submit psid %ld buf %lx\n", + (long)rbuf->parityStripeID, (long)rbuf->buffer); + printf("RAID1 psid %ld %02x %02x %02x %02x %02x\n", + (long)rbuf->parityStripeID, + rbuf->buffer[0], rbuf->buffer[1], rbuf->buffer[2], rbuf->buffer[3], + rbuf->buffer[4]); + } + + RF_LOCK_PSS_MUTEX(raidPtr,rbuf->row,rbuf->parityStripeID); + + RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex); + + pssPtr = rf_LookupRUStatus(raidPtr, reconCtrlPtr->pssTable, + rbuf->parityStripeID, rbuf->which_ru, RF_PSS_NONE, &created); + RF_ASSERT(pssPtr); /* if it didn't exist, we wouldn't have gotten an rbuf for it */ + + /* + * Since this is simple mirroring, the first submission for a stripe is also + * treated as the last. 
+ */ + + t = NULL; + if (keep_it) { + if (rf_reconbufferDebug) { + printf("[%d] RAID1 rbuf submission: keeping rbuf\n", tid); + } + t = rbuf; + } + else { + if (use_committed) { + if (rf_reconbufferDebug) { + printf("[%d] RAID1 rbuf submission: using committed rbuf\n", tid); + } + t = reconCtrlPtr->committedRbufs; + RF_ASSERT(t); + reconCtrlPtr->committedRbufs = t->next; + t->next = NULL; + } + else if (reconCtrlPtr->floatingRbufs) { + if (rf_reconbufferDebug) { + printf("[%d] RAID1 rbuf submission: using floating rbuf\n", tid); + } + t = reconCtrlPtr->floatingRbufs; + reconCtrlPtr->floatingRbufs = t->next; + t->next = NULL; + } + } + if (t == NULL) { + if (rf_reconbufferDebug) { + printf("[%d] RAID1 rbuf submission: waiting for rbuf\n", tid); + } + RF_ASSERT((keep_it == 0) && (use_committed == 0)); + raidPtr->procsInBufWait++; + if ((raidPtr->procsInBufWait == (raidPtr->numCol-1)) + && (raidPtr->numFullReconBuffers == 0)) + { + /* ruh-ro */ + RF_ERRORMSG("Buffer wait deadlock\n"); + rf_PrintPSStatusTable(raidPtr, rbuf->row); + RF_PANIC(); + } + pssPtr->flags |= RF_PSS_BUFFERWAIT; + cb = rf_AllocCallbackDesc(); + cb->row = rbuf->row; + cb->col = rbuf->col; + cb->callbackArg.v = rbuf->parityStripeID; + cb->callbackArg2.v = rbuf->which_ru; + cb->next = NULL; + if (reconCtrlPtr->bufferWaitList == NULL) { + /* we are the wait list- lucky us */ + reconCtrlPtr->bufferWaitList = cb; + } + else { + /* append to wait list */ + for(p=reconCtrlPtr->bufferWaitList;p->next;p=p->next); + p->next = cb; + } + retcode = 1; + goto out; + } + if (t != rbuf) { + t->row = rbuf->row; + t->col = reconCtrlPtr->fcol; + t->parityStripeID = rbuf->parityStripeID; + t->which_ru = rbuf->which_ru; + t->failedDiskSectorOffset = rbuf->failedDiskSectorOffset; + t->spRow = rbuf->spRow; + t->spCol = rbuf->spCol; + t->spOffset = rbuf->spOffset; + /* Swap buffers. DANCE! */ + ta = t->buffer; + t->buffer = rbuf->buffer; + rbuf->buffer = ta; + } + /* + * Use the rbuf we've been given as the target. + */ + RF_ASSERT(pssPtr->rbuf == NULL); + pssPtr->rbuf = t; + + t->count = 1; + /* + * Below, we use 1 for numDataCol (which is equal to the count in the + * previous line), so we'll always be done. + */ + rf_CheckForFullRbuf(raidPtr, reconCtrlPtr, pssPtr, 1); + +out: + RF_UNLOCK_PSS_MUTEX( raidPtr,rbuf->row,rbuf->parityStripeID); + RF_UNLOCK_MUTEX( reconCtrlPtr->rb_mutex ); + if (rf_reconbufferDebug) { + printf("[%d] RAID1 rbuf submission: returning %d\n", tid, retcode); + } + return(retcode); +} diff --git a/sys/dev/raidframe/rf_raid1.h b/sys/dev/raidframe/rf_raid1.h new file mode 100644 index 00000000000..9ce0cb64067 --- /dev/null +++ b/sys/dev/raidframe/rf_raid1.h @@ -0,0 +1,130 @@ +/* $OpenBSD: rf_raid1.h,v 1.1 1999/01/11 14:29:42 niklas Exp $ */ +/* $NetBSD: rf_raid1.h,v 1.1 1998/11/13 04:20:33 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: William V. Courtright II + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 
+ * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* header file for RAID Level 1 */ + +/* + * : + * Log: rf_raid1.h,v + * Revision 1.17 1996/07/27 23:36:08 jimz + * Solaris port of simulator + * + * Revision 1.16 1996/07/13 00:00:59 jimz + * sanitized generalized reconstruction architecture + * cleaned up head sep, rbuf problems + * + * Revision 1.15 1996/07/11 19:08:00 jimz + * generalize reconstruction mechanism + * allow raid1 reconstructs via copyback (done with array + * quiesced, not online, therefore not disk-directed) + * + * Revision 1.14 1996/06/19 22:23:01 jimz + * parity verification is now a layout-configurable thing + * not all layouts currently support it (correctly, anyway) + * + * Revision 1.13 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). + * + * Revision 1.12 1996/06/07 22:26:27 jimz + * type-ify which_ru (RF_ReconUnitNum_t) + * + * Revision 1.11 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.10 1996/06/03 23:28:26 jimz + * more bugfixes + * check in tree to sync for IPDS runs with current bugfixes + * there still may be a problem with threads in the script test + * getting I/Os stuck- not trivially reproducible (runs ~50 times + * in a row without getting stuck) + * + * Revision 1.9 1996/05/31 22:26:54 jimz + * fix a lot of mapping problems, memory allocation problems + * found some weird lock issues, fixed 'em + * more code cleanup + * + * Revision 1.8 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.7 1996/05/24 01:59:45 jimz + * another checkpoint in code cleanup for release + * time to sync kernel tree + * + * Revision 1.6 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.5 1996/05/03 19:35:34 wvcii + * moved dags to dag library + * + * Revision 1.4 1995/11/30 16:07:26 wvcii + * added copyright info + * + * Revision 1.3 1995/11/16 14:56:41 wvcii + * updated prototypes + * + * Revision 1.2 1995/11/07 15:23:01 wvcii + * changed RAID1DagSelect prototype + * function no longer generates numHdrSucc, numTermAnt + * + * Revision 1.1 1995/10/04 03:52:59 wvcii + * Initial revision + * + * + */ + +#ifndef _RF__RF_RAID1_H_ +#define _RF__RF_RAID1_H_ + +#include "rf_types.h" + +int rf_ConfigureRAID1(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr, + RF_Config_t *cfgPtr); +void rf_MapSectorRAID1(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector, + RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap); +void rf_MapParityRAID1(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector, + RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap); +void rf_IdentifyStripeRAID1(RF_Raid_t *raidPtr, RF_RaidAddr_t addr, + RF_RowCol_t **diskids, RF_RowCol_t *outRow); +void rf_MapSIDToPSIDRAID1(RF_RaidLayout_t *layoutPtr, + RF_StripeNum_t stripeID, RF_StripeNum_t *psID, + RF_ReconUnitNum_t *which_ru); 
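+/*
+ * Worked example (illustrative only; the array geometry below is an
+ * assumption picked for the example, not taken from any configuration):
+ * with numCol = 4 (two mirror pairs) and sectorsPerStripeUnit = 32, the
+ * mapping routines declared above place raidSector = 100 as follows:
+ *
+ *   SUID       = 100 / 32    = 3
+ *   mirrorPair = 3 % (4 / 2) = 1
+ *   rf_MapSectorRAID1: col = 2 * 1     = 2   (primary copy)
+ *   rf_MapParityRAID1: col = 2 * 1 + 1 = 3   (mirror copy)
+ *   diskSector = (3 / 2) * 32 + (100 % 32) = 36   (same offset on both disks)
+ */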
+void rf_RAID1DagSelect(RF_Raid_t *raidPtr, RF_IoType_t type, + RF_AccessStripeMap_t *asmap, RF_VoidFuncPtr *createFunc); +int rf_VerifyParityRAID1(RF_Raid_t *raidPtr, RF_RaidAddr_t raidAddr, + RF_PhysDiskAddr_t *parityPDA, int correct_it, RF_RaidAccessFlags_t flags); +int rf_SubmitReconBufferRAID1(RF_ReconBuffer_t *rbuf, int keep_int, + int use_committed); + +#endif /* !_RF__RF_RAID1_H_ */ diff --git a/sys/dev/raidframe/rf_raid4.c b/sys/dev/raidframe/rf_raid4.c new file mode 100644 index 00000000000..5a2c0da50bf --- /dev/null +++ b/sys/dev/raidframe/rf_raid4.c @@ -0,0 +1,225 @@ +/* $OpenBSD: rf_raid4.c,v 1.1 1999/01/11 14:29:43 niklas Exp $ */ +/* $NetBSD: rf_raid4.c,v 1.1 1998/11/13 04:20:33 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Rachad Youssef + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/*************************************** + * + * rf_raid4.c -- implements RAID Level 4 + * + ***************************************/ + +/* + * : + * Log: rf_raid4.c,v + * Revision 1.24 1996/07/31 16:56:18 jimz + * dataBytesPerStripe, sectorsPerDisk init arch-indep. + * + * Revision 1.23 1996/07/18 22:57:14 jimz + * port simulator to AIX + * + * Revision 1.22 1996/07/13 00:00:59 jimz + * sanitized generalized reconstruction architecture + * cleaned up head sep, rbuf problems + * + * Revision 1.21 1996/06/11 08:54:27 jimz + * improved error-checking at configuration time + * + * Revision 1.20 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). 
+ * + * Revision 1.19 1996/06/07 22:26:27 jimz + * type-ify which_ru (RF_ReconUnitNum_t) + * + * Revision 1.18 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.17 1996/06/03 23:28:26 jimz + * more bugfixes + * check in tree to sync for IPDS runs with current bugfixes + * there still may be a problem with threads in the script test + * getting I/Os stuck- not trivially reproducible (runs ~50 times + * in a row without getting stuck) + * + * Revision 1.16 1996/05/31 22:26:54 jimz + * fix a lot of mapping problems, memory allocation problems + * found some weird lock issues, fixed 'em + * more code cleanup + * + * Revision 1.15 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.14 1996/05/24 01:59:45 jimz + * another checkpoint in code cleanup for release + * time to sync kernel tree + * + * Revision 1.13 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.12 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.11 1996/05/03 19:39:41 wvcii + * added includes for dag library + * + * Revision 1.10 1995/12/12 18:10:06 jimz + * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT + * fix 80-column brain damage in comments + * + * Revision 1.9 1995/12/06 15:02:46 root + * added copyright info + * + * Revision 1.8 1995/11/17 18:57:32 wvcii + * added prototyping to MapParity + * + * Revision 1.7 1995/06/23 13:38:58 robby + * updeated to prototypes in rf_layout.h + * + */ + +#include "rf_raid.h" +#include "rf_dag.h" +#include "rf_dagutils.h" +#include "rf_dagfuncs.h" +#include "rf_dagffrd.h" +#include "rf_dagffwr.h" +#include "rf_dagdegrd.h" +#include "rf_dagdegwr.h" +#include "rf_threadid.h" +#include "rf_raid4.h" +#include "rf_general.h" + +typedef struct RF_Raid4ConfigInfo_s { + RF_RowCol_t *stripeIdentifier; /* filled in at config time & used by IdentifyStripe */ +} RF_Raid4ConfigInfo_t; + + + +int rf_ConfigureRAID4( + RF_ShutdownList_t **listp, + RF_Raid_t *raidPtr, + RF_Config_t *cfgPtr) +{ + RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; + RF_Raid4ConfigInfo_t *info; + int i; + + /* create a RAID level 4 configuration structure ... */ + RF_MallocAndAdd(info, sizeof(RF_Raid4ConfigInfo_t), (RF_Raid4ConfigInfo_t *), raidPtr->cleanupList); + if (info == NULL) + return(ENOMEM); + layoutPtr->layoutSpecificInfo = (void *) info; + + /* ... and fill it in. 
*/ + RF_MallocAndAdd(info->stripeIdentifier, raidPtr->numCol * sizeof(RF_RowCol_t), (RF_RowCol_t *), raidPtr->cleanupList); + if (info->stripeIdentifier == NULL) + return(ENOMEM); + for (i=0; i<raidPtr->numCol; i++) + info->stripeIdentifier[i] = i; + + RF_ASSERT(raidPtr->numRow == 1); + + /* fill in the remaining layout parameters */ + layoutPtr->numStripe = layoutPtr->stripeUnitsPerDisk; + layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit << raidPtr->logBytesPerSector; + layoutPtr->numDataCol = raidPtr->numCol-1; + layoutPtr->dataSectorsPerStripe = layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit; + layoutPtr->numParityCol = 1; + raidPtr->totalSectors = layoutPtr->stripeUnitsPerDisk * layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit; + + return(0); +} + +int rf_GetDefaultNumFloatingReconBuffersRAID4(RF_Raid_t *raidPtr) +{ + return(20); +} + +RF_HeadSepLimit_t rf_GetDefaultHeadSepLimitRAID4(RF_Raid_t *raidPtr) +{ + return(20); +} + +void rf_MapSectorRAID4( + RF_Raid_t *raidPtr, + RF_RaidAddr_t raidSector, + RF_RowCol_t *row, + RF_RowCol_t *col, + RF_SectorNum_t *diskSector, + int remap) +{ + RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit; + *row = 0; + *col = SUID % raidPtr->Layout.numDataCol; + *diskSector = (SUID / (raidPtr->Layout.numDataCol)) * raidPtr->Layout.sectorsPerStripeUnit + + (raidSector % raidPtr->Layout.sectorsPerStripeUnit); +} + +void rf_MapParityRAID4( + RF_Raid_t *raidPtr, + RF_RaidAddr_t raidSector, + RF_RowCol_t *row, + RF_RowCol_t *col, + RF_SectorNum_t *diskSector, + int remap) +{ + RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit; + + *row = 0; + *col = raidPtr->Layout.numDataCol; + *diskSector =(SUID / (raidPtr->Layout.numDataCol)) * raidPtr->Layout.sectorsPerStripeUnit + + (raidSector % raidPtr->Layout.sectorsPerStripeUnit); +} + +void rf_IdentifyStripeRAID4( + RF_Raid_t *raidPtr, + RF_RaidAddr_t addr, + RF_RowCol_t **diskids, + RF_RowCol_t *outRow) +{ + RF_Raid4ConfigInfo_t *info = raidPtr->Layout.layoutSpecificInfo; + + *outRow = 0; + *diskids = info->stripeIdentifier; +} + +void rf_MapSIDToPSIDRAID4( + RF_RaidLayout_t *layoutPtr, + RF_StripeNum_t stripeID, + RF_StripeNum_t *psID, + RF_ReconUnitNum_t *which_ru) +{ + *which_ru = 0; + *psID = stripeID; +} diff --git a/sys/dev/raidframe/rf_raid4.h b/sys/dev/raidframe/rf_raid4.h new file mode 100644 index 00000000000..81f8e5375d3 --- /dev/null +++ b/sys/dev/raidframe/rf_raid4.h @@ -0,0 +1,109 @@ +/* $OpenBSD: rf_raid4.h,v 1.1 1999/01/11 14:29:43 niklas Exp $ */ +/* $NetBSD: rf_raid4.h,v 1.1 1998/11/13 04:20:33 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Rachad Youssef + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 
+ * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* rf_raid4.h header file for RAID Level 4 */ + +/* + * : + * Log: rf_raid4.h,v + * Revision 1.15 1996/07/27 23:36:08 jimz + * Solaris port of simulator + * + * Revision 1.14 1996/07/13 00:00:59 jimz + * sanitized generalized reconstruction architecture + * cleaned up head sep, rbuf problems + * + * Revision 1.13 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). + * + * Revision 1.12 1996/06/07 22:26:27 jimz + * type-ify which_ru (RF_ReconUnitNum_t) + * + * Revision 1.11 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.10 1996/06/03 23:28:26 jimz + * more bugfixes + * check in tree to sync for IPDS runs with current bugfixes + * there still may be a problem with threads in the script test + * getting I/Os stuck- not trivially reproducible (runs ~50 times + * in a row without getting stuck) + * + * Revision 1.9 1996/05/31 22:26:54 jimz + * fix a lot of mapping problems, memory allocation problems + * found some weird lock issues, fixed 'em + * more code cleanup + * + * Revision 1.8 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.7 1996/05/24 01:59:45 jimz + * another checkpoint in code cleanup for release + * time to sync kernel tree + * + * Revision 1.6 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.5 1995/12/06 15:07:03 root + * added copyright info + * + * Revision 1.4 1995/11/17 18:58:46 wvcii + * added prototyping to MapParity + * + * Revision 1.3 1995/06/23 13:38:46 robby + * updeated to prototypes in rf_layout.h + * + */ + +#ifndef _RF__RF_RAID4_H_ +#define _RF__RF_RAID4_H_ + +int rf_ConfigureRAID4(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr, + RF_Config_t *cfgPtr); +int rf_GetDefaultNumFloatingReconBuffersRAID4(RF_Raid_t *raidPtr); +RF_HeadSepLimit_t rf_GetDefaultHeadSepLimitRAID4(RF_Raid_t *raidPtr); +void rf_MapSectorRAID4(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector, + RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap); +void rf_MapParityRAID4(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector, + RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap); +void rf_IdentifyStripeRAID4(RF_Raid_t *raidPtr, RF_RaidAddr_t addr, + RF_RowCol_t **diskids, RF_RowCol_t *outRow); +void rf_MapSIDToPSIDRAID4(RF_RaidLayout_t *layoutPtr, + RF_StripeNum_t stripeID, RF_StripeNum_t *psID, + RF_ReconUnitNum_t *which_ru); +void rf_RAID4DagSelect(RF_Raid_t *raidPtr, RF_IoType_t type, + RF_AccessStripeMap_t *asmap, RF_VoidFuncPtr *createFunc); + +#endif /* !_RF__RF_RAID4_H_ */ diff --git a/sys/dev/raidframe/rf_raid5.c b/sys/dev/raidframe/rf_raid5.c new file mode 100644 index 00000000000..febb9f51f44 --- /dev/null +++ b/sys/dev/raidframe/rf_raid5.c @@ -0,0 +1,403 @@ +/* $OpenBSD: rf_raid5.c,v 1.1 1999/01/11 14:29:43 niklas Exp $ */ +/* $NetBSD: 
rf_raid5.c,v 1.1 1998/11/13 04:20:33 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/****************************************************************************** + * + * rf_raid5.c -- implements RAID Level 5 + * + *****************************************************************************/ + +/* + * : + * Log: rf_raid5.c,v + * Revision 1.26 1996/11/05 21:10:40 jimz + * failed pda generalization + * + * Revision 1.25 1996/07/31 16:56:18 jimz + * dataBytesPerStripe, sectorsPerDisk init arch-indep. + * + * Revision 1.24 1996/07/18 22:57:14 jimz + * port simulator to AIX + * + * Revision 1.23 1996/07/13 00:00:59 jimz + * sanitized generalized reconstruction architecture + * cleaned up head sep, rbuf problems + * + * Revision 1.22 1996/06/11 08:54:27 jimz + * improved error-checking at configuration time + * + * Revision 1.21 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). + * + * Revision 1.20 1996/06/07 22:26:27 jimz + * type-ify which_ru (RF_ReconUnitNum_t) + * + * Revision 1.19 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.18 1996/06/05 18:06:02 jimz + * Major code cleanup. The Great Renaming is now done. + * Better modularity. Better typing. Fixed a bunch of + * synchronization bugs. Made a lot of global stuff + * per-desc or per-array. Removed dead code. + * + * Revision 1.17 1996/06/03 23:28:26 jimz + * more bugfixes + * check in tree to sync for IPDS runs with current bugfixes + * there still may be a problem with threads in the script test + * getting I/Os stuck- not trivially reproducible (runs ~50 times + * in a row without getting stuck) + * + * Revision 1.16 1996/06/02 17:31:48 jimz + * Moved a lot of global stuff into array structure, where it belongs. + * Fixed up paritylogging, pss modules in this manner. Some general + * code cleanup. Removed lots of dead code, some dead files. 
+ * + * Revision 1.15 1996/05/31 22:26:54 jimz + * fix a lot of mapping problems, memory allocation problems + * found some weird lock issues, fixed 'em + * more code cleanup + * + * Revision 1.14 1996/05/30 23:22:16 jimz + * bugfixes of serialization, timing problems + * more cleanup + * + * Revision 1.13 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.12 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.11 1996/05/24 01:59:45 jimz + * another checkpoint in code cleanup for release + * time to sync kernel tree + * + * Revision 1.10 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.9 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.8 1996/05/03 19:38:58 wvcii + * moved dag creation routines to dag library + * + * Revision 1.7 1995/12/12 18:10:06 jimz + * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT + * fix 80-column brain damage in comments + * + * Revision 1.6 1995/12/06 15:04:28 root + * added copyright info + * + * Revision 1.5 1995/11/17 18:59:41 wvcii + * added prototyping to MapParity + * + * Revision 1.4 1995/06/23 13:38:21 robby + * updeated to prototypes in rf_layout.h + * + */ + +#include "rf_types.h" +#include "rf_raid.h" +#include "rf_raid5.h" +#include "rf_dag.h" +#include "rf_dagffrd.h" +#include "rf_dagffwr.h" +#include "rf_dagdegrd.h" +#include "rf_dagdegwr.h" +#include "rf_dagutils.h" +#include "rf_threadid.h" +#include "rf_general.h" +#include "rf_map.h" +#include "rf_utils.h" + +typedef struct RF_Raid5ConfigInfo_s { + RF_RowCol_t **stripeIdentifier; /* filled in at config time and used by IdentifyStripe */ +} RF_Raid5ConfigInfo_t; + +int rf_ConfigureRAID5( + RF_ShutdownList_t **listp, + RF_Raid_t *raidPtr, + RF_Config_t *cfgPtr) +{ + RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; + RF_Raid5ConfigInfo_t *info; + RF_RowCol_t i, j, startdisk; + + /* create a RAID level 5 configuration structure */ + RF_MallocAndAdd(info, sizeof(RF_Raid5ConfigInfo_t), (RF_Raid5ConfigInfo_t *), raidPtr->cleanupList); + if (info == NULL) + return(ENOMEM); + layoutPtr->layoutSpecificInfo = (void *) info; + + RF_ASSERT(raidPtr->numRow == 1); + + /* the stripe identifier must identify the disks in each stripe, + * IN THE ORDER THAT THEY APPEAR IN THE STRIPE. 
+ */ + info->stripeIdentifier = rf_make_2d_array(raidPtr->numCol, raidPtr->numCol, raidPtr->cleanupList); + if (info->stripeIdentifier == NULL) + return(ENOMEM); + startdisk = 0; + for (i=0; i<raidPtr->numCol; i++) { + for (j=0; j<raidPtr->numCol; j++) { + info->stripeIdentifier[i][j] = (startdisk + j) % raidPtr->numCol; + } + if ((--startdisk) < 0) startdisk = raidPtr->numCol-1; + } + + /* fill in the remaining layout parameters */ + layoutPtr->numStripe = layoutPtr->stripeUnitsPerDisk; + layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit << raidPtr->logBytesPerSector; + layoutPtr->numDataCol = raidPtr->numCol-1; + layoutPtr->dataSectorsPerStripe = layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit; + layoutPtr->numParityCol = 1; + layoutPtr->dataStripeUnitsPerDisk = layoutPtr->stripeUnitsPerDisk; + + raidPtr->totalSectors = layoutPtr->stripeUnitsPerDisk * layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit; + + return(0); +} + +int rf_GetDefaultNumFloatingReconBuffersRAID5(RF_Raid_t *raidPtr) +{ + return(20); +} + +RF_HeadSepLimit_t rf_GetDefaultHeadSepLimitRAID5(RF_Raid_t *raidPtr) +{ + return(10); +} + +#if !defined(__NetBSD__) && !defined(__OpenBSD__) && !defined(_KERNEL) +/* not currently used */ +int rf_ShutdownRAID5(RF_Raid_t *raidPtr) +{ + return(0); +} +#endif + +void rf_MapSectorRAID5( + RF_Raid_t *raidPtr, + RF_RaidAddr_t raidSector, + RF_RowCol_t *row, + RF_RowCol_t *col, + RF_SectorNum_t *diskSector, + int remap) +{ + RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit; + *row = 0; + *col = (SUID % raidPtr->numCol); + *diskSector = (SUID / (raidPtr->Layout.numDataCol)) * raidPtr->Layout.sectorsPerStripeUnit + + (raidSector % raidPtr->Layout.sectorsPerStripeUnit); +} + +void rf_MapParityRAID5( + RF_Raid_t *raidPtr, + RF_RaidAddr_t raidSector, + RF_RowCol_t *row, + RF_RowCol_t *col, + RF_SectorNum_t *diskSector, + int remap) +{ + RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit; + + *row = 0; + *col = raidPtr->Layout.numDataCol-(SUID/raidPtr->Layout.numDataCol)%raidPtr->numCol; + *diskSector =(SUID / (raidPtr->Layout.numDataCol)) * raidPtr->Layout.sectorsPerStripeUnit + + (raidSector % raidPtr->Layout.sectorsPerStripeUnit); +} + +void rf_IdentifyStripeRAID5( + RF_Raid_t *raidPtr, + RF_RaidAddr_t addr, + RF_RowCol_t **diskids, + RF_RowCol_t *outRow) +{ + RF_StripeNum_t stripeID = rf_RaidAddressToStripeID(&raidPtr->Layout, addr); + RF_Raid5ConfigInfo_t *info = (RF_Raid5ConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo; + + *outRow = 0; + *diskids = info->stripeIdentifier[ stripeID % raidPtr->numCol ]; +} + +void rf_MapSIDToPSIDRAID5( + RF_RaidLayout_t *layoutPtr, + RF_StripeNum_t stripeID, + RF_StripeNum_t *psID, + RF_ReconUnitNum_t *which_ru) +{ + *which_ru = 0; + *psID = stripeID; +} + +/* select an algorithm for performing an access. Returns two pointers, + * one to a function that will return information about the DAG, and + * another to a function that will create the dag. + */ +void rf_RaidFiveDagSelect( + RF_Raid_t *raidPtr, + RF_IoType_t type, + RF_AccessStripeMap_t *asmap, + RF_VoidFuncPtr *createFunc) +{ + RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); + RF_PhysDiskAddr_t *failedPDA=NULL; + RF_RowCol_t frow, fcol; + RF_RowStatus_t rstat; + int prior_recon; + int tid; + + RF_ASSERT(RF_IO_IS_R_OR_W(type)); + + if (asmap->numDataFailed + asmap->numParityFailed > 1) { + RF_ERRORMSG("Multiple disks failed in a single group! 
Aborting I/O operation.\n"); + /* *infoFunc = */ *createFunc = NULL; + return; + } else if (asmap->numDataFailed + asmap->numParityFailed == 1) { + + /* if under recon & already reconstructed, redirect the access to the spare drive + * and eliminate the failure indication + */ + failedPDA = asmap->failedPDAs[0]; + frow = failedPDA->row; fcol = failedPDA->col; + rstat = raidPtr->status[failedPDA->row]; + prior_recon = (rstat == rf_rs_reconfigured) || ( + (rstat == rf_rs_reconstructing) ? + rf_CheckRUReconstructed(raidPtr->reconControl[frow]->reconMap, failedPDA->startSector) : 0 + ); + if (prior_recon) { + RF_RowCol_t or = failedPDA->row,oc=failedPDA->col; + RF_SectorNum_t oo=failedPDA->startSector; + + if (layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) { /* redirect to dist spare space */ + + if (failedPDA == asmap->parityInfo) { + + /* parity has failed */ + (layoutPtr->map->MapParity)(raidPtr, failedPDA->raidAddress, &failedPDA->row, + &failedPDA->col, &failedPDA->startSector, RF_REMAP); + + if (asmap->parityInfo->next) { /* redir 2nd component, if any */ + RF_PhysDiskAddr_t *p = asmap->parityInfo->next; + RF_SectorNum_t SUoffs = p->startSector % layoutPtr->sectorsPerStripeUnit; + p->row = failedPDA->row; + p->col = failedPDA->col; + p->startSector = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, failedPDA->startSector) + + SUoffs; /* cheating: startSector is not really a RAID address */ + } + + } else if (asmap->parityInfo->next && failedPDA == asmap->parityInfo->next) { + RF_ASSERT(0); /* should not ever happen */ + } else { + + /* data has failed */ + (layoutPtr->map->MapSector)(raidPtr, failedPDA->raidAddress, &failedPDA->row, + &failedPDA->col, &failedPDA->startSector, RF_REMAP); + + } + + } else { /* redirect to dedicated spare space */ + + failedPDA->row = raidPtr->Disks[frow][fcol].spareRow; + failedPDA->col = raidPtr->Disks[frow][fcol].spareCol; + + /* the parity may have two distinct components, both of which may need to be redirected */ + if (asmap->parityInfo->next) { + if (failedPDA == asmap->parityInfo) { + failedPDA->next->row = failedPDA->row; + failedPDA->next->col = failedPDA->col; + } else if (failedPDA == asmap->parityInfo->next) { /* paranoid: should never occur */ + asmap->parityInfo->row = failedPDA->row; + asmap->parityInfo->col = failedPDA->col; + } + } + } + + RF_ASSERT(failedPDA->col != -1); + + if (rf_dagDebug || rf_mapDebug) { + rf_get_threadid(tid); + printf("[%d] Redirected type '%c' r %d c %d o %ld -> r %d c %d o %ld\n", + tid,type,or,oc,(long)oo,failedPDA->row,failedPDA->col, + (long)failedPDA->startSector); + } + + asmap->numDataFailed = asmap->numParityFailed = 0; + } + + } + + /* all dags begin/end with block/unblock node + * therefore, hdrSucc & termAnt counts should always be 1 + * also, these counts should not be visible outside dag creation routines - + * manipulating the counts here should be removed */ + if (type == RF_IO_TYPE_READ) { + if (asmap->numDataFailed == 0) + *createFunc = (RF_VoidFuncPtr)rf_CreateFaultFreeReadDAG; + else + *createFunc = (RF_VoidFuncPtr)rf_CreateRaidFiveDegradedReadDAG; + } else { + + + /* if mirroring, always use large writes. If the access requires two + * distinct parity updates, always do a small write. If the stripe + * contains a failure but the access does not, do a small write. 
+ * The first conditional (numStripeUnitsAccessed <= numDataCol/2) uses a + * less-than-or-equal rather than just a less-than because when G is 3 + * or 4, numDataCol/2 is 1, and I want single-stripe-unit updates to use + * just one disk. + */ + if ( (asmap->numDataFailed + asmap->numParityFailed) == 0) { + if (rf_suppressLocksAndLargeWrites || + (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) && (layoutPtr->numDataCol!=1)) || + (asmap->parityInfo->next!=NULL) || rf_CheckStripeForFailures(raidPtr, asmap))) { + *createFunc = (RF_VoidFuncPtr)rf_CreateSmallWriteDAG; + } + else + *createFunc = (RF_VoidFuncPtr)rf_CreateLargeWriteDAG; + } + else { + if (asmap->numParityFailed == 1) + *createFunc = (RF_VoidFuncPtr)rf_CreateNonRedundantWriteDAG; + else + if (asmap->numStripeUnitsAccessed != 1 && failedPDA->numSector != layoutPtr->sectorsPerStripeUnit) + *createFunc = NULL; + else + *createFunc = (RF_VoidFuncPtr)rf_CreateDegradedWriteDAG; + } + } +} diff --git a/sys/dev/raidframe/rf_raid5.h b/sys/dev/raidframe/rf_raid5.h new file mode 100644 index 00000000000..a6ffc971ca4 --- /dev/null +++ b/sys/dev/raidframe/rf_raid5.h @@ -0,0 +1,113 @@ +/* $OpenBSD: rf_raid5.h,v 1.1 1999/01/11 14:29:43 niklas Exp $ */ +/* $NetBSD: rf_raid5.h,v 1.1 1998/11/13 04:20:33 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* rf_raid5.h - header file for RAID Level 5 */ + +/* + * : + * Log: rf_raid5.h,v + * Revision 1.15 1996/07/27 23:36:08 jimz + * Solaris port of simulator + * + * Revision 1.14 1996/07/13 00:00:59 jimz + * sanitized generalized reconstruction architecture + * cleaned up head sep, rbuf problems + * + * Revision 1.13 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). 
+ * + * Revision 1.12 1996/06/07 22:26:27 jimz + * type-ify which_ru (RF_ReconUnitNum_t) + * + * Revision 1.11 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.10 1996/06/03 23:28:26 jimz + * more bugfixes + * check in tree to sync for IPDS runs with current bugfixes + * there still may be a problem with threads in the script test + * getting I/Os stuck- not trivially reproducible (runs ~50 times + * in a row without getting stuck) + * + * Revision 1.9 1996/05/31 22:26:54 jimz + * fix a lot of mapping problems, memory allocation problems + * found some weird lock issues, fixed 'em + * more code cleanup + * + * Revision 1.8 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.7 1996/05/24 01:59:45 jimz + * another checkpoint in code cleanup for release + * time to sync kernel tree + * + * Revision 1.6 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.5 1995/12/06 15:04:35 root + * added copyright info + * + * Revision 1.4 1995/11/17 19:09:08 wvcii + * added prototyping to MapParity + * + * Revision 1.3 1995/11/07 15:25:40 wvcii + * changed RAIDFiveDagSelect prototype + * function no longer generates numHdrSucc, numTermAnt + * + * Revision 1.2 1995/06/23 13:37:53 robby + * updeated to prototypes in rf_layout.h + * + */ + +#ifndef _RF__RF_RAID5_H_ +#define _RF__RF_RAID5_H_ + +int rf_ConfigureRAID5(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr, + RF_Config_t *cfgPtr); +int rf_GetDefaultNumFloatingReconBuffersRAID5(RF_Raid_t *raidPtr); +RF_HeadSepLimit_t rf_GetDefaultHeadSepLimitRAID5(RF_Raid_t *raidPtr); +void rf_MapSectorRAID5(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector, + RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap); +void rf_MapParityRAID5(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector, + RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap); +void rf_IdentifyStripeRAID5(RF_Raid_t *raidPtr, RF_RaidAddr_t addr, + RF_RowCol_t **diskids, RF_RowCol_t *outRow); +void rf_MapSIDToPSIDRAID5(RF_RaidLayout_t *layoutPtr, + RF_StripeNum_t stripeID, RF_StripeNum_t *psID, + RF_ReconUnitNum_t *which_ru); +void rf_RaidFiveDagSelect(RF_Raid_t *raidPtr, RF_IoType_t type, + RF_AccessStripeMap_t *asmap, RF_VoidFuncPtr *createFunc); + +#endif /* !_RF__RF_RAID5_H_ */ diff --git a/sys/dev/raidframe/rf_raid5_rotatedspare.c b/sys/dev/raidframe/rf_raid5_rotatedspare.c new file mode 100644 index 00000000000..ca103f2116a --- /dev/null +++ b/sys/dev/raidframe/rf_raid5_rotatedspare.c @@ -0,0 +1,250 @@ +/* $OpenBSD: rf_raid5_rotatedspare.c,v 1.1 1999/01/11 14:29:44 niklas Exp $ */ +/* $NetBSD: rf_raid5_rotatedspare.c,v 1.1 1998/11/13 04:20:33 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Khalil Amiri + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. 
CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/************************************************************************** + * + * rf_raid5_rotated_spare.c -- implements RAID Level 5 with rotated sparing + * + **************************************************************************/ + +/* : + * Log: rf_raid5_rotatedspare.c,v + * Revision 1.22 1996/07/31 16:56:18 jimz + * dataBytesPerStripe, sectorsPerDisk init arch-indep. + * + * Revision 1.21 1996/07/29 14:05:12 jimz + * fix numPUs/numRUs confusion (everything is now numRUs) + * clean up some commenting, return values + * + * Revision 1.20 1996/07/18 22:57:14 jimz + * port simulator to AIX + * + * Revision 1.19 1996/07/13 00:00:59 jimz + * sanitized generalized reconstruction architecture + * cleaned up head sep, rbuf problems + * + * Revision 1.18 1996/06/19 17:53:48 jimz + * move GetNumSparePUs, InstallSpareTable ops into layout switch + * + * Revision 1.17 1996/06/11 08:54:27 jimz + * improved error-checking at configuration time + * + * Revision 1.16 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). + * + * Revision 1.15 1996/06/07 22:26:27 jimz + * type-ify which_ru (RF_ReconUnitNum_t) + * + * Revision 1.14 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.13 1996/06/03 23:28:26 jimz + * more bugfixes + * check in tree to sync for IPDS runs with current bugfixes + * there still may be a problem with threads in the script test + * getting I/Os stuck- not trivially reproducible (runs ~50 times + * in a row without getting stuck) + * + * Revision 1.12 1996/06/02 17:31:48 jimz + * Moved a lot of global stuff into array structure, where it belongs. + * Fixed up paritylogging, pss modules in this manner. Some general + * code cleanup. Removed lots of dead code, some dead files. 
+ * + * Revision 1.11 1996/05/31 22:26:54 jimz + * fix a lot of mapping problems, memory allocation problems + * found some weird lock issues, fixed 'em + * more code cleanup + * + * Revision 1.10 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.9 1996/05/24 01:59:45 jimz + * another checkpoint in code cleanup for release + * time to sync kernel tree + * + * Revision 1.8 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.7 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.6 1996/05/03 19:48:36 wvcii + * removed include of rf_redstripe.h + * + * Revision 1.5 1995/12/12 18:10:06 jimz + * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT + * fix 80-column brain damage in comments + * + * Revision 1.4 1995/12/06 15:05:53 root + * added copyright info + * + * Revision 1.3 1995/11/19 21:26:29 amiri + * Added an assert to make sure numCol >= 3 + * + * Revision 1.2 1995/11/17 19:03:18 wvcii + * added prototyping to MapParity + * + */ + +#include "rf_raid.h" +#include "rf_raid5.h" +#include "rf_dag.h" +#include "rf_dagutils.h" +#include "rf_dagfuncs.h" +#include "rf_threadid.h" +#include "rf_general.h" +#include "rf_utils.h" +#include "rf_raid5_rotatedspare.h" + +typedef struct RF_Raid5RSConfigInfo_s { + RF_RowCol_t **stripeIdentifier; /* filled in at config time & used by IdentifyStripe */ +} RF_Raid5RSConfigInfo_t; + +int rf_ConfigureRAID5_RS( + RF_ShutdownList_t **listp, + RF_Raid_t *raidPtr, + RF_Config_t *cfgPtr) +{ + RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; + RF_Raid5RSConfigInfo_t *info; + RF_RowCol_t i, j, startdisk; + + /* create a RAID level 5 configuration structure */ + RF_MallocAndAdd(info, sizeof(RF_Raid5RSConfigInfo_t), (RF_Raid5RSConfigInfo_t *), raidPtr->cleanupList); + if (info == NULL) + return(ENOMEM); + layoutPtr->layoutSpecificInfo = (void *) info; + + RF_ASSERT(raidPtr->numRow == 1); + RF_ASSERT(raidPtr->numCol >= 3); + + /* the stripe identifier must identify the disks in each stripe, + * IN THE ORDER THAT THEY APPEAR IN THE STRIPE. 
+ */ + info->stripeIdentifier = rf_make_2d_array(raidPtr->numCol, raidPtr->numCol, raidPtr->cleanupList); + if (info->stripeIdentifier == NULL) + return(ENOMEM); + startdisk = 0; + for (i=0; i<raidPtr->numCol; i++) { + for (j=0; j<raidPtr->numCol; j++) { + info->stripeIdentifier[i][j] = (startdisk + j) % raidPtr->numCol; + } + if ((--startdisk) < 0) startdisk = raidPtr->numCol-1; + } + + /* fill in the remaining layout parameters */ + layoutPtr->numStripe = layoutPtr->stripeUnitsPerDisk; + layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit << raidPtr->logBytesPerSector; + layoutPtr->numDataCol = raidPtr->numCol-2; + layoutPtr->dataSectorsPerStripe = layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit; + layoutPtr->numParityCol = 1; + layoutPtr->dataStripeUnitsPerDisk = layoutPtr->stripeUnitsPerDisk; + raidPtr->sectorsPerDisk = layoutPtr->stripeUnitsPerDisk * layoutPtr->sectorsPerStripeUnit; + + raidPtr->totalSectors = layoutPtr->stripeUnitsPerDisk * layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit; + + return(0); +} + +RF_ReconUnitCount_t rf_GetNumSpareRUsRAID5_RS(raidPtr) + RF_Raid_t *raidPtr; +{ + return ( raidPtr->Layout.stripeUnitsPerDisk / raidPtr->numCol ); +} + +void rf_MapSectorRAID5_RS( + RF_Raid_t *raidPtr, + RF_RaidAddr_t raidSector, + RF_RowCol_t *row, + RF_RowCol_t *col, + RF_SectorNum_t *diskSector, + int remap) +{ + RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit; + + *row = 0; + if (remap) { + *col = raidPtr->numCol-1-(1+SUID/raidPtr->Layout.numDataCol)%raidPtr->numCol; + *col = (*col+1)%raidPtr->numCol; /*spare unit is rotated with parity; line above maps to parity */ + } + else { + *col = ( SUID + (SUID/raidPtr->Layout.numDataCol) ) % raidPtr->numCol; + } + *diskSector = (SUID / (raidPtr->Layout.numDataCol)) * raidPtr->Layout.sectorsPerStripeUnit + + (raidSector % raidPtr->Layout.sectorsPerStripeUnit); +} + +void rf_MapParityRAID5_RS( + RF_Raid_t *raidPtr, + RF_RaidAddr_t raidSector, + RF_RowCol_t *row, + RF_RowCol_t *col, + RF_SectorNum_t *diskSector, + int remap) +{ + RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit; + + *row = 0; + *col = raidPtr->numCol-1-(1+SUID/raidPtr->Layout.numDataCol)%raidPtr->numCol; + *diskSector =(SUID / (raidPtr->Layout.numDataCol)) * raidPtr->Layout.sectorsPerStripeUnit + + (raidSector % raidPtr->Layout.sectorsPerStripeUnit); + if (remap) + *col = (*col+1)%raidPtr->numCol; +} + +void rf_IdentifyStripeRAID5_RS( + RF_Raid_t *raidPtr, + RF_RaidAddr_t addr, + RF_RowCol_t **diskids, + RF_RowCol_t *outRow) +{ + RF_StripeNum_t stripeID = rf_RaidAddressToStripeID(&raidPtr->Layout, addr); + RF_Raid5RSConfigInfo_t *info = (RF_Raid5RSConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo; + *outRow = 0; + *diskids = info->stripeIdentifier[ stripeID % raidPtr->numCol ]; + +} + +void rf_MapSIDToPSIDRAID5_RS( + RF_RaidLayout_t *layoutPtr, + RF_StripeNum_t stripeID, + RF_StripeNum_t *psID, + RF_ReconUnitNum_t *which_ru) +{ + *which_ru = 0; + *psID = stripeID; +} + diff --git a/sys/dev/raidframe/rf_raid5_rotatedspare.h b/sys/dev/raidframe/rf_raid5_rotatedspare.h new file mode 100644 index 00000000000..e144b00f6d0 --- /dev/null +++ b/sys/dev/raidframe/rf_raid5_rotatedspare.h @@ -0,0 +1,105 @@ +/* $OpenBSD: rf_raid5_rotatedspare.h,v 1.1 1999/01/11 14:29:44 niklas Exp $ */ +/* $NetBSD: rf_raid5_rotatedspare.h,v 1.1 1998/11/13 04:20:33 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. 
+ * + * Author: Khalil Amiri + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* rf_raid5_rotatedspare.h - header file for RAID Level 5 with rotated sparing */ + +/* : + * Log: rf_raid5_rotatedspare.h,v + * Revision 1.13 1996/07/29 14:05:12 jimz + * fix numPUs/numRUs confusion (everything is now numRUs) + * clean up some commenting, return values + * + * Revision 1.12 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). + * + * Revision 1.11 1996/06/07 22:26:27 jimz + * type-ify which_ru (RF_ReconUnitNum_t) + * + * Revision 1.10 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.9 1996/06/05 18:06:02 jimz + * Major code cleanup. The Great Renaming is now done. + * Better modularity. Better typing. Fixed a bunch of + * synchronization bugs. Made a lot of global stuff + * per-desc or per-array. Removed dead code. 
+ * + * Revision 1.8 1996/06/03 23:28:26 jimz + * more bugfixes + * check in tree to sync for IPDS runs with current bugfixes + * there still may be a problem with threads in the script test + * getting I/Os stuck- not trivially reproducible (runs ~50 times + * in a row without getting stuck) + * + * Revision 1.7 1996/05/31 22:26:54 jimz + * fix a lot of mapping problems, memory allocation problems + * found some weird lock issues, fixed 'em + * more code cleanup + * + * Revision 1.6 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.5 1996/05/24 01:59:45 jimz + * another checkpoint in code cleanup for release + * time to sync kernel tree + * + * Revision 1.4 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.3 1995/12/06 15:06:00 root + * added copyright info + * + * Revision 1.2 1995/11/17 19:09:54 wvcii + * added prototyping to MapParity + * + */ + +#ifndef _RF__RF_RAID5_ROTATEDSPARE_H_ +#define _RF__RF_RAID5_ROTATEDSPARE_H_ + +int rf_ConfigureRAID5_RS(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr, + RF_Config_t *cfgPtr); +RF_ReconUnitCount_t rf_GetNumSpareRUsRAID5_RS(RF_Raid_t *raidPtr); +void rf_MapSectorRAID5_RS(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector, + RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap); +void rf_MapParityRAID5_RS(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector, + RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap); +void rf_IdentifyStripeRAID5_RS(RF_Raid_t *raidPtr, RF_RaidAddr_t addr, + RF_RowCol_t **diskids, RF_RowCol_t *outRow); +void rf_MapSIDToPSIDRAID5_RS(RF_RaidLayout_t *layoutPtr, + RF_StripeNum_t stripeID, RF_StripeNum_t *psID, + RF_ReconUnitNum_t *which_ru); + +#endif /* !_RF__RF_RAID5_ROTATEDSPARE_H_ */ diff --git a/sys/dev/raidframe/rf_raidframe.h b/sys/dev/raidframe/rf_raidframe.h new file mode 100644 index 00000000000..e316dd09eb4 --- /dev/null +++ b/sys/dev/raidframe/rf_raidframe.h @@ -0,0 +1,165 @@ +/* $OpenBSD: rf_raidframe.h,v 1.1 1999/01/11 14:29:44 niklas Exp $ */ +/* $NetBSD: rf_raidframe.h,v 1.1 1998/11/13 04:20:33 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/***************************************************** + * + * rf_raidframe.h + * + * main header file for using raidframe in the kernel. 
+ * + *****************************************************/ + +/* + * : + * + * Log: rf_raidframe.h,v + * Revision 1.21 1996/06/17 03:00:15 jimz + * Change RAIDFRAME_GET_INFO interface to work around ioctl + * size limitation problem. This operation now takes a pointer + * to a pointer, and does its own copyout() (so it can transfer + * more than 8k at a time). + * + * Revision 1.20 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.19 1996/06/03 23:28:26 jimz + * more bugfixes + * check in tree to sync for IPDS runs with current bugfixes + * there still may be a problem with threads in the script test + * getting I/Os stuck- not trivially reproducible (runs ~50 times + * in a row without getting stuck) + * + * Revision 1.18 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.17 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.16 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.15 1996/05/02 22:09:48 jimz + * change devs and spares in device_config to RF_RaidDisk_t + * + * Revision 1.14 1995/12/06 15:03:33 root + * added copyright info + * + * Revision 1.13 1995/09/30 20:39:54 jimz + * added new ioctls: + * RAIDFRAME_RESET_ACCTOTALS + * RAIDFRAME_GET_ACCTOTALS + * RAIDFRAME_KEEP_ACCTOTALS + * + * Revision 1.12 1995/09/25 20:11:51 wvcii + * Added #include "rf_raid.h" + * + * + */ + +#ifndef _RF__RF_RAIDFRAME_H_ +#define _RF__RF_RAIDFRAME_H_ + +#include "rf_types.h" +#include "rf_configure.h" +#include "rf_disks.h" +#include "rf_raid.h" + +struct rf_test_acc { /* used by RAIDFRAME_TEST_ACC ioctl */ + RF_SectorNum_t startSector; /* raidAddress */ + RF_SectorCount_t numSector; /* number of sectors to xfer */ + char *buf; /* data buffer */ + void *returnBufs[10]; /* for async accs only, completed I/Os returned */ + struct rf_test_acc *next; /* for making lists */ + RF_IoType_t type; /* (see rf_types.h for RF_IO_TYPE_*) */ + struct rf_test_acc *myaddr; /* user-address of this struct */ + void *bp; /* used in-kernel: need not be set by user */ +}; + +typedef RF_uint32 RF_ReconReqFlags_t; + +struct rf_recon_req { /* used to tell the kernel to fail a disk */ + RF_RowCol_t row, col; + RF_ReconReqFlags_t flags; + void *raidPtr; /* used internally; need not be set at ioctl time */ + struct rf_recon_req *next; /* used internally; need not be set at ioctl time */ +}; + +struct RF_SparetWait_s { + int C, G, fcol; /* C = # disks in row, G = # units in stripe, fcol = which disk has failed */ + + RF_StripeCount_t SUsPerPU; /* this stuff is the info required to create a spare table */ + int TablesPerSpareRegion; + int BlocksPerTable; + RF_StripeCount_t TableDepthInPUs; + RF_StripeCount_t SpareSpaceDepthPerRegionInSUs; + + RF_SparetWait_t *next; /* used internally; need not be set at ioctl time */ +}; + +typedef struct RF_DeviceConfig_s { + u_int rows; + u_int cols; + u_int maxqdepth; + int ndevs; + RF_RaidDisk_t devs[RF_MAX_DISKS]; + int nspares; + RF_RaidDisk_t spares[RF_MAX_DISKS]; +} RF_DeviceConfig_t; + + +/* flags that can be put in the rf_recon_req structure */ +#define RF_FDFLAGS_NONE 0x0 /* just fail the disk */ +#define RF_FDFLAGS_RECON 0x1 /* fail and initiate recon */ + +#define RF_SCSI_DISK_MAJOR 8 /* the device major number 
for disks in the system */ + +#define RAIDFRAME_CONFIGURE _IOW ('r', 1, void *) /* configure the driver */ +#define RAIDFRAME_SHUTDOWN _IO ('r', 2) /* shutdown the driver */ +#define RAIDFRAME_TUR _IOW ('r', 3, dev_t) /* debug only: test unit ready */ +#define RAIDFRAME_TEST_ACC _IOWR('r', 4, struct rf_test_acc) /* run a test access */ +#define RAIDFRAME_FAIL_DISK _IOW ('r', 5, struct rf_recon_req) /* fail a disk & optionally start recon */ +#define RAIDFRAME_CHECKRECON _IOWR('r', 6, int) /* get reconstruction % complete on indicated row */ +#define RAIDFRAME_REWRITEPARITY _IO ('r', 7) /* rewrite (initialize) all parity */ +#define RAIDFRAME_COPYBACK _IO ('r', 8) /* copy reconstructed data back to replaced disk */ +#define RAIDFRAME_SPARET_WAIT _IOR ('r', 9, RF_SparetWait_t) /* does not return until kernel needs a spare table */ +#define RAIDFRAME_SEND_SPARET _IOW ('r', 10, void *) /* used to send a spare table down into the kernel */ +#define RAIDFRAME_ABORT_SPARET_WAIT _IO ('r', 11) /* used to wake up the sparemap daemon & tell it to exit */ +#define RAIDFRAME_START_ATRACE _IO ('r', 12) /* start tracing accesses */ +#define RAIDFRAME_STOP_ATRACE _IO ('r', 13) /* stop tracing accesses */ +#define RAIDFRAME_GET_SIZE _IOR ('r', 14, int) /* get size (# sectors) in raid device */ +#define RAIDFRAME_GET_INFO _IOWR('r', 15, RF_DeviceConfig_t *) /* get configuration */ +#define RAIDFRAME_RESET_ACCTOTALS _IO ('r', 16) /* reset AccTotals for device */ +#define RAIDFRAME_GET_ACCTOTALS _IOR ('r', 17, RF_AccTotals_t) /* retrieve AccTotals for device */ +#define RAIDFRAME_KEEP_ACCTOTALS _IOW ('r', 18, int) /* turn AccTotals on or off for device */ + +#endif /* !_RF__RF_RAIDFRAME_H_ */ diff --git a/sys/dev/raidframe/rf_randmacros.h b/sys/dev/raidframe/rf_randmacros.h new file mode 100644 index 00000000000..c3536e0c613 --- /dev/null +++ b/sys/dev/raidframe/rf_randmacros.h @@ -0,0 +1,228 @@ +/* $OpenBSD: rf_randmacros.h,v 1.1 1999/01/11 14:29:45 niklas Exp $ */ +/* $NetBSD: rf_randmacros.h,v 1.1 1998/11/13 04:20:33 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. 
+ */ + +/* rf_randmacros.h + * some macros to simplify using random in a multithreaded environment + */ + +/* : + * Log: rf_randmacros.h,v + * Revision 1.17 1996/08/12 22:37:57 jimz + * use regular random() stuff for AIX + * + * Revision 1.16 1996/08/11 00:41:03 jimz + * fix up for aix4 + * + * Revision 1.15 1996/07/29 05:22:34 jimz + * use rand/srand on hpux + * + * Revision 1.14 1996/07/28 20:31:39 jimz + * i386netbsd port + * true/false fixup + * + * Revision 1.13 1996/07/27 23:36:08 jimz + * Solaris port of simulator + * + * Revision 1.12 1996/07/22 19:52:16 jimz + * switched node params to RF_DagParam_t, a union of + * a 64-bit int and a void *, for better portability + * attempted hpux port, but failed partway through for + * lack of a single C compiler capable of compiling all + * source files + * + * Revision 1.11 1996/07/18 22:57:14 jimz + * port simulator to AIX + * + * Revision 1.10 1996/07/15 17:22:18 jimz + * nit-pick code cleanup + * resolve stdlib problems on DEC OSF + * + * Revision 1.9 1996/06/09 02:36:46 jimz + * lots of little crufty cleanup- fixup whitespace + * issues, comment #ifdefs, improve typing in some + * places (esp size-related) + * + * Revision 1.8 1996/06/03 23:28:26 jimz + * more bugfixes + * check in tree to sync for IPDS runs with current bugfixes + * there still may be a problem with threads in the script test + * getting I/Os stuck- not trivially reproducible (runs ~50 times + * in a row without getting stuck) + * + * Revision 1.7 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.6 1996/05/21 18:52:56 jimz + * mask out highest bit from RANDOM (was causing angst) + * + * Revision 1.5 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.4 1995/12/06 15:05:41 root + * added copyright info + * + */ + +#ifndef _RF__RF_RANDMACROS_H_ +#define _RF__RF_RANDMACROS_H_ + +#ifndef KERNEL + +#ifdef __osf__ +/* + * Okay, here's the deal. The DEC man page for initstate_r() sez: + * + * int initstate_r(unsigned seed, char *state, int size, char **retval, + * struct random_data *rand_data); + * + * That wouldn't bug me so much, if /usr/include/random.h on the alpha + * didn't say: + * + * int initstate_r(unsigned, char *, int, RANDMOD *); + * + * Most of the other random functions have similar problems (docs + * don't match random.h). This is the case for random_r(), for + * instance. Generally, I'm inclined to trust the code over the + * documentation. Problem is, I have no clue what the arguments for + * the prototyped versions are, since they don't have descriptive names + * comma the bastards. + * + * Update: I looked at the DU sources to get this straightened out. + * The docs are correct. and everything in random.h is wrong. Uh, that's + * really cool or something. Not. I'm going to try slapping in prototypes + * that match my view of the universe, here. + * + * Okay, now let's have some more fun. /usr/include/stdlib.h also defines + * all this stuff, only differently. I mean differently from random.h, + * _and_ differently from the source. How cool is _that_? 
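+ *
+ * [Editor's note, not part of the original import: whichever underlying
+ * generator ends up being used, the RF_*RANDOM macros defined below are
+ * meant to be composed the same way in user-level (non-KERNEL) code:
+ * declare per-invocation state, seed it, then draw values.  A minimal
+ * sketch, with a hypothetical caller:
+ *
+ *   static long draw_one(unsigned seed)
+ *   {
+ *       RF_DECLARE_RANDOM;       // private generator state for this call
+ *       long r;
+ *
+ *       RF_INIT_RANDOM(seed);    // seed the private state
+ *       r = RF_RANDOM();         // draw without touching shared state
+ *       return (r);
+ *   }
+ *
+ * Depending on the platform and on whether SIMULATE is defined, these
+ * macros expand to the re-entrant initstate_r()/random_r() pair, to
+ * random()/srandom(), or to rand()/srand() (sun, hpux).]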
+ * + * --jimz + */ +#ifndef _NO_PROTO +#define _NO_PROTO +#define _RF_SPANKME +#endif /* !_NO_PROTO */ +#include <random.h> +#ifdef _RF_SPANKME +#undef _NO_PROTO +#undef _RF_SPANKME +#endif /* _RF_SPANKME */ + +extern int initstate_r(unsigned seed, char *arg_state, int n, char **retval, + struct random_data *rand_data); +extern int random_r(int *retval, struct random_data *rand_data); +#endif /* __osf__ */ +#ifdef SIMULATE +#if defined(DEC_OSF) || defined(hpux) +extern int random(void); +extern int srandom(unsigned); +#endif /* DEC_OSF || hpux */ +#if defined(AIX) && RF_AIXVERS == 3 +extern int random(void); +extern int srandom(unsigned); +#endif /* AIX && RF_AIXVERS == 3 */ +#endif /* SIMULATE */ + +#define RF_FASTRANDOM 0 /* when >0 make RANDOM a macro instead of a function */ + +#ifdef __osf__ +long rf_do_random(long *rval, struct random_data *rdata); /* in utils.c */ +#endif /* __osf__ */ + +#ifndef SIMULATE + +#ifdef __osf__ +/* + * Mark's original comment about this rigamarole was, "What a pile of crap." + */ +#define RF_DECLARE_RANDOM \ + struct random_data randdata; \ + long randstate[64+1]; \ + char *stptr = ((char *) randstate)+4; \ + char *randst; \ + long randval + +#define RF_DECLARE_STATIC_RANDOM \ + static struct random_data randdata_st; \ + static long randstate_st[64+1]; \ + static char *stptr_st = ((char *) randstate_st)+4; \ + static char *randst_st; \ + long randval_st; + +#define RF_INIT_RANDOM(_s_) \ + randdata.state = NULL; \ + initstate_r((unsigned) (_s_), stptr, 64, &randst, &randdata); + +#define RF_INIT_STATIC_RANDOM(_s_) \ + randdata_st.state = NULL; \ + initstate_r((unsigned) (_s_), stptr_st, 64, &randst_st, &randdata_st); + +#if RF_FASTRANDOM > 0 +#define RF_RANDOM() (random_r(&randval, &randdata),randval) +#define RF_STATIC_RANDOM() (random_r(&randval_st, &randdata_st),randval_st) +#else /* RF_FASTRANDOM > 0 */ +#define RF_RANDOM() (rf_do_random(&randval, &randdata)&0x7fffffffffffffff) +#define RF_STATIC_RANDOM() rf_do_random(&randval_st, &randdata_st) +#endif /* RF_FASTRANDOM > 0 */ + +#define RF_SRANDOM(_s_) srandom_r((_s_), &randdata) +#define RF_STATIC_SRANDOM(_s_) srandom_r((_s_), &randdata_st) +#endif /* __osf__ */ + +#ifdef AIX +#define RF_INIT_STATIC_RANDOM(_s_) +#define RF_DECLARE_STATIC_RANDOM static int rf_rand_decl##__LINE__ +#define RF_DECLARE_RANDOM int rf_rand_decl##__LINE__ +#define RF_RANDOM() random() +#define RF_STATIC_RANDOM() random() +#define RF_INIT_RANDOM(_n_) srandom(_n_) +#endif /* AIX */ + +#else /* !SIMULATE */ + +#define RF_INIT_STATIC_RANDOM(_s_) +#define RF_DECLARE_STATIC_RANDOM static int rf_rand_decl##__LINE__ +#define RF_DECLARE_RANDOM int rf_rand_decl##__LINE__ +#if defined(sun) || defined(hpux) +#define RF_RANDOM() rand() +#define RF_STATIC_RANDOM() rand() +#define RF_INIT_RANDOM(_n_) srand(_n_) +#else /* sun || hpux */ +#define RF_RANDOM() random() +#define RF_STATIC_RANDOM() random() +#define RF_INIT_RANDOM(_n_) srandom(_n_) +#endif /* sun || hpux */ + +#endif /* !SIMULATE */ + +#endif /* !KERNEL */ + +#endif /* !_RF__RF_RANDMACROS_H_ */ diff --git a/sys/dev/raidframe/rf_reconbuffer.c b/sys/dev/raidframe/rf_reconbuffer.c new file mode 100644 index 00000000000..2c24e47c111 --- /dev/null +++ b/sys/dev/raidframe/rf_reconbuffer.c @@ -0,0 +1,538 @@ +/* $OpenBSD: rf_reconbuffer.c,v 1.1 1999/01/11 14:29:45 niklas Exp $ */ +/* $NetBSD: rf_reconbuffer.c,v 1.1 1998/11/13 04:20:33 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. 
+ * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/*************************************************** + * + * rf_reconbuffer.c -- reconstruction buffer manager + * + ***************************************************/ + +/* : + * Log: rf_reconbuffer.c,v + * Revision 1.33 1996/07/27 23:36:08 jimz + * Solaris port of simulator + * + * Revision 1.32 1996/07/17 21:00:58 jimz + * clean up timer interface, tracing + * + * Revision 1.31 1996/07/13 00:00:59 jimz + * sanitized generalized reconstruction architecture + * cleaned up head sep, rbuf problems + * + * Revision 1.30 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.29 1996/06/06 01:23:58 jimz + * don't free reconCtrlPtr until after all fields have been used out of it + * + * Revision 1.28 1996/06/05 18:06:02 jimz + * Major code cleanup. The Great Renaming is now done. + * Better modularity. Better typing. Fixed a bunch of + * synchronization bugs. Made a lot of global stuff + * per-desc or per-array. Removed dead code. + * + * Revision 1.27 1996/06/03 23:28:26 jimz + * more bugfixes + * check in tree to sync for IPDS runs with current bugfixes + * there still may be a problem with threads in the script test + * getting I/Os stuck- not trivially reproducible (runs ~50 times + * in a row without getting stuck) + * + * Revision 1.26 1996/06/02 17:31:48 jimz + * Moved a lot of global stuff into array structure, where it belongs. + * Fixed up paritylogging, pss modules in this manner. Some general + * code cleanup. Removed lots of dead code, some dead files. 
+ * + * Revision 1.25 1996/05/31 22:26:54 jimz + * fix a lot of mapping problems, memory allocation problems + * found some weird lock issues, fixed 'em + * more code cleanup + * + * Revision 1.24 1996/05/30 12:59:18 jimz + * make etimer happier, more portable + * + * Revision 1.23 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.22 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.21 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.20 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.19 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.18 1995/12/12 18:10:06 jimz + * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT + * fix 80-column brain damage in comments + * + * Revision 1.17 1995/12/06 15:03:24 root + * added copyright info + * + */ + +#ifdef _KERNEL +#define KERNEL +#endif + +#include "rf_raid.h" +#include "rf_reconbuffer.h" +#include "rf_acctrace.h" +#include "rf_etimer.h" +#include "rf_general.h" +#include "rf_debugprint.h" +#include "rf_revent.h" +#include "rf_reconutil.h" +#include "rf_nwayxor.h" + +#ifdef KERNEL +#define Dprintf1(s,a) if (rf_reconbufferDebug) printf(s,a) +#define Dprintf2(s,a,b) if (rf_reconbufferDebug) printf(s,a,b) +#define Dprintf3(s,a,b,c) if (rf_reconbufferDebug) printf(s,a,b,c) +#define Dprintf4(s,a,b,c,d) if (rf_reconbufferDebug) printf(s,a,b,c,d) +#define Dprintf5(s,a,b,c,d,e) if (rf_reconbufferDebug) printf(s,a,b,c,d,e) +#else /* KERNEL */ +#define Dprintf1(s,a) if (rf_reconbufferDebug) rf_debug_printf(s,(void *)((unsigned long)a),NULL,NULL,NULL,NULL,NULL,NULL,NULL) +#define Dprintf2(s,a,b) if (rf_reconbufferDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),NULL,NULL,NULL,NULL,NULL,NULL) +#define Dprintf3(s,a,b,c) if (rf_reconbufferDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),NULL,NULL,NULL,NULL,NULL) +#define Dprintf4(s,a,b,c,d) if (rf_reconbufferDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),NULL,NULL,NULL,NULL) +#define Dprintf5(s,a,b,c,d,e) if (rf_reconbufferDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),NULL,NULL,NULL) +#endif /* KERNEL */ + +#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL) + +/* XXX XXX XXX This is wrong, for a number of reasons: + a) thread_block doesn't exist with UVM + b) The prototype begin used here is wrong for the regular VM + (regular VM expects a (char *) as an argument. I don't put + that in here as this code uses thread_block with no arguments.. :-/ + +*/ +#if 0 +void thread_block(void); +#endif +#endif + +/***************************************************************************************** + * + * Submit a reconstruction buffer to the manager for XOR. 
+ * We can only submit a buffer if (1) we can xor into an existing buffer, which means + * we don't have to acquire a new one, (2) we can acquire a floating + * recon buffer, or (3) the caller has indicated that we are allowed to keep the + * submitted buffer. + * + * Returns non-zero if and only if we were not able to submit. + * In this case, we append the current disk ID to the wait list on the indicated + * RU, so that it will be re-enabled when we acquire a buffer for this RU. + * + ****************************************************************************************/ + +/* just to make the code below more readable */ +#define BUFWAIT_APPEND(_cb_, _pssPtr_, _row_, _col_) \ + _cb_ = rf_AllocCallbackDesc(); \ + (_cb_)->row = (_row_); (_cb_)->col = (_col_); (_cb_)->next = (_pssPtr_)->bufWaitList; (_pssPtr_)->bufWaitList = (_cb_); + +/* + * nWayXorFuncs[i] is a pointer to a function that will xor "i" + * bufs into the accumulating sum. + */ +static RF_VoidFuncPtr nWayXorFuncs[] = { + NULL, + (RF_VoidFuncPtr)rf_nWayXor1, + (RF_VoidFuncPtr)rf_nWayXor2, + (RF_VoidFuncPtr)rf_nWayXor3, + (RF_VoidFuncPtr)rf_nWayXor4, + (RF_VoidFuncPtr)rf_nWayXor5, + (RF_VoidFuncPtr)rf_nWayXor6, + (RF_VoidFuncPtr)rf_nWayXor7, + (RF_VoidFuncPtr)rf_nWayXor8, + (RF_VoidFuncPtr)rf_nWayXor9 +}; + +int rf_SubmitReconBuffer(rbuf, keep_it, use_committed) + RF_ReconBuffer_t *rbuf; /* the recon buffer to submit */ + int keep_it; /* whether we can keep this buffer or we have to return it */ + int use_committed; /* whether to use a committed or an available recon buffer */ +{ + RF_LayoutSW_t *lp; + int rc; + + lp = rbuf->raidPtr->Layout.map; + rc = lp->SubmitReconBuffer(rbuf, keep_it, use_committed); + return(rc); +} + +int rf_SubmitReconBufferBasic(rbuf, keep_it, use_committed) + RF_ReconBuffer_t *rbuf; /* the recon buffer to submit */ + int keep_it; /* whether we can keep this buffer or we have to return it */ + int use_committed; /* whether to use a committed or an available recon buffer */ +{ + RF_Raid_t *raidPtr = rbuf->raidPtr; + RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; + RF_ReconCtrl_t *reconCtrlPtr = raidPtr->reconControl[rbuf->row]; + RF_ReconParityStripeStatus_t *pssPtr; + RF_ReconBuffer_t *targetRbuf, *t = NULL; /* temporary rbuf pointers */ + caddr_t ta; /* temporary data buffer pointer */ + RF_CallbackDesc_t *cb, *p; + int retcode = 0, created = 0; + + RF_Etimer_t timer; + + /* makes no sense to have a submission from the failed disk */ + RF_ASSERT(rbuf); + RF_ASSERT(rbuf->col != reconCtrlPtr->fcol); + + Dprintf5("RECON: submission by row %d col %d for psid %ld ru %d (failed offset %ld)\n", + rbuf->row, rbuf->col, (long)rbuf->parityStripeID, rbuf->which_ru, (long)rbuf->failedDiskSectorOffset); + + RF_LOCK_PSS_MUTEX(raidPtr,rbuf->row,rbuf->parityStripeID); + + RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex); + + pssPtr = rf_LookupRUStatus(raidPtr, reconCtrlPtr->pssTable, rbuf->parityStripeID, rbuf->which_ru, RF_PSS_NONE, &created); + RF_ASSERT(pssPtr); /* if it didn't exist, we wouldn't have gotten an rbuf for it */ + + /* check to see if enough buffers have accumulated to do an XOR. If so, there's no need to + * acquire a floating rbuf. Before we can do any XORing, we must have acquired a destination + * buffer. If we have, then we can go ahead and do the XOR if (1) including this buffer, enough + * bufs have accumulated, or (2) this is the last submission for this stripe. + * Otherwise, we have to go acquire a floating rbuf. 
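+ *
+ * [Editor's note, not part of the original import: the test applied just
+ * below can be read as a small predicate.  A sketch, with hypothetical
+ * names standing in for the fields and globals the real code consults:
+ *
+ *   // can we XOR now, counting the buffer being submitted?
+ *   static int can_xor_now(RF_ReconBuffer_t *dest, int xorBufCount,
+ *                          int numBufsToAccumulate, int numDataCol)
+ *   {
+ *       if (dest == NULL)
+ *           return (0);        // no destination buffer acquired yet
+ *       if (xorBufCount == numBufsToAccumulate - 1)
+ *           return (1);        // this submission completes a batch
+ *       if (dest->count + xorBufCount + 1 == numDataCol)
+ *           return (1);        // last submission for this stripe
+ *       return (0);
+ *   }
+ *
+ * The code below evaluates the same condition inline against
+ * pssPtr->rbuf, pssPtr->xorBufCount, rf_numBufsToAccumulate and
+ * layoutPtr->numDataCol.]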
+ */ + + targetRbuf = (RF_ReconBuffer_t *) pssPtr->rbuf; + if ( (targetRbuf != NULL) && + ((pssPtr->xorBufCount == rf_numBufsToAccumulate-1) || (targetRbuf->count + pssPtr->xorBufCount + 1 == layoutPtr->numDataCol)) ) { + pssPtr->rbufsForXor[ pssPtr->xorBufCount++ ] = rbuf; /* install this buffer */ + Dprintf3("RECON: row %d col %d invoking a %d-way XOR\n",rbuf->row, rbuf->col,pssPtr->xorBufCount); + RF_ETIMER_START(timer); + rf_MultiWayReconXor(raidPtr, pssPtr); + RF_ETIMER_STOP(timer); RF_ETIMER_EVAL(timer); + raidPtr->accumXorTimeUs += RF_ETIMER_VAL_US(timer); + if (!keep_it) { + raidPtr->recon_tracerecs[rbuf->col].xor_us = RF_ETIMER_VAL_US(timer); + RF_ETIMER_STOP(raidPtr->recon_tracerecs[rbuf->col].recon_timer); + RF_ETIMER_EVAL(raidPtr->recon_tracerecs[rbuf->col].recon_timer); + raidPtr->recon_tracerecs[rbuf->col].specific.recon.recon_return_to_submit_us += + RF_ETIMER_VAL_US(raidPtr->recon_tracerecs[rbuf->col].recon_timer); + RF_ETIMER_START(raidPtr->recon_tracerecs[rbuf->col].recon_timer); + + rf_LogTraceRec(raidPtr, &raidPtr->recon_tracerecs[rbuf->col]); + } + rf_CheckForFullRbuf(raidPtr, reconCtrlPtr, pssPtr, layoutPtr->numDataCol); + + /* if use_committed is on, we _must_ consume a buffer off the committed list. */ + if (use_committed) { + t = reconCtrlPtr->committedRbufs; + RF_ASSERT(t); + reconCtrlPtr->committedRbufs = t->next; + rf_ReleaseFloatingReconBuffer(raidPtr, rbuf->row, t); + } + if (keep_it) { + RF_UNLOCK_PSS_MUTEX( raidPtr,rbuf->row,rbuf->parityStripeID); + RF_UNLOCK_MUTEX( reconCtrlPtr->rb_mutex ); + rf_FreeReconBuffer(rbuf); + return(retcode); + } + goto out; + } + + /* set the value of "t", which we'll use as the rbuf from here on */ + if (keep_it) { + t = rbuf; + } + else { + if (use_committed) { /* if a buffer has been committed to us, use it */ + t = reconCtrlPtr->committedRbufs; + RF_ASSERT(t); + reconCtrlPtr->committedRbufs = t->next; + t->next = NULL; + } else if (reconCtrlPtr->floatingRbufs) { + t = reconCtrlPtr->floatingRbufs; + reconCtrlPtr->floatingRbufs = t->next; + t->next = NULL; + } + } + + /* If we weren't able to acquire a buffer, + * append to the end of the buf list in the recon ctrl struct. + */ + if (!t) { + RF_ASSERT(!keep_it && !use_committed); + Dprintf2("RECON: row %d col %d failed to acquire floating rbuf\n",rbuf->row, rbuf->col); + + raidPtr->procsInBufWait++; + if ( (raidPtr->procsInBufWait == raidPtr->numCol -1) && (raidPtr->numFullReconBuffers == 0)) { + printf("Buffer wait deadlock detected. 
Exiting.\n"); + rf_PrintPSStatusTable(raidPtr, rbuf->row); + RF_PANIC(); + } + pssPtr->flags |= RF_PSS_BUFFERWAIT; + cb = rf_AllocCallbackDesc(); /* append to buf wait list in recon ctrl structure */ + cb->row = rbuf->row; cb->col = rbuf->col; + cb->callbackArg.v = rbuf->parityStripeID; + cb->callbackArg2.v = rbuf->which_ru; + cb->next = NULL; + if (!reconCtrlPtr->bufferWaitList) reconCtrlPtr->bufferWaitList = cb; + else { /* might want to maintain head/tail pointers here rather than search for end of list */ + for (p = reconCtrlPtr->bufferWaitList; p->next; p=p->next); + p->next = cb; + } + retcode = 1; + goto out; + } + Dprintf2("RECON: row %d col %d acquired rbuf\n",rbuf->row, rbuf->col); + RF_ETIMER_STOP(raidPtr->recon_tracerecs[rbuf->col].recon_timer); + RF_ETIMER_EVAL(raidPtr->recon_tracerecs[rbuf->col].recon_timer); + raidPtr->recon_tracerecs[rbuf->col].specific.recon.recon_return_to_submit_us += + RF_ETIMER_VAL_US(raidPtr->recon_tracerecs[rbuf->col].recon_timer); + RF_ETIMER_START(raidPtr->recon_tracerecs[rbuf->col].recon_timer); + + rf_LogTraceRec(raidPtr, &raidPtr->recon_tracerecs[rbuf->col]); + + /* initialize the buffer */ + if (t!=rbuf) { + t->row = rbuf->row; t->col = reconCtrlPtr->fcol; + t->parityStripeID = rbuf->parityStripeID; + t->which_ru = rbuf->which_ru; + t->failedDiskSectorOffset = rbuf->failedDiskSectorOffset; + t->spRow=rbuf->spRow; + t->spCol=rbuf->spCol; + t->spOffset=rbuf->spOffset; + + ta = t->buffer; t->buffer = rbuf->buffer; rbuf->buffer = ta; /* swap buffers */ + } + + /* the first installation always gets installed as the destination buffer. + * subsequent installations get stacked up to allow for multi-way XOR + */ + if (!pssPtr->rbuf) {pssPtr->rbuf = t; t->count = 1;} + else pssPtr->rbufsForXor[ pssPtr->xorBufCount++ ] = t; /* install this buffer */ + + rf_CheckForFullRbuf(raidPtr, reconCtrlPtr, pssPtr, layoutPtr->numDataCol); /* the buffer is full if G=2 */ + +out: + RF_UNLOCK_PSS_MUTEX( raidPtr,rbuf->row,rbuf->parityStripeID); + RF_UNLOCK_MUTEX( reconCtrlPtr->rb_mutex ); + return(retcode); +} + +int rf_MultiWayReconXor(raidPtr, pssPtr) + RF_Raid_t *raidPtr; + RF_ReconParityStripeStatus_t *pssPtr; /* the pss descriptor for this parity stripe */ +{ + int i, numBufs = pssPtr->xorBufCount; + int numBytes = rf_RaidAddressToByte(raidPtr, raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU); + RF_ReconBuffer_t **rbufs = (RF_ReconBuffer_t **) pssPtr->rbufsForXor; + RF_ReconBuffer_t *targetRbuf = (RF_ReconBuffer_t *) pssPtr->rbuf; + + RF_ASSERT(pssPtr->rbuf != NULL); + RF_ASSERT(numBufs > 0 && numBufs < RF_PS_MAX_BUFS); +#ifdef KERNEL +#if !defined(__NetBSD__) && !defined(__OpenBSD__) + thread_block(); /* yield the processor before doing a big XOR */ +#endif +#endif /* KERNEL */ + /* + * XXX + * + * What if more than 9 bufs? + */ + nWayXorFuncs[numBufs](pssPtr->rbufsForXor, targetRbuf, numBytes/sizeof(long)); + + /* release all the reconstruction buffers except the last one, which belongs to the + * the disk who's submission caused this XOR to take place + */ + for (i=0; i < numBufs-1; i++) { + if (rbufs[i]->type == RF_RBUF_TYPE_FLOATING) rf_ReleaseFloatingReconBuffer(raidPtr, rbufs[i]->row, rbufs[i]); + else if (rbufs[i]->type == RF_RBUF_TYPE_FORCED) rf_FreeReconBuffer(rbufs[i]); + else RF_ASSERT(0); + } + targetRbuf->count += pssPtr->xorBufCount; + pssPtr->xorBufCount = 0; + return(0); +} + +/* removes one full buffer from one of the full-buffer lists and returns it. + * + * ASSUMES THE RB_MUTEX IS UNLOCKED AT ENTRY. 
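+ *
+ * [Editor's note on rf_MultiWayReconXor() above, not part of the original
+ * import: the rf_nWayXor1()..rf_nWayXor9() routines it dispatches to are
+ * pulled in via rf_nwayxor.h, included above, and are not shown in this
+ * file.  Functionally, an n-way XOR folds each source buffer into the
+ * accumulating destination word by word; a naive, self-contained sketch
+ * (hypothetical types, not the real rf_nWayXor signature) is:
+ *
+ *   // XOR nsrc source buffers into dst; len counted in longs, as above
+ *   static void nway_xor_sketch(long **src, int nsrc, long *dst, int len)
+ *   {
+ *       int i, w;
+ *
+ *       for (i = 0; i < nsrc; i++)
+ *           for (w = 0; w < len; w++)
+ *               dst[w] ^= src[i][w];
+ *   }
+ *
+ * The fixed-count rf_nWayXor variants presumably exist to specialize this
+ * inner loop; the "more than 9 bufs" XXX above notes that the dispatch
+ * table stops at nine sources.]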
+ */ +RF_ReconBuffer_t *rf_GetFullReconBuffer(reconCtrlPtr) + RF_ReconCtrl_t *reconCtrlPtr; +{ + RF_ReconBuffer_t *p; + + RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex); + + if ( (p=reconCtrlPtr->priorityList) != NULL) { + reconCtrlPtr->priorityList = p->next; + p->next = NULL; + goto out; + } + if ( (p=reconCtrlPtr->fullBufferList) != NULL) { + reconCtrlPtr->fullBufferList = p->next; + p->next = NULL; + goto out; + } + +out: + RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex); + return(p); +} + + +/* if the reconstruction buffer is full, move it to the full list, which is maintained + * sorted by failed disk sector offset + * + * ASSUMES THE RB_MUTEX IS LOCKED AT ENTRY. + */ +int rf_CheckForFullRbuf(raidPtr, reconCtrl, pssPtr, numDataCol) + RF_Raid_t *raidPtr; + RF_ReconCtrl_t *reconCtrl; + RF_ReconParityStripeStatus_t *pssPtr; + int numDataCol; +{ + RF_ReconBuffer_t *p, *pt, *rbuf = (RF_ReconBuffer_t *) pssPtr->rbuf; + + if (rbuf->count == numDataCol) { + raidPtr->numFullReconBuffers++; + Dprintf2("RECON: rbuf for psid %ld ru %d has filled\n", + (long)rbuf->parityStripeID, rbuf->which_ru); + if (!reconCtrl->fullBufferList || (rbuf->failedDiskSectorOffset < reconCtrl->fullBufferList->failedDiskSectorOffset)) { + Dprintf2("RECON: rbuf for psid %ld ru %d is head of list\n", + (long)rbuf->parityStripeID, rbuf->which_ru); + rbuf->next = reconCtrl->fullBufferList; + reconCtrl->fullBufferList = rbuf; + } + else { + for (pt = reconCtrl->fullBufferList, p = pt->next; p && p->failedDiskSectorOffset < rbuf->failedDiskSectorOffset; pt=p, p=p->next); + rbuf->next = p; + pt->next = rbuf; + Dprintf2("RECON: rbuf for psid %ld ru %d is in list\n", + (long)rbuf->parityStripeID, rbuf->which_ru); + } +#if 0 + pssPtr->writeRbuf = pssPtr->rbuf; /* DEBUG ONLY: we like to be able to find this rbuf while it's awaiting write */ +#else + rbuf->pssPtr = pssPtr; +#endif + pssPtr->rbuf = NULL; + rf_CauseReconEvent(raidPtr, rbuf->row, rbuf->col, NULL, RF_REVENT_BUFREADY); + } + return(0); +} + + +/* release a floating recon buffer for someone else to use. + * assumes the rb_mutex is LOCKED at entry + */ +void rf_ReleaseFloatingReconBuffer(raidPtr, row, rbuf) + RF_Raid_t *raidPtr; + RF_RowCol_t row; + RF_ReconBuffer_t *rbuf; +{ + RF_ReconCtrl_t *rcPtr = raidPtr->reconControl[row]; + RF_CallbackDesc_t *cb; + + Dprintf2("RECON: releasing rbuf for psid %ld ru %d\n", + (long)rbuf->parityStripeID, rbuf->which_ru); + + /* if anyone is waiting on buffers, wake one of them up. They will subsequently wake up anyone + * else waiting on their RU + */ + if (rcPtr->bufferWaitList) { + rbuf->next = rcPtr->committedRbufs; + rcPtr->committedRbufs = rbuf; + cb = rcPtr->bufferWaitList; + rcPtr->bufferWaitList = cb->next; + rf_CauseReconEvent(raidPtr, cb->row, cb->col, (void *) 1, RF_REVENT_BUFCLEAR); /* arg==1 => we've committed a buffer */ + rf_FreeCallbackDesc(cb); + raidPtr->procsInBufWait--; + } else { + rbuf->next = rcPtr->floatingRbufs; + rcPtr->floatingRbufs = rbuf; + } +} + +/* release any disk that is waiting on a buffer for the indicated RU. 
+ * assumes the rb_mutex is LOCKED at entry + */ +void rf_ReleaseBufferWaiters(raidPtr, pssPtr) + RF_Raid_t *raidPtr; + RF_ReconParityStripeStatus_t *pssPtr; +{ + RF_CallbackDesc_t *cb1, *cb = pssPtr->bufWaitList; + + Dprintf2("RECON: releasing buf waiters for psid %ld ru %d\n", + (long)pssPtr->parityStripeID, pssPtr->which_ru); + pssPtr->flags &= ~RF_PSS_BUFFERWAIT; + while (cb) { + cb1 = cb->next; + cb->next = NULL; + rf_CauseReconEvent(raidPtr, cb->row, cb->col, (void *) 0, RF_REVENT_BUFCLEAR); /* arg==0 => we haven't committed a buffer */ + rf_FreeCallbackDesc(cb); + cb = cb1; + } + pssPtr->bufWaitList = NULL; +} + +/* when reconstruction is forced on an RU, there may be some disks waiting to + * acquire a buffer for that RU. Since we allocate a new buffer as part of + * the forced-reconstruction process, we no longer have to wait for any + * buffers, so we wakeup any waiter that we find in the bufferWaitList + * + * assumes the rb_mutex is LOCKED at entry + */ +void rf_ReleaseBufferWaiter(rcPtr, rbuf) + RF_ReconCtrl_t *rcPtr; + RF_ReconBuffer_t *rbuf; +{ + RF_CallbackDesc_t *cb, *cbt; + + for (cbt = NULL, cb = rcPtr->bufferWaitList; cb; cbt = cb, cb=cb->next) { + if ( (cb->callbackArg.v == rbuf->parityStripeID) && ( cb->callbackArg2.v == rbuf->which_ru)) { + Dprintf2("RECON: Dropping row %d col %d from buffer wait list\n", cb->row, cb->col); + if (cbt) cbt->next = cb->next; + else rcPtr->bufferWaitList = cb->next; + rf_CauseReconEvent((RF_Raid_t *) rbuf->raidPtr, cb->row, cb->col, (void *) 0, RF_REVENT_BUFREADY); /* arg==0 => no committed buffer */ + rf_FreeCallbackDesc(cb); + return; + } + } +} diff --git a/sys/dev/raidframe/rf_reconbuffer.h b/sys/dev/raidframe/rf_reconbuffer.h new file mode 100644 index 00000000000..61ec9c1c4ff --- /dev/null +++ b/sys/dev/raidframe/rf_reconbuffer.h @@ -0,0 +1,98 @@ +/* $OpenBSD: rf_reconbuffer.h,v 1.1 1999/01/11 14:29:45 niklas Exp $ */ +/* $NetBSD: rf_reconbuffer.h,v 1.1 1998/11/13 04:20:33 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. 
+ */ + +/******************************************************************* + * + * rf_reconbuffer.h -- header file for reconstruction buffer manager + * + *******************************************************************/ + +/* : + * Log: rf_reconbuffer.h,v + * Revision 1.9 1996/07/13 00:00:59 jimz + * sanitized generalized reconstruction architecture + * cleaned up head sep, rbuf problems + * + * Revision 1.8 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.7 1996/06/05 18:06:02 jimz + * Major code cleanup. The Great Renaming is now done. + * Better modularity. Better typing. Fixed a bunch of + * synchronization bugs. Made a lot of global stuff + * per-desc or per-array. Removed dead code. + * + * Revision 1.6 1996/06/03 23:28:26 jimz + * more bugfixes + * check in tree to sync for IPDS runs with current bugfixes + * there still may be a problem with threads in the script test + * getting I/Os stuck- not trivially reproducible (runs ~50 times + * in a row without getting stuck) + * + * Revision 1.5 1996/06/02 17:31:48 jimz + * Moved a lot of global stuff into array structure, where it belongs. + * Fixed up paritylogging, pss modules in this manner. Some general + * code cleanup. Removed lots of dead code, some dead files. + * + * Revision 1.4 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.3 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.2 1995/12/06 15:04:47 root + * added copyright info + * + */ + +#ifndef _RF__RF_RECONBUFFER_H_ +#define _RF__RF_RECONBUFFER_H_ + +#include "rf_types.h" +#include "rf_reconstruct.h" + +int rf_SubmitReconBuffer(RF_ReconBuffer_t *rbuf, int keep_int, + int use_committed); +int rf_SubmitReconBufferBasic(RF_ReconBuffer_t *rbuf, int keep_int, + int use_committed); +int rf_MultiWayReconXor(RF_Raid_t *raidPtr, + RF_ReconParityStripeStatus_t *pssPtr); +RF_ReconBuffer_t *rf_GetFullReconBuffer(RF_ReconCtrl_t *reconCtrlPtr); +int rf_CheckForFullRbuf(RF_Raid_t *raidPtr, RF_ReconCtrl_t *reconCtrl, + RF_ReconParityStripeStatus_t *pssPtr, int numDataCol); +void rf_ReleaseFloatingReconBuffer(RF_Raid_t *raidPtr, RF_RowCol_t row, + RF_ReconBuffer_t *rbuf); +void rf_ReleaseBufferWaiters(RF_Raid_t *raidPtr, + RF_ReconParityStripeStatus_t *pssPtr); +void rf_ReleaseBufferWaiter(RF_ReconCtrl_t *rcPtr, RF_ReconBuffer_t *rbuf); + +#endif /* !_RF__RF_RECONBUFFER_H_ */ diff --git a/sys/dev/raidframe/rf_reconmap.c b/sys/dev/raidframe/rf_reconmap.c new file mode 100644 index 00000000000..565a4ca616c --- /dev/null +++ b/sys/dev/raidframe/rf_reconmap.c @@ -0,0 +1,459 @@ +/* $OpenBSD: rf_reconmap.c,v 1.1 1999/01/11 14:29:46 niklas Exp $ */ +/* $NetBSD: rf_reconmap.c,v 1.1 1998/11/13 04:20:33 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. 
CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/************************************************************************* + * rf_reconmap.c + * + * code to maintain a map of what sectors have/have not been reconstructed + * + *************************************************************************/ + +/* : + * Log: rf_reconmap.c,v + * Revision 1.23 1996/07/27 23:36:08 jimz + * Solaris port of simulator + * + * Revision 1.22 1996/07/15 17:22:18 jimz + * nit-pick code cleanup + * resolve stdlib problems on DEC OSF + * + * Revision 1.21 1996/06/17 14:38:33 jimz + * properly #if out RF_DEMO code + * fix bug in MakeConfig that was causing weird behavior + * in configuration routines (config was not zeroed at start) + * clean up genplot handling of stacks + * + * Revision 1.20 1996/06/09 02:36:46 jimz + * lots of little crufty cleanup- fixup whitespace + * issues, comment #ifdefs, improve typing in some + * places (esp size-related) + * + * Revision 1.19 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.18 1996/06/05 18:06:02 jimz + * Major code cleanup. The Great Renaming is now done. + * Better modularity. Better typing. Fixed a bunch of + * synchronization bugs. Made a lot of global stuff + * per-desc or per-array. Removed dead code. + * + * Revision 1.17 1996/05/31 22:26:54 jimz + * fix a lot of mapping problems, memory allocation problems + * found some weird lock issues, fixed 'em + * more code cleanup + * + * Revision 1.16 1996/05/30 23:22:16 jimz + * bugfixes of serialization, timing problems + * more cleanup + * + * Revision 1.15 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.14 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.13 1996/05/24 04:40:57 jimz + * don't do recon meter demo stuff in kernel + * + * Revision 1.12 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.11 1996/05/20 16:14:50 jimz + * switch to rf_{mutex,cond}_{init,destroy} + * + * Revision 1.10 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.9 1995/12/12 18:10:06 jimz + * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT + * fix 80-column brain damage in comments + * + * Revision 1.8 1995/12/06 15:05:23 root + * added copyright info + * + */ + +#include "rf_raid.h" +#include <sys/time.h> +#include "rf_general.h" +#include "rf_utils.h" +#if RF_DEMO > 0 +#include "rf_demo.h" +#endif /* RF_DEMO > 0 */ +#include "rf_sys.h" + +/* special pointer values indicating that a reconstruction unit + * has been either totally reconstructed or not at all. Both + * are illegal pointer values, so you have to be careful not to + * dereference through them. 
RU_NOTHING must be zero, since + * MakeReconMap uses bzero to initialize the structure. These are used + * only at the head of the list. + */ +#define RU_ALL ((RF_ReconMapListElem_t *) -1) +#define RU_NOTHING ((RF_ReconMapListElem_t *) 0) + +/* used to mark the end of the list */ +#define RU_NIL ((RF_ReconMapListElem_t *) 0) + + +static void compact_stat_entry(RF_Raid_t *raidPtr, RF_ReconMap_t *mapPtr, + int i); +static void crunch_list(RF_ReconMap_t *mapPtr, RF_ReconMapListElem_t *listPtr); +static RF_ReconMapListElem_t *MakeReconMapListElem(RF_SectorNum_t startSector, + RF_SectorNum_t stopSector, RF_ReconMapListElem_t *next); +static void FreeReconMapListElem(RF_ReconMap_t *mapPtr, + RF_ReconMapListElem_t *p); +static void update_size(RF_ReconMap_t *mapPtr, int size); +static void PrintList(RF_ReconMapListElem_t *listPtr); + +/*----------------------------------------------------------------------------- + * + * Creates and initializes new Reconstruction map + * + *-----------------------------------------------------------------------------*/ + +RF_ReconMap_t *rf_MakeReconMap(raidPtr, ru_sectors, disk_sectors, spareUnitsPerDisk) + RF_Raid_t *raidPtr; + RF_SectorCount_t ru_sectors; /* size of reconstruction unit in sectors */ + RF_SectorCount_t disk_sectors; /* size of disk in sectors */ + RF_ReconUnitCount_t spareUnitsPerDisk; /* zero unless distributed sparing */ +{ + RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; + RF_ReconUnitCount_t num_rus = layoutPtr->stripeUnitsPerDisk / layoutPtr->SUsPerRU; + RF_ReconMap_t *p; + int rc; + + RF_Malloc(p, sizeof(RF_ReconMap_t), (RF_ReconMap_t *)); + p->sectorsPerReconUnit = ru_sectors; + p->sectorsInDisk = disk_sectors; + + p->totalRUs = num_rus; + p->spareRUs = spareUnitsPerDisk; + p->unitsLeft = num_rus - spareUnitsPerDisk; + + RF_Malloc(p->status, num_rus * sizeof(RF_ReconMapListElem_t *), (RF_ReconMapListElem_t **)); + RF_ASSERT(p->status != (RF_ReconMapListElem_t **) NULL); + + (void) bzero((char *) p->status, num_rus * sizeof(RF_ReconMapListElem_t *)); + + p->size = sizeof(RF_ReconMap_t) + num_rus * sizeof(RF_ReconMapListElem_t *); + p->maxSize = p->size; + + rc = rf_mutex_init(&p->mutex); + if (rc) { + RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__, + __LINE__, rc); + RF_Free(p->status, num_rus * sizeof(RF_ReconMapListElem_t *)); + RF_Free(p, sizeof(RF_ReconMap_t)); + return(NULL); + } + return(p); +} + + +/*----------------------------------------------------------------------------- + * + * marks a new set of sectors as reconstructed. All the possible mergings get + * complicated. To simplify matters, the approach I take is to just dump + * something into the list, and then clean it up (i.e. merge elements and + * eliminate redundant ones) in a second pass over the list (compact_stat_entry()). + * Not 100% efficient, since a structure can be allocated and then immediately + * freed, but it keeps this code from becoming (more of) a nightmare of + * special cases. The only thing that compact_stat_entry() assumes is that the + * list is sorted by startSector, and so this is the only condition I maintain + * here. 
(MCH) + * + *-----------------------------------------------------------------------------*/ + +void rf_ReconMapUpdate(raidPtr, mapPtr, startSector, stopSector) + RF_Raid_t *raidPtr; + RF_ReconMap_t *mapPtr; + RF_SectorNum_t startSector; + RF_SectorNum_t stopSector; +{ + RF_SectorCount_t sectorsPerReconUnit = mapPtr->sectorsPerReconUnit; + RF_SectorNum_t i, first_in_RU, last_in_RU; + RF_ReconMapListElem_t *p, *pt; + + RF_LOCK_MUTEX(mapPtr->mutex); + RF_ASSERT(startSector >=0 && stopSector < mapPtr->sectorsInDisk && stopSector > startSector); + + while (startSector <= stopSector) { + i = startSector/mapPtr->sectorsPerReconUnit; + first_in_RU = i*sectorsPerReconUnit; + last_in_RU = first_in_RU + sectorsPerReconUnit -1 ; + p = mapPtr->status[i]; + if (p!=RU_ALL) { + if (p==RU_NOTHING || p->startSector > startSector ) { /* insert at front of list */ + + mapPtr->status[i] = MakeReconMapListElem(startSector, RF_MIN(stopSector,last_in_RU), (p==RU_NOTHING) ? NULL : p); + update_size(mapPtr, sizeof(RF_ReconMapListElem_t)); + + } else { /* general case */ + do { /* search for place to insert */ + pt = p; p = p->next; + } while (p && (p->startSector < startSector)); + pt->next = MakeReconMapListElem(startSector,RF_MIN(stopSector,last_in_RU),p); + update_size(mapPtr, sizeof(RF_ReconMapListElem_t)); + } + compact_stat_entry(raidPtr, mapPtr, i); + } + startSector = RF_MIN(stopSector, last_in_RU) +1; + } + RF_UNLOCK_MUTEX(mapPtr->mutex); +} + + + +/*----------------------------------------------------------------------------- + * + * performs whatever list compactions can be done, and frees any space + * that is no longer necessary. Assumes only that the list is sorted + * by startSector. crunch_list() compacts a single list as much as possible, + * and the second block of code deletes the entire list if possible. + * crunch_list() is also called from MakeReconMapAccessList(). + * + * When a recon unit is detected to be fully reconstructed, we set the + * corresponding bit in the parity stripe map so that the head follow + * code will not select this parity stripe again. This is redundant (but + * harmless) when compact_stat_entry is called from the reconstruction code, + * but necessary when called from the user-write code. 
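+ *
+ * [Editor's note, not part of the original import: crunch_list() below is
+ * a single pass that merges touching or overlapping sector ranges in a
+ * list already sorted by startSector.  A self-contained sketch of that
+ * merge step, using a hypothetical interval type, is:
+ *
+ *   struct ival { long start, stop; struct ival *next; };
+ *
+ *   // merge neighbours that touch or overlap; list sorted by start
+ *   static void merge_sorted(struct ival *p)
+ *   {
+ *       while (p != NULL && p->next != NULL) {
+ *           if (p->stop >= p->next->start - 1) {
+ *               struct ival *dead = p->next;
+ *               if (dead->stop > p->stop)
+ *                   p->stop = dead->stop;   // absorb the neighbour
+ *               p->next = dead->next;       // the real code frees dead here
+ *           } else {
+ *               p = p->next;                // disjoint, move on
+ *           }
+ *       }
+ *   }
+ *
+ * compact_stat_entry() then collapses a reconstruction unit whose single
+ * remaining range covers the whole unit into the RU_ALL sentinel and
+ * decrements unitsLeft, exactly as the code below does.]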
+ * + *-----------------------------------------------------------------------------*/ + +static void compact_stat_entry(raidPtr, mapPtr, i) + RF_Raid_t *raidPtr; + RF_ReconMap_t *mapPtr; + int i; +{ + RF_SectorCount_t sectorsPerReconUnit = mapPtr->sectorsPerReconUnit; + RF_ReconMapListElem_t *p = mapPtr->status[i]; + + crunch_list(mapPtr, p); + + if ((p->startSector == i*sectorsPerReconUnit) && + (p->stopSector == i*sectorsPerReconUnit +sectorsPerReconUnit -1)) { + mapPtr->status[i] = RU_ALL; + mapPtr->unitsLeft--; + FreeReconMapListElem(mapPtr,p); + } +} + +static void crunch_list(mapPtr, listPtr) + RF_ReconMap_t *mapPtr; + RF_ReconMapListElem_t *listPtr; +{ + RF_ReconMapListElem_t *pt, *p = listPtr; + + if (!p) return; + pt = p; p = p->next; + while (p) { + if (pt->stopSector >= p->startSector-1) { + pt->stopSector = RF_MAX(pt->stopSector, p->stopSector); + pt->next = p->next; + FreeReconMapListElem(mapPtr, p); + p = pt->next; + } + else { + pt = p; + p = p->next; + } + } +} + +/*----------------------------------------------------------------------------- + * + * Allocate and fill a new list element + * + *-----------------------------------------------------------------------------*/ + +static RF_ReconMapListElem_t *MakeReconMapListElem( + RF_SectorNum_t startSector, + RF_SectorNum_t stopSector, + RF_ReconMapListElem_t *next) +{ + RF_ReconMapListElem_t *p; + + RF_Malloc(p, sizeof(RF_ReconMapListElem_t), (RF_ReconMapListElem_t *)); + if (p == NULL) + return(NULL); + p->startSector = startSector; + p->stopSector = stopSector; + p->next = next; + return(p); +} + +/*----------------------------------------------------------------------------- + * + * Free a list element + * + *-----------------------------------------------------------------------------*/ + +static void FreeReconMapListElem(mapPtr,p) + RF_ReconMap_t *mapPtr; + RF_ReconMapListElem_t *p; +{ + int delta; + + if (mapPtr) { + delta = 0 - (int)sizeof(RF_ReconMapListElem_t); + update_size(mapPtr, delta); + } + RF_Free(p, sizeof(*p)); +} + +/*----------------------------------------------------------------------------- + * + * Free an entire status structure. Inefficient, but can be called at any time. + * + *-----------------------------------------------------------------------------*/ +void rf_FreeReconMap(mapPtr) + RF_ReconMap_t *mapPtr; +{ + RF_ReconMapListElem_t *p, *q; + RF_ReconUnitCount_t numRUs; + RF_ReconUnitNum_t i; + + numRUs = mapPtr->sectorsInDisk / mapPtr->sectorsPerReconUnit; + if (mapPtr->sectorsInDisk % mapPtr->sectorsPerReconUnit) + numRUs++; + + for (i=0; i<numRUs; i++) { + p = mapPtr->status[i]; + while (p != RU_NOTHING && p != RU_ALL) { + q = p; p = p->next; + RF_Free(q, sizeof(*q)); + } + } + rf_mutex_destroy(&mapPtr->mutex); + RF_Free(mapPtr->status, mapPtr->totalRUs * sizeof(RF_ReconMapListElem_t *)); + RF_Free(mapPtr, sizeof(RF_ReconMap_t)); +} + +/*----------------------------------------------------------------------------- + * + * returns nonzero if the indicated RU has been reconstructed already + * + *---------------------------------------------------------------------------*/ + +int rf_CheckRUReconstructed(mapPtr, startSector) + RF_ReconMap_t *mapPtr; + RF_SectorNum_t startSector; +{ + RF_ReconMapListElem_t *l; /* used for searching */ + RF_ReconUnitNum_t i; + + i = startSector / mapPtr->sectorsPerReconUnit; + l = mapPtr->status[i]; + return( (l == RU_ALL) ? 
1 : 0 ); +} + +RF_ReconUnitCount_t rf_UnitsLeftToReconstruct(mapPtr) + RF_ReconMap_t *mapPtr; +{ + RF_ASSERT(mapPtr != NULL); + return( mapPtr->unitsLeft ); +} + +/* updates the size fields of a status descriptor */ +static void update_size(mapPtr, size) + RF_ReconMap_t *mapPtr; + int size; +{ + mapPtr->size += size; + mapPtr->maxSize = RF_MAX(mapPtr->size, mapPtr->maxSize); +} + +static void PrintList(listPtr) + RF_ReconMapListElem_t *listPtr; +{ + while (listPtr) { + printf("%d,%d -> ",(int)listPtr->startSector,(int)listPtr->stopSector); + listPtr = listPtr->next; + } + printf("\n"); +} + +void rf_PrintReconMap(raidPtr, mapPtr, frow, fcol) + RF_Raid_t *raidPtr; + RF_ReconMap_t *mapPtr; + RF_RowCol_t frow; + RF_RowCol_t fcol; +{ + RF_ReconUnitCount_t numRUs; + RF_ReconMapListElem_t *p; + RF_ReconUnitNum_t i; + + numRUs = mapPtr->totalRUs; + if (mapPtr->sectorsInDisk % mapPtr->sectorsPerReconUnit) + numRUs++; + + for (i=0; i<numRUs; i++) { + p = mapPtr->status[i]; + if (p==RU_ALL) /*printf("[%d] ALL\n",i)*/; + else if (p == RU_NOTHING) { + printf("%d: Unreconstructed\n",i); + } else { + printf("%d: ", i); + PrintList(p); + } + } +} + +void rf_PrintReconSchedule(mapPtr, starttime) + RF_ReconMap_t *mapPtr; + struct timeval *starttime; +{ + static int old_pctg = -1; + struct timeval tv, diff; + int new_pctg; + + new_pctg = 100 - (rf_UnitsLeftToReconstruct(mapPtr) * 100 / mapPtr->totalRUs); + if (new_pctg != old_pctg) { + RF_GETTIME(tv); + RF_TIMEVAL_DIFF(starttime, &tv, &diff); +#if RF_DEMO > 0 + if (rf_demoMode) { + rf_update_recon_meter(new_pctg); + } + else { + printf("%d %d.%06d\n",new_pctg, diff.tv_sec, diff.tv_usec); + } +#else /* RF_DEMO > 0 */ + printf("%d %d.%06d\n",(int)new_pctg, (int)diff.tv_sec, (int)diff.tv_usec); +#endif /* RF_DEMO > 0 */ + old_pctg = new_pctg; + } +} diff --git a/sys/dev/raidframe/rf_reconmap.h b/sys/dev/raidframe/rf_reconmap.h new file mode 100644 index 00000000000..5d03baefb1b --- /dev/null +++ b/sys/dev/raidframe/rf_reconmap.h @@ -0,0 +1,114 @@ +/* $OpenBSD: rf_reconmap.h,v 1.1 1999/01/11 14:29:46 niklas Exp $ */ +/* $NetBSD: rf_reconmap.h,v 1.1 1998/11/13 04:20:33 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. 
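+ *
+ * [Editor's note, not part of the original import: taken together, the
+ * functions declared further down in this header are used roughly as
+ * follows.  This is only a sketch; "raidPtr" stands for an already
+ * configured array and the sector numbers are made up for illustration.
+ *
+ *   RF_ReconMap_t *map;
+ *
+ *   map = rf_MakeReconMap(raidPtr, ru_sectors, disk_sectors, 0);
+ *   ...
+ *   rf_ReconMapUpdate(raidPtr, map, startSector, stopSector);
+ *   if (rf_CheckRUReconstructed(map, someSector))
+ *       ...        // the RU containing someSector is fully rebuilt
+ *   while (rf_UnitsLeftToReconstruct(map) > 0)
+ *       ...        // more reconstruction units still outstanding
+ *   rf_FreeReconMap(map);
+ *
+ * The final argument of rf_MakeReconMap() is the number of spare
+ * reconstruction units per disk, zero unless distributed sparing is in
+ * use.]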
+ */ + +/****************************************************************************** + * rf_reconMap.h -- Header file describing reconstruction status data structure + ******************************************************************************/ + +/* : + * Log: rf_reconmap.h,v + * Revision 1.10 1996/08/01 15:59:25 jimz + * minor cleanup + * + * Revision 1.9 1996/06/09 02:36:46 jimz + * lots of little crufty cleanup- fixup whitespace + * issues, comment #ifdefs, improve typing in some + * places (esp size-related) + * + * Revision 1.8 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.7 1996/06/05 18:06:02 jimz + * Major code cleanup. The Great Renaming is now done. + * Better modularity. Better typing. Fixed a bunch of + * synchronization bugs. Made a lot of global stuff + * per-desc or per-array. Removed dead code. + * + * Revision 1.6 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.5 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.4 1995/12/06 15:04:01 root + * added copyright info + * + */ + +#ifndef _RF__RF_RECONMAP_H_ +#define _RF__RF_RECONMAP_H_ + +#include "rf_types.h" +#include "rf_threadstuff.h" + +/* + * Main reconstruction status descriptor. size and maxsize are used for + * monitoring only: they have no function for reconstruction. + */ +struct RF_ReconMap_s { + RF_SectorCount_t sectorsPerReconUnit; /* sectors per reconstruct unit */ + RF_SectorCount_t sectorsInDisk; /* total sectors in disk */ + RF_SectorCount_t unitsLeft; /* recon units left to recon */ + RF_ReconUnitCount_t totalRUs; /* total recon units on disk */ + RF_ReconUnitCount_t spareRUs; /* total number of spare RUs on failed disk */ + RF_StripeCount_t totalParityStripes; /* total number of parity stripes in array */ + u_int size; /* overall size of this structure */ + u_int maxSize; /* maximum size so far */ + RF_ReconMapListElem_t **status; /* array of ptrs to list elements */ + RF_DECLARE_MUTEX(mutex) +}; + +/* a list element */ +struct RF_ReconMapListElem_s { + RF_SectorNum_t startSector; /* bounding sect nums on this block */ + RF_SectorNum_t stopSector; + RF_ReconMapListElem_t *next; /* next element in list */ +}; + +RF_ReconMap_t *rf_MakeReconMap(RF_Raid_t *raidPtr, RF_SectorCount_t ru_sectors, + RF_SectorCount_t disk_sectors, RF_ReconUnitCount_t spareUnitsPerDisk); + +void rf_ReconMapUpdate(RF_Raid_t *raidPtr, RF_ReconMap_t *mapPtr, + RF_SectorNum_t startSector, RF_SectorNum_t stopSector); + +void rf_FreeReconMap(RF_ReconMap_t *mapPtr); + +int rf_CheckRUReconstructed(RF_ReconMap_t *mapPtr, RF_SectorNum_t startSector); + +RF_ReconUnitCount_t rf_UnitsLeftToReconstruct(RF_ReconMap_t *mapPtr); + +void rf_PrintReconMap(RF_Raid_t *raidPtr, RF_ReconMap_t *mapPtr, + RF_RowCol_t frow, RF_RowCol_t fcol); + +void rf_PrintReconSchedule(RF_ReconMap_t *mapPtr, struct timeval *starttime); + +#endif /* !_RF__RF_RECONMAP_H_ */ diff --git a/sys/dev/raidframe/rf_reconstruct.c b/sys/dev/raidframe/rf_reconstruct.c new file mode 100644 index 00000000000..7df351a7ec0 --- /dev/null +++ b/sys/dev/raidframe/rf_reconstruct.c @@ -0,0 +1,1595 @@ +/* $OpenBSD: rf_reconstruct.c,v 1.1 1999/01/11 14:29:46 niklas Exp $ */ +/* $NetBSD: rf_reconstruct.c,v 1.1 1998/11/13 04:20:33 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon 
University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/************************************************************ + * + * rf_reconstruct.c -- code to perform on-line reconstruction + * + ************************************************************/ + +/* + * : + * Log: rf_reconstruct.c,v + * Revision 1.65 1996/08/06 22:24:56 jimz + * get rid of sys/buf.h on linux + * + * Revision 1.64 1996/07/30 04:28:53 jimz + * include rf_types.h first + * + * Revision 1.63 1996/07/27 23:36:08 jimz + * Solaris port of simulator + * + * Revision 1.62 1996/07/17 21:00:58 jimz + * clean up timer interface, tracing + * + * Revision 1.61 1996/07/15 05:40:41 jimz + * some recon datastructure cleanup + * better handling of multiple failures + * added undocumented double-recon test + * + * Revision 1.60 1996/07/15 02:57:18 jimz + * added debugging (peek at first couple bytes of recon buffers + * as they go by) + * + * Revision 1.59 1996/07/13 00:00:59 jimz + * sanitized generalized reconstruction architecture + * cleaned up head sep, rbuf problems + * + * Revision 1.58 1996/07/11 19:08:00 jimz + * generalize reconstruction mechanism + * allow raid1 reconstructs via copyback (done with array + * quiesced, not online, therefore not disk-directed) + * + * Revision 1.57 1996/06/17 14:38:33 jimz + * properly #if out RF_DEMO code + * fix bug in MakeConfig that was causing weird behavior + * in configuration routines (config was not zeroed at start) + * clean up genplot handling of stacks + * + * Revision 1.56 1996/06/17 03:24:59 jimz + * include shutdown.h for define of now-macroized ShutdownCreate + * + * Revision 1.55 1996/06/11 10:58:36 jimz + * get rid of simulator-testcode artifacts + * add generic ReconDoneProc mechanism instead + * + * Revision 1.54 1996/06/10 14:18:58 jimz + * move user, throughput stats into per-array structure + * + * Revision 1.53 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). 
+ * + * Revision 1.52 1996/06/09 02:36:46 jimz + * lots of little crufty cleanup- fixup whitespace + * issues, comment #ifdefs, improve typing in some + * places (esp size-related) + * + * Revision 1.51 1996/06/07 22:26:27 jimz + * type-ify which_ru (RF_ReconUnitNum_t) + * + * Revision 1.50 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.49 1996/06/06 01:24:36 jimz + * don't get rid of reconCtrlPtr until we're done with it + * + * Revision 1.48 1996/06/05 18:06:02 jimz + * Major code cleanup. The Great Renaming is now done. + * Better modularity. Better typing. Fixed a bunch of + * synchronization bugs. Made a lot of global stuff + * per-desc or per-array. Removed dead code. + * + * Revision 1.47 1996/06/03 23:28:26 jimz + * more bugfixes + * check in tree to sync for IPDS runs with current bugfixes + * there still may be a problem with threads in the script test + * getting I/Os stuck- not trivially reproducible (runs ~50 times + * in a row without getting stuck) + * + * Revision 1.46 1996/06/02 17:31:48 jimz + * Moved a lot of global stuff into array structure, where it belongs. + * Fixed up paritylogging, pss modules in this manner. Some general + * code cleanup. Removed lots of dead code, some dead files. + * + * Revision 1.45 1996/05/31 22:26:54 jimz + * fix a lot of mapping problems, memory allocation problems + * found some weird lock issues, fixed 'em + * more code cleanup + * + * Revision 1.44 1996/05/30 23:22:16 jimz + * bugfixes of serialization, timing problems + * more cleanup + * + * Revision 1.43 1996/05/30 11:29:41 jimz + * Numerous bug fixes. Stripe lock release code disagreed with the taking code + * about when stripes should be locked (I made it consistent: no parity, no lock) + * There was a lot of extra serialization of I/Os which I've removed- a lot of + * it was to calculate values for the cache code, which is no longer with us. + * More types, function, macro cleanup. Added code to properly quiesce the array + * on shutdown. Made a lot of stuff array-specific which was (bogusly) general + * before. Fixed memory allocation, freeing bugs. 
+ * + * Revision 1.42 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.41 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.40 1996/05/24 04:40:40 jimz + * don't do demoMode stuff in kernel + * + * Revision 1.39 1996/05/24 01:59:45 jimz + * another checkpoint in code cleanup for release + * time to sync kernel tree + * + * Revision 1.38 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.37 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.36 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.35 1996/05/01 16:28:16 jimz + * don't include ccmn.h + * + * Revision 1.34 1995/12/12 18:10:06 jimz + * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT + * fix 80-column brain damage in comments + * + * Revision 1.33 1995/12/06 15:05:09 root + * added copyright info + * + * Revision 1.32 1995/11/17 19:04:11 wvcii + * added prototyping to ComputePSDiskOffsets + * prow and pcol now type int (were u_int) + * + * Revision 1.31 1995/11/17 01:39:35 amiri + * isolated some demo related stuff + * + * Revision 1.30 1995/10/18 19:33:14 amiri + * removed fflush (stdin/stdout) calls from ReconstructFailedDisk + * + * Revision 1.29 1995/10/11 10:20:33 jimz + * #if 0'd problem code for sigmetrics + * + * Revision 1.28 1995/10/10 23:18:15 amiri + * added fflushes to stdin/stdout before requesting + * input in demo mode. + * + * Revision 1.27 1995/10/10 19:24:47 amiri + * took out update_mode (for demo) from + * KERNEL source. + * + * Revision 1.26 1995/10/09 23:35:48 amiri + * added support for more meters in recon. 
demo + * + * Revision 1.25 1995/07/03 18:14:30 holland + * changed the way the number of floating recon bufs & + * the head sep limit get set + * + * Revision 1.24 1995/07/02 15:07:42 holland + * bug fixes related to getting distributed sparing numbers + * + * Revision 1.23 1995/06/23 13:36:36 robby + * updeated to prototypes in rf_layout.h + * +*/ + +#ifdef _KERNEL +#define KERNEL +#endif + +#include "rf_types.h" +#include <sys/time.h> +#ifndef LINUX +#include <sys/buf.h> +#endif /* !LINUX */ +#include <sys/errno.h> +#include "rf_raid.h" +#include "rf_reconutil.h" +#include "rf_revent.h" +#include "rf_reconbuffer.h" +#include "rf_threadid.h" +#include "rf_acctrace.h" +#include "rf_etimer.h" +#include "rf_dag.h" +#include "rf_desc.h" +#include "rf_general.h" +#include "rf_freelist.h" +#include "rf_debugprint.h" +#include "rf_driver.h" +#include "rf_utils.h" +#include "rf_cpuutil.h" +#include "rf_shutdown.h" +#include "rf_sys.h" + +#if RF_DEMO > 0 +#include "rf_demo.h" +#endif /* RF_DEMO > 0 */ + +#ifdef KERNEL +#include "rf_kintf.h" +#endif /* KERNEL */ + +/* setting these to -1 causes them to be set to their default values if not set by debug options */ + +#define Dprintf(s) if (rf_reconDebug) rf_debug_printf(s,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL) +#define Dprintf1(s,a) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),NULL,NULL,NULL,NULL,NULL,NULL,NULL) +#define Dprintf2(s,a,b) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),NULL,NULL,NULL,NULL,NULL,NULL) +#define Dprintf3(s,a,b,c) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),NULL,NULL,NULL,NULL,NULL) +#define Dprintf4(s,a,b,c,d) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),NULL,NULL,NULL,NULL) +#define Dprintf5(s,a,b,c,d,e) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),NULL,NULL,NULL) +#define Dprintf6(s,a,b,c,d,e,f) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),(void *)((unsigned long)f),NULL,NULL) +#define Dprintf7(s,a,b,c,d,e,f,g) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),(void *)((unsigned long)f),(void *)((unsigned long)g),NULL) +#define Dprintf8(s,a,b,c,d,e,f,g,h) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),(void *)((unsigned long)f),(void *)((unsigned long)g),(void *)((unsigned long)h)) + +#define DDprintf1(s,a) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),NULL,NULL,NULL,NULL,NULL,NULL,NULL) +#define DDprintf2(s,a,b) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),NULL,NULL,NULL,NULL,NULL,NULL) +#define DDprintf3(s,a,b,c) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),NULL,NULL,NULL,NULL,NULL) +#define DDprintf4(s,a,b,c,d) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned 
long)d),NULL,NULL,NULL,NULL) +#define DDprintf5(s,a,b,c,d,e) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),NULL,NULL,NULL) +#define DDprintf6(s,a,b,c,d,e,f) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),(void *)((unsigned long)f),NULL,NULL) +#define DDprintf7(s,a,b,c,d,e,f,g) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),(void *)((unsigned long)f),(void *)((unsigned long)g),NULL) +#define DDprintf8(s,a,b,c,d,e,f,g,h) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),(void *)((unsigned long)f),(void *)((unsigned long)g),(void *)((unsigned long)h)) + +#ifdef KERNEL +static RF_Thread_t recon_thr_handle; +static int recon_thread_initialized = 0; +#endif /* KERNEL */ + +static RF_FreeList_t *rf_recond_freelist; +#define RF_MAX_FREE_RECOND 4 +#define RF_RECOND_INC 1 + +static RF_RaidReconDesc_t *AllocRaidReconDesc(RF_Raid_t *raidPtr, + RF_RowCol_t row, RF_RowCol_t col, RF_RaidDisk_t *spareDiskPtr, + int numDisksDone, RF_RowCol_t srow, RF_RowCol_t scol); +static void FreeReconDesc(RF_RaidReconDesc_t *reconDesc); +static int ProcessReconEvent(RF_Raid_t *raidPtr, RF_RowCol_t frow, + RF_ReconEvent_t *event); +static int IssueNextReadRequest(RF_Raid_t *raidPtr, RF_RowCol_t row, + RF_RowCol_t col); +static int TryToRead(RF_Raid_t *raidPtr, RF_RowCol_t row, RF_RowCol_t col); +static int ComputePSDiskOffsets(RF_Raid_t *raidPtr, RF_StripeNum_t psid, + RF_RowCol_t row, RF_RowCol_t col, RF_SectorNum_t *outDiskOffset, + RF_SectorNum_t *outFailedDiskSectorOffset, RF_RowCol_t *spRow, + RF_RowCol_t *spCol, RF_SectorNum_t *spOffset); +static int IssueNextWriteRequest(RF_Raid_t *raidPtr, RF_RowCol_t row); +static int ReconReadDoneProc(void *arg, int status); +static int ReconWriteDoneProc(void *arg, int status); +static void CheckForNewMinHeadSep(RF_Raid_t *raidPtr, RF_RowCol_t row, + RF_HeadSepLimit_t hsCtr); +static int CheckHeadSeparation(RF_Raid_t *raidPtr, RF_PerDiskReconCtrl_t *ctrl, + RF_RowCol_t row, RF_RowCol_t col, RF_HeadSepLimit_t hsCtr, + RF_ReconUnitNum_t which_ru); +static int CheckForcedOrBlockedReconstruction(RF_Raid_t *raidPtr, + RF_ReconParityStripeStatus_t *pssPtr, RF_PerDiskReconCtrl_t *ctrl, + RF_RowCol_t row, RF_RowCol_t col, RF_StripeNum_t psid, + RF_ReconUnitNum_t which_ru); +static void ForceReconReadDoneProc(void *arg, int status); + +static void rf_ShutdownReconstruction(void *); + + +struct RF_ReconDoneProc_s { + void (*proc)(RF_Raid_t *, void *); + void *arg; + RF_ReconDoneProc_t *next; +}; + +static RF_FreeList_t *rf_rdp_freelist; +#define RF_MAX_FREE_RDP 4 +#define RF_RDP_INC 1 + +static void SignalReconDone(RF_Raid_t *raidPtr) +{ + RF_ReconDoneProc_t *p; + + RF_LOCK_MUTEX(raidPtr->recon_done_proc_mutex); + for(p=raidPtr->recon_done_procs;p;p=p->next) { + p->proc(raidPtr, p->arg); + } + RF_UNLOCK_MUTEX(raidPtr->recon_done_proc_mutex); +} + +int rf_RegisterReconDoneProc( + RF_Raid_t *raidPtr, + void (*proc)(RF_Raid_t *, void *), + void *arg, + RF_ReconDoneProc_t **handlep) +{ + RF_ReconDoneProc_t *p; + + RF_FREELIST_GET(rf_rdp_freelist,p,next,(RF_ReconDoneProc_t *)); + if (p == NULL) + 
return(ENOMEM); + p->proc = proc; + p->arg = arg; + RF_LOCK_MUTEX(raidPtr->recon_done_proc_mutex); + p->next = raidPtr->recon_done_procs; + raidPtr->recon_done_procs = p; + RF_UNLOCK_MUTEX(raidPtr->recon_done_proc_mutex); + if (handlep) + *handlep = p; + return(0); +} + +/***************************************************************************************** + * + * sets up the parameters that will be used by the reconstruction process + * currently there are none, except for those that the layout-specific + * configuration (e.g. rf_ConfigureDeclustered) routine sets up. + * + * in the kernel, we fire off the recon thread. + * + ****************************************************************************************/ +static void rf_ShutdownReconstruction(ignored) + void *ignored; +{ + RF_FREELIST_DESTROY(rf_recond_freelist,next,(RF_RaidReconDesc_t *)); + RF_FREELIST_DESTROY(rf_rdp_freelist,next,(RF_ReconDoneProc_t *)); +} + +int rf_ConfigureReconstruction(listp) + RF_ShutdownList_t **listp; +{ + int rc; + + RF_FREELIST_CREATE(rf_recond_freelist, RF_MAX_FREE_RECOND, + RF_RECOND_INC, sizeof(RF_RaidReconDesc_t)); + if (rf_recond_freelist == NULL) + return(ENOMEM); + RF_FREELIST_CREATE(rf_rdp_freelist, RF_MAX_FREE_RDP, + RF_RDP_INC, sizeof(RF_ReconDoneProc_t)); + if (rf_rdp_freelist == NULL) { + RF_FREELIST_DESTROY(rf_recond_freelist,next,(RF_RaidReconDesc_t *)); + return(ENOMEM); + } + rc = rf_ShutdownCreate(listp, rf_ShutdownReconstruction, NULL); + if (rc) { + RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n", + __FILE__, __LINE__, rc); + rf_ShutdownReconstruction(NULL); + return(rc); + } + +#ifdef KERNEL + if (!recon_thread_initialized) { + RF_CREATE_THREAD(recon_thr_handle, rf_ReconKernelThread, NULL); + recon_thread_initialized = 1; + } +#endif /* KERNEL */ + + return(0); +} + +static RF_RaidReconDesc_t *AllocRaidReconDesc(raidPtr, row, col, spareDiskPtr, numDisksDone, srow, scol) + RF_Raid_t *raidPtr; + RF_RowCol_t row; + RF_RowCol_t col; + RF_RaidDisk_t *spareDiskPtr; + int numDisksDone; + RF_RowCol_t srow; + RF_RowCol_t scol; +{ + + RF_RaidReconDesc_t *reconDesc; + + RF_FREELIST_GET(rf_recond_freelist,reconDesc,next,(RF_RaidReconDesc_t *)); + + reconDesc->raidPtr = raidPtr; + reconDesc->row = row; + reconDesc->col = col; + reconDesc->spareDiskPtr=spareDiskPtr; + reconDesc->numDisksDone=numDisksDone; + reconDesc->srow=srow; + reconDesc->scol=scol; + reconDesc->state = 0; + reconDesc->next = NULL; + + return(reconDesc); +} + +static void FreeReconDesc(reconDesc) + RF_RaidReconDesc_t *reconDesc; +{ +#if RF_RECON_STATS > 0 + printf("RAIDframe: %lu recon event waits, %lu recon delays\n", + (long)reconDesc->numReconEventWaits, (long)reconDesc->numReconExecDelays); +#endif /* RF_RECON_STATS > 0 */ +#ifdef KERNEL + printf("RAIDframe: %lu max exec ticks\n", + (long)reconDesc->maxReconExecTicks); +#endif /* KERNEL */ +#if (RF_RECON_STATS > 0) || defined(KERNEL) + printf("\n"); +#endif /* (RF_RECON_STATS > 0) || KERNEL */ + RF_FREELIST_FREE(rf_recond_freelist,reconDesc,next); +} + + +/***************************************************************************************** + * + * primary routine to reconstruct a failed disk. This should be called from + * within its own thread. It won't return until reconstruction completes, + * fails, or is aborted. 
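+ *
+ * Only one reconstruction may be in progress per array at a time: other
+ * callers either block on waitForReconCond (non-SIMULATE builds) or are
+ * queued on pendingRecon (SIMULATE builds) until the current pass finishes,
+ * after which the work proper is done by rf_ReconstructFailedDiskBasic(),
+ * provided the layout supplies a SubmitReconBuffer routine.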
+ ****************************************************************************************/ +int rf_ReconstructFailedDisk(raidPtr, row, col) + RF_Raid_t *raidPtr; + RF_RowCol_t row; + RF_RowCol_t col; +{ +#ifdef SIMULATE + RF_PendingRecon_t *pend; + RF_RowCol_t r, c; +#endif /* SIMULATE */ + RF_LayoutSW_t *lp; + int rc; + + lp = raidPtr->Layout.map; + if (lp->SubmitReconBuffer) { + /* + * The current infrastructure only supports reconstructing one + * disk at a time for each array. + */ +#ifdef SIMULATE + if (raidPtr->reconInProgress) { + RF_Malloc(pend, sizeof(RF_PendingRecon_t), (RF_PendingRecon_t *)); + pend->row = row; + pend->col = col; + pend->next = raidPtr->pendingRecon; + raidPtr->pendingRecon = pend; + /* defer until current recon completes */ + return(0); + } + raidPtr->reconInProgress++; +#else /* SIMULATE */ + RF_LOCK_MUTEX(raidPtr->mutex); + while (raidPtr->reconInProgress) { + RF_WAIT_COND(raidPtr->waitForReconCond, raidPtr->mutex); + } + raidPtr->reconInProgress++; + RF_UNLOCK_MUTEX(raidPtr->mutex); +#endif /* SIMULATE */ + rc = rf_ReconstructFailedDiskBasic(raidPtr, row, col); + } + else { + RF_ERRORMSG1("RECON: no way to reconstruct failed disk for arch %c\n", + lp->parityConfig); + rc = EIO; + } +#ifdef SIMULATE + pend = raidPtr->pendingRecon; + if (pend) { + /* launch next recon */ + raidPtr->pendingRecon = pend->next; + r = pend->row; + c = pend->col; + RF_Free(pend, sizeof(RF_PendingRecon_t)); + return(rf_ReconstructFailedDisk(raidPtr, r, c)); + } +#else /* SIMULATE */ + RF_LOCK_MUTEX(raidPtr->mutex); + raidPtr->reconInProgress--; + RF_UNLOCK_MUTEX(raidPtr->mutex); + RF_SIGNAL_COND(raidPtr->waitForReconCond); +#if 1 +#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL) + wakeup(&raidPtr->waitForReconCond); /* XXX Methinks this will be needed + at some point... GO*/ +#endif +#endif +#endif /* SIMULATE */ + return(rc); +} + +int rf_ReconstructFailedDiskBasic(raidPtr, row, col) + RF_Raid_t *raidPtr; + RF_RowCol_t row; + RF_RowCol_t col; +{ + RF_RaidDisk_t *spareDiskPtr = NULL; + RF_RaidReconDesc_t *reconDesc; + RF_RowCol_t srow, scol; + int numDisksDone=0, rc; + + /* first look for a spare drive onto which to reconstruct the data */ + /* spare disk descriptors are stored in row 0. 
This may have to change eventually */ + + RF_LOCK_MUTEX(raidPtr->mutex); + RF_ASSERT (raidPtr->Disks[row][col].status == rf_ds_failed); + + if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) { + if (raidPtr->status[row] != rf_rs_degraded) { + RF_ERRORMSG2("Unable to reconstruct disk at row %d col %d because status not degraded\n",row,col); + RF_UNLOCK_MUTEX(raidPtr->mutex); + return(EINVAL); + } + srow = row; + scol = (-1); + } + else { + srow = 0; + for (scol=raidPtr->numCol; scol<raidPtr->numCol + raidPtr->numSpare; scol++) { + if (raidPtr->Disks[srow][scol].status == rf_ds_spare) { + spareDiskPtr = &raidPtr->Disks[srow][scol]; + spareDiskPtr->status = rf_ds_used_spare; + break; + } + } + if (!spareDiskPtr) { + RF_ERRORMSG2("Unable to reconstruct disk at row %d col %d because no spares are available\n",row,col); + RF_UNLOCK_MUTEX(raidPtr->mutex); + return(ENOSPC); + } + +#if RF_DEMO > 0 + if (!rf_demoMode) { +#endif /* RF_DEMO > 0 */ + printf("RECON: initiating reconstruction on row %d col %d -> spare at row %d col %d\n",row, col, srow, scol); +#if RF_DEMO > 0 + } +#endif /* RF_DEMO > 0 */ + } + RF_UNLOCK_MUTEX(raidPtr->mutex); + + reconDesc = AllocRaidReconDesc((void *) raidPtr, row, col,spareDiskPtr, numDisksDone, srow , scol); + raidPtr->reconDesc = (void *) reconDesc; +#if RF_RECON_STATS > 0 + reconDesc->hsStallCount = 0; + reconDesc->numReconExecDelays = 0; + reconDesc->numReconEventWaits = 0; +#endif /* RF_RECON_STATS > 0 */ +#ifdef KERNEL + reconDesc->reconExecTimerRunning = 0; + reconDesc->reconExecTicks = 0; + reconDesc->maxReconExecTicks = 0; +#endif /* KERNEL */ +#if RF_DEMO > 0 && !defined(SIMULATE) + if (rf_demoMode) { + char cbuf[10]; + printf("About to start reconstruction, hit return to continue:"); + gets(cbuf); + } +#endif /* RF_DEMO > 0 && !SIMULATE */ + rc = rf_ContinueReconstructFailedDisk(reconDesc); + return(rc); +} + + +int rf_ContinueReconstructFailedDisk(reconDesc) + RF_RaidReconDesc_t *reconDesc; +{ + RF_Raid_t *raidPtr=reconDesc->raidPtr; + RF_RowCol_t row=reconDesc->row; + RF_RowCol_t col=reconDesc->col; + RF_RowCol_t srow=reconDesc->srow; + RF_RowCol_t scol=reconDesc->scol; + RF_ReconMap_t *mapPtr; + + RF_ReconEvent_t *event; + struct timeval etime, elpsd; + unsigned long xor_s, xor_resid_us; + int retcode,i, ds; + + switch (reconDesc->state) + { + + + case 0: + + raidPtr->accumXorTimeUs = 0; + + /* create one trace record per physical disk */ + RF_Malloc(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t), (RF_AccTraceEntry_t *)); + + /* quiesce the array prior to starting recon. this is needed to assure no nasty interactions + * with pending user writes. We need to do this before we change the disk or row status. 
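+ *
+ * This suspend is paired with the rf_ResumeNewRequests() call in state 2,
+ * once the initial read has been issued on each surviving disk; the array
+ * is quiesced a second time in state 5, as described there, before
+ * rf_FreeReconControl() releases the parity stripe status structures.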
+ */ + reconDesc->state=1; + + Dprintf("RECON: begin request suspend\n"); + retcode = rf_SuspendNewRequestsAndWait(raidPtr); + Dprintf("RECON: end request suspend\n"); + rf_StartUserStats(raidPtr); /* zero out the stats kept on user accs */ + +#ifdef SIMULATE + if (retcode) return(0); +#endif /* SIMULATE */ + + /* fall through to state 1 */ + + case 1: + + RF_LOCK_MUTEX(raidPtr->mutex); + + /* create the reconstruction control pointer and install it in the right slot */ + raidPtr->reconControl[row] = rf_MakeReconControl(reconDesc, row, col, srow, scol); + mapPtr=raidPtr->reconControl[row]->reconMap; + raidPtr->status[row] = rf_rs_reconstructing; + raidPtr->Disks[row][col].status = rf_ds_reconstructing; + raidPtr->Disks[row][col].spareRow = srow; + raidPtr->Disks[row][col].spareCol = scol; + + RF_UNLOCK_MUTEX(raidPtr->mutex); + + RF_GETTIME(raidPtr->reconControl[row]->starttime); +#if RF_DEMO > 0 + if (rf_demoMode) { + rf_demo_update_mode(RF_DEMO_RECON); + rf_startup_recon_demo(rf_demoMeterVpos, raidPtr->numCol, + raidPtr->Layout.numDataCol+raidPtr->Layout.numParityCol, 0); + } +#endif /* RF_DEMO > 0 */ + + /* now start up the actual reconstruction: issue a read for each surviving disk */ + rf_start_cpu_monitor(); + reconDesc->numDisksDone = 0; + for (i=0; i<raidPtr->numCol; i++) { + if (i != col) { + /* find and issue the next I/O on the indicated disk */ + if (IssueNextReadRequest(raidPtr, row, i)) { + Dprintf2("RECON: done issuing for r%d c%d\n", row, i); + reconDesc->numDisksDone++; + } + } + } + + case 2: + Dprintf("RECON: resume requests\n"); + rf_ResumeNewRequests(raidPtr); + + + reconDesc->state=3; + + case 3: + + /* process reconstruction events until all disks report that they've completed all work */ + mapPtr=raidPtr->reconControl[row]->reconMap; + + + + while (reconDesc->numDisksDone < raidPtr->numCol-1) { + + event = rf_GetNextReconEvent(reconDesc, row, (void (*)(void *))rf_ContinueReconstructFailedDisk,reconDesc); +#ifdef SIMULATE + if (event==NULL) {return(0);} +#else /* SIMULATE */ + RF_ASSERT(event); +#endif /* SIMULATE */ + + if (ProcessReconEvent(raidPtr, row, event)) reconDesc->numDisksDone++; + raidPtr->reconControl[row]->percentComplete = 100 - (rf_UnitsLeftToReconstruct(mapPtr) * 100 / mapPtr->totalRUs); +#if RF_DEMO > 0 + if (rf_prReconSched || rf_demoMode) +#else /* RF_DEMO > 0 */ + if (rf_prReconSched) +#endif /* RF_DEMO > 0 */ + { + rf_PrintReconSchedule(raidPtr->reconControl[row]->reconMap, &(raidPtr->reconControl[row]->starttime)); + } + } + + + + reconDesc->state=4; + + + case 4: + mapPtr=raidPtr->reconControl[row]->reconMap; + if (rf_reconDebug) { + printf("RECON: all reads completed\n"); + } + + + + /* at this point all the reads have completed. 
We now wait for any pending writes + * to complete, and then we're done + */ + + while (rf_UnitsLeftToReconstruct(raidPtr->reconControl[row]->reconMap) > 0) { + + event = rf_GetNextReconEvent(reconDesc, row, (void (*)(void *))rf_ContinueReconstructFailedDisk,reconDesc); +#ifdef SIMULATE + if (event==NULL) {return(0);} +#else /* SIMULATE */ + RF_ASSERT(event); +#endif /* SIMULATE */ + + (void) ProcessReconEvent(raidPtr, row, event); /* ignore return code */ + raidPtr->reconControl[row]->percentComplete = 100 - (rf_UnitsLeftToReconstruct(mapPtr) * 100 / mapPtr->totalRUs); +#if RF_DEMO > 0 + if (rf_prReconSched || rf_demoMode) +#else /* RF_DEMO > 0 */ + if (rf_prReconSched) +#endif /* RF_DEMO > 0 */ + { + rf_PrintReconSchedule(raidPtr->reconControl[row]->reconMap, &(raidPtr->reconControl[row]->starttime)); + } + } + reconDesc->state=5; + + case 5: + rf_stop_cpu_monitor(); + + /* Success: mark the dead disk as reconstructed. We quiesce the array here to assure no + * nasty interactions with pending user accesses when we free up the psstatus structure + * as part of FreeReconControl() + */ + + + + reconDesc->state=6; + + retcode = rf_SuspendNewRequestsAndWait(raidPtr); + rf_StopUserStats(raidPtr); + rf_PrintUserStats(raidPtr); /* print out the stats on user accs accumulated during recon */ + +#ifdef SIMULATE + if (retcode) return(0); +#endif /* SIMULATE */ + + /* fall through to state 6 */ + case 6: + + + + RF_LOCK_MUTEX(raidPtr->mutex); + raidPtr->numFailures--; + ds = (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE); + raidPtr->Disks[row][col].status = (ds) ? rf_ds_dist_spared : rf_ds_spared; + raidPtr->status[row] = (ds) ? rf_rs_reconfigured : rf_rs_optimal; + RF_UNLOCK_MUTEX(raidPtr->mutex); + RF_GETTIME(etime); + RF_TIMEVAL_DIFF(&(raidPtr->reconControl[row]->starttime), &etime, &elpsd); + + /* XXX -- why is state 7 different from state 6 if there is no return() here? -- XXX + * Note that I set elpsd above & use it below, so if you put a return + * here you'll have to fix this. (also, FreeReconControl is called below) + */ + + case 7: + + rf_ResumeNewRequests(raidPtr); + +#if RF_DEMO > 0 + if (rf_demoMode) { + rf_finish_recon_demo(&elpsd); + } + else { +#endif /* RF_DEMO > 0 */ + printf("Reconstruction of disk at row %d col %d completed and spare disk reassigned\n", row, col); + xor_s = raidPtr->accumXorTimeUs/1000000; + xor_resid_us = raidPtr->accumXorTimeUs%1000000; + printf("Recon time was %d.%06d seconds, accumulated XOR time was %ld us (%ld.%06ld)\n", + (int)elpsd.tv_sec,(int)elpsd.tv_usec,raidPtr->accumXorTimeUs,xor_s,xor_resid_us); + printf(" (start time %d sec %d usec, end time %d sec %d usec)\n", + (int)raidPtr->reconControl[row]->starttime.tv_sec, + (int)raidPtr->reconControl[row]->starttime.tv_usec, + (int)etime.tv_sec, (int)etime.tv_usec); + rf_print_cpu_util("reconstruction"); +#if RF_RECON_STATS > 0 + printf("Total head-sep stall count was %d\n", + (int)reconDesc->hsStallCount); +#endif /* RF_RECON_STATS > 0 */ +#if RF_DEMO > 0 + } +#endif /* RF_DEMO > 0 */ + rf_FreeReconControl(raidPtr, row); + RF_Free(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t)); + FreeReconDesc(reconDesc); + + } + + SignalReconDone(raidPtr); + return (0); +} + +/***************************************************************************************** + * do the right thing upon each reconstruction event. 
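+ * (the event types handled are RF_REVENT_READDONE, WRITEDONE, BUFCLEAR,
+ * BLOCKCLEAR, HEADSEPCLEAR, BUFREADY, SKIP and FORCEDREADDONE; any other
+ * type panics)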
+ * returns nonzero if and only if there is nothing left unread on the indicated disk + ****************************************************************************************/ +static int ProcessReconEvent(raidPtr, frow, event) + RF_Raid_t *raidPtr; + RF_RowCol_t frow; + RF_ReconEvent_t *event; +{ + int retcode = 0, submitblocked; + RF_ReconBuffer_t *rbuf; + RF_SectorCount_t sectorsPerRU; + + Dprintf1("RECON: ProcessReconEvent type %d\n", event->type); + switch(event->type) { + + /* a read I/O has completed */ + case RF_REVENT_READDONE: + rbuf = raidPtr->reconControl[frow]->perDiskInfo[event->col].rbuf; + Dprintf3("RECON: READDONE EVENT: row %d col %d psid %ld\n", + frow, event->col, rbuf->parityStripeID); + Dprintf7("RECON: done read psid %ld buf %lx %02x %02x %02x %02x %02x\n", + rbuf->parityStripeID, rbuf->buffer, rbuf->buffer[0]&0xff, rbuf->buffer[1]&0xff, + rbuf->buffer[2]&0xff, rbuf->buffer[3]&0xff, rbuf->buffer[4]&0xff); + rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg); + submitblocked = rf_SubmitReconBuffer(rbuf, 0, 0); + Dprintf1("RECON: submitblocked=%d\n", submitblocked); + if (!submitblocked) retcode = IssueNextReadRequest(raidPtr, frow, event->col); + break; + + /* a write I/O has completed */ + case RF_REVENT_WRITEDONE: + if (rf_floatingRbufDebug) { + rf_CheckFloatingRbufCount(raidPtr, 1); + } + sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU; + rbuf = (RF_ReconBuffer_t *) event->arg; + rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg); + Dprintf3("RECON: WRITEDONE EVENT: psid %d ru %d (%d %% complete)\n", + rbuf->parityStripeID, rbuf->which_ru, raidPtr->reconControl[frow]->percentComplete); + rf_ReconMapUpdate(raidPtr, raidPtr->reconControl[frow]->reconMap, + rbuf->failedDiskSectorOffset, rbuf->failedDiskSectorOffset + sectorsPerRU -1); + rf_RemoveFromActiveReconTable(raidPtr, frow, rbuf->parityStripeID, rbuf->which_ru); + + if (rbuf->type == RF_RBUF_TYPE_FLOATING) { + RF_LOCK_MUTEX(raidPtr->reconControl[frow]->rb_mutex); + raidPtr->numFullReconBuffers--; + rf_ReleaseFloatingReconBuffer(raidPtr, frow, rbuf); + RF_UNLOCK_MUTEX(raidPtr->reconControl[frow]->rb_mutex); + } else if (rbuf->type == RF_RBUF_TYPE_FORCED) rf_FreeReconBuffer(rbuf); + else RF_ASSERT(0); + break; + + case RF_REVENT_BUFCLEAR: /* A buffer-stall condition has been cleared */ + Dprintf2("RECON: BUFCLEAR EVENT: row %d col %d\n",frow, event->col); + submitblocked = rf_SubmitReconBuffer(raidPtr->reconControl[frow]->perDiskInfo[event->col].rbuf, 0, (int) (long)event->arg); + RF_ASSERT(!submitblocked); /* we wouldn't have gotten the BUFCLEAR event if we couldn't submit */ + retcode = IssueNextReadRequest(raidPtr, frow, event->col); + break; + + case RF_REVENT_BLOCKCLEAR: /* A user-write reconstruction blockage has been cleared */ + DDprintf2("RECON: BLOCKCLEAR EVENT: row %d col %d\n",frow, event->col); + retcode = TryToRead(raidPtr, frow, event->col); + break; + + case RF_REVENT_HEADSEPCLEAR: /* A max-head-separation reconstruction blockage has been cleared */ + Dprintf2("RECON: HEADSEPCLEAR EVENT: row %d col %d\n",frow, event->col); + retcode = TryToRead(raidPtr, frow, event->col); + break; + + /* a buffer has become ready to write */ + case RF_REVENT_BUFREADY: + Dprintf2("RECON: BUFREADY EVENT: row %d col %d\n",frow, event->col); + retcode = IssueNextWriteRequest(raidPtr, frow); + if (rf_floatingRbufDebug) { + rf_CheckFloatingRbufCount(raidPtr, 1); + } + break; + + /* we need to skip the current RU entirely because it got recon'd while we were waiting for something 
else to happen */ + case RF_REVENT_SKIP: + DDprintf2("RECON: SKIP EVENT: row %d col %d\n",frow, event->col); + retcode = IssueNextReadRequest(raidPtr, frow, event->col); + break; + + /* a forced-reconstruction read access has completed. Just submit the buffer */ + case RF_REVENT_FORCEDREADDONE: + rbuf = (RF_ReconBuffer_t *) event->arg; + rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg); + DDprintf2("RECON: FORCEDREADDONE EVENT: row %d col %d\n",frow, event->col); + submitblocked = rf_SubmitReconBuffer(rbuf, 1, 0); + RF_ASSERT(!submitblocked); + break; + + default: + RF_PANIC(); + } + rf_FreeReconEventDesc(event); + return(retcode); +} + +/***************************************************************************************** + * + * find the next thing that's needed on the indicated disk, and issue a read + * request for it. We assume that the reconstruction buffer associated with this + * process is free to receive the data. If reconstruction is blocked on the + * indicated RU, we issue a blockage-release request instead of a physical disk + * read request. If the current disk gets too far ahead of the others, we issue + * a head-separation wait request and return. + * + * ctrl->{ru_count, curPSID, diskOffset} and rbuf->failedDiskSectorOffset are + * maintained to point the the unit we're currently accessing. Note that this deviates + * from the standard C idiom of having counters point to the next thing to be + * accessed. This allows us to easily retry when we're blocked by head separation + * or reconstruction-blockage events. + * + * returns nonzero if and only if there is nothing left unread on the indicated disk + ****************************************************************************************/ +static int IssueNextReadRequest(raidPtr, row, col) + RF_Raid_t *raidPtr; + RF_RowCol_t row; + RF_RowCol_t col; +{ + RF_PerDiskReconCtrl_t *ctrl = &raidPtr->reconControl[row]->perDiskInfo[col]; + RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; + RF_ReconBuffer_t *rbuf = ctrl->rbuf; + RF_ReconUnitCount_t RUsPerPU = layoutPtr->SUsPerPU / layoutPtr->SUsPerRU; + RF_SectorCount_t sectorsPerRU = layoutPtr->sectorsPerStripeUnit * layoutPtr->SUsPerRU; + int do_new_check = 0, retcode = 0, status; + + /* if we are currently the slowest disk, mark that we have to do a new check */ + if (ctrl->headSepCounter <= raidPtr->reconControl[row]->minHeadSepCounter) do_new_check = 1; + + while (1) { + + ctrl->ru_count++; + if (ctrl->ru_count < RUsPerPU) { + ctrl->diskOffset += sectorsPerRU; + rbuf->failedDiskSectorOffset += sectorsPerRU; + } else { + ctrl->curPSID++; + ctrl->ru_count = 0; + /* code left over from when head-sep was based on parity stripe id */ + if (ctrl->curPSID >= raidPtr->reconControl[row]->lastPSID) { + CheckForNewMinHeadSep(raidPtr, row, ++(ctrl->headSepCounter)); + return(1); /* finito! */ + } + + /* find the disk offsets of the start of the parity stripe on both the current disk and the failed disk. 
+ * skip this entire parity stripe if either disk does not appear in the indicated PS + */ + status = ComputePSDiskOffsets(raidPtr, ctrl->curPSID, row, col, &ctrl->diskOffset, &rbuf->failedDiskSectorOffset, + &rbuf->spRow, &rbuf->spCol, &rbuf->spOffset); + if (status) { + ctrl->ru_count = RUsPerPU-1; continue; + } + } + rbuf->which_ru = ctrl->ru_count; + + /* skip this RU if it's already been reconstructed */ + if (rf_CheckRUReconstructed(raidPtr->reconControl[row]->reconMap, rbuf->failedDiskSectorOffset)) { + Dprintf2("Skipping psid %ld ru %d: already reconstructed\n",ctrl->curPSID,ctrl->ru_count); + continue; + } + break; + } + ctrl->headSepCounter++; + if (do_new_check) CheckForNewMinHeadSep(raidPtr, row, ctrl->headSepCounter); /* update min if needed */ + + + /* at this point, we have definitely decided what to do, and we have only to see if we can actually do it now */ + rbuf->parityStripeID = ctrl->curPSID; + rbuf->which_ru = ctrl->ru_count; + bzero((char *)&raidPtr->recon_tracerecs[col], sizeof(raidPtr->recon_tracerecs[col])); + raidPtr->recon_tracerecs[col].reconacc = 1; + RF_ETIMER_START(raidPtr->recon_tracerecs[col].recon_timer); + retcode = TryToRead(raidPtr, row, col); + return(retcode); +} + +/* tries to issue the next read on the indicated disk. We may be blocked by (a) the heads being too + * far apart, or (b) recon on the indicated RU being blocked due to a write by a user thread. + * In this case, we issue a head-sep or blockage wait request, which will cause this same routine + * to be invoked again later when the blockage has cleared. + */ +static int TryToRead(raidPtr, row, col) + RF_Raid_t *raidPtr; + RF_RowCol_t row; + RF_RowCol_t col; +{ + RF_PerDiskReconCtrl_t *ctrl = &raidPtr->reconControl[row]->perDiskInfo[col]; + RF_SectorCount_t sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU; + RF_StripeNum_t psid = ctrl->curPSID; + RF_ReconUnitNum_t which_ru = ctrl->ru_count; + RF_DiskQueueData_t *req; + int status, created = 0; + RF_ReconParityStripeStatus_t *pssPtr; + + /* if the current disk is too far ahead of the others, issue a head-separation wait and return */ + if (CheckHeadSeparation(raidPtr, ctrl, row, col, ctrl->headSepCounter, which_ru)) return(0); + RF_LOCK_PSS_MUTEX(raidPtr, row, psid); + pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl[row]->pssTable, psid, which_ru, RF_PSS_CREATE, &created); + + /* if recon is blocked on the indicated parity stripe, issue a block-wait request and return. + * this also must mark the indicated RU in the stripe as under reconstruction if not blocked. + */ + status = CheckForcedOrBlockedReconstruction(raidPtr, pssPtr, ctrl, row, col, psid, which_ru); + if (status == RF_PSS_RECON_BLOCKED) { + Dprintf2("RECON: Stalling psid %ld ru %d: recon blocked\n",psid,which_ru); + goto out; + } else if (status == RF_PSS_FORCED_ON_WRITE) { + rf_CauseReconEvent(raidPtr, row, col, NULL, RF_REVENT_SKIP); + goto out; + } + + /* make one last check to be sure that the indicated RU didn't get reconstructed while + * we were waiting for something else to happen. This is unfortunate in that it causes + * us to make this check twice in the normal case. Might want to make some attempt to + * re-work this so that we only do this check if we've definitely blocked on one of the + * above checks. When this condition is detected, we may have just created a bogus + * status entry, which we need to delete. 
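+ * The 'created' flag handed back by rf_LookupRUStatus tells us whether
+ * that bogus entry was created on our behalf, and hence whether
+ * rf_PSStatusDelete() must be called to remove it.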
+ */ + if (rf_CheckRUReconstructed(raidPtr->reconControl[row]->reconMap, ctrl->rbuf->failedDiskSectorOffset)) { + Dprintf2("RECON: Skipping psid %ld ru %d: prior recon after stall\n",psid,which_ru); + if (created) rf_PSStatusDelete(raidPtr, raidPtr->reconControl[row]->pssTable, pssPtr); + rf_CauseReconEvent(raidPtr, row, col, NULL, RF_REVENT_SKIP); + goto out; + } + + /* found something to read. issue the I/O */ + Dprintf5("RECON: Read for psid %ld on row %d col %d offset %ld buf %lx\n", + psid, row, col, ctrl->diskOffset, ctrl->rbuf->buffer); + RF_ETIMER_STOP(raidPtr->recon_tracerecs[col].recon_timer); + RF_ETIMER_EVAL(raidPtr->recon_tracerecs[col].recon_timer); + raidPtr->recon_tracerecs[col].specific.recon.recon_start_to_fetch_us = + RF_ETIMER_VAL_US(raidPtr->recon_tracerecs[col].recon_timer); + RF_ETIMER_START(raidPtr->recon_tracerecs[col].recon_timer); + + /* should be ok to use a NULL proc pointer here, all the bufs we use should be in kernel space */ + req = rf_CreateDiskQueueData(RF_IO_TYPE_READ, ctrl->diskOffset, sectorsPerRU, ctrl->rbuf->buffer, psid, which_ru, + ReconReadDoneProc, (void *) ctrl, NULL, &raidPtr->recon_tracerecs[col], (void *)raidPtr, 0, NULL); + + RF_ASSERT(req); /* XXX -- fix this -- XXX */ + + ctrl->rbuf->arg = (void *) req; + rf_DiskIOEnqueue(&raidPtr->Queues[row][col], req, RF_IO_RECON_PRIORITY); + pssPtr->issued[col] = 1; + +out: + RF_UNLOCK_PSS_MUTEX(raidPtr, row, psid); + return(0); +} + + +/* given a parity stripe ID, we want to find out whether both the current disk and the + * failed disk exist in that parity stripe. If not, we want to skip this whole PS. + * If so, we want to find the disk offset of the start of the PS on both the current + * disk and the failed disk. + * + * this works by getting a list of disks comprising the indicated parity stripe, and + * searching the list for the current and failed disks. Once we've decided they both + * exist in the parity stripe, we need to decide whether each is data or parity, + * so that we'll know which mapping function to call to get the corresponding disk + * offsets. + * + * this is kind of unpleasant, but doing it this way allows the reconstruction code + * to use parity stripe IDs rather than physical disks address to march through the + * failed disk, which greatly simplifies a lot of code, as well as eliminating the + * need for a reverse-mapping function. I also think it will execute faster, since + * the calls to the mapping module are kept to a minimum. 
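+ *
+ * As an aside on the arithmetic below: i_offset/j_offset count only the
+ * data units that precede the [current,failed] disk in the IdentifyStripe
+ * listing, so an index is decremented by one when the parity disk appears
+ * earlier in the listing, and is zeroed (with [ij]_is_parity set) when the
+ * disk in question is itself the parity disk, in which case MapParity
+ * rather than MapSector supplies the offset.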
+ * + * ASSUMES THAT THE STRIPE IDENTIFIER IDENTIFIES THE DISKS COMPRISING THE STRIPE + * IN THE CORRECT ORDER + */ +static int ComputePSDiskOffsets( + RF_Raid_t *raidPtr, /* raid descriptor */ + RF_StripeNum_t psid, /* parity stripe identifier */ + RF_RowCol_t row, /* row and column of disk to find the offsets for */ + RF_RowCol_t col, + RF_SectorNum_t *outDiskOffset, + RF_SectorNum_t *outFailedDiskSectorOffset, + RF_RowCol_t *spRow, /* OUT: row,col of spare unit for failed unit */ + RF_RowCol_t *spCol, + RF_SectorNum_t *spOffset) /* OUT: offset into disk containing spare unit */ +{ + RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; + RF_RowCol_t fcol = raidPtr->reconControl[row]->fcol; + RF_RaidAddr_t sosRaidAddress; /* start-of-stripe */ + RF_RowCol_t *diskids; + u_int i, j, k, i_offset, j_offset; + RF_RowCol_t prow, pcol; + int testcol, testrow; + RF_RowCol_t stripe; + RF_SectorNum_t poffset; + char i_is_parity=0, j_is_parity=0; + RF_RowCol_t stripeWidth = layoutPtr->numDataCol + layoutPtr->numParityCol; + + /* get a listing of the disks comprising that stripe */ + sosRaidAddress = rf_ParityStripeIDToRaidAddress(layoutPtr, psid); + (layoutPtr->map->IdentifyStripe)(raidPtr, sosRaidAddress, &diskids, &stripe); + RF_ASSERT(diskids); + + /* reject this entire parity stripe if it does not contain the indicated disk or it does not contain the failed disk */ + if (row != stripe) + goto skipit; + for (i=0; i<stripeWidth; i++) { + if (col == diskids[i]) + break; + } + if (i == stripeWidth) + goto skipit; + for (j=0; j<stripeWidth; j++) { + if (fcol == diskids[j]) + break; + } + if (j == stripeWidth) { + goto skipit; + } + + /* find out which disk the parity is on */ + (layoutPtr->map->MapParity)(raidPtr, sosRaidAddress, &prow, &pcol, &poffset, RF_DONT_REMAP); + + /* find out if either the current RU or the failed RU is parity */ + /* also, if the parity occurs in this stripe prior to the data and/or failed col, we need to decrement i and/or j */ + for (k=0; k<stripeWidth; k++) + if (diskids[k] == pcol) + break; + RF_ASSERT(k < stripeWidth); + i_offset = i; j_offset=j; + if (k < i) i_offset--; else if (k==i) {i_is_parity = 1; i_offset = 0;} /* set offsets to zero to disable multiply below */ + if (k < j) j_offset--; else if (k==j) {j_is_parity = 1; j_offset = 0;} + + /* at this point, [ij]_is_parity tells us whether the [current,failed] disk is parity at + * the start of this RU, and, if data, "[ij]_offset" tells us how far into the stripe + * the [current,failed] disk is. + */ + + /* call the mapping routine to get the offset into the current disk, repeat for failed disk. 
*/ + if (i_is_parity) + layoutPtr->map->MapParity(raidPtr, sosRaidAddress + i_offset * layoutPtr->sectorsPerStripeUnit, &testrow, &testcol, outDiskOffset, RF_DONT_REMAP); + else + layoutPtr->map->MapSector(raidPtr, sosRaidAddress + i_offset * layoutPtr->sectorsPerStripeUnit, &testrow, &testcol, outDiskOffset, RF_DONT_REMAP); + + RF_ASSERT(row == testrow && col == testcol); + + if (j_is_parity) + layoutPtr->map->MapParity(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, &testrow, &testcol, outFailedDiskSectorOffset, RF_DONT_REMAP); + else + layoutPtr->map->MapSector(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, &testrow, &testcol, outFailedDiskSectorOffset, RF_DONT_REMAP); + RF_ASSERT(row == testrow && fcol == testcol); + + /* now locate the spare unit for the failed unit */ + if (layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) { + if (j_is_parity) + layoutPtr->map->MapParity(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, spRow, spCol, spOffset, RF_REMAP); + else + layoutPtr->map->MapSector(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, spRow, spCol, spOffset, RF_REMAP); + } else { + *spRow = raidPtr->reconControl[row]->spareRow; + *spCol = raidPtr->reconControl[row]->spareCol; + *spOffset = *outFailedDiskSectorOffset; + } + + return(0); + +skipit: + Dprintf3("RECON: Skipping psid %ld: nothing needed from r%d c%d\n", + psid, row, col); + return(1); +} + +/* this is called when a buffer has become ready to write to the replacement disk */ +static int IssueNextWriteRequest(raidPtr, row) + RF_Raid_t *raidPtr; + RF_RowCol_t row; +{ + RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; + RF_SectorCount_t sectorsPerRU = layoutPtr->sectorsPerStripeUnit * layoutPtr->SUsPerRU; + RF_RowCol_t fcol = raidPtr->reconControl[row]->fcol; + RF_ReconBuffer_t *rbuf; + RF_DiskQueueData_t *req; + + rbuf = rf_GetFullReconBuffer(raidPtr->reconControl[row]); + RF_ASSERT(rbuf); /* there must be one available, or we wouldn't have gotten the event that sent us here */ + RF_ASSERT(rbuf->pssPtr); + + rbuf->pssPtr->writeRbuf = rbuf; + rbuf->pssPtr = NULL; + + Dprintf7("RECON: New write (r %d c %d offs %d) for psid %ld ru %d (failed disk offset %ld) buf %lx\n", + rbuf->spRow, rbuf->spCol, rbuf->spOffset, rbuf->parityStripeID, + rbuf->which_ru, rbuf->failedDiskSectorOffset, rbuf->buffer); + Dprintf6("RECON: new write psid %ld %02x %02x %02x %02x %02x\n", + rbuf->parityStripeID, rbuf->buffer[0]&0xff, rbuf->buffer[1]&0xff, + rbuf->buffer[2]&0xff, rbuf->buffer[3]&0xff, rbuf->buffer[4]&0xff); + + /* should be ok to use a NULL b_proc here b/c all addrs should be in kernel space */ + req = rf_CreateDiskQueueData(RF_IO_TYPE_WRITE, rbuf->spOffset, + sectorsPerRU, rbuf->buffer, + rbuf->parityStripeID, rbuf->which_ru, + ReconWriteDoneProc, (void *) rbuf, NULL, + &raidPtr->recon_tracerecs[fcol], + (void *)raidPtr, 0, NULL); + + RF_ASSERT(req); /* XXX -- fix this -- XXX */ + + rbuf->arg = (void *) req; + rf_DiskIOEnqueue(&raidPtr->Queues[rbuf->spRow][rbuf->spCol], req, RF_IO_RECON_PRIORITY); + + return(0); +} + +/* this gets called upon the completion of a reconstruction read operation + * the arg is a pointer to the per-disk reconstruction control structure + * for the process that just finished a read. + * + * called at interrupt context in the kernel, so don't do anything illegal here. 
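+ * All we do here is update the per-disk trace timer and post an
+ * RF_REVENT_READDONE event via rf_CauseReconEvent(); the buffer itself is
+ * submitted later from ProcessReconEvent().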
+ */ +static int ReconReadDoneProc(arg, status) + void *arg; + int status; +{ + RF_PerDiskReconCtrl_t *ctrl = (RF_PerDiskReconCtrl_t *) arg; + RF_Raid_t *raidPtr = ctrl->reconCtrl->reconDesc->raidPtr; + + if (status) { + /* + * XXX + */ + printf("Recon read failed!\n"); + RF_PANIC(); + } + + RF_ETIMER_STOP(raidPtr->recon_tracerecs[ctrl->col].recon_timer); + RF_ETIMER_EVAL(raidPtr->recon_tracerecs[ctrl->col].recon_timer); + raidPtr->recon_tracerecs[ctrl->col].specific.recon.recon_fetch_to_return_us = + RF_ETIMER_VAL_US(raidPtr->recon_tracerecs[ctrl->col].recon_timer); + RF_ETIMER_START(raidPtr->recon_tracerecs[ctrl->col].recon_timer); + + rf_CauseReconEvent(raidPtr, ctrl->row, ctrl->col, NULL, RF_REVENT_READDONE); + return(0); +} + +/* this gets called upon the completion of a reconstruction write operation. + * the arg is a pointer to the rbuf that was just written + * + * called at interrupt context in the kernel, so don't do anything illegal here. + */ +static int ReconWriteDoneProc(arg, status) + void *arg; + int status; +{ + RF_ReconBuffer_t *rbuf = (RF_ReconBuffer_t *) arg; + + Dprintf2("Reconstruction completed on psid %ld ru %d\n",rbuf->parityStripeID, rbuf->which_ru); + if (status) {printf("Recon write failed!\n"); /*fprintf(stderr,"Recon write failed!\n");*/ RF_PANIC();} + rf_CauseReconEvent((RF_Raid_t *) rbuf->raidPtr, rbuf->row, rbuf->col, arg, RF_REVENT_WRITEDONE); + return(0); +} + + +/* computes a new minimum head sep, and wakes up anyone who needs to be woken as a result */ +static void CheckForNewMinHeadSep(raidPtr, row, hsCtr) + RF_Raid_t *raidPtr; + RF_RowCol_t row; + RF_HeadSepLimit_t hsCtr; +{ + RF_ReconCtrl_t *reconCtrlPtr = raidPtr->reconControl[row]; + RF_HeadSepLimit_t new_min; + RF_RowCol_t i; + RF_CallbackDesc_t *p; + RF_ASSERT(hsCtr >= reconCtrlPtr->minHeadSepCounter); /* from the definition of a minimum */ + + + RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex); + + new_min = ~ (1L<< (8*sizeof(long)-1)); /* 0x7FFF....FFF */ + for (i=0; i<raidPtr->numCol; i++) if (i != reconCtrlPtr->fcol) { + if (reconCtrlPtr->perDiskInfo[i].headSepCounter < new_min) new_min = reconCtrlPtr->perDiskInfo[i].headSepCounter; + } + + /* set the new minimum and wake up anyone who can now run again */ + if (new_min != reconCtrlPtr->minHeadSepCounter) { + reconCtrlPtr->minHeadSepCounter = new_min; + Dprintf1("RECON: new min head pos counter val is %ld\n",new_min); + while (reconCtrlPtr->headSepCBList) { + if (reconCtrlPtr->headSepCBList->callbackArg.v > new_min) break; + p = reconCtrlPtr->headSepCBList; + reconCtrlPtr->headSepCBList = p->next; + p->next = NULL; + rf_CauseReconEvent(raidPtr, p->row, p->col, NULL, RF_REVENT_HEADSEPCLEAR); + rf_FreeCallbackDesc(p); + } + + } + + RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex); +} + +/* checks to see that the maximum head separation will not be violated + * if we initiate a reconstruction I/O on the indicated disk. Limiting the + * maximum head separation between two disks eliminates the nasty buffer-stall + * conditions that occur when one disk races ahead of the others and consumes + * all of the floating recon buffers. This code is complex and unpleasant + * but it's necessary to avoid some very nasty, albeit fairly rare, + * reconstruction behavior. + * + * returns non-zero if and only if we have to stop working on the indicated disk + * due to a head-separation delay. 
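+ *
+ * Worked example of the 20% hysteresis used below: with headSepLimit =
+ * 1000 and ctrl->headSepCounter = 2500, we stall once minHeadSepCounter
+ * falls more than 1000 behind, and the wakeup callback is keyed to
+ * minHeadSepCounter reaching 2500 - 1000 + 1000/5 = 1700, i.e. the gap
+ * must shrink back to 800 (80% of the limit) before this disk resumes.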
+ */ +static int CheckHeadSeparation( + RF_Raid_t *raidPtr, + RF_PerDiskReconCtrl_t *ctrl, + RF_RowCol_t row, + RF_RowCol_t col, + RF_HeadSepLimit_t hsCtr, + RF_ReconUnitNum_t which_ru) +{ + RF_ReconCtrl_t *reconCtrlPtr = raidPtr->reconControl[row]; + RF_CallbackDesc_t *cb, *p, *pt; + int retval = 0, tid; + + /* if we're too far ahead of the slowest disk, stop working on this disk + * until the slower ones catch up. We do this by scheduling a wakeup callback + * for the time when the slowest disk has caught up. We define "caught up" + * with 20% hysteresis, i.e. the head separation must have fallen to at most + * 80% of the max allowable head separation before we'll wake up. + * + */ + rf_get_threadid(tid); + RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex); + if ((raidPtr->headSepLimit >= 0) && + ((ctrl->headSepCounter - reconCtrlPtr->minHeadSepCounter) > raidPtr->headSepLimit)) + { + Dprintf6("[%d] RECON: head sep stall: row %d col %d hsCtr %ld minHSCtr %ld limit %ld\n", + tid,row,col,ctrl->headSepCounter, reconCtrlPtr->minHeadSepCounter, raidPtr->headSepLimit); + cb = rf_AllocCallbackDesc(); + /* the minHeadSepCounter value we have to get to before we'll wake up. build in 20% hysteresis. */ + cb->callbackArg.v = (ctrl->headSepCounter - raidPtr->headSepLimit + raidPtr->headSepLimit/5); + cb->row = row; cb->col = col; + cb->next = NULL; + + /* insert this callback descriptor into the sorted list of pending head-sep callbacks */ + p = reconCtrlPtr->headSepCBList; + if (!p) reconCtrlPtr->headSepCBList = cb; + else if (cb->callbackArg.v < p->callbackArg.v) { + cb->next = reconCtrlPtr->headSepCBList; + reconCtrlPtr->headSepCBList = cb; + } + else { + for (pt=p, p=p->next; p && (p->callbackArg.v < cb->callbackArg.v); pt=p,p=p->next); + cb->next = p; + pt->next = cb; + } + retval = 1; +#if RF_RECON_STATS > 0 + ctrl->reconCtrl->reconDesc->hsStallCount++; +#endif /* RF_RECON_STATS > 0 */ + } + RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex); + + return(retval); +} + +/* checks to see if reconstruction has been either forced or blocked by a user operation. + * if forced, we skip this RU entirely. + * else if blocked, put ourselves on the wait list. + * else return 0. + * + * ASSUMES THE PSS MUTEX IS LOCKED UPON ENTRY + */ +static int CheckForcedOrBlockedReconstruction( + RF_Raid_t *raidPtr, + RF_ReconParityStripeStatus_t *pssPtr, + RF_PerDiskReconCtrl_t *ctrl, + RF_RowCol_t row, + RF_RowCol_t col, + RF_StripeNum_t psid, + RF_ReconUnitNum_t which_ru) +{ + RF_CallbackDesc_t *cb; + int retcode = 0; + + if ((pssPtr->flags & RF_PSS_FORCED_ON_READ) || (pssPtr->flags & RF_PSS_FORCED_ON_WRITE)) retcode = RF_PSS_FORCED_ON_WRITE; + else if (pssPtr->flags & RF_PSS_RECON_BLOCKED) { + Dprintf4("RECON: row %d col %d blocked at psid %ld ru %d\n",row, col, psid, which_ru); + cb = rf_AllocCallbackDesc(); /* append ourselves to the blockage-wait list */ + cb->row = row; cb->col = col; + cb->next = pssPtr->blockWaitList; + pssPtr->blockWaitList = cb; + retcode = RF_PSS_RECON_BLOCKED; + } + + if (!retcode) pssPtr->flags |= RF_PSS_UNDER_RECON; /* mark this RU as under reconstruction */ + + return(retcode); +} + +/* if reconstruction is currently ongoing for the indicated stripeID, reconstruction + * is forced to completion and we return non-zero to indicate that the caller must + * wait. If not, then reconstruction is blocked on the indicated stripe and the + * routine returns zero. If and only if we return non-zero, we'll cause the cbFunc + * to get invoked with the cbArg when the reconstruction has completed. 
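+ *
+ * Forcing works by promoting any already-issued low-priority recon reads
+ * to normal priority via rf_DiskIOPromote(), issuing the not-yet-issued
+ * reads right away into RF_RBUF_TYPE_FORCED buffers, promoting the
+ * pending write on the failed column as well, and queueing the
+ * (cbFunc,cbArg) pair on the parity stripe's procWaitList.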
+ */ +int rf_ForceOrBlockRecon(raidPtr, asmap, cbFunc, cbArg) + RF_Raid_t *raidPtr; + RF_AccessStripeMap_t *asmap; + void (*cbFunc)(RF_Raid_t *,void *); + void *cbArg; +{ + RF_RowCol_t row = asmap->physInfo->row; /* which row of the array we're working on */ + RF_StripeNum_t stripeID = asmap->stripeID; /* the stripe ID we're forcing recon on */ + RF_SectorCount_t sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU; /* num sects in one RU */ + RF_ReconParityStripeStatus_t *pssPtr; /* a pointer to the parity stripe status structure */ + RF_StripeNum_t psid; /* parity stripe id */ + RF_SectorNum_t offset, fd_offset; /* disk offset, failed-disk offset */ + RF_RowCol_t *diskids; + RF_RowCol_t stripe; + int tid; + RF_ReconUnitNum_t which_ru; /* RU within parity stripe */ + RF_RowCol_t fcol, diskno, i; + RF_ReconBuffer_t *new_rbuf; /* ptr to newly allocated rbufs */ + RF_DiskQueueData_t *req; /* disk I/O req to be enqueued */ + RF_CallbackDesc_t *cb; + int created = 0, nPromoted; + + rf_get_threadid(tid); + psid = rf_MapStripeIDToParityStripeID(&raidPtr->Layout, stripeID, &which_ru); + + RF_LOCK_PSS_MUTEX(raidPtr, row, psid); + + pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl[row]->pssTable, psid, which_ru, RF_PSS_CREATE|RF_PSS_RECON_BLOCKED, &created); + + /* if recon is not ongoing on this PS, just return */ + if (!(pssPtr->flags & RF_PSS_UNDER_RECON)) { + RF_UNLOCK_PSS_MUTEX(raidPtr, row, psid); + return(0); + } + + /* otherwise, we have to wait for reconstruction to complete on this RU. */ + /* In order to avoid waiting for a potentially large number of low-priority accesses to + * complete, we force a normal-priority (i.e. not low-priority) reconstruction + * on this RU. + */ + if (!(pssPtr->flags & RF_PSS_FORCED_ON_WRITE) && !(pssPtr->flags & RF_PSS_FORCED_ON_READ)) { + DDprintf1("Forcing recon on psid %ld\n",psid); + pssPtr->flags |= RF_PSS_FORCED_ON_WRITE; /* mark this RU as under forced recon */ + pssPtr->flags &= ~RF_PSS_RECON_BLOCKED; /* clear the blockage that we just set */ + fcol = raidPtr->reconControl[row]->fcol; + + /* get a listing of the disks comprising the indicated stripe */ + (raidPtr->Layout.map->IdentifyStripe)(raidPtr, asmap->raidAddress, &diskids, &stripe); + RF_ASSERT(row == stripe); + + /* For previously issued reads, elevate them to normal priority. If the I/O has already completed, + * it won't be found in the queue, and hence this will be a no-op. + * For unissued reads, allocate buffers and issue new reads. 
The fact that we've set the + * FORCED bit means that the regular recon procs will not re-issue these reqs + */ + for (i=0; i<raidPtr->Layout.numDataCol+raidPtr->Layout.numParityCol; i++) if ( (diskno = diskids[i]) != fcol) { + if (pssPtr->issued[diskno]) { + nPromoted = rf_DiskIOPromote(&raidPtr->Queues[row][diskno], psid, which_ru); + if (rf_reconDebug && nPromoted) printf("[%d] promoted read from row %d col %d\n",tid,row,diskno); + } else { + new_rbuf = rf_MakeReconBuffer(raidPtr, row, diskno, RF_RBUF_TYPE_FORCED); /* create new buf */ + ComputePSDiskOffsets(raidPtr, psid, row, diskno, &offset, &fd_offset, + &new_rbuf->spRow, &new_rbuf->spCol, &new_rbuf->spOffset); /* find offsets & spare location */ + new_rbuf->parityStripeID = psid; /* fill in the buffer */ + new_rbuf->which_ru = which_ru; + new_rbuf->failedDiskSectorOffset = fd_offset; + new_rbuf->priority = RF_IO_NORMAL_PRIORITY; + + /* use NULL b_proc b/c all addrs should be in kernel space */ + req = rf_CreateDiskQueueData(RF_IO_TYPE_READ, offset + which_ru * sectorsPerRU, sectorsPerRU, new_rbuf->buffer, + psid, which_ru, (int (*)(void *, int))ForceReconReadDoneProc, (void *) new_rbuf, NULL, + NULL,(void *)raidPtr, 0, NULL); + + RF_ASSERT(req); /* XXX -- fix this -- XXX */ + + new_rbuf->arg = req; + rf_DiskIOEnqueue(&raidPtr->Queues[row][diskno], req, RF_IO_NORMAL_PRIORITY); /* enqueue the I/O */ + Dprintf3("[%d] Issued new read req on row %d col %d\n",tid,row,diskno); + } + } + + /* if the write is sitting in the disk queue, elevate its priority */ + if (rf_DiskIOPromote(&raidPtr->Queues[row][fcol], psid, which_ru)) printf("[%d] promoted write to row %d col %d\n",tid,row,fcol); + } + + /* install a callback descriptor to be invoked when recon completes on this parity stripe. */ + cb = rf_AllocCallbackDesc(); + /* XXX the following is bogus.. These functions don't really match!! GO */ + cb->callbackFunc = (void (*)(RF_CBParam_t))cbFunc; + cb->callbackArg.p = (void *) cbArg; + cb->next = pssPtr->procWaitList; + pssPtr->procWaitList = cb; + DDprintf2("[%d] Waiting for forced recon on psid %ld\n",tid,psid); + + RF_UNLOCK_PSS_MUTEX(raidPtr, row, psid); + return(1); +} + +/* called upon the completion of a forced reconstruction read. + * all we do is schedule the FORCEDREADONE event. + * called at interrupt context in the kernel, so don't do anything illegal here. + */ +static void ForceReconReadDoneProc(arg, status) + void *arg; + int status; +{ + RF_ReconBuffer_t *rbuf = arg; + + if (status) {printf("Forced recon read failed!\n"); /*fprintf(stderr,"Forced recon read failed!\n");*/ RF_PANIC();} + rf_CauseReconEvent((RF_Raid_t *) rbuf->raidPtr, rbuf->row, rbuf->col, (void *) rbuf, RF_REVENT_FORCEDREADDONE); +} + +/* releases a block on the reconstruction of the indicated stripe */ +int rf_UnblockRecon(raidPtr, asmap) + RF_Raid_t *raidPtr; + RF_AccessStripeMap_t *asmap; +{ + RF_RowCol_t row = asmap->origRow; + RF_StripeNum_t stripeID = asmap->stripeID; + RF_ReconParityStripeStatus_t *pssPtr; + RF_ReconUnitNum_t which_ru; + RF_StripeNum_t psid; + int tid, created = 0; + RF_CallbackDesc_t *cb; + + rf_get_threadid(tid); + psid = rf_MapStripeIDToParityStripeID(&raidPtr->Layout, stripeID, &which_ru); + RF_LOCK_PSS_MUTEX( raidPtr, row, psid); + pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl[row]->pssTable, psid, which_ru, RF_PSS_NONE, &created); + + /* When recon is forced, the pss desc can get deleted before we get back to unblock recon. + * But, this can _only_ happen when recon is forced. 
+ * It would be good to put some kind of sanity check here, but how to decide if recon + * was just forced or not? + */ + if (!pssPtr) { + /*printf("Warning: no pss descriptor upon unblock on psid %ld RU %d\n",psid,which_ru);*/ + if (rf_reconDebug || rf_pssDebug) printf("Warning: no pss descriptor upon unblock on psid %ld RU %d\n",(long)psid,which_ru); + goto out; + } + + pssPtr->blockCount--; + Dprintf3("[%d] unblocking recon on psid %ld: blockcount is %d\n",tid,psid,pssPtr->blockCount); + if (pssPtr->blockCount == 0) { /* if recon blockage has been released */ + + /* unblock recon before calling CauseReconEvent in case CauseReconEvent causes us to + * try to issue a new read before returning here. + */ + pssPtr->flags &= ~RF_PSS_RECON_BLOCKED; + + + while (pssPtr->blockWaitList) { /* spin through the block-wait list and release all the waiters */ + cb = pssPtr->blockWaitList; + pssPtr->blockWaitList = cb->next; + cb->next = NULL; + rf_CauseReconEvent(raidPtr, cb->row, cb->col, NULL, RF_REVENT_BLOCKCLEAR); + rf_FreeCallbackDesc(cb); + } + if (!(pssPtr->flags & RF_PSS_UNDER_RECON)) { /* if no recon was requested while recon was blocked */ + rf_PSStatusDelete(raidPtr, raidPtr->reconControl[row]->pssTable, pssPtr); + } + } + +out: + RF_UNLOCK_PSS_MUTEX( raidPtr, row, psid ); + return(0); +} diff --git a/sys/dev/raidframe/rf_reconstruct.h b/sys/dev/raidframe/rf_reconstruct.h new file mode 100644 index 00000000000..5913e626609 --- /dev/null +++ b/sys/dev/raidframe/rf_reconstruct.h @@ -0,0 +1,258 @@ +/* $OpenBSD: rf_reconstruct.h,v 1.1 1999/01/11 14:29:47 niklas Exp $ */ +/* $NetBSD: rf_reconstruct.h,v 1.1 1998/11/13 04:20:34 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. 
+ */ + +/********************************************************* + * rf_reconstruct.h -- header file for reconstruction code + *********************************************************/ + +/* : + * Log: rf_reconstruct.h,v + * Revision 1.25 1996/08/01 15:57:24 jimz + * minor cleanup + * + * Revision 1.24 1996/07/22 19:52:16 jimz + * switched node params to RF_DagParam_t, a union of + * a 64-bit int and a void *, for better portability + * attempted hpux port, but failed partway through for + * lack of a single C compiler capable of compiling all + * source files + * + * Revision 1.23 1996/07/15 05:40:41 jimz + * some recon datastructure cleanup + * better handling of multiple failures + * added undocumented double-recon test + * + * Revision 1.22 1996/07/13 00:00:59 jimz + * sanitized generalized reconstruction architecture + * cleaned up head sep, rbuf problems + * + * Revision 1.21 1996/07/11 19:08:00 jimz + * generalize reconstruction mechanism + * allow raid1 reconstructs via copyback (done with array + * quiesced, not online, therefore not disk-directed) + * + * Revision 1.20 1996/06/11 10:57:30 jimz + * add rf_RegisterReconDoneProc + * + * Revision 1.19 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). + * + * Revision 1.18 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.17 1996/06/05 18:06:02 jimz + * Major code cleanup. The Great Renaming is now done. + * Better modularity. Better typing. Fixed a bunch of + * synchronization bugs. Made a lot of global stuff + * per-desc or per-array. Removed dead code. + * + * Revision 1.16 1996/06/03 23:28:26 jimz + * more bugfixes + * check in tree to sync for IPDS runs with current bugfixes + * there still may be a problem with threads in the script test + * getting I/Os stuck- not trivially reproducible (runs ~50 times + * in a row without getting stuck) + * + * Revision 1.15 1996/06/02 17:31:48 jimz + * Moved a lot of global stuff into array structure, where it belongs. + * Fixed up paritylogging, pss modules in this manner. Some general + * code cleanup. Removed lots of dead code, some dead files. + * + * Revision 1.14 1996/05/31 22:26:54 jimz + * fix a lot of mapping problems, memory allocation problems + * found some weird lock issues, fixed 'em + * more code cleanup + * + * Revision 1.13 1996/05/30 11:29:41 jimz + * Numerous bug fixes. Stripe lock release code disagreed with the taking code + * about when stripes should be locked (I made it consistent: no parity, no lock) + * There was a lot of extra serialization of I/Os which I've removed- a lot of + * it was to calculate values for the cache code, which is no longer with us. + * More types, function, macro cleanup. Added code to properly quiesce the array + * on shutdown. Made a lot of stuff array-specific which was (bogusly) general + * before. Fixed memory allocation, freeing bugs. 
+ * + * Revision 1.12 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.11 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.10 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.9 1995/12/06 15:04:55 root + * added copyright info + * + */ + +#ifndef _RF__RF_RECONSTRUCT_H_ +#define _RF__RF_RECONSTRUCT_H_ + +#include "rf_types.h" +#include <sys/time.h> +#include "rf_reconmap.h" +#include "rf_psstatus.h" + +/* reconstruction configuration information */ +struct RF_ReconConfig_s { + unsigned numFloatingReconBufs; /* number of floating recon bufs to use */ + RF_HeadSepLimit_t headSepLimit; /* how far apart the heads are allow to become, in parity stripes */ +}; + +/* a reconstruction buffer */ +struct RF_ReconBuffer_s { + RF_Raid_t *raidPtr; /* void * to avoid recursive includes */ + caddr_t buffer; /* points to the data */ + RF_StripeNum_t parityStripeID; /* the parity stripe that this data relates to */ + int which_ru; /* which reconstruction unit within the PSS */ + RF_SectorNum_t failedDiskSectorOffset;/* the offset into the failed disk */ + RF_RowCol_t row, col; /* which disk this buffer belongs to or is targeted at */ + RF_StripeCount_t count; /* counts the # of SUs installed so far */ + int priority; /* used to force hi priority recon */ + RF_RbufType_t type; /* FORCED or FLOATING */ + char *arrived; /* [x] = 1/0 if SU from disk x has/hasn't arrived */ + RF_ReconBuffer_t *next; /* used for buffer management */ + void *arg; /* generic field for general use */ + RF_RowCol_t spRow, spCol; /* spare disk to which this buf should be written */ + /* if dist sparing off, always identifies the replacement disk */ + RF_SectorNum_t spOffset; /* offset into the spare disk */ + /* if dist sparing off, identical to failedDiskSectorOffset */ + RF_ReconParityStripeStatus_t *pssPtr; /* debug- pss associated with issue-pending write */ +}; + +/* a reconstruction event descriptor. The event types currently are: + * RF_REVENT_READDONE -- a read operation has completed + * RF_REVENT_WRITEDONE -- a write operation has completed + * RF_REVENT_BUFREADY -- the buffer manager has produced a full buffer + * RF_REVENT_BLOCKCLEAR -- a reconstruction blockage has been cleared + * RF_REVENT_BUFCLEAR -- the buffer manager has released a process blocked on submission + * RF_REVENT_SKIP -- we need to skip the current RU and go on to the next one, typ. 
b/c we found recon forced + * RF_REVENT_FORCEDREADONE- a forced-reconstructoin read operation has completed + */ +typedef enum RF_Revent_e { + RF_REVENT_READDONE, + RF_REVENT_WRITEDONE, + RF_REVENT_BUFREADY, + RF_REVENT_BLOCKCLEAR, + RF_REVENT_BUFCLEAR, + RF_REVENT_HEADSEPCLEAR, + RF_REVENT_SKIP, + RF_REVENT_FORCEDREADDONE +} RF_Revent_t; + +struct RF_ReconEvent_s { + RF_Revent_t type; /* what kind of event has occurred */ + RF_RowCol_t col; /* row ID is implicit in the queue in which the event is placed */ + void *arg; /* a generic argument */ + RF_ReconEvent_t *next; +}; + +/* + * Reconstruction control information maintained per-disk + * (for surviving disks) + */ +struct RF_PerDiskReconCtrl_s { + RF_ReconCtrl_t *reconCtrl; + RF_RowCol_t row, col; /* to make this structure self-identifying */ + RF_StripeNum_t curPSID; /* the next parity stripe ID to check on this disk */ + RF_HeadSepLimit_t headSepCounter; /* counter used to control maximum head separation */ + RF_SectorNum_t diskOffset; /* the offset into the indicated disk of the current PU */ + RF_ReconUnitNum_t ru_count; /* this counts off the recon units within each parity unit */ + RF_ReconBuffer_t *rbuf; /* the recon buffer assigned to this disk */ +}; + +/* main reconstruction control structure */ +struct RF_ReconCtrl_s { + RF_RaidReconDesc_t *reconDesc; + RF_RowCol_t fcol; /* which column has failed */ + RF_PerDiskReconCtrl_t *perDiskInfo; /* information maintained per-disk */ + RF_ReconMap_t *reconMap; /* map of what has/has not been reconstructed */ + RF_RowCol_t spareRow; /* which of the spare disks we're using */ + RF_RowCol_t spareCol; + RF_StripeNum_t lastPSID; /* the ID of the last parity stripe we want reconstructed */ + int percentComplete; /* percentage completion of reconstruction */ + + /* reconstruction event queue */ + RF_ReconEvent_t *eventQueue; /* queue of pending reconstruction events */ + RF_DECLARE_MUTEX(eq_mutex) /* mutex for locking event queue */ + RF_DECLARE_COND(eq_cond) /* condition variable for signalling recon events */ + int eq_count; /* debug only */ + + /* reconstruction buffer management */ + RF_DECLARE_MUTEX(rb_mutex) /* mutex for messing around with recon buffers */ + RF_ReconBuffer_t *floatingRbufs; /* available floating reconstruction buffers */ + RF_ReconBuffer_t *committedRbufs; /* recon buffers that have been committed to some waiting disk */ + RF_ReconBuffer_t *fullBufferList; /* full buffers waiting to be written out */ + RF_ReconBuffer_t *priorityList; /* full buffers that have been elevated to higher priority */ + RF_CallbackDesc_t *bufferWaitList; /* disks that are currently blocked waiting for buffers */ + + /* parity stripe status table */ + RF_PSStatusHeader_t *pssTable; /* stores the reconstruction status of active parity stripes */ + + /* maximum-head separation control */ + RF_HeadSepLimit_t minHeadSepCounter; /* the minimum hs counter over all disks */ + RF_CallbackDesc_t *headSepCBList; /* list of callbacks to be done as minPSID advances */ + + /* performance monitoring */ + struct timeval starttime; /* recon start time */ + + void (*continueFunc)(void *); /* function to call when io returns*/ + void *continueArg; /* argument for Func */ +}; + +/* the default priority for reconstruction accesses */ +#define RF_IO_RECON_PRIORITY RF_IO_LOW_PRIORITY + +int rf_ConfigureReconstruction(RF_ShutdownList_t **listp); + +int rf_ReconstructFailedDisk(RF_Raid_t *raidPtr, RF_RowCol_t row, + RF_RowCol_t col); + +int rf_ReconstructFailedDiskBasic(RF_Raid_t *raidPtr, RF_RowCol_t row, + 
RF_RowCol_t col); + +int rf_ContinueReconstructFailedDisk(RF_RaidReconDesc_t *reconDesc); + +int rf_ForceOrBlockRecon(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap, + void (*cbFunc)(RF_Raid_t *,void *), void *cbArg); + +int rf_UnblockRecon(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap); + +int rf_RegisterReconDoneProc(RF_Raid_t *raidPtr, void (*proc)(RF_Raid_t *, void *), void *arg, + RF_ReconDoneProc_t **handlep); + +#endif /* !_RF__RF_RECONSTRUCT_H_ */ diff --git a/sys/dev/raidframe/rf_reconstub.c b/sys/dev/raidframe/rf_reconstub.c new file mode 100644 index 00000000000..2502462ea8b --- /dev/null +++ b/sys/dev/raidframe/rf_reconstub.c @@ -0,0 +1,88 @@ +/* $OpenBSD: rf_reconstub.c,v 1.1 1999/01/11 14:29:47 niklas Exp $ */ +/* $NetBSD: rf_reconstub.c,v 1.1 1998/11/13 04:20:34 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/************************************************************************** + * + * rf_reconstub.c -- stub routines used when you don't want reconstruction + * in some particular instantiation of the raidframe + * + * this file also contains stubs for some reconstruction-related + * routines that we don't want compiled into the kernel. + * + * The OSF/1 kernel configuration includes an option "raidframe_recon". If + * enabled, most of this file is ifdef'd out. 
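+ *
+ * The stubs below simply return success (or ENOTTY for an attempted
+ * reconstruction) so that callers still compile, link and run when the
+ * reconstruction code has been configured out.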
+ * + **************************************************************************/ + +/* : + * Log: rf_reconstub.c,v + * Revision 1.9 1996/05/24 01:59:45 jimz + * another checkpoint in code cleanup for release + * time to sync kernel tree + * + * Revision 1.8 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.7 1996/04/03 23:25:33 jimz + * make inclusion of raidframe_recon.h #ifdef KERNEL + * + * Revision 1.6 1995/12/06 15:06:54 root + * added copyright info + * + */ + +#ifdef KERNEL +#include <raidframe_recon.h> +#endif /* KERNEL */ +#include <sys/errno.h> + +#if RAIDFRAME_RECON == 0 + +int rf_ConfigureReconstruction() { return(0); } +int rf_ConfigureReconEvent() { return(0); } +int rf_ConfigurePSStatus() { return(0); } +int rf_ConfigureNWayXor() { return(0); } +int rf_ConfigureCopyback() { return(0); } +int rf_ShutdownCopyback() { return(0); } +int rf_ShutdownReconstruction() { return(0); } +int rf_ShutdownReconEvent() { return(0); } +int rf_ShutdownPSStatus() { return(0); } +int rf_ShutdownNWayXor() { return(0); } + +int rf_ForceOrBlockRecon() { return(0); } +int rf_UnblockRecon() { return(0); } +int rf_ReconstructFailedDisk() { return(ENOTTY); } +int rf_CheckRUReconstructed() { return(0); } + +void rf_start_cpu_monitor() {} +void rf_stop_cpu_monitor() {} +void rf_print_cpu_util() {} + +#endif /* RAIDFRAME_RECON == 0 */ diff --git a/sys/dev/raidframe/rf_reconutil.c b/sys/dev/raidframe/rf_reconutil.c new file mode 100644 index 00000000000..51267198a7e --- /dev/null +++ b/sys/dev/raidframe/rf_reconutil.c @@ -0,0 +1,408 @@ +/* $OpenBSD: rf_reconutil.c,v 1.1 1999/01/11 14:29:47 niklas Exp $ */ +/* $NetBSD: rf_reconutil.c,v 1.1 1998/11/13 04:20:34 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. 
+ */ + +/******************************************** + * rf_reconutil.c -- reconstruction utilities + ********************************************/ + +/* : + * Log: rf_reconutil.c,v + * Revision 1.32 1996/07/29 14:05:12 jimz + * fix numPUs/numRUs confusion (everything is now numRUs) + * clean up some commenting, return values + * + * Revision 1.31 1996/07/15 05:40:41 jimz + * some recon datastructure cleanup + * better handling of multiple failures + * added undocumented double-recon test + * + * Revision 1.30 1996/07/13 00:00:59 jimz + * sanitized generalized reconstruction architecture + * cleaned up head sep, rbuf problems + * + * Revision 1.29 1996/06/19 17:53:48 jimz + * move GetNumSparePUs, InstallSpareTable ops into layout switch + * + * Revision 1.28 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.27 1996/06/05 18:06:02 jimz + * Major code cleanup. The Great Renaming is now done. + * Better modularity. Better typing. Fixed a bunch of + * synchronization bugs. Made a lot of global stuff + * per-desc or per-array. Removed dead code. + * + * Revision 1.26 1996/06/03 23:28:26 jimz + * more bugfixes + * check in tree to sync for IPDS runs with current bugfixes + * there still may be a problem with threads in the script test + * getting I/Os stuck- not trivially reproducible (runs ~50 times + * in a row without getting stuck) + * + * Revision 1.25 1996/06/02 17:31:48 jimz + * Moved a lot of global stuff into array structure, where it belongs. + * Fixed up paritylogging, pss modules in this manner. Some general + * code cleanup. Removed lots of dead code, some dead files. + * + * Revision 1.24 1996/05/31 22:26:54 jimz + * fix a lot of mapping problems, memory allocation problems + * found some weird lock issues, fixed 'em + * more code cleanup + * + * Revision 1.23 1996/05/30 23:22:16 jimz + * bugfixes of serialization, timing problems + * more cleanup + * + * Revision 1.22 1996/05/30 11:29:41 jimz + * Numerous bug fixes. Stripe lock release code disagreed with the taking code + * about when stripes should be locked (I made it consistent: no parity, no lock) + * There was a lot of extra serialization of I/Os which I've removed- a lot of + * it was to calculate values for the cache code, which is no longer with us. + * More types, function, macro cleanup. Added code to properly quiesce the array + * on shutdown. Made a lot of stuff array-specific which was (bogusly) general + * before. Fixed memory allocation, freeing bugs. 
+ * + * Revision 1.21 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.20 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.19 1996/05/20 16:14:55 jimz + * switch to rf_{mutex,cond}_{init,destroy} + * + * Revision 1.18 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.17 1995/12/12 18:10:06 jimz + * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT + * fix 80-column brain damage in comments + * + * Revision 1.16 1995/12/06 15:05:31 root + * added copyright info + * + */ + +#include "rf_types.h" +#include "rf_raid.h" +#include "rf_desc.h" +#include "rf_reconutil.h" +#include "rf_reconbuffer.h" +#include "rf_general.h" +#include "rf_decluster.h" +#include "rf_raid5_rotatedspare.h" +#include "rf_interdecluster.h" +#include "rf_chaindecluster.h" + +/******************************************************************* + * allocates/frees the reconstruction control information structures + *******************************************************************/ +RF_ReconCtrl_t *rf_MakeReconControl(reconDesc, frow, fcol, srow, scol) + RF_RaidReconDesc_t *reconDesc; + RF_RowCol_t frow; /* failed row and column */ + RF_RowCol_t fcol; + RF_RowCol_t srow; /* identifies which spare we're using */ + RF_RowCol_t scol; +{ + RF_Raid_t *raidPtr = reconDesc->raidPtr; + RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; + RF_ReconUnitCount_t RUsPerPU = layoutPtr->SUsPerPU / layoutPtr->SUsPerRU; + RF_ReconUnitCount_t numSpareRUs; + RF_ReconCtrl_t *reconCtrlPtr; + RF_ReconBuffer_t *rbuf; + RF_LayoutSW_t *lp; + int retcode, rc; + RF_RowCol_t i; + + lp = raidPtr->Layout.map; + + /* make and zero the global reconstruction structure and the per-disk structure */ + RF_Calloc(reconCtrlPtr, 1, sizeof(RF_ReconCtrl_t), (RF_ReconCtrl_t *)); + RF_Calloc(reconCtrlPtr->perDiskInfo, raidPtr->numCol, sizeof(RF_PerDiskReconCtrl_t), (RF_PerDiskReconCtrl_t *)); /* this zeros it */ + reconCtrlPtr->reconDesc = reconDesc; + reconCtrlPtr->fcol = fcol; + reconCtrlPtr->spareRow = srow; + reconCtrlPtr->spareCol = scol; + reconCtrlPtr->lastPSID = layoutPtr->numStripe/layoutPtr->SUsPerPU; + reconCtrlPtr->percentComplete = 0; + + /* initialize each per-disk recon information structure */ + for (i=0; i<raidPtr->numCol; i++) { + reconCtrlPtr->perDiskInfo[i].reconCtrl = reconCtrlPtr; + reconCtrlPtr->perDiskInfo[i].row = frow; + reconCtrlPtr->perDiskInfo[i].col = i; + reconCtrlPtr->perDiskInfo[i].curPSID = -1; /* make it appear as if we just finished an RU */ + reconCtrlPtr->perDiskInfo[i].ru_count = RUsPerPU-1; + } + + /* Get the number of spare units per disk and the sparemap in case spare is distributed */ + + if (lp->GetNumSpareRUs) { + numSpareRUs = lp->GetNumSpareRUs(raidPtr); + } + else { + numSpareRUs = 0; + } + + /* + * Not all distributed sparing archs need dynamic mappings + */ + if (lp->InstallSpareTable) { + retcode = rf_InstallSpareTable(raidPtr, frow, fcol); + if (retcode) { + RF_PANIC(); /* XXX fix this*/ + } + } + + /* make the reconstruction map */ + reconCtrlPtr->reconMap = rf_MakeReconMap(raidPtr, (int) (layoutPtr->SUsPerRU * layoutPtr->sectorsPerStripeUnit), + raidPtr->sectorsPerDisk, numSpareRUs); + + /* make the per-disk reconstruction buffers */ + for (i=0; i<raidPtr->numCol; i++) { + reconCtrlPtr->perDiskInfo[i].rbuf = 
(i==fcol) ? NULL : rf_MakeReconBuffer(raidPtr, frow, i, RF_RBUF_TYPE_EXCLUSIVE); + } + + /* initialize the event queue */ + rc = rf_mutex_init(&reconCtrlPtr->eq_mutex); + if (rc) { + /* XXX deallocate, cleanup */ + RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__, + __LINE__, rc); + return(NULL); + } + rc = rf_cond_init(&reconCtrlPtr->eq_cond); + if (rc) { + /* XXX deallocate, cleanup */ + RF_ERRORMSG3("Unable to init cond file %s line %d rc=%d\n", __FILE__, + __LINE__, rc); + return(NULL); + } + reconCtrlPtr->eventQueue = NULL; + reconCtrlPtr->eq_count = 0; + + /* make the floating recon buffers and append them to the free list */ + rc = rf_mutex_init(&reconCtrlPtr->rb_mutex); + if (rc) { + /* XXX deallocate, cleanup */ + RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__, + __LINE__, rc); + return(NULL); + } + reconCtrlPtr->fullBufferList= NULL; + reconCtrlPtr->priorityList = NULL; + reconCtrlPtr->floatingRbufs = NULL; + reconCtrlPtr->committedRbufs= NULL; + for (i=0; i<raidPtr->numFloatingReconBufs; i++) { + rbuf = rf_MakeReconBuffer(raidPtr, frow, fcol, RF_RBUF_TYPE_FLOATING); + rbuf->next = reconCtrlPtr->floatingRbufs; + reconCtrlPtr->floatingRbufs = rbuf; + } + + /* create the parity stripe status table */ + reconCtrlPtr->pssTable = rf_MakeParityStripeStatusTable(raidPtr); + + /* set the initial min head sep counter val */ + reconCtrlPtr->minHeadSepCounter = 0; + + return(reconCtrlPtr); +} + +void rf_FreeReconControl(raidPtr, row) + RF_Raid_t *raidPtr; + RF_RowCol_t row; +{ + RF_ReconCtrl_t *reconCtrlPtr = raidPtr->reconControl[row]; + RF_ReconBuffer_t *t; + RF_ReconUnitNum_t i; + + RF_ASSERT(reconCtrlPtr); + for (i=0; i<raidPtr->numCol; i++) if (reconCtrlPtr->perDiskInfo[i].rbuf) rf_FreeReconBuffer(reconCtrlPtr->perDiskInfo[i].rbuf); + for (i=0; i<raidPtr->numFloatingReconBufs; i++) { + t = reconCtrlPtr->floatingRbufs; + RF_ASSERT(t); + reconCtrlPtr->floatingRbufs = t->next; + rf_FreeReconBuffer(t); + } + rf_mutex_destroy(&reconCtrlPtr->rb_mutex); + rf_mutex_destroy(&reconCtrlPtr->eq_mutex); + rf_cond_destroy(&reconCtrlPtr->eq_cond); + rf_FreeReconMap(reconCtrlPtr->reconMap); + rf_FreeParityStripeStatusTable(raidPtr, reconCtrlPtr->pssTable); + RF_Free(reconCtrlPtr->perDiskInfo, raidPtr->numCol * sizeof(RF_PerDiskReconCtrl_t)); + RF_Free(reconCtrlPtr, sizeof(*reconCtrlPtr)); +} + + +/****************************************************************************** + * computes the default head separation limit + *****************************************************************************/ +RF_HeadSepLimit_t rf_GetDefaultHeadSepLimit(raidPtr) + RF_Raid_t *raidPtr; +{ + RF_HeadSepLimit_t hsl; + RF_LayoutSW_t *lp; + + lp = raidPtr->Layout.map; + if (lp->GetDefaultHeadSepLimit == NULL) + return(-1); + hsl = lp->GetDefaultHeadSepLimit(raidPtr); + return(hsl); +} + + +/****************************************************************************** + * computes the default number of floating recon buffers + *****************************************************************************/ +int rf_GetDefaultNumFloatingReconBuffers(raidPtr) + RF_Raid_t *raidPtr; +{ + RF_LayoutSW_t *lp; + int nrb; + + lp = raidPtr->Layout.map; + if (lp->GetDefaultNumFloatingReconBuffers == NULL) + return(3 * raidPtr->numCol); + nrb = lp->GetDefaultNumFloatingReconBuffers(raidPtr); + return(nrb); +} + + +/****************************************************************************** + * creates and initializes a reconstruction buffer + 
*****************************************************************************/ +RF_ReconBuffer_t *rf_MakeReconBuffer( + RF_Raid_t *raidPtr, + RF_RowCol_t row, + RF_RowCol_t col, + RF_RbufType_t type) +{ + RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; + RF_ReconBuffer_t *t; + u_int recon_buffer_size = rf_RaidAddressToByte(raidPtr, layoutPtr->SUsPerRU * layoutPtr->sectorsPerStripeUnit); + + RF_Malloc(t, sizeof(RF_ReconBuffer_t), (RF_ReconBuffer_t *)); + RF_Malloc(t->buffer, recon_buffer_size, (caddr_t)); + RF_Malloc(t->arrived, raidPtr->numCol * sizeof(char), (char *)); + t->raidPtr = raidPtr; + t->row = row; t->col = col; + t->priority = RF_IO_RECON_PRIORITY; + t->type = type; + t->pssPtr = NULL; + t->next = NULL; + return(t); +} + +/****************************************************************************** + * frees a reconstruction buffer + *****************************************************************************/ +void rf_FreeReconBuffer(rbuf) + RF_ReconBuffer_t *rbuf; +{ + RF_Raid_t *raidPtr = rbuf->raidPtr; + u_int recon_buffer_size = rf_RaidAddressToByte(raidPtr, raidPtr->Layout.SUsPerRU * raidPtr->Layout.sectorsPerStripeUnit); + + RF_Free(rbuf->arrived, raidPtr->numCol * sizeof(char)); + RF_Free(rbuf->buffer, recon_buffer_size); + RF_Free(rbuf, sizeof(*rbuf)); +} + + +/****************************************************************************** + * debug only: sanity check the number of floating recon bufs in use + *****************************************************************************/ +void rf_CheckFloatingRbufCount(raidPtr, dolock) + RF_Raid_t *raidPtr; + int dolock; +{ + RF_ReconParityStripeStatus_t *p; + RF_PSStatusHeader_t *pssTable; + RF_ReconBuffer_t *rbuf; + int i, j, sum = 0; + RF_RowCol_t frow=0; + + for (i=0; i<raidPtr->numRow; i++) + if (raidPtr->reconControl[i]) { + frow = i; + break; + } + RF_ASSERT(frow >= 0); + + if (dolock) + RF_LOCK_MUTEX(raidPtr->reconControl[frow]->rb_mutex); + pssTable = raidPtr->reconControl[frow]->pssTable; + + for (i=0; i<raidPtr->pssTableSize; i++) { + RF_LOCK_MUTEX(pssTable[i].mutex); + for (p = pssTable[i].chain; p; p=p->next) { + rbuf = (RF_ReconBuffer_t *) p->rbuf; + if (rbuf && rbuf->type == RF_RBUF_TYPE_FLOATING) + sum++; + + rbuf = (RF_ReconBuffer_t *) p->writeRbuf; + if (rbuf && rbuf->type == RF_RBUF_TYPE_FLOATING) + sum++; + + for (j=0; j<p->xorBufCount; j++) { + rbuf = (RF_ReconBuffer_t *) p->rbufsForXor[j]; + RF_ASSERT(rbuf); + if (rbuf->type == RF_RBUF_TYPE_FLOATING) + sum++; + } + } + RF_UNLOCK_MUTEX(pssTable[i].mutex); + } + + for (rbuf = raidPtr->reconControl[frow]->floatingRbufs; rbuf; rbuf = rbuf->next) { + if (rbuf->type == RF_RBUF_TYPE_FLOATING) + sum++; + } + for (rbuf = raidPtr->reconControl[frow]->committedRbufs; rbuf; rbuf = rbuf->next) { + if (rbuf->type == RF_RBUF_TYPE_FLOATING) + sum++; + } + for (rbuf = raidPtr->reconControl[frow]->fullBufferList; rbuf; rbuf = rbuf->next) { + if (rbuf->type == RF_RBUF_TYPE_FLOATING) + sum++; + } + for (rbuf = raidPtr->reconControl[frow]->priorityList; rbuf; rbuf = rbuf->next) { + if (rbuf->type == RF_RBUF_TYPE_FLOATING) + sum++; + } + + RF_ASSERT(sum == raidPtr->numFloatingReconBufs); + + if (dolock) + RF_UNLOCK_MUTEX(raidPtr->reconControl[frow]->rb_mutex); +} diff --git a/sys/dev/raidframe/rf_reconutil.h b/sys/dev/raidframe/rf_reconutil.h new file mode 100644 index 00000000000..f4ea1c6f5f7 --- /dev/null +++ b/sys/dev/raidframe/rf_reconutil.h @@ -0,0 +1,96 @@ +/* $OpenBSD: rf_reconutil.h,v 1.1 1999/01/11 14:29:48 niklas Exp $ */ +/* $NetBSD: rf_reconutil.h,v 1.1 
1998/11/13 04:20:34 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/************************************************************ + * rf_reconutil.h -- header file for reconstruction utilities + ************************************************************/ + +/* : + * Log: rf_reconutil.h,v + * Revision 1.10 1996/07/15 05:40:41 jimz + * some recon datastructure cleanup + * better handling of multiple failures + * added undocumented double-recon test + * + * Revision 1.9 1996/07/13 00:00:59 jimz + * sanitized generalized reconstruction architecture + * cleaned up head sep, rbuf problems + * + * Revision 1.8 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.7 1996/06/05 18:06:02 jimz + * Major code cleanup. The Great Renaming is now done. + * Better modularity. Better typing. Fixed a bunch of + * synchronization bugs. Made a lot of global stuff + * per-desc or per-array. Removed dead code. 
+ * + * Revision 1.6 1996/06/03 23:28:26 jimz + * more bugfixes + * check in tree to sync for IPDS runs with current bugfixes + * there still may be a problem with threads in the script test + * getting I/Os stuck- not trivially reproducible (runs ~50 times + * in a row without getting stuck) + * + * Revision 1.5 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.4 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.3 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.2 1995/12/06 15:06:47 root + * added copyright info + * + */ + +#ifndef _RF__RF_RECONUTIL_H_ +#define _RF__RF_RECONUTIL_H_ + +#include "rf_types.h" +#include "rf_reconstruct.h" + +RF_ReconCtrl_t *rf_MakeReconControl(RF_RaidReconDesc_t *reconDesc, + RF_RowCol_t frow, RF_RowCol_t fcol, RF_RowCol_t srow, RF_RowCol_t scol); +void rf_FreeReconControl(RF_Raid_t *raidPtr, RF_RowCol_t row); +RF_HeadSepLimit_t rf_GetDefaultHeadSepLimit(RF_Raid_t *raidPtr); +int rf_GetDefaultNumFloatingReconBuffers(RF_Raid_t *raidPtr); +RF_ReconBuffer_t *rf_MakeReconBuffer(RF_Raid_t *raidPtr, RF_RowCol_t row, + RF_RowCol_t col, RF_RbufType_t type); +void rf_FreeReconBuffer(RF_ReconBuffer_t *rbuf); +void rf_CheckFloatingRbufCount(RF_Raid_t *raidPtr, int dolock); + +#endif /* !_RF__RF_RECONUTIL_H_ */ diff --git a/sys/dev/raidframe/rf_revent.c b/sys/dev/raidframe/rf_revent.c new file mode 100644 index 00000000000..c4236962b64 --- /dev/null +++ b/sys/dev/raidframe/rf_revent.c @@ -0,0 +1,306 @@ +/* $OpenBSD: rf_revent.c,v 1.1 1999/01/11 14:29:48 niklas Exp $ */ +/* $NetBSD: rf_revent.c,v 1.1 1998/11/13 04:20:34 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ +/* + * revent.c -- reconstruction event handling code + */ +/* + * : + * Log: rf_revent.c,v + * Revision 1.22 1996/08/11 00:41:11 jimz + * extern hz only for kernel + * + * Revision 1.21 1996/07/15 05:40:41 jimz + * some recon datastructure cleanup + * better handling of multiple failures + * added undocumented double-recon test + * + * Revision 1.20 1996/06/17 03:18:04 jimz + * include shutdown.h for macroized ShutdownCreate + * + * Revision 1.19 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). 
+ * + * Revision 1.18 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.17 1996/06/05 18:06:02 jimz + * Major code cleanup. The Great Renaming is now done. + * Better modularity. Better typing. Fixed a bunch of + * synchronization bugs. Made a lot of global stuff + * per-desc or per-array. Removed dead code. + * + * Revision 1.16 1996/06/03 23:28:26 jimz + * more bugfixes + * check in tree to sync for IPDS runs with current bugfixes + * there still may be a problem with threads in the script test + * getting I/Os stuck- not trivially reproducible (runs ~50 times + * in a row without getting stuck) + * + * Revision 1.15 1996/05/30 12:59:18 jimz + * make etimer happier, more portable + * + * Revision 1.14 1996/05/20 16:13:40 jimz + * switch to rf_{mutex,cond}_{init,destroy} + * use RF_FREELIST for revents + * + * Revision 1.13 1996/05/18 20:09:47 jimz + * bit of cleanup to compile cleanly in kernel, once again + * + * Revision 1.12 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + */ + +#ifdef _KERNEL +#define KERNEL +#endif + +#include <sys/errno.h> + +#include "rf_raid.h" +#include "rf_revent.h" +#include "rf_etimer.h" +#include "rf_general.h" +#include "rf_freelist.h" +#include "rf_desc.h" +#include "rf_shutdown.h" + +static RF_FreeList_t *rf_revent_freelist; +#define RF_MAX_FREE_REVENT 128 +#define RF_REVENT_INC 8 +#define RF_REVENT_INITIAL 8 + + +#ifdef KERNEL + +#include <sys/proc.h> + +extern int hz; + +#if !defined(__NetBSD__) && !defined(__OpenBSD__) +#define DO_WAIT(_rc) mpsleep(&(_rc)->eventQueue, PZERO, "raidframe eventq", 0, \ + (void *) simple_lock_addr((_rc)->eq_mutex), MS_LOCK_SIMPLE) +#else +#define DO_WAIT(_rc) tsleep(&(_rc)->eventQueue, PRIBIO | PCATCH, "raidframe eventq", 0) +#endif + +#define DO_SIGNAL(_rc) wakeup(&(_rc)->eventQueue) + +#else /* KERNEL */ + +#define DO_WAIT(_rc) RF_WAIT_COND((_rc)->eq_cond, (_rc)->eq_mutex) +#define DO_SIGNAL(_rc) RF_SIGNAL_COND((_rc)->eq_cond) + +#endif /* KERNEL */ + +static void rf_ShutdownReconEvent(void *); + +static RF_ReconEvent_t *GetReconEventDesc(RF_RowCol_t row, RF_RowCol_t col, + void *arg, RF_Revent_t type); +RF_ReconEvent_t *rf_GetNextReconEvent(RF_RaidReconDesc_t *, + RF_RowCol_t, void (*continueFunc)(void *), + void *); + +static void rf_ShutdownReconEvent(ignored) + void *ignored; +{ + RF_FREELIST_DESTROY(rf_revent_freelist,next,(RF_ReconEvent_t *)); +} + +int rf_ConfigureReconEvent(listp) + RF_ShutdownList_t **listp; +{ + int rc; + + RF_FREELIST_CREATE(rf_revent_freelist, RF_MAX_FREE_REVENT, + RF_REVENT_INC, sizeof(RF_ReconEvent_t)); + if (rf_revent_freelist == NULL) + return(ENOMEM); + rc = rf_ShutdownCreate(listp, rf_ShutdownReconEvent, NULL); + if (rc) { + RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n", __FILE__, + __LINE__, rc); + rf_ShutdownReconEvent(NULL); + return(rc); + } + RF_FREELIST_PRIME(rf_revent_freelist, RF_REVENT_INITIAL,next, + (RF_ReconEvent_t *)); + return(0); +} + +/* returns the next reconstruction event, blocking the calling thread until + * one becomes available + */ + +/* will now return null if it is blocked or will return an event if it is not */ + +RF_ReconEvent_t *rf_GetNextReconEvent(reconDesc, row, continueFunc, continueArg) + RF_RaidReconDesc_t *reconDesc; + RF_RowCol_t row; + void (*continueFunc)(void *); + void *continueArg; +{ + RF_Raid_t *raidPtr = 
reconDesc->raidPtr; + RF_ReconCtrl_t *rctrl = raidPtr->reconControl[row]; + RF_ReconEvent_t *event; + + RF_ASSERT( row >= 0 && row <= raidPtr->numRow ); + RF_LOCK_MUTEX(rctrl->eq_mutex); + RF_ASSERT( (rctrl->eventQueue==NULL) == (rctrl->eq_count == 0)); /* q null and count==0 must be equivalent conditions */ + + + rctrl->continueFunc=continueFunc; + rctrl->continueArg=continueArg; + +#ifdef SIMULATE + if (!rctrl->eventQueue) { + RF_UNLOCK_MUTEX(rctrl->eq_mutex); + return (NULL); + } +#else /* SIMULATE */ + +#ifdef KERNEL + +/* mpsleep timeout value: secs = timo_val/hz. 'ticks' here is defined as cycle-counter ticks, not softclock ticks */ +#define MAX_RECON_EXEC_TICKS 15000000 /* 150 Mhz => this many ticks in 100 ms */ +#define RECON_DELAY_MS 25 +#define RECON_TIMO ((RECON_DELAY_MS * hz) / 1000) + + /* we are not pre-emptible in the kernel, but we don't want to run forever. If we run w/o blocking + * for more than MAX_RECON_EXEC_TICKS ticks of the cycle counter, delay for RECON_DELAY before continuing. + * this may murder us with context switches, so we may need to increase both the MAX...TICKS and the RECON_DELAY_MS. + */ + if (reconDesc->reconExecTimerRunning) { + int status; + + RF_ETIMER_STOP(reconDesc->recon_exec_timer); + RF_ETIMER_EVAL(reconDesc->recon_exec_timer); + reconDesc->reconExecTicks += RF_ETIMER_VAL_TICKS(reconDesc->recon_exec_timer); + if (reconDesc->reconExecTicks > reconDesc->maxReconExecTicks) + reconDesc->maxReconExecTicks = reconDesc->reconExecTicks; + if (reconDesc->reconExecTicks >= MAX_RECON_EXEC_TICKS) { + /* we've been running too long. delay for RECON_DELAY_MS */ +#if RF_RECON_STATS > 0 + reconDesc->numReconExecDelays++; +#endif /* RF_RECON_STATS > 0 */ +#if !defined(__NetBSD__) && !defined(__OpenBSD__) + status = mpsleep(&reconDesc->reconExecTicks, PZERO, "recon delay", RECON_TIMO, (void *) simple_lock_addr(rctrl->eq_mutex), MS_LOCK_SIMPLE); +#else + status = tsleep(&reconDesc->reconExecTicks, PRIBIO | PCATCH, "recon delay", RECON_TIMO ); +#endif + RF_ASSERT(status == EWOULDBLOCK); + reconDesc->reconExecTicks = 0; + } + } + +#endif /* KERNEL */ + + while (!rctrl->eventQueue) { +#if RF_RECON_STATS > 0 + reconDesc->numReconEventWaits++; +#endif /* RF_RECON_STATS > 0 */ + DO_WAIT(rctrl); +#ifdef KERNEL + reconDesc->reconExecTicks = 0; /* we've just waited */ +#endif /* KERNEL */ + } + +#endif /* SIMULATE */ + +#ifdef KERNEL + reconDesc->reconExecTimerRunning = 1; + RF_ETIMER_START(reconDesc->recon_exec_timer); +#endif /* KERNEL */ + + event = rctrl->eventQueue; + rctrl->eventQueue = event->next; + event->next = NULL; + rctrl->eq_count--; + RF_ASSERT( (rctrl->eventQueue==NULL) == (rctrl->eq_count == 0)); /* q null and count==0 must be equivalent conditions */ + RF_UNLOCK_MUTEX(rctrl->eq_mutex); + return(event); +} + +/* enqueues a reconstruction event on the indicated queue */ +void rf_CauseReconEvent(raidPtr, row, col, arg, type) + RF_Raid_t *raidPtr; + RF_RowCol_t row; + RF_RowCol_t col; + void *arg; + RF_Revent_t type; +{ + RF_ReconCtrl_t *rctrl = raidPtr->reconControl[row]; + RF_ReconEvent_t *event = GetReconEventDesc(row, col, arg, type); + + if (type == RF_REVENT_BUFCLEAR) { + RF_ASSERT(col != rctrl->fcol); + } + + RF_ASSERT( row >= 0 && row <= raidPtr->numRow && col >=0 && col <= raidPtr->numCol ); + RF_LOCK_MUTEX(rctrl->eq_mutex); + RF_ASSERT( (rctrl->eventQueue==NULL) == (rctrl->eq_count == 0)); /* q null and count==0 must be equivalent conditions */ + event->next = rctrl->eventQueue; + rctrl->eventQueue = event; + rctrl->eq_count++; + 
RF_UNLOCK_MUTEX(rctrl->eq_mutex); + +#ifndef SIMULATE + DO_SIGNAL(rctrl); +#else /* !SIMULATE */ + (rctrl->continueFunc)(rctrl->continueArg); +#endif /* !SIMULATE */ +} + +/* allocates and initializes a recon event descriptor */ +static RF_ReconEvent_t *GetReconEventDesc(row, col, arg, type) + RF_RowCol_t row; + RF_RowCol_t col; + void *arg; + RF_Revent_t type; +{ + RF_ReconEvent_t *t; + + RF_FREELIST_GET(rf_revent_freelist,t,next,(RF_ReconEvent_t *)); + if (t == NULL) + return(NULL); + t->col = col; + t->arg = arg; + t->type = type; + return(t); +} + +void rf_FreeReconEventDesc(event) + RF_ReconEvent_t *event; +{ + RF_FREELIST_FREE(rf_revent_freelist,event,next); +} diff --git a/sys/dev/raidframe/rf_revent.h b/sys/dev/raidframe/rf_revent.h new file mode 100644 index 00000000000..7029a8ef74d --- /dev/null +++ b/sys/dev/raidframe/rf_revent.h @@ -0,0 +1,82 @@ +/* $OpenBSD: rf_revent.h,v 1.1 1999/01/11 14:29:48 niklas Exp $ */ +/* $NetBSD: rf_revent.h,v 1.1 1998/11/13 04:20:34 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/******************************************************************* + * + * rf_revent.h -- header file for reconstruction event handling code + * + *******************************************************************/ + +/* : + * Log: rf_revent.h,v + * Revision 1.7 1996/07/15 05:40:41 jimz + * some recon datastructure cleanup + * better handling of multiple failures + * added undocumented double-recon test + * + * Revision 1.6 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). + * + * Revision 1.5 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.4 1996/06/05 18:06:02 jimz + * Major code cleanup. The Great Renaming is now done. + * Better modularity. Better typing. Fixed a bunch of + * synchronization bugs. Made a lot of global stuff + * per-desc or per-array. Removed dead code. 
+ * + * Revision 1.3 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.2 1995/12/06 15:04:20 root + * added copyright info + * + */ + +#ifndef _RF__RF_REVENT_H_ +#define _RF__RF_REVENT_H_ + +#include "rf_types.h" + +int rf_ConfigureReconEvent(RF_ShutdownList_t **listp); + +RF_ReconEvent_t *rf_GetNextReconEvent(RF_RaidReconDesc_t *reconDesc, + RF_RowCol_t row, void (*continueFunc)(void *), void *continueArg); + +void rf_CauseReconEvent(RF_Raid_t *raidPtr, RF_RowCol_t row, RF_RowCol_t col, + void *arg, RF_Revent_t type); + +void rf_FreeReconEventDesc(RF_ReconEvent_t *event); + +#endif /* !_RF__RF_REVENT_H_ */ diff --git a/sys/dev/raidframe/rf_rst.h b/sys/dev/raidframe/rf_rst.h new file mode 100644 index 00000000000..06e66275cd2 --- /dev/null +++ b/sys/dev/raidframe/rf_rst.h @@ -0,0 +1,78 @@ +/* $OpenBSD: rf_rst.h,v 1.1 1999/01/11 14:29:49 niklas Exp $ */ +/* $NetBSD: rf_rst.h,v 1.1 1998/11/13 04:20:34 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. 
+ */ + +/* rf_rst.h - defines raidSim trace entry */ + +/* : + * Log: rf_rst.h,v + * Revision 1.7 1996/07/18 22:57:14 jimz + * port simulator to AIX + * + * Revision 1.6 1996/06/03 23:28:26 jimz + * more bugfixes + * check in tree to sync for IPDS runs with current bugfixes + * there still may be a problem with threads in the script test + * getting I/Os stuck- not trivially reproducible (runs ~50 times + * in a row without getting stuck) + * + * Revision 1.5 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.4 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.3 1995/12/06 15:03:15 root + * added copyright info + * + */ + +#ifndef _RF__RF_RST_H_ +#define _RF__RF_RST_H_ + +#include "rf_types.h" + +typedef struct RF_ScriptTraceEntry_s { + RF_int32 blkno; + RF_int32 size; + double delay; + RF_int16 pid; + RF_int8 op; + RF_int8 async_flag; +} RF_ScriptTraceEntry_t; + +typedef struct RF_ScriptTraceEntryList_s RF_ScriptTraceEntryList_t; +struct RF_ScriptTraceEntryList_s { + RF_ScriptTraceEntry_t entry; + RF_ScriptTraceEntryList_t *next; +}; + +#endif /* !_RF__RF_RST_H_ */ diff --git a/sys/dev/raidframe/rf_shutdown.c b/sys/dev/raidframe/rf_shutdown.c new file mode 100644 index 00000000000..3e0dfc96a37 --- /dev/null +++ b/sys/dev/raidframe/rf_shutdown.c @@ -0,0 +1,114 @@ +/* $OpenBSD: rf_shutdown.c,v 1.1 1999/01/11 14:29:49 niklas Exp $ */ +/* $NetBSD: rf_shutdown.c,v 1.1 1998/11/13 04:20:34 oster Exp $ */ +/* + * rf_shutdown.c + */ +/* + * Copyright (c) 1996 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Jim Zelenka + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ +/* + * Maintain lists of cleanup functions. Also, mechanisms for coordinating + * thread startup and shutdown. + */ + +#include "rf_types.h" +#include "rf_threadstuff.h" +#include "rf_shutdown.h" +#include "rf_debugMem.h" +#include "rf_freelist.h" +#include "rf_threadid.h" + +static void rf_FreeShutdownEnt(RF_ShutdownList_t *ent) +{ +#ifdef KERNEL + FREE(ent, M_DEVBUF); +#else /* KERNEL */ + free(ent); +#endif /* KERNEL */ +} + +int _rf_ShutdownCreate( + RF_ShutdownList_t **listp, + void (*cleanup)(void *arg), + void *arg, + char *file, + int line) +{ + RF_ShutdownList_t *ent; + + /* + * Have to directly allocate memory here, since we start up before + * and shutdown after RAIDframe internal allocation system. 
+ */ +#ifdef KERNEL + ent = (RF_ShutdownList_t *)malloc( sizeof(RF_ShutdownList_t), M_DEVBUF, M_WAITOK); +#if 0 + MALLOC(ent, RF_ShutdownList_t *, sizeof(RF_ShutdownList_t), M_DEVBUF, M_WAITOK); +#endif +#else /* KERNEL */ + ent = (RF_ShutdownList_t *)malloc(sizeof(RF_ShutdownList_t)); +#endif /* KERNEL */ + if (ent == NULL) + return(ENOMEM); + ent->cleanup = cleanup; + ent->arg = arg; + ent->file = file; + ent->line = line; + ent->next = *listp; + *listp = ent; + return(0); +} + +int rf_ShutdownList(RF_ShutdownList_t **list) +{ + RF_ShutdownList_t *r, *next; + char *file; + int line; + + for(r=*list;r;r=next) { + next = r->next; + file = r->file; + line = r->line; + + if (rf_shutdownDebug) { + int tid; + rf_get_threadid(tid); + printf("[%d] call shutdown, created %s:%d\n", tid, file, line); + } + + r->cleanup(r->arg); + + if (rf_shutdownDebug) { + int tid; + rf_get_threadid(tid); + printf("[%d] completed shutdown, created %s:%d\n", tid, file, line); + } + + rf_FreeShutdownEnt(r); + } + *list = NULL; + return(0); +} diff --git a/sys/dev/raidframe/rf_shutdown.h b/sys/dev/raidframe/rf_shutdown.h new file mode 100644 index 00000000000..bddfe7f9c0d --- /dev/null +++ b/sys/dev/raidframe/rf_shutdown.h @@ -0,0 +1,68 @@ +/* $OpenBSD: rf_shutdown.h,v 1.1 1999/01/11 14:29:49 niklas Exp $ */ +/* $NetBSD: rf_shutdown.h,v 1.1 1998/11/13 04:20:34 oster Exp $ */ +/* + * rf_shutdown.h + */ +/* + * Copyright (c) 1996 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Jim Zelenka + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ +/* + * Maintain lists of cleanup functions. Also, mechanisms for coordinating + * thread startup and shutdown. + */ + +#ifndef _RF__RF_SHUTDOWN_H_ +#define _RF__RF_SHUTDOWN_H_ + +#include "rf_types.h" +#include "rf_threadstuff.h" + +/* + * Important note: the shutdown list is run like a stack, new + * entries pushed on top. Therefore, the most recently added + * entry (last started) is the first removed (stopped). This + * should handle system-dependencies pretty nicely- if a system + * is there when you start another, it'll be there when you + * shut down another. Hopefully, this subsystem will remove + * more complexity than it introduces. 
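+ *
+ * Illustrative sketch of the intended LIFO ordering (cleanupA and
+ * cleanupB are hypothetical cleanup functions, not part of RAIDframe):
+ *
+ *   RF_ShutdownList_t *list = NULL;
+ *   rf_ShutdownCreate(&list, cleanupA, NULL);   <- subsystem A started first
+ *   rf_ShutdownCreate(&list, cleanupB, NULL);   <- subsystem B started second
+ *   rf_ShutdownList(&list);                     <- runs cleanupB, then cleanupA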
+ */ + +struct RF_ShutdownList_s { + void (*cleanup)(void *arg); + void *arg; + char *file; + int line; + RF_ShutdownList_t *next; +}; + +#define rf_ShutdownCreate(_listp_,_func_,_arg_) \ + _rf_ShutdownCreate(_listp_,_func_,_arg_,__FILE__,__LINE__) + +int _rf_ShutdownCreate(RF_ShutdownList_t **listp, void (*cleanup)(void *arg), + void *arg, char *file, int line); +int rf_ShutdownList(RF_ShutdownList_t **listp); + +#endif /* !_RF__RF_SHUTDOWN_H_ */ diff --git a/sys/dev/raidframe/rf_sstf.c b/sys/dev/raidframe/rf_sstf.c new file mode 100644 index 00000000000..21d97eef046 --- /dev/null +++ b/sys/dev/raidframe/rf_sstf.c @@ -0,0 +1,717 @@ +/* $OpenBSD: rf_sstf.c,v 1.1 1999/01/11 14:29:50 niklas Exp $ */ +/* $NetBSD: rf_sstf.c,v 1.1 1998/11/13 04:20:34 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Jim Zelenka + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. 
+ */ + +/******************************************************************************* + * + * sstf.c -- prioritized shortest seek time first disk queueing code + * + ******************************************************************************/ + +/* + * : + * Log: rf_sstf.c,v + * Revision 1.7 1996/06/19 14:09:56 jimz + * SstfPeek wasn't calling closest_to_arm() properly- would bogart + * low priority I/Os + * + * Revision 1.6 1996/06/18 20:53:11 jimz + * fix up disk queueing (remove configure routine, + * add shutdown list arg to create routines) + * + * Revision 1.5 1996/06/13 20:42:13 jimz + * add scan, cscan + * + * Revision 1.4 1996/06/07 22:26:27 jimz + * type-ify which_ru (RF_ReconUnitNum_t) + * + * Revision 1.3 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.2 1996/06/06 01:11:35 jimz + * fixed many priority-related bugs + * + * Revision 1.1 1996/06/05 19:17:40 jimz + * Initial revision + * + */ + +#include "rf_alloclist.h" +#include "rf_stripelocks.h" +#include "rf_layout.h" +#include "rf_diskqueue.h" +#include "rf_sstf.h" +#include "rf_debugMem.h" +#include "rf_general.h" +#include "rf_threadid.h" +#include "rf_options.h" + +#define DIR_LEFT 1 +#define DIR_RIGHT 2 +#define DIR_EITHER 3 + +#define SNUM_DIFF(_a_,_b_) (((_a_)>(_b_))?((_a_)-(_b_)):((_b_)-(_a_))) + +#define QSUM(_sstfq_) (((_sstfq_)->lopri.qlen)+((_sstfq_)->left.qlen)+((_sstfq_)->right.qlen)) + + +static void do_sstf_ord_q(RF_DiskQueueData_t **, + RF_DiskQueueData_t **, + RF_DiskQueueData_t *); + +static RF_DiskQueueData_t *closest_to_arm(RF_SstfQ_t *, + RF_SectorNum_t, + int *, + int); +static void do_dequeue(RF_SstfQ_t *, RF_DiskQueueData_t *); + + +static void do_sstf_ord_q(queuep, tailp, req) + RF_DiskQueueData_t **queuep; + RF_DiskQueueData_t **tailp; + RF_DiskQueueData_t *req; +{ + RF_DiskQueueData_t *r, *s; + + if (*queuep == NULL) { + *queuep = req; + *tailp = req; + req->next = NULL; + req->prev = NULL; + return; + } + if (req->sectorOffset <= (*queuep)->sectorOffset) { + req->next = *queuep; + req->prev = NULL; + (*queuep)->prev = req; + *queuep = req; + return; + } + if (req->sectorOffset > (*tailp)->sectorOffset) { + /* optimization */ + r = NULL; + s = *tailp; + goto q_at_end; + } + for(s=NULL,r=*queuep;r;s=r,r=r->next) { + if (r->sectorOffset >= req->sectorOffset) { + /* insert after s, before r */ + RF_ASSERT(s); + req->next = r; + r->prev = req; + s->next = req; + req->prev = s; + return; + } + } +q_at_end: + /* insert after s, at end of queue */ + RF_ASSERT(r == NULL); + RF_ASSERT(s); + RF_ASSERT(s == (*tailp)); + req->next = NULL; + req->prev = s; + s->next = req; + *tailp = req; +} + +/* for removing from head-of-queue */ +#define DO_HEAD_DEQ(_r_,_q_) { \ + _r_ = (_q_)->queue; \ + RF_ASSERT((_r_) != NULL); \ + (_q_)->queue = (_r_)->next; \ + (_q_)->qlen--; \ + if ((_q_)->qlen == 0) { \ + RF_ASSERT((_r_) == (_q_)->qtail); \ + RF_ASSERT((_q_)->queue == NULL); \ + (_q_)->qtail = NULL; \ + } \ + else { \ + RF_ASSERT((_q_)->queue->prev == (_r_)); \ + (_q_)->queue->prev = NULL; \ + } \ +} + +/* for removing from end-of-queue */ +#define DO_TAIL_DEQ(_r_,_q_) { \ + _r_ = (_q_)->qtail; \ + RF_ASSERT((_r_) != NULL); \ + (_q_)->qtail = (_r_)->prev; \ + (_q_)->qlen--; \ + if ((_q_)->qlen == 0) { \ + RF_ASSERT((_r_) == (_q_)->queue); \ + RF_ASSERT((_q_)->qtail == NULL); \ + (_q_)->queue = NULL; \ + } \ + else { \ + RF_ASSERT((_q_)->qtail->next == (_r_)); \ + (_q_)->qtail->next = NULL; \ + } \ +} + 
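+/* for removing from whichever end of the queue is closer to arm position _l_ */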
+#define DO_BEST_DEQ(_l_,_r_,_q_) { \ + if (SNUM_DIFF((_q_)->queue->sectorOffset,_l_) \ + < SNUM_DIFF((_q_)->qtail->sectorOffset,_l_)) \ + { \ + DO_HEAD_DEQ(_r_,_q_); \ + } \ + else { \ + DO_TAIL_DEQ(_r_,_q_); \ + } \ +} + +static RF_DiskQueueData_t *closest_to_arm(queue, arm_pos, dir, allow_reverse) + RF_SstfQ_t *queue; + RF_SectorNum_t arm_pos; + int *dir; + int allow_reverse; +{ + RF_SectorNum_t best_pos_l=0, this_pos_l=0, last_pos=0; + RF_SectorNum_t best_pos_r=0, this_pos_r=0; + RF_DiskQueueData_t *r, *best_l, *best_r; + + best_r = best_l = NULL; + for(r=queue->queue;r;r=r->next) { + if (r->sectorOffset < arm_pos) { + if (best_l == NULL) { + best_l = r; + last_pos = best_pos_l = this_pos_l; + } + else { + this_pos_l = arm_pos - r->sectorOffset; + if (this_pos_l < best_pos_l) { + best_l = r; + last_pos = best_pos_l = this_pos_l; + } + else { + last_pos = this_pos_l; + } + } + } + else { + if (best_r == NULL) { + best_r = r; + last_pos = best_pos_r = this_pos_r; + } + else { + this_pos_r = r->sectorOffset - arm_pos; + if (this_pos_r < best_pos_r) { + best_r = r; + last_pos = best_pos_r = this_pos_r; + } + else { + last_pos = this_pos_r; + } + if (this_pos_r > last_pos) { + /* getting farther away */ + break; + } + } + } + } + if ((best_r == NULL) && (best_l == NULL)) + return(NULL); + if ((*dir == DIR_RIGHT) && best_r) + return(best_r); + if ((*dir == DIR_LEFT) && best_l) + return(best_l); + if (*dir == DIR_EITHER) { + if (best_l == NULL) + return(best_r); + if (best_r == NULL) + return(best_l); + if (best_pos_r < best_pos_l) + return(best_r); + else + return(best_l); + } + /* + * Nothing in the direction we want to go. Reverse or + * reset the arm. We know we have an I/O in the other + * direction. + */ + if (allow_reverse) { + if (*dir == DIR_RIGHT) { + *dir = DIR_LEFT; + return(best_l); + } + else { + *dir = DIR_RIGHT; + return(best_r); + } + } + /* + * Reset (beginning of queue). 
+ */ + RF_ASSERT(*dir == DIR_RIGHT); + return(queue->queue); +} + +void *rf_SstfCreate(sect_per_disk, cl_list, listp) + RF_SectorCount_t sect_per_disk; + RF_AllocListElem_t *cl_list; + RF_ShutdownList_t **listp; +{ + RF_Sstf_t *sstfq; + + RF_CallocAndAdd(sstfq, 1, sizeof(RF_Sstf_t), (RF_Sstf_t *), cl_list); + sstfq->dir = DIR_EITHER; + sstfq->allow_reverse = 1; + return((void *)sstfq); +} + +void *rf_ScanCreate(sect_per_disk, cl_list, listp) + RF_SectorCount_t sect_per_disk; + RF_AllocListElem_t *cl_list; + RF_ShutdownList_t **listp; +{ + RF_Sstf_t *scanq; + + RF_CallocAndAdd(scanq, 1, sizeof(RF_Sstf_t), (RF_Sstf_t *), cl_list); + scanq->dir = DIR_RIGHT; + scanq->allow_reverse = 1; + return((void *)scanq); +} + +void *rf_CscanCreate(sect_per_disk, cl_list, listp) + RF_SectorCount_t sect_per_disk; + RF_AllocListElem_t *cl_list; + RF_ShutdownList_t **listp; +{ + RF_Sstf_t *cscanq; + + RF_CallocAndAdd(cscanq, 1, sizeof(RF_Sstf_t), (RF_Sstf_t *), cl_list); + cscanq->dir = DIR_RIGHT; + return((void *)cscanq); +} + +void rf_SstfEnqueue(qptr, req, priority) + void *qptr; + RF_DiskQueueData_t *req; + int priority; +{ + RF_Sstf_t *sstfq; + + sstfq = (RF_Sstf_t *)qptr; + + if (priority == RF_IO_LOW_PRIORITY) { + if (rf_sstfDebug || rf_scanDebug || rf_cscanDebug) { + RF_DiskQueue_t *dq; + int tid; + rf_get_threadid(tid); + dq = (RF_DiskQueue_t *)req->queue; + printf("[%d] ENQ lopri %d,%d queues are %d,%d,%d\n", + tid, dq->row, dq->col, sstfq->left.qlen, sstfq->right.qlen, + sstfq->lopri.qlen); + } + do_sstf_ord_q(&sstfq->lopri.queue, &sstfq->lopri.qtail, req); + sstfq->lopri.qlen++; + } + else { + if (req->sectorOffset < sstfq->last_sector) { + do_sstf_ord_q(&sstfq->left.queue, &sstfq->left.qtail, req); + sstfq->left.qlen++; + } + else { + do_sstf_ord_q(&sstfq->right.queue, &sstfq->right.qtail, req); + sstfq->right.qlen++; + } + } +} + +static void do_dequeue(queue, req) + RF_SstfQ_t *queue; + RF_DiskQueueData_t *req; +{ + RF_DiskQueueData_t *req2; + + if (rf_sstfDebug || rf_scanDebug || rf_cscanDebug) { + int tid; + rf_get_threadid(tid); + printf("[%d] do_dequeue\n", tid); + } + if (req == queue->queue) { + DO_HEAD_DEQ(req2,queue); + RF_ASSERT(req2 == req); + } + else if (req == queue->qtail) { + DO_TAIL_DEQ(req2,queue); + RF_ASSERT(req2 == req); + } + else { + /* dequeue from middle of list */ + RF_ASSERT(req->next); + RF_ASSERT(req->prev); + queue->qlen--; + req->next->prev = req->prev; + req->prev->next = req->next; + req->next = req->prev = NULL; + } +} + +RF_DiskQueueData_t *rf_SstfDequeue(qptr) + void *qptr; +{ + RF_DiskQueueData_t *req=NULL; + RF_Sstf_t *sstfq; + + sstfq = (RF_Sstf_t *)qptr; + + if (rf_sstfDebug) { + RF_DiskQueue_t *dq; + int tid; + rf_get_threadid(tid); + dq = (RF_DiskQueue_t *)req->queue; + RF_ASSERT(QSUM(sstfq)==dq->queueLength); + printf("[%d] sstf: Dequeue %d,%d queues are %d,%d,%d\n", tid, + dq->row, dq->col, sstfq->left.qlen, sstfq->right.qlen, + sstfq->lopri.qlen); + } + if (sstfq->left.queue == NULL) { + RF_ASSERT(sstfq->left.qlen == 0); + if (sstfq->right.queue == NULL) { + RF_ASSERT(sstfq->right.qlen == 0); + if (sstfq->lopri.queue == NULL) { + RF_ASSERT(sstfq->lopri.qlen == 0); + return(NULL); + } + if (rf_sstfDebug) { + int tid; + rf_get_threadid(tid); + printf("[%d] sstf: check for close lopri", tid); + } + req = closest_to_arm(&sstfq->lopri, sstfq->last_sector, + &sstfq->dir, sstfq->allow_reverse); + if (rf_sstfDebug) { + int tid; + rf_get_threadid(tid); + printf("[%d] sstf: closest_to_arm said %lx", tid, (long)req); + } + if (req == NULL) + return(NULL); + 
do_dequeue(&sstfq->lopri, req); + } + else { + DO_BEST_DEQ(sstfq->last_sector,req,&sstfq->right); + } + } + else { + if (sstfq->right.queue == NULL) { + RF_ASSERT(sstfq->right.qlen == 0); + DO_BEST_DEQ(sstfq->last_sector,req,&sstfq->left); + } + else { + if (SNUM_DIFF(sstfq->last_sector,sstfq->right.queue->sectorOffset) + < SNUM_DIFF(sstfq->last_sector,sstfq->left.qtail->sectorOffset)) + { + DO_HEAD_DEQ(req,&sstfq->right); + } + else { + DO_TAIL_DEQ(req,&sstfq->left); + } + } + } + RF_ASSERT(req); + sstfq->last_sector = req->sectorOffset; + return(req); +} + +RF_DiskQueueData_t *rf_ScanDequeue(qptr) + void *qptr; +{ + RF_DiskQueueData_t *req=NULL; + RF_Sstf_t *scanq; + + scanq = (RF_Sstf_t *)qptr; + + if (rf_scanDebug) { + RF_DiskQueue_t *dq; + int tid; + rf_get_threadid(tid); + dq = (RF_DiskQueue_t *)req->queue; + RF_ASSERT(QSUM(scanq)==dq->queueLength); + printf("[%d] scan: Dequeue %d,%d queues are %d,%d,%d\n", tid, + dq->row, dq->col, scanq->left.qlen, scanq->right.qlen, + scanq->lopri.qlen); + } + if (scanq->left.queue == NULL) { + RF_ASSERT(scanq->left.qlen == 0); + if (scanq->right.queue == NULL) { + RF_ASSERT(scanq->right.qlen == 0); + if (scanq->lopri.queue == NULL) { + RF_ASSERT(scanq->lopri.qlen == 0); + return(NULL); + } + req = closest_to_arm(&scanq->lopri, scanq->last_sector, + &scanq->dir, scanq->allow_reverse); + if (req == NULL) + return(NULL); + do_dequeue(&scanq->lopri, req); + } + else { + scanq->dir = DIR_RIGHT; + DO_HEAD_DEQ(req,&scanq->right); + } + } + else if (scanq->right.queue == NULL) { + RF_ASSERT(scanq->right.qlen == 0); + RF_ASSERT(scanq->left.queue); + scanq->dir = DIR_LEFT; + DO_TAIL_DEQ(req,&scanq->left); + } + else { + RF_ASSERT(scanq->right.queue); + RF_ASSERT(scanq->left.queue); + if (scanq->dir == DIR_RIGHT) { + DO_HEAD_DEQ(req,&scanq->right); + } + else { + DO_TAIL_DEQ(req,&scanq->left); + } + } + RF_ASSERT(req); + scanq->last_sector = req->sectorOffset; + return(req); +} + +RF_DiskQueueData_t *rf_CscanDequeue(qptr) + void *qptr; +{ + RF_DiskQueueData_t *req=NULL; + RF_Sstf_t *cscanq; + + cscanq = (RF_Sstf_t *)qptr; + + RF_ASSERT(cscanq->dir == DIR_RIGHT); + if (rf_cscanDebug) { + RF_DiskQueue_t *dq; + int tid; + rf_get_threadid(tid); + dq = (RF_DiskQueue_t *)req->queue; + RF_ASSERT(QSUM(cscanq)==dq->queueLength); + printf("[%d] scan: Dequeue %d,%d queues are %d,%d,%d\n", tid, + dq->row, dq->col, cscanq->left.qlen, cscanq->right.qlen, + cscanq->lopri.qlen); + } + if (cscanq->right.queue) { + DO_HEAD_DEQ(req,&cscanq->right); + } + else { + RF_ASSERT(cscanq->right.qlen == 0); + if (cscanq->left.queue == NULL) { + RF_ASSERT(cscanq->left.qlen == 0); + if (cscanq->lopri.queue == NULL) { + RF_ASSERT(cscanq->lopri.qlen == 0); + return(NULL); + } + req = closest_to_arm(&cscanq->lopri, cscanq->last_sector, + &cscanq->dir, cscanq->allow_reverse); + if (req == NULL) + return(NULL); + do_dequeue(&cscanq->lopri, req); + } + else { + /* + * There's I/Os to the left of the arm. Swing + * on back (swap queues). 
+ */ + cscanq->right = cscanq->left; + cscanq->left.qlen = 0; + cscanq->left.queue = cscanq->left.qtail = NULL; + DO_HEAD_DEQ(req,&cscanq->right); + } + } + RF_ASSERT(req); + cscanq->last_sector = req->sectorOffset; + return(req); +} + +RF_DiskQueueData_t *rf_SstfPeek(qptr) + void *qptr; +{ + RF_DiskQueueData_t *req; + RF_Sstf_t *sstfq; + + sstfq = (RF_Sstf_t *)qptr; + + if ((sstfq->left.queue == NULL) && (sstfq->right.queue == NULL)) { + req = closest_to_arm(&sstfq->lopri, sstfq->last_sector, &sstfq->dir, + sstfq->allow_reverse); + } + else { + if (sstfq->left.queue == NULL) + req = sstfq->right.queue; + else { + if (sstfq->right.queue == NULL) + req = sstfq->left.queue; + else { + if (SNUM_DIFF(sstfq->last_sector,sstfq->right.queue->sectorOffset) + <SNUM_DIFF(sstfq->last_sector,sstfq->left.qtail->sectorOffset)) + { + req = sstfq->right.queue; + } + else { + req = sstfq->left.qtail; + } + } + } + } + if (req == NULL) { + RF_ASSERT(QSUM(sstfq) == 0); + } + return(req); +} + +RF_DiskQueueData_t *rf_ScanPeek(qptr) + void *qptr; +{ + RF_DiskQueueData_t *req; + RF_Sstf_t *scanq; + int dir; + + scanq = (RF_Sstf_t *)qptr; + dir = scanq->dir; + + if (scanq->left.queue == NULL) { + RF_ASSERT(scanq->left.qlen == 0); + if (scanq->right.queue == NULL) { + RF_ASSERT(scanq->right.qlen == 0); + if (scanq->lopri.queue == NULL) { + RF_ASSERT(scanq->lopri.qlen == 0); + return(NULL); + } + req = closest_to_arm(&scanq->lopri, scanq->last_sector, + &dir, scanq->allow_reverse); + } + else { + req = scanq->right.queue; + } + } + else if (scanq->right.queue == NULL) { + RF_ASSERT(scanq->right.qlen == 0); + RF_ASSERT(scanq->left.queue); + req = scanq->left.qtail; + } + else { + RF_ASSERT(scanq->right.queue); + RF_ASSERT(scanq->left.queue); + if (scanq->dir == DIR_RIGHT) { + req = scanq->right.queue; + } + else { + req = scanq->left.qtail; + } + } + if (req == NULL) { + RF_ASSERT(QSUM(scanq) == 0); + } + return(req); +} + +RF_DiskQueueData_t *rf_CscanPeek(qptr) + void *qptr; +{ + RF_DiskQueueData_t *req; + RF_Sstf_t *cscanq; + + cscanq = (RF_Sstf_t *)qptr; + + RF_ASSERT(cscanq->dir == DIR_RIGHT); + if (cscanq->right.queue) { + req = cscanq->right.queue; + } + else { + RF_ASSERT(cscanq->right.qlen == 0); + if (cscanq->left.queue == NULL) { + RF_ASSERT(cscanq->left.qlen == 0); + if (cscanq->lopri.queue == NULL) { + RF_ASSERT(cscanq->lopri.qlen == 0); + return(NULL); + } + req = closest_to_arm(&cscanq->lopri, cscanq->last_sector, + &cscanq->dir, cscanq->allow_reverse); + } + else { + /* + * There's I/Os to the left of the arm. We'll end + * up swinging on back. 
+ */ + req = cscanq->left.queue; + } + } + if (req == NULL) { + RF_ASSERT(QSUM(cscanq) == 0); + } + return(req); +} + +int rf_SstfPromote(qptr, parityStripeID, which_ru) + void *qptr; + RF_StripeNum_t parityStripeID; + RF_ReconUnitNum_t which_ru; +{ + RF_DiskQueueData_t *r, *next; + RF_Sstf_t *sstfq; + int n; + + sstfq = (RF_Sstf_t *)qptr; + + n = 0; + if (rf_sstfDebug || rf_scanDebug || rf_cscanDebug) { + int tid; + rf_get_threadid(tid); + printf("[%d] promote %ld %d queues are %d,%d,%d\n", + tid, (long)parityStripeID, (int)which_ru, + sstfq->left.qlen, + sstfq->right.qlen, + sstfq->lopri.qlen); + } + for(r=sstfq->lopri.queue;r;r=next) { + next = r->next; + if (rf_sstfDebug || rf_scanDebug || rf_cscanDebug) { + int tid; + rf_get_threadid(tid); + printf("[%d] check promote %lx\n", tid, (long)r); + } + if ((r->parityStripeID == parityStripeID) + && (r->which_ru == which_ru)) + { + do_dequeue(&sstfq->lopri, r); + rf_SstfEnqueue(qptr, r, RF_IO_NORMAL_PRIORITY); + n++; + } + } + if (rf_sstfDebug || rf_scanDebug || rf_cscanDebug) { + int tid; + rf_get_threadid(tid); + printf("[%d] promoted %d matching I/Os queues are %d,%d,%d\n", + tid, n, sstfq->left.qlen, sstfq->right.qlen, sstfq->lopri.qlen); + } + return(n); +} diff --git a/sys/dev/raidframe/rf_sstf.h b/sys/dev/raidframe/rf_sstf.h new file mode 100644 index 00000000000..9d81a090826 --- /dev/null +++ b/sys/dev/raidframe/rf_sstf.h @@ -0,0 +1,90 @@ +/* $OpenBSD: rf_sstf.h,v 1.1 1999/01/11 14:29:50 niklas Exp $ */ +/* $NetBSD: rf_sstf.h,v 1.1 1998/11/13 04:20:34 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Jim Zelenka + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. 
+ */ + +/* : + * Log: rf_sstf.h,v + * Revision 1.6 1996/06/18 20:53:11 jimz + * fix up disk queueing (remove configure routine, + * add shutdown list arg to create routines) + * + * Revision 1.5 1996/06/13 20:42:08 jimz + * add scan, cscan + * + * Revision 1.4 1996/06/07 22:26:27 jimz + * type-ify which_ru (RF_ReconUnitNum_t) + * + * Revision 1.3 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.2 1996/06/06 01:22:24 jimz + * minor cleanup + * + * Revision 1.1 1996/06/05 19:17:40 jimz + * Initial revision + * + */ + +#ifndef _RF__RF_SSTF_H_ +#define _RF__RF_SSTF_H_ + +#include "rf_diskqueue.h" + +typedef struct RF_SstfQ_s { + RF_DiskQueueData_t *queue; + RF_DiskQueueData_t *qtail; + int qlen; +} RF_SstfQ_t; + +typedef struct RF_Sstf_s { + RF_SstfQ_t left; + RF_SstfQ_t right; + RF_SstfQ_t lopri; + RF_SectorNum_t last_sector; + int dir; + int allow_reverse; +} RF_Sstf_t; + +void *rf_SstfCreate(RF_SectorCount_t sect_per_disk, + RF_AllocListElem_t *cl_list, RF_ShutdownList_t **listp); +void *rf_ScanCreate(RF_SectorCount_t sect_per_disk, + RF_AllocListElem_t *cl_list, RF_ShutdownList_t **listp); +void *rf_CscanCreate(RF_SectorCount_t sect_per_disk, + RF_AllocListElem_t *cl_list, RF_ShutdownList_t **listp); +void rf_SstfEnqueue(void *qptr, RF_DiskQueueData_t *req, int priority); +RF_DiskQueueData_t *rf_SstfDequeue(void *qptr); +RF_DiskQueueData_t *rf_SstfPeek(void *qptr); +int rf_SstfPromote(void *qptr, RF_StripeNum_t parityStripeID, + RF_ReconUnitNum_t which_ru); +RF_DiskQueueData_t *rf_ScanDequeue(void *qptr); +RF_DiskQueueData_t *rf_ScanPeek(void *qptr); +RF_DiskQueueData_t *rf_CscanDequeue(void *qptr); +RF_DiskQueueData_t *rf_CscanPeek(void *qptr); + +#endif /* !_RF__RF_SSTF_H_ */ diff --git a/sys/dev/raidframe/rf_states.c b/sys/dev/raidframe/rf_states.c new file mode 100644 index 00000000000..1bad7bd4ab7 --- /dev/null +++ b/sys/dev/raidframe/rf_states.c @@ -0,0 +1,873 @@ +/* $OpenBSD: rf_states.c,v 1.1 1999/01/11 14:29:50 niklas Exp $ */ +/* $NetBSD: rf_states.c,v 1.2 1998/11/13 13:47:56 drochner Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland, William V. Courtright II, Robby Findler + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. 
+ */ + +/* + * : + * Log: rf_states.c,v + * Revision 1.45 1996/07/28 20:31:39 jimz + * i386netbsd port + * true/false fixup + * + * Revision 1.44 1996/07/27 23:36:08 jimz + * Solaris port of simulator + * + * Revision 1.43 1996/07/22 19:52:16 jimz + * switched node params to RF_DagParam_t, a union of + * a 64-bit int and a void *, for better portability + * attempted hpux port, but failed partway through for + * lack of a single C compiler capable of compiling all + * source files + * + * Revision 1.42 1996/07/17 21:00:58 jimz + * clean up timer interface, tracing + * + * Revision 1.41 1996/07/11 19:08:00 jimz + * generalize reconstruction mechanism + * allow raid1 reconstructs via copyback (done with array + * quiesced, not online, therefore not disk-directed) + * + * Revision 1.40 1996/06/17 14:38:33 jimz + * properly #if out RF_DEMO code + * fix bug in MakeConfig that was causing weird behavior + * in configuration routines (config was not zeroed at start) + * clean up genplot handling of stacks + * + * Revision 1.39 1996/06/11 18:12:17 jimz + * got rid of evil race condition in LastState + * + * Revision 1.38 1996/06/10 14:18:58 jimz + * move user, throughput stats into per-array structure + * + * Revision 1.37 1996/06/09 02:36:46 jimz + * lots of little crufty cleanup- fixup whitespace + * issues, comment #ifdefs, improve typing in some + * places (esp size-related) + * + * Revision 1.36 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.35 1996/06/05 18:06:02 jimz + * Major code cleanup. The Great Renaming is now done. + * Better modularity. Better typing. Fixed a bunch of + * synchronization bugs. Made a lot of global stuff + * per-desc or per-array. Removed dead code. + * + * Revision 1.34 1996/06/03 23:28:26 jimz + * more bugfixes + * check in tree to sync for IPDS runs with current bugfixes + * there still may be a problem with threads in the script test + * getting I/Os stuck- not trivially reproducible (runs ~50 times + * in a row without getting stuck) + * + * Revision 1.33 1996/05/31 22:26:54 jimz + * fix a lot of mapping problems, memory allocation problems + * found some weird lock issues, fixed 'em + * more code cleanup + * + * Revision 1.32 1996/05/30 12:59:18 jimz + * make etimer happier, more portable + * + * Revision 1.31 1996/05/30 11:29:41 jimz + * Numerous bug fixes. Stripe lock release code disagreed with the taking code + * about when stripes should be locked (I made it consistent: no parity, no lock) + * There was a lot of extra serialization of I/Os which I've removed- a lot of + * it was to calculate values for the cache code, which is no longer with us. + * More types, function, macro cleanup. Added code to properly quiesce the array + * on shutdown. Made a lot of stuff array-specific which was (bogusly) general + * before. Fixed memory allocation, freeing bugs. 
+ * + * Revision 1.30 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.29 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.28 1996/05/24 04:28:55 jimz + * release cleanup ckpt + * + * Revision 1.27 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.26 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.25 1996/05/20 19:31:46 jimz + * straighten out syntax problems + * + * Revision 1.24 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.23 1996/05/16 23:37:33 jimz + * fix misspelled "else" + * + * Revision 1.22 1996/05/15 22:33:32 jimz + * appropriately #ifdef cache stuff + * + * Revision 1.21 1996/05/06 22:09:20 wvcii + * rf_State_ExecuteDAG now only executes the first dag + * of each parity stripe in a multi-stripe access + * + * rf_State_ProcessDAG now executes all dags in a + * multi-stripe access except the first dag of each stripe. + * + * Revision 1.20 1995/12/12 18:10:06 jimz + * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT + * fix 80-column brain damage in comments + * + * Revision 1.19 1995/11/19 16:29:50 wvcii + * replaced LaunchDAGState with CreateDAGState, ExecuteDAGState + * created rf_ContinueDagAccess + * + * Revision 1.18 1995/11/07 15:37:23 wvcii + * deleted states SendDAGState, RetryDAGState + * added staes: LaunchDAGState, ProcessDAGState + * code no longer has a hard-coded retry count of 1 but will support + * retries until a dag can not be found (selected) to perform the user request + * + * Revision 1.17 1995/10/09 23:36:08 amiri + * *** empty log message *** + * + * Revision 1.16 1995/10/09 18:36:58 jimz + * moved call to StopThroughput for user-level driver to rf_driver.c + * + * Revision 1.15 1995/10/09 18:07:23 wvcii + * lastState now call rf_StopThroughputStats + * + * Revision 1.14 1995/10/05 18:56:31 jimz + * no-op file if !INCLUDE_VS + * + * Revision 1.13 1995/09/30 20:38:24 jimz + * LogTraceRec now takes a Raid * as its first argument + * + * Revision 1.12 1995/09/19 22:58:54 jimz + * integrate DKUSAGE into raidframe + * + * Revision 1.11 1995/09/07 01:26:55 jimz + * Achive basic compilation in kernel. Kernel functionality + * is not guaranteed at all, but it'll compile. Mostly. I hope. + * + * Revision 1.10 1995/07/26 03:28:31 robby + * intermediary checkin + * + * Revision 1.9 1995/07/23 02:50:33 robby + * oops. 
fixed boo boo + * + * Revision 1.8 1995/07/22 22:54:54 robby + * removed incorrect comment + * + * Revision 1.7 1995/07/21 19:30:26 robby + * added idle state for rf_when-idle.c + * + * Revision 1.6 1995/07/10 19:06:28 rachad + * *** empty log message *** + * + * Revision 1.5 1995/07/10 17:30:38 robby + * added virtual striping lock states + * + * Revision 1.4 1995/07/08 18:05:39 rachad + * Linked up Claudsons code with the real cache + * + * Revision 1.3 1995/07/06 14:38:50 robby + * changed get_thread_id to get_threadid + * + * Revision 1.2 1995/07/06 14:24:15 robby + * added log + * + */ + +#ifdef _KERNEL +#define KERNEL +#endif + +#ifdef KERNEL +#if !defined(__NetBSD__) && !defined(__OpenBSD__) +#include <dkusage.h> +#endif /* !__NetBSD__ && !__OpenBSD__ */ +#endif /* KERNEL */ + +#include <sys/errno.h> + +#include "rf_archs.h" +#include "rf_threadstuff.h" +#include "rf_raid.h" +#include "rf_dag.h" +#include "rf_desc.h" +#include "rf_aselect.h" +#include "rf_threadid.h" +#include "rf_general.h" +#include "rf_states.h" +#include "rf_dagutils.h" +#include "rf_driver.h" +#include "rf_engine.h" +#include "rf_map.h" +#include "rf_etimer.h" + +#if defined(KERNEL) && (DKUSAGE > 0) +#include <sys/dkusage.h> +#include <io/common/iotypes.h> +#include <io/cam/dec_cam.h> +#include <io/cam/cam.h> +#include <io/cam/pdrv.h> +#endif /* KERNEL && DKUSAGE > 0 */ + +/* prototypes for some of the available states. + + States must: + + - not block. + + - either schedule rf_ContinueRaidAccess as a callback and return + RF_TRUE, or complete all of their work and return RF_FALSE. + + - increment desc->state when they have finished their work. +*/ + + +#ifdef SIMULATE +extern int global_async_flag; +#endif /* SIMULATE */ + +static char *StateName(RF_AccessState_t state) +{ + switch (state) { + case rf_QuiesceState: return "QuiesceState"; + case rf_MapState: return "MapState"; + case rf_LockState: return "LockState"; + case rf_CreateDAGState: return "CreateDAGState"; + case rf_ExecuteDAGState: return "ExecuteDAGState"; + case rf_ProcessDAGState: return "ProcessDAGState"; + case rf_CleanupState: return "CleanupState"; + case rf_LastState: return "LastState"; + case rf_IncrAccessesCountState: return "IncrAccessesCountState"; + case rf_DecrAccessesCountState: return "DecrAccessesCountState"; + default: return "!!! 
UnnamedState !!!"; + } +} + +void rf_ContinueRaidAccess(RF_RaidAccessDesc_t *desc) +{ + int suspended = RF_FALSE; + int current_state_index = desc->state; + RF_AccessState_t current_state = desc->states[current_state_index]; + +#ifdef SIMULATE + rf_SetCurrentOwner(desc->owner); +#endif /* SIMULATE */ + + do { + + current_state_index = desc->state; + current_state = desc->states [current_state_index]; + + switch (current_state) { + + case rf_QuiesceState: suspended = rf_State_Quiesce(desc); + break; + case rf_IncrAccessesCountState: suspended = rf_State_IncrAccessCount(desc); + break; + case rf_MapState: suspended = rf_State_Map(desc); + break; + case rf_LockState: suspended = rf_State_Lock(desc); + break; + case rf_CreateDAGState: suspended = rf_State_CreateDAG(desc); + break; + case rf_ExecuteDAGState: suspended = rf_State_ExecuteDAG(desc); + break; + case rf_ProcessDAGState: suspended = rf_State_ProcessDAG(desc); + break; + case rf_CleanupState: suspended = rf_State_Cleanup(desc); + break; + case rf_DecrAccessesCountState: suspended = rf_State_DecrAccessCount(desc); + break; + case rf_LastState: suspended = rf_State_LastState(desc); + break; + } + + /* after this point, we cannot dereference desc since desc may + have been freed. desc is only freed in LastState, so if we + renter this function or loop back up, desc should be valid. */ + + if (rf_printStatesDebug) { + int tid; + rf_get_threadid (tid); + + printf ("[%d] State: %-24s StateIndex: %3i desc: 0x%ld %s\n", + tid, StateName(current_state), current_state_index, (long)desc, + suspended ? "callback scheduled" : "looping"); + } + } while (!suspended && current_state != rf_LastState); + + return; +} + + +void rf_ContinueDagAccess (RF_DagList_t *dagList) +{ + RF_AccTraceEntry_t *tracerec = &(dagList->desc->tracerec); + RF_RaidAccessDesc_t *desc; + RF_DagHeader_t *dag_h; + RF_Etimer_t timer; + int i; + + desc = dagList->desc; + + timer = tracerec->timer; + RF_ETIMER_STOP(timer); + RF_ETIMER_EVAL(timer); + tracerec->specific.user.exec_us = RF_ETIMER_VAL_US(timer); + RF_ETIMER_START(tracerec->timer); + + /* skip to dag which just finished */ + dag_h = dagList->dags; + for (i = 0; i < dagList->numDagsDone; i++) { + dag_h = dag_h->next; + } + + /* check to see if retry is required */ + if (dag_h->status == rf_rollBackward) { + /* when a dag fails, mark desc status as bad and allow all other dags + * in the desc to execute to completion. 
then, free all dags and start over */ + desc->status = 1; /* bad status */ +#if RF_DEMO > 0 + if (!rf_demoMode) +#endif /* RF_DEMO > 0 */ + { + printf("[%d] DAG failure: %c addr 0x%lx (%ld) nblk 0x%x (%d) buf 0x%lx\n", + desc->tid, desc->type, (long)desc->raidAddress, + (long)desc->raidAddress,(int)desc->numBlocks, + (int)desc->numBlocks, (unsigned long) (desc->bufPtr)); + } + } + + dagList->numDagsDone++; + rf_ContinueRaidAccess(desc); +} + + +int rf_State_LastState(RF_RaidAccessDesc_t *desc) +{ + void (*callbackFunc)(RF_CBParam_t) = desc->callbackFunc; + RF_CBParam_t callbackArg; + + callbackArg.p = desc->callbackArg; + +#ifdef SIMULATE + int tid; + rf_get_threadid(tid); + + if (rf_accessDebug) + printf("async_flag set to %d\n",global_async_flag); + global_async_flag=desc->async_flag; + if (rf_accessDebug) + printf("Will now do clean up for %d\n",rf_GetCurrentOwner()); + rf_FreeRaidAccDesc(desc); + + if (callbackFunc) + callbackFunc(callbackArg); +#else /* SIMULATE */ + +#ifndef KERNEL + + if (!(desc->flags & RF_DAG_NONBLOCKING_IO)) { + /* bummer that we have to take another lock here */ + RF_LOCK_MUTEX(desc->mutex); + RF_ASSERT(desc->flags&RF_DAG_ACCESS_COMPLETE); + RF_SIGNAL_COND(desc->cond); /* DoAccess frees the desc in the blocking-I/O case */ + RF_UNLOCK_MUTEX(desc->mutex); + } + else + rf_FreeRaidAccDesc(desc); + + if (callbackFunc) + callbackFunc(callbackArg); + +#else /* KERNEL */ + if (!(desc->flags & RF_DAG_TEST_ACCESS)) {/* don't biodone if this */ +#if DKUSAGE > 0 + RF_DKU_END_IO(((RF_Raid_t *)desc->raidPtr)->raidid,(struct buf *)desc->bp); +#else + RF_DKU_END_IO(((RF_Raid_t *)desc->raidPtr)->raidid); +#endif /* DKUSAGE > 0 */ + /* printf("Calling biodone on 0x%x\n",desc->bp); */ + biodone(desc->bp); /* access came through ioctl */ + } + + if (callbackFunc) callbackFunc(callbackArg); + rf_FreeRaidAccDesc(desc); + +#endif /* ! KERNEL */ +#endif /* SIMULATE */ + + return RF_FALSE; +} + +int rf_State_IncrAccessCount(RF_RaidAccessDesc_t *desc) +{ + RF_Raid_t *raidPtr; + + raidPtr = desc->raidPtr; + /* Bummer. We have to do this to be 100% safe w.r.t. the increment below */ + RF_LOCK_MUTEX(raidPtr->access_suspend_mutex); + raidPtr->accs_in_flight++; /* used to detect quiescence */ + RF_UNLOCK_MUTEX(raidPtr->access_suspend_mutex); + + desc->state++; + return RF_FALSE; +} + +int rf_State_DecrAccessCount(RF_RaidAccessDesc_t *desc) +{ + RF_Raid_t *raidPtr; + + raidPtr = desc->raidPtr; + + RF_LOCK_MUTEX(raidPtr->access_suspend_mutex); + raidPtr->accs_in_flight--; + if (raidPtr->accesses_suspended && raidPtr->accs_in_flight == 0) { + rf_SignalQuiescenceLock(raidPtr, raidPtr->reconDesc); + } + rf_UpdateUserStats(raidPtr, RF_ETIMER_VAL_US(desc->timer), desc->numBlocks); + RF_UNLOCK_MUTEX(raidPtr->access_suspend_mutex); + + desc->state++; + return RF_FALSE; +} + +int rf_State_Quiesce(RF_RaidAccessDesc_t *desc) +{ + RF_AccTraceEntry_t *tracerec = &desc->tracerec; + RF_Etimer_t timer; + int suspended = RF_FALSE; + RF_Raid_t *raidPtr; + + raidPtr = desc->raidPtr; + + RF_ETIMER_START(timer); + RF_ETIMER_START(desc->timer); + + RF_LOCK_MUTEX(raidPtr->access_suspend_mutex); + if (raidPtr->accesses_suspended) { + RF_CallbackDesc_t *cb; + cb = rf_AllocCallbackDesc(); + /* XXX the following cast is quite bogus... rf_ContinueRaidAccess + takes a (RF_RaidAccessDesc_t *) as an argument.. 
GO */ + cb->callbackFunc = (void (*)(RF_CBParam_t))rf_ContinueRaidAccess; + cb->callbackArg.p = (void *) desc; + cb->next = raidPtr->quiesce_wait_list; + raidPtr->quiesce_wait_list = cb; + suspended = RF_TRUE; + } + + RF_UNLOCK_MUTEX(raidPtr->access_suspend_mutex); + + RF_ETIMER_STOP(timer); + RF_ETIMER_EVAL(timer); + tracerec->specific.user.suspend_ovhd_us += RF_ETIMER_VAL_US(timer); + + if (suspended && rf_quiesceDebug) + printf("Stalling access due to quiescence lock\n"); + + desc->state++; + return suspended; +} + +int rf_State_Map(RF_RaidAccessDesc_t *desc) +{ + RF_Raid_t *raidPtr = desc->raidPtr; + RF_AccTraceEntry_t *tracerec = &desc->tracerec; + RF_Etimer_t timer; + + RF_ETIMER_START(timer); + + if (!(desc->asmap = rf_MapAccess(raidPtr, desc->raidAddress, desc->numBlocks, + desc->bufPtr, RF_DONT_REMAP))) + RF_PANIC(); + + RF_ETIMER_STOP(timer); + RF_ETIMER_EVAL(timer); + tracerec->specific.user.map_us = RF_ETIMER_VAL_US(timer); + + desc->state ++; + return RF_FALSE; +} + +int rf_State_Lock(RF_RaidAccessDesc_t *desc) +{ + RF_AccTraceEntry_t *tracerec = &desc->tracerec; + RF_Raid_t *raidPtr = desc->raidPtr; + RF_AccessStripeMapHeader_t *asmh = desc->asmap; + RF_AccessStripeMap_t *asm_p; + RF_Etimer_t timer; + int suspended = RF_FALSE; + + RF_ETIMER_START(timer); + if (!(raidPtr->Layout.map->flags & RF_NO_STRIPE_LOCKS)) { + RF_StripeNum_t lastStripeID = -1; + + /* acquire each lock that we don't already hold */ + for (asm_p = asmh->stripeMap; asm_p; asm_p = asm_p->next) { + RF_ASSERT(RF_IO_IS_R_OR_W(desc->type)); + if (!rf_suppressLocksAndLargeWrites && + asm_p->parityInfo && + !(desc->flags& RF_DAG_SUPPRESS_LOCKS) && + !(asm_p->flags & RF_ASM_FLAGS_LOCK_TRIED)) + { + asm_p->flags |= RF_ASM_FLAGS_LOCK_TRIED; + RF_ASSERT(asm_p->stripeID > lastStripeID); /* locks must be acquired + hierarchically */ + lastStripeID = asm_p->stripeID; + /* XXX the cast to (void (*)(RF_CBParam_t)) below is bogus! GO */ + RF_INIT_LOCK_REQ_DESC(asm_p->lockReqDesc, desc->type, + (void (*)(struct buf *))rf_ContinueRaidAccess, desc, asm_p, + raidPtr->Layout.dataSectorsPerStripe); + if (rf_AcquireStripeLock(raidPtr->lockTable, asm_p->stripeID, + &asm_p->lockReqDesc)) + { + suspended = RF_TRUE; + break; + } + } + + if (desc->type == RF_IO_TYPE_WRITE && + raidPtr->status[asm_p->physInfo->row] == rf_rs_reconstructing) + { + if (! (asm_p->flags & RF_ASM_FLAGS_FORCE_TRIED) ) { + int val; + + asm_p->flags |= RF_ASM_FLAGS_FORCE_TRIED; + /* XXX the cast below is quite bogus!!! XXX GO */ + val = rf_ForceOrBlockRecon(raidPtr, asm_p, + (void (*)(RF_Raid_t *,void *))rf_ContinueRaidAccess, desc); + if (val == 0) { + asm_p->flags |= RF_ASM_FLAGS_RECON_BLOCKED; + } + else { + suspended = RF_TRUE; + break; + } + } + else { + if (rf_pssDebug) { + printf("[%d] skipping force/block because already done, psid %ld\n", + desc->tid,(long)asm_p->stripeID); + } + } + } + else { + if (rf_pssDebug) { + printf("[%d] skipping force/block because not write or not under recon, psid %ld\n", + desc->tid,(long)asm_p->stripeID); + } + } + } + + RF_ETIMER_STOP(timer); + RF_ETIMER_EVAL(timer); + tracerec->specific.user.lock_us += RF_ETIMER_VAL_US(timer); + + if (suspended) + return(RF_TRUE); + } + + desc->state++; + return(RF_FALSE); +} + +/* + * the following three states create, execute, and post-process dags + * the error recovery unit is a single dag. 
+ * by default, SelectAlgorithm creates an array of dags, one per parity stripe + * in some tricky cases, multiple dags per stripe are created + * - dags within a parity stripe are executed sequentially (arbitrary order) + * - dags for distinct parity stripes are executed concurrently + * + * repeat until all dags complete successfully -or- dag selection fails + * + * while !done + * create dag(s) (SelectAlgorithm) + * if dag + * execute dag (DispatchDAG) + * if dag successful + * done (SUCCESS) + * else + * !done (RETRY - start over with new dags) + * else + * done (FAIL) + */ +int rf_State_CreateDAG (RF_RaidAccessDesc_t *desc) +{ + RF_AccTraceEntry_t *tracerec = &desc->tracerec; + RF_Etimer_t timer; + RF_DagHeader_t *dag_h; + int i, selectStatus; + + /* generate a dag for the access, and fire it off. When the dag + completes, we'll get re-invoked in the next state. */ + RF_ETIMER_START(timer); + /* SelectAlgorithm returns one or more dags */ + selectStatus = rf_SelectAlgorithm(desc, desc->flags|RF_DAG_SUPPRESS_LOCKS); + if (rf_printDAGsDebug) + for (i = 0; i < desc->numStripes; i++) + rf_PrintDAGList(desc->dagArray[i].dags); + RF_ETIMER_STOP(timer); + RF_ETIMER_EVAL(timer); + /* update time to create all dags */ + tracerec->specific.user.dag_create_us = RF_ETIMER_VAL_US(timer); + + desc->status = 0; /* good status */ + + if (selectStatus) { + /* failed to create a dag */ + /* this happens when there are too many faults or incomplete dag libraries */ + printf("[Failed to create a DAG\n]"); + RF_PANIC(); + } + else { + /* bind dags to desc */ + for (i = 0; i < desc->numStripes; i++) { + dag_h = desc->dagArray[i].dags; + while (dag_h) { +#ifdef KERNEL + dag_h->bp = (struct buf *) desc->bp; +#endif /* KERNEL */ + dag_h->tracerec = tracerec; + dag_h = dag_h->next; + } + } + desc->flags |= RF_DAG_DISPATCH_RETURNED; + desc->state++; /* next state should be rf_State_ExecuteDAG */ + } + return RF_FALSE; +} + + + +/* the access has an array of dagLists, one dagList per parity stripe. + * fire the first dag in each parity stripe (dagList). + * dags within a stripe (dagList) must be executed sequentially + * - this preserves atomic parity update + * dags for independents parity groups (stripes) are fired concurrently */ + +int rf_State_ExecuteDAG(RF_RaidAccessDesc_t *desc) +{ + int i; + RF_DagHeader_t *dag_h; + RF_DagList_t *dagArray = desc->dagArray; + + /* next state is always rf_State_ProcessDAG + * important to do this before firing the first dag + * (it may finish before we leave this routine) */ + desc->state++; + + /* sweep dag array, a stripe at a time, firing the first dag in each stripe */ + for (i = 0; i < desc->numStripes; i++) { + RF_ASSERT(dagArray[i].numDags > 0); + RF_ASSERT(dagArray[i].numDagsDone == 0); + RF_ASSERT(dagArray[i].numDagsFired == 0); + RF_ETIMER_START(dagArray[i].tracerec.timer); + /* fire first dag in this stripe */ + dag_h = dagArray[i].dags; + RF_ASSERT(dag_h); + dagArray[i].numDagsFired++; + /* XXX Yet another case where we pass in a conflicting function pointer + :-( XXX GO */ + rf_DispatchDAG(dag_h, (void (*)(void *))rf_ContinueDagAccess, &dagArray[i]); + } + + /* the DAG will always call the callback, even if there was no + * blocking, so we are always suspended in this state */ + return RF_TRUE; +} + + + +/* rf_State_ProcessDAG is entered when a dag completes. 
+ * first, check to all dags in the access have completed + * if not, fire as many dags as possible */ + +int rf_State_ProcessDAG(RF_RaidAccessDesc_t *desc) +{ + RF_AccessStripeMapHeader_t *asmh = desc->asmap; + RF_Raid_t *raidPtr = desc->raidPtr; + RF_DagHeader_t *dag_h; + int i, j, done = RF_TRUE; + RF_DagList_t *dagArray = desc->dagArray; + RF_Etimer_t timer; + + /* check to see if this is the last dag */ + for (i = 0; i < desc->numStripes; i++) + if (dagArray[i].numDags != dagArray[i].numDagsDone) + done = RF_FALSE; + + if (done) { + if (desc->status) { + /* a dag failed, retry */ + RF_ETIMER_START(timer); + /* free all dags */ + for (i = 0; i < desc->numStripes; i++) { + rf_FreeDAG(desc->dagArray[i].dags); + } + rf_MarkFailuresInASMList(raidPtr, asmh); + /* back up to rf_State_CreateDAG */ + desc->state = desc->state - 2; + return RF_FALSE; + } + else { + /* move on to rf_State_Cleanup */ + desc->state++; + } + return RF_FALSE; + } + else { + /* more dags to execute */ + /* see if any are ready to be fired. if so, fire them */ + /* don't fire the initial dag in a list, it's fired in rf_State_ExecuteDAG */ + for (i = 0; i < desc->numStripes; i++) { + if ((dagArray[i].numDagsDone < dagArray[i].numDags) + && (dagArray[i].numDagsDone == dagArray[i].numDagsFired) + && (dagArray[i].numDagsFired > 0)) { + RF_ETIMER_START(dagArray[i].tracerec.timer); + /* fire next dag in this stripe */ + /* first, skip to next dag awaiting execution */ + dag_h = dagArray[i].dags; + for (j = 0; j < dagArray[i].numDagsDone; j++) + dag_h = dag_h->next; + dagArray[i].numDagsFired++; + /* XXX and again we pass a different function pointer.. GO */ + rf_DispatchDAG(dag_h, (void (*)(void *))rf_ContinueDagAccess, + &dagArray[i]); + } + } + return RF_TRUE; + } +} + +/* only make it this far if all dags complete successfully */ +int rf_State_Cleanup(RF_RaidAccessDesc_t *desc) +{ + RF_AccTraceEntry_t *tracerec = &desc->tracerec; + RF_AccessStripeMapHeader_t *asmh = desc->asmap; + RF_Raid_t *raidPtr = desc->raidPtr; + RF_AccessStripeMap_t *asm_p; + RF_DagHeader_t *dag_h; + RF_Etimer_t timer; + int tid, i; + + desc->state ++; + + rf_get_threadid(tid); + + timer = tracerec->timer; + RF_ETIMER_STOP(timer); + RF_ETIMER_EVAL(timer); + tracerec->specific.user.dag_retry_us = RF_ETIMER_VAL_US(timer); + + /* the RAID I/O is complete. Clean up. 
*/ + tracerec->specific.user.dag_retry_us = 0; + + RF_ETIMER_START(timer); + if (desc->flags & RF_DAG_RETURN_DAG) { + /* copy dags into paramDAG */ + *(desc->paramDAG) = desc->dagArray[0].dags; + dag_h = *(desc->paramDAG); + for (i = 1; i < desc->numStripes; i++) { + /* concatenate dags from remaining stripes */ + RF_ASSERT(dag_h); + while (dag_h->next) + dag_h = dag_h->next; + dag_h->next = desc->dagArray[i].dags; + } + } + else { + /* free all dags */ + for (i = 0; i < desc->numStripes; i++) { + rf_FreeDAG(desc->dagArray[i].dags); + } + } + + RF_ETIMER_STOP(timer); + RF_ETIMER_EVAL(timer); + tracerec->specific.user.cleanup_us = RF_ETIMER_VAL_US(timer); + + RF_ETIMER_START(timer); + if (!(raidPtr->Layout.map->flags & RF_NO_STRIPE_LOCKS)) { + for (asm_p = asmh->stripeMap; asm_p; asm_p = asm_p->next) { + if (!rf_suppressLocksAndLargeWrites && + asm_p->parityInfo && + !(desc->flags&RF_DAG_SUPPRESS_LOCKS)) + { + RF_ASSERT_VALID_LOCKREQ(&asm_p->lockReqDesc); + rf_ReleaseStripeLock(raidPtr->lockTable, asm_p->stripeID, + &asm_p->lockReqDesc); + } + if (asm_p->flags & RF_ASM_FLAGS_RECON_BLOCKED) { + rf_UnblockRecon(raidPtr, asm_p); + } + } + } + +#ifdef SIMULATE + /* refresh current owner in case blocked ios where allowed to run */ + rf_SetCurrentOwner(desc->owner); +#endif /* SIMULATE */ + + RF_ETIMER_STOP(timer); + RF_ETIMER_EVAL(timer); + tracerec->specific.user.lock_us += RF_ETIMER_VAL_US(timer); + + RF_ETIMER_START(timer); + if (desc->flags & RF_DAG_RETURN_ASM) + *(desc->paramASM) = asmh; + else + rf_FreeAccessStripeMap(asmh); + RF_ETIMER_STOP(timer); + RF_ETIMER_EVAL(timer); + tracerec->specific.user.cleanup_us += RF_ETIMER_VAL_US(timer); + + RF_ETIMER_STOP(desc->timer); + RF_ETIMER_EVAL(desc->timer); + + timer = desc->tracerec.tot_timer; + RF_ETIMER_STOP(timer); + RF_ETIMER_EVAL(timer); + desc->tracerec.total_us = RF_ETIMER_VAL_US(timer); + + rf_LogTraceRec(raidPtr, tracerec); + + desc->flags |= RF_DAG_ACCESS_COMPLETE; + + return RF_FALSE; +} diff --git a/sys/dev/raidframe/rf_states.h b/sys/dev/raidframe/rf_states.h new file mode 100644 index 00000000000..2e2895caa5e --- /dev/null +++ b/sys/dev/raidframe/rf_states.h @@ -0,0 +1,70 @@ +/* $OpenBSD: rf_states.h,v 1.1 1999/01/11 14:29:51 niklas Exp $ */ +/* $NetBSD: rf_states.h,v 1.1 1998/11/13 04:20:34 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland, William V. Courtright II, Robby Findler + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. 
+ */ + +/* : + * Log: rf_states.h,v + * Revision 1.5 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.4 1996/05/24 04:28:55 jimz + * release cleanup ckpt + * + * Revision 1.3 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.2 1996/05/06 22:08:28 wvcii + * added copyright info and change log + * + * Revision 1.1 1995/07/06 14:23:39 robby + * Initial revision + * + */ + +#ifndef _RF__RF_STATES_H_ +#define _RF__RF_STATES_H_ + +#include "rf_types.h" + +void rf_ContinueRaidAccess(RF_RaidAccessDesc_t *desc); +void rf_ContinueDagAccess(RF_DagList_t *dagList); +int rf_State_LastState(RF_RaidAccessDesc_t *desc); +int rf_State_IncrAccessCount(RF_RaidAccessDesc_t *desc); +int rf_State_DecrAccessCount(RF_RaidAccessDesc_t *desc); +int rf_State_Quiesce(RF_RaidAccessDesc_t *desc); +int rf_State_Map(RF_RaidAccessDesc_t *desc); +int rf_State_Lock(RF_RaidAccessDesc_t *desc); +int rf_State_CreateDAG(RF_RaidAccessDesc_t *desc); +int rf_State_ExecuteDAG(RF_RaidAccessDesc_t *desc); +int rf_State_ProcessDAG(RF_RaidAccessDesc_t *desc); +int rf_State_Cleanup(RF_RaidAccessDesc_t *desc); + +#endif /* !_RF__RF_STATES_H_ */ diff --git a/sys/dev/raidframe/rf_stripelocks.c b/sys/dev/raidframe/rf_stripelocks.c new file mode 100644 index 00000000000..c9b9502ad70 --- /dev/null +++ b/sys/dev/raidframe/rf_stripelocks.c @@ -0,0 +1,642 @@ +/* $OpenBSD: rf_stripelocks.c,v 1.1 1999/01/11 14:29:51 niklas Exp $ */ +/* $NetBSD: rf_stripelocks.c,v 1.1 1998/11/13 04:20:34 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Authors: Mark Holland, Jim Zelenka + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* : + * Log: rf_stripelocks.c,v + * Revision 1.35 1996/06/10 12:50:57 jimz + * Add counters to freelists to track number of allocations, frees, + * grows, max size, etc. Adjust a couple sets of PRIME params based + * on the results. + * + * Revision 1.34 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). 
+ * + * Revision 1.33 1996/06/09 02:36:46 jimz + * lots of little crufty cleanup- fixup whitespace + * issues, comment #ifdefs, improve typing in some + * places (esp size-related) + * + * Revision 1.32 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.31 1996/06/05 18:06:02 jimz + * Major code cleanup. The Great Renaming is now done. + * Better modularity. Better typing. Fixed a bunch of + * synchronization bugs. Made a lot of global stuff + * per-desc or per-array. Removed dead code. + * + * Revision 1.30 1996/06/03 23:28:26 jimz + * more bugfixes + * check in tree to sync for IPDS runs with current bugfixes + * there still may be a problem with threads in the script test + * getting I/Os stuck- not trivially reproducible (runs ~50 times + * in a row without getting stuck) + * + * Revision 1.29 1996/05/30 23:22:16 jimz + * bugfixes of serialization, timing problems + * more cleanup + * + * Revision 1.28 1996/05/30 11:29:41 jimz + * Numerous bug fixes. Stripe lock release code disagreed with the taking code + * about when stripes should be locked (I made it consistent: no parity, no lock) + * There was a lot of extra serialization of I/Os which I've removed- a lot of + * it was to calculate values for the cache code, which is no longer with us. + * More types, function, macro cleanup. Added code to properly quiesce the array + * on shutdown. Made a lot of stuff array-specific which was (bogusly) general + * before. Fixed memory allocation, freeing bugs. + * + * Revision 1.27 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.26 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.25 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.24 1996/05/20 16:15:00 jimz + * switch to rf_{mutex,cond}_{init,destroy} + * + * Revision 1.23 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.22 1996/05/16 22:28:11 jimz + * misc cleanup + * + * Revision 1.21 1996/05/15 23:39:52 jimz + * remove #if 0 code + * + * Revision 1.20 1996/05/15 23:37:38 jimz + * convert to using RF_FREELIST stuff for StripeLockDesc allocation + * + * Revision 1.19 1996/05/08 18:00:53 jimz + * fix number of args to debug printf + * + * Revision 1.18 1996/05/06 22:33:07 jimz + * added better debug info + * + * Revision 1.17 1996/05/06 22:09:01 wvcii + * added copyright info and change log + * + */ + +/* + * stripelocks.c -- code to lock stripes for read and write access + * + * The code distinguishes between read locks and write locks. There can be + * as many readers to given stripe as desired. When a write request comes + * in, no further readers are allowed to enter, and all subsequent requests + * are queued in FIFO order. When a the number of readers goes to zero, the + * writer is given the lock. When a writer releases the lock, the list of + * queued requests is scanned, and all readersq up to the next writer are + * given the lock. + * + * The lock table size must be one less than a power of two, but HASH_STRIPEID + * is the only function that requires this. + * + * The code now supports "range locks". 
When you ask to lock a stripe, you + * specify a range of addresses in that stripe that you want to lock. When + * you acquire the lock, you've locked only this range of addresses, and + * other threads can concurrently read/write any non-overlapping portions + * of the stripe. The "addresses" that you lock are abstract in that you + * can pass in anything you like. The expectation is that you'll pass in + * the range of physical disk offsets of the parity bits you're planning + * to update. The idea behind this, of course, is to allow sub-stripe + * locking. The implementation is perhaps not the best imaginable; in the + * worst case a lock release is O(n^2) in the total number of outstanding + * requests to a given stripe. Note that if you're striping with a + * stripe unit size equal to an entire disk (i.e. not striping), there will + * be only one stripe and you may spend some significant number of cycles + * searching through stripe lock descriptors. + */ + +#ifdef _KERNEL +#define KERNEL +#endif + +#include "rf_types.h" +#include "rf_raid.h" +#include "rf_stripelocks.h" +#include "rf_alloclist.h" +#include "rf_threadid.h" +#include "rf_general.h" +#include "rf_freelist.h" +#include "rf_debugprint.h" +#include "rf_driver.h" +#include "rf_shutdown.h" + +#define Dprintf1(s,a) rf_debug_printf(s,(void *)((unsigned long)a),NULL,NULL,NULL,NULL,NULL,NULL,NULL) +#define Dprintf2(s,a,b) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),NULL,NULL,NULL,NULL,NULL,NULL) +#define Dprintf3(s,a,b,c) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),NULL,NULL,NULL,NULL,NULL) +#define Dprintf4(s,a,b,c,d) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),NULL,NULL,NULL,NULL) +#define Dprintf5(s,a,b,c,d,e) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),NULL,NULL,NULL) +#define Dprintf6(s,a,b,c,d,e,f) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),(void *)((unsigned long)f),NULL,NULL) +#define Dprintf7(s,a,b,c,d,e,f,g) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),(void *)((unsigned long)f),(void *)((unsigned long)g),NULL) +#define Dprintf8(s,a,b,c,d,e,f,g,h) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),(void *)((unsigned long)f),(void *)((unsigned long)g),(void *)((unsigned long)h)) + +#ifndef KERNEL +#define FLUSH fflush(stdout) +#else /* !KERNEL */ +#define FLUSH +#endif /* !KERNEL */ + +#define HASH_STRIPEID(_sid_) ( (_sid_) & (rf_lockTableSize-1) ) +#define MAX_FREELIST 100 + +static void AddToWaitersQueue(RF_LockTableEntry_t *lockTable, RF_StripeLockDesc_t *lockDesc, RF_LockReqDesc_t *lockReqDesc); +static RF_StripeLockDesc_t *AllocStripeLockDesc(RF_StripeNum_t stripeID); +static void FreeStripeLockDesc(RF_StripeLockDesc_t *p); +static void PrintLockedStripes(RF_LockTableEntry_t *lockTable); + +/* determines if two ranges overlap. 
always yields false if either start value is negative */ +#define SINGLE_RANGE_OVERLAP(_strt1, _stop1, _strt2, _stop2) \ + ( (_strt1 >= 0) && (_strt2 >= 0) && (RF_MAX(_strt1, _strt2) <= RF_MIN(_stop1, _stop2)) ) + +/* determines if any of the ranges specified in the two lock descriptors overlap each other */ +#define RANGE_OVERLAP(_cand, _pred) \ + ( SINGLE_RANGE_OVERLAP((_cand)->start, (_cand)->stop, (_pred)->start, (_pred)->stop ) || \ + SINGLE_RANGE_OVERLAP((_cand)->start2, (_cand)->stop2, (_pred)->start, (_pred)->stop ) || \ + SINGLE_RANGE_OVERLAP((_cand)->start, (_cand)->stop, (_pred)->start2, (_pred)->stop2) || \ + SINGLE_RANGE_OVERLAP((_cand)->start2, (_cand)->stop2, (_pred)->start2, (_pred)->stop2) ) + +/* Determines if a candidate lock request conflicts with a predecessor lock req. + * Note that the arguments are not interchangeable. + * The rules are: + * a candidate read conflicts with a predecessor write if any ranges overlap + * a candidate write conflicts with a predecessor read if any ranges overlap + * a candidate write conflicts with a predecessor write if any ranges overlap + */ +#define STRIPELOCK_CONFLICT(_cand, _pred) \ + RANGE_OVERLAP((_cand), (_pred)) && \ + ( ( (((_cand)->type == RF_IO_TYPE_READ) && ((_pred)->type == RF_IO_TYPE_WRITE)) || \ + (((_cand)->type == RF_IO_TYPE_WRITE) && ((_pred)->type == RF_IO_TYPE_READ)) || \ + (((_cand)->type == RF_IO_TYPE_WRITE) && ((_pred)->type == RF_IO_TYPE_WRITE)) \ + ) \ + ) + +static RF_FreeList_t *rf_stripelock_freelist; +#define RF_MAX_FREE_STRIPELOCK 128 +#define RF_STRIPELOCK_INC 8 +#define RF_STRIPELOCK_INITIAL 32 + +static void rf_ShutdownStripeLockFreeList(void *); +static void rf_RaidShutdownStripeLocks(void *); + +static void rf_ShutdownStripeLockFreeList(ignored) + void *ignored; +{ + RF_FREELIST_DESTROY(rf_stripelock_freelist,next,(RF_StripeLockDesc_t *)); +} + +int rf_ConfigureStripeLockFreeList(listp) + RF_ShutdownList_t **listp; +{ + unsigned mask; + int rc; + + RF_FREELIST_CREATE(rf_stripelock_freelist, RF_MAX_FREE_STRIPELOCK, + RF_STRIPELOCK_INITIAL,sizeof(RF_StripeLockDesc_t)); + rc = rf_ShutdownCreate(listp, rf_ShutdownStripeLockFreeList, NULL); + if (rc) { + RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n", + __FILE__, __LINE__, rc); + rf_ShutdownStripeLockFreeList(NULL); + return(rc); + } + RF_FREELIST_PRIME(rf_stripelock_freelist,RF_STRIPELOCK_INITIAL,next, + (RF_StripeLockDesc_t *)); + for (mask=0x1; mask; mask<<=1) + if (rf_lockTableSize==mask) + break; + if (!mask) { + printf("[WARNING: lock table size must be a power of two. 
Setting to %d.]\n",RF_DEFAULT_LOCK_TABLE_SIZE); + rf_lockTableSize = RF_DEFAULT_LOCK_TABLE_SIZE; + } + return(0); +} + +RF_LockTableEntry_t *rf_MakeLockTable() +{ + RF_LockTableEntry_t *lockTable; + int i, rc; + + RF_Calloc(lockTable, ((int) rf_lockTableSize), sizeof(RF_LockTableEntry_t), (RF_LockTableEntry_t *)); + if (lockTable == NULL) + return(NULL); + for (i=0; i<rf_lockTableSize; i++) { + rc = rf_mutex_init(&lockTable[i].mutex); + if (rc) { + RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__, + __LINE__, rc); + /* XXX clean up other mutexes */ + return(NULL); + } + } + return(lockTable); +} + +void rf_ShutdownStripeLocks(RF_LockTableEntry_t *lockTable) +{ + int i; + + if (rf_stripeLockDebug) { + PrintLockedStripes(lockTable); + } + for (i=0; i<rf_lockTableSize; i++) { + rf_mutex_destroy(&lockTable[i].mutex); + } + RF_Free(lockTable, rf_lockTableSize*sizeof(RF_LockTableEntry_t)); +} + +static void rf_RaidShutdownStripeLocks(arg) + void *arg; +{ + RF_Raid_t *raidPtr = (RF_Raid_t *)arg; + rf_ShutdownStripeLocks(raidPtr->lockTable); +} + +int rf_ConfigureStripeLocks( + RF_ShutdownList_t **listp, + RF_Raid_t *raidPtr, + RF_Config_t *cfgPtr) +{ + int rc; + + raidPtr->lockTable = rf_MakeLockTable(); + if (raidPtr->lockTable == NULL) + return(ENOMEM); + rc = rf_ShutdownCreate(listp, rf_RaidShutdownStripeLocks, raidPtr); + if (rc) { + RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n", + __FILE__, __LINE__, rc); + rf_ShutdownStripeLocks(raidPtr->lockTable); + return(rc); + } + return(0); +} + +/* returns 0 if you've got the lock, and non-zero if you have to wait. + * if and only if you have to wait, we'll cause cbFunc to get invoked + * with cbArg when you are granted the lock. We store a tag in *releaseTag + * that you need to give back to us when you release the lock. 
+ */
+int rf_AcquireStripeLock(
+ RF_LockTableEntry_t *lockTable,
+ RF_StripeNum_t stripeID,
+ RF_LockReqDesc_t *lockReqDesc)
+{
+ RF_StripeLockDesc_t *lockDesc;
+ RF_LockReqDesc_t *p;
+ int tid=0, hashval = HASH_STRIPEID(stripeID);
+ int retcode = 0;
+
+ RF_ASSERT(RF_IO_IS_R_OR_W(lockReqDesc->type));
+
+ if (rf_stripeLockDebug) {
+ rf_get_threadid(tid);
+ if (stripeID == -1) Dprintf1("[%d] Lock acquisition suppressed (stripeID == -1)\n",tid);
+ else {
+ Dprintf8("[%d] Trying to acquire stripe lock table 0x%lx SID %ld type %c range %ld-%ld, range2 %ld-%ld\n",
+ tid, (unsigned long) lockTable, stripeID, lockReqDesc->type, lockReqDesc->start,
+ lockReqDesc->stop, lockReqDesc->start2, lockReqDesc->stop2);
+ Dprintf3("[%d] lock %ld hashval %d\n", tid, stripeID, hashval);
+ FLUSH;
+ }
+ }
+ if (stripeID == -1) return(0);
+ lockReqDesc->next = NULL; /* just to be sure */
+
+ RF_LOCK_MUTEX(lockTable[hashval].mutex);
+ for (lockDesc = lockTable[hashval].descList; lockDesc; lockDesc=lockDesc->next) {
+ if (lockDesc->stripeID == stripeID) break;
+ }
+
+ if (!lockDesc) { /* no entry in table => no one reading or writing */
+ lockDesc = AllocStripeLockDesc(stripeID);
+ lockDesc->next = lockTable[hashval].descList;
+ lockTable[hashval].descList = lockDesc;
+ if (lockReqDesc->type == RF_IO_TYPE_WRITE) lockDesc->nWriters++;
+ lockDesc->granted = lockReqDesc;
+ if (rf_stripeLockDebug) {Dprintf7("[%d] no one waiting: lock %ld %c %ld-%ld %ld-%ld granted\n",
+ tid,stripeID,lockReqDesc->type,lockReqDesc->start,lockReqDesc->stop,lockReqDesc->start2,lockReqDesc->stop2); FLUSH;}
+ } else {
+
+ if (lockReqDesc->type == RF_IO_TYPE_WRITE) lockDesc->nWriters++;
+
+ if (lockDesc->nWriters == 0) { /* no need to search any lists if there are no writers anywhere */
+ lockReqDesc->next = lockDesc->granted;
+ lockDesc->granted = lockReqDesc;
+ if (rf_stripeLockDebug) {Dprintf7("[%d] no writers: lock %ld %c %ld-%ld %ld-%ld granted\n",
+ tid,stripeID,lockReqDesc->type,lockReqDesc->start,lockReqDesc->stop,lockReqDesc->start2,lockReqDesc->stop2); FLUSH;}
+ } else {
+
+ /* search the granted & waiting lists for a conflict.
stop searching as soon as we find one */
+ retcode = 0;
+ for (p = lockDesc->granted; p; p=p->next) if (STRIPELOCK_CONFLICT(lockReqDesc, p)) {retcode = 1; break;}
+ if (!retcode) for (p = lockDesc->waitersH; p; p=p->next) if (STRIPELOCK_CONFLICT(lockReqDesc, p)) {retcode = 2; break;}
+
+ if (!retcode) {
+ lockReqDesc->next = lockDesc->granted; /* no conflicts found => grant lock */
+ lockDesc->granted = lockReqDesc;
+ if (rf_stripeLockDebug) {
+ Dprintf7("[%d] no conflicts: lock %ld %c %ld-%ld %ld-%ld granted\n",
+ tid,stripeID,lockReqDesc->type,lockReqDesc->start,lockReqDesc->stop,
+ lockReqDesc->start2,lockReqDesc->stop2);
+ FLUSH;
+ }
+ } else {
+ if (rf_stripeLockDebug) {
+ Dprintf6("[%d] conflict: lock %ld %c %ld-%ld hashval=%d not granted\n",
+ tid,stripeID,lockReqDesc->type,lockReqDesc->start,lockReqDesc->stop,
+ hashval);
+ Dprintf3("[%d] lock %ld retcode=%d\n", tid, stripeID, retcode);
+ FLUSH;
+ }
+ AddToWaitersQueue(lockTable, lockDesc, lockReqDesc); /* conflict => the current access must wait */
+ }
+ }
+ }
+
+ RF_UNLOCK_MUTEX(lockTable[hashval].mutex);
+ return(retcode);
+}
+
+void rf_ReleaseStripeLock(
+ RF_LockTableEntry_t *lockTable,
+ RF_StripeNum_t stripeID,
+ RF_LockReqDesc_t *lockReqDesc)
+{
+ RF_StripeLockDesc_t *lockDesc, *ld_t;
+ RF_LockReqDesc_t *lr, *lr_t, *callbacklist, *t;
+ RF_IoType_t type = lockReqDesc->type;
+ int tid=0, hashval = HASH_STRIPEID(stripeID);
+ int release_it, consider_it;
+ RF_LockReqDesc_t *candidate, *candidate_t, *predecessor;
+
+ RF_ASSERT(RF_IO_IS_R_OR_W(type));
+
+ if (rf_stripeLockDebug) {
+ rf_get_threadid(tid);
+ if (stripeID == -1) Dprintf1("[%d] Lock release suppressed (stripeID == -1)\n",tid);
+ else {Dprintf8("[%d] Releasing stripe lock on stripe ID %ld, type %c range %ld-%ld %ld-%ld table 0x%lx\n",
+ tid,stripeID,lockReqDesc->type,lockReqDesc->start,lockReqDesc->stop,lockReqDesc->start2,lockReqDesc->stop2, lockTable); FLUSH;}
+ }
+
+ if (stripeID == -1) return;
+
+ RF_LOCK_MUTEX(lockTable[hashval].mutex);
+
+ /* find the stripe lock descriptor */
+ for (ld_t = NULL, lockDesc = lockTable[hashval].descList; lockDesc; ld_t = lockDesc, lockDesc=lockDesc->next) {
+ if (lockDesc->stripeID == stripeID) break;
+ }
+ RF_ASSERT(lockDesc); /* major error to release a lock that doesn't exist */
+
+ /* find the stripe lock request descriptor & delete it from the list */
+ for (lr_t = NULL, lr = lockDesc->granted; lr; lr_t = lr, lr=lr->next) if (lr == lockReqDesc) break;
+
+ RF_ASSERT(lr && (lr == lockReqDesc)); /* major error to release a lock that hasn't been granted */
+ if (lr_t) lr_t->next = lr->next; else {
+ RF_ASSERT(lr == lockDesc->granted);
+ lockDesc->granted = lr->next;
+ }
+ lr->next = NULL;
+
+ if (lockReqDesc->type == RF_IO_TYPE_WRITE) lockDesc->nWriters--;
+
+ /* search through the waiters list to see if anyone needs to be woken up.
+ * for each such descriptor in the wait list, we check it against everything granted and against
+ * everything _in front_ of it in the waiters queue. If it conflicts with none of these, we release it.
+ *
+ * DON'T TOUCH THE TEMPLINK POINTER OF ANYTHING IN THE GRANTED LIST HERE. This will roach the case where
+ * the callback tries to acquire a new lock in the same stripe. There are some asserts to try and detect this.
+ *
+ * We apply 2 performance optimizations:
+ * (1) if releasing this lock results in no more writers to this stripe, we just release everybody waiting,
+ * since we place no restrictions on the number of concurrent reads.
+ * (2) we consider as candidates for wakeup only those waiters that have a range overlap with either + * the descriptor being woken up or with something in the callbacklist (i.e. something we've just now woken up). + * This allows us to avoid the long evaluation for some descriptors. + */ + + callbacklist = NULL; + if (lockDesc->nWriters == 0) { /* performance tweak (1) */ + while (lockDesc->waitersH) { + + lr = lockDesc->waitersH; /* delete from waiters list */ + lockDesc->waitersH = lr->next; + + RF_ASSERT(lr->type == RF_IO_TYPE_READ); + + lr->next = lockDesc->granted; /* add to granted list */ + lockDesc->granted = lr; + + RF_ASSERT(!lr->templink); + lr->templink = callbacklist; /* put on callback list so that we'll invoke callback below */ + callbacklist = lr; + if (rf_stripeLockDebug) {Dprintf8("[%d] No writers: granting lock stripe ID %ld, type %c range %ld-%ld %ld-%ld table 0x%lx\n", + tid,stripeID,lr->type,lr->start,lr->stop,lr->start2,lr->stop2,(unsigned long) lockTable); FLUSH;} + } + lockDesc->waitersT = NULL; /* we've purged the whole waiters list */ + + } else for (candidate_t = NULL, candidate = lockDesc->waitersH; candidate; ) { + + /* performance tweak (2) */ + consider_it = 0; + if (RANGE_OVERLAP(lockReqDesc, candidate)) consider_it = 1; + else for (t = callbacklist; t; t=t->templink) if (RANGE_OVERLAP(t, candidate)) { + consider_it = 1; + break; + } + if (!consider_it) { + if (rf_stripeLockDebug) {Dprintf8("[%d] No overlap: rejecting candidate stripeID %ld, type %c range %ld-%ld %ld-%ld table 0x%lx\n", + tid, stripeID, candidate->type, candidate->start, candidate->stop, candidate->start2, candidate->stop2, + (unsigned long) lockTable); FLUSH;} + candidate_t = candidate; candidate = candidate->next; + continue; + } + + + /* we have a candidate for release. 
check to make sure it is not blocked by any granted locks */
+ release_it = 1;
+ for (predecessor = lockDesc->granted; predecessor; predecessor = predecessor->next) {
+ if (STRIPELOCK_CONFLICT(candidate, predecessor)) {
+ if (rf_stripeLockDebug) {
+ Dprintf8("[%d] Conflicts with granted lock: rejecting candidate stripeID %ld, type %c range %ld-%ld %ld-%ld table 0x%lx\n",
+ tid, stripeID, candidate->type, candidate->start, candidate->stop, candidate->start2, candidate->stop2,
+ (unsigned long) lockTable); FLUSH;
+ }
+ release_it = 0; break;
+ }
+ }
+
+ /* now check to see if the candidate is blocked by any waiters that occur before it in the wait queue */
+ if (release_it) for (predecessor = lockDesc->waitersH; predecessor != candidate; predecessor = predecessor->next) {
+ if (STRIPELOCK_CONFLICT(candidate, predecessor)) {
+ if (rf_stripeLockDebug) {
+ Dprintf8("[%d] Conflicts with waiting lock: rejecting candidate stripeID %ld, type %c range %ld-%ld %ld-%ld table 0x%lx\n",
+ tid, stripeID, candidate->type, candidate->start, candidate->stop, candidate->start2, candidate->stop2,
+ (unsigned long) lockTable); FLUSH;
+ }
+ release_it = 0; break;
+ }
+ }
+
+ /* release it if indicated */
+ if (release_it) {
+ if (rf_stripeLockDebug) {Dprintf8("[%d] Granting lock to candidate stripeID %ld, type %c range %ld-%ld %ld-%ld table 0x%lx\n",
+ tid, stripeID, candidate->type, candidate->start, candidate->stop, candidate->start2, candidate->stop2,
+ (unsigned long) lockTable); FLUSH;}
+ if (candidate_t) {
+ candidate_t->next = candidate->next;
+ if (lockDesc->waitersT == candidate) lockDesc->waitersT = candidate_t; /* cannot be waitersH since candidate_t is not NULL */
+ } else {
+ RF_ASSERT(candidate == lockDesc->waitersH);
+ lockDesc->waitersH = lockDesc->waitersH->next;
+ if (!lockDesc->waitersH) lockDesc->waitersT = NULL;
+ }
+ candidate->next = lockDesc->granted; /* move it to the granted list */
+ lockDesc->granted = candidate;
+
+ RF_ASSERT(!candidate->templink);
+ candidate->templink = callbacklist; /* put it on the list of things to be called after we release the mutex */
+ callbacklist = candidate;
+
+ if (!candidate_t) candidate = lockDesc->waitersH; else candidate = candidate_t->next; /* continue with the rest of the list */
+ } else {
+ candidate_t = candidate; candidate = candidate->next; /* continue with the rest of the list */
+ }
+ }
+
+ /* delete the descriptor if no one is waiting or active */
+ if (!lockDesc->granted && !lockDesc->waitersH) {
+ RF_ASSERT(lockDesc->nWriters == 0);
+ if (rf_stripeLockDebug) {
+ Dprintf3("[%d] Last lock released (table 0x%lx): deleting desc for stripeID %ld\n",tid,(unsigned long) lockTable, stripeID); FLUSH;
+ }
+ if (ld_t) ld_t->next = lockDesc->next; else {
+ RF_ASSERT(lockDesc == lockTable[hashval].descList);
+ lockTable[hashval].descList = lockDesc->next;
+ }
+ FreeStripeLockDesc(lockDesc);
+ lockDesc = NULL; /* only for the ASSERT below */
+ }
+
+ RF_UNLOCK_MUTEX(lockTable[hashval].mutex);
+
+ /* now that we've unlocked the mutex, invoke the callback on all the descriptors in the list */
+ RF_ASSERT(!( (callbacklist) && (!lockDesc) )); /* if we deleted the descriptor, we should have no callbacks to do */
+ for (candidate = callbacklist; candidate; ) {
+ t = candidate;
+ candidate = candidate->templink;
+ t->templink = NULL;
+ (t->cbFunc)(t->cbArg);
+ }
+}
+
+/* must have the indicated lock table mutex upon entry */
+static void AddToWaitersQueue(
+ RF_LockTableEntry_t *lockTable,
+ RF_StripeLockDesc_t *lockDesc,
+ RF_LockReqDesc_t *lockReqDesc)
+{
+ 
int tid; + + if (rf_stripeLockDebug) { + rf_get_threadid(tid); + Dprintf3("[%d] Waiting on lock for stripe %ld table 0x%lx\n", tid, lockDesc->stripeID, (unsigned long) lockTable); FLUSH; + } + if (!lockDesc->waitersH) { + lockDesc->waitersH = lockDesc->waitersT = lockReqDesc; + } else { + lockDesc->waitersT->next = lockReqDesc; + lockDesc->waitersT = lockReqDesc; + } +} + +static RF_StripeLockDesc_t *AllocStripeLockDesc(RF_StripeNum_t stripeID) +{ + RF_StripeLockDesc_t *p; + + RF_FREELIST_GET(rf_stripelock_freelist,p,next,(RF_StripeLockDesc_t *)); + if (p) { + p->stripeID = stripeID; + } + return(p); +} + +static void FreeStripeLockDesc(RF_StripeLockDesc_t *p) +{ + RF_FREELIST_FREE(rf_stripelock_freelist,p,next); +} + +static void PrintLockedStripes(lockTable) + RF_LockTableEntry_t *lockTable; +{ + int i, j, foundone = 0, did; + RF_StripeLockDesc_t *p; + RF_LockReqDesc_t *q; + + RF_LOCK_MUTEX(rf_printf_mutex); + printf("Locked stripes:\n"); + for (i=0; i<rf_lockTableSize; i++) if (lockTable[i].descList) { + foundone = 1; + for (p = lockTable[i].descList; p; p=p->next) { + printf("Stripe ID 0x%lx (%d) nWriters %d\n", + (long)p->stripeID, (int)p->stripeID, p->nWriters); + + if (! (p->granted) ) printf("Granted: (none)\n"); else printf("Granted:\n"); + for (did=1,j=0,q = p->granted; q; j++,q=q->next) { + printf(" %c(%ld-%ld",q->type,(long)q->start,(long)q->stop); + if (q->start2 != -1) printf(",%ld-%ld) ",(long)q->start2, + (long)q->stop2); else printf(") "); + if (j && !(j%4)) {printf("\n"); did=1;} else did=0; + } + if (!did) printf("\n"); + + if (! (p->waitersH) ) printf("Waiting: (none)\n"); else printf("Waiting:\n"); + for (did=1,j=0,q = p->waitersH; q; j++,q=q->next) { + printf("%c(%ld-%ld",q->type,(long)q->start,(long)q->stop); + if (q->start2 != -1) printf(",%ld-%ld) ",(long)q->start2,(long)q->stop2); else printf(") "); + if (j && !(j%4)) {printf("\n "); did=1;} else did=0; + } + if (!did) printf("\n"); + } + } + if (!foundone) printf("(none)\n"); else printf("\n"); + RF_UNLOCK_MUTEX(rf_printf_mutex); +} diff --git a/sys/dev/raidframe/rf_stripelocks.h b/sys/dev/raidframe/rf_stripelocks.h new file mode 100644 index 00000000000..46412504247 --- /dev/null +++ b/sys/dev/raidframe/rf_stripelocks.h @@ -0,0 +1,170 @@ +/* $OpenBSD: rf_stripelocks.h,v 1.1 1999/01/11 14:29:51 niklas Exp $ */ +/* $NetBSD: rf_stripelocks.h,v 1.1 1998/11/13 04:20:34 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. 
+ */ + +/* : + * Log: rf_stripelocks.h,v + * Revision 1.22 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). + * + * Revision 1.21 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.20 1996/06/05 18:06:02 jimz + * Major code cleanup. The Great Renaming is now done. + * Better modularity. Better typing. Fixed a bunch of + * synchronization bugs. Made a lot of global stuff + * per-desc or per-array. Removed dead code. + * + * Revision 1.19 1996/05/30 11:29:41 jimz + * Numerous bug fixes. Stripe lock release code disagreed with the taking code + * about when stripes should be locked (I made it consistent: no parity, no lock) + * There was a lot of extra serialization of I/Os which I've removed- a lot of + * it was to calculate values for the cache code, which is no longer with us. + * More types, function, macro cleanup. Added code to properly quiesce the array + * on shutdown. Made a lot of stuff array-specific which was (bogusly) general + * before. Fixed memory allocation, freeing bugs. + * + * Revision 1.18 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.17 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.16 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.15 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.14 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.13 1996/05/06 22:08:46 wvcii + * added copyright info and change log + * + */ + +/***************************************************************************** + * + * stripelocks.h -- header file for locking stripes + * + * Note that these functions are called from the execution routines of certain + * DAG Nodes, and so they must be NON-BLOCKING to assure maximum parallelism + * in the DAG. Accordingly, when a node wants to acquire a lock, it calls + * AcquireStripeLock, supplying a pointer to a callback function. If the lock + * is free at the time of the call, 0 is returned, indicating that the lock + * has been acquired. If the lock is not free, 1 is returned, and a copy of + * the function pointer and argument are held in the lock table. When the + * lock becomes free, the callback function is invoked. 
+ * + *****************************************************************************/ + +#ifndef _RF__RF_STRIPELOCKS_H_ +#define _RF__RF_STRIPELOCKS_H_ + +#include <sys/buf.h> + +#include "rf_types.h" +#include "rf_threadstuff.h" +#include "rf_general.h" + +struct RF_LockReqDesc_s { + RF_IoType_t type; /* read or write */ + RF_int64 start, stop; /* start and end of range to be locked */ + RF_int64 start2, stop2; /* start and end of 2nd range to be locked */ + void (*cbFunc)(struct buf *);/* callback function */ + void *cbArg; /* argument to callback function */ + RF_LockReqDesc_t *next; /* next element in chain */ + RF_LockReqDesc_t *templink; /* for making short-lived lists of request descriptors */ +}; + +#define RF_ASSERT_VALID_LOCKREQ(_lr_) { \ + RF_ASSERT(RF_IO_IS_R_OR_W((_lr_)->type)); \ +} + +struct RF_StripeLockDesc_s { + RF_StripeNum_t stripeID; /* the stripe ID */ + RF_LockReqDesc_t *granted; /* unordered list of granted requests */ + RF_LockReqDesc_t *waitersH; /* FIFO queue of all waiting reqs, both read and write (Head and Tail) */ + RF_LockReqDesc_t *waitersT; + int nWriters; /* number of writers either granted or waiting */ + RF_StripeLockDesc_t *next; /* for hash table collision resolution */ +}; + +struct RF_LockTableEntry_s { + RF_DECLARE_MUTEX(mutex) /* mutex on this hash chain */ + RF_StripeLockDesc_t *descList; /* hash chain of lock descriptors */ +}; + +/* + * Initializes a stripe lock descriptor. _defSize is the number of sectors + * that we lock when there is no parity information in the ASM (e.g. RAID0). + */ + +#define RF_INIT_LOCK_REQ_DESC(_lrd, _typ, _cbf, _cba, _asm, _defSize) \ + { \ + (_lrd).type = _typ; \ + (_lrd).start2 = -1; \ + (_lrd).stop2 = -1; \ + if ((_asm)->parityInfo) { \ + (_lrd).start = (_asm)->parityInfo->startSector; \ + (_lrd).stop = (_asm)->parityInfo->startSector + (_asm)->parityInfo->numSector-1; \ + if ((_asm)->parityInfo->next) { \ + (_lrd).start2 = (_asm)->parityInfo->next->startSector; \ + (_lrd).stop2 = (_asm)->parityInfo->next->startSector + (_asm)->parityInfo->next->numSector-1; \ + } \ + } else { \ + (_lrd).start = 0; \ + (_lrd).stop = (_defSize); \ + } \ + (_lrd).templink= NULL; \ + (_lrd).cbFunc = (_cbf); \ + (_lrd).cbArg = (void *) (_cba); \ + } + +int rf_ConfigureStripeLockFreeList(RF_ShutdownList_t **listp); +RF_LockTableEntry_t *rf_MakeLockTable(void); +void rf_ShutdownStripeLocks(RF_LockTableEntry_t *lockTable); +int rf_ConfigureStripeLocks(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr, + RF_Config_t *cfgPtr); +int rf_AcquireStripeLock(RF_LockTableEntry_t *lockTable, + RF_StripeNum_t stripeID, RF_LockReqDesc_t *lockReqDesc); +void rf_ReleaseStripeLock(RF_LockTableEntry_t *lockTable, + RF_StripeNum_t stripeID, RF_LockReqDesc_t *lockReqDesc); + +#endif /* !_RF__RF_STRIPELOCKS_H_ */ diff --git a/sys/dev/raidframe/rf_strutils.c b/sys/dev/raidframe/rf_strutils.c new file mode 100644 index 00000000000..1c42b6b6b56 --- /dev/null +++ b/sys/dev/raidframe/rf_strutils.c @@ -0,0 +1,62 @@ +/* $OpenBSD: rf_strutils.c,v 1.1 1999/01/11 14:29:51 niklas Exp $ */ +/* $NetBSD: rf_strutils.c,v 1.1 1998/11/13 04:20:35 oster Exp $ */ +/* + * rf_strutils.c + * + * String-parsing funcs + */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. 
+ * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ +/* + * rf_strutils.c -- some simple utilities for munging on strings. + * I put them in a file by themselves because they're needed in + * setconfig, in the user-level driver, and in the kernel. + * + * : + * Log: rf_strutils.c,v + * Revision 1.2 1996/06/02 17:31:48 jimz + * Moved a lot of global stuff into array structure, where it belongs. + * Fixed up paritylogging, pss modules in this manner. Some general + * code cleanup. Removed lots of dead code, some dead files. + * + */ + +#include "rf_utils.h" + +/* finds a non-white character in the line */ +char *rf_find_non_white(char *p) +{ + for (; *p != '\0' && (*p == ' ' || *p == '\t'); p++); + return(p); +} + +/* finds a white character in the line */ +char *rf_find_white(char *p) +{ + for (; *p != '\0' && (*p != ' ' && *p != '\t'); p++); + return(p); +} diff --git a/sys/dev/raidframe/rf_sys.c b/sys/dev/raidframe/rf_sys.c new file mode 100644 index 00000000000..e6eb17bb7ef --- /dev/null +++ b/sys/dev/raidframe/rf_sys.c @@ -0,0 +1,260 @@ +/* $OpenBSD: rf_sys.c,v 1.1 1999/01/11 14:29:53 niklas Exp $ */ +/* $NetBSD: rf_sys.c,v 1.1 1998/11/13 04:20:35 oster Exp $ */ +/* + * rf_sys.c + * + * Jim Zelenka, CMU/SCS, 14 June 1996 + */ +/* + * Copyright (c) 1996 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Jim Zelenka + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. 
+ */ + +#ifdef _KERNEL +#define KERNEL +#endif + +#include "rf_types.h" +#include "rf_sys.h" +#ifndef KERNEL +#include <errno.h> +#include <fcntl.h> +#include <nlist.h> +#include <stdio.h> +#include <unistd.h> +#endif /* !KERNEL */ +#include <sys/param.h> +#if !defined(sun) && !defined(__NetBSD__) && !defined(__OpenBSD__) && !defined(LINUX) && (!defined(MACH) || defined(__osf__)) +#include <sys/sysinfo.h> +#endif /* !sun && !__NetBSD__ && !__OpenBSD__ && !LINUX && (!MACH || __osf__) */ +#include <sys/time.h> +#ifdef __osf__ +#include <machine/rpb.h> +#include <machine/hal/hal_sysinfo.h> +#endif /* __osf__ */ +#include "rf_etimer.h" +#include "rf_general.h" +#include "rf_threadstuff.h" + +#ifdef KERNEL +extern struct rpb *rpb; +#endif /* KERNEL */ + +/* timer stuff */ +#ifdef __alpha +long rf_timer_max_val; +long rf_timer_ticks_per_second; +unsigned long rf_timer_ticks_per_usec; +#endif /* __alpha */ + + +#if defined(__NetBSD__) || defined(__OpenBSD__) +long rf_timer_max_val; +long rf_timer_ticks_per_second; +unsigned long rf_timer_ticks_per_usec; +#endif /* __NetBSD__ || __OpenBSD__ */ + +#if !defined(KERNEL) && !defined(SIMULATE) && (RF_UTILITY == 0) +pthread_attr_t raidframe_attr_default; + +int rf_thread_create( + RF_Thread_t *thread, + pthread_attr_t attr, + void (*func)(), + RF_ThreadArg_t arg) +{ + int rc; + +#ifdef __osf__ + rc = pthread_create(thread, attr, (pthread_startroutine_t)func, arg); +#endif /* __osf__ */ +#ifdef AIX + rc = pthread_create(thread, &attr, (void *(*)(void *))func, arg); +#endif /* AIX */ + if (rc) + return(errno); + rc = pthread_detach(thread); + if (rc) { + /* don't return error, because the thread exists, and must be cleaned up */ + RF_ERRORMSG1("RAIDFRAME WARNING: failed detaching thread %lx\n", thread); + } + return(0); +} +#endif /* !KERNEL && !SIMULATE && (RF_UTILITY == 0) */ + +#if defined(__osf__) && !defined(KERNEL) +int rf_get_cpu_ticks_per_sec(long *ticksp) +{ + char *kmemdevname, buf[sizeof(struct rpb)+8]; + char *memdevname, kernel_name[MAXPATHLEN+1]; + struct nlist nl[2], *np; + unsigned long rpb_addr; + int kfd, rc, fd, bad; + struct rpb rpb; + off_t off; + + kmemdevname = "/dev/kmem"; + memdevname = "/dev/mem"; + + np = &nl[0]; + bzero((char *)np, sizeof(nl)); + nl[0].n_name = "pmap_physhwrpb"; + nl[1].n_name = NULL; + + bad = 0; + + /* get running kernel name */ + bzero(kernel_name, MAXPATHLEN+1); + kernel_name[0] = '/'; + rc = getsysinfo(GSI_BOOTEDFILE, &kernel_name[1], MAXPATHLEN, 0, 0); + if (rc != 1) { + RF_ERRORMSG("RAIDFRAME: cannot get booted kernel name\n"); + if (errno) + return(errno); + else + return(EIO); + } + + rc = nlist(kernel_name, np); + if (rc) { + RF_ERRORMSG1("RAIDFRAME: cannot nlist %s\n", kernel_name); + return(EIO); + } + + if (np->n_type == 0) { + RF_ERRORMSG1("RAIDFRAME: cannot usefully nlist %s\n", kernel_name); + return(EIO); + } + + kfd = open(kmemdevname, O_RDONLY); + if (kfd < 0) { + perror(kmemdevname); + return(errno); + } + fd = open(memdevname, O_RDONLY); + if (fd < 0) { + perror(kmemdevname); + return(errno); + } + + /* + * pmap_physhwrpb is a variable in the kernel containing the physical + * address of the hardware RPB. We'll just find that variable and + * read it, then use that as a physical memory address to read the + * rpb itself. 
+ */ + + off = lseek(kfd, np->n_value, SEEK_SET); + if (off != np->n_value) { + RF_ERRORMSG("RAIDFRAME: cannot seek to address of hwrpb addr\n"); + return(EIO); + } + + rc = read(kfd, &rpb_addr, sizeof(rpb_addr)); + if (rc != sizeof(rpb_addr)) { + RF_ERRORMSG("RAIDFRAME: cannot read address of hwrpb addr\n"); + if (rc < 0) + bad = errno; + bad = EIO; + goto isbad; + } + + off = lseek(fd, rpb_addr, SEEK_SET); + if (off != rpb_addr) { + RF_ERRORMSG("RAIDFRAME: cannot seek to rpb addr\n"); + bad = EIO; + goto isbad; + } + + rc = read(fd, &rpb, sizeof(rpb)); + if (rc != sizeof(rpb)) { + RF_ERRORMSG1("RAIDFRAME: cannot read rpb (rc=%d)\n", rc); + if (rc < 0) + bad = errno; + bad = EIO; + goto isbad; + } + + /* + * One extra sanity check: the RPB is self-identifying. + * This field is guaranteed to have the value + * 0x0000004250525748, always. + */ + if (rpb.rpb_string != 0x0000004250525748) { + bad = EIO; + goto isbad; + } + +isbad: + if (bad) { + RF_ERRORMSG("ERROR: rpb failed validation\n"); + RF_ERRORMSG1("RAIDFRAME: perhaps %s has changed since booting?\n", + kernel_name); + return(bad); + } + + *ticksp = rpb.rpb_counter; + + close(kfd); + close(fd); + + return(0); +} +#endif /* __osf__ && !KERNEL */ + +int rf_ConfigureEtimer(listp) + RF_ShutdownList_t **listp; +{ +#ifdef __osf__ + int rc; + +#ifdef KERNEL + rf_timer_ticks_per_second = rpb->rpb_counter; +#else /* KERNEL */ + rc = rf_get_cpu_ticks_per_sec(&rf_timer_ticks_per_second); + if (rc) + return(rc); +#endif /* KERNEL */ + rf_timer_max_val = RF_DEF_TIMER_MAX_VAL; + rf_timer_ticks_per_usec = rf_timer_ticks_per_second/1000000; +#endif /* __osf__ */ +#if defined(NETBSD_ALPHA) || defined(OPENBSD_ALPHA) + /* + * XXX cgd fix this + */ + rf_timer_ticks_per_second = 233100233; + rf_timer_max_val = RF_DEF_TIMER_MAX_VAL; + rf_timer_ticks_per_usec = rf_timer_ticks_per_second/1000000; +#endif /* NETBSD_ALPHA || OPENBSD_ALPHA */ +#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL) + /* XXX just picking some random values to keep things happy... without these + set, stuff will panic on division by zero errors!! */ + rf_timer_ticks_per_second = 233100233; + rf_timer_max_val = RF_DEF_TIMER_MAX_VAL; + rf_timer_ticks_per_usec = rf_timer_ticks_per_second/1000000; + +#endif + return(0); +} diff --git a/sys/dev/raidframe/rf_sys.h b/sys/dev/raidframe/rf_sys.h new file mode 100644 index 00000000000..f9606708c2e --- /dev/null +++ b/sys/dev/raidframe/rf_sys.h @@ -0,0 +1,69 @@ +/* $OpenBSD: rf_sys.h,v 1.1 1999/01/11 14:29:53 niklas Exp $ */ +/* $NetBSD: rf_sys.h,v 1.1 1998/11/13 04:20:35 oster Exp $ */ +/* + * rf_sys.h + * + * Jim Zelenka, CMU/SCS, 14 June 1996 + */ +/* + * Copyright (c) 1996 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Jim Zelenka + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 
+ * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +#ifndef _RF__RF_SYS_H_ +#define _RF__RF_SYS_H_ + +#include "rf_types.h" + +int rf_ConfigureEtimer(RF_ShutdownList_t **listp); + +#if defined(__osf__) && !defined(KERNEL) +int rf_get_cpu_ticks_per_sec(long *ticksp); +#endif /* __osf__ && !KERNEL */ + +#ifdef AIX +#include <nlist.h> +#include <sys/time.h> +#if RF_AIXVers == 3 +int gettimeofday(struct timeval *tp, struct timezone *tzp); +#endif /* RF_AIXVers == 3 */ +int knlist(struct nlist *namelist, int nel, int size); +int ffs(int index); +#endif /* AIX */ + +#ifdef sun +#define bcopy(a,b,n) memcpy(b,a,n) +#define bzero(b,n) memset(b,0,n) +#define bcmp(a,b,n) memcmp(a,b,n) +#endif /* sun */ + +#ifdef __GNUC__ +/* we use gcc -Wall to check our anal-retentiveness level, occasionally */ +#if defined(DEC_OSF) && !defined(KERNEL) +extern int ioctl(int fd, int req, ...); +#endif /* DEC_OSF && !KERNEL */ +#endif /* __GNUC__ */ + +#endif /* !_RF__RF_SYS_H_ */ diff --git a/sys/dev/raidframe/rf_threadid.h b/sys/dev/raidframe/rf_threadid.h new file mode 100644 index 00000000000..ef77020b554 --- /dev/null +++ b/sys/dev/raidframe/rf_threadid.h @@ -0,0 +1,230 @@ +/* $OpenBSD: rf_threadid.h,v 1.1 1999/01/11 14:29:53 niklas Exp $ */ +/* $NetBSD: rf_threadid.h,v 1.1 1998/11/13 04:20:35 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Daniel Stodolsky, Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/* rf_threadid.h + * + * simple macros to register and lookup integer identifiers for threads. + * must include pthread.h before including this + * + * This is one of two places where the pthreads package is used explicitly. + * The other is in threadstuff.h + * + * none of this is used in the kernel, so it all gets compiled out if KERNEL is defined + */ + +/* : + * Log: rf_threadid.h,v + * Revision 1.17 1996/08/12 20:11:17 jimz + * fix up for AIX4 + * + * Revision 1.16 1996/06/05 18:06:02 jimz + * Major code cleanup. The Great Renaming is now done. + * Better modularity. Better typing. Fixed a bunch of + * synchronization bugs. Made a lot of global stuff + * per-desc or per-array. Removed dead code. 
+ * + * Revision 1.15 1996/06/03 23:28:26 jimz + * more bugfixes + * check in tree to sync for IPDS runs with current bugfixes + * there still may be a problem with threads in the script test + * getting I/Os stuck- not trivially reproducible (runs ~50 times + * in a row without getting stuck) + * + * Revision 1.14 1996/05/30 11:29:41 jimz + * Numerous bug fixes. Stripe lock release code disagreed with the taking code + * about when stripes should be locked (I made it consistent: no parity, no lock) + * There was a lot of extra serialization of I/Os which I've removed- a lot of + * it was to calculate values for the cache code, which is no longer with us. + * More types, function, macro cleanup. Added code to properly quiesce the array + * on shutdown. Made a lot of stuff array-specific which was (bogusly) general + * before. Fixed memory allocation, freeing bugs. + * + * Revision 1.13 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.12 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.11 1996/05/20 16:13:46 jimz + * switch to rf_{mutex,cond}_{init,destroy} + * + * Revision 1.10 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.9 1996/05/17 13:29:06 jimz + * did a dance on get_threadid such that it will do the pthread_attr_t -> int + * assignment without warnings, even on really anal compilers + * + * Revision 1.8 1995/12/06 15:15:00 root + * added copyright info + * + */ + +#ifndef _RF__RF_THREADID_H_ +#define _RF__RF_THREADID_H_ + +#ifdef _KERNEL +#define KERNEL +#endif + +#ifndef SIMULATE +#ifndef KERNEL + +/* + * User + */ + +#include "rf_threadstuff.h" + +extern int rf_numThrsRegistered; +extern pthread_key_t rf_thread_id_key; +RF_DECLARE_EXTERN_MUTEX(rf_threadid_mutex) + +#define RF_THREAD_MAX 200 + +/* these should be global since a function is declared. Should be invoked at only one place in code */ +#define RF_DECLARE_GLOBAL_THREADID \ + int rf_numThrsRegistered = 0; \ + pthread_key_t rf_thread_id_key; \ + RF_DECLARE_MUTEX(rf_threadid_mutex) \ + RF_Thread_t rf_regdThrs[RF_THREAD_MAX]; \ + void rf_ThreadIdEmptyFunc() {} + +/* setup must be called exactly once, i.e. 
it can't be called by each thread */ + +#ifdef AIX +typedef void (*pthread_destructor_t)(void *); +#endif /* AIX */ + +#ifdef __osf__ +#define rf_setup_threadid() { \ + extern void rf_ThreadIdEmptyFunc(); \ + pthread_keycreate(&rf_thread_id_key, (pthread_destructor_t) rf_ThreadIdEmptyFunc); \ + rf_mutex_init(&rf_threadid_mutex); /* XXX check return val */ \ + rf_numThrsRegistered = 0; \ +} +#endif /* __osf__ */ + +#ifdef AIX +#define rf_setup_threadid() { \ + extern void rf_ThreadIdEmptyFunc(); \ + pthread_key_create(&rf_thread_id_key, (pthread_destructor_t) rf_ThreadIdEmptyFunc); \ + rf_mutex_init(&rf_threadid_mutex); /* XXX check return val */ \ + rf_numThrsRegistered = 0; \ +} +#endif /* AIX */ + +#define rf_shutdown_threadid() { \ + rf_mutex_destroy(&rf_threadid_mutex); \ +} + +#ifdef __osf__ +typedef pthread_addr_t RF_THID_cast_t; +#endif /* __osf__ */ + +#ifdef AIX +typedef void *RF_THID_cast_t; +#endif /* AIX */ + +#define rf_assign_threadid() {RF_LOCK_MUTEX(rf_threadid_mutex); \ + if (pthread_setspecific(rf_thread_id_key, (RF_THID_cast_t) ((unsigned long)(rf_numThrsRegistered++)))) { RF_PANIC(); } \ + RF_UNLOCK_MUTEX(rf_threadid_mutex);} + +#ifdef __osf__ +#define rf_get_threadid(_id_) { \ + RF_THID_cast_t _val; \ + unsigned long _val2; \ + if (pthread_getspecific(rf_thread_id_key, &_val)) \ + RF_PANIC(); \ + (_val2) = (unsigned long)_val; \ + (_id_) = (int)_val2; \ +} +#endif /* __osf__ */ + +#ifdef AIX +#define rf_get_threadid(_id_) { \ + RF_THID_cast_t _val; \ + unsigned long _val2; \ + _val = pthread_getspecific(rf_thread_id_key); \ + (_val2) = (unsigned long)_val; \ + (_id_) = (int)_val2; \ +} +#endif /* AIX */ + +#else /* KERNEL */ + +/* + * Kernel + */ + +#if !defined(__NetBSD__) && !defined(__OpenBSD__) +#include <kern/task.h> +#include <kern/thread.h> +#include <mach/machine/vm_param.h> +#endif + +#define RF_DECLARE_GLOBAL_THREADID +#define rf_setup_threadid() +#define rf_shutdown_threadid() +#define rf_assign_threadid() + + + +#if defined(__NetBSD__) || defined(__OpenBSD__) + +#define rf_get_threadid(_id_) _id_ = 0; + +#else +#define rf_get_threadid(_id_) { \ + thread_t thread = current_thread(); \ + _id_ = (int)(((thread->thread_self)>>(8*sizeof(int *)))&0x0fffffff); \ +} +#endif /* __NetBSD__ || __OpenBSD__ */ +#endif /* KERNEL */ + +#else /* SIMULATE */ + +/* + * Simulator + */ + +#include "rf_diskevent.h" + +#define RF_DECLARE_GLOBAL_THREADID +#define rf_setup_threadid() +#define rf_shutdown_threadid() +#define rf_assign_threadid() + +#define rf_get_threadid(_id_) _id_ = rf_GetCurrentOwner() + +#endif /* SIMULATE */ +#endif /* !_RF__RF_THREADID_H_ */ diff --git a/sys/dev/raidframe/rf_threadstuff.c b/sys/dev/raidframe/rf_threadstuff.c new file mode 100644 index 00000000000..0de5f36c679 --- /dev/null +++ b/sys/dev/raidframe/rf_threadstuff.c @@ -0,0 +1,477 @@ +/* $OpenBSD: rf_threadstuff.c,v 1.1 1999/01/11 14:29:53 niklas Exp $ */ +/* $NetBSD: rf_threadstuff.c,v 1.1 1998/11/13 04:20:35 oster Exp $ */ +/* + * rf_threadstuff.c + */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Jim Zelenka + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. 
CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +#ifdef _KERNEL +#define KERNEL +#endif + + +#include "rf_types.h" +#include "rf_threadstuff.h" +#include "rf_general.h" +#include "rf_shutdown.h" + +static void mutex_destroyer(void *); +static void cond_destroyer(void *); +void thread_wakeup(void *); + +/* + * Shared stuff + */ + +static void mutex_destroyer(arg) + void *arg; +{ + int rc; + + rc = rf_mutex_destroy(arg); + if (rc) { + RF_ERRORMSG1("RAIDFRAME: Error %d auto-destroying mutex\n", rc); + } +} + +static void cond_destroyer(arg) + void *arg; +{ + int rc; + + rc = rf_cond_destroy(arg); + if (rc) { + RF_ERRORMSG1("RAIDFRAME: Error %d auto-destroying condition\n", rc); + } +} + +int _rf_create_managed_mutex(listp, m, file, line) + RF_ShutdownList_t **listp; + RF_DECLARE_MUTEX(*m) + char *file; + int line; +{ + int rc, rc1; + + rc = rf_mutex_init(m); + if (rc) + return(rc); + rc = _rf_ShutdownCreate(listp, mutex_destroyer, (void *)m, file, line); + if (rc) { + RF_ERRORMSG1("RAIDFRAME: Error %d adding shutdown entry\n", rc); + rc1 = rf_mutex_destroy(m); + if (rc1) { + RF_ERRORMSG1("RAIDFRAME: Error %d destroying mutex\n", rc1); + } + } + return(rc); +} + +int _rf_create_managed_cond(listp, c, file, line) + RF_ShutdownList_t **listp; + RF_DECLARE_COND(*c) + char *file; + int line; +{ + int rc, rc1; + + rc = rf_cond_init(c); + if (rc) + return(rc); + rc = _rf_ShutdownCreate(listp, cond_destroyer, (void *)c, file, line); + if (rc) { + RF_ERRORMSG1("RAIDFRAME: Error %d adding shutdown entry\n", rc); + rc1 = rf_cond_destroy(c); + if (rc1) { + RF_ERRORMSG1("RAIDFRAME: Error %d destroying cond\n", rc1); + } + } + return(rc); +} + +int _rf_init_managed_threadgroup(listp, g, file, line) + RF_ShutdownList_t **listp; + RF_ThreadGroup_t *g; + char *file; + int line; +{ + int rc; + + rc = _rf_create_managed_mutex(listp, &g->mutex, file, line); + if (rc) + return(rc); + rc = _rf_create_managed_cond(listp, &g->cond, file, line); + if (rc) + return(rc); + g->created = g->running = g->shutdown = 0; + return(0); +} + +int _rf_destroy_threadgroup(g, file, line) + RF_ThreadGroup_t *g; + char *file; + int line; +{ + int rc1, rc2; + +#if RF_DEBUG_ATOMIC > 0 + rc1 = _rf_mutex_destroy(&g->mutex, file, line); + rc2 = _rf_cond_destroy(&g->cond, file, line); +#else /* RF_DEBUG_ATOMIC > 0 */ + rc1 = rf_mutex_destroy(&g->mutex); + rc2 = rf_cond_destroy(&g->cond); +#endif /* RF_DEBUG_ATOMIC > 0 */ + if (rc1) + return(rc1); + return(rc2); +} + +int _rf_init_threadgroup(g, file, line) + RF_ThreadGroup_t *g; + char *file; + int line; +{ + int rc; + +#if RF_DEBUG_ATOMIC > 0 + rc = _rf_mutex_init(&g->mutex, file, line); + if (rc) + return(rc); + rc = _rf_cond_init(&g->cond, file, line); + if (rc) { + _rf_mutex_destroy(&g->mutex, file, line); + return(rc); + } +#else /* RF_DEBUG_ATOMIC > 0 */ + rc = rf_mutex_init(&g->mutex); + if (rc) + return(rc); + rc = rf_cond_init(&g->cond); + if (rc) { + rf_mutex_destroy(&g->mutex); + return(rc); + } +#endif /* RF_DEBUG_ATOMIC > 0 */ + g->created = g->running = g->shutdown = 0; + return(0); +} + +/* + * User + */ + +#if !defined(KERNEL) && 
!defined(SIMULATE) + +#if RF_DEBUG_ATOMIC > 0 + +static RF_ATEnt_t rf_atent_list; +static RF_ATEnt_t *rf_atent_done_list=NULL; + +static pthread_mutex_t rf_atent_mutex; + +void rf_atent_init() +{ + int rc; + + rc = pthread_mutex_init(&rf_atent_mutex, pthread_mutexattr_default); + if (rc) { + fprintf(stderr, "ERROR: rc=%d creating rf_atent_mutex\n", rc); + fflush(stderr); + RF_PANIC(); + } + rf_atent_list.next = rf_atent_list.prev = &rf_atent_list; +} + +#define ATENT_TYPE(_e_) ((((_e_)->type == 0)||((_e_)->type > 2)) ? 0 : (_e_)->type) +#define ATENT_OTYPE(_e_) ((((_e_)->otype == 0)||((_e_)->otype > 2)) ? 0 : (_e_)->otype) + +void rf_atent_shutdown() +{ + int rc, num_freed[3], num_not_freed[3]; + RF_ATEnt_t *r, *n; + + num_freed[0] = num_freed[1] = num_freed[2] = 0; + num_not_freed[0] = num_not_freed[1] = num_not_freed[2] = 0; + printf("rf_atent_shutdown:\n"); + for(r=rf_atent_list.next;r!=&rf_atent_list;r=r->next) { + printf("r=%lx type=%d file=%s line=%d\n", r, r->type, r->file, r->line); + num_not_freed[ATENT_TYPE(r)]++; + } + rc = pthread_mutex_destroy(&rf_atent_mutex); + if (rc) { + fprintf(stderr, "ERROR: rc=%d destroying rf_atent_mutex\n", rc); + fflush(stderr); + RF_PANIC(); + } + for(r=rf_atent_done_list;r;r=n) { + n = r->next; + num_freed[ATENT_OTYPE(r)]++; + free(r); + } + printf("%d mutexes not freed %d conditions not freed %d bogus not freed\n", + num_not_freed[1], num_not_freed[2], num_not_freed[0]); + printf("%d mutexes freed %d conditions freed %d bogus freed\n", + num_freed[1], num_freed[2], num_freed[0]); + fflush(stdout); + fflush(stderr); +} + +static RF_ATEnt_t *AllocATEnt(file,line) + char *file; + int line; +{ + RF_ATEnt_t *t; + + t = (RF_ATEnt_t *)malloc(sizeof(RF_ATEnt_t)); + if (t == NULL) { + RF_PANIC(); + } + t->file = file; + t->line = line; + t->type = 0; + return(t); +} + +static void FreeATEnt(t) + RF_ATEnt_t *t; +{ + t->otype = t->type; + t->type = 0; + t->next = rf_atent_done_list; + rf_atent_done_list = t; +} + +int _rf_mutex_init(m, file, line) + RF_ATEnt_t **m; + char *file; + int line; +{ + RF_ATEnt_t *a; + int rc; + + a = AllocATEnt(file,line); + rc = pthread_mutex_init(&a->m, pthread_mutexattr_default); + if (rc == 0) { + pthread_mutex_lock(&rf_atent_mutex); + a->next = rf_atent_list.next; + a->prev = &rf_atent_list; + a->type = RF_ATENT_M; + a->next->prev = a; + a->prev->next = a; + pthread_mutex_unlock(&rf_atent_mutex); + } + else { + fprintf(stderr, "ERROR: rc=%d allocating mutex %s:%d\n", + rc, file, line); + fflush(stderr); + RF_PANIC(); + } + *m = a; + return(0); +} + +int _rf_mutex_destroy(m, file, line) + RF_ATEnt_t **m; + char *file; + int line; +{ + RF_ATEnt_t *r; + int rc; + + r = *m; + rc = pthread_mutex_destroy(&r->m); + if (rc) { + fprintf(stderr, "ERROR: rc=%d destroying mutex %s:%d\n", + rc, file, line); + fflush(stderr); + RF_PANIC(); + } + pthread_mutex_lock(&rf_atent_mutex); + r->next->prev = r->prev; + r->prev->next = r->next; + FreeATEnt(r); + pthread_mutex_unlock(&rf_atent_mutex); + *m = NULL; + return(0); +} + +int _rf_cond_init(c, file, line) + RF_ATEnt_t **c; + char *file; + int line; +{ + RF_ATEnt_t *a; + int rc; + + a = AllocATEnt(file,line); + rc = pthread_cond_init(&a->c, pthread_condattr_default); + if (rc == 0) { + pthread_mutex_lock(&rf_atent_mutex); + a->next = rf_atent_list.next; + a->prev = &rf_atent_list; + a->next->prev = a; + a->prev->next = a; + a->type = RF_ATENT_C; + pthread_mutex_unlock(&rf_atent_mutex); + } + else { + fprintf(stderr, "ERROR: rc=%d allocating cond %s:%d\n", + rc, file, line); + 
fflush(stderr); + RF_PANIC(); + } + *c = a; + return(0); +} + +int _rf_cond_destroy(c, file, line) + RF_ATEnt_t **c; + char *file; + int line; +{ + RF_ATEnt_t *r; + int rc; + + r = *c; + rc = pthread_cond_destroy(&r->c); + if (rc) { + fprintf(stderr, "ERROR: rc=%d destroying cond %s:%d\n", + rc, file, line); + fflush(stderr); + RF_PANIC(); + } + pthread_mutex_lock(&rf_atent_mutex); + r->next->prev = r->prev; + r->prev->next = r->next; + FreeATEnt(r); + pthread_mutex_unlock(&rf_atent_mutex); + *c = NULL; + return(0); +} + +#else /* RF_DEBUG_ATOMIC > 0 */ + +int rf_mutex_init(m) + pthread_mutex_t *m; +{ +#ifdef __osf__ + return(pthread_mutex_init(m, pthread_mutexattr_default)); +#endif /* __osf__ */ +#ifdef AIX + return(pthread_mutex_init(m, &pthread_mutexattr_default)); +#endif /* AIX */ +} + +int rf_mutex_destroy(m) + pthread_mutex_t *m; +{ + return(pthread_mutex_destroy(m)); +} + +int rf_cond_init(c) + pthread_cond_t *c; +{ +#ifdef __osf__ + return(pthread_cond_init(c, pthread_condattr_default)); +#endif /* __osf__ */ +#ifdef AIX + return(pthread_cond_init(c, &pthread_condattr_default)); +#endif /* AIX */ +} + +int rf_cond_destroy(c) + pthread_cond_t *c; +{ + return(pthread_cond_destroy(c)); +} + +#endif /* RF_DEBUG_ATOMIC > 0 */ + +#endif /* !KERNEL && !SIMULATE */ + +/* + * Kernel + */ +#ifdef KERNEL +int rf_mutex_init(m) + decl_simple_lock_data(,*m) +{ + simple_lock_init(m); + return(0); +} + +int rf_mutex_destroy(m) + decl_simple_lock_data(,*m) +{ + return(0); +} + +int rf_cond_init(c) + RF_DECLARE_COND(*c) +{ + *c = 0; /* no reason */ + return(0); +} + +int rf_cond_destroy(c) + RF_DECLARE_COND(*c) +{ + return(0); +} + + +#endif /* KERNEL */ + +/* + * Simulator + */ +#ifdef SIMULATE +int rf_mutex_init(m) + RF_DECLARE_MUTEX(*m) +{ + return(0); +} + +int rf_mutex_destroy(m) + RF_DECLARE_MUTEX(*m) +{ + return(0); +} + +int rf_cond_init(c) + RF_DECLARE_COND(*c) +{ + return(0); +} + +int rf_cond_destroy(c) + RF_DECLARE_COND(*c) +{ + return(0); +} +#endif /* SIMULATE */ diff --git a/sys/dev/raidframe/rf_threadstuff.h b/sys/dev/raidframe/rf_threadstuff.h new file mode 100644 index 00000000000..1437b2b0edf --- /dev/null +++ b/sys/dev/raidframe/rf_threadstuff.h @@ -0,0 +1,465 @@ +/* $OpenBSD: rf_threadstuff.h,v 1.1 1999/01/11 14:29:54 niklas Exp $ */ +/* $NetBSD: rf_threadstuff.h,v 1.1 1998/11/13 04:20:35 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland, Daniel Stodolsky, Jim Zelenka + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. 
+ */ + +/* + * threadstuff.h -- definitions for threads, locks, and synchronization + * + * The purpose of this file is provide some illusion of portability. + * If the functions below can be implemented with the same semantics on + * some new system, then at least the synchronization and thread control + * part of the code should not require modification to port to a new machine. + * the only other place where the pthread package is explicitly used is + * threadid.h + * + * this file should be included above stdio.h to get some necessary defines. + * + */ + +/* : + * Log: rf_threadstuff.h,v + * Revision 1.38 1996/08/12 22:37:47 jimz + * add AIX stuff for user driver + * + * Revision 1.37 1996/08/11 00:47:09 jimz + * make AIX friendly + * + * Revision 1.36 1996/07/23 22:06:59 jimz + * add rf_destroy_threadgroup + * + * Revision 1.35 1996/07/23 21:31:16 jimz + * add init_threadgroup + * + * Revision 1.34 1996/07/18 22:57:14 jimz + * port simulator to AIX + * + * Revision 1.33 1996/07/15 17:22:18 jimz + * nit-pick code cleanup + * resolve stdlib problems on DEC OSF + * + * Revision 1.32 1996/06/17 03:01:11 jimz + * get rid of JOIN stuff + * + * Revision 1.31 1996/06/14 23:15:38 jimz + * attempt to deal with thread GC problem + * + * Revision 1.30 1996/06/11 18:12:36 jimz + * get rid of JOIN operations + * use ThreadGroup stuff instead + * fix some allocation/deallocation and sync bugs + * + * Revision 1.29 1996/06/11 13:48:10 jimz + * make kernel RF_THREAD_CREATE give back happier return vals + * + * Revision 1.28 1996/06/10 16:40:01 jimz + * break user-level stuff out into lib+apps + * + * Revision 1.27 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). + * + * Revision 1.26 1996/06/09 02:36:46 jimz + * lots of little crufty cleanup- fixup whitespace + * issues, comment #ifdefs, improve typing in some + * places (esp size-related) + * + * Revision 1.25 1996/06/05 18:06:02 jimz + * Major code cleanup. The Great Renaming is now done. + * Better modularity. Better typing. Fixed a bunch of + * synchronization bugs. Made a lot of global stuff + * per-desc or per-array. Removed dead code. + * + * Revision 1.24 1996/05/30 11:29:41 jimz + * Numerous bug fixes. Stripe lock release code disagreed with the taking code + * about when stripes should be locked (I made it consistent: no parity, no lock) + * There was a lot of extra serialization of I/Os which I've removed- a lot of + * it was to calculate values for the cache code, which is no longer with us. + * More types, function, macro cleanup. Added code to properly quiesce the array + * on shutdown. Made a lot of stuff array-specific which was (bogusly) general + * before. Fixed memory allocation, freeing bugs. 
+ * + * Revision 1.23 1996/05/20 19:31:54 jimz + * add atomic debug (mutex and cond leak finder) stuff + * + * Revision 1.22 1996/05/20 16:24:49 jimz + * get happy in simulator + * + * Revision 1.21 1996/05/20 16:15:07 jimz + * switch to rf_{mutex,cond}_{init,destroy} + * + * Revision 1.20 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.19 1996/05/09 17:16:53 jimz + * correct arg to JOIN_THREAD + * + * Revision 1.18 1995/12/12 18:10:06 jimz + * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT + * fix 80-column brain damage in comments + * + * Revision 1.17 1995/12/06 15:15:21 root + * added copyright info + * + */ + +#ifndef _RF__RF_THREADSTUFF_H_ +#define _RF__RF_THREADSTUFF_H_ + +#include "rf_types.h" + +#define rf_create_managed_mutex(a,b) _rf_create_managed_mutex(a,b,__FILE__,__LINE__) +#define rf_create_managed_cond(a,b) _rf_create_managed_cond(a,b,__FILE__,__LINE__) +#define rf_init_managed_threadgroup(a,b) _rf_init_managed_threadgroup(a,b,__FILE__,__LINE__) +#define rf_init_threadgroup(a) _rf_init_threadgroup(a,__FILE__,__LINE__) +#define rf_destroy_threadgroup(a) _rf_destroy_threadgroup(a,__FILE__,__LINE__) + +int _rf_init_threadgroup(RF_ThreadGroup_t *g, char *file, int line); +int _rf_destroy_threadgroup(RF_ThreadGroup_t *g, char *file, int line); +int _rf_init_managed_threadgroup(RF_ShutdownList_t **listp, + RF_ThreadGroup_t *g, char *file, int line); + +#ifndef SIMULATE /* will null all this calls */ +#ifndef KERNEL + +#if defined(__osf__) || defined(AIX) +#include <pthread.h> +#endif /* __osf__ || AIX */ + +#define RF_DEBUG_ATOMIC 0 + +#if RF_DEBUG_ATOMIC > 0 +#define RF_ATENT_M 1 +#define RF_ATENT_C 2 +typedef struct RF_ATEnt_s RF_ATEnt_t; +struct RF_ATEnt_s { + char *file; + int line; + pthread_mutex_t m; + pthread_cond_t c; + int type; + int otype; + RF_ATEnt_t *next; + RF_ATEnt_t *prev; +}; + +#define RF_DECLARE_MUTEX(_m_) RF_ATEnt_t *_m_; +#define RF_DECLARE_STATIC_MUTEX(_m_) static RF_ATEnt_t *_m_; +#define RF_DECLARE_EXTERN_MUTEX(_m_) extern RF_ATEnt_t *_m_; +#define RF_DECLARE_COND(_c_) RF_ATEnt_t *_c_; +#define RF_DECLARE_STATIC_COND(_c_) static RF_ATEnt_t *_c_; +#define RF_DECLARE_EXTERN_COND(_c_) extern RF_ATEnt_t *_c_; + +int _rf_mutex_init(RF_ATEnt_t **m, char *file, int line); +int _rf_mutex_destroy(RF_ATEnt_t **m, char *file, int line); +int _rf_cond_init(RF_ATEnt_t **c, char *file, int line); +int _rf_cond_destroy(RF_ATEnt_t **c, char *file, int line); +void rf_atent_init(void); +void rf_atent_shutdown(void); + +#define rf_mutex_init(_m_) _rf_mutex_init(_m_,__FILE__,__LINE__) +#define rf_mutex_destroy(_m_) _rf_mutex_destroy(_m_,__FILE__,__LINE__) +#define rf_cond_init(_m_) _rf_cond_init(_m_,__FILE__,__LINE__) +#define rf_cond_destroy(_m_) _rf_cond_destroy(_m_,__FILE__,__LINE__) + +#define RF_LOCK_MUTEX(_a_) {RF_ASSERT((_a_)->type == RF_ATENT_M); pthread_mutex_lock(&((_a_)->m));} +#define RF_UNLOCK_MUTEX(_a_) {RF_ASSERT((_a_)->type == RF_ATENT_M); pthread_mutex_unlock(&((_a_)->m));} + +#define RF_WAIT_COND(_c_,_m_) { \ + RF_ASSERT((_c_)->type == RF_ATENT_C); \ + RF_ASSERT((_m_)->type == RF_ATENT_M); \ + pthread_cond_wait( &((_c_)->c), &((_m_)->m) ); \ +} +#define RF_SIGNAL_COND(_c_) {RF_ASSERT((_c_)->type == RF_ATENT_C); pthread_cond_signal( &((_c_)->c));} +#define RF_BROADCAST_COND(_c_) {RF_ASSERT((_c_)->type == RF_ATENT_C); pthread_cond_broadcast(&((_c_)->c));} + +#else /* RF_DEBUG_ATOMIC > 0 */ + +/* defining these as macros allows us to NULL them 
out in the kernel */ +#define RF_DECLARE_MUTEX(_m_) pthread_mutex_t _m_; +#define RF_DECLARE_STATIC_MUTEX(_m_) static pthread_mutex_t _m_; +#define RF_DECLARE_EXTERN_MUTEX(_m_) extern pthread_mutex_t _m_; +#define RF_DECLARE_COND(_c_) pthread_cond_t _c_; +#define RF_DECLARE_STATIC_COND(_c_) static pthread_cond_t _c_; +#define RF_DECLARE_EXTERN_COND(_c_) extern pthread_cond_t _c_; + +int rf_mutex_init(pthread_mutex_t *m); +int rf_mutex_destroy(pthread_mutex_t *m); +int rf_cond_init(pthread_cond_t *c); +int rf_cond_destroy(pthread_cond_t *c); + +#define RF_LOCK_MUTEX(_m_) {pthread_mutex_lock(&(_m_));} +#define RF_UNLOCK_MUTEX(_m_) pthread_mutex_unlock(&(_m_)) + +#define RF_WAIT_COND(_c_,_m_) pthread_cond_wait( &(_c_), &(_m_) ) +#define RF_SIGNAL_COND(_c_) pthread_cond_signal( &(_c_) ) +#define RF_BROADCAST_COND(_c_) pthread_cond_broadcast(&(_c_)) + +#endif /* RF_DEBUG_ATOMIC > 0 */ + +int _rf_create_managed_mutex(RF_ShutdownList_t **listp, pthread_mutex_t *m, char *file, int line); +int _rf_create_managed_cond(RF_ShutdownList_t **listp, pthread_cond_t *c, char *file, int line); + +typedef pthread_t RF_Thread_t; +#ifdef __osf__ +typedef pthread_addr_t RF_ThreadArg_t; /* the argument to a thread function */ +#else /* __osf__ */ +typedef void *RF_ThreadArg_t; /* the argument to a thread function */ +#endif /* __osf__ */ +typedef pthread_attr_t RF_ThreadAttr_t; /* a thread creation attribute structure */ + +#ifdef __osf__ +#define RF_EXIT_THREAD(_status_) pthread_exit( (pthread_addr_t) (_status_) ) +#else /* __osf__ */ +#define RF_EXIT_THREAD(_status_) pthread_exit( (void *) (_status_) ) +#endif /* __osf__ */ +#define RF_DELAY_THREAD(_secs_, _msecs_) {struct timespec interval; \ + interval.tv_sec = (_secs_); \ + interval.tv_nsec = (_msecs_)*1000000; \ + pthread_delay_np(&interval); \ + } +#define RF_DELAY_THREAD_TS(_ts_) pthread_delay_np(&(_ts_)) + +#ifdef __osf__ +#define RF_THREAD_ATTR_CREATE(_attr_) pthread_attr_create( &(_attr_) ) +#define RF_THREAD_ATTR_DELETE(_attr_) pthread_attr_delete( &(_attr_) ) +#endif /* __osf__ */ +#ifdef AIX +#define RF_THREAD_ATTR_CREATE(_attr_) pthread_attr_init( &(_attr_) ) +#define RF_THREAD_ATTR_DELETE(_attr_) pthread_attr_destroy( &(_attr_) ) +#endif /* AIX */ +#define RF_THREAD_ATTR_SETSTACKSIZE(_attr_,_sz_) pthread_attr_setstacksize(&(_attr_), (long) (_sz_)) +#define RF_THREAD_ATTR_GETSTACKSIZE(_attr_) pthread_attr_getstacksize(_attr_) +#define RF_THREAD_ATTR_SETSCHED(_attr_,_sched_) pthread_attr_setsched(&(_attr_), (_sched_)) +#define RF_CREATE_ATTR_THREAD(_handle_, _attr_, _func_, _arg_) \ + pthread_create(&(_handle_), (_attr_), (pthread_startroutine_t) (_func_), (_arg_)) + + +extern pthread_attr_t raidframe_attr_default; +int rf_thread_create(RF_Thread_t *thread, pthread_attr_t attr, + void (*func)(), RF_ThreadArg_t arg); + +#define RF_CREATE_THREAD(_handle_, _func_, _arg_) \ + rf_thread_create(&(_handle_), raidframe_attr_default, (_func_), (_arg_)) + +#else /* KERNEL */ +#if defined(__NetBSD__) || defined(__OpenBSD__) +#include <sys/lock.h> +#define decl_simple_lock_data(a,b) a struct simplelock b; +#define simple_lock_addr(a) ((struct simplelock *)&(a)) +#else +#include <kern/task.h> +#include <kern/thread.h> +#include <kern/lock.h> +#include <kern/sched_prim.h> +#define decl_simple_lock_data(a,b) a int (b); +#endif /* __NetBSD__ || __OpenBSD__ */ + +#if defined(__NetBSD__) || defined(__OpenBSD__) +typedef struct proc *RF_Thread_t; +#else +typedef thread_t RF_Thread_t; +#endif +typedef void *RF_ThreadArg_t; + +#define RF_DECLARE_MUTEX(_m_) 
decl_simple_lock_data(,(_m_)) +#define RF_DECLARE_STATIC_MUTEX(_m_) decl_simple_lock_data(static,(_m_)) +#define RF_DECLARE_EXTERN_MUTEX(_m_) decl_simple_lock_data(extern,(_m_)) + +#define RF_DECLARE_COND(_c_) int _c_; +#define RF_DECLARE_STATIC_COND(_c_) static int _c_; +#define RF_DECLARE_EXTERN_COND(_c_) extern int _c_; + +#define RF_LOCK_MUTEX(_m_) simple_lock(&(_m_)) +#define RF_UNLOCK_MUTEX(_m_) simple_unlock(&(_m_)) + + +#if defined(__NetBSD__) || defined(__OpenBSD__) +#include <sys/types.h> +#include <sys/kthread.h> +/* + * In Net- and OpenBSD, kernel threads are simply processes which share several + * substructures and never run in userspace. + * + * XXX Note, Net- and OpenBSD does not yet have a wakeup_one(), so we always + * XXX get Thundering Herd when a condition occurs. + */ +#define RF_WAIT_COND(_c_,_m_) { \ + RF_UNLOCK_MUTEX(_m_); \ + tsleep(&_c_, PRIBIO | PCATCH, "rfwcond", 0); \ + RF_LOCK_MUTEX(_m_); \ +} +#define RF_SIGNAL_COND(_c_) wakeup(&(_c_)) +#define RF_BROADCAST_COND(_c_) wakeup(&(_c_)) +#define RF_CREATE_THREAD(_handle_, _func_, _arg_) \ + kthread_create((void (*) __P((void *)))(_func_), (void *)(_arg_), \ + (struct proc **)&(_handle_), "raid") +#else /* ! __NetBSD__ && ! __OpenBSD__ */ +/* + * Digital UNIX/Mach threads. + */ +#define RF_WAIT_COND(_c_,_m_) { \ + assert_wait((vm_offset_t)&(_c_), TRUE); \ + RF_UNLOCK_MUTEX(_m_); \ + thread_block(); \ + RF_LOCK_MUTEX(_m_); \ +} +#define RF_SIGNAL_COND(_c_) thread_wakeup_one(((vm_offset_t)&(_c_))) +#define RF_BROADCAST_COND(_c_) thread_wakeup(((vm_offset_t)&(_c_))) +extern task_t first_task; +#define RF_CREATE_THREAD(_handle_, _func_, _arg_) \ + (((_handle_ = kernel_thread_w_arg(first_task, (void (*)())_func_, (void *)(_arg_))) != THREAD_NULL) ? 0 : ENOMEM) +#endif /* __NetBSD__ || __OpenBSD__ */ +#endif /* KERNEL */ +#else /* SIMULATE */ + +#define RF_DECLARE_MUTEX(_m_) int _m_; +#define RF_DECLARE_STATIC_MUTEX(_m_) static int _m_; +#define RF_DECLARE_EXTERN_MUTEX(_m_) extern int _m_; +#define RF_DECLARE_COND(_c_) int _c_; +#define RF_DECLARE_STATIC_COND(_c_) static int _c_; +#define RF_DECLARE_EXTERN_COND(_c_) extern int _c_; + +extern int rf_mutex_init(int *m); +extern int rf_mutex_destroy(int *m); +extern int rf_cond_init(int *c); +extern int rf_cond_destroy(int *c); + +int rf_mutex_init(int *m); +int rf_mutex_destroy(int *m); +int _rf_create_managed_mutex(RF_ShutdownList_t **listp, int *m, char *file, int line); +int _rf_create_managed_cond(RF_ShutdownList_t **listp, int *m, char *file, int line); + +typedef void *RF_ThreadArg_t; /* the argument to a thread function */ + +#define RF_LOCK_MUTEX(_m_) +#define RF_UNLOCK_MUTEX(_m_) + +#define RF_WAIT_COND(_c_,_m_) +#define RF_SIGNAL_COND(_c_) +#define RF_BROADCAST_COND(_c_) + +#define RF_EXIT_THREAD(_status_) +#define RF_DELAY_THREAD(_secs_, _msecs_) + +#define RF_THREAD_ATTR_CREATE(_attr_) ; +#define RF_THREAD_ATTR_DELETE(_attr_) ; +#define RF_THREAD_ATTR_SETSTACKSIZE(_attr_,_sz_) ; +#define RF_THREAD_ATTR_SETSCHED(_attr_,_sched_) ; +#define RF_CREATE_ATTR_THREAD(_handle_, _attr_, _func_, _arg_) ; + +#define RF_CREATE_THREAD(_handle_, _func_, _arg_) 1 + +#endif /* SIMULATE */ + +struct RF_ThreadGroup_s { + int created; + int running; + int shutdown; + RF_DECLARE_MUTEX(mutex) + RF_DECLARE_COND(cond) +}; + +/* + * Someone has started a thread in the group + */ +#define RF_THREADGROUP_STARTED(_g_) { \ + RF_LOCK_MUTEX((_g_)->mutex); \ + (_g_)->created++; \ + RF_UNLOCK_MUTEX((_g_)->mutex); \ +} + +/* + * Thread announcing that it is now running + */ +#define 
RF_THREADGROUP_RUNNING(_g_) { \ + RF_LOCK_MUTEX((_g_)->mutex); \ + (_g_)->running++; \ + RF_UNLOCK_MUTEX((_g_)->mutex); \ + RF_SIGNAL_COND((_g_)->cond); \ +} + +/* + * Thread announcing that it is now done + */ +#define RF_THREADGROUP_DONE(_g_) { \ + RF_LOCK_MUTEX((_g_)->mutex); \ + (_g_)->shutdown++; \ + RF_UNLOCK_MUTEX((_g_)->mutex); \ + RF_SIGNAL_COND((_g_)->cond); \ +} + +/* + * Wait for all threads to start running + */ +#define RF_THREADGROUP_WAIT_START(_g_) { \ + RF_LOCK_MUTEX((_g_)->mutex); \ + while((_g_)->running < (_g_)->created) { \ + RF_WAIT_COND((_g_)->cond, (_g_)->mutex); \ + } \ + RF_UNLOCK_MUTEX((_g_)->mutex); \ +} + +/* + * Wait for all threads to stop running + */ +#if !defined(__NetBSD__) && !defined(__OpenBSD__) +#define RF_THREADGROUP_WAIT_STOP(_g_) { \ + RF_LOCK_MUTEX((_g_)->mutex); \ + RF_ASSERT((_g_)->running == (_g_)->created); \ + while((_g_)->shutdown < (_g_)->running) { \ + RF_WAIT_COND((_g_)->cond, (_g_)->mutex); \ + } \ + RF_UNLOCK_MUTEX((_g_)->mutex); \ +} +#else + /* XXX Note that we've removed the assert. That should get put back + in once we actually get something like a kernel thread running */ +#define RF_THREADGROUP_WAIT_STOP(_g_) { \ + RF_LOCK_MUTEX((_g_)->mutex); \ + while((_g_)->shutdown < (_g_)->running) { \ + RF_WAIT_COND((_g_)->cond, (_g_)->mutex); \ + } \ + RF_UNLOCK_MUTEX((_g_)->mutex); \ +} +#endif + +#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL) + +int rf_mutex_init(struct simplelock *); +int rf_mutex_destroy(struct simplelock *); +int _rf_create_managed_mutex(RF_ShutdownList_t **, struct simplelock *, + char *, int); +int _rf_create_managed_cond(RF_ShutdownList_t **listp, int *, + char *file, int line); + +int rf_cond_init(int *c); /* XXX need to write?? */ +int rf_cond_destroy(int *c); /* XXX need to write?? */ +#endif +#endif /* !_RF__RF_THREADSTUFF_H_ */ diff --git a/sys/dev/raidframe/rf_types.h b/sys/dev/raidframe/rf_types.h new file mode 100644 index 00000000000..6df3e9e5d78 --- /dev/null +++ b/sys/dev/raidframe/rf_types.h @@ -0,0 +1,583 @@ +/* $OpenBSD: rf_types.h,v 1.1 1999/01/11 14:29:54 niklas Exp $ */ +/* $NetBSD: rf_types.h,v 1.2 1998/11/16 04:14:10 mycroft Exp $ */ +/* + * rf_types.h + */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Jim Zelenka + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. 
+ */ +/*********************************************************** + * + * rf_types.h -- standard types for RAIDframe + * + ***********************************************************/ +/* + * : + * Log: rf_types.h,v + * Revision 1.35 1996/08/09 18:48:29 jimz + * correct mips definition + * + * Revision 1.34 1996/08/07 22:50:14 jimz + * monkey with linux includes to get a good compile + * + * Revision 1.33 1996/08/07 21:09:28 jimz + * add SGI mips stuff (note: 64-bit stuff may be wrong, I didn't have + * a machine to test on) + * + * Revision 1.32 1996/08/06 22:24:27 jimz + * add LINUX_I386 + * + * Revision 1.31 1996/07/31 16:30:12 jimz + * move in RF_LONGSHIFT + * + * Revision 1.30 1996/07/30 04:51:58 jimz + * ultrix port + * + * Revision 1.29 1996/07/29 16:37:34 jimz + * define DEC_OSF for osf/1 kernel + * + * Revision 1.28 1996/07/28 20:31:39 jimz + * i386netbsd port + * true/false fixup + * + * Revision 1.27 1996/07/27 23:36:08 jimz + * Solaris port of simulator + * + * Revision 1.26 1996/07/27 18:40:24 jimz + * cleanup sweep + * + * Revision 1.25 1996/07/22 19:52:16 jimz + * switched node params to RF_DagParam_t, a union of + * a 64-bit int and a void *, for better portability + * attempted hpux port, but failed partway through for + * lack of a single C compiler capable of compiling all + * source files + * + * Revision 1.24 1996/07/18 22:57:14 jimz + * port simulator to AIX + * + * Revision 1.23 1996/07/15 17:22:18 jimz + * nit-pick code cleanup + * resolve stdlib problems on DEC OSF + * + * Revision 1.22 1996/06/11 18:11:57 jimz + * add ThreadGroup + * + * Revision 1.21 1996/06/11 10:58:47 jimz + * add RF_ReconDoneProc_t + * + * Revision 1.20 1996/06/10 14:18:58 jimz + * move user, throughput stats into per-array structure + * + * Revision 1.19 1996/06/10 11:55:47 jimz + * Straightened out some per-array/not-per-array distinctions, fixed + * a couple bugs related to confusion. Added shutdown lists. Removed + * layout shutdown function (now subsumed by shutdown lists). + * + * Revision 1.18 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.17 1996/06/05 19:38:32 jimz + * fixed up disk queueing types config + * added sstf disk queueing + * fixed exit bug on diskthreads (ref-ing bad mem) + * + * Revision 1.16 1996/06/05 18:06:02 jimz + * Major code cleanup. The Great Renaming is now done. + * Better modularity. Better typing. Fixed a bunch of + * synchronization bugs. Made a lot of global stuff + * per-desc or per-array. Removed dead code. + * + * Revision 1.15 1996/06/03 23:28:26 jimz + * more bugfixes + * check in tree to sync for IPDS runs with current bugfixes + * there still may be a problem with threads in the script test + * getting I/Os stuck- not trivially reproducible (runs ~50 times + * in a row without getting stuck) + * + * Revision 1.14 1996/06/02 17:31:48 jimz + * Moved a lot of global stuff into array structure, where it belongs. + * Fixed up paritylogging, pss modules in this manner. Some general + * code cleanup. Removed lots of dead code, some dead files. + * + * Revision 1.13 1996/05/31 22:26:54 jimz + * fix a lot of mapping problems, memory allocation problems + * found some weird lock issues, fixed 'em + * more code cleanup + * + * Revision 1.12 1996/05/30 23:22:16 jimz + * bugfixes of serialization, timing problems + * more cleanup + * + * Revision 1.11 1996/05/30 11:29:41 jimz + * Numerous bug fixes. 
Stripe lock release code disagreed with the taking code + * about when stripes should be locked (I made it consistent: no parity, no lock) + * There was a lot of extra serialization of I/Os which I've removed- a lot of + * it was to calculate values for the cache code, which is no longer with us. + * More types, function, macro cleanup. Added code to properly quiesce the array + * on shutdown. Made a lot of stuff array-specific which was (bogusly) general + * before. Fixed memory allocation, freeing bugs. + * + * Revision 1.10 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.9 1996/05/24 22:17:04 jimz + * continue code + namespace cleanup + * typed a bunch of flags + * + * Revision 1.8 1996/05/24 04:28:55 jimz + * release cleanup ckpt + * + * Revision 1.7 1996/05/24 01:59:45 jimz + * another checkpoint in code cleanup for release + * time to sync kernel tree + * + * Revision 1.6 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.5 1996/05/23 00:33:23 jimz + * code cleanup: move all debug decls to rf_options.c, all extern + * debug decls to rf_options.h, all debug vars preceded by rf_ + * + * Revision 1.4 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.3 1996/05/10 16:22:46 jimz + * RF_offset -> RF_Offset + * add RF_SectorCount + * + * Revision 1.2 1996/05/02 14:58:50 jimz + * switch to _t for non-base-integral types + * + * Revision 1.1 1995/12/14 18:36:51 jimz + * Initial revision + * + */ + +#ifndef _RF__RF_TYPES_H_ +#define _RF__RF_TYPES_H_ + + +#ifdef _KERNEL +#define KERNEL +#endif + +#include "rf_archs.h" + +#ifndef KERNEL +#ifdef LINUX +#include <stdlib.h> +#include <sys/types.h> +#endif /* LINUX */ +#include <fcntl.h> +#include <stdio.h> + +#ifdef __osf__ +/* + * The following monkeying is to get around some problems with + * conflicting definitions in /usr/include/random.h and /usr/include/stdlib.h + * on Digital Unix. They + * (1) define the same symbols + * (2) differently than one another + * (3) also differently from the DU libc sources + * This loses, bad. + */ +#include <standards.h> +#include <cma.h> +#ifdef _OSF_SOURCE +#undef _OSF_SOURCE +#define _RF_SPANKME +#endif /* _OSF_SOURCE */ +#endif /* __osf__ */ +#include <stdlib.h> +#ifdef __osf__ +#ifdef _RF_SPANKME +#undef _RF_SPANKME +#define _OSF_SOURCE +#endif /* _RF_SPANKME */ +#endif /* __osf__ */ + +#include <string.h> +#include <unistd.h> +#endif /* !KERNEL */ +#include <sys/errno.h> +#include <sys/types.h> + +#ifdef AIX +#include <sys/stream.h> +#endif /* AIX */ + +#if defined(hpux) || defined(__hpux) +/* + * Yeah, we get one of hpux or __hpux, but not both. This is because + * HP didn't really want to provide an ANSI C compiler. Apparantly, they + * don't like standards. This explains a lot about their API. You might + * try using gcc, but you'll discover that it's sufficiently buggy that + * it can't even compile the core library. + * + * Hatred update: c89, the one thing which could both handle prototypes, + * and compile /usr/include/sys/timeout.h, can't do 64-bit ints. + * + * Note: the hpux port is incomplete. Why? Well, because I can't find + * a working C compiler. I've tried cc (both with and without -Ae), + * c89, and gcc, all with and without -D_HPUX_SOURCE. Sod it. 
+ * + * -Jim Zelenka, 22 July 1996 + */ +#ifndef hpux +#define hpux +#endif /* !hpux */ +#include <sys/hpibio.h> +#endif /* hpux || __hpux*/ + +#ifdef sun +#ifndef KERNEL +#include <errno.h> +#endif /* !KERNEL */ +#endif /* sun */ + +#if defined(OSF) && defined(__alpha) && defined(KERNEL) +#ifndef DEC_OSF +#define DEC_OSF +#endif /* !DEC_OSF */ +#endif /* OSF && __alpha && KERNEL */ + +#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(KERNEL) +#include <sys/uio.h> +#include <sys/param.h> +#include <sys/lock.h> + +/* XXX not sure about these... */ +/* #define PZERO 0 */ /* actually defined in <sys/param.h> */ +#define MS_LOCK_SIMPLE 1 + +#define TRUE 1 /* XXX why isn't this done somewhere already!! */ + +#endif /* (__NetBSD__ || __OpenBSD__) && KERNEL */ + +/* + * First, define system-dependent types and constants. + * + * If the machine is big-endian, RF_BIG_ENDIAN should be 1. + * Otherwise, it should be 0. + * + * The various integer types should be self-explanatory; we + * use these elsewhere to avoid size confusion. + * + * LONGSHIFT is lg(sizeof(long)) (that is, log base two of sizeof(long) + * + */ + +#if defined(__NetBSD__) || defined(__OpenBSD__) + +#include <sys/types.h> +#include <machine/endian.h> +#include <machine/limits.h> + +#if BYTE_ORDER == BIG_ENDIAN +#define RF_IS_BIG_ENDIAN 1 +#elif BYTE_ORDER == LITTLE_ENDIAN +#define RF_IS_BIG_ENDIAN 0 +#else +#error byte order not defined +#endif +typedef int8_t RF_int8; +typedef u_int8_t RF_uint8; +typedef int16_t RF_int16; +typedef u_int16_t RF_uint16; +typedef int32_t RF_int32; +typedef u_int32_t RF_uint32; +typedef int64_t RF_int64; +typedef u_int64_t RF_uint64; +#if LONG_BIT == 32 +#define RF_LONGSHIFT 2 +#elif LONG_BIT == 64 +#define RF_LONGSHIFT 3 +#else +#error word size not defined +#endif + +#else /* __NetBSD__ || __OpenBSD__ */ + +#ifdef __alpha +#define RF_IS_BIG_ENDIAN 0 +typedef signed char RF_int8; +typedef unsigned char RF_uint8; +typedef short RF_int16; +typedef unsigned short RF_uint16; +typedef int RF_int32; +typedef unsigned int RF_uint32; +typedef long RF_int64; +typedef unsigned long RF_uint64; +#define RF_LONGSHIFT 3 +#endif /* __alpha */ + +#ifdef _IBMR2 +#define RF_IS_BIG_ENDIAN 1 +typedef signed char RF_int8; +typedef unsigned char RF_uint8; +typedef short RF_int16; +typedef unsigned short RF_uint16; +typedef int RF_int32; +typedef unsigned int RF_uint32; +typedef long long RF_int64; +typedef unsigned long long RF_uint64; +#define RF_LONGSHIFT 2 +#endif /* _IBMR2 */ + +#ifdef hpux +#define RF_IS_BIG_ENDIAN 1 +typedef signed char RF_int8; +typedef unsigned char RF_uint8; +typedef short RF_int16; +typedef unsigned short RF_uint16; +typedef int RF_int32; +typedef unsigned int RF_uint32; +typedef long long RF_int64; +typedef unsigned long long RF_uint64; +#define RF_LONGSHIFT 2 +#endif /* hpux */ + +#ifdef sun +#define RF_IS_BIG_ENDIAN 1 +typedef char RF_int8; +typedef unsigned char RF_uint8; +typedef short RF_int16; +typedef unsigned short RF_uint16; +typedef int RF_int32; +typedef unsigned int RF_uint32; +typedef long long RF_int64; +typedef unsigned long long RF_uint64; +#define RF_LONGSHIFT 2 +#endif /* sun */ + +#if defined(NETBSD_I386) || defined(NETBSD_I386) || defined(LINUX_I386) +#define RF_IS_BIG_ENDIAN 0 +typedef char RF_int8; +typedef unsigned char RF_uint8; +typedef short RF_int16; +typedef unsigned short RF_uint16; +typedef int RF_int32; +typedef unsigned int RF_uint32; +typedef long long RF_int64; +typedef unsigned long long RF_uint64; +#define RF_LONGSHIFT 2 +#endif /* NETBSD_I386 || 
OPENBSD_I386 || LINUX_I386 */ + +#if defined(mips) && !defined(SGI) && !defined(__NetBSD__) && !defined(__OpenBSD__) +#define RF_IS_BIG_ENDIAN 0 +typedef char RF_int8; +typedef unsigned char RF_uint8; +typedef short RF_int16; +typedef unsigned short RF_uint16; +typedef int RF_int32; +typedef unsigned int RF_uint32; +typedef long long RF_int64; +typedef unsigned long long RF_uint64; +#define RF_LONGSHIFT 2 +#endif /* mips && !SGI */ + +#ifdef SGI +#if _MIPS_SZLONG == 64 +#define RF_IS_BIG_ENDIAN 1 +typedef signed char RF_int8; +typedef unsigned char RF_uint8; +typedef short RF_int16; +typedef unsigned short RF_uint16; +typedef int RF_int32; +typedef unsigned int RF_uint32; +typedef long RF_int64; +typedef unsigned long RF_uint64; +#define RF_LONGSHIFT 3 +#endif /* _MIPS_SZLONG == 64 */ +#if _MIPS_SZLONG == 32 +#define RF_IS_BIG_ENDIAN 1 +typedef char RF_int8; +typedef unsigned char RF_uint8; +typedef short RF_int16; +typedef unsigned short RF_uint16; +typedef int RF_int32; +typedef unsigned int RF_uint32; +typedef long long RF_int64; +typedef unsigned long long RF_uint64; +#define RF_LONGSHIFT 2 +#endif /* _MIPS_SZLONG == 32 */ +#endif /* SGI */ + +#endif /* __NetBSD__ || __OpenBSD__ */ + +/* + * These are just zero and non-zero. We don't use "TRUE" + * and "FALSE" because there's too much nonsense trying + * to get them defined exactly once on every platform, given + * the different places they may be defined in system header + * files. + */ +#define RF_TRUE 1 +#define RF_FALSE 0 + +/* + * Now, some generic types + */ +typedef RF_uint64 RF_IoCount_t; +typedef RF_uint64 RF_Offset_t; +typedef RF_uint32 RF_PSSFlags_t; +typedef RF_uint64 RF_SectorCount_t; +typedef RF_uint64 RF_StripeCount_t; +typedef RF_int64 RF_SectorNum_t; /* these are signed so we can set them to (-1) for "uninitialized" */ +typedef RF_int64 RF_StripeNum_t; +typedef RF_int64 RF_RaidAddr_t; +typedef int RF_RowCol_t; /* signed so it can be (-1) */ +typedef RF_int64 RF_HeadSepLimit_t; +typedef RF_int64 RF_ReconUnitCount_t; +typedef int RF_ReconUnitNum_t; + +typedef char RF_ParityConfig_t; + +typedef char RF_DiskQueueType_t[1024]; +#define RF_DISK_QUEUE_TYPE_NONE "" + +/* values for the 'type' field in a reconstruction buffer */ +typedef int RF_RbufType_t; +#define RF_RBUF_TYPE_EXCLUSIVE 0 /* this buf assigned exclusively to one disk */ +#define RF_RBUF_TYPE_FLOATING 1 /* this is a floating recon buf */ +#define RF_RBUF_TYPE_FORCED 2 /* this rbuf was allocated to complete a forced recon */ + +typedef char RF_IoType_t; +#define RF_IO_TYPE_READ 'r' +#define RF_IO_TYPE_WRITE 'w' +#define RF_IO_TYPE_NOP 'n' +#define RF_IO_IS_R_OR_W(_type_) (((_type_) == RF_IO_TYPE_READ) \ + || ((_type_) == RF_IO_TYPE_WRITE)) + +#ifdef SIMULATE +typedef double RF_TICS_t; +typedef int RF_Owner_t; +#endif /* SIMULATE */ + +typedef void (*RF_VoidFuncPtr)(void *,...); + +typedef RF_uint32 RF_AccessStripeMapFlags_t; +typedef RF_uint32 RF_DiskQueueDataFlags_t; +typedef RF_uint32 RF_DiskQueueFlags_t; +typedef RF_uint32 RF_RaidAccessFlags_t; + +#define RF_DISKQUEUE_DATA_FLAGS_NONE ((RF_DiskQueueDataFlags_t)0) + +typedef struct RF_AccessStripeMap_s RF_AccessStripeMap_t; +typedef struct RF_AccessStripeMapHeader_s RF_AccessStripeMapHeader_t; +typedef struct RF_AllocListElem_s RF_AllocListElem_t; +typedef struct RF_CallbackDesc_s RF_CallbackDesc_t; +typedef struct RF_ChunkDesc_s RF_ChunkDesc_t; +typedef struct RF_CommonLogData_s RF_CommonLogData_t; +typedef struct RF_Config_s RF_Config_t; +typedef struct RF_CumulativeStats_s RF_CumulativeStats_t; +typedef
struct RF_DagHeader_s RF_DagHeader_t; +typedef struct RF_DagList_s RF_DagList_t; +typedef struct RF_DagNode_s RF_DagNode_t; +typedef struct RF_DeclusteredConfigInfo_s RF_DeclusteredConfigInfo_t; +typedef struct RF_DiskId_s RF_DiskId_t; +typedef struct RF_DiskMap_s RF_DiskMap_t; +typedef struct RF_DiskQueue_s RF_DiskQueue_t; +typedef struct RF_DiskQueueData_s RF_DiskQueueData_t; +typedef struct RF_DiskQueueSW_s RF_DiskQueueSW_t; +typedef struct RF_Etimer_s RF_Etimer_t; +typedef struct RF_EventCreate_s RF_EventCreate_t; +typedef struct RF_FreeList_s RF_FreeList_t; +typedef struct RF_LockReqDesc_s RF_LockReqDesc_t; +typedef struct RF_LockTableEntry_s RF_LockTableEntry_t; +typedef struct RF_MCPair_s RF_MCPair_t; +typedef struct RF_OwnerInfo_s RF_OwnerInfo_t; +typedef struct RF_ParityLog_s RF_ParityLog_t; +typedef struct RF_ParityLogAppendQueue_s RF_ParityLogAppendQueue_t; +typedef struct RF_ParityLogData_s RF_ParityLogData_t; +typedef struct RF_ParityLogDiskQueue_s RF_ParityLogDiskQueue_t; +typedef struct RF_ParityLogQueue_s RF_ParityLogQueue_t; +typedef struct RF_ParityLogRecord_s RF_ParityLogRecord_t; +typedef struct RF_PerDiskReconCtrl_s RF_PerDiskReconCtrl_t; +typedef struct RF_PSStatusHeader_s RF_PSStatusHeader_t; +typedef struct RF_PhysDiskAddr_s RF_PhysDiskAddr_t; +typedef struct RF_PropHeader_s RF_PropHeader_t; +typedef struct RF_Raid_s RF_Raid_t; +typedef struct RF_RaidAccessDesc_s RF_RaidAccessDesc_t; +typedef struct RF_RaidDisk_s RF_RaidDisk_t; +typedef struct RF_RaidLayout_s RF_RaidLayout_t; +typedef struct RF_RaidReconDesc_s RF_RaidReconDesc_t; +typedef struct RF_ReconBuffer_s RF_ReconBuffer_t; +typedef struct RF_ReconConfig_s RF_ReconConfig_t; +typedef struct RF_ReconCtrl_s RF_ReconCtrl_t; +typedef struct RF_ReconDoneProc_s RF_ReconDoneProc_t; +typedef struct RF_ReconEvent_s RF_ReconEvent_t; +typedef struct RF_ReconMap_s RF_ReconMap_t; +typedef struct RF_ReconMapListElem_s RF_ReconMapListElem_t; +typedef struct RF_ReconParityStripeStatus_s RF_ReconParityStripeStatus_t; +typedef struct RF_RedFuncs_s RF_RedFuncs_t; +typedef struct RF_RegionBufferQueue_s RF_RegionBufferQueue_t; +typedef struct RF_RegionInfo_s RF_RegionInfo_t; +typedef struct RF_ShutdownList_s RF_ShutdownList_t; +typedef struct RF_SpareTableEntry_s RF_SpareTableEntry_t; +typedef struct RF_SparetWait_s RF_SparetWait_t; +typedef struct RF_StripeLockDesc_s RF_StripeLockDesc_t; +typedef struct RF_ThreadGroup_s RF_ThreadGroup_t; +typedef struct RF_ThroughputStats_s RF_ThroughputStats_t; + +/* + * Important assumptions regarding ordering of the states in this list + * have been made!!! 
+ * Before disturbing this ordering, look at code in rf_states.c + */ +typedef enum RF_AccessState_e { + /* original states */ + rf_QuiesceState, /* handles quiescence for reconstruction */ + rf_IncrAccessesCountState, /* count accesses in flight */ + rf_DecrAccessesCountState, + rf_MapState, /* map access to disk addresses */ + rf_LockState, /* take stripe locks */ + rf_CreateDAGState, /* create DAGs */ + rf_ExecuteDAGState, /* execute DAGs */ + rf_ProcessDAGState, /* DAGs are completing- check if correct, or if we need to retry */ + rf_CleanupState, /* release stripe locks, clean up */ + rf_LastState /* must be the last state */ +} RF_AccessState_t; + +#define RF_MAXROW 10 /* these are arbitrary and can be modified at will */ +#define RF_MAXCOL 40 +#define RF_MAXSPARE 10 +#define RF_MAXDBGV 75 /* max number of debug variables */ + +union RF_GenericParam_u { + void *p; + RF_uint64 v; +}; +typedef union RF_GenericParam_u RF_DagParam_t; +typedef union RF_GenericParam_u RF_CBParam_t; + +#endif /* _RF__RF_TYPES_H_ */ diff --git a/sys/dev/raidframe/rf_utils.c b/sys/dev/raidframe/rf_utils.c new file mode 100644 index 00000000000..be379ed8e58 --- /dev/null +++ b/sys/dev/raidframe/rf_utils.c @@ -0,0 +1,231 @@ +/* $OpenBSD: rf_utils.c,v 1.1 1999/01/11 14:29:54 niklas Exp $ */ +/* $NetBSD: rf_utils.c,v 1.1 1998/11/13 04:20:35 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes.
+ */ + +/**************************************** + * + * rf_utils.c -- various support routines + * + ****************************************/ + +/* : + * Log: rf_utils.c,v + * Revision 1.20 1996/07/27 23:36:08 jimz + * Solaris port of simulator + * + * Revision 1.19 1996/07/22 19:52:16 jimz + * switched node params to RF_DagParam_t, a union of + * a 64-bit int and a void *, for better portability + * attempted hpux port, but failed partway through for + * lack of a single C compiler capable of compiling all + * source files + * + * Revision 1.18 1996/07/15 17:22:18 jimz + * nit-pick code cleanup + * resolve stdlib problems on DEC OSF + * + * Revision 1.17 1996/06/09 02:36:46 jimz + * lots of little crufty cleanup- fixup whitespace + * issues, comment #ifdefs, improve typing in some + * places (esp size-related) + * + * Revision 1.16 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.15 1996/06/03 23:28:26 jimz + * more bugfixes + * check in tree to sync for IPDS runs with current bugfixes + * there still may be a problem with threads in the script test + * getting I/Os stuck- not trivially reproducible (runs ~50 times + * in a row without getting stuck) + * + * Revision 1.14 1996/06/02 17:31:48 jimz + * Moved a lot of global stuff into array structure, where it belongs. + * Fixed up paritylogging, pss modules in this manner. Some general + * code cleanup. Removed lots of dead code, some dead files. + * + * Revision 1.13 1996/05/27 18:56:37 jimz + * more code cleanup + * better typing + * compiles in all 3 environments + * + * Revision 1.12 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.11 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.10 1995/12/06 15:17:44 root + * added copyright info + * + */ + +#include "rf_threadstuff.h" + +#ifdef _KERNEL +#define KERNEL +#endif + +#ifndef KERNEL +#include <stdio.h> +#endif /* !KERNEL */ +#include <sys/time.h> + +#include "rf_threadid.h" +#include "rf_utils.h" +#include "rf_debugMem.h" +#include "rf_alloclist.h" +#include "rf_general.h" +#include "rf_sys.h" + +#ifndef KERNEL +#include "rf_randmacros.h" +#endif /* !KERNEL */ + +/* creates & zeros 2-d array with b rows and k columns (MCH) */ +RF_RowCol_t **rf_make_2d_array(b, k, allocList) + int b; + int k; + RF_AllocListElem_t *allocList; +{ + RF_RowCol_t **retval, i; + + RF_MallocAndAdd(retval, b * sizeof(RF_RowCol_t *), (RF_RowCol_t **), allocList); + for (i=0; i<b; i++) { + RF_MallocAndAdd(retval[i], k * sizeof(RF_RowCol_t), (RF_RowCol_t *), allocList); + (void) bzero((char *) retval[i], k*sizeof(RF_RowCol_t)); + } + return(retval); +} + +void rf_free_2d_array(a, b, k) + RF_RowCol_t **a; + int b; + int k; +{ + RF_RowCol_t i; + + for (i=0; i<b; i++) + RF_Free(a[i], k*sizeof(RF_RowCol_t)); + RF_Free(a, b*sizeof(RF_RowCol_t)); +} + + +/* creates & zeros a 1-d array with c columns */ +RF_RowCol_t *rf_make_1d_array(c, allocList) + int c; + RF_AllocListElem_t *allocList; +{ + RF_RowCol_t *retval; + + RF_MallocAndAdd(retval, c * sizeof(RF_RowCol_t), (RF_RowCol_t *), allocList); + (void) bzero((char *) retval, c*sizeof(RF_RowCol_t)); + return(retval); +} + +void rf_free_1d_array(a, n) + RF_RowCol_t *a; + int n; +{ + RF_Free(a, n * sizeof(RF_RowCol_t)); +} + +/* Euclid's algorithm: finds and returns the 
greatest common divisor + * between a and b. (MCH) + */ +int rf_gcd(m, n) + int m; + int n; +{ + int t; + + while (m>0) { + t = n % m; + n = m; + m = t; + } + return(n); +} + +#if !defined(KERNEL) && !defined(SIMULATE) && defined(__osf__) +/* this is used to generate a random number when _FASTRANDOM is off + * in randmacros.h + */ +long rf_do_random(rval, rdata) + long *rval; + struct random_data *rdata; +{ + int a, b; + long c; + /* + * random_r() generates random 32-bit values. OR them together. + */ + if (random_r(&a, rdata)!=0) { + fprintf(stderr,"Yikes! call to random_r failed\n"); + exit(1); + } + if (random_r(&b, rdata)!=0) { + fprintf(stderr,"Yikes! call to random_r failed\n"); + exit(1); + } + c = ((long)a)<<32; + *rval = c|b; + return(*rval); +} +#endif /* !KERNEL && !SIMULATE && __osf__ */ + +/* these convert between text and integer. Apparently the regular C macros + * for doing this are not available in the kernel + */ + +#define ISDIGIT(x) ( (x) >= '0' && (x) <= '9' ) +#define ISHEXCHAR(x) ( ((x) >= 'a' && (x) <= 'f') || ((x) >= 'A' && (x) <= 'F') ) +#define ISHEX(x) ( ISDIGIT(x) || ISHEXCHAR(x) ) +#define HC2INT(x) ( ((x) >= 'a' && (x) <= 'f') ? (x) - 'a' + 10 : \ + ( ((x) >= 'A' && (x) <= 'F') ? (x) - 'A' + 10 : (x - '0') ) ) + +int rf_atoi(p) + char *p; +{ + int val = 0, negate = 0; + + if (*p == '-') {negate=1; p++;} + for ( ; ISDIGIT(*p); p++) val = 10 * val + (*p - '0'); + return((negate) ? -val : val); +} + +int rf_htoi(p) + char *p; +{ + int val = 0; + for ( ; ISHEXCHAR(*p); p++) val = 16 * val + HC2INT(*p); + return(val); +} diff --git a/sys/dev/raidframe/rf_utils.h b/sys/dev/raidframe/rf_utils.h new file mode 100644 index 00000000000..73eede8f131 --- /dev/null +++ b/sys/dev/raidframe/rf_utils.h @@ -0,0 +1,90 @@ +/* $OpenBSD: rf_utils.h,v 1.1 1999/01/11 14:29:55 niklas Exp $ */ +/* $NetBSD: rf_utils.h,v 1.1 1998/11/13 04:20:35 oster Exp $ */ +/* + * Copyright (c) 1995 Carnegie-Mellon University. + * All rights reserved. + * + * Author: Mark Holland + * + * Permission to use, copy, modify and distribute this software and + * its documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND + * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +/*************************************** + * + * rf_utils.c -- header file for utils.c + * + ***************************************/ + +/* : + * Log: rf_utils.h,v + * Revision 1.7 1996/06/07 21:33:04 jimz + * begin using consistent types for sector numbers, + * stripe numbers, row+col numbers, recon unit numbers + * + * Revision 1.6 1996/06/02 17:31:48 jimz + * Moved a lot of global stuff into array structure, where it belongs. + * Fixed up paritylogging, pss modules in this manner. Some general + * code cleanup. Removed lots of dead code, some dead files. 
+ * + * Revision 1.5 1996/05/23 21:46:35 jimz + * checkpoint in code cleanup (release prep) + * lots of types, function names have been fixed + * + * Revision 1.4 1996/05/18 19:51:34 jimz + * major code cleanup- fix syntax, make some types consistent, + * add prototypes, clean out dead code, et cetera + * + * Revision 1.3 1995/12/06 15:17:53 root + * added copyright info + * + */ + +#ifndef _RF__RF_UTILS_H_ +#define _RF__RF_UTILS_H_ + +#include "rf_types.h" +#include "rf_alloclist.h" +#include "rf_threadstuff.h" + +char *rf_find_non_white(char *p); +char *rf_find_white(char *p); +RF_RowCol_t **rf_make_2d_array(int b, int k, RF_AllocListElem_t *allocList); +RF_RowCol_t *rf_make_1d_array(int c, RF_AllocListElem_t *allocList); +void rf_free_2d_array(RF_RowCol_t **a, int b, int k); +void rf_free_1d_array(RF_RowCol_t *a, int n); +int rf_gcd(int m, int n); +int rf_atoi(char *p); +int rf_htoi(char *p); + +#define RF_USEC_PER_SEC 1000000 +#define RF_TIMEVAL_DIFF(_start_,_end_,_diff_) { \ + if ((_end_)->tv_usec < (_start_)->tv_usec) { \ + (_diff_)->tv_usec = ((_end_)->tv_usec + RF_USEC_PER_SEC) \ + - (_start_)->tv_usec; \ + (_diff_)->tv_sec = ((_end_)->tv_sec-1) - (_start_)->tv_sec; \ + } \ + else { \ + (_diff_)->tv_usec = (_end_)->tv_usec - (_start_)->tv_usec; \ + (_diff_)->tv_sec = (_end_)->tv_sec - (_start_)->tv_sec; \ + } \ +} + +#endif /* !_RF__RF_UTILS_H_ */
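The RF_THREADGROUP_* macros in rf_threadstuff.h above implement a small start/stop barrier: the creator counts each thread it launches, every thread bumps a running counter when it comes up and a shutdown counter when it exits, and the creator sleeps on the group's condition variable until the counters match. The following is a minimal user-level sketch of that counting protocol using plain pthreads rather than RAIDframe's wrappers; worker(), NWORKERS, and the final printf are invented for illustration and are not part of the import.

#include <pthread.h>
#include <stdio.h>

#define NWORKERS 3	/* invented for the example */

/* Same three counters as struct RF_ThreadGroup_s, with plain pthread locking. */
struct threadgroup {
	int created;	/* threads the creator has launched */
	int running;	/* threads that have announced startup */
	int shutdown;	/* threads that have announced exit */
	pthread_mutex_t mutex;
	pthread_cond_t cond;
};

static struct threadgroup g = {
	0, 0, 0, PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER
};

static void *
worker(void *arg)
{
	(void)arg;

	pthread_mutex_lock(&g.mutex);
	g.running++;			/* RF_THREADGROUP_RUNNING */
	pthread_cond_signal(&g.cond);
	pthread_mutex_unlock(&g.mutex);

	/* ... the thread's real work would go here ... */

	pthread_mutex_lock(&g.mutex);
	g.shutdown++;			/* RF_THREADGROUP_DONE */
	pthread_cond_signal(&g.cond);
	pthread_mutex_unlock(&g.mutex);
	return (NULL);
}

int
main(void)
{
	pthread_t tid[NWORKERS];
	int i;

	for (i = 0; i < NWORKERS; i++) {
		pthread_create(&tid[i], NULL, worker, NULL);
		pthread_mutex_lock(&g.mutex);
		g.created++;		/* RF_THREADGROUP_STARTED */
		pthread_mutex_unlock(&g.mutex);
	}

	pthread_mutex_lock(&g.mutex);
	/* RF_THREADGROUP_WAIT_START: sleep until every started thread is running */
	while (g.running < g.created)
		pthread_cond_wait(&g.cond, &g.mutex);
	/* RF_THREADGROUP_WAIT_STOP: sleep until every running thread has finished */
	while (g.shutdown < g.running)
		pthread_cond_wait(&g.cond, &g.mutex);
	pthread_mutex_unlock(&g.mutex);

	for (i = 0; i < NWORKERS; i++)
		pthread_join(tid[i], NULL);
	printf("%d threads started and stopped\n", g.created);
	return (0);
}

In the kernel build the same predicates are re-checked after every wakeup, which is why the missing wakeup_one() on Net- and OpenBSD only costs spurious wakeups, not correctness.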
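rf_types.h hard-codes RF_LONGSHIFT as lg(sizeof(long)) for each platform: 2 where long is 4 bytes, 3 where it is 8. The throwaway user-space check below computes the same value at run time; it is only a sanity aid when porting to a platform not listed above, not part of the import.

#include <stdio.h>

int
main(void)
{
	/* RF_LONGSHIFT is lg(sizeof(long)): a 4-byte long gives 2, an 8-byte long gives 3 */
	unsigned long size = sizeof(long);
	int shift = 0;

	while (size > 1) {
		size >>= 1;
		shift++;
	}
	printf("sizeof(long) = %lu, so RF_LONGSHIFT here would be %d\n",
	    (unsigned long)sizeof(long), shift);
	return (0);
}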
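rf_make_2d_array in rf_utils.c allocates a block of b row pointers, then a zeroed block of k entries per row, and rf_free_2d_array releases the rows before the pointer block. The sketch below shows the same shape with plain malloc/free, leaving out RAIDframe's allocation-list bookkeeping and debug-memory accounting; make_2d_array and free_2d_array here are illustrative stand-ins, not the imported functions.

#include <stdlib.h>
#include <string.h>

/* b rows of k zeroed entries: one allocation for the row pointers, one per row. */
static int **
make_2d_array(int b, int k)
{
	int **a;
	int i;

	if ((a = malloc(b * sizeof(int *))) == NULL)
		return (NULL);
	for (i = 0; i < b; i++) {
		if ((a[i] = malloc(k * sizeof(int))) == NULL) {
			while (i-- > 0)
				free(a[i]);
			free(a);
			return (NULL);
		}
		memset(a[i], 0, k * sizeof(int));
	}
	return (a);
}

static void
free_2d_array(int **a, int b)
{
	int i;

	for (i = 0; i < b; i++)
		free(a[i]);
	free(a);
}

int
main(void)
{
	int **a = make_2d_array(3, 5);

	if (a != NULL) {
		a[2][4] = 7;		/* rows and columns are both zero-based */
		free_2d_array(a, 3);
	}
	return (0);
}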
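rf_gcd is Euclid's algorithm by repeated remainder: while m is non-zero, replace (m, n) with (n mod m, m); when m reaches zero, n holds the greatest common divisor. A standalone copy of that loop with one worked value follows; gcd() is a local stand-in for rf_gcd().

#include <stdio.h>

/* Euclid's algorithm, the same loop as rf_gcd() in rf_utils.c above. */
static int
gcd(int m, int n)
{
	int t;

	while (m > 0) {
		t = n % m;
		n = m;
		m = t;
	}
	return (n);
}

int
main(void)
{
	/* 54 = 2 * 3^3 and 24 = 2^3 * 3, so the gcd is 2 * 3 = 6 */
	printf("gcd(54, 24) = %d\n", gcd(54, 24));
	return (0);
}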
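RF_TIMEVAL_DIFF in rf_utils.h subtracts two struct timeval values, borrowing one second whenever the end microseconds are smaller than the start microseconds. A self-contained sketch follows, with the macro reproduced from the header and made-up sample times; it prints "1 sec 200000 usec".

#include <stdio.h>
#include <sys/time.h>

#define RF_USEC_PER_SEC 1000000
#define RF_TIMEVAL_DIFF(_start_,_end_,_diff_) { \
	if ((_end_)->tv_usec < (_start_)->tv_usec) { \
		(_diff_)->tv_usec = ((_end_)->tv_usec + RF_USEC_PER_SEC) \
			- (_start_)->tv_usec; \
		(_diff_)->tv_sec = ((_end_)->tv_sec-1) - (_start_)->tv_sec; \
	} \
	else { \
		(_diff_)->tv_usec = (_end_)->tv_usec - (_start_)->tv_usec; \
		(_diff_)->tv_sec = (_end_)->tv_sec - (_start_)->tv_sec; \
	} \
}

int
main(void)
{
	/* 12.100000 - 10.900000 = 1.200000: the borrow branch is taken */
	struct timeval start = { 10, 900000 };
	struct timeval end = { 12, 100000 };
	struct timeval diff;

	RF_TIMEVAL_DIFF(&start, &end, &diff);
	printf("%ld sec %ld usec\n", (long)diff.tv_sec, (long)diff.tv_usec);
	return (0);
}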