author     Niklas Hallqvist <niklas@cvs.openbsd.org>   1999-01-11 14:29:56 +0000
committer  Niklas Hallqvist <niklas@cvs.openbsd.org>   1999-01-11 14:29:56 +0000
commit     5a29b52d01b420bb61a3112d2d44740a0fa99601 (patch)
tree       7d6238740f53a56f5c76ba8256c785b13caaa24a /sys/dev/raidframe
parent     799a3ea9a9c07e091f5f4e62273c6f105cf86191 (diff)
Import of CMU's RAIDframe via NetBSD.
Diffstat (limited to 'sys/dev/raidframe')
-rw-r--r--  sys/dev/raidframe/rf_acctrace.c | 295
-rw-r--r--  sys/dev/raidframe/rf_acctrace.h | 196
-rw-r--r--  sys/dev/raidframe/rf_alloclist.c | 294
-rw-r--r--  sys/dev/raidframe/rf_alloclist.h | 84
-rw-r--r--  sys/dev/raidframe/rf_archs.h | 211
-rw-r--r--  sys/dev/raidframe/rf_aselect.c | 618
-rw-r--r--  sys/dev/raidframe/rf_aselect.h | 60
-rw-r--r--  sys/dev/raidframe/rf_callback.c | 121
-rw-r--r--  sys/dev/raidframe/rf_callback.h | 92
-rw-r--r--  sys/dev/raidframe/rf_ccmn.h | 115
-rw-r--r--  sys/dev/raidframe/rf_chaindecluster.c | 382
-rw-r--r--  sys/dev/raidframe/rf_chaindecluster.h | 123
-rw-r--r--  sys/dev/raidframe/rf_configure.h | 127
-rw-r--r--  sys/dev/raidframe/rf_copyback.c | 577
-rw-r--r--  sys/dev/raidframe/rf_copyback.h | 88
-rw-r--r--  sys/dev/raidframe/rf_cpuutil.c | 195
-rw-r--r--  sys/dev/raidframe/rf_cpuutil.h | 57
-rw-r--r--  sys/dev/raidframe/rf_cvscan.c | 450
-rw-r--r--  sys/dev/raidframe/rf_cvscan.h | 97
-rw-r--r--  sys/dev/raidframe/rf_dag.h | 320
-rw-r--r--  sys/dev/raidframe/rf_dagdegrd.c | 1212
-rw-r--r--  sys/dev/raidframe/rf_dagdegrd.h | 88
-rw-r--r--  sys/dev/raidframe/rf_dagdegwr.c | 969
-rw-r--r--  sys/dev/raidframe/rf_dagdegwr.h | 81
-rw-r--r--  sys/dev/raidframe/rf_dagffrd.c | 500
-rw-r--r--  sys/dev/raidframe/rf_dagffrd.h | 75
-rw-r--r--  sys/dev/raidframe/rf_dagffwr.c | 2202
-rw-r--r--  sys/dev/raidframe/rf_dagffwr.h | 103
-rw-r--r--  sys/dev/raidframe/rf_dagflags.h | 86
-rw-r--r--  sys/dev/raidframe/rf_dagfuncs.c | 1050
-rw-r--r--  sys/dev/raidframe/rf_dagfuncs.h | 138
-rw-r--r--  sys/dev/raidframe/rf_dagutils.c | 1406
-rw-r--r--  sys/dev/raidframe/rf_dagutils.h | 192
-rw-r--r--  sys/dev/raidframe/rf_debugMem.c | 578
-rw-r--r--  sys/dev/raidframe/rf_debugMem.h | 263
-rw-r--r--  sys/dev/raidframe/rf_debugprint.c | 186
-rw-r--r--  sys/dev/raidframe/rf_debugprint.h | 64
-rw-r--r--  sys/dev/raidframe/rf_decluster.c | 847
-rw-r--r--  sys/dev/raidframe/rf_decluster.h | 182
-rw-r--r--  sys/dev/raidframe/rf_declusterPQ.c | 589
-rw-r--r--  sys/dev/raidframe/rf_declusterPQ.h | 100
-rw-r--r--  sys/dev/raidframe/rf_demo.c | 506
-rw-r--r--  sys/dev/raidframe/rf_demo.h | 83
-rw-r--r--  sys/dev/raidframe/rf_desc.h | 181
-rw-r--r--  sys/dev/raidframe/rf_diskevent.c | 291
-rw-r--r--  sys/dev/raidframe/rf_diskevent.h | 97
-rw-r--r--  sys/dev/raidframe/rf_diskqueue.c | 929
-rw-r--r--  sys/dev/raidframe/rf_diskqueue.h | 315
-rw-r--r--  sys/dev/raidframe/rf_disks.c | 651
-rw-r--r--  sys/dev/raidframe/rf_disks.h | 161
-rw-r--r--  sys/dev/raidframe/rf_diskthreads.h | 103
-rw-r--r--  sys/dev/raidframe/rf_driver.c | 1765
-rw-r--r--  sys/dev/raidframe/rf_driver.h | 126
-rw-r--r--  sys/dev/raidframe/rf_engine.c | 1096
-rw-r--r--  sys/dev/raidframe/rf_engine.h | 75
-rw-r--r--  sys/dev/raidframe/rf_etimer.h | 353
-rw-r--r--  sys/dev/raidframe/rf_evenodd.c | 556
-rw-r--r--  sys/dev/raidframe/rf_evenodd.h | 49
-rw-r--r--  sys/dev/raidframe/rf_evenodd_dagfuncs.c | 887
-rw-r--r--  sys/dev/raidframe/rf_evenodd_dagfuncs.h | 77
-rw-r--r--  sys/dev/raidframe/rf_evenodd_dags.c | 199
-rw-r--r--  sys/dev/raidframe/rf_evenodd_dags.h | 64
-rw-r--r--  sys/dev/raidframe/rf_fifo.c | 371
-rw-r--r--  sys/dev/raidframe/rf_fifo.h | 115
-rw-r--r--  sys/dev/raidframe/rf_freelist.h | 734
-rw-r--r--  sys/dev/raidframe/rf_general.h | 269
-rw-r--r--  sys/dev/raidframe/rf_geniq.c | 199
-rw-r--r--  sys/dev/raidframe/rf_geometry.c | 891
-rw-r--r--  sys/dev/raidframe/rf_geometry.h | 155
-rw-r--r--  sys/dev/raidframe/rf_heap.c | 274
-rw-r--r--  sys/dev/raidframe/rf_heap.h | 128
-rw-r--r--  sys/dev/raidframe/rf_hist.h | 73
-rw-r--r--  sys/dev/raidframe/rf_interdecluster.c | 361
-rw-r--r--  sys/dev/raidframe/rf_interdecluster.h | 112
-rw-r--r--  sys/dev/raidframe/rf_invertq.c | 55
-rw-r--r--  sys/dev/raidframe/rf_invertq.h | 73
-rw-r--r--  sys/dev/raidframe/rf_kintf.h | 71
-rw-r--r--  sys/dev/raidframe/rf_layout.c | 720
-rw-r--r--  sys/dev/raidframe/rf_layout.h | 493
-rw-r--r--  sys/dev/raidframe/rf_map.c | 976
-rw-r--r--  sys/dev/raidframe/rf_map.h | 134
-rw-r--r--  sys/dev/raidframe/rf_mcpair.c | 200
-rw-r--r--  sys/dev/raidframe/rf_mcpair.h | 62
-rw-r--r--  sys/dev/raidframe/rf_memchunk.c | 256
-rw-r--r--  sys/dev/raidframe/rf_memchunk.h | 80
-rw-r--r--  sys/dev/raidframe/rf_netbsd.h | 98
-rw-r--r--  sys/dev/raidframe/rf_netbsdkintf.c | 2048
-rw-r--r--  sys/dev/raidframe/rf_nwayxor.c | 454
-rw-r--r--  sys/dev/raidframe/rf_nwayxor.h | 75
-rw-r--r--  sys/dev/raidframe/rf_openbsd.h | 94
-rw-r--r--  sys/dev/raidframe/rf_openbsdkintf.c | 2033
-rw-r--r--  sys/dev/raidframe/rf_options.c | 85
-rw-r--r--  sys/dev/raidframe/rf_options.h | 68
-rw-r--r--  sys/dev/raidframe/rf_optnames.h | 144
-rw-r--r--  sys/dev/raidframe/rf_owner.h | 75
-rw-r--r--  sys/dev/raidframe/rf_paritylog.c | 1022
-rw-r--r--  sys/dev/raidframe/rf_paritylog.h | 225
-rw-r--r--  sys/dev/raidframe/rf_paritylogDiskMgr.c | 790
-rw-r--r--  sys/dev/raidframe/rf_paritylogDiskMgr.h | 63
-rw-r--r--  sys/dev/raidframe/rf_paritylogging.c | 1088
-rw-r--r--  sys/dev/raidframe/rf_paritylogging.h | 137
-rw-r--r--  sys/dev/raidframe/rf_parityloggingdags.c | 752
-rw-r--r--  sys/dev/raidframe/rf_parityloggingdags.h | 94
-rw-r--r--  sys/dev/raidframe/rf_parityscan.c | 553
-rw-r--r--  sys/dev/raidframe/rf_parityscan.h | 118
-rw-r--r--  sys/dev/raidframe/rf_pq.c | 1026
-rw-r--r--  sys/dev/raidframe/rf_pq.h | 115
-rw-r--r--  sys/dev/raidframe/rf_pqdeg.c | 286
-rw-r--r--  sys/dev/raidframe/rf_pqdeg.h | 93
-rw-r--r--  sys/dev/raidframe/rf_pqdegdags.c | 554
-rw-r--r--  sys/dev/raidframe/rf_pqdegdags.h | 77
-rw-r--r--  sys/dev/raidframe/rf_psstatus.c | 417
-rw-r--r--  sys/dev/raidframe/rf_psstatus.h | 154
-rw-r--r--  sys/dev/raidframe/rf_raid.h | 437
-rw-r--r--  sys/dev/raidframe/rf_raid0.c | 242
-rw-r--r--  sys/dev/raidframe/rf_raid0.h | 111
-rw-r--r--  sys/dev/raidframe/rf_raid1.c | 881
-rw-r--r--  sys/dev/raidframe/rf_raid1.h | 130
-rw-r--r--  sys/dev/raidframe/rf_raid4.c | 225
-rw-r--r--  sys/dev/raidframe/rf_raid4.h | 109
-rw-r--r--  sys/dev/raidframe/rf_raid5.c | 403
-rw-r--r--  sys/dev/raidframe/rf_raid5.h | 113
-rw-r--r--  sys/dev/raidframe/rf_raid5_rotatedspare.c | 250
-rw-r--r--  sys/dev/raidframe/rf_raid5_rotatedspare.h | 105
-rw-r--r--  sys/dev/raidframe/rf_raidframe.h | 165
-rw-r--r--  sys/dev/raidframe/rf_randmacros.h | 228
-rw-r--r--  sys/dev/raidframe/rf_reconbuffer.c | 538
-rw-r--r--  sys/dev/raidframe/rf_reconbuffer.h | 98
-rw-r--r--  sys/dev/raidframe/rf_reconmap.c | 459
-rw-r--r--  sys/dev/raidframe/rf_reconmap.h | 114
-rw-r--r--  sys/dev/raidframe/rf_reconstruct.c | 1595
-rw-r--r--  sys/dev/raidframe/rf_reconstruct.h | 258
-rw-r--r--  sys/dev/raidframe/rf_reconstub.c | 88
-rw-r--r--  sys/dev/raidframe/rf_reconutil.c | 408
-rw-r--r--  sys/dev/raidframe/rf_reconutil.h | 96
-rw-r--r--  sys/dev/raidframe/rf_revent.c | 306
-rw-r--r--  sys/dev/raidframe/rf_revent.h | 82
-rw-r--r--  sys/dev/raidframe/rf_rst.h | 78
-rw-r--r--  sys/dev/raidframe/rf_shutdown.c | 114
-rw-r--r--  sys/dev/raidframe/rf_shutdown.h | 68
-rw-r--r--  sys/dev/raidframe/rf_sstf.c | 717
-rw-r--r--  sys/dev/raidframe/rf_sstf.h | 90
-rw-r--r--  sys/dev/raidframe/rf_states.c | 873
-rw-r--r--  sys/dev/raidframe/rf_states.h | 70
-rw-r--r--  sys/dev/raidframe/rf_stripelocks.c | 642
-rw-r--r--  sys/dev/raidframe/rf_stripelocks.h | 170
-rw-r--r--  sys/dev/raidframe/rf_strutils.c | 62
-rw-r--r--  sys/dev/raidframe/rf_sys.c | 260
-rw-r--r--  sys/dev/raidframe/rf_sys.h | 69
-rw-r--r--  sys/dev/raidframe/rf_threadid.h | 230
-rw-r--r--  sys/dev/raidframe/rf_threadstuff.c | 477
-rw-r--r--  sys/dev/raidframe/rf_threadstuff.h | 465
-rw-r--r--  sys/dev/raidframe/rf_types.h | 583
-rw-r--r--  sys/dev/raidframe/rf_utils.c | 231
-rw-r--r--  sys/dev/raidframe/rf_utils.h | 90
155 files changed, 56247 insertions, 0 deletions
diff --git a/sys/dev/raidframe/rf_acctrace.c b/sys/dev/raidframe/rf_acctrace.c
new file mode 100644
index 00000000000..8e3c7a9b26a
--- /dev/null
+++ b/sys/dev/raidframe/rf_acctrace.c
@@ -0,0 +1,295 @@
+/* $OpenBSD: rf_acctrace.c,v 1.1 1999/01/11 14:28:58 niklas Exp $ */
+/* $NetBSD: rf_acctrace.c,v 1.1 1998/11/13 04:20:26 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*****************************************************************************
+ *
+ * acctrace.c -- code to support collecting information about each access
+ *
+ *****************************************************************************/
+
+/* :
+ * Log: rf_acctrace.c,v
+ * Revision 1.29 1996/07/27 23:36:08 jimz
+ * Solaris port of simulator
+ *
+ * Revision 1.28 1996/07/17 21:00:58 jimz
+ * clean up timer interface, tracing
+ *
+ * Revision 1.27 1996/06/14 14:35:24 jimz
+ * clean up dfstrace protection
+ *
+ * Revision 1.26 1996/06/13 19:09:04 jimz
+ * remove trace.dat file before beginning
+ *
+ * Revision 1.25 1996/06/12 04:41:26 jimz
+ * tweaks to make genplot work with user-level driver
+ * (mainly change stat collection)
+ *
+ * Revision 1.24 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.23 1996/06/09 02:36:46 jimz
+ * lots of little crufty cleanup- fixup whitespace
+ * issues, comment #ifdefs, improve typing in some
+ * places (esp size-related)
+ *
+ * Revision 1.22 1996/06/05 18:06:02 jimz
+ * Major code cleanup. The Great Renaming is now done.
+ * Better modularity. Better typing. Fixed a bunch of
+ * synchronization bugs. Made a lot of global stuff
+ * per-desc or per-array. Removed dead code.
+ *
+ * Revision 1.21 1996/05/31 22:26:54 jimz
+ * fix a lot of mapping problems, memory allocation problems
+ * found some weird lock issues, fixed 'em
+ * more code cleanup
+ *
+ * Revision 1.20 1996/05/30 23:22:16 jimz
+ * bugfixes of serialization, timing problems
+ * more cleanup
+ *
+ * Revision 1.19 1996/05/30 12:59:18 jimz
+ * make etimer happier, more portable
+ *
+ * Revision 1.18 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.17 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.16 1996/05/20 16:15:49 jimz
+ * switch to rf_{mutex,cond}_{init,destroy}
+ *
+ * Revision 1.15 1996/05/18 20:10:00 jimz
+ * bit of cleanup to compile cleanly in kernel, once again
+ *
+ * Revision 1.14 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.13 1995/11/30 16:26:43 wvcii
+ * added copyright info
+ *
+ */
+
+#ifdef _KERNEL
+#define KERNEL
+#endif
+
+#include "rf_threadstuff.h"
+#include "rf_types.h"
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#ifdef KERNEL
+#if !defined(__NetBSD__) && !defined(__OpenBSD__)
+#include <dfstrace.h>
+#endif /* !__NetBSD__ && !__OpenBSD__ */
+#if DFSTRACE > 0
+#include <sys/dfs_log.h>
+#include <sys/dfstracebuf.h>
+#endif /* DFSTRACE > 0 */
+#endif /* KERNEL */
+
+#include "rf_debugMem.h"
+#include "rf_acctrace.h"
+#include "rf_general.h"
+#include "rf_raid.h"
+#include "rf_etimer.h"
+#include "rf_hist.h"
+#include "rf_shutdown.h"
+#include "rf_sys.h"
+
+static long numTracesSoFar;
+static int accessTraceBufCount = 0;
+static RF_AccTraceEntry_t *access_tracebuf;
+static long traceCount;
+
+int rf_stopCollectingTraces;
+RF_DECLARE_MUTEX(rf_tracing_mutex)
+int rf_trace_fd;
+
+static void rf_ShutdownAccessTrace(void *);
+
+static void rf_ShutdownAccessTrace(ignored)
+ void *ignored;
+{
+ if (rf_accessTraceBufSize) {
+ if (accessTraceBufCount) rf_FlushAccessTraceBuf();
+#ifndef KERNEL
+ close(rf_trace_fd);
+#endif /* !KERNEL */
+ RF_Free(access_tracebuf, rf_accessTraceBufSize * sizeof(RF_AccTraceEntry_t));
+ }
+ rf_mutex_destroy(&rf_tracing_mutex);
+#if defined(KERNEL) && DFSTRACE > 0
+ printf("RAIDFRAME: %d trace entries were sent to dfstrace\n",traceCount);
+#endif /* KERNEL && DFSTRACE > 0 */
+}
+
+int rf_ConfigureAccessTrace(listp)
+ RF_ShutdownList_t **listp;
+{
+ int rc;
+
+ numTracesSoFar = accessTraceBufCount = rf_stopCollectingTraces = 0;
+ if (rf_accessTraceBufSize) {
+ RF_Malloc(access_tracebuf, rf_accessTraceBufSize * sizeof(RF_AccTraceEntry_t), (RF_AccTraceEntry_t *));
+ accessTraceBufCount = 0;
+#ifndef KERNEL
+ rc = unlink("trace.dat");
+ if (rc && (errno != ENOENT)) {
+ perror("unlink");
+ RF_ERRORMSG("Unable to remove existing trace.dat\n");
+ return(errno);
+ }
+ if ((rf_trace_fd = open("trace.dat",O_WRONLY|O_CREAT|O_TRUNC, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH)) < 0 ) {
+ perror("Unable to open trace.dat for output");
+ return(errno);
+ }
+#endif /* !KERNEL */
+ }
+ traceCount = 0;
+ numTracesSoFar = 0;
+ rc = rf_mutex_init(&rf_tracing_mutex);
+ if (rc) {
+ RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ }
+ rc = rf_ShutdownCreate(listp, rf_ShutdownAccessTrace, NULL);
+ if (rc) {
+ RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ if (rf_accessTraceBufSize) {
+ RF_Free(access_tracebuf, rf_accessTraceBufSize * sizeof(RF_AccTraceEntry_t));
+#ifndef KERNEL
+ close(rf_trace_fd);
+#endif /* !KERNEL */
+ rf_mutex_destroy(&rf_tracing_mutex);
+ }
+ }
+ return(rc);
+}
+
+/* install a trace record. cause a flush to disk or to the trace collector daemon
+ * if the trace buffer is at least 1/2 full.
+ */
+void rf_LogTraceRec(raid, rec)
+ RF_Raid_t *raid;
+ RF_AccTraceEntry_t *rec;
+{
+ RF_AccTotals_t *acc = &raid->acc_totals;
+#if 0
+ RF_Etimer_t timer;
+ int i, n;
+#endif
+
+ if (rf_stopCollectingTraces || ((rf_maxNumTraces >= 0) && (numTracesSoFar >= rf_maxNumTraces)))
+ return;
+
+#ifndef KERNEL
+ if (rf_accessTraceBufSize) {
+ RF_LOCK_MUTEX(rf_tracing_mutex);
+ numTracesSoFar++;
+ bcopy((char *)rec, (char *)&access_tracebuf[ accessTraceBufCount++ ], sizeof(RF_AccTraceEntry_t));
+ if (accessTraceBufCount == rf_accessTraceBufSize)
+ rf_FlushAccessTraceBuf();
+ RF_UNLOCK_MUTEX(rf_tracing_mutex);
+ }
+#endif /* !KERNEL */
+#if defined(KERNEL) && DFSTRACE > 0
+ rec->index = traceCount++;
+ if (traceon & DFS_TRACE_RAIDFRAME) {
+ dfs_log(DFS_NOTE, (char *) rec, (int) sizeof(*rec), 0);
+ }
+#endif /* KERNEL && DFSTRACE > 0 */
+ /* update AccTotals for this device */
+ if (!raid->keep_acc_totals)
+ return;
+ acc->num_log_ents++;
+ if (rec->reconacc) {
+ acc->recon_start_to_fetch_us += rec->specific.recon.recon_start_to_fetch_us;
+ acc->recon_fetch_to_return_us += rec->specific.recon.recon_fetch_to_return_us;
+ acc->recon_return_to_submit_us += rec->specific.recon.recon_return_to_submit_us;
+ acc->recon_num_phys_ios += rec->num_phys_ios;
+ acc->recon_phys_io_us += rec->phys_io_us;
+ acc->recon_diskwait_us += rec->diskwait_us;
+ acc->recon_reccount++;
+ }
+ else {
+ RF_HIST_ADD(acc->tot_hist, rec->total_us);
+ RF_HIST_ADD(acc->dw_hist, rec->diskwait_us);
+ /* count of physical ios which are too big. often due to thermal recalibration */
+ /* if bigvals > 0, you should probably ignore this data set */
+ if (rec->diskwait_us > 100000)
+ acc->bigvals++;
+ acc->total_us += rec->total_us;
+ acc->suspend_ovhd_us += rec->specific.user.suspend_ovhd_us;
+ acc->map_us += rec->specific.user.map_us;
+ acc->lock_us += rec->specific.user.lock_us;
+ acc->dag_create_us += rec->specific.user.dag_create_us;
+ acc->dag_retry_us += rec->specific.user.dag_retry_us;
+ acc->exec_us += rec->specific.user.exec_us;
+ acc->cleanup_us += rec->specific.user.cleanup_us;
+ acc->exec_engine_us += rec->specific.user.exec_engine_us;
+ acc->xor_us += rec->xor_us;
+ acc->q_us += rec->q_us;
+ acc->plog_us += rec->plog_us;
+ acc->diskqueue_us += rec->diskqueue_us;
+ acc->diskwait_us += rec->diskwait_us;
+ acc->num_phys_ios += rec->num_phys_ios;
+ acc->phys_io_us += rec->phys_io_us;
+ acc->user_reccount++;
+ }
+}
+
+
+/* assumes the tracing mutex is locked at entry. In order to allow this to be called
+ * from interrupt context, we don't do any copyouts here, but rather just wake the
+ * trace buffer collector thread.
+ */
+void rf_FlushAccessTraceBuf()
+{
+#ifndef KERNEL
+ int size = accessTraceBufCount * sizeof(RF_AccTraceEntry_t);
+
+ if (write(rf_trace_fd, (char *) access_tracebuf, size) < size ) {
+ fprintf(stderr, "Unable to write traces to file. tracing disabled\n");
+ RF_Free(access_tracebuf, rf_accessTraceBufSize * sizeof(RF_AccTraceEntry_t));
+ rf_accessTraceBufSize = 0;
+ close(rf_trace_fd);
+ }
+#endif /* !KERNEL */
+ accessTraceBufCount = 0;
+}
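
In the user-level (non-KERNEL) path above, rf_LogTraceRec() copies each record into access_tracebuf and rf_FlushAccessTraceBuf() writes the whole batch to trace.dat in one write() once the buffer fills. The following is a minimal standalone sketch of that accumulate-and-flush pattern only; it is not RAIDframe code, and the entry layout, buffer size, and names are illustrative assumptions.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

struct trace_entry {
	unsigned long total_us;
	unsigned long diskwait_us;
};

#define TRACE_BUF_SIZE 128

static struct trace_entry tracebuf[TRACE_BUF_SIZE];
static int tracebuf_count;
static int trace_fd = -1;

/* write out whatever has accumulated, then reset the count */
static void flush_tracebuf(void)
{
	ssize_t size = (ssize_t)(tracebuf_count * sizeof(struct trace_entry));

	if (trace_fd >= 0 && tracebuf_count > 0 &&
	    write(trace_fd, tracebuf, (size_t)size) < size)
		fprintf(stderr, "unable to write traces; tracing disabled\n");
	tracebuf_count = 0;
}

/* copy one record into the buffer, flushing when it becomes full */
static void log_trace_rec(const struct trace_entry *rec)
{
	memcpy(&tracebuf[tracebuf_count++], rec, sizeof(*rec));
	if (tracebuf_count == TRACE_BUF_SIZE)
		flush_tracebuf();
}

int main(void)
{
	struct trace_entry e = { 1000, 250 };
	int i;

	trace_fd = open("trace.dat", O_WRONLY | O_CREAT | O_TRUNC, 0644);
	if (trace_fd < 0)
		return 1;
	for (i = 0; i < 1000; i++)
		log_trace_rec(&e);
	flush_tracebuf();		/* drain the partly filled buffer */
	close(trace_fd);
	return 0;
}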
diff --git a/sys/dev/raidframe/rf_acctrace.h b/sys/dev/raidframe/rf_acctrace.h
new file mode 100644
index 00000000000..0b3441e3e49
--- /dev/null
+++ b/sys/dev/raidframe/rf_acctrace.h
@@ -0,0 +1,196 @@
+/* $OpenBSD: rf_acctrace.h,v 1.1 1999/01/11 14:28:58 niklas Exp $ */
+/* $NetBSD: rf_acctrace.h,v 1.1 1998/11/13 04:20:26 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*****************************************************************************
+ *
+ * acctrace.h -- header file for acctrace.c
+ *
+ *****************************************************************************/
+
+/* :
+ *
+ * Log: rf_acctrace.h,v
+ * Revision 1.32 1996/08/02 15:12:38 jimz
+ * remove dead code
+ *
+ * Revision 1.31 1996/07/27 14:34:39 jimz
+ * remove bogus semicolon
+ *
+ * Revision 1.30 1996/07/18 22:57:14 jimz
+ * port simulator to AIX
+ *
+ * Revision 1.29 1996/07/17 21:00:58 jimz
+ * clean up timer interface, tracing
+ *
+ * Revision 1.28 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.27 1996/06/09 02:36:46 jimz
+ * lots of little crufty cleanup- fixup whitespace
+ * issues, comment #ifdefs, improve typing in some
+ * places (esp size-related)
+ * /
+ *
+ * Revision 1.26 1996/06/05 18:06:02 jimz
+ * Major code cleanup. The Great Renaming is now done.
+ * Better modularity. Better typing. Fixed a bunch of
+ * synchronization bugs. Made a lot of global stuff
+ * per-desc or per-array. Removed dead code.
+ *
+ * Revision 1.25 1996/05/31 22:26:54 jimz
+ * fix a lot of mapping problems, memory allocation problems
+ * found some weird lock issues, fixed 'em
+ * more code cleanup
+ *
+ * Revision 1.24 1996/05/30 12:59:18 jimz
+ * make etimer happier, more portable
+ *
+ * Revision 1.23 1996/05/28 12:34:30 jimz
+ * nail down size of reconacc
+ *
+ * Revision 1.22 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.21 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.20 1996/05/02 14:57:24 jimz
+ * change to boolean_t
+ *
+ * Revision 1.19 1995/12/14 18:37:06 jimz
+ * convert to rf_types.h types
+ *
+ * Revision 1.18 1995/11/30 16:26:49 wvcii
+ * added copyright info
+ *
+ * Revision 1.17 1995/09/30 19:49:23 jimz
+ * add AccTotals structure, for capturing totals in kernel
+ *
+ * Revision 1.16 1995/09/12 00:20:55 wvcii
+ * added support for tracing disk queue time
+ *
+ * Revision 1.15 95/09/06 19:23:12 wvcii
+ * increased MAX_IOS_PER_TRACE_ENTRY from 1 to 4
+ *
+ */
+
+#ifndef _RF__RF_ACCTRACE_H_
+#define _RF__RF_ACCTRACE_H_
+
+#include "rf_types.h"
+#include "rf_hist.h"
+#include "rf_etimer.h"
+
+typedef struct RF_user_acc_stats_s {
+ RF_uint64 suspend_ovhd_us; /* us spent mucking in the access-suspension code */
+ RF_uint64 map_us; /* us spent mapping the access */
+ RF_uint64 lock_us; /* us spent locking & unlocking stripes, including time spent blocked */
+ RF_uint64 dag_create_us; /* us spent creating the DAGs */
+ RF_uint64 dag_retry_us; /* _total_ us spent retrying the op -- not broken down into components */
+ RF_uint64 exec_us; /* us spent in DispatchDAG */
+ RF_uint64 exec_engine_us; /* us spent in engine, not including blocking time */
+ RF_uint64 cleanup_us; /* us spent tearing down the dag & maps, and generally cleaning up */
+} RF_user_acc_stats_t;
+
+typedef struct RF_recon_acc_stats_s {
+ RF_uint32 recon_start_to_fetch_us;
+ RF_uint32 recon_fetch_to_return_us;
+ RF_uint32 recon_return_to_submit_us;
+} RF_recon_acc_stats_t;
+
+typedef struct RF_acctrace_entry_s {
+ union {
+ RF_user_acc_stats_t user;
+ RF_recon_acc_stats_t recon;
+ } specific;
+ RF_uint8 reconacc; /* whether this is a tracerec for a user acc or a recon acc */
+ RF_uint64 xor_us; /* us spent doing XORs */
+ RF_uint64 q_us; /* us spent doing Q computations */
+ RF_uint64 plog_us; /* us spent waiting to stuff parity into log */
+ RF_uint64 diskqueue_us; /* _total_ us spent in disk queue(s), incl concurrent ops */
+ RF_uint64 diskwait_us; /* _total_ us actually spent waiting on the disk, incl concurrent ops */
+ RF_uint64 total_us; /* total us spent on this access */
+ RF_uint64 num_phys_ios; /* number of physical I/Os invoked */
+ RF_uint64 phys_io_us; /* time of physical I/O */
+ RF_Etimer_t tot_timer; /* a timer used to compute total access time */
+ RF_Etimer_t timer; /* a generic timer val for timing events that live across procedure boundaries */
+ RF_Etimer_t recon_timer; /* generic timer for recon stuff */
+ RF_uint64 index;
+} RF_AccTraceEntry_t;
+
+typedef struct RF_AccTotals_s {
+ /* user acc stats */
+ RF_uint64 suspend_ovhd_us;
+ RF_uint64 map_us;
+ RF_uint64 lock_us;
+ RF_uint64 dag_create_us;
+ RF_uint64 dag_retry_us;
+ RF_uint64 exec_us;
+ RF_uint64 exec_engine_us;
+ RF_uint64 cleanup_us;
+ RF_uint64 user_reccount;
+ /* recon acc stats */
+ RF_uint64 recon_start_to_fetch_us;
+ RF_uint64 recon_fetch_to_return_us;
+ RF_uint64 recon_return_to_submit_us;
+ RF_uint64 recon_io_overflow_count;
+ RF_uint64 recon_phys_io_us;
+ RF_uint64 recon_num_phys_ios;
+ RF_uint64 recon_diskwait_us;
+ RF_uint64 recon_reccount;
+ /* trace entry stats */
+ RF_uint64 xor_us;
+ RF_uint64 q_us;
+ RF_uint64 plog_us;
+ RF_uint64 diskqueue_us;
+ RF_uint64 diskwait_us;
+ RF_uint64 total_us;
+ RF_uint64 num_log_ents;
+ RF_uint64 phys_io_overflow_count;
+ RF_uint64 num_phys_ios;
+ RF_uint64 phys_io_us;
+ RF_uint64 bigvals;
+ /* histograms */
+ RF_Hist_t dw_hist[RF_HIST_NUM_BUCKETS];
+ RF_Hist_t tot_hist[RF_HIST_NUM_BUCKETS];
+} RF_AccTotals_t;
+
+#if RF_UTILITY == 0
+RF_DECLARE_EXTERN_MUTEX(rf_tracing_mutex)
+#endif /* RF_UTILITY == 0 */
+
+int rf_ConfigureAccessTrace(RF_ShutdownList_t **listp);
+void rf_LogTraceRec(RF_Raid_t *raid, RF_AccTraceEntry_t *rec);
+void rf_FlushAccessTraceBuf(void);
+
+#endif /* !_RF__RF_ACCTRACE_H_ */
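
RF_AccTotals_t above keeps running sums of per-access times together with record counts, so per-access means fall out by simple division when the totals are reported. A small standalone illustration of that reduction, using a stand-in struct with made-up numbers rather than the RAIDframe type itself:

#include <stdint.h>
#include <stdio.h>

/* Stand-in for the per-array totals; only a few fields are shown. */
struct acc_totals {
	uint64_t total_us;	/* sum of per-access total times */
	uint64_t diskwait_us;	/* sum of per-access disk-wait times */
	uint64_t num_phys_ios;	/* physical I/Os issued */
	uint64_t user_reccount;	/* number of user accesses logged */
};

int main(void)
{
	struct acc_totals t = { 5000000, 3200000, 420, 300 };

	if (t.user_reccount != 0) {
		printf("avg access time: %llu us\n",
		    (unsigned long long)(t.total_us / t.user_reccount));
		printf("avg disk wait:   %llu us\n",
		    (unsigned long long)(t.diskwait_us / t.user_reccount));
		printf("phys I/Os per access: %.2f\n",
		    (double)t.num_phys_ios / (double)t.user_reccount);
	}
	return 0;
}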
diff --git a/sys/dev/raidframe/rf_alloclist.c b/sys/dev/raidframe/rf_alloclist.c
new file mode 100644
index 00000000000..5f0de4a4070
--- /dev/null
+++ b/sys/dev/raidframe/rf_alloclist.c
@@ -0,0 +1,294 @@
+/* $OpenBSD: rf_alloclist.c,v 1.1 1999/01/11 14:28:58 niklas Exp $ */
+/* $NetBSD: rf_alloclist.c,v 1.1 1998/11/13 04:20:26 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*
+ * Log: rf_alloclist.c,v
+ * Revision 1.28 1996/07/27 23:36:08 jimz
+ * Solaris port of simulator
+ *
+ * Revision 1.27 1996/06/12 03:29:54 jimz
+ * don't barf just because we can't create an alloclist
+ *
+ * Revision 1.26 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.25 1996/06/09 02:36:46 jimz
+ * lots of little crufty cleanup- fixup whitespace
+ * issues, comment #ifdefs, improve typing in some
+ * places (esp size-related)
+ *
+ * Revision 1.24 1996/06/05 18:06:02 jimz
+ * Major code cleanup. The Great Renaming is now done.
+ * Better modularity. Better typing. Fixed a bunch of
+ * synchronization bugs. Made a lot of global stuff
+ * per-desc or per-array. Removed dead code.
+ *
+ * Revision 1.23 1996/05/30 23:22:16 jimz
+ * bugfixes of serialization, timing problems
+ * more cleanup
+ *
+ * Revision 1.22 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.21 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.20 1996/05/20 16:15:59 jimz
+ * switch to rf_{mutex,cond}_{init,destroy}
+ *
+ * Revision 1.19 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.18 1996/05/16 22:27:45 jimz
+ * get rid of surreal_MakeAllocList (what was that, anyway?)
+ *
+ * Revision 1.17 1995/12/12 18:10:06 jimz
+ * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT
+ * fix 80-column brain damage in comments
+ *
+ * Revision 1.16 1995/11/30 16:27:07 wvcii
+ * added copyright info
+ *
+ * Revision 1.15 1995/10/05 20:37:56 jimz
+ * assert non-NULLness of pointer to FREE in FreeAllocList()
+ *
+ * Revision 1.14 1995/06/11 20:11:24 holland
+ * changed fl_hist,miss_count from long to int to get around weird kernel bug
+ *
+ * Revision 1.13 1995/05/01 13:28:00 holland
+ * parity range locks, locking disk requests, recon+parityscan in kernel, etc.
+ *
+ * Revision 1.12 1995/04/21 19:13:04 holland
+ * minor change to avoid a syntax error on DO_FREE
+ *
+ * Revision 1.11 1995/02/17 19:39:56 holland
+ * added size param to all calls to Free().
+ * this is ignored at user level, but necessary in the kernel.
+ *
+ * Revision 1.10 1995/02/10 18:08:07 holland
+ * added DO_FREE macro to fix what I broke during kernelization
+ *
+ * Revision 1.9 1995/02/10 17:34:10 holland
+ * kernelization changes
+ *
+ * Revision 1.8 1995/02/03 22:31:36 holland
+ * many changes related to kernelization
+ *
+ * Revision 1.7 1995/02/01 15:13:05 holland
+ * moved #include of general.h out of raid.h and into each file
+ *
+ * Revision 1.6 1995/01/11 19:27:02 holland
+ * many changes related to performance tuning
+ *
+ * Revision 1.5 1994/11/29 20:53:10 danner
+ * Marks mods
+ *
+ * Revision 1.3 1994/11/19 21:01:07 danner
+ * First merge with mark
+ *
+ * Revision 1.1.1.1 1994/11/19 20:23:38 danner
+ * First PQ checkin
+ *
+ * Revision 1.2 1994/11/16 15:45:35 danner
+ * fixed free bug in FreeAllocList
+ *
+ *
+ */
+
+/****************************************************************************
+ *
+ * Alloclist.c -- code to manipulate allocation lists
+ *
+ * an allocation list is just a list of AllocListElem structures. Each
+ * such structure contains a fixed-size array of pointers. Calling
+ * FreeAList() causes each pointer to be freed.
+ *
+ ***************************************************************************/
+
+#include "rf_types.h"
+#include "rf_threadstuff.h"
+#include "rf_alloclist.h"
+#include "rf_debugMem.h"
+#include "rf_etimer.h"
+#include "rf_general.h"
+#include "rf_shutdown.h"
+#include "rf_sys.h"
+
+RF_DECLARE_STATIC_MUTEX(alist_mutex)
+static unsigned int fl_hit_count, fl_miss_count;
+
+static RF_AllocListElem_t *al_free_list=NULL;
+static int al_free_list_count;
+
+#define RF_AL_FREELIST_MAX 256
+
+#ifndef KERNEL
+#define DO_FREE(_p,_sz) free((_p))
+#else /* !KERNEL */
+#define DO_FREE(_p,_sz) RF_Free((_p),(_sz))
+#endif /* !KERNEL */
+
+static void rf_ShutdownAllocList(void *);
+
+static void rf_ShutdownAllocList(ignored)
+ void *ignored;
+{
+ RF_AllocListElem_t *p, *pt;
+
+ for (p = al_free_list; p; ) {
+ pt = p;
+ p = p->next;
+ DO_FREE(pt, sizeof(*pt));
+ }
+ rf_mutex_destroy(&alist_mutex);
+ /*
+ printf("Alloclist: Free list hit count %lu (%lu %%) miss count %lu (%lu %%)\n",
+ fl_hit_count, (100*fl_hit_count)/(fl_hit_count+fl_miss_count),
+ fl_miss_count, (100*fl_miss_count)/(fl_hit_count+fl_miss_count));
+ */
+}
+
+int rf_ConfigureAllocList(listp)
+ RF_ShutdownList_t **listp;
+{
+ int rc;
+
+ rc = rf_mutex_init(&alist_mutex);
+ if (rc) {
+ RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ return(rc);
+ }
+ al_free_list = NULL;
+ fl_hit_count = fl_miss_count = al_free_list_count = 0;
+ rc = rf_ShutdownCreate(listp, rf_ShutdownAllocList, NULL);
+ if (rc) {
+ RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n",
+ __FILE__, __LINE__, rc);
+ rf_mutex_destroy(&alist_mutex);
+ return(rc);
+ }
+ return(0);
+}
+
+
+/* we expect the lists to have at most one or two elements, so we're willing
+ * to search for the end. If you ever observe the lists growing longer,
+ * increase POINTERS_PER_ALLOC_LIST_ELEMENT.
+ */
+void rf_real_AddToAllocList(l, p, size, lockflag)
+ RF_AllocListElem_t *l;
+ void *p;
+ int size;
+ int lockflag;
+{
+ RF_AllocListElem_t *newelem;
+
+ for ( ; l->next; l=l->next)
+ RF_ASSERT(l->numPointers == RF_POINTERS_PER_ALLOC_LIST_ELEMENT); /* find end of list */
+
+ RF_ASSERT(l->numPointers >= 0 && l->numPointers <= RF_POINTERS_PER_ALLOC_LIST_ELEMENT);
+ if (l->numPointers == RF_POINTERS_PER_ALLOC_LIST_ELEMENT) {
+ newelem = rf_real_MakeAllocList(lockflag);
+ l->next = newelem;
+ l = newelem;
+ }
+ l->pointers[ l->numPointers ] = p;
+ l->sizes [ l->numPointers ] = size;
+ l->numPointers++;
+
+}
+
+
+/* we use the debug_mem_mutex here because we need to lock it anyway to call free.
+ * this is probably a bug somewhere else in the code, but when I call malloc/free
+ * outside of any lock I have endless trouble with malloc appearing to return the
+ * same pointer twice. Since we have to lock it anyway, we might as well use it
+ * as the lock around the al_free_list. Note that we can't call Free with the
+ * debug_mem_mutex locked.
+ */
+void rf_FreeAllocList(l)
+ RF_AllocListElem_t *l;
+{
+ int i;
+ RF_AllocListElem_t *temp, *p;
+
+ for (p=l; p; p=p->next) {
+ RF_ASSERT(p->numPointers >= 0 && p->numPointers <= RF_POINTERS_PER_ALLOC_LIST_ELEMENT);
+ for (i=0; i<p->numPointers; i++) {
+ RF_ASSERT(p->pointers[i]);
+ RF_Free(p->pointers[i], p->sizes[i]);
+ }
+ }
+#ifndef KERNEL
+ RF_LOCK_MUTEX(rf_debug_mem_mutex);
+#endif /* !KERNEL */
+ while (l) {
+ temp = l;
+ l = l->next;
+ if (al_free_list_count > RF_AL_FREELIST_MAX) {DO_FREE(temp, sizeof(*temp));}
+ else {temp->next = al_free_list; al_free_list = temp; al_free_list_count++;}
+ }
+#ifndef KERNEL
+ RF_UNLOCK_MUTEX(rf_debug_mem_mutex);
+#endif /* !KERNEL */
+}
+
+RF_AllocListElem_t *rf_real_MakeAllocList(lockflag)
+ int lockflag;
+{
+ RF_AllocListElem_t *p;
+
+#ifndef KERNEL
+ if (lockflag) { RF_LOCK_MUTEX(rf_debug_mem_mutex); }
+#endif /* !KERNEL */
+ if (al_free_list) {fl_hit_count++; p = al_free_list; al_free_list = p->next; al_free_list_count--;}
+ else {
+ fl_miss_count++;
+#ifndef KERNEL
+ p = (RF_AllocListElem_t *) malloc(sizeof(RF_AllocListElem_t)); /* can't use Malloc at user level b/c we already locked the mutex */
+#else /* !KERNEL */
+ RF_Malloc(p, sizeof(RF_AllocListElem_t), (RF_AllocListElem_t *)); /* no allocation locking in kernel, so this is fine */
+#endif /* !KERNEL */
+ }
+#ifndef KERNEL
+ if (lockflag) { RF_UNLOCK_MUTEX(rf_debug_mem_mutex); }
+#endif /* !KERNEL */
+ if (p == NULL) {
+ return(NULL);
+ }
+ bzero((char *)p, sizeof(RF_AllocListElem_t));
+ return(p);
+}
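
A standalone sketch of the allocation-list pattern implemented above: each element carries a fixed array of (pointer, size) pairs, elements are chained when the array fills, and freeing the list frees every recorded allocation. This mock uses plain malloc/free in place of RF_Malloc/RF_Free and leaves out the free-list recycling and the locking; names and sizes are illustrative only.

#include <stdlib.h>

#define POINTERS_PER_ELEMENT 20

struct alloc_list_elem {
	void *pointers[POINTERS_PER_ELEMENT];
	int sizes[POINTERS_PER_ELEMENT];	/* sizes are kept because a kernel allocator needs them on free */
	int num_pointers;
	struct alloc_list_elem *next;
};

/* allocate an empty, zeroed list element */
static struct alloc_list_elem *make_alloc_list(void)
{
	return calloc(1, sizeof(struct alloc_list_elem));
}

/* record an allocation; grow the chain when the current element is full */
static void add_to_alloc_list(struct alloc_list_elem *l, void *p, int size)
{
	while (l->next)			/* walk to the end of the chain */
		l = l->next;
	if (l->num_pointers == POINTERS_PER_ELEMENT) {
		l->next = make_alloc_list();
		if (l->next == NULL)
			return;		/* error handling elided */
		l = l->next;
	}
	l->pointers[l->num_pointers] = p;
	l->sizes[l->num_pointers] = size;
	l->num_pointers++;
}

/* free every recorded allocation, then the list elements themselves */
static void free_alloc_list(struct alloc_list_elem *l)
{
	struct alloc_list_elem *next;
	int i;

	while (l) {
		for (i = 0; i < l->num_pointers; i++)
			free(l->pointers[i]);
		next = l->next;
		free(l);
		l = next;
	}
}

int main(void)
{
	struct alloc_list_elem *l = make_alloc_list();
	int i;

	if (l == NULL)
		return 1;
	for (i = 0; i < 50; i++)	/* enough to force a second element */
		add_to_alloc_list(l, malloc(64), 64);
	free_alloc_list(l);
	return 0;
}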
diff --git a/sys/dev/raidframe/rf_alloclist.h b/sys/dev/raidframe/rf_alloclist.h
new file mode 100644
index 00000000000..b33f7a46e8b
--- /dev/null
+++ b/sys/dev/raidframe/rf_alloclist.h
@@ -0,0 +1,84 @@
+/* $OpenBSD: rf_alloclist.h,v 1.1 1999/01/11 14:28:59 niklas Exp $ */
+/* $NetBSD: rf_alloclist.h,v 1.1 1998/11/13 04:20:26 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/****************************************************************************
+ *
+ * alloclist.h -- header file for alloclist.c
+ *
+ ***************************************************************************/
+
+/* :
+ * Log: rf_alloclist.h,v
+ * Revision 1.11 1996/07/18 22:57:14 jimz
+ * port simulator to AIX
+ *
+ * Revision 1.10 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.9 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.8 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.7 1995/11/30 16:27:13 wvcii
+ * added copyright info
+ *
+ */
+
+#ifndef _RF__RF_ALLOCLIST_H_
+#define _RF__RF_ALLOCLIST_H_
+
+#include "rf_types.h"
+
+#define RF_POINTERS_PER_ALLOC_LIST_ELEMENT 20
+
+struct RF_AllocListElem_s {
+ void *pointers[RF_POINTERS_PER_ALLOC_LIST_ELEMENT];
+ int sizes[RF_POINTERS_PER_ALLOC_LIST_ELEMENT];
+ int numPointers;
+ RF_AllocListElem_t *next;
+};
+
+#define rf_MakeAllocList(_ptr_) _ptr_ = rf_real_MakeAllocList(1);
+#define rf_AddToAllocList(_l_,_ptr_,_sz_) rf_real_AddToAllocList((_l_), (_ptr_), (_sz_), 1)
+
+int rf_ConfigureAllocList(RF_ShutdownList_t **listp);
+
+#if RF_UTILITY == 0
+void rf_real_AddToAllocList(RF_AllocListElem_t *l, void *p, int size, int lockflag);
+void rf_FreeAllocList(RF_AllocListElem_t *l);
+RF_AllocListElem_t *rf_real_MakeAllocList(int lockflag);
+#endif /* RF_UTILITY == 0 */
+
+#endif /* !_RF__RF_ALLOCLIST_H_ */
diff --git a/sys/dev/raidframe/rf_archs.h b/sys/dev/raidframe/rf_archs.h
new file mode 100644
index 00000000000..6a4850829ce
--- /dev/null
+++ b/sys/dev/raidframe/rf_archs.h
@@ -0,0 +1,211 @@
+/* $OpenBSD: rf_archs.h,v 1.1 1999/01/11 14:28:59 niklas Exp $ */
+/* $NetBSD: rf_archs.h,v 1.1 1998/11/13 04:20:26 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* rf_archs.h -- defines for which architectures you want to
+ * include in some particular build of raidframe. Unfortunately,
+ * it's difficult to exclude declustering, P+Q, and distributed
+ * sparing because the code is intermixed with RAID5 code. This
+ * should be fixed.
+ *
+ * this is really intended only for use in the kernel, where I
+ * am worried about the size of the object module. At user level and
+ * in the simulator, I don't really care that much, so all the
+ * architectures can be compiled together. Note that by itself, turning
+ * off these defines does not affect the size of the executable; you
+ * have to edit the makefile for that.
+ *
+ * comment out any line below to eliminate that architecture.
+ * the list below includes all the modules that can be compiled
+ * out.
+ *
+ * :
+ * Log: rf_archs.h,v
+ * Revision 1.32 1996/08/20 23:05:40 jimz
+ * define RF_KEEP_DISKSTATS to 1
+ *
+ * Revision 1.31 1996/07/31 15:34:04 jimz
+ * include evenodd
+ *
+ * Revision 1.30 1996/07/27 23:36:08 jimz
+ * Solaris port of simulator
+ *
+ * Revision 1.29 1996/07/26 20:11:46 jimz
+ * only define RF_DEMO for CMU_PDL
+ *
+ * Revision 1.28 1996/07/26 20:10:57 jimz
+ * define RF_CMU_PDL only if it isn't already defined
+ *
+ * Revision 1.27 1996/07/18 22:57:14 jimz
+ * port simulator to AIX
+ *
+ * Revision 1.26 1996/06/17 14:38:33 jimz
+ * properly #if out RF_DEMO code
+ * fix bug in MakeConfig that was causing weird behavior
+ * in configuration routines (config was not zeroed at start)
+ * clean up genplot handling of stacks
+ *
+ * Revision 1.25 1996/06/14 21:24:59 jimz
+ * turn on RF_CMU_PDL by default
+ *
+ * Revision 1.24 1996/06/13 20:41:57 jimz
+ * add RF_INCLUDE_QUEUE_RANDOM (0)
+ *
+ * Revision 1.23 1996/06/11 18:12:36 jimz
+ * get rid of JOIN operations
+ * use ThreadGroup stuff instead
+ * fix some allocation/deallocation and sync bugs
+ *
+ * Revision 1.22 1996/06/10 22:24:55 wvcii
+ * added symbols for enabling forward or backward error
+ * recovery experiments
+ *
+ * Revision 1.21 1996/06/05 18:06:02 jimz
+ * Major code cleanup. The Great Renaming is now done.
+ * Better modularity. Better typing. Fixed a bunch of
+ * synchronization bugs. Made a lot of global stuff
+ * per-desc or per-array. Removed dead code.
+ *
+ * Revision 1.20 1996/05/30 11:29:41 jimz
+ * Numerous bug fixes. Stripe lock release code disagreed with the taking code
+ * about when stripes should be locked (I made it consistent: no parity, no lock)
+ * There was a lot of extra serialization of I/Os which I've removed- a lot of
+ * it was to calculate values for the cache code, which is no longer with us.
+ * More types, function, macro cleanup. Added code to properly quiesce the array
+ * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
+ * before. Fixed memory allocation, freeing bugs.
+ *
+ * Revision 1.19 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.18 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.17 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.16 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.15 1996/05/15 22:32:59 jimz
+ * remove cache and vs stuff
+ *
+ * Revision 1.14 1995/11/30 16:27:34 wvcii
+ * added copyright info
+ *
+ * Revision 1.13 1995/11/28 21:23:44 amiri
+ * added the interleaved declustering architecture
+ * ('I'), with distributed sparing.
+ *
+ * Revision 1.12 1995/11/17 16:59:45 amiri
+ * don't INCLUDE_CHAINDECLUSTER in the kernel
+ * source.
+ *
+ * Revision 1.11 1995/11/16 16:15:21 amiri
+ * don't include RAID5 with rotated sparing (INCLUDE_RAID5_RS) in kernel
+ *
+ * Revision 1.10 1995/10/12 17:40:47 jimz
+ * define INCLUDE_LS
+ *
+ * Revision 1.9 1995/10/11 06:56:47 jimz
+ * define INCLUDE_VS (sanity check for compilation)
+ *
+ * Revision 1.8 1995/10/05 18:56:24 jimz
+ * don't INCLUDE_VS
+ *
+ * Revision 1.7 1995/10/04 03:51:20 wvcii
+ * added raid 1
+ *
+ * Revision 1.6 1995/09/07 09:59:29 wvcii
+ * unstable archs conditionally defined for !KERNEL makes
+ *
+ *
+ */
+
+#ifndef _RF__RF_ARCHS_H_
+#define _RF__RF_ARCHS_H_
+
+/*
+ * Turn off if you do not have CMU PDL support compiled
+ * into your kernel.
+ */
+#ifndef RF_CMU_PDL
+#define RF_CMU_PDL 0
+#endif /* !RF_CMU_PDL */
+
+/*
+ * Khalil's performance-displaying demo stuff.
+ * Relies on CMU meter tools.
+ */
+#ifndef KERNEL
+#if RF_CMU_PDL > 0
+#define RF_DEMO 1
+#endif /* RF_CMU_PDL > 0 */
+#endif /* !KERNEL */
+
+#define RF_INCLUDE_EVENODD 1
+
+#define RF_INCLUDE_RAID5_RS 1
+#define RF_INCLUDE_PARITYLOGGING 1
+
+#define RF_INCLUDE_CHAINDECLUSTER 1
+#define RF_INCLUDE_INTERDECLUSTER 1
+
+#define RF_INCLUDE_RAID0 1
+#define RF_INCLUDE_RAID1 1
+#define RF_INCLUDE_RAID4 1
+#define RF_INCLUDE_RAID5 1
+#define RF_INCLUDE_RAID6 0
+#define RF_INCLUDE_DECL_PQ 0
+
+#define RF_MEMORY_REDZONES 0
+#define RF_RECON_STATS 1
+
+#define RF_INCLUDE_QUEUE_RANDOM 0
+
+#define RF_KEEP_DISKSTATS 1
+
+/* These two symbols enable nonstandard forms of error recovery.
+ * These modes are only valid for performance measurements and
+ * data corruption will occur if an error occurs when either
+ * forward or backward error recovery is enabled. In general
+ * both of the following two definitions should be commented
+ * out--this forces RAIDframe to use roll-away error recovery
+ * which does guarantee proper error recovery without data corruption
+ */
+/* #define RF_FORWARD 1 */
+/* #define RF_BACKWARD 1 */
+
+#include "rf_options.h"
+
+#endif /* !_RF__RF_ARCHS_H_ */
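
Code that depends on a particular architecture is expected to be wrapped in preprocessor guards keyed on the defines above. A trivial standalone sketch of that guard pattern, with RF_INCLUDE_RAID5 standing in for any of the switches; the exact guard form used elsewhere in the tree is not shown here, so treat this as an assumption.

#include <stdio.h>

/* RF_INCLUDE_RAID5 would normally come from including rf_archs.h
 * rather than being defined locally as it is in this sketch. */
#ifndef RF_INCLUDE_RAID5
#define RF_INCLUDE_RAID5 1
#endif

int main(void)
{
#if RF_INCLUDE_RAID5 > 0
	printf("RAID level 5 support compiled in\n");
#else
	printf("RAID level 5 support compiled out\n");
#endif
	return 0;
}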
diff --git a/sys/dev/raidframe/rf_aselect.c b/sys/dev/raidframe/rf_aselect.c
new file mode 100644
index 00000000000..f6a1918b7a5
--- /dev/null
+++ b/sys/dev/raidframe/rf_aselect.c
@@ -0,0 +1,618 @@
+/* $OpenBSD: rf_aselect.c,v 1.1 1999/01/11 14:28:59 niklas Exp $ */
+/* $NetBSD: rf_aselect.c,v 1.1 1998/11/13 04:20:26 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland, William V. Courtright II
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*****************************************************************************
+ *
+ * aselect.c -- algorithm selection code
+ *
+ *****************************************************************************/
+/*
+ * :
+ * Log: rf_aselect.c,v
+ * Revision 1.35 1996/07/28 20:31:39 jimz
+ * i386netbsd port
+ * true/false fixup
+ *
+ * Revision 1.34 1996/07/27 18:39:39 jimz
+ * cleanup sweep
+ *
+ * Revision 1.33 1996/07/22 19:52:16 jimz
+ * switched node params to RF_DagParam_t, a union of
+ * a 64-bit int and a void *, for better portability
+ * attempted hpux port, but failed partway through for
+ * lack of a single C compiler capable of compiling all
+ * source files
+ *
+ * Revision 1.32 1996/06/12 03:29:40 jimz
+ * Note: things that call InitHdrNode should check
+ * for successful return.
+ *
+ * Revision 1.31 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.30 1996/06/05 18:06:02 jimz
+ * Major code cleanup. The Great Renaming is now done.
+ * Better modularity. Better typing. Fixed a bunch of
+ * synchronization bugs. Made a lot of global stuff
+ * per-desc or per-array. Removed dead code.
+ *
+ * Revision 1.29 1996/05/31 22:26:54 jimz
+ * fix a lot of mapping problems, memory allocation problems
+ * found some weird lock issues, fixed 'em
+ * more code cleanup
+ *
+ * Revision 1.28 1996/05/30 11:29:41 jimz
+ * Numerous bug fixes. Stripe lock release code disagreed with the taking code
+ * about when stripes should be locked (I made it consistent: no parity, no lock)
+ * There was a lot of extra serialization of I/Os which I've removed- a lot of
+ * it was to calculate values for the cache code, which is no longer with us.
+ * More types, function, macro cleanup. Added code to properly quiesce the array
+ * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
+ * before. Fixed memory allocation, freeing bugs.
+ *
+ * Revision 1.27 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.26 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.25 1996/05/24 04:28:55 jimz
+ * release cleanup ckpt
+ *
+ * Revision 1.24 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.23 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.22 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.21 1996/05/08 21:01:24 jimz
+ * fixed up enum type names that were conflicting with other
+ * enums and function names (ie, "panic")
+ * future naming trends will be towards RF_ and rf_ for
+ * everything raidframe-related
+ *
+ * Revision 1.20 1996/05/03 19:45:35 wvcii
+ * removed includes of old deg creation files
+ * updated SelectAlgorithm comments
+ *
+ * Revision 1.19 1995/12/12 18:10:06 jimz
+ * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT
+ * fix 80-column brain damage in comments
+ *
+ * Revision 1.18 1995/11/30 16:27:48 wvcii
+ * added copyright info
+ *
+ * Revision 1.17 1995/11/19 16:25:55 wvcii
+ * SelectAlgorithm now creates an array, returned in desc->dagArray
+ * return value is now int (1 = FAIL)
+ *
+ * Revision 1.16 1995/11/17 15:09:58 wvcii
+ * fixed bug in SelectAlgorithm in which multiple graphs per stripe are required
+ *
+ * Revision 1.15 1995/11/07 17:12:42 wvcii
+ * changed SelectAlgorithm as follows:
+ *
+ * dag creation funcs now create term nodes
+ * dag selection funcs no longer return numHdrSucc, numTermAnt
+ * there is now one dag hdr for each dag in a request, implying
+ * that SelectAlgorithm now returns a linked list of dag hdrs
+ *
+ */
+
+#include "rf_archs.h"
+#include "rf_types.h"
+#include "rf_raid.h"
+#include "rf_dag.h"
+#include "rf_dagutils.h"
+#include "rf_dagfuncs.h"
+#include "rf_general.h"
+#include "rf_desc.h"
+#include "rf_map.h"
+
+#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL)
+/* the function below is not used... so don't define it! */
+#else
+static void TransferDagMemory(RF_DagHeader_t *, RF_DagHeader_t *);
+#endif
+
+static int InitHdrNode(RF_DagHeader_t **, RF_Raid_t *, int);
+static void UpdateNodeHdrPtr(RF_DagHeader_t *, RF_DagNode_t *);
+int rf_SelectAlgorithm(RF_RaidAccessDesc_t *, RF_RaidAccessFlags_t );
+
+
+/******************************************************************************
+ *
+ * Create and initialize a dag header and termination node
+ *
+ *****************************************************************************/
+static int InitHdrNode(hdr, raidPtr, memChunkEnable)
+ RF_DagHeader_t **hdr;
+ RF_Raid_t *raidPtr;
+ int memChunkEnable;
+{
+ /* create and initialize dag hdr */
+ *hdr = rf_AllocDAGHeader();
+ rf_MakeAllocList((*hdr)->allocList);
+ if ((*hdr)->allocList == NULL) {
+ rf_FreeDAGHeader(*hdr);
+ return(ENOMEM);
+ }
+ (*hdr)->status = rf_enable;
+ (*hdr)->numSuccedents = 0;
+ (*hdr)->raidPtr = raidPtr;
+ (*hdr)->next = NULL;
+ return(0);
+}
+
+/******************************************************************************
+ *
+ * Transfer allocation list and mem chunks from one dag to another
+ *
+ *****************************************************************************/
+#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL)
+/* the function below is not used... so don't define it! */
+#else
+static void TransferDagMemory(daga, dagb)
+ RF_DagHeader_t *daga;
+ RF_DagHeader_t *dagb;
+{
+ RF_AccessStripeMapHeader_t *end;
+ RF_AllocListElem_t *p;
+ int i, memChunksXfrd = 0, xtraChunksXfrd = 0;
+
+ /* transfer allocList from dagb to daga */
+ for (p = dagb->allocList; p ; p = p->next)
+ {
+ for (i = 0; i < p->numPointers; i++)
+ {
+ rf_AddToAllocList(daga->allocList, p->pointers[i], p->sizes[i]);
+ p->pointers[i] = NULL;
+ p->sizes[i] = 0;
+ }
+ p->numPointers = 0;
+ }
+
+ /* transfer chunks from dagb to daga */
+ while ((memChunksXfrd + xtraChunksXfrd < dagb->chunkIndex + dagb->xtraChunkIndex) && (daga->chunkIndex < RF_MAXCHUNKS))
+ {
+ /* stuff chunks into daga's memChunk array */
+ if (memChunksXfrd < dagb->chunkIndex)
+ {
+ daga->memChunk[daga->chunkIndex++] = dagb->memChunk[memChunksXfrd];
+ dagb->memChunk[memChunksXfrd++] = NULL;
+ }
+ else
+ {
+ daga->memChunk[daga->xtraChunkIndex++] = dagb->xtraMemChunk[xtraChunksXfrd];
+ dagb->xtraMemChunk[xtraChunksXfrd++] = NULL;
+ }
+ }
+ /* use escape hatch to hold excess chunks */
+ while (memChunksXfrd + xtraChunksXfrd < dagb->chunkIndex + dagb->xtraChunkIndex) {
+ if (memChunksXfrd < dagb->chunkIndex)
+ {
+ daga->xtraMemChunk[daga->xtraChunkIndex++] = dagb->memChunk[memChunksXfrd];
+ dagb->memChunk[memChunksXfrd++] = NULL;
+ }
+ else
+ {
+ daga->xtraMemChunk[daga->xtraChunkIndex++] = dagb->xtraMemChunk[xtraChunksXfrd];
+ dagb->xtraMemChunk[xtraChunksXfrd++] = NULL;
+ }
+ }
+ RF_ASSERT((memChunksXfrd == dagb->chunkIndex) && (xtraChunksXfrd == dagb->xtraChunkIndex));
+ RF_ASSERT(daga->chunkIndex <= RF_MAXCHUNKS);
+ RF_ASSERT(daga->xtraChunkIndex <= daga->xtraChunkCnt);
+ dagb->chunkIndex = 0;
+ dagb->xtraChunkIndex = 0;
+
+ /* transfer asmList from dagb to daga */
+ if (dagb->asmList)
+ {
+ if (daga->asmList)
+ {
+ end = daga->asmList;
+ while (end->next)
+ end = end->next;
+ end->next = dagb->asmList;
+ }
+ else
+ daga->asmList = dagb->asmList;
+ dagb->asmList = NULL;
+ }
+}
+#endif /* __NetBSD__ || __OpenBSD__ */
+
+/*****************************************************************************************
+ *
+ * Ensure that all node->dagHdr fields in a dag are consistent
+ *
+ * IMPORTANT: This routine recursively searches all succedents of the node. If a
+ * succedent is encountered whose dagHdr ptr does not require adjusting, that node's
+ * succedents WILL NOT BE EXAMINED.
+ *
+ ****************************************************************************************/
+static void UpdateNodeHdrPtr(hdr, node)
+ RF_DagHeader_t *hdr;
+ RF_DagNode_t *node;
+{
+ int i;
+ RF_ASSERT(hdr != NULL && node != NULL);
+ for (i = 0; i < node->numSuccedents; i++)
+ if (node->succedents[i]->dagHdr != hdr)
+ UpdateNodeHdrPtr(hdr, node->succedents[i]);
+ node->dagHdr = hdr;
+}
+
+/******************************************************************************
+ *
+ * Create a DAG to do a read or write operation.
+ *
+ * create an array of dagLists, one list per parity stripe.
+ * return the lists in the array desc->dagArray.
+ *
+ * Normally, each list contains one dag for the entire stripe. In some
+ * tricky cases, we break this into multiple dags, either one per stripe
+ * unit or one per block (sector). When this occurs, these dags are returned
+ * as a linked list (dagList) which is executed sequentially (to preserve
+ * atomic parity updates in the stripe).
+ *
+ * dags which operate on independent parity groups (stripes) are returned in
+ * independent dagLists (distinct elements in desc->dagArray) and may be
+ * executed concurrently.
+ *
+ * Finally, if the SelectionFunc fails to create a dag for a block, we punt
+ * and return 1.
+ *
+ * The above process is performed in two phases:
+ * 1) create an array(s) of creation functions (eg stripeFuncs)
+ * 2) create dags and concatenate/merge to form the final dag.
+ *
+ * Because dags are basic blocks (single entry, single exit, unconditional
+ * control flow), we can add the following optimizations (future work):
+ * first-pass optimizer to allow max concurrency (need all data dependencies)
+ * second-pass optimizer to eliminate common subexpressions (need true
+ * data dependencies)
+ * third-pass optimizer to eliminate dead code (need true data dependencies)
+ *****************************************************************************/
+
+#define MAXNSTRIPES 50
+
+int rf_SelectAlgorithm(desc, flags)
+ RF_RaidAccessDesc_t *desc;
+ RF_RaidAccessFlags_t flags;
+{
+ RF_AccessStripeMapHeader_t *asm_h = desc->asmap;
+ RF_IoType_t type = desc->type;
+ RF_Raid_t *raidPtr = desc->raidPtr;
+ void *bp = desc->bp;
+
+ RF_AccessStripeMap_t *asmap = asm_h->stripeMap;
+ RF_AccessStripeMap_t *asm_p;
+ RF_DagHeader_t *dag_h = NULL, *tempdag_h, *lastdag_h;
+ int i, j, k;
+ RF_VoidFuncPtr *stripeFuncs, normalStripeFuncs[MAXNSTRIPES];
+ RF_AccessStripeMap_t *asm_up, *asm_bp;
+ RF_AccessStripeMapHeader_t ***asmh_u, *endASMList;
+ RF_AccessStripeMapHeader_t ***asmh_b;
+ RF_VoidFuncPtr **stripeUnitFuncs, uFunc;
+ RF_VoidFuncPtr **blockFuncs, bFunc;
+ int numStripesBailed = 0, cantCreateDAGs = RF_FALSE;
+ int numStripeUnitsBailed = 0;
+ int stripeNum, numUnitDags = 0, stripeUnitNum, numBlockDags = 0;
+ RF_StripeNum_t numStripeUnits;
+ RF_SectorNum_t numBlocks;
+ RF_RaidAddr_t address;
+ int length;
+ RF_PhysDiskAddr_t *physPtr;
+ caddr_t buffer;
+
+ lastdag_h = NULL;
+ asmh_u = asmh_b = NULL;
+ stripeUnitFuncs = NULL;
+ blockFuncs = NULL;
+
+ /* get an array of dag-function creation pointers, try to avoid calling malloc */
+ if (asm_h->numStripes <= MAXNSTRIPES) stripeFuncs = normalStripeFuncs;
+ else RF_Calloc(stripeFuncs, asm_h->numStripes, sizeof(RF_VoidFuncPtr), (RF_VoidFuncPtr *));
+
+ /* walk through the asm list once collecting information */
+ /* attempt to find a single creation function for each stripe */
+ desc->numStripes = 0;
+ for (i=0,asm_p = asmap; asm_p; asm_p=asm_p->next,i++) {
+ desc->numStripes++;
+ (raidPtr->Layout.map->SelectionFunc)(raidPtr, type, asm_p, &stripeFuncs[i]);
+ /* check to see if we found a creation func for this stripe */
+ if (stripeFuncs[i] == (RF_VoidFuncPtr) NULL)
+ {
+ /* could not find creation function for entire stripe
+ so, let's see if we can find one for each stripe unit in the stripe */
+
+ if (numStripesBailed == 0)
+ {
+ /* one stripe map header for each stripe we bail on */
+ RF_Malloc(asmh_u, sizeof(RF_AccessStripeMapHeader_t **) * asm_h->numStripes, (RF_AccessStripeMapHeader_t ***));
+ /* create an array of ptrs to arrays of stripeFuncs */
+ RF_Calloc(stripeUnitFuncs, asm_h->numStripes, sizeof(RF_VoidFuncPtr), (RF_VoidFuncPtr **));
+ }
+
+ /* create an array of creation funcs (called stripeFuncs) for this stripe */
+ numStripeUnits = asm_p->numStripeUnitsAccessed;
+ RF_Calloc(stripeUnitFuncs[numStripesBailed], numStripeUnits, sizeof(RF_VoidFuncPtr), (RF_VoidFuncPtr *));
+ RF_Malloc(asmh_u[numStripesBailed], numStripeUnits * sizeof(RF_AccessStripeMapHeader_t *), (RF_AccessStripeMapHeader_t **));
+
+ /* lookup array of stripeUnitFuncs for this stripe */
+ for (j=0, physPtr = asm_p->physInfo; physPtr; physPtr = physPtr->next, j++)
+ {
+ /* remap for series of single stripe-unit accesses */
+ address = physPtr->raidAddress;
+ length = physPtr->numSector;
+ buffer = physPtr->bufPtr;
+
+ asmh_u[numStripesBailed][j] = rf_MapAccess(raidPtr, address, length, buffer, RF_DONT_REMAP);
+ asm_up = asmh_u[numStripesBailed][j]->stripeMap;
+
+ /* get the creation func for this stripe unit */
+ (raidPtr->Layout.map-> SelectionFunc)(raidPtr, type, asm_up, &(stripeUnitFuncs[numStripesBailed][j]));
+
+ /* check to see if we found a creation func for this stripe unit */
+ if (stripeUnitFuncs[numStripesBailed][j] == (RF_VoidFuncPtr) NULL)
+ {
+ /* could not find creation function for stripe unit so,
+ let's see if we can find one for each block in the stripe unit */
+ if (numStripeUnitsBailed == 0)
+ {
+ /* one stripe map header for each stripe unit we bail on */
+ RF_Malloc(asmh_b, sizeof(RF_AccessStripeMapHeader_t **) * asm_h->numStripes * raidPtr->Layout.numDataCol, (RF_AccessStripeMapHeader_t ***));
+ /* create an array of ptrs to arrays of blockFuncs */
+ RF_Calloc(blockFuncs, asm_h->numStripes * raidPtr->Layout.numDataCol, sizeof(RF_VoidFuncPtr), (RF_VoidFuncPtr **));
+ }
+
+ /* create an array of creation funcs (called blockFuncs) for this stripe unit */
+ numBlocks = physPtr->numSector;
+ numBlockDags += numBlocks;
+ RF_Calloc(blockFuncs[numStripeUnitsBailed], numBlocks, sizeof(RF_VoidFuncPtr), (RF_VoidFuncPtr *));
+ RF_Malloc(asmh_b[numStripeUnitsBailed], numBlocks * sizeof(RF_AccessStripeMapHeader_t *), (RF_AccessStripeMapHeader_t **));
+
+ /* lookup array of blockFuncs for this stripe unit */
+ for (k=0; k < numBlocks; k++)
+ {
+				/* remap for series of single-block accesses */
+ address = physPtr->raidAddress + k;
+ length = 1;
+ buffer = physPtr->bufPtr + (k * (1<<raidPtr->logBytesPerSector));
+
+ asmh_b[numStripeUnitsBailed][k] = rf_MapAccess(raidPtr, address, length, buffer, RF_DONT_REMAP);
+ asm_bp = asmh_b[numStripeUnitsBailed][k]->stripeMap;
+
+ /* get the creation func for this stripe unit */
+ (raidPtr->Layout.map-> SelectionFunc)(raidPtr, type, asm_bp, &(blockFuncs[numStripeUnitsBailed][k]));
+
+ /* check to see if we found a creation func for this stripe unit */
+ if (blockFuncs[numStripeUnitsBailed][k] == NULL)
+ cantCreateDAGs = RF_TRUE;
+ }
+ numStripeUnitsBailed++;
+ }
+ else
+ {
+ numUnitDags++;
+ }
+ }
+ RF_ASSERT(j == numStripeUnits);
+ numStripesBailed++;
+ }
+ }
+
+ if (cantCreateDAGs)
+ {
+ /* free memory and punt */
+ if (asm_h->numStripes > MAXNSTRIPES)
+ RF_Free(stripeFuncs, asm_h->numStripes * sizeof(RF_VoidFuncPtr));
+ if (numStripesBailed > 0)
+ {
+ stripeNum = 0;
+ for (i = 0, asm_p = asmap; asm_p; asm_p = asm_p->next, i++)
+ if (stripeFuncs[i] == NULL)
+ {
+ numStripeUnits = asm_p->numStripeUnitsAccessed;
+ for (j = 0; j < numStripeUnits; j++)
+ rf_FreeAccessStripeMap(asmh_u[stripeNum][j]);
+ RF_Free(asmh_u[stripeNum], numStripeUnits * sizeof(RF_AccessStripeMapHeader_t *));
+ RF_Free(stripeUnitFuncs[stripeNum], numStripeUnits * sizeof(RF_VoidFuncPtr));
+ stripeNum++;
+ }
+ RF_ASSERT(stripeNum == numStripesBailed);
+ RF_Free(stripeUnitFuncs, asm_h->numStripes * sizeof(RF_VoidFuncPtr));
+ RF_Free(asmh_u, asm_h->numStripes * sizeof(RF_AccessStripeMapHeader_t **));
+ }
+ return(1);
+ }
+ else
+ {
+ /* begin dag creation */
+ stripeNum = 0;
+ stripeUnitNum = 0;
+
+ /* create an array of dagLists and fill them in */
+ RF_CallocAndAdd(desc->dagArray, desc->numStripes, sizeof(RF_DagList_t), (RF_DagList_t *), desc->cleanupList);
+
+ for (i=0, asm_p = asmap; asm_p; asm_p=asm_p->next,i++) {
+ /* grab dag header for this stripe */
+ dag_h = NULL;
+ desc->dagArray[i].desc = desc;
+
+ if (stripeFuncs[i] == (RF_VoidFuncPtr) NULL)
+ {
+ /* use bailout functions for this stripe */
+ for (j = 0, physPtr = asm_p->physInfo; physPtr; physPtr=physPtr->next, j++)
+ {
+ uFunc = stripeUnitFuncs[stripeNum][j];
+ if (uFunc == (RF_VoidFuncPtr) NULL)
+ {
+ /* use bailout functions for this stripe unit */
+ for (k = 0; k < physPtr->numSector; k++)
+ {
+ /* create a dag for this block */
+ InitHdrNode(&tempdag_h, raidPtr, rf_useMemChunks);
+ desc->dagArray[i].numDags++;
+ if (dag_h == NULL) {
+ dag_h = tempdag_h;
+ }
+ else {
+ lastdag_h->next = tempdag_h;
+ }
+ lastdag_h = tempdag_h;
+
+ bFunc = blockFuncs[stripeUnitNum][k];
+ RF_ASSERT(bFunc);
+ asm_bp = asmh_b[stripeUnitNum][k]->stripeMap;
+ (*bFunc)(raidPtr, asm_bp, tempdag_h, bp, flags, tempdag_h->allocList);
+ }
+ stripeUnitNum++;
+ }
+ else
+ {
+ /* create a dag for this unit */
+ InitHdrNode(&tempdag_h, raidPtr, rf_useMemChunks);
+ desc->dagArray[i].numDags++;
+ if (dag_h == NULL) {
+ dag_h = tempdag_h;
+ }
+ else {
+ lastdag_h->next = tempdag_h;
+ }
+ lastdag_h = tempdag_h;
+
+ asm_up = asmh_u[stripeNum][j]->stripeMap;
+ (*uFunc)(raidPtr, asm_up, tempdag_h, bp, flags, tempdag_h->allocList);
+ }
+ }
+ RF_ASSERT(j == asm_p->numStripeUnitsAccessed);
+ /* merge linked bailout dag to existing dag collection */
+ stripeNum++;
+ }
+ else {
+ /* Create a dag for this parity stripe */
+ InitHdrNode(&tempdag_h, raidPtr, rf_useMemChunks);
+ desc->dagArray[i].numDags++;
+ if (dag_h == NULL) {
+ dag_h = tempdag_h;
+ }
+ else {
+ lastdag_h->next = tempdag_h;
+ }
+ lastdag_h = tempdag_h;
+
+ (stripeFuncs[i])(raidPtr, asm_p, tempdag_h, bp, flags, tempdag_h->allocList);
+ }
+ desc->dagArray[i].dags = dag_h;
+ }
+ RF_ASSERT(i == desc->numStripes);
+
+ /* free memory */
+ if (asm_h->numStripes > MAXNSTRIPES)
+ RF_Free(stripeFuncs, asm_h->numStripes * sizeof(RF_VoidFuncPtr));
+ if ((numStripesBailed > 0) || (numStripeUnitsBailed > 0))
+ {
+ stripeNum = 0;
+ stripeUnitNum = 0;
+ if (dag_h->asmList)
+ {
+ endASMList = dag_h->asmList;
+ while (endASMList->next)
+ endASMList = endASMList->next;
+ }
+ else
+ endASMList = NULL;
+ /* walk through io, stripe by stripe */
+ for (i = 0, asm_p = asmap; asm_p; asm_p = asm_p->next, i++)
+ if (stripeFuncs[i] == NULL)
+ {
+ numStripeUnits = asm_p->numStripeUnitsAccessed;
+ /* walk through stripe, stripe unit by stripe unit */
+ for (j = 0, physPtr = asm_p->physInfo; physPtr; physPtr = physPtr->next, j++)
+ {
+ if (stripeUnitFuncs[stripeNum][j] == NULL)
+ {
+ numBlocks = physPtr->numSector;
+ /* walk through stripe unit, block by block */
+ for (k = 0; k < numBlocks; k++)
+ if (dag_h->asmList == NULL)
+ {
+ dag_h->asmList = asmh_b[stripeUnitNum][k];
+ endASMList = dag_h->asmList;
+ }
+ else
+ {
+ endASMList->next = asmh_b[stripeUnitNum][k];
+ endASMList = endASMList->next;
+ }
+ RF_Free(asmh_b[stripeUnitNum], numBlocks * sizeof(RF_AccessStripeMapHeader_t *));
+ RF_Free(blockFuncs[stripeUnitNum], numBlocks * sizeof(RF_VoidFuncPtr));
+ stripeUnitNum++;
+ }
+ if (dag_h->asmList == NULL)
+ {
+ dag_h->asmList = asmh_u[stripeNum][j];
+ endASMList = dag_h->asmList;
+ }
+ else
+ {
+ endASMList->next = asmh_u[stripeNum][j];
+ endASMList = endASMList->next;
+ }
+ }
+ RF_Free(asmh_u[stripeNum], numStripeUnits * sizeof(RF_AccessStripeMapHeader_t *));
+ RF_Free(stripeUnitFuncs[stripeNum], numStripeUnits * sizeof(RF_VoidFuncPtr));
+ stripeNum++;
+ }
+ RF_ASSERT(stripeNum == numStripesBailed);
+ RF_Free(stripeUnitFuncs, asm_h->numStripes * sizeof(RF_VoidFuncPtr));
+ RF_Free(asmh_u, asm_h->numStripes * sizeof(RF_AccessStripeMapHeader_t **));
+ if (numStripeUnitsBailed > 0)
+ {
+ RF_ASSERT(stripeUnitNum == numStripeUnitsBailed);
+ RF_Free(blockFuncs, raidPtr->Layout.numDataCol * asm_h->numStripes * sizeof(RF_VoidFuncPtr));
+ RF_Free(asmh_b, raidPtr->Layout.numDataCol * asm_h->numStripes * sizeof(RF_AccessStripeMapHeader_t **));
+ }
+ }
+ return(0);
+ }
+}
diff --git a/sys/dev/raidframe/rf_aselect.h b/sys/dev/raidframe/rf_aselect.h
new file mode 100644
index 00000000000..1b1d3e51795
--- /dev/null
+++ b/sys/dev/raidframe/rf_aselect.h
@@ -0,0 +1,60 @@
+/* $OpenBSD: rf_aselect.h,v 1.1 1999/01/11 14:29:00 niklas Exp $ */
+/* $NetBSD: rf_aselect.h,v 1.1 1998/11/13 04:20:26 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland, William V. Courtright II
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*****************************************************************************
+ *
+ * aselect.h -- header file for algorithm selection code
+ *
+ *****************************************************************************/
+/* :
+ * Log: rf_aselect.h,v
+ * Revision 1.5 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.4 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.3 1995/11/30 16:28:00 wvcii
+ * added copyright info
+ *
+ * Revision 1.2 1995/11/19 16:20:46 wvcii
+ * changed SelectAlgorithm prototype
+ *
+ */
+
+#ifndef _RF__RF_ASELECT_H_
+#define _RF__RF_ASELECT_H_
+
+#include "rf_desc.h"
+
+int rf_SelectAlgorithm(RF_RaidAccessDesc_t *desc, RF_RaidAccessFlags_t flags);
+
+#endif /* !_RF__RF_ASELECT_H_ */
diff --git a/sys/dev/raidframe/rf_callback.c b/sys/dev/raidframe/rf_callback.c
new file mode 100644
index 00000000000..dffd52fc7a6
--- /dev/null
+++ b/sys/dev/raidframe/rf_callback.c
@@ -0,0 +1,121 @@
+/* $OpenBSD: rf_callback.c,v 1.1 1999/01/11 14:29:00 niklas Exp $ */
+/* $NetBSD: rf_callback.c,v 1.1 1998/11/13 04:20:26 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*****************************************************************************************
+ *
+ * callback.c -- code to manipulate callback descriptor
+ *
+ ****************************************************************************************/
+
+/* :
+ * Log: rf_callback.c,v
+ * Revision 1.11 1996/06/17 03:18:04 jimz
+ * include shutdown.h for macroized ShutdownCreate
+ *
+ * Revision 1.10 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.9 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.8 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.7 1996/05/17 16:30:41 jimz
+ * convert to RF_FREELIST stuff
+ *
+ * Revision 1.6 1995/12/01 15:16:04 root
+ * added copyright info
+ *
+ */
+
+#ifndef _KERNEL
+#ifdef __NetBSD__
+#include <unistd.h>
+#endif /* __NetBSD__ */
+#endif
+
+#include "rf_types.h"
+#include "rf_threadstuff.h"
+#include "rf_callback.h"
+#include "rf_debugMem.h"
+#include "rf_freelist.h"
+#include "rf_shutdown.h"
+
+static RF_FreeList_t *rf_callback_freelist;
+
+#define RF_MAX_FREE_CALLBACK 64
+#define RF_CALLBACK_INC 4
+#define RF_CALLBACK_INITIAL 4
+
+static void rf_ShutdownCallback(void *);
+static void rf_ShutdownCallback(ignored)
+ void *ignored;
+{
+ RF_FREELIST_DESTROY(rf_callback_freelist,next,(RF_CallbackDesc_t *));
+}
+
+int rf_ConfigureCallback(listp)
+ RF_ShutdownList_t **listp;
+{
+ int rc;
+
+ RF_FREELIST_CREATE(rf_callback_freelist, RF_MAX_FREE_CALLBACK,
+ RF_CALLBACK_INC, sizeof(RF_CallbackDesc_t));
+ if (rf_callback_freelist == NULL)
+ return(ENOMEM);
+ rc = rf_ShutdownCreate(listp, rf_ShutdownCallback, NULL);
+ if (rc) {
+ RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ rf_ShutdownCallback(NULL);
+ return(rc);
+ }
+ RF_FREELIST_PRIME(rf_callback_freelist, RF_CALLBACK_INITIAL,next,
+ (RF_CallbackDesc_t *));
+ return(0);
+}
+
+RF_CallbackDesc_t *rf_AllocCallbackDesc()
+{
+ RF_CallbackDesc_t *p;
+
+ RF_FREELIST_GET(rf_callback_freelist,p,next,(RF_CallbackDesc_t *));
+ return(p);
+}
+
+void rf_FreeCallbackDesc(p)
+ RF_CallbackDesc_t *p;
+{
+ RF_FREELIST_FREE(rf_callback_freelist,p,next);
+}
diff --git a/sys/dev/raidframe/rf_callback.h b/sys/dev/raidframe/rf_callback.h
new file mode 100644
index 00000000000..cb3db8ebbbd
--- /dev/null
+++ b/sys/dev/raidframe/rf_callback.h
@@ -0,0 +1,92 @@
+/* $OpenBSD: rf_callback.h,v 1.1 1999/01/11 14:29:00 niklas Exp $ */
+/* $NetBSD: rf_callback.h,v 1.1 1998/11/13 04:20:26 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*****************************************************************************************
+ *
+ * callback.h -- header file for callback.c
+ *
+ * the reconstruction code must manage concurrent I/Os on multiple drives.
+ * it sometimes needs to suspend operation on a particular drive until some
+ * condition occurs. we can't block the thread, of course, or we wouldn't
+ * be able to manage our other outstanding I/Os. Instead we just suspend
+ * new activity on the indicated disk, and create a callback descriptor and
+ * put it someplace where it will get invoked when the condition that's
+ * stalling us has cleared. When the descriptor is invoked, it will call
+ * a function that will restart operation on the indicated disk.
+ *
+ ****************************************************************************************/
+
+/* :
+ * Log: rf_callback.h,v
+ * Revision 1.8 1996/08/01 15:57:28 jimz
+ * minor cleanup
+ *
+ * Revision 1.7 1996/07/27 23:36:08 jimz
+ * Solaris port of simulator
+ *
+ * Revision 1.6 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.5 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.4 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.3 1996/05/17 16:30:46 jimz
+ * add prototypes
+ *
+ * Revision 1.2 1995/12/01 15:15:55 root
+ * added copyright info
+ *
+ */
+
+#ifndef _RF__RF_CALLBACK_H_
+#define _RF__RF_CALLBACK_H_
+
+#include "rf_types.h"
+
+struct RF_CallbackDesc_s {
+ void (*callbackFunc)(RF_CBParam_t); /* function to call */
+ RF_CBParam_t callbackArg; /* args to give to function, or just info about this callback */
+ RF_CBParam_t callbackArg2;
+ RF_RowCol_t row; /* disk row and column IDs to give to the callback func */
+ RF_RowCol_t col;
+ RF_CallbackDesc_t *next; /* next entry in list */
+};
+
+int rf_ConfigureCallback(RF_ShutdownList_t **listp);
+RF_CallbackDesc_t *rf_AllocCallbackDesc(void);
+void rf_FreeCallbackDesc(RF_CallbackDesc_t *p);
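+
+#if 0
+/*
+ * Illustrative sketch only (rf_FireCallbackList is not part of this
+ * interface): once the stalling condition clears, a list of descriptors
+ * queued as described above could be fired and recycled like this.
+ */
+static void rf_FireCallbackList(RF_CallbackDesc_t *head)
+{
+	RF_CallbackDesc_t *cb;
+
+	while ((cb = head) != NULL) {
+		head = cb->next;                      /* unlink before invoking */
+		(*cb->callbackFunc)(cb->callbackArg); /* restarts the stalled disk */
+		rf_FreeCallbackDesc(cb);
+	}
+}
+#endif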
+
+#endif /* !_RF__RF_CALLBACK_H_ */
diff --git a/sys/dev/raidframe/rf_ccmn.h b/sys/dev/raidframe/rf_ccmn.h
new file mode 100644
index 00000000000..f13778c0cd4
--- /dev/null
+++ b/sys/dev/raidframe/rf_ccmn.h
@@ -0,0 +1,115 @@
+/* $OpenBSD: rf_ccmn.h,v 1.1 1999/01/11 14:29:01 niklas Exp $ */
+/* $NetBSD: rf_ccmn.h,v 1.1 1998/11/13 04:20:26 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* rf_ccmn.h
+ * header file that declares the ccmn routines, and includes
+ * the files needed to use them.
+ */
+
+/* :
+ * Log: rf_ccmn.h,v
+ * Revision 1.4 1996/07/18 22:57:14 jimz
+ * port simulator to AIX
+ *
+ * Revision 1.3 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.2 1995/12/01 15:16:45 root
+ * added copyright info
+ *
+ */
+
+#ifndef _RF__RF_CCMN_H_
+#define _RF__RF_CCMN_H_
+
+#ifdef __osf__
+#include <sys/errno.h>
+#include <sys/types.h>
+#include <sys/file.h>
+#include <sys/param.h>
+#include <sys/uio.h>
+#include <sys/time.h>
+#include <sys/buf.h>
+#include <sys/ioctl.h>
+#include <io/common/iotypes.h>
+#include <io/cam/cam_debug.h>
+#include <io/cam/cam.h>
+#include <io/cam/dec_cam.h>
+#include <io/cam/uagt.h>
+#include <io/cam/scsi_all.h>
+#include <io/cam/scsi_direct.h>
+
+#ifdef KERNEL
+#include <sys/conf.h>
+#include <sys/mtio.h>
+#include <io/common/devio.h>
+#include <io/common/devdriver.h>
+#include <io/cam/scsi_status.h>
+#include <io/cam/pdrv.h>
+#include <io/common/pt.h>
+#include <sys/disklabel.h>
+#include <io/cam/cam_disk.h>
+#include <io/cam/ccfg.h>
+
+extern void ccmn_init();
+extern long ccmn_open_unit();
+extern void ccmn_close_unit();
+extern u_long ccmn_send_ccb();
+extern void ccmn_rem_ccb();
+extern void ccmn_abort_que();
+extern void ccmn_term_que();
+extern CCB_HEADER *ccmn_get_ccb();
+extern void ccmn_rel_ccb();
+extern CCB_SCSIIO *ccmn_io_ccb_bld();
+extern CCB_GETDEV *ccmn_gdev_ccb_bld();
+extern CCB_SETDEV *ccmn_sdev_ccb_bld();
+extern CCB_SETASYNC *ccmn_sasy_ccb_bld();
+extern CCB_RELSIM *ccmn_rsq_ccb_bld();
+extern CCB_PATHINQ *ccmn_pinq_ccb_bld();
+extern CCB_ABORT *ccmn_abort_ccb_bld();
+extern CCB_TERMIO *ccmn_term_ccb_bld();
+extern CCB_RESETDEV *ccmn_bdr_ccb_bld();
+extern CCB_RESETBUS *ccmn_br_ccb_bld();
+extern CCB_SCSIIO *ccmn_tur();
+extern CCB_SCSIIO *ccmn_mode_select();
+extern u_long ccmn_ccb_status();
+extern struct buf *ccmn_get_bp();
+extern void ccmn_rel_bp();
+extern u_char *ccmn_get_dbuf();
+extern void ccmn_rel_dbuf();
+
+extern struct device *camdinfo[];
+extern struct controller *camminfo[];
+extern PDRV_UNIT_ELEM pdrv_unit_table[];
+
+#endif /* KERNEL */
+#endif /* __osf__ */
+
+#endif /* !_RF__RF_CCMN_H_ */
diff --git a/sys/dev/raidframe/rf_chaindecluster.c b/sys/dev/raidframe/rf_chaindecluster.c
new file mode 100644
index 00000000000..bbb7caa92ec
--- /dev/null
+++ b/sys/dev/raidframe/rf_chaindecluster.c
@@ -0,0 +1,382 @@
+/* $OpenBSD: rf_chaindecluster.c,v 1.1 1999/01/11 14:29:01 niklas Exp $ */
+/* $NetBSD: rf_chaindecluster.c,v 1.1 1998/11/13 04:20:26 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Khalil Amiri
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/******************************************************************************
+ *
+ * rf_chaindecluster.c -- implements chained declustering
+ *
+ *****************************************************************************/
+
+/* :
+ * Log: rf_chaindecluster.c,v
+ * Revision 1.33 1996/08/02 13:20:34 jimz
+ * get rid of bogus (long) casts
+ *
+ * Revision 1.32 1996/07/31 16:56:18 jimz
+ * dataBytesPerStripe, sectorsPerDisk init arch-indep.
+ *
+ * Revision 1.31 1996/07/29 14:05:12 jimz
+ * fix numPUs/numRUs confusion (everything is now numRUs)
+ * clean up some commenting, return values
+ *
+ * Revision 1.30 1996/07/22 19:52:16 jimz
+ * switched node params to RF_DagParam_t, a union of
+ * a 64-bit int and a void *, for better portability
+ * attempted hpux port, but failed partway through for
+ * lack of a single C compiler capable of compiling all
+ * source files
+ *
+ * Revision 1.29 1996/07/18 22:57:14 jimz
+ * port simulator to AIX
+ *
+ * Revision 1.28 1996/06/19 17:53:48 jimz
+ * move GetNumSparePUs, InstallSpareTable ops into layout switch
+ *
+ * Revision 1.27 1996/06/11 15:19:57 wvcii
+ * added include of rf_chaindecluster.h
+ * fixed parameter list of rf_ConfigureChainDecluster
+ *
+ * Revision 1.26 1996/06/11 08:55:15 jimz
+ * improved error-checking at configuration time
+ *
+ * Revision 1.25 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.24 1996/06/07 22:26:27 jimz
+ * type-ify which_ru (RF_ReconUnitNum_t)
+ *
+ * Revision 1.23 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.22 1996/06/06 17:31:30 jimz
+ * use CreateMirrorPartitionReadDAG for mirrored reads
+ *
+ * Revision 1.21 1996/06/03 23:28:26 jimz
+ * more bugfixes
+ * check in tree to sync for IPDS runs with current bugfixes
+ * there still may be a problem with threads in the script test
+ * getting I/Os stuck- not trivially reproducible (runs ~50 times
+ * in a row without getting stuck)
+ *
+ * Revision 1.20 1996/06/02 17:31:48 jimz
+ * Moved a lot of global stuff into array structure, where it belongs.
+ * Fixed up paritylogging, pss modules in this manner. Some general
+ * code cleanup. Removed lots of dead code, some dead files.
+ *
+ * Revision 1.19 1996/05/31 22:26:54 jimz
+ * fix a lot of mapping problems, memory allocation problems
+ * found some weird lock issues, fixed 'em
+ * more code cleanup
+ *
+ * Revision 1.18 1996/05/31 16:13:28 amiri
+ * removed/added some commnets.
+ *
+ * Revision 1.17 1996/05/31 05:01:52 amiri
+ * fixed a bug related to sparing layout.
+ *
+ * Revision 1.16 1996/05/30 23:22:16 jimz
+ * bugfixes of serialization, timing problems
+ * more cleanup
+ *
+ * Revision 1.15 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.14 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.13 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.12 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.11 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.10 1996/05/03 19:53:56 wvcii
+ * removed include of rf_redstripe.h
+ * moved dag creation routines to new dag library
+ *
+ */
+
+#include "rf_archs.h"
+#include "rf_types.h"
+#include "rf_raid.h"
+#include "rf_chaindecluster.h"
+#include "rf_dag.h"
+#include "rf_dagutils.h"
+#include "rf_dagffrd.h"
+#include "rf_dagffwr.h"
+#include "rf_dagdegrd.h"
+#include "rf_dagfuncs.h"
+#include "rf_threadid.h"
+#include "rf_general.h"
+#include "rf_utils.h"
+
+typedef struct RF_ChaindeclusterConfigInfo_s {
+ RF_RowCol_t **stripeIdentifier; /* filled in at config time
+ * and used by IdentifyStripe */
+ RF_StripeCount_t numSparingRegions;
+ RF_StripeCount_t stripeUnitsPerSparingRegion;
+ RF_SectorNum_t mirrorStripeOffset;
+} RF_ChaindeclusterConfigInfo_t;
+
+int rf_ConfigureChainDecluster(
+ RF_ShutdownList_t **listp,
+ RF_Raid_t *raidPtr,
+ RF_Config_t *cfgPtr)
+{
+ RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
+ RF_StripeCount_t num_used_stripeUnitsPerDisk;
+ RF_ChaindeclusterConfigInfo_t *info;
+ RF_RowCol_t i;
+
+ /* create a Chained Declustering configuration structure */
+ RF_MallocAndAdd(info, sizeof(RF_ChaindeclusterConfigInfo_t), (RF_ChaindeclusterConfigInfo_t *), raidPtr->cleanupList);
+ if (info == NULL)
+ return(ENOMEM);
+ layoutPtr->layoutSpecificInfo = (void *) info;
+
+ /* fill in the config structure. */
+ info->stripeIdentifier = rf_make_2d_array(raidPtr->numCol, 2 , raidPtr->cleanupList);
+ if (info->stripeIdentifier == NULL)
+ return(ENOMEM);
+ for (i=0; i< raidPtr->numCol; i++) {
+ info->stripeIdentifier[i][0] = i % raidPtr->numCol;
+ info->stripeIdentifier[i][1] = (i+1) % raidPtr->numCol;
+ }
+
+ RF_ASSERT(raidPtr->numRow == 1);
+
+ /* fill in the remaining layout parameters */
+ num_used_stripeUnitsPerDisk = layoutPtr->stripeUnitsPerDisk - (layoutPtr->stripeUnitsPerDisk %
+ (2*raidPtr->numCol-2) );
+ info->numSparingRegions = num_used_stripeUnitsPerDisk / (2*raidPtr->numCol-2);
+ info->stripeUnitsPerSparingRegion = raidPtr->numCol * (raidPtr->numCol - 1);
+ info->mirrorStripeOffset = info->numSparingRegions * (raidPtr->numCol-1);
+ layoutPtr->numStripe = info->numSparingRegions * info->stripeUnitsPerSparingRegion;
+ layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit << raidPtr->logBytesPerSector;
+ layoutPtr->numDataCol = 1;
+ layoutPtr->dataSectorsPerStripe = layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit;
+ layoutPtr->numParityCol = 1;
+
+ layoutPtr->dataStripeUnitsPerDisk = num_used_stripeUnitsPerDisk;
+
+ raidPtr->sectorsPerDisk =
+ num_used_stripeUnitsPerDisk * layoutPtr->sectorsPerStripeUnit;
+
+ raidPtr->totalSectors =
+ (layoutPtr->numStripe) * layoutPtr->sectorsPerStripeUnit;
+
+ layoutPtr->stripeUnitsPerDisk = raidPtr->sectorsPerDisk / layoutPtr->sectorsPerStripeUnit;
+
+ return(0);
+}
+
+RF_ReconUnitCount_t rf_GetNumSpareRUsChainDecluster(raidPtr)
+ RF_Raid_t *raidPtr;
+{
+ RF_ChaindeclusterConfigInfo_t *info = (RF_ChaindeclusterConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
+
+ /*
+ * The layout uses two stripe units per disk as spare within each
+ * sparing region.
+ */
+ return (2*info->numSparingRegions);
+}
+
+
+/* Maps to the primary copy of the data, i.e. the first copy of the mirror pair */
+void rf_MapSectorChainDecluster(
+ RF_Raid_t *raidPtr,
+ RF_RaidAddr_t raidSector,
+ RF_RowCol_t *row,
+ RF_RowCol_t *col,
+ RF_SectorNum_t *diskSector,
+ int remap)
+{
+ RF_ChaindeclusterConfigInfo_t *info = (RF_ChaindeclusterConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
+ RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit;
+ RF_SectorNum_t index_within_region, index_within_disk;
+ RF_StripeNum_t sparing_region_id;
+ int col_before_remap;
+
+ *row = 0;
+ sparing_region_id = SUID / info->stripeUnitsPerSparingRegion;
+ index_within_region = SUID % info->stripeUnitsPerSparingRegion;
+ index_within_disk = index_within_region / raidPtr->numCol;
+ col_before_remap = SUID % raidPtr->numCol;
+
+ if (!remap) {
+ *col = col_before_remap;
+ *diskSector = ( index_within_disk + ( (raidPtr->numCol-1) * sparing_region_id) ) *
+ raidPtr->Layout.sectorsPerStripeUnit;
+ *diskSector += (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
+ }
+ else {
+ /* remap sector to spare space...*/
+ *diskSector = sparing_region_id * (raidPtr->numCol+1) * raidPtr->Layout.sectorsPerStripeUnit;
+ *diskSector += (raidPtr->numCol-1) * raidPtr->Layout.sectorsPerStripeUnit;
+ *diskSector += (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
+ index_within_disk = index_within_region / raidPtr->numCol;
+ if (index_within_disk < col_before_remap )
+ *col = index_within_disk;
+ else if (index_within_disk == raidPtr->numCol-2 ) {
+ *col = (col_before_remap+raidPtr->numCol-1) % raidPtr->numCol;
+ *diskSector += raidPtr->Layout.sectorsPerStripeUnit;
+ }
+ else
+ *col = (index_within_disk + 2) % raidPtr->numCol;
+ }
+
+}
+
+
+
+/* Maps to the second copy of the mirror pair, which is chain declustered. The second copy is contained
+ in the next disk (mod numCol) after the disk containing the primary copy.
+ The offset into the disk is one-half disk down */
+void rf_MapParityChainDecluster(
+ RF_Raid_t *raidPtr,
+ RF_RaidAddr_t raidSector,
+ RF_RowCol_t *row,
+ RF_RowCol_t *col,
+ RF_SectorNum_t *diskSector,
+ int remap)
+{
+ RF_ChaindeclusterConfigInfo_t *info = (RF_ChaindeclusterConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
+ RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit;
+ RF_SectorNum_t index_within_region, index_within_disk;
+ RF_StripeNum_t sparing_region_id;
+ int col_before_remap;
+
+ *row = 0;
+ if (!remap) {
+ *col = SUID % raidPtr->numCol;
+ *col = (*col + 1) % raidPtr->numCol;
+ *diskSector = info->mirrorStripeOffset * raidPtr->Layout.sectorsPerStripeUnit;
+ *diskSector += ( SUID / raidPtr->numCol ) * raidPtr->Layout.sectorsPerStripeUnit;
+ *diskSector += (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
+ }
+ else {
+ /* remap parity to spare space ... */
+ sparing_region_id = SUID / info->stripeUnitsPerSparingRegion;
+ index_within_region = SUID % info->stripeUnitsPerSparingRegion;
+ index_within_disk = index_within_region / raidPtr->numCol;
+ *diskSector = sparing_region_id * (raidPtr->numCol+1) * raidPtr->Layout.sectorsPerStripeUnit;
+ *diskSector += (raidPtr->numCol) * raidPtr->Layout.sectorsPerStripeUnit;
+ *diskSector += (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
+ col_before_remap = SUID % raidPtr->numCol;
+ if (index_within_disk < col_before_remap)
+ *col = index_within_disk;
+ else if (index_within_disk == raidPtr->numCol-2 ) {
+ *col = (col_before_remap+2) % raidPtr->numCol;
+ *diskSector -= raidPtr->Layout.sectorsPerStripeUnit;
+ }
+ else
+ *col = (index_within_disk + 2) % raidPtr->numCol;
+ }
+
+}
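+
+/*
+ * Worked example (illustrative only, assuming numCol = 4 and remap = 0):
+ * stripe unit 5 puts its primary copy on column 1 at stripe-unit offset 1
+ * (rf_MapSectorChainDecluster above) and its mirror copy on column 2 at
+ * stripe-unit offset mirrorStripeOffset + 1, i.e. roughly half-way down the
+ * disk, as described in the comment above rf_MapParityChainDecluster.
+ */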
+
+void rf_IdentifyStripeChainDecluster(
+ RF_Raid_t *raidPtr,
+ RF_RaidAddr_t addr,
+ RF_RowCol_t **diskids,
+ RF_RowCol_t *outRow)
+{
+ RF_ChaindeclusterConfigInfo_t *info = (RF_ChaindeclusterConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
+ RF_StripeNum_t SUID;
+ RF_RowCol_t col;
+
+ SUID = addr / raidPtr->Layout.sectorsPerStripeUnit;
+ col = SUID % raidPtr->numCol;
+ *outRow = 0;
+ *diskids = info->stripeIdentifier[ col ];
+}
+
+void rf_MapSIDToPSIDChainDecluster(
+ RF_RaidLayout_t *layoutPtr,
+ RF_StripeNum_t stripeID,
+ RF_StripeNum_t *psID,
+ RF_ReconUnitNum_t *which_ru)
+{
+ *which_ru = 0;
+ *psID = stripeID;
+}
+
+/******************************************************************************
+ * select a graph to perform a single-stripe access
+ *
+ * Parameters: raidPtr - description of the physical array
+ * type - type of operation (read or write) requested
+ * asmap - logical & physical addresses for this access
+ * createFunc - function to use to create the graph (return value)
+ *****************************************************************************/
+
+void rf_RAIDCDagSelect(
+ RF_Raid_t *raidPtr,
+ RF_IoType_t type,
+ RF_AccessStripeMap_t *asmap,
+ RF_VoidFuncPtr *createFunc)
+#if 0
+ void (**createFunc)(RF_Raid_t *, RF_AccessStripeMap_t *,
+ RF_DagHeader_t *, void *, RF_RaidAccessFlags_t,
+ RF_AllocListElem_t *))
+#endif
+{
+ RF_ASSERT(RF_IO_IS_R_OR_W(type));
+ RF_ASSERT(raidPtr->numRow == 1);
+
+ if (asmap->numDataFailed + asmap->numParityFailed > 1) {
+ RF_ERRORMSG("Multiple disks failed in a single group! Aborting I/O operation.\n");
+ *createFunc = NULL;
+ return;
+ }
+
+ *createFunc = (type == RF_IO_TYPE_READ) ? (RF_VoidFuncPtr)rf_CreateFaultFreeReadDAG :(RF_VoidFuncPtr) rf_CreateRaidOneWriteDAG;
+
+ if (type == RF_IO_TYPE_READ) {
+ if ( ( raidPtr->status[0] == rf_rs_degraded ) || ( raidPtr->status[0] == rf_rs_reconstructing) )
+ *createFunc = (RF_VoidFuncPtr)rf_CreateRaidCDegradedReadDAG; /* array status is degraded, implement workload shifting */
+ else
+ *createFunc = (RF_VoidFuncPtr)rf_CreateMirrorPartitionReadDAG; /* array status not degraded, so use mirror partition dag */
+ }
+ else
+ *createFunc = (RF_VoidFuncPtr)rf_CreateRaidOneWriteDAG;
+}
diff --git a/sys/dev/raidframe/rf_chaindecluster.h b/sys/dev/raidframe/rf_chaindecluster.h
new file mode 100644
index 00000000000..52a94deac2f
--- /dev/null
+++ b/sys/dev/raidframe/rf_chaindecluster.h
@@ -0,0 +1,123 @@
+/* $OpenBSD: rf_chaindecluster.h,v 1.1 1999/01/11 14:29:01 niklas Exp $ */
+/* $NetBSD: rf_chaindecluster.h,v 1.1 1998/11/13 04:20:26 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Khalil Amiri
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* rf_chaindecluster.h
+ * header file for Chained Declustering
+ */
+
+/*
+ * :
+ * Log: rf_chaindecluster.h,v
+ * Revision 1.14 1996/07/29 14:05:12 jimz
+ * fix numPUs/numRUs confusion (everything is now numRUs)
+ * clean up some commenting, return values
+ *
+ * Revision 1.13 1996/07/22 19:52:16 jimz
+ * switched node params to RF_DagParam_t, a union of
+ * a 64-bit int and a void *, for better portability
+ * attempted hpux port, but failed partway through for
+ * lack of a single C compiler capable of compiling all
+ * source files
+ *
+ * Revision 1.12 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.11 1996/06/07 22:26:27 jimz
+ * type-ify which_ru (RF_ReconUnitNum_t)
+ *
+ * Revision 1.10 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.9 1996/06/03 23:28:26 jimz
+ * more bugfixes
+ * check in tree to sync for IPDS runs with current bugfixes
+ * there still may be a problem with threads in the script test
+ * getting I/Os stuck- not trivially reproducible (runs ~50 times
+ * in a row without getting stuck)
+ *
+ * Revision 1.8 1996/05/31 22:26:54 jimz
+ * fix a lot of mapping problems, memory allocation problems
+ * found some weird lock issues, fixed 'em
+ * more code cleanup
+ *
+ * Revision 1.7 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.6 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.5 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.4 1996/02/22 16:45:59 amiri
+ * added declaration of dag selection function
+ *
+ * Revision 1.3 1995/12/01 15:16:56 root
+ * added copyright info
+ *
+ * Revision 1.2 1995/11/17 19:55:21 amiri
+ * prototyped MapParityChainDecluster
+ */
+
+#ifndef _RF__RF_CHAINDECLUSTER_H_
+#define _RF__RF_CHAINDECLUSTER_H_
+
+int rf_ConfigureChainDecluster(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr,
+ RF_Config_t *cfgPtr);
+RF_ReconUnitCount_t rf_GetNumSpareRUsChainDecluster(RF_Raid_t *raidPtr);
+void rf_MapSectorChainDecluster(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector,
+ RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap);
+void rf_MapParityChainDecluster(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector,
+ RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap);
+void rf_IdentifyStripeChainDecluster(RF_Raid_t *raidPtr, RF_RaidAddr_t addr,
+ RF_RowCol_t **diskids, RF_RowCol_t *outRow);
+void rf_MapSIDToPSIDChainDecluster(RF_RaidLayout_t *layoutPtr,
+ RF_StripeNum_t stripeID, RF_StripeNum_t *psID,
+ RF_ReconUnitNum_t *which_ru);
+void rf_RAIDCDagSelect(RF_Raid_t *raidPtr, RF_IoType_t type,
+ RF_AccessStripeMap_t *asmap,
+ RF_VoidFuncPtr *);
+#if 0
+ void (**createFunc)(RF_Raid_t *,
+ RF_AccessStripeMap_t *,
+ RF_DagHeader_t *,
+ void *,
+ RF_RaidAccessFlags_t,
+ RF_AllocListElem_t *)
+);
+#endif
+
+#endif /* !_RF__RF_CHAINDECLUSTER_H_ */
diff --git a/sys/dev/raidframe/rf_configure.h b/sys/dev/raidframe/rf_configure.h
new file mode 100644
index 00000000000..aee456c52a2
--- /dev/null
+++ b/sys/dev/raidframe/rf_configure.h
@@ -0,0 +1,127 @@
+/* $OpenBSD: rf_configure.h,v 1.1 1999/01/11 14:29:02 niklas Exp $ */
+/* $NetBSD: rf_configure.h,v 1.1 1998/11/13 04:20:26 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/********************************
+ *
+ * rf_configure.h
+ *
+ * header file for raidframe configuration in the kernel version only.
+ * configuration is invoked via ioctl rather than at boot time
+ *
+ *******************************/
+
+/* :
+ * Log: rf_configure.h,v
+ * Revision 1.16 1996/06/19 14:57:53 jimz
+ * move layout-specific config parsing hooks into RF_LayoutSW_t
+ * table in rf_layout.c
+ *
+ * Revision 1.15 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.14 1996/05/31 22:26:54 jimz
+ * fix a lot of mapping problems, memory allocation problems
+ * found some weird lock issues, fixed 'em
+ * more code cleanup
+ *
+ * Revision 1.13 1996/05/30 11:29:41 jimz
+ * Numerous bug fixes. Stripe lock release code disagreed with the taking code
+ * about when stripes should be locked (I made it consistent: no parity, no lock)
+ * There was a lot of extra serialization of I/Os which I've removed- a lot of
+ * it was to calculate values for the cache code, which is no longer with us.
+ * More types, function, macro cleanup. Added code to properly quiesce the array
+ * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
+ * before. Fixed memory allocation, freeing bugs.
+ *
+ * Revision 1.12 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.11 1996/05/24 01:59:45 jimz
+ * another checkpoint in code cleanup for release
+ * time to sync kernel tree
+ *
+ * Revision 1.10 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.9 1996/05/18 20:09:51 jimz
+ * bit of cleanup to compile cleanly in kernel, once again
+ *
+ * Revision 1.8 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.7 1995/12/01 15:16:26 root
+ * added copyright info
+ *
+ */
+
+#ifndef _RF__RF_CONFIGURE_H_
+#define _RF__RF_CONFIGURE_H_
+
+#include "rf_archs.h"
+#include "rf_types.h"
+
+#include <sys/param.h>
+#include <sys/proc.h>
+
+#include <sys/ioctl.h>
+
+/* the raidframe configuration, passed down through an ioctl.
+ * the driver can be reconfigured (with total loss of data) at any time,
+ * but it must be shut down first.
+ */
+struct RF_Config_s {
+ RF_RowCol_t numRow, numCol, numSpare; /* number of rows, columns, and spare disks */
+ dev_t devs[RF_MAXROW][RF_MAXCOL]; /* device numbers for disks comprising array */
+ char devnames[RF_MAXROW][RF_MAXCOL][50]; /* device names */
+ dev_t spare_devs[RF_MAXSPARE]; /* device numbers for spare disks */
+ char spare_names[RF_MAXSPARE][50]; /* device names */
+ RF_SectorNum_t sectPerSU; /* sectors per stripe unit */
+ RF_StripeNum_t SUsPerPU; /* stripe units per parity unit */
+ RF_StripeNum_t SUsPerRU; /* stripe units per reconstruction unit */
+ RF_ParityConfig_t parityConfig; /* identifies the RAID architecture to be used */
+ RF_DiskQueueType_t diskQueueType; /* 'f' = fifo, 'c' = cvscan, not used in kernel */
+ char maxOutstandingDiskReqs; /* # concurrent reqs to be sent to a disk. not used in kernel. */
+ char debugVars[RF_MAXDBGV][50]; /* space for specifying debug variables & their values */
+ unsigned int layoutSpecificSize; /* size in bytes of layout-specific info */
+ void *layoutSpecific; /* a pointer to a layout-specific structure to be copied in */
+};
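+
+#if 0
+/*
+ * Illustrative sketch only (rf_ExampleFillConfig is not part of the driver):
+ * the shape of a minimal single-row, three-column configuration.  The values
+ * are made up; devs/devnames (and spare_devs/spare_names if numSpare > 0)
+ * must also be filled in before the structure is passed down.
+ */
+static void rf_ExampleFillConfig(RF_Config_t *cfg)
+{
+	cfg->numRow = 1;
+	cfg->numCol = 3;
+	cfg->numSpare = 0;
+	cfg->sectPerSU = 32;		/* sectors per stripe unit */
+	cfg->SUsPerPU = 1;
+	cfg->SUsPerRU = 1;
+	cfg->parityConfig = '5';	/* RAID architecture identifier */
+	cfg->diskQueueType = 'f';	/* fifo (not used in kernel) */
+	cfg->maxOutstandingDiskReqs = 1;
+	cfg->layoutSpecificSize = 0;
+	cfg->layoutSpecific = NULL;
+}
+#endif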
+
+#ifndef KERNEL
+int rf_MakeConfig(char *configname, RF_Config_t *cfgPtr);
+int rf_MakeLayoutSpecificNULL(FILE *fp, RF_Config_t *cfgPtr, void *arg);
+int rf_MakeLayoutSpecificDeclustered(FILE *configfp, RF_Config_t *cfgPtr, void *arg);
+void *rf_ReadSpareTable(RF_SparetWait_t *req, char *fname);
+#endif /* !KERNEL */
+
+#endif /* !_RF__RF_CONFIGURE_H_ */
diff --git a/sys/dev/raidframe/rf_copyback.c b/sys/dev/raidframe/rf_copyback.c
new file mode 100644
index 00000000000..b2fe641fded
--- /dev/null
+++ b/sys/dev/raidframe/rf_copyback.c
@@ -0,0 +1,577 @@
+/* $OpenBSD: rf_copyback.c,v 1.1 1999/01/11 14:29:02 niklas Exp $ */
+/* $NetBSD: rf_copyback.c,v 1.1 1998/11/13 04:20:27 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*****************************************************************************************
+ *
+ * copyback.c -- code to copy reconstructed data back from spare space to
+ * the replaced disk.
+ *
+ * the code operates using callbacks on the I/Os to continue with the next
+ * unit to be copied back. We do this because a simple loop containing blocking I/Os
+ * will not work in the simulator.
+ *
+ ****************************************************************************************/
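+
+/*
+ * Roughly, the callback chain looks like this (illustration only; see the
+ * routines below):
+ *
+ *   rf_ContinueCopyback() finds the next unit on the replaced disk
+ *     -> rf_CopybackOne() issues the read from spare space
+ *       -> rf_CopybackReadDoneProc() issues the write to the replaced disk
+ *         -> rf_CopybackWriteDoneProc() calls rf_ContinueCopyback() again
+ */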
+
+/*
+ * :
+ * Log: rf_copyback.c,v
+ * Revision 1.26 1996/08/06 22:26:00 jimz
+ * don't include sys/buf.h on linux
+ *
+ * Revision 1.25 1996/07/30 03:30:40 jimz
+ * include rf_types.h first
+ *
+ * Revision 1.24 1996/07/27 18:39:52 jimz
+ * cleanup sweep
+ *
+ * Revision 1.23 1996/07/18 22:57:14 jimz
+ * port simulator to AIX
+ *
+ * Revision 1.22 1996/07/11 19:08:00 jimz
+ * generalize reconstruction mechanism
+ * allow raid1 reconstructs via copyback (done with array
+ * quiesced, not online, therefore not disk-directed)
+ *
+ * Revision 1.21 1996/07/11 16:03:47 jimz
+ * fixed hanging bug in rf_CopybackWriteDoneProc()
+ *
+ * Revision 1.20 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.19 1996/06/09 02:36:46 jimz
+ * lots of little crufty cleanup- fixup whitespace
+ * issues, comment #ifdefs, improve typing in some
+ * places (esp size-related)
+ *
+ * Revision 1.18 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.17 1996/06/05 18:06:02 jimz
+ * Major code cleanup. The Great Renaming is now done.
+ * Better modularity. Better typing. Fixed a bunch of
+ * synchronization bugs. Made a lot of global stuff
+ * per-desc or per-array. Removed dead code.
+ *
+ * Revision 1.16 1996/06/03 23:28:26 jimz
+ * more bugfixes
+ * check in tree to sync for IPDS runs with current bugfixes
+ * there still may be a problem with threads in the script test
+ * getting I/Os stuck- not trivially reproducible (runs ~50 times
+ * in a row without getting stuck)
+ *
+ * Revision 1.15 1996/06/02 17:31:48 jimz
+ * Moved a lot of global stuff into array structure, where it belongs.
+ * Fixed up paritylogging, pss modules in this manner. Some general
+ * code cleanup. Removed lots of dead code, some dead files.
+ *
+ * Revision 1.14 1996/05/31 22:26:54 jimz
+ * fix a lot of mapping problems, memory allocation problems
+ * found some weird lock issues, fixed 'em
+ * more code cleanup
+ *
+ * Revision 1.13 1996/05/30 11:29:41 jimz
+ * Numerous bug fixes. Stripe lock release code disagreed with the taking code
+ * about when stripes should be locked (I made it consistent: no parity, no lock)
+ * There was a lot of extra serialization of I/Os which I've removed- a lot of
+ * it was to calculate values for the cache code, which is no longer with us.
+ * More types, function, macro cleanup. Added code to properly quiesce the array
+ * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
+ * before. Fixed memory allocation, freeing bugs.
+ *
+ * Revision 1.12 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.11 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.10 1996/05/24 01:59:45 jimz
+ * another checkpoint in code cleanup for release
+ * time to sync kernel tree
+ *
+ * Revision 1.9 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.8 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.7 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.6 1995/12/12 18:10:06 jimz
+ * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT
+ * fix 80-column brain damage in comments
+ *
+ * Revision 1.5 1995/12/01 15:15:31 root
+ * added copyright info
+ *
+ * Revision 1.4 1995/06/23 13:41:36 robby
+ * updeated to prototypes in rf_layout.h
+ *
+ */
+
+#include "rf_types.h"
+#include <sys/time.h>
+#ifndef LINUX
+#include <sys/buf.h>
+#endif /* !LINUX */
+#include "rf_raid.h"
+#include "rf_threadid.h"
+#include "rf_mcpair.h"
+#include "rf_acctrace.h"
+#include "rf_etimer.h"
+#include "rf_general.h"
+#include "rf_utils.h"
+#include "rf_copyback.h"
+#if !defined(__NetBSD__) && !defined(__OpenBSD__)
+#include "rf_camlayer.h"
+#endif
+#include "rf_decluster.h"
+#include "rf_driver.h"
+#include "rf_shutdown.h"
+#include "rf_sys.h"
+
+#define RF_COPYBACK_DATA 0
+#define RF_COPYBACK_PARITY 1
+
+int rf_copyback_in_progress;
+
+static int rf_CopybackReadDoneProc(RF_CopybackDesc_t *desc, int status);
+static int rf_CopybackWriteDoneProc(RF_CopybackDesc_t *desc, int status);
+static void rf_CopybackOne(RF_CopybackDesc_t *desc, int typ,
+ RF_RaidAddr_t addr, RF_RowCol_t testRow, RF_RowCol_t testCol,
+ RF_SectorNum_t testOffs);
+static void rf_CopybackComplete(RF_CopybackDesc_t *desc, int status);
+
+int rf_ConfigureCopyback(listp)
+ RF_ShutdownList_t **listp;
+{
+ rf_copyback_in_progress = 0;
+ return(0);
+}
+
+#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL)
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/ioctl.h>
+#include <sys/fcntl.h>
+#ifdef __NetBSD__
+#include <sys/vnode.h>
+#endif
+
+int raidlookup __P((char *, struct proc *, struct vnode **));
+#endif
+
+/* do a complete copyback */
+void rf_CopybackReconstructedData(raidPtr)
+ RF_Raid_t *raidPtr;
+{
+#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL)
+ int done,retcode;
+ RF_CopybackDesc_t *desc;
+ RF_RowCol_t frow, fcol;
+ RF_RaidDisk_t *badDisk;
+ char *databuf;
+
+ struct partinfo dpart;
+ struct vnode *vp;
+ struct vattr va;
+ struct proc *proc;
+
+#else
+ int bus, targ, lun, done, retcode;
+ RF_CopybackDesc_t *desc;
+ RF_RowCol_t frow, fcol;
+ RF_RaidDisk_t *badDisk;
+ RF_DiskOp_t *tur_op;
+ char *databuf;
+#endif
+
+ done = 0;
+ fcol = 0;
+ for (frow=0; frow<raidPtr->numRow; frow++) {
+ for (fcol=0; fcol<raidPtr->numCol; fcol++) {
+ if (raidPtr->Disks[frow][fcol].status == rf_ds_dist_spared
+ || raidPtr->Disks[frow][fcol].status == rf_ds_spared)
+ {
+ done = 1;
+ break;
+ }
+ }
+ if (done)
+ break;
+ }
+
+ if (frow == raidPtr->numRow) {
+ printf("COPYBACK: no disks need copyback\n");
+ return;
+ }
+
+ badDisk = &raidPtr->Disks[frow][fcol];
+#ifndef SIMULATE
+#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL)
+
+ proc = raidPtr->proc; /* XXX Yes, this is not nice.. */
+
+#if 0
+ printf("Pretending the disk is happy...\n");
+ retcode = 0; /* XXX this should be set to something more realistic. */
+#endif
+
+ /* This device may have been opened successfully the first time.
+ Close it before trying to open it again.. */
+
+ if (raidPtr->raid_cinfo[frow][fcol].ci_vp != NULL) {
+ printf("Closed the open device: %s\n",
+ raidPtr->Disks[frow][fcol].devname);
+ (void)vn_close(raidPtr->raid_cinfo[frow][fcol].ci_vp,
+ FREAD|FWRITE, proc->p_ucred, proc);
+ }
+
+ printf("About to (re-)open the device: %s\n",
+ raidPtr->Disks[frow][fcol].devname);
+
+ retcode = raidlookup(raidPtr->Disks[frow][fcol].devname, proc, &vp);
+
+ if (retcode) {
+ printf("COPYBACK: raidlookup on device: %s failed: %d!\n",
+ raidPtr->Disks[frow][fcol].devname, retcode);
+
+ /* XXX the component isn't responding properly...
+ must be still dead :-( */
+ return;
+
+ } else {
+
+ /* Ok, so we can at least do a lookup... How about actually
+ getting a vp for it? */
+
+ if ((retcode = VOP_GETATTR(vp, &va, proc->p_ucred, proc)) != 0) {
+ return;
+ }
+
+ retcode = VOP_IOCTL(vp, DIOCGPART, (caddr_t)&dpart,
+ FREAD, proc->p_ucred, proc);
+ if (retcode) {
+ return;
+ }
+ raidPtr->Disks[frow][fcol].blockSize = dpart.disklab->d_secsize;
+
+ raidPtr->Disks[frow][fcol].numBlocks = dpart.part->p_size -
+ rf_protectedSectors;
+
+ raidPtr->raid_cinfo[frow][fcol].ci_vp = vp;
+ raidPtr->raid_cinfo[frow][fcol].ci_dev = va.va_rdev;
+
+ raidPtr->Disks[frow][fcol].dev = va.va_rdev; /* XXX or the above? */
+
+ /* we allow the user to specify that only a fraction of the
+ * disks should be used this is just for debug: it speeds up
+ * the parity scan
+ */
+ raidPtr->Disks[frow][fcol].numBlocks =
+ raidPtr->Disks[frow][fcol].numBlocks *
+ rf_sizePercentage / 100;
+ }
+#else
+ if (rf_extract_ids(badDisk->devname, &bus, &targ, &lun)) {
+ printf("COPYBACK: unable to extract bus, target, lun from devname %s\n",
+ badDisk->devname);
+ return;
+ }
+
+ /* TUR the disk that's marked as bad to be sure that it's actually alive */
+ rf_SCSI_AllocTUR(&tur_op);
+ retcode = rf_SCSI_DoTUR(tur_op, bus, targ, lun, badDisk->dev);
+ rf_SCSI_FreeDiskOp(tur_op, 0);
+#endif
+
+ if (retcode) {
+ printf("COPYBACK: target disk failed TUR\n");
+ return;
+ }
+#endif /* !SIMULATE */
+
+ /* get a buffer to hold one SU */
+ RF_Malloc(databuf, rf_RaidAddressToByte(raidPtr, raidPtr->Layout.sectorsPerStripeUnit), (char *));
+
+ /* create a descriptor */
+ RF_Malloc(desc, sizeof(*desc), (RF_CopybackDesc_t *));
+ desc->raidPtr = raidPtr;
+ desc->status = 0;
+ desc->frow = frow;
+ desc->fcol = fcol;
+ desc->spRow = badDisk->spareRow;
+ desc->spCol = badDisk->spareCol;
+ desc->stripeAddr = 0;
+ desc->sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
+ desc->sectPerStripe = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.numDataCol;
+ desc->databuf = databuf;
+#ifndef SIMULATE
+ desc->mcpair = rf_AllocMCPair();
+#endif /* !SIMULATE */
+
+ printf("COPYBACK: Quiescing the array\n");
+ /* quiesce the array, since we don't want to code support for user accesses here */
+ rf_SuspendNewRequestsAndWait(raidPtr);
+
+ /* adjust state of the array and of the disks */
+ RF_LOCK_MUTEX(raidPtr->mutex);
+ raidPtr->Disks[desc->frow][desc->fcol].status = rf_ds_optimal;
+ raidPtr->status[desc->frow] = rf_rs_optimal;
+ rf_copyback_in_progress = 1; /* debug only */
+ RF_UNLOCK_MUTEX(raidPtr->mutex);
+
+ printf("COPYBACK: Beginning\n");
+ RF_GETTIME(desc->starttime);
+ rf_ContinueCopyback(desc);
+}
+
+
+/*
+ * invoked via callback after a copyback I/O has completed to
+ * continue on with the next one
+ */
+void rf_ContinueCopyback(desc)
+ RF_CopybackDesc_t *desc;
+{
+ RF_SectorNum_t testOffs, stripeAddr;
+ RF_Raid_t *raidPtr = desc->raidPtr;
+ RF_RaidAddr_t addr;
+ RF_RowCol_t testRow, testCol;
+ int old_pctg, new_pctg, done;
+ struct timeval t, diff;
+
+ old_pctg = (-1);
+ while (1) {
+ stripeAddr = desc->stripeAddr;
+ if (rf_prReconSched) {
+ old_pctg = 100 * desc->stripeAddr / raidPtr->totalSectors;
+ }
+ desc->stripeAddr += desc->sectPerStripe;
+ if (rf_prReconSched) {
+ new_pctg = 100 * desc->stripeAddr / raidPtr->totalSectors;
+ if (new_pctg != old_pctg) {
+ RF_GETTIME(t);
+ RF_TIMEVAL_DIFF(&desc->starttime, &t, &diff);
+ printf("%d %d.%06d\n",new_pctg, (int)diff.tv_sec, (int)diff.tv_usec);
+ }
+ }
+
+ if (stripeAddr >= raidPtr->totalSectors) {
+ rf_CopybackComplete(desc, 0);
+ return;
+ }
+
+ /* walk through the current stripe, su-by-su */
+ for (done=0, addr = stripeAddr; addr < stripeAddr+desc->sectPerStripe; addr += desc->sectPerSU) {
+
+ /* map the SU, disallowing remap to spare space */
+ (raidPtr->Layout.map->MapSector)(raidPtr, addr, &testRow, &testCol, &testOffs, RF_DONT_REMAP);
+
+ if (testRow == desc->frow && testCol == desc->fcol) {
+ rf_CopybackOne(desc, RF_COPYBACK_DATA, addr, testRow, testCol, testOffs);
+#ifdef SIMULATE
+ return;
+#else /* SIMULATE */
+ done = 1;
+ break;
+#endif /* SIMULATE */
+ }
+ }
+
+ if (!done) {
+ /* we didn't find the failed disk in the data part. check parity. */
+
+ /* map the parity for this stripe, disallowing remap to spare space */
+ (raidPtr->Layout.map->MapParity)(raidPtr, stripeAddr, &testRow, &testCol, &testOffs, RF_DONT_REMAP);
+
+ if (testRow == desc->frow && testCol == desc->fcol) {
+ rf_CopybackOne(desc, RF_COPYBACK_PARITY, stripeAddr, testRow, testCol, testOffs);
+#ifdef SIMULATE
+ return;
+#endif /* SIMULATE */
+ }
+ }
+
+ /* check to see if the last read/write pair failed */
+ if (desc->status) {
+ rf_CopybackComplete(desc, 1);
+ return;
+ }
+
+ /* we didn't find any units to copy back in this stripe. Continue with the next one */
+ }
+}
+
+
+/* copyback one unit */
+static void rf_CopybackOne(desc, typ, addr, testRow, testCol, testOffs)
+ RF_CopybackDesc_t *desc;
+ int typ;
+ RF_RaidAddr_t addr;
+ RF_RowCol_t testRow;
+ RF_RowCol_t testCol;
+ RF_SectorNum_t testOffs;
+{
+ RF_SectorCount_t sectPerSU = desc->sectPerSU;
+ RF_Raid_t *raidPtr = desc->raidPtr;
+ RF_RowCol_t spRow = desc->spRow;
+ RF_RowCol_t spCol = desc->spCol;
+ RF_SectorNum_t spOffs;
+
+ /* find the spare location for this SU */
+ if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
+ if (typ == RF_COPYBACK_DATA)
+ raidPtr->Layout.map->MapSector(raidPtr, addr, &spRow, &spCol, &spOffs, RF_REMAP);
+ else
+ raidPtr->Layout.map->MapParity(raidPtr, addr, &spRow, &spCol, &spOffs, RF_REMAP);
+ } else {
+ spOffs = testOffs;
+ }
+
+ /* create reqs to read the old location & write the new */
+ desc->readreq = rf_CreateDiskQueueData(RF_IO_TYPE_READ, spOffs,
+ sectPerSU, desc->databuf, 0L, 0,
+ (int (*)(void *,int)) rf_CopybackReadDoneProc, desc,
+ NULL, NULL, (void *) raidPtr, RF_DISKQUEUE_DATA_FLAGS_NONE, NULL);
+ desc->writereq = rf_CreateDiskQueueData(RF_IO_TYPE_WRITE, testOffs,
+ sectPerSU, desc->databuf, 0L, 0,
+ (int (*)(void *,int)) rf_CopybackWriteDoneProc, desc,
+ NULL, NULL, (void *) raidPtr, RF_DISKQUEUE_DATA_FLAGS_NONE, NULL);
+ desc->frow = testRow;
+ desc->fcol = testCol;
+
+ /* enqueue the read. the write will go out as part of the callback on the read.
+ * at user-level & in the kernel, wait for the read-write pair to complete.
+ * in the simulator, just return, since everything will happen as callbacks
+ */
+#ifndef SIMULATE
+ RF_LOCK_MUTEX(desc->mcpair->mutex);
+ desc->mcpair->flag = 0;
+#endif /* !SIMULATE */
+
+ rf_DiskIOEnqueue(&raidPtr->Queues[spRow][spCol], desc->readreq, RF_IO_NORMAL_PRIORITY);
+
+#ifndef SIMULATE
+ while (!desc->mcpair->flag) {
+ RF_WAIT_MCPAIR(desc->mcpair);
+ }
+ RF_UNLOCK_MUTEX(desc->mcpair->mutex);
+ rf_FreeDiskQueueData(desc->readreq);
+ rf_FreeDiskQueueData(desc->writereq);
+#endif /* !SIMULATE */
+}
+
+
+/* called at interrupt context when the read has completed. just send out the write */
+static int rf_CopybackReadDoneProc(desc, status)
+ RF_CopybackDesc_t *desc;
+ int status;
+{
+ if (status) { /* invoke the callback with bad status */
+ printf("COPYBACK: copyback read failed. Aborting.\n");
+ (desc->writereq->CompleteFunc)(desc, -100);
+ }
+ else {
+ rf_DiskIOEnqueue(&(desc->raidPtr->Queues[desc->frow][desc->fcol]), desc->writereq, RF_IO_NORMAL_PRIORITY);
+ }
+ return(0);
+}
+
+/* called at interrupt context when the write has completed.
+ * at user level & in the kernel, wake up the copyback thread.
+ * in the simulator, invoke the next copyback directly.
+ * can't free diskqueuedata structs in the kernel b/c we're at interrupt context.
+ */
+static int rf_CopybackWriteDoneProc(desc, status)
+ RF_CopybackDesc_t *desc;
+ int status;
+{
+ if (status && status != -100) {
+ printf("COPYBACK: copyback write failed. Aborting.\n");
+ }
+
+#ifdef SIMULATE
+ rf_FreeDiskQueueData(desc->readreq);
+ rf_FreeDiskQueueData(desc->writereq);
+ if (!status)
+ rf_ContinueCopyback(desc);
+ else
+ rf_CopybackComplete(desc, 1);
+#else /* SIMULATE */
+ desc->status = status;
+ rf_MCPairWakeupFunc(desc->mcpair);
+#endif /* SIMULATE */
+ return(0);
+}
+
+/* invoked when the copyback has completed */
+static void rf_CopybackComplete(desc, status)
+ RF_CopybackDesc_t *desc;
+ int status;
+{
+ RF_Raid_t *raidPtr = desc->raidPtr;
+ struct timeval t, diff;
+
+ if (!status) {
+ RF_LOCK_MUTEX(raidPtr->mutex);
+ if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
+ RF_ASSERT(raidPtr->Layout.map->parityConfig == 'D');
+ rf_FreeSpareTable(raidPtr);
+ } else {
+ raidPtr->Disks[desc->spRow][desc->spCol].status = rf_ds_spare;
+ }
+ RF_UNLOCK_MUTEX(raidPtr->mutex);
+
+ RF_GETTIME(t);
+ RF_TIMEVAL_DIFF(&desc->starttime, &t, &diff);
+ printf("Copyback time was %d.%06d seconds\n",
+ (int)diff.tv_sec, (int)diff.tv_usec);
+ } else printf("COPYBACK: Failure.\n");
+
+ RF_Free(desc->databuf, rf_RaidAddressToByte(raidPtr, desc->sectPerSU));
+#ifndef SIMULATE
+ rf_FreeMCPair(desc->mcpair);
+#endif /* !SIMULATE */
+ RF_Free(desc, sizeof(*desc));
+
+ rf_copyback_in_progress = 0;
+ rf_ResumeNewRequests(raidPtr);
+}
diff --git a/sys/dev/raidframe/rf_copyback.h b/sys/dev/raidframe/rf_copyback.h
new file mode 100644
index 00000000000..59ef0630447
--- /dev/null
+++ b/sys/dev/raidframe/rf_copyback.h
@@ -0,0 +1,88 @@
+/* $OpenBSD: rf_copyback.h,v 1.1 1999/01/11 14:29:03 niklas Exp $ */
+/* $NetBSD: rf_copyback.h,v 1.1 1998/11/13 04:20:27 oster Exp $ */
+/*
+ * rf_copyback.h
+ */
+/*
+ * Copyright (c) 1996 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+/*
+ * :
+ * Log: rf_copyback.h,v
+ * Revision 1.5 1996/07/11 19:08:00 jimz
+ * generalize reconstruction mechanism
+ * allow raid1 reconstructs via copyback (done with array
+ * quiesced, not online, therefore not disk-directed)
+ *
+ * Revision 1.4 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.3 1996/05/24 01:59:45 jimz
+ * another checkpoint in code cleanup for release
+ * time to sync kernel tree
+ *
+ * Revision 1.2 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.1 1996/05/18 19:55:02 jimz
+ * Initial revision
+ *
+ */
+
+#ifndef _RF__RF_COPYBACK_H_
+#define _RF__RF_COPYBACK_H_
+
+#include "rf_types.h"
+
+typedef struct RF_CopybackDesc_s {
+ RF_Raid_t *raidPtr;
+ RF_RowCol_t frow;
+ RF_RowCol_t fcol;
+ RF_RowCol_t spRow;
+ RF_RowCol_t spCol;
+ int status;
+ RF_StripeNum_t stripeAddr;
+ RF_SectorCount_t sectPerSU;
+ RF_SectorCount_t sectPerStripe;
+ char *databuf;
+ RF_DiskQueueData_t *readreq;
+ RF_DiskQueueData_t *writereq;
+ struct timeval starttime;
+#ifndef SIMULATE
+ RF_MCPair_t *mcpair;
+#endif /* !SIMULATE */
+} RF_CopybackDesc_t;
+
+extern int rf_copyback_in_progress;
+
+int rf_ConfigureCopyback(RF_ShutdownList_t **listp);
+void rf_CopybackReconstructedData(RF_Raid_t *raidPtr);
+void rf_ContinueCopyback(RF_CopybackDesc_t *desc);
+
+#endif /* !_RF__RF_COPYBACK_H_ */
diff --git a/sys/dev/raidframe/rf_cpuutil.c b/sys/dev/raidframe/rf_cpuutil.c
new file mode 100644
index 00000000000..1816740bfc3
--- /dev/null
+++ b/sys/dev/raidframe/rf_cpuutil.c
@@ -0,0 +1,195 @@
+/* $OpenBSD: rf_cpuutil.c,v 1.1 1999/01/11 14:29:03 niklas Exp $ */
+/* $NetBSD: rf_cpuutil.c,v 1.1 1998/11/13 04:20:27 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Authors: Mark Holland, Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+/*
+ * rf_cpuutil.c
+ *
+ * track cpu utilization
+ */
+
+#ifdef _KERNEL
+#define KERNEL
+#endif
+
+#include "rf_cpuutil.h"
+
+#ifndef KERNEL
+#include <errno.h>
+#endif /* !KERNEL */
+#include "rf_types.h"
+#include "rf_general.h"
+#include "rf_shutdown.h"
+#include "rf_sys.h"
+#ifdef __osf__
+#include <sys/table.h>
+#endif /* __osf__ */
+#ifdef AIX
+#include <nlist.h>
+#include <sys/sysinfo.h>
+#endif /* AIX */
+#ifdef KERNEL
+#if !defined(__NetBSD__) && !defined(__OpenBSD__)
+#include <sys/dk.h>
+#endif /* !__NetBSD__ && !__OpenBSD__ */
+#else /* KERNEL */
+extern int table(int id, int index, void *addr, int nel, u_int lel);
+#endif /* KERNEL */
+
+#ifdef __osf__
+static struct tbl_sysinfo start, stop;
+#endif /* __osf__ */
+
+#ifdef AIX
+static int kmem_fd;
+static off_t sysinfo_offset;
+static struct sysinfo sysinfo_start, sysinfo_stop;
+static struct nlist namelist[] = {
+ {{"sysinfo"}},
+ {{""}},
+};
+#endif /* AIX */
+
+#ifdef AIX
+static void rf_ShutdownCpuMonitor(ignored)
+ void *ignored;
+{
+ close(kmem_fd);
+}
+#endif /* AIX */
+
+int rf_ConfigureCpuMonitor(listp)
+ RF_ShutdownList_t **listp;
+{
+#ifdef AIX
+ int rc;
+
+ rc = knlist(namelist, 1, sizeof(struct nlist));
+ if (rc) {
+ RF_ERRORMSG("Could not knlist() to config CPU monitor\n");
+ return(errno);
+ }
+ if (namelist[0].n_value == 0) {
+ RF_ERRORMSG("Got bogus results from knlist() for CPU monitor\n");
+ return(EIO);
+ }
+ sysinfo_offset = namelist[0].n_value;
+ kmem_fd = open("/dev/kmem", O_RDONLY);
+ if (kmem_fd < 0) {
+ perror("/dev/kmem");
+ return(errno);
+ }
+ rc = rf_ShutdownCreate(listp, rf_ShutdownCpuMonitor, NULL);
+ if (rc) {
+ RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ rf_ShutdownCpuMonitor(NULL);
+ return(rc);
+ }
+#endif /* AIX */
+ return(0);
+}
+
+void rf_start_cpu_monitor()
+{
+#ifdef __osf__
+#ifndef KERNEL
+ if (table(TBL_SYSINFO, 0, &start, 1, sizeof(start)) != 1) {
+ printf("Unable to get sysinfo for cpu utilization monitor\n");
+ perror("start_cpu_monitor");
+ }
+#else /* !KERNEL */
+ /* start.si_user = cp_time[CP_USER];
+ start.si_nice = cp_time[CP_NICE];
+ start.si_sys = cp_time[CP_SYS];
+ start.si_idle = cp_time[CP_IDLE];
+ start.wait = cp_time[CP_WAIT]; */
+#endif /* !KERNEL */
+#endif /* __osf__ */
+#ifdef AIX
+ off_t off;
+ int rc;
+
+ off = lseek(kmem_fd, sysinfo_offset, SEEK_SET);
+ RF_ASSERT(off == sysinfo_offset);
+ rc = read(kmem_fd, &sysinfo_start, sizeof(struct sysinfo));
+ if (rc != sizeof(struct sysinfo)) {
+ RF_ERRORMSG2("Starting CPU monitor: rc=%d != %d\n", rc,
+ (int)sizeof(struct sysinfo));
+ }
+#endif /* AIX */
+}
+
+void rf_stop_cpu_monitor()
+{
+#ifdef __osf__
+#ifndef KERNEL
+ if (table(TBL_SYSINFO, 0, &stop, 1, sizeof(stop)) != 1) {
+ printf("Unable to get sysinfo for cpu utilization monitor\n");
+ perror("stop_cpu_monitor");
+ }
+#else /* !KERNEL */
+ /* stop.si_user = cp_time[CP_USER];
+ stop.si_nice = cp_time[CP_NICE];
+ stop.si_sys = cp_time[CP_SYS];
+ stop.si_idle = cp_time[CP_IDLE];
+ stop.wait = cp_time[CP_WAIT]; */
+#endif /* !KERNEL */
+#endif /* __osf__ */
+#ifdef AIX
+ off_t off;
+ int rc;
+
+ off = lseek(kmem_fd, sysinfo_offset, SEEK_SET);
+ RF_ASSERT(off == sysinfo_offset);
+ rc = read(kmem_fd, &sysinfo_stop, sizeof(struct sysinfo));
+ if (rc != sizeof(struct sysinfo)) {
+ RF_ERRORMSG2("Stopping CPU monitor: rc=%d != %d\n", rc,
+ (int)sizeof(struct sysinfo));
+ }
+#endif /* AIX */
+}
+
+void rf_print_cpu_util(s)
+ char *s;
+{
+#ifdef __osf__
+ long totalticks, idleticks;
+
+ idleticks = stop.si_idle - start.si_idle + stop.wait - start.wait;
+ totalticks = stop.si_user - start.si_user + stop.si_nice - start.si_nice +
+ stop.si_sys - start.si_sys + idleticks;
+ printf("CPU utilization during %s was %ld %%\n", s, 100 - 100*idleticks/totalticks);
+#endif /* __osf__ */
+#ifdef AIX
+ long idle;
+
+ /* XXX compute a percentage here */
+ idle = (long)(sysinfo_stop.cpu[CPU_IDLE] - sysinfo_start.cpu[CPU_IDLE]);
+ printf("%ld idle ticks during %s.\n", idle, s);
+#endif /* AIX */
+}
diff --git a/sys/dev/raidframe/rf_cpuutil.h b/sys/dev/raidframe/rf_cpuutil.h
new file mode 100644
index 00000000000..72603d9aae6
--- /dev/null
+++ b/sys/dev/raidframe/rf_cpuutil.h
@@ -0,0 +1,57 @@
+/* $OpenBSD: rf_cpuutil.h,v 1.1 1999/01/11 14:29:03 niklas Exp $ */
+/* $NetBSD: rf_cpuutil.h,v 1.1 1998/11/13 04:20:27 oster Exp $ */
+/*
+ * rf_cpuutil.h
+ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland, Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+/*
+ * :
+ * Log: rf_cpuutil.h,v
+ * Revision 1.3 1996/07/18 22:57:14 jimz
+ * port simulator to AIX
+ *
+ * Revision 1.2 1996/05/24 01:59:45 jimz
+ * another checkpoint in code cleanup for release
+ * time to sync kernel tree
+ *
+ * Revision 1.1 1996/05/18 19:55:29 jimz
+ * Initial revision
+ *
+ */
+
+#ifndef _RF__RF_CPUUTIL_H_
+#define _RF__RF_CPUUTIL_H_
+
+#include "rf_types.h"
+
+int rf_ConfigureCpuMonitor(RF_ShutdownList_t **listp);
+void rf_start_cpu_monitor(void);
+void rf_stop_cpu_monitor(void);
+void rf_print_cpu_util(char *s);
+
+#endif /* !_RF__RF_CPUUTIL_H_ */
diff --git a/sys/dev/raidframe/rf_cvscan.c b/sys/dev/raidframe/rf_cvscan.c
new file mode 100644
index 00000000000..73a6e64d001
--- /dev/null
+++ b/sys/dev/raidframe/rf_cvscan.c
@@ -0,0 +1,450 @@
+/* $OpenBSD: rf_cvscan.c,v 1.1 1999/01/11 14:29:05 niklas Exp $ */
+/* $NetBSD: rf_cvscan.c,v 1.2 1998/11/18 15:13:51 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*******************************************************************************
+ *
+ * cvscan.c -- prioritized cvscan disk queueing code.
+ *
+ * Nov 9, 1994, adapted from raidSim version (MCH)
+ *
+ ******************************************************************************/
+
+/*
+ * :
+ * Log: rf_cvscan.c,v
+ * Revision 1.6 1996/07/27 23:36:08 jimz
+ * Solaris port of simulator
+ *
+ * Revision 1.5 1996/07/15 17:22:18 jimz
+ * nit-pick code cleanup
+ * resolve stdlib problems on DEC OSF
+ *
+ * Revision 1.4 1996/06/09 02:36:46 jimz
+ * lots of little crufty cleanup- fixup whitespace
+ * issues, comment #ifdefs, improve typing in some
+ * places (esp size-related)
+ *
+ * Revision 1.3 1996/06/07 22:26:27 jimz
+ * type-ify which_ru (RF_ReconUnitNum_t)
+ *
+ * Revision 1.2 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.1 1996/06/05 19:17:40 jimz
+ * Initial revision
+ *
+ */
+
+#include "rf_types.h"
+#include "rf_alloclist.h"
+#include "rf_stripelocks.h"
+#include "rf_layout.h"
+#include "rf_diskqueue.h"
+#include "rf_cvscan.h"
+#include "rf_debugMem.h"
+#include "rf_general.h"
+#include "rf_sys.h"
+
+#define DO_CHECK_STATE(_hdr_) CheckCvscanState((_hdr_), __FILE__, __LINE__)
+
+#define pri_ok(p) ( ((p) == RF_IO_NORMAL_PRIORITY) || ((p) == RF_IO_LOW_PRIORITY))
+
+static void CheckCvscanState(RF_CvscanHeader_t *hdr, char *file, int line)
+{
+ long i, key;
+ RF_DiskQueueData_t *tmp;
+
+ if( hdr->left != (RF_DiskQueueData_t *) NULL )
+ RF_ASSERT( hdr->left->sectorOffset < hdr->cur_block );
+ for( key=hdr->cur_block, i=0, tmp=hdr->left;
+ tmp != (RF_DiskQueueData_t *) NULL;
+ key=tmp->sectorOffset, i++, tmp=tmp->next )
+ RF_ASSERT( tmp->sectorOffset <= key
+ && tmp->priority == hdr->nxt_priority && pri_ok(tmp->priority) );
+ RF_ASSERT( i == hdr->left_cnt );
+
+ for( key=hdr->cur_block, i=0, tmp=hdr->right;
+ tmp != (RF_DiskQueueData_t *) NULL;
+ key=tmp->sectorOffset, i++, tmp=tmp->next )
+ {
+ RF_ASSERT(key <= tmp->sectorOffset);
+ RF_ASSERT(tmp->priority == hdr->nxt_priority);
+ RF_ASSERT(pri_ok(tmp->priority));
+ }
+ RF_ASSERT( i == hdr->right_cnt );
+
+ for( key=hdr->nxt_priority-1, tmp=hdr->burner;
+ tmp != (RF_DiskQueueData_t *) NULL;
+ key=tmp->priority, tmp=tmp->next )
+ {
+ RF_ASSERT(tmp);
+ RF_ASSERT(hdr);
+ RF_ASSERT(pri_ok(tmp->priority));
+ RF_ASSERT(key >= tmp->priority);
+ RF_ASSERT(tmp->priority < hdr->nxt_priority);
+ }
+}
+
+
+
+static void PriorityInsert(RF_DiskQueueData_t **list_ptr, RF_DiskQueueData_t *req )
+{
+ /*
+ ** insert the block pointed to by req into the list whose first
+ ** entry is pointed to by the pointer that list_ptr points to,
+ ** i.e., list_ptr is a grandparent of the first entry
+ */
+
+ for( ; (*list_ptr)!=(RF_DiskQueueData_t *)NULL &&
+ (*list_ptr)->priority > req->priority;
+ list_ptr = &((*list_ptr)->next) ) {}
+ req->next = (*list_ptr);
+ (*list_ptr) = req;
+}
+
+
+
+static void ReqInsert(RF_DiskQueueData_t **list_ptr, RF_DiskQueueData_t *req, RF_CvscanArmDir_t order)
+{
+ /*
+ ** insert the block pointed to by req into the list whose first
+ ** entry is pointed to by the pointer that list_ptr points to,
+ ** i.e., list_ptr is a grandparent of the first entry; the list
+ ** stays ordered by sector offset in the direction given by "order"
+ */
+
+ for( ; (*list_ptr)!=(RF_DiskQueueData_t *)NULL &&
+
+ ( (order==rf_cvscan_RIGHT && (*list_ptr)->sectorOffset <= req->sectorOffset)
+ || (order==rf_cvscan_LEFT && (*list_ptr)->sectorOffset > req->sectorOffset) );
+ list_ptr = &((*list_ptr)->next) ) {}
+ req->next = (*list_ptr);
+ (*list_ptr) = req;
+}
+
+
+
+static RF_DiskQueueData_t *ReqDequeue(RF_DiskQueueData_t **list_ptr)
+{
+ RF_DiskQueueData_t * ret = (*list_ptr);
+ if( (*list_ptr) != (RF_DiskQueueData_t *) NULL ) {
+ (*list_ptr) = (*list_ptr)->next;
+ }
+ return( ret );
+}
+
+
+
+static void ReBalance(RF_CvscanHeader_t *hdr)
+{
+ /* DO_CHECK_STATE(hdr); */
+ while( hdr->right != (RF_DiskQueueData_t *) NULL
+ && hdr->right->sectorOffset < hdr->cur_block ) {
+ hdr->right_cnt--;
+ hdr->left_cnt++;
+ ReqInsert( &hdr->left, ReqDequeue( &hdr->right ), rf_cvscan_LEFT );
+ }
+ /* DO_CHECK_STATE(hdr); */
+}
+
+
+
+static void Transfer(RF_DiskQueueData_t **to_list_ptr, RF_DiskQueueData_t **from_list_ptr )
+{
+ RF_DiskQueueData_t *gp;
+ for( gp=(*from_list_ptr); gp != (RF_DiskQueueData_t *) NULL; ) {
+ RF_DiskQueueData_t *p = gp->next;
+ PriorityInsert( to_list_ptr, gp );
+ gp = p;
+ }
+ (*from_list_ptr) = (RF_DiskQueueData_t *) NULL;
+}
+
+
+
+static void RealEnqueue(RF_CvscanHeader_t *hdr, RF_DiskQueueData_t *req)
+{
+ RF_ASSERT(req->priority == RF_IO_NORMAL_PRIORITY || req->priority == RF_IO_LOW_PRIORITY);
+
+ DO_CHECK_STATE(hdr);
+ if( hdr->left_cnt == 0 && hdr->right_cnt == 0 ) {
+ hdr->nxt_priority = req->priority;
+ }
+ if( req->priority > hdr->nxt_priority ) {
+ /*
+ ** dump all other outstanding requests on the back burner
+ */
+ Transfer( &hdr->burner, &hdr->left );
+ Transfer( &hdr->burner, &hdr->right );
+ hdr->left_cnt = 0;
+ hdr->right_cnt = 0;
+ hdr->nxt_priority = req->priority;
+ }
+ if( req->priority < hdr->nxt_priority ) {
+ /*
+ ** yet another low priority task!
+ */
+ PriorityInsert( &hdr->burner, req );
+ } else {
+ if( req->sectorOffset < hdr->cur_block ) {
+ /* this request is to the left of the current arms */
+ ReqInsert( &hdr->left, req, rf_cvscan_LEFT );
+ hdr->left_cnt++;
+ } else {
+ /* this request is to the right of the current arms */
+ ReqInsert( &hdr->right, req, rf_cvscan_RIGHT );
+ hdr->right_cnt++;
+ }
+ }
+ DO_CHECK_STATE(hdr);
+}
+
+
+
+void rf_CvscanEnqueue(void *q_in, RF_DiskQueueData_t *elem, int priority)
+{
+ RF_CvscanHeader_t *hdr = (RF_CvscanHeader_t *) q_in;
+ RealEnqueue( hdr, elem /*req*/ );
+}
+
+
+
+RF_DiskQueueData_t *rf_CvscanDequeue(void *q_in)
+{
+ RF_CvscanHeader_t *hdr = (RF_CvscanHeader_t *) q_in;
+ long range, i, sum_dist_left, sum_dist_right;
+ RF_DiskQueueData_t *ret;
+ RF_DiskQueueData_t *tmp;
+
+ DO_CHECK_STATE(hdr);
+
+ if( hdr->left_cnt == 0 && hdr->right_cnt == 0 ) return( (RF_DiskQueueData_t *) NULL );
+
+ range = RF_MIN( hdr->range_for_avg, RF_MIN(hdr->left_cnt,hdr->right_cnt));
+ for( i=0, tmp=hdr->left, sum_dist_left=
+ ((hdr->direction==rf_cvscan_RIGHT)?range*hdr->change_penalty:0);
+ tmp != (RF_DiskQueueData_t *) NULL && i < range;
+ tmp = tmp->next, i++ ) {
+ sum_dist_left += hdr->cur_block - tmp->sectorOffset;
+ }
+ for( i=0, tmp=hdr->right, sum_dist_right=
+ ((hdr->direction==rf_cvscan_LEFT)?range*hdr->change_penalty:0);
+ tmp != (RF_DiskQueueData_t *) NULL && i < range;
+ tmp = tmp->next, i++ ) {
+ sum_dist_right += tmp->sectorOffset - hdr->cur_block;
+ }
+
+ if( hdr->right_cnt == 0 || sum_dist_left < sum_dist_right ) {
+ hdr->direction = rf_cvscan_LEFT;
+ hdr->cur_block = hdr->left->sectorOffset + hdr->left->numSector;
+ hdr->left_cnt = RF_MAX(hdr->left_cnt-1,0);
+ tmp = hdr->left;
+ ret = (ReqDequeue(&hdr->left))/*->parent*/;
+ } else {
+ hdr->direction = rf_cvscan_RIGHT;
+ hdr->cur_block = hdr->right->sectorOffset + hdr->right->numSector;
+ hdr->right_cnt = RF_MAX(hdr->right_cnt-1,0);
+ tmp = hdr->right;
+ ret = (ReqDequeue(&hdr->right))/*->parent*/;
+ }
+ ReBalance( hdr );
+
+ if( hdr->left_cnt == 0 && hdr->right_cnt == 0
+ && hdr->burner != (RF_DiskQueueData_t *) NULL ) {
+ /*
+ ** restore low priority requests for next dequeue
+ */
+ RF_DiskQueueData_t *burner = hdr->burner;
+ hdr->nxt_priority = burner->priority;
+ while( burner != (RF_DiskQueueData_t *) NULL
+ && burner->priority == hdr->nxt_priority ) {
+ RF_DiskQueueData_t *next = burner->next;
+ RealEnqueue( hdr, burner );
+ burner = next;
+ }
+ hdr->burner = burner;
+ }
+ DO_CHECK_STATE(hdr);
+ return( ret );
+}
+
+
+
+RF_DiskQueueData_t *rf_CvscanPeek(void *q_in)
+{
+ RF_CvscanHeader_t *hdr = (RF_CvscanHeader_t *) q_in;
+ long range, i, sum_dist_left, sum_dist_right;
+ RF_DiskQueueData_t *tmp, *headElement;
+
+ DO_CHECK_STATE(hdr);
+
+ if( hdr->left_cnt == 0 && hdr->right_cnt == 0 )
+ headElement = NULL;
+ else {
+ range = RF_MIN( hdr->range_for_avg, RF_MIN(hdr->left_cnt,hdr->right_cnt));
+ for( i=0, tmp=hdr->left, sum_dist_left=
+ ((hdr->direction==rf_cvscan_RIGHT)?range*hdr->change_penalty:0);
+ tmp != (RF_DiskQueueData_t *) NULL && i < range;
+ tmp = tmp->next, i++ ) {
+ sum_dist_left += hdr->cur_block - tmp->sectorOffset;
+ }
+ for( i=0, tmp=hdr->right, sum_dist_right=
+ ((hdr->direction==rf_cvscan_LEFT)?range*hdr->change_penalty:0);
+ tmp != (RF_DiskQueueData_t *) NULL && i < range;
+ tmp = tmp->next, i++ ) {
+ sum_dist_right += tmp->sectorOffset - hdr->cur_block;
+ }
+
+ if( hdr->right_cnt == 0 || sum_dist_left < sum_dist_right )
+ headElement = hdr->left;
+ else
+ headElement = hdr->right;
+ }
+ return(headElement);
+}
+
+
+
+/*
+** CVSCAN( 1, 0 ) is Shortest Seek Time First (SSTF)
+** lowest average response time
+** CVSCAN( 1, infinity ) is SCAN
+** lowest response time standard deviation
+*/
+
+
+int rf_CvscanConfigure()
+{
+ return(0);
+}
+
+
+
+void *rf_CvscanCreate(RF_SectorCount_t sectPerDisk,
+ RF_AllocListElem_t *clList,
+ RF_ShutdownList_t **listp)
+{
+ RF_CvscanHeader_t *hdr;
+ long range = 2; /* Currently no mechanism to change these */
+ long penalty = sectPerDisk / 5;
+
+ RF_MallocAndAdd(hdr, sizeof(RF_CvscanHeader_t), (RF_CvscanHeader_t *), clList);
+ bzero((char *)hdr, sizeof(RF_CvscanHeader_t));
+ hdr->range_for_avg = RF_MAX( range, 1 );
+ hdr->change_penalty = RF_MAX( penalty, 0 );
+ hdr->direction = rf_cvscan_RIGHT;
+ hdr->cur_block = 0;
+ hdr->left_cnt = hdr->right_cnt = 0;
+ hdr->left = hdr->right = (RF_DiskQueueData_t *) NULL;
+ hdr->burner = (RF_DiskQueueData_t *) NULL;
+ DO_CHECK_STATE(hdr);
+
+ return( (void *) hdr );
+}
+
+
+#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL)
+/* PrintCvscanQueue is not used, so we ignore it... */
+#else
+static void PrintCvscanQueue(RF_CvscanHeader_t *hdr)
+{
+ RF_DiskQueueData_t *tmp;
+
+ printf( "CVSCAN(%d,%d) at %d going %s\n",
+ (int)hdr->range_for_avg,
+ (int)hdr->change_penalty,
+ (int)hdr->cur_block,
+ (hdr->direction==rf_cvscan_LEFT)?"LEFT":"RIGHT" );
+ printf( "\tLeft(%d): ", hdr->left_cnt );
+ for( tmp = hdr->left; tmp != (RF_DiskQueueData_t *) NULL; tmp = tmp->next)
+ printf( "(%d,%ld,%d) ",
+ (int) tmp->sectorOffset,
+ (long) (tmp->sectorOffset + tmp->numSector),
+ tmp->priority );
+ printf( "\n" );
+ printf( "\tRight(%d): ", hdr->right_cnt );
+ for( tmp = hdr->right; tmp != (RF_DiskQueueData_t *) NULL; tmp = tmp->next)
+ printf( "(%d,%ld,%d) ",
+ (int) tmp->sectorOffset,
+ (long) (tmp->sectorOffset + tmp->numSector),
+ tmp->priority );
+ printf( "\n" );
+ printf( "\tBurner: " );
+ for( tmp = hdr->burner; tmp != (RF_DiskQueueData_t *) NULL; tmp = tmp->next)
+ printf( "(%d,%ld,%d) ",
+ (int) tmp->sectorOffset,
+ (long) (tmp->sectorOffset + tmp->numSector),
+ tmp->priority );
+ printf( "\n" );
+}
+#endif
+
+
+/* promotes reconstruction accesses for the given stripeID to normal priority.
+ * returns 1 if an access was found and zero otherwise. Normally, we should
+ * only have one or zero entries in the burner queue, so execution time should
+ * be short.
+ */
+int rf_CvscanPromote(void *q_in, RF_StripeNum_t parityStripeID, RF_ReconUnitNum_t which_ru)
+{
+ RF_CvscanHeader_t *hdr = (RF_CvscanHeader_t *) q_in;
+ RF_DiskQueueData_t *trailer = NULL, *tmp = hdr->burner, *tlist = NULL;
+ int retval=0;
+
+ DO_CHECK_STATE(hdr);
+ while (tmp) { /* handle entries at the front of the list */
+ if (tmp->parityStripeID == parityStripeID && tmp->which_ru == which_ru) {
+ hdr->burner = tmp->next;
+ tmp->priority = RF_IO_NORMAL_PRIORITY;
+ tmp->next = tlist; tlist=tmp;
+ tmp = hdr->burner;
+ } else break;
+ }
+ if (tmp) {trailer=tmp; tmp=tmp->next;}
+ while (tmp) { /* handle entries on the rest of the list */
+ if (tmp->parityStripeID == parityStripeID && tmp->which_ru == which_ru) {
+ trailer->next = tmp->next;
+ tmp->priority = RF_IO_NORMAL_PRIORITY;
+ tmp->next = tlist; tlist=tmp; /* insert on a temp queue */
+ tmp = trailer->next;
+ } else {
+ trailer=tmp; tmp=tmp->next;
+ }
+ }
+ while (tlist) {
+ retval++;
+ tmp = tlist->next;
+ RealEnqueue(hdr, tlist);
+ tlist = tmp;
+ }
+ RF_ASSERT(retval==0 || retval==1);
+ DO_CHECK_STATE((RF_CvscanHeader_t *)q_in);
+ return(retval);
+}
+
diff --git a/sys/dev/raidframe/rf_cvscan.h b/sys/dev/raidframe/rf_cvscan.h
new file mode 100644
index 00000000000..4347fb06a63
--- /dev/null
+++ b/sys/dev/raidframe/rf_cvscan.h
@@ -0,0 +1,97 @@
+/* $OpenBSD: rf_cvscan.h,v 1.1 1999/01/11 14:29:06 niklas Exp $ */
+/* $NetBSD: rf_cvscan.h,v 1.1 1998/11/13 04:20:27 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*
+** Disk scheduling by CVSCAN( N, r )
+**
+** Given a set of requests, partition them into one set on each
+** side of the current arm position. The trick is to pick which
+** side you are going to service next; once a side is picked you will
+** service the closest request.
+** Let there be n1 requests on one side and n2 requests on the other
+** side. If one of n1 or n2 is zero, select the other side.
+** If both n1 and n2 are nonzero, select a "range" for examination
+** that is N' = min( n1, n2, N ). Average the distance from the
+** current position to the nearest N' requests on each side giving
+** d1 and d2.
+** Suppose the last decision was to move toward set 2, then the
+** current direction is toward set 2, and you will only switch to set
+** 1 if d1+R < d2 where R is r*(total number of cylinders), r in [0,1].
+**
+** I extend this by applying it only to the set of requests that all
+** share the same, highest priority level.
+*/
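+
+/*
+** The sketch below restates the side-selection rule above in code.  It is
+** illustrative only and is compiled out: it is not part of the RAIDframe
+** API, it collapses the request lists to a minimal hypothetical type, and
+** all names in it (cv_req_t, cvscan_go_left, ...) are invented for this
+** example.  The real implementation is rf_CvscanDequeue() in rf_cvscan.c.
+*/
+#if 0
+typedef struct cv_req {
+	long		 offset;	/* sector offset of the request */
+	struct cv_req	*next;		/* nearest request first within its side */
+} cv_req_t;
+
+/* returns nonzero if the arm should service the left side next */
+static int
+cvscan_go_left(cv_req_t *left, long n1, cv_req_t *right, long n2,
+    long cur_block, long N, long R, int going_right)
+{
+	long nprime, i, d1, d2;
+	cv_req_t *tmp;
+
+	if (n1 == 0)
+		return (0);			/* nothing on the left */
+	if (n2 == 0)
+		return (1);			/* nothing on the right */
+
+	nprime = N;				/* N' = min(n1, n2, N) */
+	if (n1 < nprime) nprime = n1;
+	if (n2 < nprime) nprime = n2;
+
+	/* sum the distances to the nearest N' requests on each side; the
+	 * side we are not currently heading toward is charged the change
+	 * penalty R for each of the N' requests, as in rf_CvscanDequeue() */
+	d1 = going_right ? nprime * R : 0;
+	for (i = 0, tmp = left; tmp != NULL && i < nprime; tmp = tmp->next, i++)
+		d1 += cur_block - tmp->offset;
+	d2 = going_right ? 0 : nprime * R;
+	for (i = 0, tmp = right; tmp != NULL && i < nprime; tmp = tmp->next, i++)
+		d2 += tmp->offset - cur_block;
+
+	/* with N = 1 and R = 0 this degenerates to shortest-seek-first */
+	return (d1 < d2);
+}
+#endif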
+
+/* :
+ * Log: rf_cvscan.h,v
+ * Revision 1.3 1996/06/07 22:26:27 jimz
+ * type-ify which_ru (RF_ReconUnitNum_t)
+ *
+ * Revision 1.2 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.1 1996/06/05 19:17:40 jimz
+ * Initial revision
+ *
+ */
+
+#ifndef _RF__RF_CVSCAN_H_
+#define _RF__RF_CVSCAN_H_
+
+#include "rf_diskqueue.h"
+
+typedef enum RF_CvscanArmDir_e {
+ rf_cvscan_LEFT,
+ rf_cvscan_RIGHT
+} RF_CvscanArmDir_t;
+
+typedef struct RF_CvscanHeader_s {
+ long range_for_avg; /* CVSCAN param N */
+ long change_penalty; /* CVSCAN param R */
+ RF_CvscanArmDir_t direction;
+ RF_SectorNum_t cur_block;
+ int nxt_priority;
+ RF_DiskQueueData_t *left;
+ int left_cnt;
+ RF_DiskQueueData_t *right;
+ int right_cnt;
+ RF_DiskQueueData_t *burner;
+} RF_CvscanHeader_t;
+
+int rf_CvscanConfigure(void);
+void *rf_CvscanCreate(RF_SectorCount_t sect_per_disk,
+ RF_AllocListElem_t *cl_list, RF_ShutdownList_t **listp);
+void rf_CvscanEnqueue(void *qptr, RF_DiskQueueData_t *req, int priority);
+RF_DiskQueueData_t *rf_CvscanDequeue(void *qptr);
+RF_DiskQueueData_t *rf_CvscanPeek(void *qptr);
+int rf_CvscanPromote(void *qptr, RF_StripeNum_t parityStripeID,
+ RF_ReconUnitNum_t which_ru);
+
+#endif /* !_RF__RF_CVSCAN_H_ */
diff --git a/sys/dev/raidframe/rf_dag.h b/sys/dev/raidframe/rf_dag.h
new file mode 100644
index 00000000000..f13fc3f76c3
--- /dev/null
+++ b/sys/dev/raidframe/rf_dag.h
@@ -0,0 +1,320 @@
+/* $OpenBSD: rf_dag.h,v 1.1 1999/01/11 14:29:06 niklas Exp $ */
+/* $NetBSD: rf_dag.h,v 1.1 1998/11/13 04:20:27 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: William V. Courtright II, Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/****************************************************************************
+ * *
+ * dag.h -- header file for DAG-related data structures *
+ * *
+ ****************************************************************************/
+/*
+ *
+ * :
+ * Log: rf_dag.h,v
+ * Revision 1.35 1996/11/05 18:38:37 jimz
+ * add patch from galvarez@cs.ucsd.edu (Guillermo Alvarez)
+ * to fix dag_params memory-sizing problem (should be an array
+ * of the type, not an array of pointers to the type)
+ *
+ * Revision 1.34 1996/07/28 20:31:39 jimz
+ * i386netbsd port
+ * true/false fixup
+ *
+ * Revision 1.33 1996/07/22 19:52:16 jimz
+ * switched node params to RF_DagParam_t, a union of
+ * a 64-bit int and a void *, for better portability
+ * attempted hpux port, but failed partway through for
+ * lack of a single C compiler capable of compiling all
+ * source files
+ *
+ * Revision 1.32 1996/06/10 22:22:13 wvcii
+ * added two node status types for use in backward error
+ * recovery experiments.
+ *
+ * Revision 1.31 1996/06/09 02:36:46 jimz
+ * lots of little crufty cleanup- fixup whitespace
+ * issues, comment #ifdefs, improve typing in some
+ * places (esp size-related)
+ *
+ * Revision 1.30 1996/06/07 22:49:18 jimz
+ * fix up raidPtr typing
+ *
+ * Revision 1.29 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.28 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.27 1996/05/24 04:28:55 jimz
+ * release cleanup ckpt
+ *
+ * Revision 1.26 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.25 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.24 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.23 1996/05/16 23:05:20 jimz
+ * Added dag_ptrs field, RF_DAG_PTRCACHESIZE
+ *
+ * The dag_ptrs field of the node is basically some scribble
+ * space to be used here. We could get rid of it, and always
+ * allocate the range of pointers, but that's expensive. So,
+ * we pick a "common case" size for the pointer cache. Hopefully,
+ * we'll find that:
+ * (1) Generally, nptrs doesn't exceed RF_DAG_PTRCACHESIZE by
+ * only a little bit (least efficient case)
+ * (2) Generally, ntprs isn't a lot less than RF_DAG_PTRCACHESIZE
+ * (wasted memory)
+ *
+ * Revision 1.22 1996/05/08 21:01:24 jimz
+ * fixed up enum type names that were conflicting with other
+ * enums and function names (ie, "panic")
+ * future naming trends will be towards RF_ and rf_ for
+ * everything raidframe-related
+ *
+ * Revision 1.21 1996/05/08 15:23:47 wvcii
+ * added new node states: undone, recover, panic
+ *
+ * Revision 1.20 1995/12/01 14:59:19 root
+ * increased MAX_ANTECEDENTS from 10 to 20
+ * should consider getting rid of this (eliminate static array)
+ *
+ * Revision 1.19 1995/11/30 15:58:59 wvcii
+ * added copyright info
+ *
+ * Revision 1.18 1995/11/19 16:27:03 wvcii
+ * created struct dagList
+ *
+ * Revision 1.17 1995/11/07 15:43:01 wvcii
+ * added static array to DAGnode: antType
+ * added commitNode type
+ * added commit node counts to dag header
+ * added ptr (firstDag) to support multi-dag requests
+ * added succedent done/fired counts to nodes to support rollback
+ * added node status type "skipped"
+ * added hdr status types "rollForward, rollBackward"
+ * deleted hdr status type "disable"
+ * updated ResetNode & ResetDAGHeader to zero new fields
+ *
+ */
+
+#ifndef _RF__RF_DAG_H_
+#define _RF__RF_DAG_H_
+
+#include "rf_types.h"
+#include "rf_threadstuff.h"
+#include "rf_alloclist.h"
+#include "rf_stripelocks.h"
+#include "rf_layout.h"
+#include "rf_dagflags.h"
+#include "rf_acctrace.h"
+#include "rf_memchunk.h"
+
+#define RF_THREAD_CONTEXT 0 /* we were invoked from thread context */
+#define RF_INTR_CONTEXT 1 /* we were invoked from interrupt context */
+#define RF_MAX_ANTECEDENTS 20 /* max num of antecedents a node may possess */
+
+#ifdef KERNEL
+#include <sys/buf.h>
+#endif /* KERNEL */
+
+struct RF_PropHeader_s { /* structure for propagation of results */
+ int resultNum; /* bind result # resultNum */
+ int paramNum; /* to parameter # paramNum */
+ RF_PropHeader_t *next; /* linked list for multiple results/params */
+};
+
+typedef enum RF_NodeStatus_e {
+ rf_bwd1, /* node is ready for undo logging (backward error recovery only) */
+ rf_bwd2, /* node has completed undo logging (backward error recovery only) */
+ rf_wait, /* node is waiting to be executed */
+ rf_fired, /* node is currently executing its do function */
+ rf_good, /* node successfully completed execution of its do function */
+ rf_bad, /* node failed to successfully execute its do function */
+ rf_skipped, /* not used anymore, used to imply a node was not executed */
+ rf_recover, /* node is currently executing its undo function */
+ rf_panic, /* node failed to successfully execute its undo function */
+ rf_undone /* node successfully executed its undo function */
+} RF_NodeStatus_t;
+
+/*
+ * These were used to control skipping a node.
+ * Now, these are only used as comments.
+ */
+typedef enum RF_AntecedentType_e {
+ rf_trueData,
+ rf_antiData,
+ rf_outputData,
+ rf_control
+} RF_AntecedentType_t;
+
+#define RF_DAG_PTRCACHESIZE 40
+#define RF_DAG_PARAMCACHESIZE 12
+
+typedef RF_uint8 RF_DagNodeFlags_t;
+
+struct RF_DagNode_s {
+ RF_NodeStatus_t status; /* current status of this node */
+ int (*doFunc)(RF_DagNode_t *); /* normal function */
+ int (*undoFunc)(RF_DagNode_t *); /* func to remove effect of doFunc */
+ int (*wakeFunc)(RF_DagNode_t *, int status); /* func called when the node completes an I/O */
+ int numParams; /* number of parameters required by *funcPtr */
+ int numResults; /* number of results produced by *funcPtr */
+ int numAntecedents; /* number of antecedents */
+ int numAntDone; /* number of antecedents which have finished */
+ int numSuccedents; /* number of succedents */
+ int numSuccFired; /* incremented when a succedent is fired during forward execution */
+ int numSuccDone; /* incremented when a succedent finishes during rollBackward */
+ int commitNode; /* boolean flag - if true, this is a commit node */
+ RF_DagNode_t **succedents; /* succedents, array size numSuccedents */
+ RF_DagNode_t **antecedents; /* antecedents, array size numAntecedents */
+ RF_AntecedentType_t antType[RF_MAX_ANTECEDENTS]; /* type of each antecedent */
+ void **results; /* array of results produced by *funcPtr */
+ RF_DagParam_t *params; /* array of parameters required by *funcPtr */
+ RF_PropHeader_t **propList; /* propagation list, size numSuccedents */
+ RF_DagHeader_t *dagHdr; /* ptr to head of dag containing this node */
+ void *dagFuncData; /* dag execution func uses this for whatever it wants */
+ RF_DagNode_t *next;
+ int nodeNum; /* used by PrintDAG for debug only */
+ int visited; /* used to avoid re-visiting nodes on DAG walks */
+ /* ANY CODE THAT USES THIS FIELD MUST MAINTAIN THE PROPERTY
+ * THAT AFTER IT FINISHES, ALL VISITED FLAGS IN THE DAG ARE IDENTICAL */
+ char *name; /* debug only */
+ RF_DagNodeFlags_t flags; /* see below */
+ RF_DagNode_t *dag_ptrs[RF_DAG_PTRCACHESIZE]; /* cache for performance */
+ RF_DagParam_t dag_params[RF_DAG_PARAMCACHESIZE]; /* cache for performance */
+};
+
+/*
+ * Bit values for flags field of RF_DagNode_t
+ */
+#define RF_DAGNODE_FLAG_NONE 0x00
+#define RF_DAGNODE_FLAG_YIELD 0x01 /* in the kernel, yield the processor before firing this node */
+
+/* enable - DAG ready for normal execution, no errors encountered
+ * rollForward - DAG encountered an error after commit point, rolling forward
+ * rollBackward - DAG encountered an error prior to commit point, rolling backward
+ */
+typedef enum RF_DagStatus_e {
+ rf_enable,
+ rf_rollForward,
+ rf_rollBackward
+} RF_DagStatus_t;
+
+#define RF_MAX_HDR_SUCC 1
+
+#define RF_MAXCHUNKS 10
+
+struct RF_DagHeader_s {
+ RF_DagStatus_t status; /* status of this DAG */
+ int numSuccedents; /* the DAG may have more than one root (source) node */
+ int numCommitNodes; /* number of commit nodes in graph */
+ int numCommits; /* number of commit nodes which have been fired */
+ RF_DagNode_t *succedents[RF_MAX_HDR_SUCC]; /* array of succedents, size numSuccedents */
+ RF_DagHeader_t *next; /* ptr to allow a list of dags */
+ RF_AllocListElem_t *allocList; /* ptr to list of ptrs to be freed prior to freeing DAG */
+ RF_AccessStripeMapHeader_t *asmList; /* list of access stripe maps to be freed */
+ int nodeNum; /* used by PrintDAG for debug only */
+ int numNodesCompleted;
+ RF_AccTraceEntry_t *tracerec; /* perf mon only */
+
+ void (*cbFunc)(void *); /* function to call when the dag completes */
+ void *cbArg; /* argument for cbFunc */
+ char *creator; /* name of function used to create this dag */
+
+ RF_Raid_t *raidPtr; /* the descriptor for the RAID device this DAG is for */
+ void *bp; /* the bp for this I/O passed down from the file system. ignored outside kernel */
+
+ RF_ChunkDesc_t *memChunk[RF_MAXCHUNKS]; /* experimental- Chunks of memory to be retained upon DAG free for re-use */
+ int chunkIndex; /* the idea is to avoid calls to alloc and free */
+
+ RF_ChunkDesc_t **xtraMemChunk; /* escape hatch which allows SelectAlgorithm to merge memChunks from several dags */
+ int xtraChunkIndex; /* number of ptrs to valid chunks */
+ int xtraChunkCnt; /* number of ptrs to chunks allocated */
+
+#ifdef SIMULATE
+ int done; /* Tag to tell if termination node has been fired */
+#endif /* SIMULATE */
+};
+
+struct RF_DagList_s {
+ /* common info for a list of dags which will be fired sequentially */
+ int numDags; /* number of dags in the list */
+ int numDagsFired; /* number of dags in list which have initiated execution */
+ int numDagsDone; /* number of dags in list which have completed execution */
+ RF_DagHeader_t *dags; /* list of dags */
+ RF_RaidAccessDesc_t *desc; /* ptr to descriptor for this access */
+ RF_AccTraceEntry_t tracerec; /* perf mon info for dags (not user info) */
+};
+
+/* resets a node so that it can be fired again */
+#define RF_ResetNode(_n_) { \
+ (_n_)->status = rf_wait; \
+ (_n_)->numAntDone = 0; \
+ (_n_)->numSuccFired = 0; \
+ (_n_)->numSuccDone = 0; \
+ (_n_)->next = NULL; \
+}
+
+#ifdef SIMULATE
+#define RF_ResetDagHeader(_h_) { \
+ (_h_)->done = RF_FALSE; \
+ (_h_)->numNodesCompleted = 0; \
+ (_h_)->numCommits = 0; \
+ (_h_)->status = rf_enable; \
+}
+#else /* SIMULATE */
+#define RF_ResetDagHeader(_h_) { \
+ (_h_)->numNodesCompleted = 0; \
+ (_h_)->numCommits = 0; \
+ (_h_)->status = rf_enable; \
+}
+#endif /* SIMULATE */
+
+/* convenience macro for declaring a create dag function */
+
+#define RF_CREATE_DAG_FUNC_DECL(_name_) \
+void _name_ ( \
+ RF_Raid_t *raidPtr, \
+ RF_AccessStripeMap_t *asmap, \
+ RF_DagHeader_t *dag_h, \
+ void *bp, \
+ RF_RaidAccessFlags_t flags, \
+ RF_AllocListElem_t *allocList)
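+
+/*
+ * Example use (illustrative only; not a declaration RAIDframe itself relies
+ * on).  Given the macro above, the compiled-out line below expands to the
+ * six-argument prototype matching rf_CreateRaidOneDegradedReadDAG() as it
+ * is defined in rf_dagdegrd.c:
+ */
+#if 0
+RF_CREATE_DAG_FUNC_DECL(rf_CreateRaidOneDegradedReadDAG);
+#endif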
+
+#endif /* !_RF__RF_DAG_H_ */
diff --git a/sys/dev/raidframe/rf_dagdegrd.c b/sys/dev/raidframe/rf_dagdegrd.c
new file mode 100644
index 00000000000..06390061306
--- /dev/null
+++ b/sys/dev/raidframe/rf_dagdegrd.c
@@ -0,0 +1,1212 @@
+/* $OpenBSD: rf_dagdegrd.c,v 1.1 1999/01/11 14:29:06 niklas Exp $ */
+/* $NetBSD: rf_dagdegrd.c,v 1.1 1998/11/13 04:20:27 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland, Daniel Stodolsky, William V. Courtright II
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*
+ * rf_dagdegrd.c
+ *
+ * code for creating degraded read DAGs
+ *
+ * :
+ * Log: rf_dagdegrd.c,v
+ * Revision 1.20 1996/11/05 21:10:40 jimz
+ * failed pda generalization
+ *
+ * Revision 1.19 1996/08/19 23:30:36 jimz
+ * fix chained declustered accesses in degraded mode when mirror copy is failed
+ * (workload shifting not allowed when there are no duplicate copies extant)
+ *
+ * Revision 1.18 1996/07/31 16:29:01 jimz
+ * asm/asmap re-fix (EO merge)
+ *
+ * Revision 1.17 1996/07/31 15:34:34 jimz
+ * evenodd changes; bugfixes for double-degraded archs, generalize
+ * some formerly PQ-only functions
+ *
+ * Revision 1.16 1996/07/28 20:31:39 jimz
+ * i386netbsd port
+ * true/false fixup
+ *
+ * Revision 1.15 1996/07/27 23:36:08 jimz
+ * Solaris port of simulator
+ *
+ * Revision 1.14 1996/07/22 19:52:16 jimz
+ * switched node params to RF_DagParam_t, a union of
+ * a 64-bit int and a void *, for better portability
+ * attempted hpux port, but failed partway through for
+ * lack of a single C compiler capable of compiling all
+ * source files
+ *
+ * Revision 1.13 1996/06/09 02:36:46 jimz
+ * lots of little crufty cleanup- fixup whitespace
+ * issues, comment #ifdefs, improve typing in some
+ * places (esp size-related)
+ *
+ * Revision 1.12 1996/06/07 22:26:27 jimz
+ * type-ify which_ru (RF_ReconUnitNum_t)
+ *
+ * Revision 1.11 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.10 1996/05/31 22:26:54 jimz
+ * fix a lot of mapping problems, memory allocation problems
+ * found some weird lock issues, fixed 'em
+ * more code cleanup
+ *
+ * Revision 1.9 1996/05/30 11:29:41 jimz
+ * Numerous bug fixes. Stripe lock release code disagreed with the taking code
+ * about when stripes should be locked (I made it consistent: no parity, no lock)
+ * There was a lot of extra serialization of I/Os which I've removed- a lot of
+ * it was to calculate values for the cache code, which is no longer with us.
+ * More types, function, macro cleanup. Added code to properly quiesce the array
+ * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
+ * before. Fixed memory allocation, freeing bugs.
+ *
+ * Revision 1.8 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.7 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.6 1996/05/24 04:28:55 jimz
+ * release cleanup ckpt
+ *
+ * Revision 1.5 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.4 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.3 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.2 1996/05/08 21:01:24 jimz
+ * fixed up enum type names that were conflicting with other
+ * enums and function names (ie, "panic")
+ * future naming trends will be towards RF_ and rf_ for
+ * everything raidframe-related
+ *
+ * Revision 1.1 1996/05/03 19:22:23 wvcii
+ * Initial revision
+ *
+ */
+
+#include "rf_types.h"
+#include "rf_raid.h"
+#include "rf_dag.h"
+#include "rf_dagutils.h"
+#include "rf_dagfuncs.h"
+#include "rf_threadid.h"
+#include "rf_debugMem.h"
+#include "rf_memchunk.h"
+#include "rf_general.h"
+#include "rf_dagdegrd.h"
+#include "rf_sys.h"
+
+
+/******************************************************************************
+ *
+ * General comments on DAG creation:
+ *
+ * All DAGs in this file use roll-away error recovery. Each DAG has a single
+ * commit node, usually called "Cmt." If an error occurs before the Cmt node
+ * is reached, the execution engine will halt forward execution and work
+ * backward through the graph, executing the undo functions. Assuming that
+ * each node in the graph prior to the Cmt node is either undoable and atomic,
+ * or makes no changes to permanent state, the graph will fail atomically.
+ * If an error occurs after the Cmt node executes, the engine will roll-forward
+ * through the graph, blindly executing nodes until it reaches the end.
+ * If a graph reaches the end, it is assumed to have completed successfully.
+ *
+ * A graph has only 1 Cmt node.
+ *
+ */
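+
+/*
+ * The fragment below is a deliberately simplified, compiled-out restatement
+ * of the roll-away policy described above.  It is NOT the RAIDframe
+ * execution engine (see rf_engine.c); it flattens the DAG into a linear
+ * sequence of nodes, and every name in it (ra_node_t, ra_execute, ...) is
+ * hypothetical and exists only to make the recovery rule concrete.
+ */
+#if 0
+typedef struct ra_node {
+	int	(*doFunc)(struct ra_node *);	/* forward action, 0 on success */
+	int	(*undoFunc)(struct ra_node *);	/* removes the effect of doFunc */
+	int	 is_commit;			/* nonzero for the single Cmt node */
+} ra_node_t;
+
+/* execute nodes[0..n-1] in order; returns 0 if the graph completes */
+static int
+ra_execute(ra_node_t *nodes, int n)
+{
+	int i, committed = 0;
+
+	for (i = 0; i < n; i++) {
+		if (nodes[i].doFunc(&nodes[i]) == 0) {
+			if (nodes[i].is_commit)
+				committed = 1;
+			continue;
+		}
+		if (committed) {
+			/* error after Cmt: roll forward, blindly executing
+			 * the remaining nodes until the end of the graph */
+			for (i++; i < n; i++)
+				(void) nodes[i].doFunc(&nodes[i]);
+			return (0);
+		}
+		/* error before Cmt: roll backward, undoing the nodes that
+		 * already completed so the graph fails atomically */
+		for (i--; i >= 0; i--)
+			(void) nodes[i].undoFunc(&nodes[i]);
+		return (-1);
+	}
+	return (0);
+}
+#endif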
+
+
+/******************************************************************************
+ *
+ * The following wrappers map the standard DAG creation interface to the
+ * DAG creation routines. Additionally, these wrappers enable experimentation
+ * with new DAG structures by providing an extra level of indirection, allowing
+ * the DAG creation routines to be replaced at this single point.
+ */
+
+void rf_CreateRaidFiveDegradedReadDAG(
+ RF_Raid_t *raidPtr,
+ RF_AccessStripeMap_t *asmap,
+ RF_DagHeader_t *dag_h,
+ void *bp,
+ RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t *allocList)
+{
+ rf_CreateDegradedReadDAG(raidPtr, asmap, dag_h, bp, flags, allocList,
+ &rf_xorRecoveryFuncs);
+}
+
+
+/******************************************************************************
+ *
+ * DAG creation code begins here
+ */
+
+
+/******************************************************************************
+ * Create a degraded read DAG for RAID level 1
+ *
+ * Hdr -> Nil -> R(p/s)d -> Commit -> Trm
+ *
+ * The "Rd" node reads data from the surviving disk in the mirror pair
+ * Rpd - read of primary copy
+ * Rsd - read of secondary copy
+ *
+ * Parameters: raidPtr - description of the physical array
+ * asmap - logical & physical addresses for this access
+ * bp - buffer ptr (for holding write data)
+ * flags - general flags (e.g. disk locking)
+ * allocList - list of memory allocated in DAG creation
+ *****************************************************************************/
+
+void rf_CreateRaidOneDegradedReadDAG(
+ RF_Raid_t *raidPtr,
+ RF_AccessStripeMap_t *asmap,
+ RF_DagHeader_t *dag_h,
+ void *bp,
+ RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t *allocList)
+{
+ RF_DagNode_t *nodes, *rdNode, *blockNode, *commitNode, *termNode;
+ RF_StripeNum_t parityStripeID;
+ RF_ReconUnitNum_t which_ru;
+ RF_PhysDiskAddr_t *pda;
+ int useMirror, i;
+
+ useMirror = 0;
+ parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout),
+ asmap->raidAddress, &which_ru);
+ if (rf_dagDebug) {
+ printf("[Creating RAID level 1 degraded read DAG]\n");
+ }
+ dag_h->creator = "RaidOneDegradedReadDAG";
+ /* decide whether to read the primary copy or the mirror copy */
+ if (asmap->numDataFailed == 0)
+ useMirror = RF_FALSE;
+ else
+ useMirror = RF_TRUE;
+
+ /* total number of nodes = 1 + (block + commit + terminator) */
+ RF_CallocAndAdd(nodes, 4, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList);
+ i = 0;
+ rdNode = &nodes[i]; i++;
+ blockNode = &nodes[i]; i++;
+ commitNode = &nodes[i]; i++;
+ termNode = &nodes[i]; i++;
+
+ /* this dag can not commit until the commit node is reached. errors prior
+ * to the commit point imply the dag has failed and must be retried
+ */
+ dag_h->numCommitNodes = 1;
+ dag_h->numCommits = 0;
+ dag_h->numSuccedents = 1;
+
+ /* initialize the block, commit, and terminator nodes */
+ rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
+ NULL, 1, 0, 0, 0, dag_h, "Nil", allocList);
+ rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
+ NULL, 1, 1, 0, 0, dag_h, "Cmt", allocList);
+ rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc,
+ NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
+
+ pda = asmap->physInfo;
+ RF_ASSERT(pda != NULL);
+ /* parityInfo must describe entire parity unit */
+ RF_ASSERT(asmap->parityInfo->next == NULL);
+
+ /* initialize the data node */
+ if (!useMirror) {
+ /* read primary copy of data */
+ rf_InitNode(rdNode, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc,
+ rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rpd", allocList);
+ rdNode->params[0].p = pda;
+ rdNode->params[1].p = pda->bufPtr;
+ rdNode->params[2].v = parityStripeID;
+ rdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ }
+ else {
+ /* read secondary copy of data */
+ rf_InitNode(rdNode, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc,
+ rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rsd", allocList);
+ rdNode->params[0].p = asmap->parityInfo;
+ rdNode->params[1].p = pda->bufPtr;
+ rdNode->params[2].v = parityStripeID;
+ rdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ }
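+
+ /*
+ * A sketch of the standard disk-node parameter layout used throughout this
+ * file (as set up above): params[0] is the physical disk address (pda),
+ * params[1] the data buffer, params[2] the parity stripe ID, and params[3]
+ * a value packed by RF_CREATE_PARAM3, which here appears to combine the I/O
+ * priority (RF_IO_NORMAL_PRIORITY), two flags left at 0, and the
+ * reconstruction unit number (which_ru).
+ */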
+
+ /* connect header to block node */
+ RF_ASSERT(dag_h->numSuccedents == 1);
+ RF_ASSERT(blockNode->numAntecedents == 0);
+ dag_h->succedents[0] = blockNode;
+
+ /* connect block node to rdnode */
+ RF_ASSERT(blockNode->numSuccedents == 1);
+ RF_ASSERT(rdNode->numAntecedents == 1);
+ blockNode->succedents[0] = rdNode;
+ rdNode->antecedents[0] = blockNode;
+ rdNode->antType[0] = rf_control;
+
+ /* connect rdnode to commit node */
+ RF_ASSERT(rdNode->numSuccedents == 1);
+ RF_ASSERT(commitNode->numAntecedents == 1);
+ rdNode->succedents[0] = commitNode;
+ commitNode->antecedents[0] = rdNode;
+ commitNode->antType[0] = rf_control;
+
+ /* connect commit node to terminator */
+ RF_ASSERT(commitNode->numSuccedents == 1);
+ RF_ASSERT(termNode->numAntecedents == 1);
+ RF_ASSERT(termNode->numSuccedents == 0);
+ commitNode->succedents[0] = termNode;
+ termNode->antecedents[0] = commitNode;
+ termNode->antType[0] = rf_control;
+}
+
+
+
+/******************************************************************************
+ *
+ * creates a DAG to perform a degraded-mode read of data within one stripe.
+ * This DAG is as follows:
+ *
+ * Hdr -> Block -> Rud -> Xor -> Cmt -> T
+ * -> Rrd ->
+ * -> Rp -->
+ *
+ * Each R node is a successor of the Block node
+ * The single successor arc from each R node goes to X
+ * There is one Rud for each chunk of surviving user data requested by the
+ * user, and one Rrd for each chunk of surviving user data _not_ being read by
+ * the user
+ * R = read, ud = user data, rd = recovery (surviving) data, p = parity
+ * X = XOR, C = Commit, T = terminate
+ *
+ * The block node guarantees a single source node.
+ *
+ * Note: The target buffer for the XOR node is set to the actual user buffer
+ * where the failed data is supposed to end up. This buffer is zero'd by the
+ * code here. Thus, if you create a degraded read dag, use it, and then
+ * re-use it, you must be sure to zero the target buffer prior to the re-use.
+ *
+ * The recFunc argument at the end specifies the redundancy-recovery
+ * function (and its name) to be used.
+ *
+ *****************************************************************************/
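+
+/*
+ * A hypothetical worked example of the sizing done below: suppose a 5-disk
+ * RAID level 5 array (4 data columns + parity) in which one data disk has
+ * failed, and the user access touches 3 stripe units, one of which lives on
+ * the failed disk.  Then nRudNodes = 3 - 1 = 2, the ASMs produced by
+ * rf_GenerateFailedAccessASMs() describe the 4 - 3 = 1 surviving unit not
+ * touched by the user (nRrdNodes = 1), and nNodes = 5 + 2 + 1 = 8
+ * (block, commit, xor, Rp, term, two Rud, one Rrd).
+ */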
+
+void rf_CreateDegradedReadDAG(
+ RF_Raid_t *raidPtr,
+ RF_AccessStripeMap_t *asmap,
+ RF_DagHeader_t *dag_h,
+ void *bp,
+ RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t *allocList,
+ RF_RedFuncs_t *recFunc)
+{
+ RF_DagNode_t *nodes, *rudNodes, *rrdNodes, *xorNode, *blockNode;
+ RF_DagNode_t *commitNode, *rpNode, *termNode;
+ int nNodes, nRrdNodes, nRudNodes, nXorBufs, i;
+ int j, paramNum;
+ RF_SectorCount_t sectorsPerSU;
+ RF_ReconUnitNum_t which_ru;
+ char *overlappingPDAs; /* a temporary array of flags */
+ RF_AccessStripeMapHeader_t *new_asm_h[2];
+ RF_PhysDiskAddr_t *pda, *parityPDA;
+ RF_StripeNum_t parityStripeID;
+ RF_PhysDiskAddr_t *failedPDA;
+ RF_RaidLayout_t *layoutPtr;
+ char *rpBuf;
+
+ layoutPtr = &(raidPtr->Layout);
+ /* failedPDA points to the pda within the asm that targets the failed disk */
+ failedPDA = asmap->failedPDAs[0];
+ parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr,
+ asmap->raidAddress, &which_ru);
+ sectorsPerSU = layoutPtr->sectorsPerStripeUnit;
+
+ if (rf_dagDebug) {
+ printf("[Creating degraded read DAG]\n");
+ }
+
+ RF_ASSERT( asmap->numDataFailed == 1 );
+ dag_h->creator = "DegradedReadDAG";
+
+ /*
+ * generate two ASMs identifying the surviving data we need
+ * in order to recover the lost data
+ */
+
+ /* overlappingPDAs array must be zero'd */
+ RF_Calloc(overlappingPDAs, asmap->numStripeUnitsAccessed, sizeof(char), (char *));
+ rf_GenerateFailedAccessASMs(raidPtr, asmap, failedPDA, dag_h, new_asm_h, &nXorBufs,
+ &rpBuf, overlappingPDAs, allocList);
+
+ /*
+ * create all the nodes at once
+ *
+ * -1 because no access is generated for the failed pda
+ */
+ nRudNodes = asmap->numStripeUnitsAccessed-1;
+ nRrdNodes = ((new_asm_h[0]) ? new_asm_h[0]->stripeMap->numStripeUnitsAccessed : 0) +
+ ((new_asm_h[1]) ? new_asm_h[1]->stripeMap->numStripeUnitsAccessed : 0);
+ nNodes = 5 + nRudNodes + nRrdNodes; /* block, commit, xor, Rp, term, plus Rud and Rrd */
+ RF_CallocAndAdd(nodes, nNodes, sizeof(RF_DagNode_t), (RF_DagNode_t *),
+ allocList);
+ i = 0;
+ blockNode = &nodes[i]; i++;
+ commitNode = &nodes[i]; i++;
+ xorNode = &nodes[i]; i++;
+ rpNode = &nodes[i]; i++;
+ termNode = &nodes[i]; i++;
+ rudNodes = &nodes[i]; i += nRudNodes;
+ rrdNodes = &nodes[i]; i += nRrdNodes;
+ RF_ASSERT(i == nNodes);
+
+ /* initialize nodes */
+ dag_h->numCommitNodes = 1;
+ dag_h->numCommits = 0;
+ /* this dag cannot commit until the commit node is reached;
+ * errors prior to the commit point imply the dag has failed
+ */
+ dag_h->numSuccedents = 1;
+
+ rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
+ NULL, nRudNodes+nRrdNodes+1, 0, 0, 0, dag_h, "Nil", allocList);
+ rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
+ NULL, 1, 1, 0, 0, dag_h, "Cmt", allocList);
+ rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc,
+ NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
+ rf_InitNode(xorNode, rf_wait, RF_FALSE, recFunc->simple, rf_NullNodeUndoFunc,
+ NULL, 1, nRudNodes+nRrdNodes+1, 2*nXorBufs+2, 1, dag_h,
+ recFunc->SimpleName, allocList);
+
+ /* fill in the Rud nodes */
+ for (pda=asmap->physInfo, i=0; i<nRudNodes; i++, pda=pda->next) {
+ if (pda == failedPDA) {i--; continue;}
+ rf_InitNode(&rudNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc,
+ rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h,
+ "Rud", allocList);
+ RF_ASSERT(pda);
+ rudNodes[i].params[0].p = pda;
+ rudNodes[i].params[1].p = pda->bufPtr;
+ rudNodes[i].params[2].v = parityStripeID;
+ rudNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ }
+
+ /* fill in the Rrd nodes */
+ i = 0;
+ if (new_asm_h[0]) {
+ for (pda=new_asm_h[0]->stripeMap->physInfo;
+ i<new_asm_h[0]->stripeMap->numStripeUnitsAccessed;
+ i++, pda=pda->next)
+ {
+ rf_InitNode(&rrdNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc,
+ rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0,
+ dag_h, "Rrd", allocList);
+ RF_ASSERT(pda);
+ rrdNodes[i].params[0].p = pda;
+ rrdNodes[i].params[1].p = pda->bufPtr;
+ rrdNodes[i].params[2].v = parityStripeID;
+ rrdNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ }
+ }
+ if (new_asm_h[1]) {
+ for (j=0,pda=new_asm_h[1]->stripeMap->physInfo;
+ j<new_asm_h[1]->stripeMap->numStripeUnitsAccessed;
+ j++, pda=pda->next)
+ {
+ rf_InitNode(&rrdNodes[i+j], rf_wait, RF_FALSE, rf_DiskReadFunc,
+ rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0,
+ dag_h, "Rrd", allocList);
+ RF_ASSERT(pda);
+ rrdNodes[i+j].params[0].p = pda;
+ rrdNodes[i+j].params[1].p = pda->bufPtr;
+ rrdNodes[i+j].params[2].v = parityStripeID;
+ rrdNodes[i+j].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ }
+ }
+
+ /* make a PDA for the parity unit */
+ RF_MallocAndAdd(parityPDA, sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList);
+ parityPDA->row = asmap->parityInfo->row;
+ parityPDA->col = asmap->parityInfo->col;
+ parityPDA->startSector = ((asmap->parityInfo->startSector / sectorsPerSU)
+ * sectorsPerSU) + (failedPDA->startSector % sectorsPerSU);
+ parityPDA->numSector = failedPDA->numSector;
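+
+ /*
+ * For example (hypothetical numbers): with sectorsPerSU = 64, a parity unit
+ * starting at sector 640, and a failed PDA starting at sector 272
+ * (272 % 64 = 16 sectors into its stripe unit), the computation above yields
+ * (640 / 64) * 64 + 16 = 656, i.e. the parity PDA covers the same range
+ * within its stripe unit as the failed PDA does within its own.
+ */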
+
+ /* initialize the Rp node */
+ rf_InitNode(rpNode, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc,
+ rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rp ", allocList);
+ rpNode->params[0].p = parityPDA;
+ rpNode->params[1].p = rpBuf;
+ rpNode->params[2].v = parityStripeID;
+ rpNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+
+ /*
+ * the last and nastiest step is to assign all
+ * the parameters of the Xor node
+ */
+ paramNum=0;
+ for (i=0; i<nRrdNodes; i++) {
+ /* all the Rrd nodes need to be xored together */
+ xorNode->params[paramNum++] = rrdNodes[i].params[0];
+ xorNode->params[paramNum++] = rrdNodes[i].params[1];
+ }
+ for (i=0; i<nRudNodes; i++) {
+ /* any Rud nodes that overlap the failed access need to be xored in */
+ if (overlappingPDAs[i]) {
+ RF_MallocAndAdd(pda, sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList);
+ bcopy((char *)rudNodes[i].params[0].p, (char *)pda, sizeof(RF_PhysDiskAddr_t));
+ rf_RangeRestrictPDA(raidPtr, failedPDA, pda, RF_RESTRICT_DOBUFFER, 0);
+ xorNode->params[paramNum++].p = pda;
+ xorNode->params[paramNum++].p = pda->bufPtr;
+ }
+ }
+ RF_Free(overlappingPDAs, asmap->numStripeUnitsAccessed * sizeof(char));
+
+ /* install parity pda as last set of params to be xor'd */
+ xorNode->params[paramNum++].p = parityPDA;
+ xorNode->params[paramNum++].p = rpBuf;
+
+ /*
+ * the last 2 params to the recovery xor node are
+ * the failed PDA and the raidPtr
+ */
+ xorNode->params[paramNum++].p = failedPDA;
+ xorNode->params[paramNum++].p = raidPtr;
+ RF_ASSERT( paramNum == 2*nXorBufs+2 );
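+
+ /*
+ * To summarize the layout built above, the Xor node's param list looks
+ * roughly like: (pda, buf) for each Rrd node, (pda, buf) for each Rud node
+ * that overlaps the failed access (range-restricted copies), (parityPDA,
+ * rpBuf), and finally the failedPDA and the raidPtr, for a total of
+ * 2*nXorBufs + 2 entries.
+ */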
+
+ /*
+ * The xor node uses results[0] as the target buffer.
+ * Set pointer and zero the buffer. In the kernel, this
+ * may be a user buffer in which case we have to remap it.
+ */
+ xorNode->results[0] = failedPDA->bufPtr;
+ RF_BZERO(bp, failedPDA->bufPtr, rf_RaidAddressToByte(raidPtr,
+ failedPDA->numSector));
+
+ /* connect nodes to form graph */
+ /* connect the header to the block node */
+ RF_ASSERT(dag_h->numSuccedents == 1);
+ RF_ASSERT(blockNode->numAntecedents == 0);
+ dag_h->succedents[0] = blockNode;
+
+ /* connect the block node to the read nodes */
+ RF_ASSERT(blockNode->numSuccedents == (1 + nRrdNodes + nRudNodes));
+ RF_ASSERT(rpNode->numAntecedents == 1);
+ blockNode->succedents[0] = rpNode;
+ rpNode->antecedents[0] = blockNode;
+ rpNode->antType[0] = rf_control;
+ for (i = 0; i < nRrdNodes; i++) {
+ RF_ASSERT(rrdNodes[i].numSuccedents == 1);
+ blockNode->succedents[1 + i] = &rrdNodes[i];
+ rrdNodes[i].antecedents[0] = blockNode;
+ rrdNodes[i].antType[0] = rf_control;
+ }
+ for (i = 0; i < nRudNodes; i++) {
+ RF_ASSERT(rudNodes[i].numSuccedents == 1);
+ blockNode->succedents[1 + nRrdNodes + i] = &rudNodes[i];
+ rudNodes[i].antecedents[0] = blockNode;
+ rudNodes[i].antType[0] = rf_control;
+ }
+
+ /* connect the read nodes to the xor node */
+ RF_ASSERT(xorNode->numAntecedents == (1 + nRrdNodes + nRudNodes));
+ RF_ASSERT(rpNode->numSuccedents == 1);
+ rpNode->succedents[0] = xorNode;
+ xorNode->antecedents[0] = rpNode;
+ xorNode->antType[0] = rf_trueData;
+ for (i = 0; i < nRrdNodes; i++) {
+ RF_ASSERT(rrdNodes[i].numSuccedents == 1);
+ rrdNodes[i].succedents[0] = xorNode;
+ xorNode->antecedents[1 + i] = &rrdNodes[i];
+ xorNode->antType[1 + i] = rf_trueData;
+ }
+ for (i = 0; i < nRudNodes; i++) {
+ RF_ASSERT(rudNodes[i].numSuccedents == 1);
+ rudNodes[i].succedents[0] = xorNode;
+ xorNode->antecedents[1 + nRrdNodes + i] = &rudNodes[i];
+ xorNode->antType[1 + nRrdNodes + i] = rf_trueData;
+ }
+
+ /* connect the xor node to the commit node */
+ RF_ASSERT(xorNode->numSuccedents == 1);
+ RF_ASSERT(commitNode->numAntecedents == 1);
+ xorNode->succedents[0] = commitNode;
+ commitNode->antecedents[0] = xorNode;
+ commitNode->antType[0] = rf_control;
+
+ /* connect the commit node to the term node */
+ RF_ASSERT(commitNode->numSuccedents == 1);
+ RF_ASSERT(termNode->numAntecedents == 1);
+ RF_ASSERT(termNode->numSuccedents == 0);
+ commitNode->succedents[0] = termNode;
+ termNode->antType[0] = rf_control;
+ termNode->antecedents[0] = commitNode;
+}
+
+
+/******************************************************************************
+ * Create a degraded read DAG for Chained Declustering
+ *
+ * Hdr -> Nil -> R(p/s)d -> Cmt -> Trm
+ *
+ * The "Rd" node reads data from the surviving disk in the mirror pair
+ * Rpd - read of primary copy
+ * Rsd - read of secondary copy
+ *
+ * Parameters: raidPtr - description of the physical array
+ * asmap - logical & physical addresses for this access
+ * bp - buffer ptr (for holding write data)
+ * flags - general flags (e.g. disk locking)
+ * allocList - list of memory allocated in DAG creation
+ *****************************************************************************/
+
+void rf_CreateRaidCDegradedReadDAG(
+ RF_Raid_t *raidPtr,
+ RF_AccessStripeMap_t *asmap,
+ RF_DagHeader_t *dag_h,
+ void *bp,
+ RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t *allocList)
+{
+ RF_DagNode_t *nodes, *rdNode, *blockNode, *commitNode, *termNode;
+ RF_StripeNum_t parityStripeID;
+ int useMirror, i, shiftable;
+ RF_ReconUnitNum_t which_ru;
+ RF_PhysDiskAddr_t *pda;
+
+ if ((asmap->numDataFailed + asmap->numParityFailed) == 0) {
+ shiftable = RF_TRUE;
+ }
+ else {
+ shiftable = RF_FALSE;
+ }
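+
+ /*
+ * Note: shiftable is true only when the stripe is fully intact (no data or
+ * parity failures); in that case rf_compute_workload_shift() below may
+ * redirect the read to the next disk in line, presumably to balance load
+ * across the chained-declustering pair.
+ */
+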
+ useMirror = 0;
+ parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout),
+ asmap->raidAddress, &which_ru);
+
+ if (rf_dagDebug) {
+ printf("[Creating RAID C degraded read DAG]\n");
+ }
+ dag_h->creator = "RaidCDegradedReadDAG";
+ /* decide whether the read must be serviced by the mirror (secondary) copy */
+ if (asmap->numDataFailed == 0)
+ useMirror = RF_FALSE;
+ else
+ useMirror = RF_TRUE;
+
+ /* total number of nodes = 1 read node + (block + commit + terminator) */
+ RF_CallocAndAdd(nodes, 4, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList);
+ i = 0;
+ rdNode = &nodes[i]; i++;
+ blockNode = &nodes[i]; i++;
+ commitNode = &nodes[i]; i++;
+ termNode = &nodes[i]; i++;
+
+ /*
+ * This dag can not commit until the commit node is reached.
+ * Errors prior to the commit point imply the dag has failed
+ * and must be retried.
+ */
+ dag_h->numCommitNodes = 1;
+ dag_h->numCommits = 0;
+ dag_h->numSuccedents = 1;
+
+ /* initialize the block, commit, and terminator nodes */
+ rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
+ NULL, 1, 0, 0, 0, dag_h, "Nil", allocList);
+ rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
+ NULL, 1, 1, 0, 0, dag_h, "Cmt", allocList);
+ rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc,
+ NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
+
+ pda = asmap->physInfo;
+ RF_ASSERT(pda != NULL);
+ /* parityInfo must describe entire parity unit */
+ RF_ASSERT(asmap->parityInfo->next == NULL);
+
+ /* initialize the data node */
+ if (!useMirror) {
+ rf_InitNode(rdNode, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc,
+ rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rpd", allocList);
+ if (shiftable && rf_compute_workload_shift(raidPtr, pda)) {
+ /* shift this read to the next disk in line */
+ rdNode->params[0].p = asmap->parityInfo;
+ rdNode->params[1].p = pda->bufPtr;
+ rdNode->params[2].v = parityStripeID;
+ rdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ }
+ else {
+ /* read primary copy */
+ rdNode->params[0].p = pda;
+ rdNode->params[1].p = pda->bufPtr;
+ rdNode->params[2].v = parityStripeID;
+ rdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ }
+ }
+ else {
+ /* read secondary copy of data */
+ rf_InitNode(rdNode, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc,
+ rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rsd", allocList);
+ rdNode->params[0].p = asmap->parityInfo;
+ rdNode->params[1].p = pda->bufPtr;
+ rdNode->params[2].v = parityStripeID;
+ rdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ }
+
+ /* connect header to block node */
+ RF_ASSERT(dag_h->numSuccedents == 1);
+ RF_ASSERT(blockNode->numAntecedents == 0);
+ dag_h->succedents[0] = blockNode;
+
+ /* connect block node to rdnode */
+ RF_ASSERT(blockNode->numSuccedents == 1);
+ RF_ASSERT(rdNode->numAntecedents == 1);
+ blockNode->succedents[0] = rdNode;
+ rdNode->antecedents[0] = blockNode;
+ rdNode->antType[0] = rf_control;
+
+ /* connect rdnode to commit node */
+ RF_ASSERT(rdNode->numSuccedents == 1);
+ RF_ASSERT(commitNode->numAntecedents == 1);
+ rdNode->succedents[0] = commitNode;
+ commitNode->antecedents[0] = rdNode;
+ commitNode->antType[0] = rf_control;
+
+ /* connect commit node to terminator */
+ RF_ASSERT(commitNode->numSuccedents == 1);
+ RF_ASSERT(termNode->numAntecedents == 1);
+ RF_ASSERT(termNode->numSuccedents == 0);
+ commitNode->succedents[0] = termNode;
+ termNode->antecedents[0] = commitNode;
+ termNode->antType[0] = rf_control;
+}
+
+/*
+ * XXX move this elsewhere?
+ */
+void rf_DD_GenerateFailedAccessASMs(
+ RF_Raid_t *raidPtr,
+ RF_AccessStripeMap_t *asmap,
+ RF_PhysDiskAddr_t **pdap,
+ int *nNodep,
+ RF_PhysDiskAddr_t **pqpdap,
+ int *nPQNodep,
+ RF_AllocListElem_t *allocList)
+{
+ RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
+ int PDAPerDisk,i;
+ RF_SectorCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
+ int numDataCol = layoutPtr->numDataCol;
+ int state;
+ RF_SectorNum_t suoff, suend;
+ unsigned firstDataCol, napdas, count;
+ RF_SectorNum_t fone_start, fone_end, ftwo_start = 0, ftwo_end = 0;
+ RF_PhysDiskAddr_t *fone = asmap->failedPDAs[0], *ftwo = asmap->failedPDAs[1];
+ RF_PhysDiskAddr_t *pda_p;
+ RF_PhysDiskAddr_t *phys_p;
+ RF_RaidAddr_t sosAddr;
+
+ /* determine how many pda's we will have to generate per unaccessed stripe
+ unit.  If there is only one failed data unit, it is one; if two, possibly
+ two, depending on whether they overlap. */
+
+ fone_start = rf_StripeUnitOffset(layoutPtr,fone->startSector);
+ fone_end = fone_start + fone->numSector;
+
+#define CONS_PDA(if,start,num) \
+ pda_p->row = asmap->if->row; pda_p->col = asmap->if->col; \
+ pda_p->startSector = ((asmap->if->startSector / secPerSU) * secPerSU) + start; \
+ pda_p->numSector = num; \
+ pda_p->next = NULL; \
+ RF_MallocAndAdd(pda_p->bufPtr,rf_RaidAddressToByte(raidPtr,num),(char *), allocList)
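+
+ /*
+ * For example, CONS_PDA(parityInfo,fone_start,fone->numSector) expands
+ * (roughly) to code that copies the row/col of asmap->parityInfo into *pda_p,
+ * sets its startSector to the parity unit's stripe-unit boundary plus
+ * fone_start, sets its length to the failed region's length, and allocates a
+ * matching buffer.
+ */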
+
+ if (asmap->numDataFailed==1)
+ {
+ PDAPerDisk = 1;
+ state = 1;
+ RF_MallocAndAdd(*pqpdap,2*sizeof(RF_PhysDiskAddr_t),(RF_PhysDiskAddr_t *), allocList);
+ pda_p = *pqpdap;
+ /* build p */
+ CONS_PDA(parityInfo,fone_start,fone->numSector);
+ pda_p->type = RF_PDA_TYPE_PARITY;
+ pda_p++;
+ /* build q */
+ CONS_PDA(qInfo,fone_start,fone->numSector);
+ pda_p->type = RF_PDA_TYPE_Q;
+ }
+ else
+ {
+ ftwo_start = rf_StripeUnitOffset(layoutPtr,ftwo->startSector);
+ ftwo_end = ftwo_start + ftwo->numSector;
+ if (fone->numSector + ftwo->numSector > secPerSU)
+ {
+ PDAPerDisk = 1;
+ state = 2;
+ RF_MallocAndAdd(*pqpdap,2*sizeof(RF_PhysDiskAddr_t),(RF_PhysDiskAddr_t *), allocList);
+ pda_p = *pqpdap;
+ CONS_PDA(parityInfo,0,secPerSU);
+ pda_p->type = RF_PDA_TYPE_PARITY;
+ pda_p++;
+ CONS_PDA(qInfo,0,secPerSU);
+ pda_p->type = RF_PDA_TYPE_Q;
+ }
+ else
+ {
+ PDAPerDisk = 2;
+ state = 3;
+ /* four of them, fone, then ftwo */
+ RF_MallocAndAdd(*pqpdap,4*sizeof(RF_PhysDiskAddr_t),(RF_PhysDiskAddr_t *), allocList);
+ pda_p = *pqpdap;
+ CONS_PDA(parityInfo,fone_start,fone->numSector);
+ pda_p->type = RF_PDA_TYPE_PARITY;
+ pda_p++;
+ CONS_PDA(qInfo,fone_start,fone->numSector);
+ pda_p->type = RF_PDA_TYPE_Q;
+ pda_p++;
+ CONS_PDA(parityInfo,ftwo_start,ftwo->numSector);
+ pda_p->type = RF_PDA_TYPE_PARITY;
+ pda_p++;
+ CONS_PDA(qInfo,ftwo_start,ftwo->numSector);
+ pda_p->type = RF_PDA_TYPE_Q;
+ }
+ }
+ /* figure out the number of nonaccessed pda's */
+ napdas = PDAPerDisk * (numDataCol - asmap->numStripeUnitsAccessed - (ftwo==NULL ? 1 : 0));
+ *nPQNodep = PDAPerDisk;
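+
+ /*
+ * For example (hypothetical numbers): with numDataCol = 8, 3 stripe units
+ * accessed, a single failed data unit (ftwo == NULL) and PDAPerDisk = 1,
+ * the line above gives napdas = 1 * (8 - 3 - 1) = 4; the loop below then
+ * adds one more pda for each accessed unit that only partially covers the
+ * failed region.
+ */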
+
+ /* sweep over the accessed pda's, figuring out the number of additional
+ pda's to generate.  Of course, skip the failed ones. */
+
+ count = 0;
+ for ( pda_p=asmap->physInfo; pda_p; pda_p= pda_p->next)
+ {
+ if ((pda_p == fone) || (pda_p == ftwo))
+ continue;
+ suoff = rf_StripeUnitOffset(layoutPtr,pda_p->startSector);
+ suend = suoff + pda_p->numSector;
+ switch (state)
+ {
+ case 1: /* one failed PDA to overlap */
+ /* if a PDA doesn't contain the failed unit, it can
+ only miss the start or end, not both */
+ if ((suoff > fone_start) || (suend <fone_end))
+ count++;
+ break;
+ case 2: /* whole stripe */
+ if (suoff) /* leak at beginning */
+ count++;
+ if (suend < numDataCol) /* leak at end */
+ count++;
+ break;
+ case 3: /* two disjoint units */
+ if ((suoff > fone_start) || (suend <fone_end))
+ count++;
+ if ((suoff > ftwo_start) || (suend <ftwo_end))
+ count++;
+ break;
+ default:
+ RF_PANIC();
+ }
+ }
+
+ napdas += count;
+ *nNodep = napdas;
+ if (napdas == 0) return; /* short circuit */
+
+ /* allocate our list of pda's */
+
+ RF_CallocAndAdd(pda_p, napdas, sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList);
+ *pdap = pda_p;
+
+ /* link them together */
+ for (i=0; i < (napdas-1); i++)
+ pda_p[i].next = pda_p+(i+1);
+
+ /* march through the stripe units up to the first accessed disk */
+ firstDataCol = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),asmap->physInfo->raidAddress) % numDataCol;
+ sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
+ for (i=0; i < firstDataCol; i++)
+ {
+ if ((pda_p - (*pdap)) == napdas)
+ continue;
+ pda_p->type = RF_PDA_TYPE_DATA;
+ pda_p->raidAddress = sosAddr + (i * secPerSU);
+ (raidPtr->Layout.map->MapSector)(raidPtr,pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0);
+ /* skip over dead disks */
+ if (RF_DEAD_DISK(raidPtr->Disks[pda_p->row][pda_p->col].status))
+ continue;
+ switch (state)
+ {
+ case 1: /* fone */
+ pda_p->numSector = fone->numSector;
+ pda_p->raidAddress += fone_start;
+ pda_p->startSector += fone_start;
+ RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList);
+ break;
+ case 2: /* full stripe */
+ pda_p->numSector = secPerSU;
+ RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,secPerSU), (char *), allocList);
+ break;
+ case 3: /* two slabs */
+ pda_p->numSector = fone->numSector;
+ pda_p->raidAddress += fone_start;
+ pda_p->startSector += fone_start;
+ RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList);
+ pda_p++;
+ pda_p->type = RF_PDA_TYPE_DATA;
+ pda_p->raidAddress = sosAddr + (i * secPerSU);
+ (raidPtr->Layout.map->MapSector)(raidPtr,pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0);
+ pda_p->numSector = ftwo->numSector;
+ pda_p->raidAddress += ftwo_start;
+ pda_p->startSector += ftwo_start;
+ RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList);
+ break;
+ default:
+ RF_PANIC();
+ }
+ pda_p++;
+ }
+
+ /* march through the touched stripe units */
+ for (phys_p = asmap->physInfo; phys_p; phys_p = phys_p->next, i++)
+ {
+ if ((phys_p == asmap->failedPDAs[0]) || (phys_p == asmap->failedPDAs[1]))
+ continue;
+ suoff = rf_StripeUnitOffset(layoutPtr,phys_p->startSector);
+ suend = suoff + phys_p->numSector;
+ switch(state)
+ {
+ case 1: /* single buffer */
+ if (suoff > fone_start)
+ {
+ RF_ASSERT( suend >= fone_end );
+ /* The data read starts after the mapped access;
+ snip off the beginning */
+ pda_p->numSector = suoff - fone_start;
+ pda_p->raidAddress = sosAddr + (i*secPerSU) + fone_start;
+ (raidPtr->Layout.map->MapSector)(raidPtr,pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0);
+ RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList);
+ pda_p++;
+ }
+ if (suend < fone_end)
+ {
+ RF_ASSERT ( suoff <= fone_start);
+ /* The data read stops before the end of the failed access, extend */
+ pda_p->numSector = fone_end - suend;
+ pda_p->raidAddress = sosAddr + (i*secPerSU) + suend; /* off by one? */
+ (raidPtr->Layout.map->MapSector)(raidPtr,pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0);
+ RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList);
+ pda_p++;
+ }
+ break;
+ case 2: /* whole stripe unit */
+ RF_ASSERT( (suoff == 0) || (suend == secPerSU));
+ if (suend < secPerSU)
+ { /* short read, snip from the end */
+ pda_p->numSector = secPerSU - suend;
+ pda_p->raidAddress = sosAddr + (i*secPerSU) + suend; /* off by one? */
+ (raidPtr->Layout.map->MapSector)(raidPtr,pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0);
+ RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList);
+ pda_p++;
+ }
+ else
+ if (suoff > 0)
+ { /* short at front */
+ pda_p->numSector = suoff;
+ pda_p->raidAddress = sosAddr + (i*secPerSU);
+ (raidPtr->Layout.map->MapSector)(raidPtr,pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0);
+ RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList);
+ pda_p++;
+ }
+ break;
+ case 3: /* two nonoverlapping failures */
+ if ((suoff > fone_start) || (suend <fone_end))
+ {
+ if (suoff > fone_start)
+ {
+ RF_ASSERT( suend >= fone_end );
+ /* The data read starts after the mapped access;
+ snip off the beginning */
+ pda_p->numSector = suoff - fone_start;
+ pda_p->raidAddress = sosAddr + (i*secPerSU) + fone_start;
+ (raidPtr->Layout.map->MapSector)(raidPtr,pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0);
+ RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList);
+ pda_p++;
+ }
+ if (suend < fone_end)
+ {
+ RF_ASSERT ( suoff <= fone_start);
+ /* The data read stops before the end of the failed access, extend */
+ pda_p->numSector = fone_end - suend;
+ pda_p->raidAddress = sosAddr + (i*secPerSU) + suend; /* off by one? */
+ (raidPtr->Layout.map->MapSector)(raidPtr,pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0);
+ RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList);
+ pda_p++;
+ }
+ }
+ if ((suoff > ftwo_start) || (suend <ftwo_end))
+ {
+ if (suoff > ftwo_start)
+ {
+ RF_ASSERT( suend >= ftwo_end );
+ /* The data read starts after the mapped access;
+ snip off the beginning */
+ pda_p->numSector = suoff - ftwo_start;
+ pda_p->raidAddress = sosAddr + (i*secPerSU) + ftwo_start;
+ (raidPtr->Layout.map->MapSector)(raidPtr,pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0);
+ RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList);
+ pda_p++;
+ }
+ if (suend < ftwo_end)
+ {
+ RF_ASSERT ( suoff <= ftwo_start);
+ /* The data read stops before the end of the failed access, extend */
+ pda_p->numSector = ftwo_end - suend;
+ pda_p->raidAddress = sosAddr + (i*secPerSU) + suend; /* off by one? */
+ (raidPtr->Layout.map->MapSector)(raidPtr,pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0);
+ RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList);
+ pda_p++;
+ }
+ }
+ break;
+ default:
+ RF_PANIC();
+ }
+ }
+
+ /* after the last accessed disk */
+ for (; i < numDataCol; i++ )
+ {
+ if ((pda_p - (*pdap)) == napdas)
+ continue;
+ pda_p->type = RF_PDA_TYPE_DATA;
+ pda_p->raidAddress = sosAddr + (i * secPerSU);
+ (raidPtr->Layout.map->MapSector)(raidPtr,pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0);
+ /* skip over dead disks */
+ if (RF_DEAD_DISK(raidPtr->Disks[pda_p->row][pda_p->col].status))
+ continue;
+ switch (state)
+ {
+ case 1: /* fone */
+ pda_p->numSector = fone->numSector;
+ pda_p->raidAddress += fone_start;
+ pda_p->startSector += fone_start;
+ RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList);
+ break;
+ case 2: /* full stripe */
+ pda_p->numSector = secPerSU;
+ RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,secPerSU), (char *), allocList);
+ break;
+ case 3: /* two slabs */
+ pda_p->numSector = fone->numSector;
+ pda_p->raidAddress += fone_start;
+ pda_p->startSector += fone_start;
+ RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList);
+ pda_p++;
+ pda_p->type = RF_PDA_TYPE_DATA;
+ pda_p->raidAddress = sosAddr + (i * secPerSU);
+ (raidPtr->Layout.map->MapSector)(raidPtr,pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0);
+ pda_p->numSector = ftwo->numSector;
+ pda_p->raidAddress += ftwo_start;
+ pda_p->startSector += ftwo_start;
+ RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList);
+ break;
+ default:
+ RF_PANIC();
+ }
+ pda_p++;
+ }
+
+ RF_ASSERT (pda_p - *pdap == napdas);
+ return;
+}
+
+#define INIT_DISK_NODE(node,name) \
+rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 2,1,4,0, dag_h, name, allocList); \
+(node)->succedents[0] = unblockNode; \
+(node)->succedents[1] = recoveryNode; \
+(node)->antecedents[0] = blockNode; \
+(node)->antType[0] = rf_control
+
+#define DISK_NODE_PARAMS(_node_,_p_) \
+ (_node_).params[0].p = _p_ ; \
+ (_node_).params[1].p = (_p_)->bufPtr; \
+ (_node_).params[2].v = parityStripeID; \
+ (_node_).params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru)
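+
+/*
+ * Taken together, the two macros above set up one read node of the
+ * double-degraded read graph built below: INIT_DISK_NODE wires the node to the
+ * block node (antecedent) and to both the unblock and recovery nodes
+ * (successors), and DISK_NODE_PARAMS fills in the usual four disk-node
+ * parameters (pda, buffer, parity stripe ID, packed priority/which_ru value),
+ * e.g. INIT_DISK_NODE(rudNodes+i,"Rud"); DISK_NODE_PARAMS(rudNodes[i],pda);
+ */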
+
+void rf_DoubleDegRead(
+ RF_Raid_t *raidPtr,
+ RF_AccessStripeMap_t *asmap,
+ RF_DagHeader_t *dag_h,
+ void *bp,
+ RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t *allocList,
+ char *redundantReadNodeName,
+ char *recoveryNodeName,
+ int (*recovFunc)(RF_DagNode_t *))
+{
+ RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
+ RF_DagNode_t *nodes, *rudNodes, *rrdNodes, *recoveryNode, *blockNode, *unblockNode, *rpNodes, *rqNodes, *termNode;
+ RF_PhysDiskAddr_t *pda, *pqPDAs;
+ RF_PhysDiskAddr_t *npdas;
+ int nNodes, nRrdNodes, nRudNodes, i;
+ RF_ReconUnitNum_t which_ru;
+ int nReadNodes, nPQNodes;
+ RF_PhysDiskAddr_t *failedPDA = asmap->failedPDAs[0];
+ RF_PhysDiskAddr_t *failedPDAtwo = asmap->failedPDAs[1];
+ RF_StripeNum_t parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr, asmap->raidAddress, &which_ru);
+
+ if (rf_dagDebug) printf("[Creating Double Degraded Read DAG]\n");
+ rf_DD_GenerateFailedAccessASMs(raidPtr, asmap, &npdas, &nRrdNodes, &pqPDAs, &nPQNodes,allocList);
+
+ nRudNodes = asmap->numStripeUnitsAccessed - (asmap->numDataFailed);
+ nReadNodes = nRrdNodes + nRudNodes + 2*nPQNodes;
+ nNodes = 4 /* block, unblock, recovery, term */ + nReadNodes;
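+
+ /*
+ * Hypothetical example of the counts above: a single-stripe access with two
+ * failed data units, 4 stripe units accessed, 3 recovery reads and
+ * nPQNodes = 1 gives nRudNodes = 4 - 2 = 2, nReadNodes = 3 + 2 + 2*1 = 7,
+ * and nNodes = 4 + 7 = 11.
+ */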
+
+ RF_CallocAndAdd(nodes, nNodes, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList);
+ i = 0;
+ blockNode = &nodes[i]; i += 1;
+ unblockNode = &nodes[i]; i += 1;
+ recoveryNode = &nodes[i]; i += 1;
+ termNode = &nodes[i]; i += 1;
+ rudNodes = &nodes[i]; i += nRudNodes;
+ rrdNodes = &nodes[i]; i += nRrdNodes;
+ rpNodes = &nodes[i]; i += nPQNodes;
+ rqNodes = &nodes[i]; i += nPQNodes;
+ RF_ASSERT(i == nNodes);
+
+ dag_h->numSuccedents = 1;
+ dag_h->succedents[0] = blockNode;
+ dag_h->creator = "DoubleDegRead";
+ dag_h->numCommits = 0;
+ dag_h->numCommitNodes = 1; /* unblock */
+
+ rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, 2, 0, 0, dag_h, "Trm", allocList);
+ termNode->antecedents[0] = unblockNode;
+ termNode->antType[0] = rf_control;
+ termNode->antecedents[1] = recoveryNode;
+ termNode->antType[1] = rf_control;
+
+ /* init the block and unblock nodes */
+ /* The block node has all of the read nodes (i.e. every node except itself, unblock,
+ recovery, and term) as successors.  Similarly for the predecessors of the unblock. */
+ rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nReadNodes, 0, 0, 0, dag_h, "Nil", allocList);
+ rf_InitNode(unblockNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, 1, nReadNodes, 0, 0, dag_h, "Nil", allocList);
+
+ for (i=0; i < nReadNodes; i++)
+ {
+ blockNode->succedents[i] = rudNodes+i;
+ unblockNode->antecedents[i] = rudNodes+i;
+ unblockNode->antType[i] = rf_control;
+ }
+ unblockNode->succedents[0] = termNode;
+
+ /* The recovery node has all the reads as predecessors and the term node as its
+ successor.  It gets a pda as a param from each of the read nodes, plus the
+ raidPtr and the asmap.  For each failed unit it has a result pda. */
+ rf_InitNode(recoveryNode, rf_wait, RF_FALSE, recovFunc, rf_NullNodeUndoFunc, NULL,
+ 1, /* successors */
+ nReadNodes, /* preds */
+ nReadNodes+2, /* params */
+ asmap->numDataFailed, /* results */
+ dag_h, recoveryNodeName, allocList);
+
+ recoveryNode->succedents[0] = termNode;
+ for (i=0; i < nReadNodes; i++) {
+ recoveryNode->antecedents[i] = rudNodes+i;
+ recoveryNode->antType[i] = rf_trueData;
+ }
+
+ /* build the read nodes, then come back and fill in recovery params and results */
+ pda = asmap->physInfo;
+ for (i=0; i < nRudNodes; pda = pda->next)
+ {
+ if ((pda == failedPDA) || (pda == failedPDAtwo))
+ continue;
+ INIT_DISK_NODE(rudNodes+i,"Rud");
+ RF_ASSERT(pda);
+ DISK_NODE_PARAMS(rudNodes[i],pda);
+ i++;
+ }
+
+ pda = npdas;
+ for (i=0; i < nRrdNodes; i++, pda = pda->next)
+ {
+ INIT_DISK_NODE(rrdNodes+i,"Rrd");
+ RF_ASSERT(pda);
+ DISK_NODE_PARAMS(rrdNodes[i],pda);
+ }
+
+ /* redundancy pdas */
+ pda = pqPDAs;
+ INIT_DISK_NODE(rpNodes,"Rp");
+ RF_ASSERT(pda);
+ DISK_NODE_PARAMS(rpNodes[0],pda);
+ pda++;
+ INIT_DISK_NODE(rqNodes,redundantReadNodeName );
+ RF_ASSERT(pda);
+ DISK_NODE_PARAMS(rqNodes[0],pda);
+ if (nPQNodes==2)
+ {
+ pda++;
+ INIT_DISK_NODE(rpNodes+1,"Rp");
+ RF_ASSERT(pda);
+ DISK_NODE_PARAMS(rpNodes[1],pda);
+ pda++;
+ INIT_DISK_NODE( rqNodes+1,redundantReadNodeName );
+ RF_ASSERT(pda);
+ DISK_NODE_PARAMS(rqNodes[1],pda);
+ }
+
+ /* fill in recovery node params */
+ for (i=0; i < nReadNodes; i++)
+ recoveryNode->params[i] = rudNodes[i].params[0]; /* pda */
+ recoveryNode->params[i++].p = (void *) raidPtr;
+ recoveryNode->params[i++].p = (void *) asmap;
+ recoveryNode->results[0] = failedPDA;
+ if (asmap->numDataFailed ==2 )
+ recoveryNode->results[1] = failedPDAtwo;
+
+ /* zero fill the target data buffers? */
+}
diff --git a/sys/dev/raidframe/rf_dagdegrd.h b/sys/dev/raidframe/rf_dagdegrd.h
new file mode 100644
index 00000000000..3e0bce1c7ff
--- /dev/null
+++ b/sys/dev/raidframe/rf_dagdegrd.h
@@ -0,0 +1,88 @@
+/* $OpenBSD: rf_dagdegrd.h,v 1.1 1999/01/11 14:29:07 niklas Exp $ */
+/* $NetBSD: rf_dagdegrd.h,v 1.1 1998/11/13 04:20:27 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland, Daniel Stodolsky, William V. Courtright II
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*
+ * :
+ * Log: rf_dagdegrd.h,v
+ * Revision 1.6 1996/07/31 16:29:06 jimz
+ * asm/asmap re-fix (EO merge)
+ *
+ * Revision 1.5 1996/07/31 15:34:40 jimz
+ * evenodd changes; bugfixes for double-degraded archs, generalize
+ * some formerly PQ-only functions
+ *
+ * Revision 1.4 1996/07/22 19:52:16 jimz
+ * switched node params to RF_DagParam_t, a union of
+ * a 64-bit int and a void *, for better portability
+ * attempted hpux port, but failed partway through for
+ * lack of a single C compiler capable of compiling all
+ * source files
+ *
+ * Revision 1.3 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.2 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.1 1996/05/03 19:22:06 wvcii
+ * Initial revision
+ *
+ */
+
+#ifndef _RF__RF_DAGDEGRD_H_
+#define _RF__RF_DAGDEGRD_H_
+
+#include "rf_types.h"
+
+/* degraded read DAG creation routines */
+void rf_CreateRaidFiveDegradedReadDAG(RF_Raid_t *raidPtr,
+ RF_AccessStripeMap_t *asmap, RF_DagHeader_t *dag_h, void *bp,
+ RF_RaidAccessFlags_t flags, RF_AllocListElem_t *allocList);
+void rf_CreateRaidOneDegradedReadDAG(RF_Raid_t *raidPtr,
+ RF_AccessStripeMap_t *asmap, RF_DagHeader_t *dag_h, void *bp,
+ RF_RaidAccessFlags_t flags, RF_AllocListElem_t *allocList);
+void rf_CreateDegradedReadDAG(RF_Raid_t *raidPtr,
+ RF_AccessStripeMap_t *asmap, RF_DagHeader_t *dag_h, void *bp,
+ RF_RaidAccessFlags_t flags, RF_AllocListElem_t *allocList,
+ RF_RedFuncs_t *recFunc);
+void rf_CreateRaidCDegradedReadDAG(RF_Raid_t *raidPtr,
+ RF_AccessStripeMap_t *asmap, RF_DagHeader_t *dag_h, void *bp,
+ RF_RaidAccessFlags_t flags, RF_AllocListElem_t *allocList);
+void rf_DD_GenerateFailedAccessASMs(RF_Raid_t *raidPtr,
+ RF_AccessStripeMap_t *asmap, RF_PhysDiskAddr_t **pdap,
+ int *nNodep, RF_PhysDiskAddr_t **pqpdap, int *nPQNodep,
+ RF_AllocListElem_t *allocList);
+void rf_DoubleDegRead(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
+ RF_DagHeader_t *dag_h, void *bp, RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t *allocList, char *redundantReadNodeName,
+ char *recoveryNodeName, int (*recovFunc)(RF_DagNode_t *));
+
+#endif /* !_RF__RF_DAGDEGRD_H_ */
diff --git a/sys/dev/raidframe/rf_dagdegwr.c b/sys/dev/raidframe/rf_dagdegwr.c
new file mode 100644
index 00000000000..a712dd1e83b
--- /dev/null
+++ b/sys/dev/raidframe/rf_dagdegwr.c
@@ -0,0 +1,969 @@
+/* $OpenBSD: rf_dagdegwr.c,v 1.1 1999/01/11 14:29:07 niklas Exp $ */
+/* $NetBSD: rf_dagdegwr.c,v 1.1 1998/11/13 04:20:27 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland, Daniel Stodolsky, William V. Courtright II
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*
+ * rf_dagdegwr.c
+ *
+ * code for creating degraded write DAGs
+ *
+ * :
+ * Log: rf_dagdegwr.c,v
+ * Revision 1.23 1996/11/05 21:10:40 jimz
+ * failed pda generalization
+ *
+ * Revision 1.22 1996/08/23 14:49:48 jimz
+ * remove bogus assert from small write double deg DAG generator
+ *
+ * Revision 1.21 1996/08/21 05:09:44 jimz
+ * get rid of bogus fakery in DoubleDegSmallWrite
+ *
+ * Revision 1.20 1996/08/21 04:14:35 jimz
+ * cleanup doubledegsmallwrite
+ * NOTE: we need doubledeglargewrite
+ *
+ * Revision 1.19 1996/08/19 21:39:38 jimz
+ * CommonCreateSimpleDegradedWriteDAG() was unable to correctly create DAGs for
+ * complete stripe overwrite accesses- it assumed the necessity to read old
+ * data. Rather than do the "right" thing, and risk breaking a critical DAG so
+ * close to release, I made a no-op read node to stick in and link up in this
+ * case. Seems to work.
+ *
+ * Revision 1.18 1996/07/31 15:35:34 jimz
+ * evenodd changes; bugfixes for double-degraded archs, generalize
+ * some formerly PQ-only functions
+ *
+ * Revision 1.17 1996/07/28 20:31:39 jimz
+ * i386netbsd port
+ * true/false fixup
+ *
+ * Revision 1.16 1996/07/27 23:36:08 jimz
+ * Solaris port of simulator
+ *
+ * Revision 1.15 1996/07/27 16:30:19 jimz
+ * cleanup sweep
+ *
+ * Revision 1.14 1996/07/22 19:52:16 jimz
+ * switched node params to RF_DagParam_t, a union of
+ * a 64-bit int and a void *, for better portability
+ * attempted hpux port, but failed partway through for
+ * lack of a single C compiler capable of compiling all
+ * source files
+ *
+ * Revision 1.13 1996/06/09 02:36:46 jimz
+ * lots of little crufty cleanup- fixup whitespace
+ * issues, comment #ifdefs, improve typing in some
+ * places (esp size-related)
+ *
+ * Revision 1.12 1996/06/07 22:26:27 jimz
+ * type-ify which_ru (RF_ReconUnitNum_t)
+ *
+ * Revision 1.11 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.10 1996/05/31 22:26:54 jimz
+ * fix a lot of mapping problems, memory allocation problems
+ * found some weird lock issues, fixed 'em
+ * more code cleanup
+ *
+ * Revision 1.9 1996/05/30 11:29:41 jimz
+ * Numerous bug fixes. Stripe lock release code disagreed with the taking code
+ * about when stripes should be locked (I made it consistent: no parity, no lock)
+ * There was a lot of extra serialization of I/Os which I've removed- a lot of
+ * it was to calculate values for the cache code, which is no longer with us.
+ * More types, function, macro cleanup. Added code to properly quiesce the array
+ * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
+ * before. Fixed memory allocation, freeing bugs.
+ *
+ * Revision 1.8 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.7 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.6 1996/05/24 04:28:55 jimz
+ * release cleanup ckpt
+ *
+ * Revision 1.5 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.4 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.3 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.2 1996/05/08 21:01:24 jimz
+ * fixed up enum type names that were conflicting with other
+ * enums and function names (ie, "panic")
+ * future naming trends will be towards RF_ and rf_ for
+ * everything raidframe-related
+ *
+ * Revision 1.1 1996/05/03 19:21:50 wvcii
+ * Initial revision
+ *
+ */
+
+#include "rf_types.h"
+#include "rf_raid.h"
+#include "rf_dag.h"
+#include "rf_dagutils.h"
+#include "rf_dagfuncs.h"
+#include "rf_threadid.h"
+#include "rf_debugMem.h"
+#include "rf_memchunk.h"
+#include "rf_general.h"
+#include "rf_dagdegwr.h"
+#include "rf_sys.h"
+
+
+/******************************************************************************
+ *
+ * General comments on DAG creation:
+ *
+ * All DAGs in this file use roll-away error recovery. Each DAG has a single
+ * commit node, usually called "Cmt." If an error occurs before the Cmt node
+ * is reached, the execution engine will halt forward execution and work
+ * backward through the graph, executing the undo functions. Assuming that
+ * each node in the graph prior to the Cmt node are undoable and atomic - or -
+ * does not make changes to permanent state, the graph will fail atomically.
+ * If an error occurs after the Cmt node executes, the engine will roll-forward
+ * through the graph, blindly executing nodes until it reaches the end.
+ * If a graph reaches the end, it is assumed to have completed successfully.
+ *
+ * A graph has only 1 Cmt node.
+ *
+ */
+
+
+/******************************************************************************
+ *
+ * The following wrappers map the standard DAG creation interface to the
+ * DAG creation routines. Additionally, these wrappers enable experimentation
+ * with new DAG structures by providing an extra level of indirection, allowing
+ * the DAG creation routines to be replaced at this single point.
+ */
+
+static RF_CREATE_DAG_FUNC_DECL(rf_CreateSimpleDegradedWriteDAG)
+{
+ rf_CommonCreateSimpleDegradedWriteDAG(raidPtr, asmap, dag_h, bp,
+ flags, allocList,1, rf_RecoveryXorFunc, RF_TRUE);
+}
+
+void rf_CreateDegradedWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList)
+ RF_Raid_t *raidPtr;
+ RF_AccessStripeMap_t *asmap;
+ RF_DagHeader_t *dag_h;
+ void *bp;
+ RF_RaidAccessFlags_t flags;
+ RF_AllocListElem_t *allocList;
+{
+ RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
+ RF_PhysDiskAddr_t *failedPDA = asmap->failedPDAs[0];
+
+ RF_ASSERT( asmap->numDataFailed == 1 );
+ dag_h->creator = "DegradedWriteDAG";
+
+ /* if the access writes only a portion of the failed unit, and also writes
+ * some portion of at least one surviving unit, we create two DAGs, one for
+ * the failed component and one for the non-failed component, and do them
+ * sequentially. Note that the fact that we're accessing only a portion of
+ * the failed unit indicates that the access either starts or ends in the
+ * failed unit, and hence we need to create only two dags. This is inefficient
+ * in that the same data or parity can get read and written twice using this
+ * structure. I need to fix this to do the access all at once.
+ */
+ RF_ASSERT(!(asmap->numStripeUnitsAccessed != 1 && failedPDA->numSector != layoutPtr->sectorsPerStripeUnit));
+ rf_CreateSimpleDegradedWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList);
+}
+
+
+
+/******************************************************************************
+ *
+ * DAG creation code begins here
+ */
+
+
+
+/******************************************************************************
+ *
+ * CommonCreateSimpleDegradedWriteDAG -- creates a DAG to do a degraded-mode
+ * write, which is as follows
+ *
+ * / {Wnq} --\
+ * hdr -> blockNode -> Rod -> Xor -> Cmt -> Wnp ----> unblock -> term
+ * \ {Rod} / \ Wnd ---/
+ * \ {Wnd} -/
+ *
+ * commit nodes: Xor, Wnd
+ *
+ * IMPORTANT:
+ * This DAG generator does not work for double-degraded archs since it does not
+ * generate Q
+ *
+ * This dag is essentially identical to the large-write dag, except that the
+ * write to the failed data unit is suppressed.
+ *
+ * IMPORTANT: this dag does not work in the case where the access writes only
+ * a portion of the failed unit, and also writes some portion of at least one
+ * surviving SU.  This case is handled in CreateDegradedWriteDAG above.
+ *
+ * The block & unblock nodes are leftovers from a previous version. They
+ * do nothing, but I haven't deleted them because it would be a tremendous
+ * effort to put them back in.
+ *
+ * This dag is used whenever one of the data units in a write has failed.
+ * If it is the parity unit that failed, the nonredundant write dag (below)
+ * is used.
+ *****************************************************************************/
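+
+/*
+ * A hypothetical sizing example for the code below: with nfaults = 1, a write
+ * touching 3 stripe units (one of them on the failed disk) and 2 recovery
+ * reads, nWndNodes = 3 - 1 = 2, nRrdNodes = 2, and
+ * nNodes = 5 + 1 + 2 + 2 = 10 (block, commit, unblock, term, xor, Wnp,
+ * two Wnd, two Rrd).
+ */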
+
+void rf_CommonCreateSimpleDegradedWriteDAG(raidPtr, asmap, dag_h, bp, flags,
+ allocList, nfaults, redFunc, allowBufferRecycle)
+ RF_Raid_t *raidPtr;
+ RF_AccessStripeMap_t *asmap;
+ RF_DagHeader_t *dag_h;
+ void *bp;
+ RF_RaidAccessFlags_t flags;
+ RF_AllocListElem_t *allocList;
+ int nfaults;
+ int (*redFunc)(RF_DagNode_t *);
+ int allowBufferRecycle;
+{
+ int nNodes, nRrdNodes, nWndNodes, nXorBufs, i, j, paramNum, rdnodesFaked;
+ RF_DagNode_t *blockNode, *unblockNode, *wnpNode, *wnqNode, *termNode;
+ RF_DagNode_t *nodes, *wndNodes, *rrdNodes, *xorNode, *commitNode;
+ RF_SectorCount_t sectorsPerSU;
+ RF_ReconUnitNum_t which_ru;
+ char *xorTargetBuf = NULL; /* the target buffer for the XOR operation */
+ char *overlappingPDAs; /* a temporary array of flags */
+ RF_AccessStripeMapHeader_t *new_asm_h[2];
+ RF_PhysDiskAddr_t *pda, *parityPDA;
+ RF_StripeNum_t parityStripeID;
+ RF_PhysDiskAddr_t *failedPDA;
+ RF_RaidLayout_t *layoutPtr;
+
+ layoutPtr = &(raidPtr->Layout);
+ parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr, asmap->raidAddress,
+ &which_ru);
+ sectorsPerSU = layoutPtr->sectorsPerStripeUnit;
+ /* failedPDA points to the pda within the asm that targets the failed disk */
+ failedPDA = asmap->failedPDAs[0];
+
+ if (rf_dagDebug)
+ printf("[Creating degraded-write DAG]\n");
+
+ RF_ASSERT( asmap->numDataFailed == 1 );
+ dag_h->creator = "SimpleDegradedWriteDAG";
+
+ /*
+ * Generate two ASMs identifying the surviving data
+ * we need in order to recover the lost data.
+ */
+ /* overlappingPDAs array must be zero'd */
+ RF_Calloc(overlappingPDAs, asmap->numStripeUnitsAccessed, sizeof(char), (char *));
+ rf_GenerateFailedAccessASMs(raidPtr, asmap, failedPDA, dag_h, new_asm_h,
+ &nXorBufs, NULL, overlappingPDAs, allocList);
+
+ /* create all the nodes at once */
+ nWndNodes = asmap->numStripeUnitsAccessed - 1; /* no access is generated
+ * for the failed pda */
+
+ nRrdNodes = ((new_asm_h[0]) ? new_asm_h[0]->stripeMap->numStripeUnitsAccessed : 0) +
+ ((new_asm_h[1]) ? new_asm_h[1]->stripeMap->numStripeUnitsAccessed : 0);
+ /*
+ * XXX
+ *
+ * There's a bug with a complete stripe overwrite- that means 0 reads
+ * of old data, and the rest of the DAG generation code doesn't like
+ * that. A release is coming, and I don't wanna risk breaking a critical
+ * DAG generator, so here's what I'm gonna do- if there's no read nodes,
+ * I'm gonna fake there being a read node, and I'm gonna swap in a
+ * no-op node in its place (to make all the link-up code happy).
+ * This should be fixed at some point. --jimz
+ */
+ if (nRrdNodes == 0) {
+ nRrdNodes = 1;
+ rdnodesFaked = 1;
+ }
+ else {
+ rdnodesFaked = 0;
+ }
+ /* block, commit, unblock, term, xor, plus Wnd, Rrd, and W(nfaults) nodes */
+ nNodes = 5 + nfaults + nWndNodes + nRrdNodes;
+ RF_CallocAndAdd(nodes, nNodes, sizeof(RF_DagNode_t),
+ (RF_DagNode_t *), allocList);
+ i = 0;
+ blockNode = &nodes[i]; i += 1;
+ commitNode = &nodes[i]; i += 1;
+ unblockNode = &nodes[i]; i += 1;
+ termNode = &nodes[i]; i += 1;
+ xorNode = &nodes[i]; i += 1;
+ wnpNode = &nodes[i]; i += 1;
+ wndNodes = &nodes[i]; i += nWndNodes;
+ rrdNodes = &nodes[i]; i += nRrdNodes;
+ if (nfaults == 2) {
+ wnqNode = &nodes[i]; i += 1;
+ }
+ else {
+ wnqNode = NULL;
+ }
+ RF_ASSERT(i == nNodes);
+
+ /* this dag cannot commit until all Rrd and Xor nodes have completed */
+ dag_h->numCommitNodes = 1;
+ dag_h->numCommits = 0;
+ dag_h->numSuccedents = 1;
+
+ RF_ASSERT( nRrdNodes > 0 );
+ rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
+ NULL, nRrdNodes, 0, 0, 0, dag_h, "Nil", allocList);
+ rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
+ NULL, nWndNodes + nfaults, 1, 0, 0, dag_h, "Cmt", allocList);
+ rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
+ NULL, 1, nWndNodes + nfaults, 0, 0, dag_h, "Nil", allocList);
+ rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc,
+ NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
+ rf_InitNode(xorNode, rf_wait, RF_FALSE, redFunc, rf_NullNodeUndoFunc, NULL, 1,
+ nRrdNodes, 2*nXorBufs+2, nfaults, dag_h, "Xrc", allocList);
+
+ /*
+ * Fill in the Rrd nodes. If any of the rrd buffers are the same size as
+ * the failed buffer, save a pointer to it so we can use it as the target
+ * of the XOR. The pdas in the rrd nodes have been range-restricted, so if
+ * a buffer is the same size as the failed buffer, it must also be at the
+ * same alignment within the SU.
+ */
+ i = 0;
+ if (new_asm_h[0]) {
+ for (i=0, pda=new_asm_h[0]->stripeMap->physInfo;
+ i<new_asm_h[0]->stripeMap->numStripeUnitsAccessed;
+ i++, pda=pda->next)
+ {
+ rf_InitNode(&rrdNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc,
+ rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rrd", allocList);
+ RF_ASSERT(pda);
+ rrdNodes[i].params[0].p = pda;
+ rrdNodes[i].params[1].p = pda->bufPtr;
+ rrdNodes[i].params[2].v = parityStripeID;
+ rrdNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ }
+ }
+ /* i now equals the number of stripe units accessed in new_asm_h[0] */
+ if (new_asm_h[1]) {
+ for (j=0,pda=new_asm_h[1]->stripeMap->physInfo;
+ j<new_asm_h[1]->stripeMap->numStripeUnitsAccessed;
+ j++, pda=pda->next)
+ {
+ rf_InitNode(&rrdNodes[i+j], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc,
+ rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rrd", allocList);
+ RF_ASSERT(pda);
+ rrdNodes[i+j].params[0].p = pda;
+ rrdNodes[i+j].params[1].p = pda->bufPtr;
+ rrdNodes[i+j].params[2].v = parityStripeID;
+ rrdNodes[i+j].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ if (allowBufferRecycle && (pda->numSector == failedPDA->numSector))
+ xorTargetBuf = pda->bufPtr;
+ }
+ }
+ if (rdnodesFaked) {
+ /*
+ * This is where we'll init that fake noop read node
+ * (XXX should the wakeup func be different?)
+ */
+ rf_InitNode(&rrdNodes[0], rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
+ NULL, 1, 1, 0, 0, dag_h, "RrN", allocList);
+ }
+
+ /*
+ * Make a PDA for the parity unit. The parity PDA should start at
+ * the same offset into the SU as the failed PDA.
+ */
+ /*
+ * Danner comment:
+ * I don't think this copy is really necessary.
+ * We are in one of two cases here.
+ * (1) The entire failed unit is written. Then asmap->parityInfo will
+ * describe the entire parity.
+ * (2) We are only writing a subset of the failed unit and nothing
+ * else. Then the asmap->parityInfo describes the failed unit and
+ * the copy can also be avoided.
+ */
+
+ RF_MallocAndAdd(parityPDA, sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList);
+ parityPDA->row = asmap->parityInfo->row;
+ parityPDA->col = asmap->parityInfo->col;
+ parityPDA->startSector = ((asmap->parityInfo->startSector / sectorsPerSU)
+ * sectorsPerSU) + (failedPDA->startSector % sectorsPerSU);
+ parityPDA->numSector = failedPDA->numSector;
+
+ if (!xorTargetBuf) {
+ RF_CallocAndAdd(xorTargetBuf, 1,
+ rf_RaidAddressToByte(raidPtr, failedPDA->numSector), (char *), allocList);
+ }
+
+ /* init the Wnp node */
+ rf_InitNode(wnpNode, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
+ rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnp", allocList);
+ wnpNode->params[0].p = parityPDA;
+ wnpNode->params[1].p = xorTargetBuf;
+ wnpNode->params[2].v = parityStripeID;
+ wnpNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+
+ /* fill in the Wnq Node */
+ if (nfaults == 2) {
+ {
+ RF_MallocAndAdd(parityPDA, sizeof(RF_PhysDiskAddr_t),
+ (RF_PhysDiskAddr_t *), allocList);
+ parityPDA->row = asmap->qInfo->row;
+ parityPDA->col = asmap->qInfo->col;
+ parityPDA->startSector = ((asmap->qInfo->startSector / sectorsPerSU)
+ * sectorsPerSU) + (failedPDA->startSector % sectorsPerSU);
+ parityPDA->numSector = failedPDA->numSector;
+
+ rf_InitNode(wnqNode, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
+ rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnq", allocList);
+ wnqNode->params[0].p = parityPDA;
+ RF_CallocAndAdd(xorNode->results[1], 1,
+ rf_RaidAddressToByte(raidPtr, failedPDA->numSector), (char *), allocList);
+ wnqNode->params[1].p = xorNode->results[1];
+ wnqNode->params[2].v = parityStripeID;
+ wnqNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ }
+ }
+
+ /* fill in the Wnd nodes */
+ for (pda=asmap->physInfo, i=0; i<nWndNodes; i++, pda=pda->next) {
+ if (pda == failedPDA) {
+ i--;
+ continue;
+ }
+ rf_InitNode(&wndNodes[i], rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
+ rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnd", allocList);
+ RF_ASSERT(pda);
+ wndNodes[i].params[0].p = pda;
+ wndNodes[i].params[1].p = pda->bufPtr;
+ wndNodes[i].params[2].v = parityStripeID;
+ wndNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ }
+
+ /* fill in the results of the xor node */
+ xorNode->results[0] = xorTargetBuf;
+
+ /* fill in the params of the xor node */
+
+ paramNum=0;
+ if (rdnodesFaked == 0) {
+ for (i=0; i<nRrdNodes; i++) {
+ /* all the Rrd nodes need to be xored together */
+ xorNode->params[paramNum++] = rrdNodes[i].params[0];
+ xorNode->params[paramNum++] = rrdNodes[i].params[1];
+ }
+ }
+ for (i=0; i < nWndNodes; i++) {
+ /* any Wnd nodes that overlap the failed access need to be xored in */
+ if (overlappingPDAs[i]) {
+ RF_MallocAndAdd(pda, sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList);
+ bcopy((char *)wndNodes[i].params[0].p, (char *)pda, sizeof(RF_PhysDiskAddr_t));
+ rf_RangeRestrictPDA(raidPtr, failedPDA, pda, RF_RESTRICT_DOBUFFER, 0);
+ xorNode->params[paramNum++].p = pda;
+ xorNode->params[paramNum++].p = pda->bufPtr;
+ }
+ }
+ RF_Free(overlappingPDAs, asmap->numStripeUnitsAccessed * sizeof(char));
+
+ /*
+ * Install the failed PDA into the xor param list so that the
+ * new data gets xor'd in.
+ */
+ xorNode->params[paramNum++].p = failedPDA;
+ xorNode->params[paramNum++].p = failedPDA->bufPtr;
+
+	/*
+	 * The last 2 params to the recovery xor node are always the failed
+	 * PDA and the raidPtr. We install the failed PDA again, even though
+	 * we have just done so above, so that the same XOR function can be
+	 * used for both degraded reads and degraded writes.
+	 */
+ xorNode->params[paramNum++].p = failedPDA;
+ xorNode->params[paramNum++].p = raidPtr;
+ RF_ASSERT( paramNum == 2*nXorBufs+2 );
+
+ /*
+ * Code to link nodes begins here
+ */
+
+ /* link header to block node */
+ RF_ASSERT(blockNode->numAntecedents == 0);
+ dag_h->succedents[0] = blockNode;
+
+ /* link block node to rd nodes */
+ RF_ASSERT(blockNode->numSuccedents == nRrdNodes);
+ for (i = 0; i < nRrdNodes; i++) {
+ RF_ASSERT(rrdNodes[i].numAntecedents == 1);
+ blockNode->succedents[i] = &rrdNodes[i];
+ rrdNodes[i].antecedents[0] = blockNode;
+ rrdNodes[i].antType[0] = rf_control;
+ }
+
+ /* link read nodes to xor node*/
+ RF_ASSERT(xorNode->numAntecedents == nRrdNodes);
+ for (i = 0; i < nRrdNodes; i++) {
+ RF_ASSERT(rrdNodes[i].numSuccedents == 1);
+ rrdNodes[i].succedents[0] = xorNode;
+ xorNode->antecedents[i] = &rrdNodes[i];
+ xorNode->antType[i] = rf_trueData;
+ }
+
+ /* link xor node to commit node */
+ RF_ASSERT(xorNode->numSuccedents == 1);
+ RF_ASSERT(commitNode->numAntecedents == 1);
+ xorNode->succedents[0] = commitNode;
+ commitNode->antecedents[0] = xorNode;
+ commitNode->antType[0] = rf_control;
+
+ /* link commit node to wnd nodes */
+ RF_ASSERT(commitNode->numSuccedents == nfaults + nWndNodes);
+ for (i = 0; i < nWndNodes; i++) {
+ RF_ASSERT(wndNodes[i].numAntecedents == 1);
+ commitNode->succedents[i] = &wndNodes[i];
+ wndNodes[i].antecedents[0] = commitNode;
+ wndNodes[i].antType[0] = rf_control;
+ }
+
+ /* link the commit node to wnp, wnq nodes */
+ RF_ASSERT(wnpNode->numAntecedents == 1);
+ commitNode->succedents[nWndNodes] = wnpNode;
+ wnpNode->antecedents[0] = commitNode;
+ wnpNode->antType[0] = rf_control;
+ if (nfaults == 2) {
+ RF_ASSERT(wnqNode->numAntecedents == 1);
+ commitNode->succedents[nWndNodes + 1] = wnqNode;
+ wnqNode->antecedents[0] = commitNode;
+ wnqNode->antType[0] = rf_control;
+ }
+
+ /* link write new data nodes to unblock node */
+ RF_ASSERT(unblockNode->numAntecedents == (nWndNodes + nfaults));
+ for(i = 0; i < nWndNodes; i++) {
+ RF_ASSERT(wndNodes[i].numSuccedents == 1);
+ wndNodes[i].succedents[0] = unblockNode;
+ unblockNode->antecedents[i] = &wndNodes[i];
+ unblockNode->antType[i] = rf_control;
+ }
+
+ /* link write new parity node to unblock node */
+ RF_ASSERT(wnpNode->numSuccedents == 1);
+ wnpNode->succedents[0] = unblockNode;
+ unblockNode->antecedents[nWndNodes] = wnpNode;
+ unblockNode->antType[nWndNodes] = rf_control;
+
+ /* link write new q node to unblock node */
+ if (nfaults == 2) {
+ RF_ASSERT(wnqNode->numSuccedents == 1);
+ wnqNode->succedents[0] = unblockNode;
+ unblockNode->antecedents[nWndNodes+1] = wnqNode;
+ unblockNode->antType[nWndNodes+1] = rf_control;
+ }
+
+ /* link unblock node to term node */
+ RF_ASSERT(unblockNode->numSuccedents == 1);
+ RF_ASSERT(termNode->numAntecedents == 1);
+ RF_ASSERT(termNode->numSuccedents == 0);
+ unblockNode->succedents[0] = termNode;
+ termNode->antecedents[0] = unblockNode;
+ termNode->antType[0] = rf_control;
+}
+
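+/*
+ * CONS_PDA(if, start, num) builds the PDA pointed to by pda_p from
+ * asmap->if, covering sectors [start, start+num) within the stripe unit,
+ * and allocates a data buffer for it.
+ */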
+#define CONS_PDA(if,start,num) \
+ pda_p->row = asmap->if->row; pda_p->col = asmap->if->col; \
+ pda_p->startSector = ((asmap->if->startSector / secPerSU) * secPerSU) + start; \
+ pda_p->numSector = num; \
+ pda_p->next = NULL; \
+ RF_MallocAndAdd(pda_p->bufPtr,rf_RaidAddressToByte(raidPtr,num),(char *), allocList)
+
+void rf_WriteGenerateFailedAccessASMs(
+ RF_Raid_t *raidPtr,
+ RF_AccessStripeMap_t *asmap,
+ RF_PhysDiskAddr_t **pdap,
+ int *nNodep,
+ RF_PhysDiskAddr_t **pqpdap,
+ int *nPQNodep,
+ RF_AllocListElem_t *allocList)
+{
+ RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
+ int PDAPerDisk,i;
+ RF_SectorCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
+ int numDataCol = layoutPtr->numDataCol;
+ int state;
+ unsigned napdas;
+ RF_SectorNum_t fone_start, fone_end, ftwo_start = 0, ftwo_end;
+ RF_PhysDiskAddr_t *fone = asmap->failedPDAs[0], *ftwo = asmap->failedPDAs[1];
+ RF_PhysDiskAddr_t *pda_p;
+ RF_RaidAddr_t sosAddr;
+
+	/* Determine how many PDAs we will have to generate per unaccessed
+	   stripe unit. If there is only one failed data unit, it is one; if
+	   two, possibly two, depending on whether they overlap. */
+
+ fone_start = rf_StripeUnitOffset(layoutPtr,fone->startSector);
+ fone_end = fone_start + fone->numSector;
+
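+	/*
+	 * Three cases below: state 1 = one failed data unit; state 2 = two
+	 * failed units whose combined length exceeds a stripe unit, so full
+	 * stripe units of P and Q are used; state 3 = two smaller regions,
+	 * so two PDAs per redundancy disk (four in all).
+	 */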
+ if (asmap->numDataFailed==1)
+ {
+ PDAPerDisk = 1;
+ state = 1;
+ RF_MallocAndAdd(*pqpdap,2*sizeof(RF_PhysDiskAddr_t),(RF_PhysDiskAddr_t *), allocList);
+ pda_p = *pqpdap;
+ /* build p */
+ CONS_PDA(parityInfo,fone_start,fone->numSector);
+ pda_p->type = RF_PDA_TYPE_PARITY;
+ pda_p++;
+ /* build q */
+ CONS_PDA(qInfo,fone_start,fone->numSector);
+ pda_p->type = RF_PDA_TYPE_Q;
+ }
+ else
+ {
+ ftwo_start = rf_StripeUnitOffset(layoutPtr,ftwo->startSector);
+ ftwo_end = ftwo_start + ftwo->numSector;
+ if (fone->numSector + ftwo->numSector > secPerSU)
+ {
+ PDAPerDisk = 1;
+ state = 2;
+ RF_MallocAndAdd(*pqpdap,2*sizeof(RF_PhysDiskAddr_t),(RF_PhysDiskAddr_t *), allocList);
+ pda_p = *pqpdap;
+ CONS_PDA(parityInfo,0,secPerSU);
+ pda_p->type = RF_PDA_TYPE_PARITY;
+ pda_p++;
+ CONS_PDA(qInfo,0,secPerSU);
+ pda_p->type = RF_PDA_TYPE_Q;
+ }
+ else
+ {
+ PDAPerDisk = 2;
+ state = 3;
+ /* four of them, fone, then ftwo */
+ RF_MallocAndAdd(*pqpdap,4*sizeof(RF_PhysDiskAddr_t),(RF_PhysDiskAddr_t *), allocList);
+ pda_p = *pqpdap;
+ CONS_PDA(parityInfo,fone_start,fone->numSector);
+ pda_p->type = RF_PDA_TYPE_PARITY;
+ pda_p++;
+ CONS_PDA(qInfo,fone_start,fone->numSector);
+ pda_p->type = RF_PDA_TYPE_Q;
+ pda_p++;
+ CONS_PDA(parityInfo,ftwo_start,ftwo->numSector);
+ pda_p->type = RF_PDA_TYPE_PARITY;
+ pda_p++;
+ CONS_PDA(qInfo,ftwo_start,ftwo->numSector);
+ pda_p->type = RF_PDA_TYPE_Q;
+ }
+ }
+	/* figure out the number of nonaccessed PDAs */
+ napdas = PDAPerDisk * (numDataCol - 2);
+ *nPQNodep = PDAPerDisk;
+
+ *nNodep = napdas;
+ if (napdas == 0) return; /* short circuit */
+
+	/* allocate our list of PDAs */
+
+ RF_CallocAndAdd(pda_p, napdas, sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *), allocList);
+ *pdap = pda_p;
+
+	/* link them together */
+ for (i=0; i < (napdas-1); i++)
+ pda_p[i].next = pda_p+(i+1);
+
+ sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
+ for (i=0; i < numDataCol; i++)
+ {
+ if ((pda_p - (*pdap)) == napdas)
+ continue;
+ pda_p->type = RF_PDA_TYPE_DATA;
+ pda_p->raidAddress = sosAddr + (i * secPerSU);
+ (raidPtr->Layout.map->MapSector)(raidPtr,pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0);
+ /* skip over dead disks */
+ if (RF_DEAD_DISK(raidPtr->Disks[pda_p->row][pda_p->col].status))
+ continue;
+ switch (state)
+ {
+ case 1: /* fone */
+ pda_p->numSector = fone->numSector;
+ pda_p->raidAddress += fone_start;
+ pda_p->startSector += fone_start;
+ RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList);
+ break;
+ case 2: /* full stripe */
+ pda_p->numSector = secPerSU;
+ RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,secPerSU), (char *), allocList);
+ break;
+ case 3: /* two slabs */
+ pda_p->numSector = fone->numSector;
+ pda_p->raidAddress += fone_start;
+ pda_p->startSector += fone_start;
+ RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList);
+ pda_p++;
+ pda_p->type = RF_PDA_TYPE_DATA;
+ pda_p->raidAddress = sosAddr + (i * secPerSU);
+ (raidPtr->Layout.map->MapSector)(raidPtr,pda_p->raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0);
+ pda_p->numSector = ftwo->numSector;
+ pda_p->raidAddress += ftwo_start;
+ pda_p->startSector += ftwo_start;
+ RF_MallocAndAdd(pda_p->bufPtr, rf_RaidAddressToByte(raidPtr,pda_p->numSector), (char *), allocList);
+ break;
+ default:
+ RF_PANIC();
+ }
+ pda_p++;
+ }
+
+ RF_ASSERT (pda_p - *pdap == napdas);
+ return;
+}
+
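+/*
+ * DISK_NODE_PDA fetches the PDA a disk node operates on; DISK_NODE_PARAMS
+ * fills in the standard four disk-node parameters: PDA, buffer, parity
+ * stripe ID, and the priority/reconstruction-unit word.
+ */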
+#define DISK_NODE_PDA(node) ((node)->params[0].p)
+
+#define DISK_NODE_PARAMS(_node_,_p_) \
+ (_node_).params[0].p = _p_ ; \
+ (_node_).params[1].p = (_p_)->bufPtr; \
+ (_node_).params[2].v = parityStripeID; \
+ (_node_).params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru)
+
+void rf_DoubleDegSmallWrite(
+ RF_Raid_t *raidPtr,
+ RF_AccessStripeMap_t *asmap,
+ RF_DagHeader_t *dag_h,
+ void *bp,
+ RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t *allocList,
+ char *redundantReadNodeName,
+ char *redundantWriteNodeName,
+ char *recoveryNodeName,
+ int (*recovFunc)(RF_DagNode_t *))
+{
+ RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
+ RF_DagNode_t *nodes, *wudNodes, *rrdNodes, *recoveryNode, *blockNode, *unblockNode, *rpNodes,*rqNodes, *wpNodes, *wqNodes, *termNode;
+ RF_PhysDiskAddr_t *pda, *pqPDAs;
+ RF_PhysDiskAddr_t *npdas;
+ int nWriteNodes, nNodes, nReadNodes, nRrdNodes, nWudNodes, i;
+ RF_ReconUnitNum_t which_ru;
+ int nPQNodes;
+ RF_StripeNum_t parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr, asmap->raidAddress, &which_ru);
+
+	/* Simple small-write case:
+	   the first part looks like a reconstruct-read of the failed data units,
+	   followed by a write of all data units that have not failed. */
+
+
+ /*
+ Hdr
+ |
+ ------Block-
+ / / \
+ Rrd Rrd ... Rrd Rp Rq
+ \ \ /
+ -------PQ-----
+ / \ \
+	  Wud Wp Wq
+ \ | /
+ --Unblock-
+ |
+ T
+
+ Rrd = read recovery data (potentially none)
+ Wud = write user data (not incl. failed disks)
+ Wp = Write P (could be two)
+ Wq = Write Q (could be two)
+
+ */
+
+ rf_WriteGenerateFailedAccessASMs(raidPtr, asmap, &npdas, &nRrdNodes, &pqPDAs, &nPQNodes,allocList);
+
+ RF_ASSERT(asmap->numDataFailed == 1);
+
+ nWudNodes = asmap->numStripeUnitsAccessed - (asmap->numDataFailed);
+ nReadNodes = nRrdNodes + 2*nPQNodes;
+ nWriteNodes = nWudNodes+ 2*nPQNodes;
+ nNodes = 4 + nReadNodes + nWriteNodes;
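+	/* the 4 extra nodes are the block, unblock, terminate, and recovery nodes */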
+
+ RF_CallocAndAdd(nodes, nNodes, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList);
+ blockNode = nodes;
+ unblockNode = blockNode+1;
+ termNode = unblockNode+1;
+ recoveryNode = termNode+1;
+ rrdNodes = recoveryNode+1;
+ rpNodes = rrdNodes + nRrdNodes;
+ rqNodes = rpNodes + nPQNodes;
+ wudNodes = rqNodes + nPQNodes;
+ wpNodes = wudNodes + nWudNodes;
+ wqNodes = wpNodes + nPQNodes;
+
+ dag_h->creator = "PQ_DDSimpleSmallWrite";
+ dag_h->numSuccedents = 1;
+ dag_h->succedents[0] = blockNode;
+ rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
+ termNode->antecedents[0] = unblockNode;
+ termNode->antType[0] = rf_control;
+
+ /* init the block and unblock nodes */
+ /* The block node has all the read nodes as successors */
+ rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nReadNodes, 0, 0, 0, dag_h, "Nil", allocList);
+ for (i=0; i < nReadNodes; i++)
+ blockNode->succedents[i] = rrdNodes+i;
+
+	/* The unblock node has all the writes as antecedents */
+ rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, 1, nWriteNodes, 0, 0, dag_h, "Nil", allocList);
+ for (i=0; i < nWriteNodes; i++) {
+ unblockNode->antecedents[i] = wudNodes+i;
+ unblockNode->antType[i] = rf_control;
+ }
+ unblockNode->succedents[0] = termNode;
+
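+/*
+ * INIT_READ_NODE sets up a disk-read node whose single antecedent is the
+ * block node and whose single successor is the recovery node.
+ */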
+#define INIT_READ_NODE(node,name) \
+ rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, name, allocList); \
+ (node)->succedents[0] = recoveryNode; \
+ (node)->antecedents[0] = blockNode; \
+ (node)->antType[0] = rf_control;
+
+ /* build the read nodes */
+ pda = npdas;
+ for (i=0; i < nRrdNodes; i++, pda = pda->next) {
+ INIT_READ_NODE(rrdNodes+i,"rrd");
+ DISK_NODE_PARAMS(rrdNodes[i],pda);
+ }
+
+ /* read redundancy pdas */
+ pda = pqPDAs;
+ INIT_READ_NODE(rpNodes,"Rp");
+ RF_ASSERT(pda);
+ DISK_NODE_PARAMS(rpNodes[0],pda);
+ pda++;
+ INIT_READ_NODE(rqNodes, redundantReadNodeName );
+ RF_ASSERT(pda);
+ DISK_NODE_PARAMS(rqNodes[0],pda);
+ if (nPQNodes==2)
+ {
+ pda++;
+ INIT_READ_NODE(rpNodes+1,"Rp");
+ RF_ASSERT(pda);
+ DISK_NODE_PARAMS(rpNodes[1],pda);
+ pda++;
+ INIT_READ_NODE(rqNodes+1,redundantReadNodeName );
+ RF_ASSERT(pda);
+ DISK_NODE_PARAMS(rqNodes[1],pda);
+ }
+
+	/* The recovery node has all reads as predecessors and all writes as successors.
+	   It generates a result for every write P or write Q node.
+	   As parameters, it takes a PDA per read and a PDA per stripe unit of user
+	   data written. It also takes, as its last params, the failed PDA, the
+	   raidPtr, and the asm. Its results are the PDAs for P and Q. */
+
+
+ rf_InitNode(recoveryNode, rf_wait, RF_FALSE, recovFunc, rf_NullNodeUndoFunc, NULL,
+		nWriteNodes, /* successors */
+ nReadNodes, /* preds */
+ nReadNodes + nWudNodes + 3, /* params */
+ 2 * nPQNodes, /* results */
+ dag_h, recoveryNodeName, allocList);
+
+
+
+ for (i=0; i < nReadNodes; i++ )
+ {
+ recoveryNode->antecedents[i] = rrdNodes+i;
+ recoveryNode->antType[i] = rf_control;
+ recoveryNode->params[i].p = DISK_NODE_PDA(rrdNodes+i);
+ }
+ for (i=0; i < nWudNodes; i++)
+ {
+ recoveryNode->succedents[i] = wudNodes+i;
+ }
+ recoveryNode->params[nReadNodes+nWudNodes].p = asmap->failedPDAs[0];
+ recoveryNode->params[nReadNodes+nWudNodes+1].p = raidPtr;
+ recoveryNode->params[nReadNodes+nWudNodes+2].p = asmap;
+
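+	/*
+	 * The remaining successors are the Wp/Wq nodes, which follow the Wud
+	 * nodes contiguously in the node array, so we simply keep walking i.
+	 */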
+ for ( ; i < nWriteNodes; i++)
+ recoveryNode->succedents[i] = wudNodes+i;
+
+ pda = pqPDAs;
+ recoveryNode->results[0] = pda;
+ pda++;
+ recoveryNode->results[1] = pda;
+ if ( nPQNodes == 2)
+ {
+ pda++;
+ recoveryNode->results[2] = pda;
+ pda++;
+ recoveryNode->results[3] = pda;
+ }
+
+ /* fill writes */
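+/*
+ * INIT_WRITE_NODE sets up a disk-write node whose single antecedent is the
+ * recovery node and whose single successor is the unblock node.
+ */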
+#define INIT_WRITE_NODE(node,name) \
+ rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, name, allocList); \
+ (node)->succedents[0] = unblockNode; \
+ (node)->antecedents[0] = recoveryNode; \
+ (node)->antType[0] = rf_control;
+
+ pda = asmap->physInfo;
+ for (i=0; i < nWudNodes; i++)
+ {
+ INIT_WRITE_NODE(wudNodes+i,"Wd");
+ DISK_NODE_PARAMS(wudNodes[i],pda);
+ recoveryNode->params[nReadNodes+i].p = DISK_NODE_PDA(wudNodes+i);
+ pda = pda->next;
+ }
+ /* write redundancy pdas */
+ pda = pqPDAs;
+ INIT_WRITE_NODE(wpNodes,"Wp");
+ RF_ASSERT(pda);
+ DISK_NODE_PARAMS(wpNodes[0],pda);
+ pda++;
+ INIT_WRITE_NODE(wqNodes,"Wq");
+ RF_ASSERT(pda);
+ DISK_NODE_PARAMS(wqNodes[0],pda);
+ if (nPQNodes==2)
+ {
+ pda++;
+ INIT_WRITE_NODE(wpNodes+1,"Wp");
+ RF_ASSERT(pda);
+ DISK_NODE_PARAMS(wpNodes[1],pda);
+ pda++;
+ INIT_WRITE_NODE(wqNodes+1,"Wq");
+ RF_ASSERT(pda);
+ DISK_NODE_PARAMS(wqNodes[1],pda);
+ }
+}
diff --git a/sys/dev/raidframe/rf_dagdegwr.h b/sys/dev/raidframe/rf_dagdegwr.h
new file mode 100644
index 00000000000..180c5f75668
--- /dev/null
+++ b/sys/dev/raidframe/rf_dagdegwr.h
@@ -0,0 +1,81 @@
+/* $OpenBSD: rf_dagdegwr.h,v 1.1 1999/01/11 14:29:08 niklas Exp $ */
+/* $NetBSD: rf_dagdegwr.h,v 1.1 1998/11/13 04:20:27 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland, Daniel Stodolsky, William V. Courtright II
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*
+ * :
+ * Log: rf_dagdegwr.h,v
+ * Revision 1.6 1996/07/31 16:30:28 jimz
+ * asm/asmap fix (EO merge)
+ *
+ * Revision 1.5 1996/07/31 15:35:38 jimz
+ * evenodd changes; bugfixes for double-degraded archs, generalize
+ * some formerly PQ-only functions
+ *
+ * Revision 1.4 1996/07/22 19:52:16 jimz
+ * switched node params to RF_DagParam_t, a union of
+ * a 64-bit int and a void *, for better portability
+ * attempted hpux port, but failed partway through for
+ * lack of a single C compiler capable of compiling all
+ * source files
+ *
+ * Revision 1.3 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.2 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.1 1996/05/03 19:21:28 wvcii
+ * Initial revision
+ *
+ */
+
+#ifndef _RF__RF_DAGDEGWR_H_
+#define _RF__RF_DAGDEGWR_H_
+
+/* degraded write DAG creation routines */
+void rf_CreateDegradedWriteDAG(RF_Raid_t *raidPtr,
+ RF_AccessStripeMap_t *asmap, RF_DagHeader_t *dag_h, void *bp,
+ RF_RaidAccessFlags_t flags, RF_AllocListElem_t *allocList);
+void rf_CommonCreateSimpleDegradedWriteDAG(RF_Raid_t *raidPtr,
+ RF_AccessStripeMap_t *asmap, RF_DagHeader_t *dag_h, void *bp,
+ RF_RaidAccessFlags_t flags, RF_AllocListElem_t *allocList,
+ int nfaults, int (*redFunc)(RF_DagNode_t *), int allowBufferRecycle);
+void rf_WriteGenerateFailedAccessASMs(RF_Raid_t *raidPtr,
+ RF_AccessStripeMap_t *asmap, RF_PhysDiskAddr_t **pdap,
+ int *nNodep, RF_PhysDiskAddr_t **pqpdap,
+ int *nPQNodep, RF_AllocListElem_t *allocList);
+void rf_DoubleDegSmallWrite(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
+ RF_DagHeader_t *dag_h, void *bp, RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t *allocList, char *redundantReadNodeName,
+ char *redundantWriteNodeName, char *recoveryNodeName,
+ int (*recovFunc)(RF_DagNode_t *));
+
+#endif /* !_RF__RF_DAGDEGWR_H_ */
diff --git a/sys/dev/raidframe/rf_dagffrd.c b/sys/dev/raidframe/rf_dagffrd.c
new file mode 100644
index 00000000000..b831980cb0e
--- /dev/null
+++ b/sys/dev/raidframe/rf_dagffrd.c
@@ -0,0 +1,500 @@
+/* $OpenBSD: rf_dagffrd.c,v 1.1 1999/01/11 14:29:08 niklas Exp $ */
+/* $NetBSD: rf_dagffrd.c,v 1.1 1998/11/13 04:20:27 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland, Daniel Stodolsky, William V. Courtright II
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*
+ * rf_dagffrd.c
+ *
+ * code for creating fault-free read DAGs
+ *
+ * :
+ * Log: rf_dagffrd.c,v
+ * Revision 1.14 1996/07/28 20:31:39 jimz
+ * i386netbsd port
+ * true/false fixup
+ *
+ * Revision 1.13 1996/07/22 19:52:16 jimz
+ * switched node params to RF_DagParam_t, a union of
+ * a 64-bit int and a void *, for better portability
+ * attempted hpux port, but failed partway through for
+ * lack of a single C compiler capable of compiling all
+ * source files
+ *
+ * Revision 1.12 1996/06/09 02:36:46 jimz
+ * lots of little crufty cleanup- fixup whitespace
+ * issues, comment #ifdefs, improve typing in some
+ * places (esp size-related)
+ *
+ * Revision 1.11 1996/06/06 17:30:44 jimz
+ * turn old Raid1 mirror read creation into a more generic function
+ * parameterized by an addtional parameter: type of mirrored read
+ * this is now used by other dag creation routines so chained declustering
+ * and raid1 can share dag creation code, but have different mirroring
+ * policies
+ *
+ * Revision 1.10 1996/05/31 22:26:54 jimz
+ * fix a lot of mapping problems, memory allocation problems
+ * found some weird lock issues, fixed 'em
+ * more code cleanup
+ *
+ * Revision 1.9 1996/05/30 11:29:41 jimz
+ * Numerous bug fixes. Stripe lock release code disagreed with the taking code
+ * about when stripes should be locked (I made it consistent: no parity, no lock)
+ * There was a lot of extra serialization of I/Os which I've removed- a lot of
+ * it was to calculate values for the cache code, which is no longer with us.
+ * More types, function, macro cleanup. Added code to properly quiesce the array
+ * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
+ * before. Fixed memory allocation, freeing bugs.
+ *
+ * Revision 1.8 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.7 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.6 1996/05/24 04:28:55 jimz
+ * release cleanup ckpt
+ *
+ * Revision 1.5 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.4 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.3 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.2 1996/05/08 21:01:24 jimz
+ * fixed up enum type names that were conflicting with other
+ * enums and function names (ie, "panic")
+ * future naming trends will be towards RF_ and rf_ for
+ * everything raidframe-related
+ *
+ * Revision 1.1 1996/05/03 19:19:20 wvcii
+ * Initial revision
+ *
+ */
+
+#include "rf_types.h"
+#include "rf_raid.h"
+#include "rf_dag.h"
+#include "rf_dagutils.h"
+#include "rf_dagfuncs.h"
+#include "rf_threadid.h"
+#include "rf_debugMem.h"
+#include "rf_memchunk.h"
+#include "rf_general.h"
+#include "rf_dagffrd.h"
+
+/******************************************************************************
+ *
+ * General comments on DAG creation:
+ *
+ * All DAGs in this file use roll-away error recovery. Each DAG has a single
+ * commit node, usually called "Cmt." If an error occurs before the Cmt node
+ * is reached, the execution engine will halt forward execution and work
+ * backward through the graph, executing the undo functions. Assuming that
+ * each node in the graph prior to the Cmt node is either undoable and atomic,
+ * or makes no changes to permanent state, the graph will fail atomically.
+ * If an error occurs after the Cmt node executes, the engine will roll-forward
+ * through the graph, blindly executing nodes until it reaches the end.
+ * If a graph reaches the end, it is assumed to have completed successfully.
+ *
+ * A graph has only 1 Cmt node.
+ *
+ */
+
+
+/******************************************************************************
+ *
+ * The following wrappers map the standard DAG creation interface to the
+ * DAG creation routines. Additionally, these wrappers enable experimentation
+ * with new DAG structures by providing an extra level of indirection, allowing
+ * the DAG creation routines to be replaced at this single point.
+ */
+
+void rf_CreateFaultFreeReadDAG(
+ RF_Raid_t *raidPtr,
+ RF_AccessStripeMap_t *asmap,
+ RF_DagHeader_t *dag_h,
+ void *bp,
+ RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t *allocList)
+{
+ rf_CreateNonredundantDAG(raidPtr, asmap, dag_h, bp, flags, allocList,
+ RF_IO_TYPE_READ);
+}
+
+
+/******************************************************************************
+ *
+ * DAG creation code begins here
+ */
+
+/******************************************************************************
+ *
+ * creates a DAG to perform a nonredundant read or write of data within one
+ * stripe.
+ * For reads, this DAG is as follows:
+ *
+ * /---- read ----\
+ * Header -- Block ---- read ---- Commit -- Terminate
+ * \---- read ----/
+ *
+ * For writes, this DAG is as follows:
+ *
+ *                              /---- write ----\
+ *       Header -- Block -- Commit ---- write ---- Terminate
+ *                              \---- write ----/
+ *
+ * There is one disk node per stripe unit accessed, and all disk nodes are in
+ * parallel.
+ *
+ * Tricky point here: The first disk node (read or write) is created
+ * normally. Subsequent disk nodes are created by copying the first one,
+ * and modifying a few params. The "succedents" and "antecedents" fields are
+ * _not_ re-created in each node, but rather left pointing to the same array
+ * that was malloc'd when the first node was created. Thus, it's essential
+ * that when this DAG is freed, the succedents and antecedents fields be freed
+ * in ONLY ONE of the read nodes. This does not apply to the "params" field
+ * because it is recreated for each READ node.
+ *
+ * Note that normal-priority accesses do not need to be tagged with their
+ * parity stripe ID, because they will never be promoted. Hence, I've
+ * commented-out the code to do this, and marked it with UNNEEDED.
+ *
+ *****************************************************************************/
+
+void rf_CreateNonredundantDAG(
+ RF_Raid_t *raidPtr,
+ RF_AccessStripeMap_t *asmap,
+ RF_DagHeader_t *dag_h,
+ void *bp,
+ RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t *allocList,
+ RF_IoType_t type)
+{
+ RF_DagNode_t *nodes, *diskNodes, *blockNode, *commitNode, *termNode;
+ RF_PhysDiskAddr_t *pda = asmap->physInfo;
+ int (*doFunc)(RF_DagNode_t *), (*undoFunc)(RF_DagNode_t *);
+ int i, n, totalNumNodes;
+ char *name;
+
+ n = asmap->numStripeUnitsAccessed;
+ dag_h->creator = "NonredundantDAG";
+
+ RF_ASSERT(RF_IO_IS_R_OR_W(type));
+ switch (type) {
+ case RF_IO_TYPE_READ:
+ doFunc = rf_DiskReadFunc;
+ undoFunc = rf_DiskReadUndoFunc;
+ name = "R ";
+ if (rf_dagDebug) printf("[Creating non-redundant read DAG]\n");
+ break;
+ case RF_IO_TYPE_WRITE:
+ doFunc = rf_DiskWriteFunc;
+ undoFunc = rf_DiskWriteUndoFunc;
+ name = "W ";
+ if (rf_dagDebug) printf("[Creating non-redundant write DAG]\n");
+ break;
+ default:
+ RF_PANIC();
+ }
+
+ /*
+	 * For reads, the DAG cannot commit until all the reads have completed
+	 * (the commit node follows them); for writes, the DAG commits immediately.
+ */
+ dag_h->numCommitNodes = 1;
+ dag_h->numCommits = 0;
+ dag_h->numSuccedents = 1;
+
+ /*
+ * Node count:
+ * 1 block node
+ * n data reads (or writes)
+ * 1 commit node
+ * 1 terminator node
+ */
+ RF_ASSERT(n > 0);
+ totalNumNodes = n + 3;
+ RF_CallocAndAdd(nodes, totalNumNodes, sizeof(RF_DagNode_t),
+ (RF_DagNode_t *), allocList);
+ i = 0;
+ diskNodes = &nodes[i]; i += n;
+ blockNode = &nodes[i]; i += 1;
+ commitNode = &nodes[i]; i += 1;
+ termNode = &nodes[i]; i += 1;
+ RF_ASSERT(i == totalNumNodes);
+
+ /* initialize nodes */
+ switch (type) {
+ case RF_IO_TYPE_READ:
+ rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
+ NULL, n, 0, 0, 0, dag_h, "Nil", allocList);
+ rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
+ NULL, 1, n, 0, 0, dag_h, "Cmt", allocList);
+ rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc,
+ NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
+ break;
+ case RF_IO_TYPE_WRITE:
+ rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
+ NULL, 1, 0, 0, 0, dag_h, "Nil", allocList);
+ rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
+ NULL, n, 1, 0, 0, dag_h, "Cmt", allocList);
+ rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc,
+ NULL, 0, n, 0, 0, dag_h, "Trm", allocList);
+ break;
+ default:
+ RF_PANIC();
+ }
+
+ for (i = 0; i < n; i++) {
+ RF_ASSERT(pda != NULL);
+ rf_InitNode(&diskNodes[i], rf_wait, RF_FALSE, doFunc, undoFunc, rf_GenericWakeupFunc,
+ 1, 1, 4, 0, dag_h, name, allocList);
+ diskNodes[i].params[0].p = pda;
+ diskNodes[i].params[1].p = pda->bufPtr;
+ /* parity stripe id is not necessary */
+ diskNodes[i].params[2].v = 0;
+ diskNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, 0);
+ pda = pda->next;
+ }
+
+ /*
+ * Connect nodes.
+ */
+
+ /* connect hdr to block node */
+ RF_ASSERT(blockNode->numAntecedents == 0);
+ dag_h->succedents[0] = blockNode;
+
+ if (type == RF_IO_TYPE_READ) {
+ /* connecting a nonredundant read DAG */
+ RF_ASSERT(blockNode->numSuccedents == n);
+ RF_ASSERT(commitNode->numAntecedents == n);
+ for (i=0; i < n; i++) {
+ /* connect block node to each read node */
+ RF_ASSERT(diskNodes[i].numAntecedents == 1);
+ blockNode->succedents[i] = &diskNodes[i];
+ diskNodes[i].antecedents[0] = blockNode;
+ diskNodes[i].antType[0] = rf_control;
+
+ /* connect each read node to the commit node */
+ RF_ASSERT(diskNodes[i].numSuccedents == 1);
+ diskNodes[i].succedents[0] = commitNode;
+ commitNode->antecedents[i] = &diskNodes[i];
+ commitNode->antType[i] = rf_control;
+ }
+ /* connect the commit node to the term node */
+ RF_ASSERT(commitNode->numSuccedents == 1);
+ RF_ASSERT(termNode->numAntecedents == 1);
+ RF_ASSERT(termNode->numSuccedents == 0);
+ commitNode->succedents[0] = termNode;
+ termNode->antecedents[0] = commitNode;
+ termNode->antType[0] = rf_control;
+ }
+ else {
+ /* connecting a nonredundant write DAG */
+ /* connect the block node to the commit node */
+ RF_ASSERT(blockNode->numSuccedents == 1);
+ RF_ASSERT(commitNode->numAntecedents == 1);
+ blockNode->succedents[0] = commitNode;
+ commitNode->antecedents[0] = blockNode;
+ commitNode->antType[0] = rf_control;
+
+ RF_ASSERT(commitNode->numSuccedents == n);
+ RF_ASSERT(termNode->numAntecedents == n);
+ RF_ASSERT(termNode->numSuccedents == 0);
+ for (i=0; i < n; i++) {
+ /* connect the commit node to each write node */
+ RF_ASSERT(diskNodes[i].numAntecedents == 1);
+ commitNode->succedents[i] = &diskNodes[i];
+ diskNodes[i].antecedents[0] = commitNode;
+ diskNodes[i].antType[0] = rf_control;
+
+ /* connect each write node to the term node */
+ RF_ASSERT(diskNodes[i].numSuccedents == 1);
+ diskNodes[i].succedents[0] = termNode;
+ termNode->antecedents[i] = &diskNodes[i];
+ termNode->antType[i] = rf_control;
+ }
+ }
+}
+
+/******************************************************************************
+ * Create a fault-free read DAG for RAID level 1
+ *
+ * Hdr -> Nil -> Rmir -> Cmt -> Trm
+ *
+ * The "Rmir" node schedules a read from the disk in the mirror pair with the
+ * shortest disk queue. The proper queue is selected when the Rmir node
+ * executes; this deferred mapping is unlike other architectures in RAIDframe,
+ * which generally fix the mapping at DAG creation time.
+ *
+ * Parameters: raidPtr - description of the physical array
+ * asmap - logical & physical addresses for this access
+ * bp - buffer ptr (for holding read data)
+ * flags - general flags (e.g. disk locking)
+ * allocList - list of memory allocated in DAG creation
+ *****************************************************************************/
+
+static void CreateMirrorReadDAG(
+ RF_Raid_t *raidPtr,
+ RF_AccessStripeMap_t *asmap,
+ RF_DagHeader_t *dag_h,
+ void *bp,
+ RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t *allocList,
+ int (*readfunc)(RF_DagNode_t *node))
+{
+ RF_DagNode_t *readNodes, *nodes, *blockNode, *commitNode, *termNode;
+ RF_PhysDiskAddr_t *data_pda = asmap->physInfo;
+ RF_PhysDiskAddr_t *parity_pda = asmap->parityInfo;
+ int i, n, totalNumNodes;
+
+ n = asmap->numStripeUnitsAccessed;
+ dag_h->creator = "RaidOneReadDAG";
+ if (rf_dagDebug) {
+ printf("[Creating RAID level 1 read DAG]\n");
+ }
+
+ /*
+	 * This DAG cannot commit until the commit node is reached;
+	 * errors prior to the commit point imply the DAG has failed.
+ */
+ dag_h->numCommitNodes = 1;
+ dag_h->numCommits = 0;
+ dag_h->numSuccedents = 1;
+
+ /*
+ * Node count:
+ * n data reads
+ * 1 block node
+ * 1 commit node
+ * 1 terminator node
+ */
+ RF_ASSERT(n > 0);
+ totalNumNodes = n + 3;
+ RF_CallocAndAdd(nodes, totalNumNodes, sizeof(RF_DagNode_t),
+ (RF_DagNode_t *), allocList);
+ i = 0;
+ readNodes = &nodes[i]; i += n;
+ blockNode = &nodes[i]; i += 1;
+ commitNode = &nodes[i]; i += 1;
+ termNode = &nodes[i]; i += 1;
+ RF_ASSERT(i == totalNumNodes);
+
+ /* initialize nodes */
+ rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc,
+ rf_NullNodeUndoFunc, NULL, n, 0, 0, 0, dag_h, "Nil", allocList);
+ rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc,
+ rf_NullNodeUndoFunc, NULL, 1, n, 0, 0, dag_h, "Cmt", allocList);
+ rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc,
+ rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
+
+ for (i = 0; i < n; i++) {
+ RF_ASSERT(data_pda != NULL);
+ RF_ASSERT(parity_pda != NULL);
+ rf_InitNode(&readNodes[i], rf_wait, RF_FALSE, readfunc,
+ rf_DiskReadMirrorUndoFunc, rf_GenericWakeupFunc, 1, 1, 5, 0, dag_h,
+ "Rmir", allocList);
+ readNodes[i].params[0].p = data_pda;
+ readNodes[i].params[1].p = data_pda->bufPtr;
+ /* parity stripe id is not necessary */
+ readNodes[i].params[2].p = 0;
+ readNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, 0);
+ readNodes[i].params[4].p = parity_pda;
+ data_pda = data_pda->next;
+ parity_pda = parity_pda->next;
+ }
+
+ /*
+ * Connect nodes
+ */
+
+ /* connect hdr to block node */
+ RF_ASSERT(blockNode->numAntecedents == 0);
+ dag_h->succedents[0] = blockNode;
+
+ /* connect block node to read nodes */
+ RF_ASSERT(blockNode->numSuccedents == n);
+ for (i=0; i < n; i++) {
+ RF_ASSERT(readNodes[i].numAntecedents == 1);
+ blockNode->succedents[i] = &readNodes[i];
+ readNodes[i].antecedents[0] = blockNode;
+ readNodes[i].antType[0] = rf_control;
+ }
+
+ /* connect read nodes to commit node */
+ RF_ASSERT(commitNode->numAntecedents == n);
+ for (i=0; i < n; i++) {
+ RF_ASSERT(readNodes[i].numSuccedents == 1);
+ readNodes[i].succedents[0] = commitNode;
+ commitNode->antecedents[i] = &readNodes[i];
+ commitNode->antType[i] = rf_control;
+ }
+
+ /* connect commit node to term node */
+ RF_ASSERT(commitNode->numSuccedents == 1);
+ RF_ASSERT(termNode->numAntecedents == 1);
+ RF_ASSERT(termNode->numSuccedents == 0);
+ commitNode->succedents[0] = termNode;
+ termNode->antecedents[0] = commitNode;
+ termNode->antType[0] = rf_control;
+}
+
+void rf_CreateMirrorIdleReadDAG(
+ RF_Raid_t *raidPtr,
+ RF_AccessStripeMap_t *asmap,
+ RF_DagHeader_t *dag_h,
+ void *bp,
+ RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t *allocList)
+{
+ CreateMirrorReadDAG(raidPtr, asmap, dag_h, bp, flags, allocList,
+ rf_DiskReadMirrorIdleFunc);
+}
+
+void rf_CreateMirrorPartitionReadDAG(
+ RF_Raid_t *raidPtr,
+ RF_AccessStripeMap_t *asmap,
+ RF_DagHeader_t *dag_h,
+ void *bp,
+ RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t *allocList)
+{
+ CreateMirrorReadDAG(raidPtr, asmap, dag_h, bp, flags, allocList,
+ rf_DiskReadMirrorPartitionFunc);
+}
diff --git a/sys/dev/raidframe/rf_dagffrd.h b/sys/dev/raidframe/rf_dagffrd.h
new file mode 100644
index 00000000000..61e3ee86241
--- /dev/null
+++ b/sys/dev/raidframe/rf_dagffrd.h
@@ -0,0 +1,75 @@
+/* $OpenBSD: rf_dagffrd.h,v 1.1 1999/01/11 14:29:08 niklas Exp $ */
+/* $NetBSD: rf_dagffrd.h,v 1.1 1998/11/13 04:20:27 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland, Daniel Stodolsky, William V. Courtright II
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*
+ * :
+ * Log: rf_dagffrd.h,v
+ * Revision 1.5 1996/07/22 19:52:16 jimz
+ * switched node params to RF_DagParam_t, a union of
+ * a 64-bit int and a void *, for better portability
+ * attempted hpux port, but failed partway through for
+ * lack of a single C compiler capable of compiling all
+ * source files
+ *
+ * Revision 1.4 1996/06/06 17:31:13 jimz
+ * new mirror read creation dags
+ *
+ * Revision 1.3 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.2 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.1 1996/05/03 19:19:53 wvcii
+ * Initial revision
+ *
+ */
+
+#ifndef _RF__RF_DAGFFRD_H_
+#define _RF__RF_DAGFFRD_H_
+
+#include "rf_types.h"
+
+/* fault-free read DAG creation routines */
+void rf_CreateFaultFreeReadDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
+ RF_DagHeader_t *dag_h, void *bp, RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t *allocList);
+void rf_CreateNonredundantDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
+ RF_DagHeader_t *dag_h, void *bp, RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t *allocList, RF_IoType_t type);
+void rf_CreateMirrorIdleReadDAG(RF_Raid_t *raidPtr,
+ RF_AccessStripeMap_t *asmap, RF_DagHeader_t *dag_h, void *bp,
+ RF_RaidAccessFlags_t flags, RF_AllocListElem_t *allocList);
+void rf_CreateMirrorPartitionReadDAG(RF_Raid_t *raidPtr,
+ RF_AccessStripeMap_t *asmap, RF_DagHeader_t *dag_h, void *bp,
+ RF_RaidAccessFlags_t flags, RF_AllocListElem_t *allocList);
+
+#endif /* !_RF__RF_DAGFFRD_H_ */
diff --git a/sys/dev/raidframe/rf_dagffwr.c b/sys/dev/raidframe/rf_dagffwr.c
new file mode 100644
index 00000000000..f502de1b293
--- /dev/null
+++ b/sys/dev/raidframe/rf_dagffwr.c
@@ -0,0 +1,2202 @@
+/* $OpenBSD: rf_dagffwr.c,v 1.1 1999/01/11 14:29:09 niklas Exp $ */
+/* $NetBSD: rf_dagffwr.c,v 1.1 1998/11/13 04:20:27 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland, Daniel Stodolsky, William V. Courtright II
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*
+ * rf_dagffwr.c
+ *
+ * code for creating fault-free write DAGs
+ *
+ * :
+ * Log: rf_dagffwr.c,v
+ * Revision 1.19 1996/07/31 15:35:24 jimz
+ * evenodd changes; bugfixes for double-degraded archs, generalize
+ * some formerly PQ-only functions
+ *
+ * Revision 1.18 1996/07/28 20:31:39 jimz
+ * i386netbsd port
+ * true/false fixup
+ *
+ * Revision 1.17 1996/07/27 18:40:24 jimz
+ * cleanup sweep
+ *
+ * Revision 1.16 1996/07/22 19:52:16 jimz
+ * switched node params to RF_DagParam_t, a union of
+ * a 64-bit int and a void *, for better portability
+ * attempted hpux port, but failed partway through for
+ * lack of a single C compiler capable of compiling all
+ * source files
+ *
+ * Revision 1.15 1996/06/11 01:27:50 jimz
+ * Fixed bug where diskthread shutdown would crash or hang. This
+ * turned out to be two distinct bugs:
+ * (1) [crash] The thread shutdown code wasn't properly waiting for
+ * all the diskthreads to complete. This caused diskthreads that were
+ * exiting+cleaning up to unlock a destroyed mutex.
+ * (2) [hang] TerminateDiskQueues wasn't locking, and DiskIODequeue
+ * only checked for termination _after_ a wakeup if the queues were
+ * empty. This was a race where the termination wakeup could be lost
+ * by the dequeueing thread, and the system would hang waiting for the
+ * thread to exit, while the thread waited for an I/O or a signal to
+ * check the termination flag.
+ *
+ * Revision 1.14 1996/06/10 22:24:01 wvcii
+ * added write dags which do not have a commit node and are
+ * used in forward and backward error recovery experiments.
+ *
+ * Revision 1.13 1996/06/07 22:26:27 jimz
+ * type-ify which_ru (RF_ReconUnitNum_t)
+ *
+ * Revision 1.12 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.11 1996/05/31 22:26:54 jimz
+ * fix a lot of mapping problems, memory allocation problems
+ * found some weird lock issues, fixed 'em
+ * more code cleanup
+ *
+ * Revision 1.10 1996/05/30 11:29:41 jimz
+ * Numerous bug fixes. Stripe lock release code disagreed with the taking code
+ * about when stripes should be locked (I made it consistent: no parity, no lock)
+ * There was a lot of extra serialization of I/Os which I've removed- a lot of
+ * it was to calculate values for the cache code, which is no longer with us.
+ * More types, function, macro cleanup. Added code to properly quiesce the array
+ * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
+ * before. Fixed memory allocation, freeing bugs.
+ *
+ * Revision 1.9 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.8 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.7 1996/05/24 04:28:55 jimz
+ * release cleanup ckpt
+ *
+ * Revision 1.6 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.5 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.4 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.3 1996/05/15 23:23:12 wvcii
+ * fixed bug in small write read old q node succedent initialization
+ *
+ * Revision 1.2 1996/05/08 21:01:24 jimz
+ * fixed up enum type names that were conflicting with other
+ * enums and function names (ie, "panic")
+ * future naming trends will be towards RF_ and rf_ for
+ * everything raidframe-related
+ *
+ * Revision 1.1 1996/05/03 19:20:45 wvcii
+ * Initial revision
+ *
+ */
+
+#include "rf_types.h"
+#include "rf_raid.h"
+#include "rf_dag.h"
+#include "rf_dagutils.h"
+#include "rf_dagfuncs.h"
+#include "rf_threadid.h"
+#include "rf_debugMem.h"
+#include "rf_dagffrd.h"
+#include "rf_memchunk.h"
+#include "rf_general.h"
+#include "rf_dagffwr.h"
+
+/******************************************************************************
+ *
+ * General comments on DAG creation:
+ *
+ * All DAGs in this file use roll-away error recovery. Each DAG has a single
+ * commit node, usually called "Cmt." If an error occurs before the Cmt node
+ * is reached, the execution engine will halt forward execution and work
+ * backward through the graph, executing the undo functions. Assuming that
+ * each node in the graph prior to the Cmt node is either undoable and atomic,
+ * or makes no changes to permanent state, the graph will fail atomically.
+ * If an error occurs after the Cmt node executes, the engine will roll-forward
+ * through the graph, blindly executing nodes until it reaches the end.
+ * If a graph reaches the end, it is assumed to have completed successfully.
+ *
+ * A graph has only 1 Cmt node.
+ *
+ */
+
+
+/******************************************************************************
+ *
+ * The following wrappers map the standard DAG creation interface to the
+ * DAG creation routines. Additionally, these wrappers enable experimentation
+ * with new DAG structures by providing an extra level of indirection, allowing
+ * the DAG creation routines to be replaced at this single point.
+ */
+
+
+void rf_CreateNonRedundantWriteDAG(
+ RF_Raid_t *raidPtr,
+ RF_AccessStripeMap_t *asmap,
+ RF_DagHeader_t *dag_h,
+ void *bp,
+ RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t *allocList,
+ RF_IoType_t type)
+{
+ rf_CreateNonredundantDAG(raidPtr, asmap, dag_h, bp, flags, allocList,
+ RF_IO_TYPE_WRITE);
+}
+
+void rf_CreateRAID0WriteDAG(
+ RF_Raid_t *raidPtr,
+ RF_AccessStripeMap_t *asmap,
+ RF_DagHeader_t *dag_h,
+ void *bp,
+ RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t *allocList,
+ RF_IoType_t type)
+{
+ rf_CreateNonredundantDAG(raidPtr, asmap, dag_h, bp, flags, allocList,
+ RF_IO_TYPE_WRITE);
+}
+
+void rf_CreateSmallWriteDAG(
+ RF_Raid_t *raidPtr,
+ RF_AccessStripeMap_t *asmap,
+ RF_DagHeader_t *dag_h,
+ void *bp,
+ RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t *allocList)
+{
+#if RF_FORWARD > 0
+ rf_CommonCreateSmallWriteDAGFwd(raidPtr, asmap, dag_h, bp, flags, allocList,
+ &rf_xorFuncs, NULL);
+#else /* RF_FORWARD > 0 */
+#if RF_BACKWARD > 0
+ rf_CommonCreateSmallWriteDAGFwd(raidPtr, asmap, dag_h, bp, flags, allocList,
+ &rf_xorFuncs, NULL);
+#else /* RF_BACKWARD > 0 */
+ /* "normal" rollaway */
+ rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList,
+ &rf_xorFuncs, NULL);
+#endif /* RF_BACKWARD > 0 */
+#endif /* RF_FORWARD > 0 */
+}
+
+void rf_CreateLargeWriteDAG(
+ RF_Raid_t *raidPtr,
+ RF_AccessStripeMap_t *asmap,
+ RF_DagHeader_t *dag_h,
+ void *bp,
+ RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t *allocList)
+{
+#if RF_FORWARD > 0
+ rf_CommonCreateLargeWriteDAGFwd(raidPtr, asmap, dag_h, bp, flags, allocList,
+ 1, rf_RegularXorFunc, RF_TRUE);
+#else /* RF_FORWARD > 0 */
+#if RF_BACKWARD > 0
+ rf_CommonCreateLargeWriteDAGFwd(raidPtr, asmap, dag_h, bp, flags, allocList,
+ 1, rf_RegularXorFunc, RF_TRUE);
+#else /* RF_BACKWARD > 0 */
+ /* "normal" rollaway */
+ rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList,
+ 1, rf_RegularXorFunc, RF_TRUE);
+#endif /* RF_BACKWARD > 0 */
+#endif /* RF_FORWARD > 0 */
+}
+
+
+/******************************************************************************
+ *
+ * DAG creation code begins here
+ */
+
+
+/******************************************************************************
+ *
+ * creates a DAG to perform a large-write operation:
+ *
+ * / Rod \ / Wnd \
+ * H -- block- Rod - Xor - Cmt - Wnd --- T
+ * \ Rod / \ Wnp /
+ * \[Wnq]/
+ *
+ * The XOR node also does the Q calculation in the P+Q architecture.
+ * All nodes before the commit node (Cmt) are assumed to be atomic and
+ * undoable, or to make no changes to permanent state.
+ *
+ * Rod = read old data
+ * Cmt = commit node
+ * Wnp = write new parity
+ * Wnd = write new data
+ * Wnq = write new "q"
+ * [] denotes optional segments in the graph
+ *
+ * Parameters: raidPtr - description of the physical array
+ * asmap - logical & physical addresses for this access
+ * bp - buffer ptr (holds write data)
+ * flags - general flags (e.g. disk locking)
+ * allocList - list of memory allocated in DAG creation
+ * nfaults - number of faults array can tolerate
+ * (equal to # redundancy units in stripe)
+ * redfuncs - list of redundancy generating functions
+ *
+ *****************************************************************************/
+
+void rf_CommonCreateLargeWriteDAG(
+ RF_Raid_t *raidPtr,
+ RF_AccessStripeMap_t *asmap,
+ RF_DagHeader_t *dag_h,
+ void *bp,
+ RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t *allocList,
+ int nfaults,
+ int (*redFunc)(RF_DagNode_t *),
+ int allowBufferRecycle)
+{
+ RF_DagNode_t *nodes, *wndNodes, *rodNodes, *xorNode, *wnpNode;
+ RF_DagNode_t *wnqNode, *blockNode, *commitNode, *termNode;
+ int nWndNodes, nRodNodes, i, nodeNum, asmNum;
+ RF_AccessStripeMapHeader_t *new_asm_h[2];
+ RF_StripeNum_t parityStripeID;
+ char *sosBuffer, *eosBuffer;
+ RF_ReconUnitNum_t which_ru;
+ RF_RaidLayout_t *layoutPtr;
+ RF_PhysDiskAddr_t *pda;
+
+ layoutPtr = &(raidPtr->Layout);
+ parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr, asmap->raidAddress,
+ &which_ru);
+
+ if (rf_dagDebug) {
+ printf("[Creating large-write DAG]\n");
+ }
+ dag_h->creator = "LargeWriteDAG";
+
+ dag_h->numCommitNodes = 1;
+ dag_h->numCommits = 0;
+ dag_h->numSuccedents = 1;
+
+ /* alloc the nodes: Wnd, xor, commit, block, term, and Wnp */
+ nWndNodes = asmap->numStripeUnitsAccessed;
+ RF_CallocAndAdd(nodes, nWndNodes + 4 + nfaults, sizeof(RF_DagNode_t),
+ (RF_DagNode_t *), allocList);
+ i = 0;
+ wndNodes = &nodes[i]; i += nWndNodes;
+ xorNode = &nodes[i]; i += 1;
+ wnpNode = &nodes[i]; i += 1;
+ blockNode = &nodes[i]; i += 1;
+ commitNode = &nodes[i]; i += 1;
+ termNode = &nodes[i]; i += 1;
+ if (nfaults == 2) {
+ wnqNode = &nodes[i]; i += 1;
+ }
+ else {
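+		/* single-fault-tolerant layouts have no Q, hence no Wnq node */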
+ wnqNode = NULL;
+ }
+ rf_MapUnaccessedPortionOfStripe(raidPtr, layoutPtr, asmap, dag_h, new_asm_h,
+ &nRodNodes, &sosBuffer, &eosBuffer, allocList);
+ if (nRodNodes > 0) {
+ RF_CallocAndAdd(rodNodes, nRodNodes, sizeof(RF_DagNode_t),
+ (RF_DagNode_t *), allocList);
+ }
+ else {
+ rodNodes = NULL;
+ }
+
+ /* begin node initialization */
+ if (nRodNodes > 0) {
+ rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
+ NULL, nRodNodes, 0, 0, 0, dag_h, "Nil", allocList);
+ }
+ else {
+ rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
+ NULL, 1, 0, 0, 0, dag_h, "Nil", allocList);
+ }
+
+ rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL,
+ nWndNodes + nfaults, 1, 0, 0, dag_h, "Cmt", allocList);
+ rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL,
+ 0, nWndNodes + nfaults, 0, 0, dag_h, "Trm", allocList);
+
+ /* initialize the Rod nodes */
+ for (nodeNum = asmNum = 0; asmNum < 2; asmNum++) {
+ if (new_asm_h[asmNum]) {
+ pda = new_asm_h[asmNum]->stripeMap->physInfo;
+ while (pda) {
+ rf_InitNode(&rodNodes[nodeNum], rf_wait, RF_FALSE, rf_DiskReadFunc,
+ rf_DiskReadUndoFunc,rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h,
+ "Rod", allocList);
+ rodNodes[nodeNum].params[0].p = pda;
+ rodNodes[nodeNum].params[1].p = pda->bufPtr;
+ rodNodes[nodeNum].params[2].v = parityStripeID;
+ rodNodes[nodeNum].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
+ 0, 0, which_ru);
+ nodeNum++;
+ pda = pda->next;
+ }
+ }
+ }
+ RF_ASSERT(nodeNum == nRodNodes);
+
+ /* initialize the wnd nodes */
+ pda = asmap->physInfo;
+ for (i=0; i < nWndNodes; i++) {
+ rf_InitNode(&wndNodes[i], rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
+ rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnd", allocList);
+ RF_ASSERT(pda != NULL);
+ wndNodes[i].params[0].p = pda;
+ wndNodes[i].params[1].p = pda->bufPtr;
+ wndNodes[i].params[2].v = parityStripeID;
+ wndNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ pda = pda->next;
+ }
+
+ /* initialize the redundancy node */
+ if (nRodNodes > 0) {
+ rf_InitNode(xorNode, rf_wait, RF_FALSE, redFunc, rf_NullNodeUndoFunc, NULL, 1,
+ nRodNodes, 2 * (nWndNodes+nRodNodes) + 1, nfaults, dag_h,
+ "Xr ", allocList);
+ }
+ else {
+ rf_InitNode(xorNode, rf_wait, RF_FALSE, redFunc, rf_NullNodeUndoFunc, NULL, 1,
+ 1, 2 * (nWndNodes+nRodNodes) + 1, nfaults, dag_h, "Xr ", allocList);
+ }
+ xorNode->flags |= RF_DAGNODE_FLAG_YIELD;
+ for (i=0; i < nWndNodes; i++) {
+ xorNode->params[2*i+0] = wndNodes[i].params[0]; /* pda */
+ xorNode->params[2*i+1] = wndNodes[i].params[1]; /* buf ptr */
+ }
+ for (i=0; i < nRodNodes; i++) {
+ xorNode->params[2*(nWndNodes+i)+0] = rodNodes[i].params[0]; /* pda */
+ xorNode->params[2*(nWndNodes+i)+1] = rodNodes[i].params[1]; /* buf ptr */
+ }
+ /* xor node needs to get at RAID information */
+ xorNode->params[2*(nWndNodes+nRodNodes)].p = raidPtr;
+
+ /*
+	 * Look for an Rod node that reads a complete SU. If none, allocate a
+	 * buffer to receive the parity info. Note that we can't use a new data
+	 * buffer, because it will not yet have been written when the xor occurs.
+ */
+ if (allowBufferRecycle) {
+ for (i = 0; i < nRodNodes; i++) {
+ if (((RF_PhysDiskAddr_t *)rodNodes[i].params[0].p)->numSector == raidPtr->Layout.sectorsPerStripeUnit)
+ break;
+ }
+ }
+ if ((!allowBufferRecycle) || (i == nRodNodes)) {
+ RF_CallocAndAdd(xorNode->results[0], 1,
+ rf_RaidAddressToByte(raidPtr, raidPtr->Layout.sectorsPerStripeUnit),
+ (void *), allocList);
+ }
+ else {
+ xorNode->results[0] = rodNodes[i].params[1].p;
+ }
+
+ /* initialize the Wnp node */
+ rf_InitNode(wnpNode, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
+ rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnp", allocList);
+ wnpNode->params[0].p = asmap->parityInfo;
+ wnpNode->params[1].p = xorNode->results[0];
+ wnpNode->params[2].v = parityStripeID;
+ wnpNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ /* parityInfo must describe entire parity unit */
+ RF_ASSERT(asmap->parityInfo->next == NULL);
+
+ if (nfaults == 2) {
+ /*
+		 * We never try to recycle a buffer for the Q calculation
+ * in addition to the parity. This would cause two buffers
+ * to get smashed during the P and Q calculation, guaranteeing
+ * one would be wrong.
+ */
+ RF_CallocAndAdd(xorNode->results[1], 1,
+ rf_RaidAddressToByte(raidPtr, raidPtr->Layout.sectorsPerStripeUnit),
+ (void *),allocList);
+ rf_InitNode(wnqNode, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
+ rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnq", allocList);
+ wnqNode->params[0].p = asmap->qInfo;
+ wnqNode->params[1].p = xorNode->results[1];
+ wnqNode->params[2].v = parityStripeID;
+ wnqNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ /* parityInfo must describe entire parity unit */
+ RF_ASSERT(asmap->parityInfo->next == NULL);
+ }
+
+ /*
+ * Connect nodes to form graph.
+ */
+
+ /* connect dag header to block node */
+ RF_ASSERT(blockNode->numAntecedents == 0);
+ dag_h->succedents[0] = blockNode;
+
+ if (nRodNodes > 0) {
+ /* connect the block node to the Rod nodes */
+ RF_ASSERT(blockNode->numSuccedents == nRodNodes);
+ RF_ASSERT(xorNode->numAntecedents == nRodNodes);
+ for (i = 0; i < nRodNodes; i++) {
+ RF_ASSERT(rodNodes[i].numAntecedents == 1);
+ blockNode->succedents[i] = &rodNodes[i];
+ rodNodes[i].antecedents[0] = blockNode;
+ rodNodes[i].antType[0] = rf_control;
+
+ /* connect the Rod nodes to the Xor node */
+ RF_ASSERT(rodNodes[i].numSuccedents == 1);
+ rodNodes[i].succedents[0] = xorNode;
+ xorNode->antecedents[i] = &rodNodes[i];
+ xorNode->antType[i] = rf_trueData;
+ }
+ }
+ else {
+ /* connect the block node to the Xor node */
+ RF_ASSERT(blockNode->numSuccedents == 1);
+ RF_ASSERT(xorNode->numAntecedents == 1);
+ blockNode->succedents[0] = xorNode;
+ xorNode->antecedents[0] = blockNode;
+ xorNode->antType[0] = rf_control;
+ }
+
+ /* connect the xor node to the commit node */
+ RF_ASSERT(xorNode->numSuccedents == 1);
+ RF_ASSERT(commitNode->numAntecedents == 1);
+ xorNode->succedents[0] = commitNode;
+ commitNode->antecedents[0] = xorNode;
+ commitNode->antType[0] = rf_control;
+
+ /* connect the commit node to the write nodes */
+ RF_ASSERT(commitNode->numSuccedents == nWndNodes + nfaults);
+ for (i = 0; i < nWndNodes; i++) {
+		RF_ASSERT(wndNodes[i].numAntecedents == 1);
+ commitNode->succedents[i] = &wndNodes[i];
+ wndNodes[i].antecedents[0] = commitNode;
+ wndNodes[i].antType[0] = rf_control;
+ }
+ RF_ASSERT(wnpNode->numAntecedents == 1);
+ commitNode->succedents[nWndNodes] = wnpNode;
+ wnpNode->antecedents[0]= commitNode;
+ wnpNode->antType[0] = rf_trueData;
+ if (nfaults == 2) {
+ RF_ASSERT(wnqNode->numAntecedents == 1);
+ commitNode->succedents[nWndNodes + 1] = wnqNode;
+ wnqNode->antecedents[0] = commitNode;
+ wnqNode->antType[0] = rf_trueData;
+ }
+
+ /* connect the write nodes to the term node */
+ RF_ASSERT(termNode->numAntecedents == nWndNodes + nfaults);
+ RF_ASSERT(termNode->numSuccedents == 0);
+ for (i = 0; i < nWndNodes; i++) {
+ RF_ASSERT(wndNodes[i].numSuccedents == 1);
+ wndNodes[i].succedents[0] = termNode;
+ termNode->antecedents[i] = &wndNodes[i];
+ termNode->antType[i] = rf_control;
+ }
+ RF_ASSERT(wnpNode->numSuccedents == 1);
+ wnpNode->succedents[0] = termNode;
+ termNode->antecedents[nWndNodes] = wnpNode;
+ termNode->antType[nWndNodes] = rf_control;
+ if (nfaults == 2) {
+ RF_ASSERT(wnqNode->numSuccedents == 1);
+ wnqNode->succedents[0] = termNode;
+ termNode->antecedents[nWndNodes + 1] = wnqNode;
+ termNode->antType[nWndNodes + 1] = rf_control;
+ }
+}
+
+/******************************************************************************
+ *
+ * creates a DAG to perform a small-write operation (either raid 5 or pq),
+ * which is as follows:
+ *
+ * Hdr -> Nil -> Rop -> Xor -> Cmt ----> Wnp [Unp] --> Trm
+ * \- Rod X / \----> Wnd [Und]-/
+ * [\- Rod X / \---> Wnd [Und]-/]
+ * [\- Roq -> Q / \--> Wnq [Unq]-/]
+ *
+ * Rop = read old parity
+ * Rod = read old data
+ * Roq = read old "q"
+ * Cmt = commit node
+ * Und = unlock data disk
+ * Unp = unlock parity disk
+ * Unq = unlock q disk
+ * Wnp = write new parity
+ * Wnd = write new data
+ * Wnq = write new "q"
+ * [ ] denotes optional segments in the graph
+ *
+ * Parameters: raidPtr - description of the physical array
+ * asmap - logical & physical addresses for this access
+ * bp - buffer ptr (holds write data)
+ * flags - general flags (e.g. disk locking)
+ * allocList - list of memory allocated in DAG creation
+ * pfuncs - list of parity generating functions
+ * qfuncs - list of q generating functions
+ *
+ * A null qfuncs indicates a single-fault-tolerant array.
+ *****************************************************************************/
+
+void rf_CommonCreateSmallWriteDAG(
+ RF_Raid_t *raidPtr,
+ RF_AccessStripeMap_t *asmap,
+ RF_DagHeader_t *dag_h,
+ void *bp,
+ RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t *allocList,
+ RF_RedFuncs_t *pfuncs,
+ RF_RedFuncs_t *qfuncs)
+{
+ RF_DagNode_t *readDataNodes, *readParityNodes, *readQNodes, *termNode;
+ RF_DagNode_t *unlockDataNodes, *unlockParityNodes, *unlockQNodes;
+ RF_DagNode_t *xorNodes, *qNodes, *blockNode, *commitNode, *nodes;
+ RF_DagNode_t *writeDataNodes, *writeParityNodes, *writeQNodes;
+ int i, j, nNodes, totalNumNodes, lu_flag;
+ RF_ReconUnitNum_t which_ru;
+ int (*func)(RF_DagNode_t *), (*undoFunc)(RF_DagNode_t *);
+ int (*qfunc)(RF_DagNode_t *);
+ int numDataNodes, numParityNodes;
+ RF_StripeNum_t parityStripeID;
+ RF_PhysDiskAddr_t *pda;
+ char *name, *qname;
+ long nfaults;
+
+ nfaults = qfuncs ? 2 : 1;
+ lu_flag = (rf_enableAtomicRMW) ? 1 : 0; /* lock/unlock flag */
+
+ parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout),
+ asmap->raidAddress, &which_ru);
+ pda = asmap->physInfo;
+ numDataNodes = asmap->numStripeUnitsAccessed;
+ numParityNodes = (asmap->parityInfo->next) ? 2 : 1;
+
+ if (rf_dagDebug) {
+ printf("[Creating small-write DAG]\n");
+ }
+ RF_ASSERT(numDataNodes > 0);
+ dag_h->creator = "SmallWriteDAG";
+
+ dag_h->numCommitNodes = 1;
+ dag_h->numCommits = 0;
+ dag_h->numSuccedents = 1;
+
+ /*
+ * DAG creation occurs in four steps:
+ * 1. count the number of nodes in the DAG
+ * 2. create the nodes
+ * 3. initialize the nodes
+ * 4. connect the nodes
+ */
+
+ /*
+ * Step 1. compute number of nodes in the graph
+ */
+
+ /* number of nodes:
+ * a read and write for each data unit
+ * a redundancy computation node for each parity node (nfaults * nparity)
+ * a read and write for each parity unit
+ * a block and commit node (2)
+ * a terminate node
+ * if atomic RMW
+ * an unlock node for each data unit, redundancy unit
+ */
+ totalNumNodes = (2 * numDataNodes) + (nfaults * numParityNodes)
+ + (nfaults * 2 * numParityNodes) + 3;
+ if (lu_flag) {
+ totalNumNodes += (numDataNodes + (nfaults * numParityNodes));
+ }
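+
+ /*
+ * For example, a RAID level 5 small write touching a single data unit
+ * with locking disabled has numDataNodes = 1, numParityNodes = 1,
+ * nfaults = 1, giving 2 + 1 + 2 + 3 = 8 nodes
+ * (Nil, Cmt, Rod, Rop, Xor, Wnd, Wnp, Trm).
+ */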
+
+ /*
+ * Step 2. create the nodes
+ */
+ RF_CallocAndAdd(nodes, totalNumNodes, sizeof(RF_DagNode_t),
+ (RF_DagNode_t *), allocList);
+ i = 0;
+ blockNode = &nodes[i]; i += 1;
+ commitNode = &nodes[i]; i += 1;
+ readDataNodes = &nodes[i]; i += numDataNodes;
+ readParityNodes = &nodes[i]; i += numParityNodes;
+ writeDataNodes = &nodes[i]; i += numDataNodes;
+ writeParityNodes = &nodes[i]; i += numParityNodes;
+ xorNodes = &nodes[i]; i += numParityNodes;
+ termNode = &nodes[i]; i += 1;
+ if (lu_flag) {
+ unlockDataNodes = &nodes[i]; i += numDataNodes;
+ unlockParityNodes = &nodes[i]; i += numParityNodes;
+ }
+ else {
+ unlockDataNodes = unlockParityNodes = NULL;
+ }
+ if (nfaults == 2) {
+ readQNodes = &nodes[i]; i += numParityNodes;
+ writeQNodes = &nodes[i]; i += numParityNodes;
+ qNodes = &nodes[i]; i += numParityNodes;
+ if (lu_flag) {
+ unlockQNodes = &nodes[i]; i += numParityNodes;
+ }
+ else {
+ unlockQNodes = NULL;
+ }
+ }
+ else {
+ readQNodes = writeQNodes = qNodes = unlockQNodes = NULL;
+ }
+ RF_ASSERT(i == totalNumNodes);
+
+ /*
+ * Step 3. initialize the nodes
+ */
+ /* initialize block node (Nil) */
+ nNodes = numDataNodes + (nfaults * numParityNodes);
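+ /*
+ * nNodes is both the number of read nodes (Rod + Rop [+ Roq]) and,
+ * coincidentally equal, the number of write nodes (Wnd + Wnp [+ Wnq]);
+ * it serves as the fan-out of Nil and Cmt and the fan-in of Trm.
+ */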
+ rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
+ NULL, nNodes, 0, 0, 0, dag_h, "Nil", allocList);
+
+ /* initialize commit node (Cmt) */
+ rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
+ NULL, nNodes, (nfaults * numParityNodes), 0, 0, dag_h, "Cmt", allocList);
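+ /*
+ * Cmt's antecedents are the Xor (and Q) nodes and its successors are
+ * all of the write nodes, so no disk is written until every new
+ * redundancy value has been computed.
+ */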
+
+ /* initialize terminate node (Trm) */
+ rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc,
+ NULL, 0, nNodes, 0, 0, dag_h, "Trm", allocList);
+
+ /* initialize nodes which read old data (Rod) */
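+ /*
+ * Note: params[3] of each disk I/O node packs the I/O priority, a lock
+ * flag, an unlock flag, and the reconstruction unit into one value via
+ * RF_CREATE_PARAM3 (argument order as used throughout this file). The
+ * reads pass lu_flag in the lock slot so that, when atomic RMW is
+ * enabled, the disk queue stays locked until the corresponding
+ * Und/Unp/Unq node releases it.
+ */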
+ for (i = 0; i < numDataNodes; i++) {
+ rf_InitNode(&readDataNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc,
+ rf_GenericWakeupFunc, (nfaults * numParityNodes), 1, 4, 0, dag_h,
+ "Rod", allocList);
+ RF_ASSERT(pda != NULL);
+ /* physical disk addr desc */
+ readDataNodes[i].params[0].p = pda;
+ /* buffer to hold old data */
+ readDataNodes[i].params[1].p = rf_AllocBuffer(raidPtr,
+ dag_h, pda, allocList);
+ readDataNodes[i].params[2].v = parityStripeID;
+ readDataNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
+ lu_flag, 0, which_ru);
+ pda = pda->next;
+ for (j = 0; j < readDataNodes[i].numSuccedents; j++) {
+ readDataNodes[i].propList[j] = NULL;
+ }
+ }
+
+ /* initialize nodes which read old parity (Rop) */
+ pda = asmap->parityInfo; i = 0;
+ for (i = 0; i < numParityNodes; i++) {
+ RF_ASSERT(pda != NULL);
+ rf_InitNode(&readParityNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc,
+ rf_DiskReadUndoFunc, rf_GenericWakeupFunc, numParityNodes, 1, 4,
+ 0, dag_h, "Rop", allocList);
+ readParityNodes[i].params[0].p = pda;
+ /* buffer to hold old parity */
+ readParityNodes[i].params[1].p = rf_AllocBuffer(raidPtr,
+ dag_h, pda, allocList);
+ readParityNodes[i].params[2].v = parityStripeID;
+ readParityNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
+ lu_flag, 0, which_ru);
+ pda = pda->next;
+ for (j = 0; j < readParityNodes[i].numSuccedents; j++) {
+ readParityNodes[i].propList[j] = NULL;
+ }
+ }
+
+ /* initialize nodes which read old Q (Roq) */
+ if (nfaults == 2) {
+ pda = asmap->qInfo;
+ for (i = 0; i < numParityNodes; i++) {
+ RF_ASSERT(pda != NULL);
+ rf_InitNode(&readQNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc,
+ rf_GenericWakeupFunc, numParityNodes, 1, 4, 0, dag_h, "Roq", allocList);
+ readQNodes[i].params[0].p = pda;
+ /* buffer to hold old Q */
+ readQNodes[i].params[1].p = rf_AllocBuffer(raidPtr, dag_h, pda,
+ allocList);
+ readQNodes[i].params[2].v = parityStripeID;
+ readQNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
+ lu_flag, 0, which_ru);
+ pda = pda->next;
+ for (j = 0; j < readQNodes[i].numSuccedents; j++) {
+ readQNodes[i].propList[j] = NULL;
+ }
+ }
+ }
+
+ /* initialize nodes which write new data (Wnd) */
+ pda = asmap->physInfo;
+ for (i=0; i < numDataNodes; i++) {
+ RF_ASSERT(pda != NULL);
+ rf_InitNode(&writeDataNodes[i], rf_wait, RF_FALSE, rf_DiskWriteFunc,
+ rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h,
+ "Wnd", allocList);
+ /* physical disk addr desc */
+ writeDataNodes[i].params[0].p = pda;
+ /* buffer holding new data to be written */
+ writeDataNodes[i].params[1].p = pda->bufPtr;
+ writeDataNodes[i].params[2].v = parityStripeID;
+ writeDataNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
+ 0, 0, which_ru);
+ if (lu_flag) {
+ /* initialize node to unlock the disk queue */
+ rf_InitNode(&unlockDataNodes[i], rf_wait, RF_FALSE, rf_DiskUnlockFunc,
+ rf_DiskUnlockUndoFunc, rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h,
+ "Und", allocList);
+ /* physical disk addr desc */
+ unlockDataNodes[i].params[0].p = pda;
+ unlockDataNodes[i].params[1].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
+ 0, lu_flag, which_ru);
+ }
+ pda = pda->next;
+ }
+
+ /*
+ * Initialize nodes which compute new parity and Q.
+ */
+ /*
+ * We use the simple XOR func in the double-XOR case, and when
+ * we're accessing only a portion of one stripe unit. The distinction
+ * between the two is that the regular XOR func assumes that the targbuf
+ * is a full SU in size, and examines the pda associated with the buffer
+ * to decide where within the buffer to XOR the data, whereas
+ * the simple XOR func just XORs the data into the start of the buffer.
+ */
+ if ((numParityNodes==2) || ((numDataNodes == 1)
+ && (asmap->totalSectorsAccessed < raidPtr->Layout.sectorsPerStripeUnit)))
+ {
+ func = pfuncs->simple;
+ undoFunc = rf_NullNodeUndoFunc;
+ name = pfuncs->SimpleName;
+ if (qfuncs) {
+ qfunc = qfuncs->simple;
+ qname = qfuncs->SimpleName;
+ }
+ else {
+ qfunc = NULL;
+ qname = NULL;
+ }
+ }
+ else {
+ func = pfuncs->regular;
+ undoFunc = rf_NullNodeUndoFunc;
+ name = pfuncs->RegularName;
+ if (qfuncs) {
+ qfunc = qfuncs->regular;
+ qname = qfuncs->RegularName;
+ }
+ else {
+ qfunc = NULL;
+ qname = NULL;
+ }
+ }
+ /*
+ * Initialize the xor nodes: params are {pda,buf}
+ * from {Rod,Wnd,Rop} nodes, and raidPtr
+ */
+ if (numParityNodes==2) {
+ /* double-xor case */
+ for (i=0; i < numParityNodes; i++) {
+ /* note: no wakeup func for xor */
+ rf_InitNode(&xorNodes[i], rf_wait, RF_FALSE, func, undoFunc, NULL,
+ 1, (numDataNodes + numParityNodes), 7, 1, dag_h, name, allocList);
+ xorNodes[i].flags |= RF_DAGNODE_FLAG_YIELD;
+ xorNodes[i].params[0] = readDataNodes[i].params[0];
+ xorNodes[i].params[1] = readDataNodes[i].params[1];
+ xorNodes[i].params[2] = readParityNodes[i].params[0];
+ xorNodes[i].params[3] = readParityNodes[i].params[1];
+ xorNodes[i].params[4] = writeDataNodes[i].params[0];
+ xorNodes[i].params[5] = writeDataNodes[i].params[1];
+ xorNodes[i].params[6].p = raidPtr;
+ /* use old parity buf as target buf */
+ xorNodes[i].results[0] = readParityNodes[i].params[1].p;
+ if (nfaults == 2) {
+ /* note: no wakeup func for qor */
+ rf_InitNode(&qNodes[i], rf_wait, RF_FALSE, qfunc, undoFunc, NULL, 1,
+ (numDataNodes + numParityNodes), 7, 1, dag_h, qname, allocList);
+ qNodes[i].params[0] = readDataNodes[i].params[0];
+ qNodes[i].params[1] = readDataNodes[i].params[1];
+ qNodes[i].params[2] = readQNodes[i].params[0];
+ qNodes[i].params[3] = readQNodes[i].params[1];
+ qNodes[i].params[4] = writeDataNodes[i].params[0];
+ qNodes[i].params[5] = writeDataNodes[i].params[1];
+ qNodes[i].params[6].p = raidPtr;
+ /* use old Q buf as target buf */
+ qNodes[i].results[0] = readQNodes[i].params[1].p;
+ }
+ }
+ }
+ else {
+ /* there is only one xor node in this case */
+ rf_InitNode(&xorNodes[0], rf_wait, RF_FALSE, func, undoFunc, NULL, 1,
+ (numDataNodes + numParityNodes),
+ (2 * (numDataNodes + numDataNodes + 1) + 1), 1, dag_h, name, allocList);
+ xorNodes[0].flags |= RF_DAGNODE_FLAG_YIELD;
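+ /*
+ * The loop below runs to i == numDataNodes: since the node array was
+ * carved out contiguously in step 2, readDataNodes[numDataNodes] is the
+ * same node as readParityNodes[0], so the final iteration picks up the
+ * Rop node's {pda, buffer} pair.
+ */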
+ for (i=0; i < numDataNodes + 1; i++) {
+ /* set up params related to Rod and Rop nodes */
+ xorNodes[0].params[2*i+0] = readDataNodes[i].params[0]; /* pda */
+ xorNodes[0].params[2*i+1] = readDataNodes[i].params[1]; /* buffer ptr */
+ }
+ for (i=0; i < numDataNodes; i++) {
+ /* set up params related to Wnd and Wnp nodes */
+ xorNodes[0].params[2*(numDataNodes+1+i)+0] = /* pda */
+ writeDataNodes[i].params[0];
+ xorNodes[0].params[2*(numDataNodes+1+i)+1] = /* buffer ptr */
+ writeDataNodes[i].params[1];
+ }
+ /* xor node needs to get at RAID information */
+ xorNodes[0].params[2*(numDataNodes+numDataNodes+1)].p = raidPtr;
+ xorNodes[0].results[0] = readParityNodes[0].params[1].p;
+ if (nfaults == 2) {
+ rf_InitNode(&qNodes[0], rf_wait, RF_FALSE, qfunc, undoFunc, NULL, 1,
+ (numDataNodes + numParityNodes),
+ (2 * (numDataNodes + numDataNodes + 1) + 1), 1, dag_h,
+ qname, allocList);
+ for (i=0; i<numDataNodes; i++) {
+ /* set up params related to Rod */
+ qNodes[0].params[2*i+0] = readDataNodes[i].params[0]; /* pda */
+ qNodes[0].params[2*i+1] = readDataNodes[i].params[1]; /* buffer ptr */
+ }
+ /* and read old q */
+ qNodes[0].params[2*numDataNodes + 0] = /* pda */
+ readQNodes[0].params[0];
+ qNodes[0].params[2*numDataNodes + 1] = /* buffer ptr */
+ readQNodes[0].params[1];
+ for (i=0; i < numDataNodes; i++) {
+ /* set up params related to Wnd nodes */
+ qNodes[0].params[2*(numDataNodes+1+i)+0] = /* pda */
+ writeDataNodes[i].params[0];
+ qNodes[0].params[2*(numDataNodes+1+i)+1] = /* buffer ptr */
+ writeDataNodes[i].params[1];
+ }
+ /* xor node needs to get at RAID information */
+ qNodes[0].params[2*(numDataNodes+numDataNodes+1)].p = raidPtr;
+ qNodes[0].results[0] = readQNodes[0].params[1].p;
+ }
+ }
+
+ /* initialize nodes which write new parity (Wnp) */
+ pda = asmap->parityInfo;
+ for (i=0; i < numParityNodes; i++) {
+ rf_InitNode(&writeParityNodes[i], rf_wait, RF_FALSE, rf_DiskWriteFunc,
+ rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h,
+ "Wnp", allocList);
+ RF_ASSERT(pda != NULL);
+ writeParityNodes[i].params[0].p = pda; /* param 1 (bufPtr) filled in by xor node */
+ writeParityNodes[i].params[1].p = xorNodes[i].results[0]; /* buffer pointer for parity write operation */
+ writeParityNodes[i].params[2].v = parityStripeID;
+ writeParityNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
+ 0, 0, which_ru);
+ if (lu_flag) {
+ /* initialize node to unlock the disk queue */
+ rf_InitNode(&unlockParityNodes[i], rf_wait, RF_FALSE, rf_DiskUnlockFunc,
+ rf_DiskUnlockUndoFunc, rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h,
+ "Unp", allocList);
+ unlockParityNodes[i].params[0].p = pda; /* physical disk addr desc */
+ unlockParityNodes[i].params[1].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
+ 0, lu_flag, which_ru);
+ }
+ pda = pda->next;
+ }
+
+ /* initialize nodes which write new Q (Wnq) */
+ if (nfaults == 2) {
+ pda = asmap->qInfo;
+ for (i=0; i < numParityNodes; i++) {
+ rf_InitNode(&writeQNodes[i], rf_wait, RF_FALSE, rf_DiskWriteFunc,
+ rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h,
+ "Wnq", allocList);
+ RF_ASSERT(pda != NULL);
+ writeQNodes[i].params[0].p = pda; /* param 1 (bufPtr) filled in by xor node */
+ writeQNodes[i].params[1].p = qNodes[i].results[0]; /* buffer pointer for parity write operation */
+ writeQNodes[i].params[2].v = parityStripeID;
+ writeQNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
+ 0, 0, which_ru);
+ if (lu_flag) {
+ /* initialize node to unlock the disk queue */
+ rf_InitNode(&unlockQNodes[i], rf_wait, RF_FALSE, rf_DiskUnlockFunc,
+ rf_DiskUnlockUndoFunc, rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h,
+ "Unq", allocList);
+ unlockQNodes[i].params[0].p = pda; /* physical disk addr desc */
+ unlockQNodes[i].params[1].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
+ 0, lu_flag, which_ru);
+ }
+ pda = pda->next;
+ }
+ }
+
+ /*
+ * Step 4. connect the nodes.
+ */
+
+ /* connect header to block node */
+ dag_h->succedents[0] = blockNode;
+
+ /* connect block node to read old data nodes */
+ RF_ASSERT(blockNode->numSuccedents == (numDataNodes + (numParityNodes * nfaults)));
+ for (i = 0; i < numDataNodes; i++) {
+ blockNode->succedents[i] = &readDataNodes[i];
+ RF_ASSERT(readDataNodes[i].numAntecedents == 1);
+ readDataNodes[i].antecedents[0]= blockNode;
+ readDataNodes[i].antType[0] = rf_control;
+ }
+
+ /* connect block node to read old parity nodes */
+ for (i = 0; i < numParityNodes; i++) {
+ blockNode->succedents[numDataNodes + i] = &readParityNodes[i];
+ RF_ASSERT(readParityNodes[i].numAntecedents == 1);
+ readParityNodes[i].antecedents[0] = blockNode;
+ readParityNodes[i].antType[0] = rf_control;
+ }
+
+ /* connect block node to read old Q nodes */
+ if (nfaults == 2) {
+ for (i = 0; i < numParityNodes; i++) {
+ blockNode->succedents[numDataNodes + numParityNodes + i] = &readQNodes[i];
+ RF_ASSERT(readQNodes[i].numAntecedents == 1);
+ readQNodes[i].antecedents[0] = blockNode;
+ readQNodes[i].antType[0] = rf_control;
+ }
+ }
+
+ /* connect read old data nodes to xor nodes */
+ for (i = 0; i < numDataNodes; i++) {
+ RF_ASSERT(readDataNodes[i].numSuccedents == (nfaults * numParityNodes));
+ for (j = 0; j < numParityNodes; j++){
+ RF_ASSERT(xorNodes[j].numAntecedents == numDataNodes + numParityNodes);
+ readDataNodes[i].succedents[j] = &xorNodes[j];
+ xorNodes[j].antecedents[i] = &readDataNodes[i];
+ xorNodes[j].antType[i] = rf_trueData;
+ }
+ }
+
+ /* connect read old data nodes to q nodes */
+ if (nfaults == 2) {
+ for (i = 0; i < numDataNodes; i++) {
+ for (j = 0; j < numParityNodes; j++) {
+ RF_ASSERT(qNodes[j].numAntecedents == numDataNodes + numParityNodes);
+ readDataNodes[i].succedents[numParityNodes + j] = &qNodes[j];
+ qNodes[j].antecedents[i] = &readDataNodes[i];
+ qNodes[j].antType[i] = rf_trueData;
+ }
+ }
+ }
+
+ /* connect read old parity nodes to xor nodes */
+ for (i = 0; i < numParityNodes; i++) {
+ RF_ASSERT(readParityNodes[i].numSuccedents == numParityNodes);
+ for (j = 0; j < numParityNodes; j++) {
+ readParityNodes[i].succedents[j] = &xorNodes[j];
+ xorNodes[j].antecedents[numDataNodes + i] = &readParityNodes[i];
+ xorNodes[j].antType[numDataNodes + i] = rf_trueData;
+ }
+ }
+
+ /* connect read old q nodes to q nodes */
+ if (nfaults == 2) {
+ for (i = 0; i < numParityNodes; i++) {
+ RF_ASSERT(readQNodes[i].numSuccedents == numParityNodes);
+ for (j = 0; j < numParityNodes; j++) {
+ readQNodes[i].succedents[j] = &qNodes[j];
+ qNodes[j].antecedents[numDataNodes + i] = &readQNodes[i];
+ qNodes[j].antType[numDataNodes + i] = rf_trueData;
+ }
+ }
+ }
+
+ /* connect xor nodes to commit node */
+ RF_ASSERT(commitNode->numAntecedents == (nfaults * numParityNodes));
+ for (i = 0; i < numParityNodes; i++) {
+ RF_ASSERT(xorNodes[i].numSuccedents == 1);
+ xorNodes[i].succedents[0] = commitNode;
+ commitNode->antecedents[i] = &xorNodes[i];
+ commitNode->antType[i] = rf_control;
+ }
+
+ /* connect q nodes to commit node */
+ if (nfaults == 2) {
+ for (i = 0; i < numParityNodes; i++) {
+ RF_ASSERT(qNodes[i].numSuccedents == 1);
+ qNodes[i].succedents[0] = commitNode;
+ commitNode->antecedents[i + numParityNodes] = &qNodes[i];
+ commitNode->antType[i + numParityNodes] = rf_control;
+ }
+ }
+
+ /* connect commit node to write nodes */
+ RF_ASSERT(commitNode->numSuccedents == (numDataNodes + (nfaults * numParityNodes)));
+ for (i = 0; i < numDataNodes; i++) {
+ RF_ASSERT(writeDataNodes[i].numAntecedents == 1);
+ commitNode->succedents[i] = &writeDataNodes[i];
+ writeDataNodes[i].antecedents[0] = commitNode;
+ writeDataNodes[i].antType[0] = rf_trueData;
+ }
+ for (i = 0; i < numParityNodes; i++) {
+ RF_ASSERT(writeParityNodes[i].numAntecedents == 1);
+ commitNode->succedents[i + numDataNodes] = &writeParityNodes[i];
+ writeParityNodes[i].antecedents[0] = commitNode;
+ writeParityNodes[i].antType[0] = rf_trueData;
+ }
+ if (nfaults == 2) {
+ for (i = 0; i < numParityNodes; i++) {
+ RF_ASSERT(writeQNodes[i].numAntecedents == 1);
+ commitNode->succedents[i + numDataNodes + numParityNodes] = &writeQNodes[i];
+ writeQNodes[i].antecedents[0] = commitNode;
+ writeQNodes[i].antType[0] = rf_trueData;
+ }
+ }
+
+ RF_ASSERT(termNode->numAntecedents == (numDataNodes + (nfaults * numParityNodes)));
+ RF_ASSERT(termNode->numSuccedents == 0);
+ for (i = 0; i < numDataNodes; i++) {
+ if (lu_flag) {
+ /* connect write new data nodes to unlock nodes */
+ RF_ASSERT(writeDataNodes[i].numSuccedents == 1);
+ RF_ASSERT(unlockDataNodes[i].numAntecedents == 1);
+ writeDataNodes[i].succedents[0] = &unlockDataNodes[i];
+ unlockDataNodes[i].antecedents[0] = &writeDataNodes[i];
+ unlockDataNodes[i].antType[0] = rf_control;
+
+ /* connect unlock nodes to term node */
+ RF_ASSERT(unlockDataNodes[i].numSuccedents == 1);
+ unlockDataNodes[i].succedents[0] = termNode;
+ termNode->antecedents[i] = &unlockDataNodes[i];
+ termNode->antType[i] = rf_control;
+ }
+ else {
+ /* connect write new data nodes to term node */
+ RF_ASSERT(writeDataNodes[i].numSuccedents == 1);
+ RF_ASSERT(termNode->numAntecedents == (numDataNodes + (nfaults * numParityNodes)));
+ writeDataNodes[i].succedents[0] = termNode;
+ termNode->antecedents[i] = &writeDataNodes[i];
+ termNode->antType[i] = rf_control;
+ }
+ }
+
+ for (i = 0; i < numParityNodes; i++) {
+ if (lu_flag) {
+ /* connect write new parity nodes to unlock nodes */
+ RF_ASSERT(writeParityNodes[i].numSuccedents == 1);
+ RF_ASSERT(unlockParityNodes[i].numAntecedents == 1);
+ writeParityNodes[i].succedents[0] = &unlockParityNodes[i];
+ unlockParityNodes[i].antecedents[0] = &writeParityNodes[i];
+ unlockParityNodes[i].antType[0] = rf_control;
+
+ /* connect unlock nodes to term node */
+ RF_ASSERT(unlockParityNodes[i].numSuccedents == 1);
+ unlockParityNodes[i].succedents[0] = termNode;
+ termNode->antecedents[numDataNodes + i] = &unlockParityNodes[i];
+ termNode->antType[numDataNodes + i] = rf_control;
+ }
+ else {
+ RF_ASSERT(writeParityNodes[i].numSuccedents == 1);
+ writeParityNodes[i].succedents[0] = termNode;
+ termNode->antecedents[numDataNodes + i] = &writeParityNodes[i];
+ termNode->antType[numDataNodes + i] = rf_control;
+ }
+ }
+
+ if (nfaults == 2) {
+ for (i = 0; i < numParityNodes; i++) {
+ if (lu_flag) {
+ /* connect write new Q nodes to unlock nodes */
+ RF_ASSERT(writeQNodes[i].numSuccedents == 1);
+ RF_ASSERT(unlockQNodes[i].numAntecedents == 1);
+ writeQNodes[i].succedents[0] = &unlockQNodes[i];
+ unlockQNodes[i].antecedents[0] = &writeQNodes[i];
+ unlockQNodes[i].antType[0] = rf_control;
+
+ /* connect unlock nodes to term node */
+ RF_ASSERT(unlockQNodes[i].numSuccedents == 1);
+ unlockQNodes[i].succedents[0] = termNode;
+ termNode->antecedents[numDataNodes + numParityNodes + i] = &unlockQNodes[i];
+ termNode->antType[numDataNodes + numParityNodes + i] = rf_control;
+ }
+ else {
+ RF_ASSERT(writeQNodes[i].numSuccedents == 1);
+ writeQNodes[i].succedents[0] = termNode;
+ termNode->antecedents[numDataNodes + numParityNodes + i] = &writeQNodes[i];
+ termNode->antType[numDataNodes + numParityNodes + i] = rf_control;
+ }
+ }
+ }
+}
+
+
+/******************************************************************************
+ * create a write graph (fault-free or degraded) for RAID level 1
+ *
+ * Hdr -> Commit -> Wpd -> Nil -> Trm
+ * -> Wsd ->
+ *
+ * The "Wpd" node writes data to the primary copy in the mirror pair
+ * The "Wsd" node writes data to the secondary copy in the mirror pair
+ *
+ * Parameters: raidPtr - description of the physical array
+ * asmap - logical & physical addresses for this access
+ * bp - buffer ptr (holds write data)
+ * flags - general flags (e.g. disk locking)
+ * allocList - list of memory allocated in DAG creation
+ *****************************************************************************/
+
+void rf_CreateRaidOneWriteDAG(
+ RF_Raid_t *raidPtr,
+ RF_AccessStripeMap_t *asmap,
+ RF_DagHeader_t *dag_h,
+ void *bp,
+ RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t *allocList)
+{
+ RF_DagNode_t *unblockNode, *termNode, *commitNode;
+ RF_DagNode_t *nodes, *wndNode, *wmirNode;
+ int nWndNodes, nWmirNodes, i;
+ RF_ReconUnitNum_t which_ru;
+ RF_PhysDiskAddr_t *pda, *pdaP;
+ RF_StripeNum_t parityStripeID;
+
+ parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout),
+ asmap->raidAddress, &which_ru);
+ if (rf_dagDebug) {
+ printf("[Creating RAID level 1 write DAG]\n");
+ }
+ dag_h->creator = "RaidOneWriteDAG";
+
+ /* 2 implies access not SU aligned */
+ nWmirNodes = (asmap->parityInfo->next) ? 2 : 1;
+ nWndNodes = (asmap->physInfo->next) ? 2 : 1;
+
+ /* in degraded mode, omit the write to whichever copy has failed */
+ if (asmap->numDataFailed == 1)
+ nWndNodes--;
+ if (asmap->numParityFailed == 1)
+ nWmirNodes--;
+
+ /* total number of nodes = nWndNodes + nWmirNodes + (commit + unblock + terminator) */
+ RF_CallocAndAdd(nodes, nWndNodes + nWmirNodes + 3, sizeof(RF_DagNode_t),
+ (RF_DagNode_t *), allocList);
+ i = 0;
+ wndNode = &nodes[i]; i += nWndNodes;
+ wmirNode = &nodes[i]; i += nWmirNodes;
+ commitNode = &nodes[i]; i += 1;
+ unblockNode = &nodes[i]; i += 1;
+ termNode = &nodes[i]; i += 1;
+ RF_ASSERT(i == (nWndNodes + nWmirNodes + 3));
+
+ /* this dag can commit immediately */
+ dag_h->numCommitNodes = 1;
+ dag_h->numCommits = 0;
+ dag_h->numSuccedents = 1;
+
+ /* initialize the commit, unblock, and term nodes */
+ rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
+ NULL, (nWndNodes + nWmirNodes), 0, 0, 0, dag_h, "Cmt", allocList);
+ rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
+ NULL, 1, (nWndNodes + nWmirNodes), 0, 0, dag_h, "Nil", allocList);
+ rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc,
+ NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
+
+ /* initialize the wnd nodes */
+ if (nWndNodes > 0) {
+ pda = asmap->physInfo;
+ for (i = 0; i < nWndNodes; i++) {
+ rf_InitNode(&wndNode[i], rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
+ rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wpd", allocList);
+ RF_ASSERT(pda != NULL);
+ wndNode[i].params[0].p = pda;
+ wndNode[i].params[1].p = pda->bufPtr;
+ wndNode[i].params[2].v = parityStripeID;
+ wndNode[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ pda = pda->next;
+ }
+ RF_ASSERT(pda == NULL);
+ }
+
+ /* initialize the mirror nodes */
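+ /*
+ * Each Wsd node writes the same buffer as the corresponding Wpd node
+ * (pda->bufPtr), but targets the mirror copy described by the parity
+ * pda chain.
+ */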
+ if (nWmirNodes > 0) {
+ pda = asmap->physInfo;
+ pdaP = asmap->parityInfo;
+ for (i = 0; i < nWmirNodes; i++) {
+ rf_InitNode(&wmirNode[i], rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
+ rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wsd", allocList);
+ RF_ASSERT(pda != NULL);
+ wmirNode[i].params[0].p = pdaP;
+ wmirNode[i].params[1].p = pda->bufPtr;
+ wmirNode[i].params[2].v = parityStripeID;
+ wmirNode[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ pda = pda->next;
+ pdaP = pdaP->next;
+ }
+ RF_ASSERT(pda == NULL);
+ RF_ASSERT(pdaP == NULL);
+ }
+
+ /* link the header node to the commit node */
+ RF_ASSERT(dag_h->numSuccedents == 1);
+ RF_ASSERT(commitNode->numAntecedents == 0);
+ dag_h->succedents[0] = commitNode;
+
+ /* link the commit node to the write nodes */
+ RF_ASSERT(commitNode->numSuccedents == (nWndNodes + nWmirNodes));
+ for (i = 0; i < nWndNodes; i++) {
+ RF_ASSERT(wndNode[i].numAntecedents == 1);
+ commitNode->succedents[i] = &wndNode[i];
+ wndNode[i].antecedents[0] = commitNode;
+ wndNode[i].antType[0] = rf_control;
+ }
+ for (i = 0; i < nWmirNodes; i++) {
+ RF_ASSERT(wmirNode[i].numAntecedents == 1);
+ commitNode->succedents[i + nWndNodes] = &wmirNode[i];
+ wmirNode[i].antecedents[0] = commitNode;
+ wmirNode[i].antType[0] = rf_control;
+ }
+
+ /* link the write nodes to the unblock node */
+ RF_ASSERT(unblockNode->numAntecedents == (nWndNodes + nWmirNodes));
+ for (i = 0; i < nWndNodes; i++) {
+ RF_ASSERT(wndNode[i].numSuccedents == 1);
+ wndNode[i].succedents[0] = unblockNode;
+ unblockNode->antecedents[i] = &wndNode[i];
+ unblockNode->antType[i] = rf_control;
+ }
+ for (i = 0; i < nWmirNodes; i++) {
+ RF_ASSERT(wmirNode[i].numSuccedents == 1);
+ wmirNode[i].succedents[0] = unblockNode;
+ unblockNode->antecedents[i + nWndNodes] = &wmirNode[i];
+ unblockNode->antType[i + nWndNodes] = rf_control;
+ }
+
+ /* link the unblock node to the term node */
+ RF_ASSERT(unblockNode->numSuccedents == 1);
+ RF_ASSERT(termNode->numAntecedents == 1);
+ RF_ASSERT(termNode->numSuccedents == 0);
+ unblockNode->succedents[0] = termNode;
+ termNode->antecedents[0] = unblockNode;
+ termNode->antType[0] = rf_control;
+}
+
+
+
+/* DAGs which have no commit points.
+ *
+ * The following DAGs are used in forward and backward error recovery experiments.
+ * They are identical to the DAGs above this comment except that the
+ * commit points have been removed.
+ */
+
+
+
+void rf_CommonCreateLargeWriteDAGFwd(
+ RF_Raid_t *raidPtr,
+ RF_AccessStripeMap_t *asmap,
+ RF_DagHeader_t *dag_h,
+ void *bp,
+ RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t *allocList,
+ int nfaults,
+ int (*redFunc)(RF_DagNode_t *),
+ int allowBufferRecycle)
+{
+ RF_DagNode_t *nodes, *wndNodes, *rodNodes, *xorNode, *wnpNode;
+ RF_DagNode_t *wnqNode, *blockNode, *syncNode, *termNode;
+ int nWndNodes, nRodNodes, i, nodeNum, asmNum;
+ RF_AccessStripeMapHeader_t *new_asm_h[2];
+ RF_StripeNum_t parityStripeID;
+ char *sosBuffer, *eosBuffer;
+ RF_ReconUnitNum_t which_ru;
+ RF_RaidLayout_t *layoutPtr;
+ RF_PhysDiskAddr_t *pda;
+
+ layoutPtr = &(raidPtr->Layout);
+ parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout), asmap->raidAddress, &which_ru);
+
+ if (rf_dagDebug)
+ printf("[Creating large-write DAG]\n");
+ dag_h->creator = "LargeWriteDAGFwd";
+
+ dag_h->numCommitNodes = 0;
+ dag_h->numCommits = 0;
+ dag_h->numSuccedents = 1;
+
+ /* alloc the nodes: Wnd, xor, commit, block, term, and Wnp */
+ nWndNodes = asmap->numStripeUnitsAccessed;
+ RF_CallocAndAdd(nodes, nWndNodes + 4 + nfaults, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList);
+ i = 0;
+ wndNodes = &nodes[i]; i += nWndNodes;
+ xorNode = &nodes[i]; i += 1;
+ wnpNode = &nodes[i]; i += 1;
+ blockNode = &nodes[i]; i += 1;
+ syncNode = &nodes[i]; i += 1;
+ termNode = &nodes[i]; i += 1;
+ if (nfaults == 2) {
+ wnqNode = &nodes[i]; i += 1;
+ }
+ else {
+ wnqNode = NULL;
+ }
+ rf_MapUnaccessedPortionOfStripe(raidPtr, layoutPtr, asmap, dag_h, new_asm_h, &nRodNodes, &sosBuffer, &eosBuffer, allocList);
+ if (nRodNodes > 0) {
+ RF_CallocAndAdd(rodNodes, nRodNodes, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList);
+ }
+ else {
+ rodNodes = NULL;
+ }
+
+ /* begin node initialization */
+ if (nRodNodes > 0) {
+ rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nRodNodes, 0, 0, 0, dag_h, "Nil", allocList);
+ rf_InitNode(syncNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nWndNodes + 1, nRodNodes, 0, 0, dag_h, "Nil", allocList);
+ }
+ else {
+ rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, 1, 0, 0, 0, dag_h, "Nil", allocList);
+ rf_InitNode(syncNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nWndNodes + 1, 1, 0, 0, dag_h, "Nil", allocList);
+ }
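+
+ /* with no commit point, the sync Nil node simply enforces ordering:
+ * every Rod read completes before any Wnd write or the Xor starts */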
+
+ rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, nWndNodes + nfaults, 0, 0, dag_h, "Trm", allocList);
+
+ /* initialize the Rod nodes */
+ for (nodeNum = asmNum = 0; asmNum < 2; asmNum++) {
+ if (new_asm_h[asmNum]) {
+ pda = new_asm_h[asmNum]->stripeMap->physInfo;
+ while (pda) {
+ rf_InitNode(&rodNodes[nodeNum], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rod", allocList);
+ rodNodes[nodeNum].params[0].p = pda;
+ rodNodes[nodeNum].params[1].p = pda->bufPtr;
+ rodNodes[nodeNum].params[2].v = parityStripeID;
+ rodNodes[nodeNum].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ nodeNum++;
+ pda=pda->next;
+ }
+ }
+ }
+ RF_ASSERT(nodeNum == nRodNodes);
+
+ /* initialize the wnd nodes */
+ pda = asmap->physInfo;
+ for (i=0; i < nWndNodes; i++) {
+ rf_InitNode(&wndNodes[i], rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnd", allocList);
+ RF_ASSERT(pda != NULL);
+ wndNodes[i].params[0].p = pda;
+ wndNodes[i].params[1].p = pda->bufPtr;
+ wndNodes[i].params[2].v = parityStripeID;
+ wndNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ pda = pda->next;
+ }
+
+ /* initialize the redundancy node */
+ rf_InitNode(xorNode, rf_wait, RF_FALSE, redFunc, rf_NullNodeUndoFunc, NULL, 1, nfaults, 2 * (nWndNodes + nRodNodes) + 1, nfaults, dag_h, "Xr ", allocList);
+ xorNode->flags |= RF_DAGNODE_FLAG_YIELD;
+ for (i=0; i < nWndNodes; i++) {
+ xorNode->params[2*i+0] = wndNodes[i].params[0]; /* pda */
+ xorNode->params[2*i+1] = wndNodes[i].params[1]; /* buf ptr */
+ }
+ for (i=0; i < nRodNodes; i++) {
+ xorNode->params[2*(nWndNodes+i)+0] = rodNodes[i].params[0]; /* pda */
+ xorNode->params[2*(nWndNodes+i)+1] = rodNodes[i].params[1]; /* buf ptr */
+ }
+ xorNode->params[2*(nWndNodes+nRodNodes)].p = raidPtr; /* xor node needs to get at RAID information */
+
+ /* look for an Rod node that reads a complete SU. If none, alloc a buffer to receive the parity info.
+ * Note that we can't use a new data buffer because it will not have gotten written when the xor occurs.
+ */
+ if (allowBufferRecycle) {
+ for (i = 0; i < nRodNodes; i++)
+ if (((RF_PhysDiskAddr_t *) rodNodes[i].params[0].p)->numSector == raidPtr->Layout.sectorsPerStripeUnit)
+ break;
+ }
+ if ((!allowBufferRecycle) || (i == nRodNodes)) {
+ RF_CallocAndAdd(xorNode->results[0], 1, rf_RaidAddressToByte(raidPtr, raidPtr->Layout.sectorsPerStripeUnit), (void *), allocList);
+ }
+ else
+ xorNode->results[0] = rodNodes[i].params[1].p;
+
+ /* initialize the Wnp node */
+ rf_InitNode(wnpNode, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnp", allocList);
+ wnpNode->params[0].p = asmap->parityInfo;
+ wnpNode->params[1].p = xorNode->results[0];
+ wnpNode->params[2].v = parityStripeID;
+ wnpNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ RF_ASSERT(asmap->parityInfo->next == NULL); /* parityInfo must describe entire parity unit */
+
+ if (nfaults == 2)
+ {
+ /* we never try to recycle a buffer for the Q calculation in addition to the parity.
+ This would cause two buffers to get smashed during the P and Q calculation,
+ guaranteeing one would be wrong.
+ */
+ RF_CallocAndAdd(xorNode->results[1], 1, rf_RaidAddressToByte(raidPtr, raidPtr->Layout.sectorsPerStripeUnit), (void *), allocList);
+ rf_InitNode(wnqNode, rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnq", allocList);
+ wnqNode->params[0].p = asmap->qInfo;
+ wnqNode->params[1].p = xorNode->results[1];
+ wnqNode->params[2].v = parityStripeID;
+ wnqNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ RF_ASSERT(asmap->parityInfo->next == NULL); /* parityInfo must describe entire parity unit */
+ }
+
+
+ /* connect nodes to form graph */
+
+ /* connect dag header to block node */
+ RF_ASSERT(blockNode->numAntecedents == 0);
+ dag_h->succedents[0] = blockNode;
+
+ if (nRodNodes > 0) {
+ /* connect the block node to the Rod nodes */
+ RF_ASSERT(blockNode->numSuccedents == nRodNodes);
+ RF_ASSERT(syncNode->numAntecedents == nRodNodes);
+ for (i = 0; i < nRodNodes; i++) {
+ RF_ASSERT(rodNodes[i].numAntecedents == 1);
+ blockNode->succedents[i] = &rodNodes[i];
+ rodNodes[i].antecedents[0] = blockNode;
+ rodNodes[i].antType[0] = rf_control;
+
+ /* connect the Rod nodes to the Nil node */
+ RF_ASSERT(rodNodes[i].numSuccedents == 1);
+ rodNodes[i].succedents[0] = syncNode;
+ syncNode->antecedents[i] = &rodNodes[i];
+ syncNode->antType[i] = rf_trueData;
+ }
+ }
+ else {
+ /* connect the block node to the Nil node */
+ RF_ASSERT(blockNode->numSuccedents == 1);
+ RF_ASSERT(syncNode->numAntecedents == 1);
+ blockNode->succedents[0] = syncNode;
+ syncNode->antecedents[0] = blockNode;
+ syncNode->antType[0] = rf_control;
+ }
+
+ /* connect the sync node to the Wnd nodes */
+ RF_ASSERT(syncNode->numSuccedents == (1 + nWndNodes));
+ for (i = 0; i < nWndNodes; i++) {
+ RF_ASSERT(wndNodes[i].numAntecedents == 1);
+ syncNode->succedents[i] = &wndNodes[i];
+ wndNodes[i].antecedents[0] = syncNode;
+ wndNodes[i].antType[0] = rf_control;
+ }
+
+ /* connect the sync node to the Xor node */
+ RF_ASSERT(xorNode->numAntecedents == 1);
+ syncNode->succedents[nWndNodes] = xorNode;
+ xorNode->antecedents[0] = syncNode;
+ xorNode->antType[0] = rf_control;
+
+ /* connect the xor node to the write parity node */
+ RF_ASSERT(xorNode->numSuccedents == nfaults);
+ RF_ASSERT(wnpNode->numAntecedents == 1);
+ xorNode->succedents[0] = wnpNode;
+ wnpNode->antecedents[0]= xorNode;
+ wnpNode->antType[0] = rf_trueData;
+ if (nfaults == 2) {
+ RF_ASSERT(wnqNode->numAntecedents == 1);
+ xorNode->succedents[1] = wnqNode;
+ wnqNode->antecedents[0] = xorNode;
+ wnqNode->antType[0] = rf_trueData;
+ }
+
+ /* connect the write nodes to the term node */
+ RF_ASSERT(termNode->numAntecedents == nWndNodes + nfaults);
+ RF_ASSERT(termNode->numSuccedents == 0);
+ for (i = 0; i < nWndNodes; i++) {
+ RF_ASSERT(wndNodes[i].numSuccedents == 1);
+ wndNodes[i].succedents[0] = termNode;
+ termNode->antecedents[i] = &wndNodes[i];
+ termNode->antType[i] = rf_control;
+ }
+ RF_ASSERT(wnpNode->numSuccedents == 1);
+ wnpNode->succedents[0] = termNode;
+ termNode->antecedents[nWndNodes] = wnpNode;
+ termNode->antType[nWndNodes] = rf_control;
+ if (nfaults == 2) {
+ RF_ASSERT(wnqNode->numSuccedents == 1);
+ wnqNode->succedents[0] = termNode;
+ termNode->antecedents[nWndNodes + 1] = wnqNode;
+ termNode->antType[nWndNodes + 1] = rf_control;
+ }
+}
+
+
+/******************************************************************************
+ *
+ * creates a DAG to perform a small-write operation (either raid 5 or pq)
+ * without a commit node, which is as follows:
+ *
+ * Hdr -> Nil -> Rop - Xor - Wnp [Unp] -- Trm
+ * \- Rod X- Wnd [Und] -------/
+ * [\- Rod X- Wnd [Und] ------/]
+ * [\- Roq - Q --> Wnq [Unq]-/]
+ *
+ * Rop = read old parity
+ * Rod = read old data
+ * Roq = read old "q"
+ * Und = unlock data disk
+ * Unp = unlock parity disk
+ * Unq = unlock q disk
+ * Wnp = write new parity
+ * Wnd = write new data
+ * Wnq = write new "q"
+ * [ ] denotes optional segments in the graph
+ *
+ * Parameters: raidPtr - description of the physical array
+ * asmap - logical & physical addresses for this access
+ * bp - buffer ptr (holds write data)
+ * flags - general flags (e.g. disk locking)
+ * allocList - list of memory allocated in DAG creation
+ * pfuncs - list of parity generating functions
+ * qfuncs - list of q generating functions
+ *
+ * A null qfuncs indicates a single-fault-tolerant array.
+ *****************************************************************************/
+
+void rf_CommonCreateSmallWriteDAGFwd(
+ RF_Raid_t *raidPtr,
+ RF_AccessStripeMap_t *asmap,
+ RF_DagHeader_t *dag_h,
+ void *bp,
+ RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t *allocList,
+ RF_RedFuncs_t *pfuncs,
+ RF_RedFuncs_t *qfuncs)
+{
+ RF_DagNode_t *readDataNodes, *readParityNodes, *readQNodes, *termNode;
+ RF_DagNode_t *unlockDataNodes, *unlockParityNodes, *unlockQNodes;
+ RF_DagNode_t *xorNodes, *qNodes, *blockNode, *nodes;
+ RF_DagNode_t *writeDataNodes, *writeParityNodes, *writeQNodes;
+ int i, j, nNodes, totalNumNodes, lu_flag;
+ RF_ReconUnitNum_t which_ru;
+ int (*func)(RF_DagNode_t *), (*undoFunc)(RF_DagNode_t *);
+ int (*qfunc)(RF_DagNode_t *);
+ int numDataNodes, numParityNodes;
+ RF_StripeNum_t parityStripeID;
+ RF_PhysDiskAddr_t *pda;
+ char *name, *qname;
+ long nfaults;
+
+ nfaults = qfuncs ? 2 : 1;
+ lu_flag = (rf_enableAtomicRMW) ? 1 : 0; /* lock/unlock flag */
+
+ parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout), asmap->raidAddress, &which_ru);
+ pda = asmap->physInfo;
+ numDataNodes = asmap->numStripeUnitsAccessed;
+ numParityNodes = (asmap->parityInfo->next) ? 2 : 1;
+
+ if (rf_dagDebug) printf("[Creating small-write DAG]\n");
+ RF_ASSERT(numDataNodes > 0);
+ dag_h->creator = "SmallWriteDAGFwd";
+
+ dag_h->numCommitNodes = 0;
+ dag_h->numCommits = 0;
+ dag_h->numSuccedents = 1;
+
+ qfunc = NULL;
+ qname = NULL;
+
+ /* DAG creation occurs in four steps:
+ 1. count the number of nodes in the DAG
+ 2. create the nodes
+ 3. initialize the nodes
+ 4. connect the nodes
+ */
+
+ /* Step 1. compute number of nodes in the graph */
+
+ /* number of nodes:
+ a read and write for each data unit
+ a redundancy computation node for each parity node (nfaults * nparity)
+ a read and write for each parity unit
+ a block node
+ a terminate node
+ if atomic RMW
+ an unlock node for each data unit, redundancy unit
+ */
+ totalNumNodes = (2 * numDataNodes) + (nfaults * numParityNodes) + (nfaults * 2 * numParityNodes) + 2;
+ if (lu_flag)
+ totalNumNodes += (numDataNodes + (nfaults * numParityNodes));
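+
+ /* For example, the same single-data-unit RAID level 5 small write with
+ * locking disabled gives 2 + 1 + 2 + 2 = 7 nodes here
+ * (Nil, Rod, Rop, Xor, Wnd, Wnp, Trm); dropping the commit node is the
+ * only difference from the count in rf_CommonCreateSmallWriteDAG. */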
+
+
+ /* Step 2. create the nodes */
+ RF_CallocAndAdd(nodes, totalNumNodes, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList);
+ i = 0;
+ blockNode = &nodes[i]; i += 1;
+ readDataNodes = &nodes[i]; i += numDataNodes;
+ readParityNodes = &nodes[i]; i += numParityNodes;
+ writeDataNodes = &nodes[i]; i += numDataNodes;
+ writeParityNodes = &nodes[i]; i += numParityNodes;
+ xorNodes = &nodes[i]; i += numParityNodes;
+ termNode = &nodes[i]; i += 1;
+ if (lu_flag) {
+ unlockDataNodes = &nodes[i]; i += numDataNodes;
+ unlockParityNodes = &nodes[i]; i += numParityNodes;
+ }
+ else {
+ unlockDataNodes = unlockParityNodes = NULL;
+ }
+ if (nfaults == 2) {
+ readQNodes = &nodes[i]; i += numParityNodes;
+ writeQNodes = &nodes[i]; i += numParityNodes;
+ qNodes = &nodes[i]; i += numParityNodes;
+ if (lu_flag) {
+ unlockQNodes = &nodes[i]; i += numParityNodes;
+ }
+ else {
+ unlockQNodes = NULL;
+ }
+ }
+ else {
+ readQNodes = writeQNodes = qNodes = unlockQNodes = NULL;
+ }
+ RF_ASSERT(i == totalNumNodes);
+
+ /* Step 3. initialize the nodes */
+ /* initialize block node (Nil) */
+ nNodes = numDataNodes + (nfaults * numParityNodes);
+ rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nNodes, 0, 0, 0, dag_h, "Nil", allocList);
+
+ /* initialize terminate node (Trm) */
+ rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, nNodes, 0, 0, dag_h, "Trm", allocList);
+
+ /* initialize nodes which read old data (Rod) */
+ for (i = 0; i < numDataNodes; i++) {
+ rf_InitNode(&readDataNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, (numParityNodes * nfaults) + 1, 1, 4, 0, dag_h, "Rod", allocList);
+ RF_ASSERT(pda != NULL);
+ readDataNodes[i].params[0].p = pda; /* physical disk addr desc */
+ readDataNodes[i].params[1].p = rf_AllocBuffer(raidPtr, dag_h, pda, allocList); /* buffer to hold old data */
+ readDataNodes[i].params[2].v = parityStripeID;
+ readDataNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, lu_flag, 0, which_ru);
+ pda=pda->next;
+ for (j = 0; j < readDataNodes[i].numSuccedents; j++)
+ readDataNodes[i].propList[j] = NULL;
+ }
+
+ /* initialize nodes which read old parity (Rop) */
+ pda = asmap->parityInfo; i = 0;
+ for (i = 0; i < numParityNodes; i++) {
+ RF_ASSERT(pda != NULL);
+ rf_InitNode(&readParityNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, numParityNodes, 1, 4, 0, dag_h, "Rop", allocList);
+ readParityNodes[i].params[0].p = pda;
+ readParityNodes[i].params[1].p = rf_AllocBuffer(raidPtr, dag_h, pda, allocList); /* buffer to hold old parity */
+ readParityNodes[i].params[2].v = parityStripeID;
+ readParityNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, lu_flag, 0, which_ru);
+ for (j = 0; j < readParityNodes[i].numSuccedents; j++)
+ readParityNodes[i].propList[j] = NULL;
+ pda=pda->next;
+ }
+
+ /* initialize nodes which read old Q (Roq) */
+ if (nfaults == 2)
+ {
+ pda = asmap->qInfo;
+ for (i = 0; i < numParityNodes; i++) {
+ RF_ASSERT(pda != NULL);
+ rf_InitNode(&readQNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, numParityNodes, 1, 4, 0, dag_h, "Roq", allocList);
+ readQNodes[i].params[0].p = pda;
+ readQNodes[i].params[1].p = rf_AllocBuffer(raidPtr, dag_h, pda, allocList); /* buffer to hold old Q */
+ readQNodes[i].params[2].v = parityStripeID;
+ readQNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, lu_flag, 0, which_ru);
+ for (j = 0; j < readQNodes[i].numSuccedents; j++)
+ readQNodes[i].propList[j] = NULL;
+ pda=pda->next;
+ }
+ }
+
+ /* initialize nodes which write new data (Wnd) */
+ pda = asmap->physInfo;
+ for (i=0; i < numDataNodes; i++) {
+ RF_ASSERT(pda != NULL);
+ rf_InitNode(&writeDataNodes[i], rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnd", allocList);
+ writeDataNodes[i].params[0].p = pda; /* physical disk addr desc */
+ writeDataNodes[i].params[1].p = pda->bufPtr; /* buffer holding new data to be written */
+ writeDataNodes[i].params[2].v = parityStripeID;
+ writeDataNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+
+ if (lu_flag) {
+ /* initialize node to unlock the disk queue */
+ rf_InitNode(&unlockDataNodes[i], rf_wait, RF_FALSE, rf_DiskUnlockFunc, rf_DiskUnlockUndoFunc, rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h, "Und", allocList);
+ unlockDataNodes[i].params[0].p = pda; /* physical disk addr desc */
+ unlockDataNodes[i].params[1].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, lu_flag, which_ru);
+ }
+
+ pda = pda->next;
+ }
+
+
+ /* initialize nodes which compute new parity and Q */
+ /* we use the simple XOR func in the double-XOR case, and when we're accessing only a portion of one stripe unit.
+ * the distinction between the two is that the regular XOR func assumes that the targbuf is a full SU in size,
+ * and examines the pda associated with the buffer to decide where within the buffer to XOR the data, whereas
+ * the simple XOR func just XORs the data into the start of the buffer.
+ */
+ if ((numParityNodes==2) || ((numDataNodes == 1) && (asmap->totalSectorsAccessed < raidPtr->Layout.sectorsPerStripeUnit))) {
+ func = pfuncs->simple; undoFunc = rf_NullNodeUndoFunc; name = pfuncs->SimpleName;
+ if (qfuncs) {
+ qfunc = qfuncs->simple;
+ qname = qfuncs->SimpleName;
+ }
+ }
+ else {
+ func = pfuncs->regular; undoFunc = rf_NullNodeUndoFunc; name = pfuncs->RegularName;
+ if (qfuncs) { qfunc = qfuncs->regular; qname = qfuncs->RegularName;}
+ }
+ /* initialize the xor nodes: params are {pda,buf} from {Rod,Wnd,Rop} nodes, and raidPtr */
+ if (numParityNodes==2) { /* double-xor case */
+ for (i=0; i < numParityNodes; i++) {
+ rf_InitNode(&xorNodes[i], rf_wait, RF_FALSE, func, undoFunc, NULL, numParityNodes, numParityNodes + numDataNodes, 7, 1, dag_h, name, allocList); /* no wakeup func for xor */
+ xorNodes[i].flags |= RF_DAGNODE_FLAG_YIELD;
+ xorNodes[i].params[0] = readDataNodes[i].params[0];
+ xorNodes[i].params[1] = readDataNodes[i].params[1];
+ xorNodes[i].params[2] = readParityNodes[i].params[0];
+ xorNodes[i].params[3] = readParityNodes[i].params[1];
+ xorNodes[i].params[4] = writeDataNodes[i].params[0];
+ xorNodes[i].params[5] = writeDataNodes[i].params[1];
+ xorNodes[i].params[6].p = raidPtr;
+ xorNodes[i].results[0] = readParityNodes[i].params[1].p; /* use old parity buf as target buf */
+ if (nfaults==2)
+ {
+ rf_InitNode(&qNodes[i], rf_wait, RF_FALSE, qfunc, undoFunc, NULL, numParityNodes, numParityNodes + numDataNodes, 7, 1, dag_h, qname, allocList); /* no wakeup func for xor */
+ qNodes[i].params[0] = readDataNodes[i].params[0];
+ qNodes[i].params[1] = readDataNodes[i].params[1];
+ qNodes[i].params[2] = readQNodes[i].params[0];
+ qNodes[i].params[3] = readQNodes[i].params[1];
+ qNodes[i].params[4] = writeDataNodes[i].params[0];
+ qNodes[i].params[5] = writeDataNodes[i].params[1];
+ qNodes[i].params[6].p = raidPtr;
+ qNodes[i].results[0] = readQNodes[i].params[1].p; /* use old Q buf as target buf */
+ }
+ }
+ }
+ else {
+ /* there is only one xor node in this case */
+ rf_InitNode(&xorNodes[0], rf_wait, RF_FALSE, func, undoFunc, NULL, numParityNodes, numParityNodes + numDataNodes, (2 * (numDataNodes + numDataNodes + 1) + 1), 1, dag_h, name, allocList);
+ xorNodes[0].flags |= RF_DAGNODE_FLAG_YIELD;
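+ /* as in rf_CommonCreateSmallWriteDAG, readDataNodes[numDataNodes] aliases
+ * readParityNodes[0] because the node array is contiguous, so the loop
+ * below also picks up the Rop node's {pda, buffer} pair */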
+ for (i=0; i < numDataNodes + 1; i++) {
+ /* set up params related to Rod and Rop nodes */
+ xorNodes[0].params[2*i+0] = readDataNodes[i].params[0]; /* pda */
+ xorNodes[0].params[2*i+1] = readDataNodes[i].params[1]; /* buffer pointer */
+ }
+ for (i=0; i < numDataNodes; i++) {
+ /* set up params related to Wnd and Wnp nodes */
+ xorNodes[0].params[2*(numDataNodes+1+i)+0] = writeDataNodes[i].params[0]; /* pda */
+ xorNodes[0].params[2*(numDataNodes+1+i)+1] = writeDataNodes[i].params[1]; /* buffer pointer */
+ }
+ xorNodes[0].params[2*(numDataNodes+numDataNodes+1)].p = raidPtr; /* xor node needs to get at RAID information */
+ xorNodes[0].results[0] = readParityNodes[0].params[1].p;
+ if (nfaults==2)
+ {
+ rf_InitNode(&qNodes[0], rf_wait, RF_FALSE, qfunc, undoFunc, NULL, numParityNodes, numParityNodes + numDataNodes, (2 * (numDataNodes + numDataNodes + 1) + 1), 1, dag_h, qname, allocList);
+ for (i=0; i<numDataNodes; i++) {
+ /* set up params related to Rod */
+ qNodes[0].params[2*i+0] = readDataNodes[i].params[0]; /* pda */
+ qNodes[0].params[2*i+1] = readDataNodes[i].params[1]; /* buffer pointer */
+ }
+ /* and read old q */
+ qNodes[0].params[2*numDataNodes + 0] = readQNodes[0].params[0]; /* pda */
+ qNodes[0].params[2*numDataNodes + 1] = readQNodes[0].params[1]; /* buffer pointer */
+ for (i=0; i < numDataNodes; i++) {
+ /* set up params related to Wnd nodes */
+ qNodes[0].params[2*(numDataNodes+1+i)+0] = writeDataNodes[i].params[0]; /* pda */
+ qNodes[0].params[2*(numDataNodes+1+i)+1] = writeDataNodes[i].params[1]; /* buffer pointer */
+ }
+ qNodes[0].params[2*(numDataNodes+numDataNodes+1)].p = raidPtr; /* xor node needs to get at RAID information */
+ qNodes[0].results[0] = readQNodes[0].params[1].p;
+ }
+ }
+
+ /* initialize nodes which write new parity (Wnp) */
+ pda = asmap->parityInfo;
+ for (i=0; i < numParityNodes; i++) {
+ rf_InitNode(&writeParityNodes[i], rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, numParityNodes, 4, 0, dag_h, "Wnp", allocList);
+ RF_ASSERT(pda != NULL);
+ writeParityNodes[i].params[0].p = pda; /* param 1 (bufPtr) filled in by xor node */
+ writeParityNodes[i].params[1].p = xorNodes[i].results[0]; /* buffer pointer for parity write operation */
+ writeParityNodes[i].params[2].v = parityStripeID;
+ writeParityNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+
+ if (lu_flag) {
+ /* initialize node to unlock the disk queue */
+ rf_InitNode(&unlockParityNodes[i], rf_wait, RF_FALSE, rf_DiskUnlockFunc, rf_DiskUnlockUndoFunc, rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h, "Unp", allocList);
+ unlockParityNodes[i].params[0].p = pda; /* physical disk addr desc */
+ unlockParityNodes[i].params[1].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, lu_flag, which_ru);
+ }
+
+ pda = pda->next;
+ }
+
+ /* initialize nodes which write new Q (Wnq) */
+ if (nfaults == 2)
+ {
+ pda = asmap->qInfo;
+ for (i=0; i < numParityNodes; i++) {
+ rf_InitNode(&writeQNodes[i], rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, numParityNodes, 4, 0, dag_h, "Wnq", allocList);
+ RF_ASSERT(pda != NULL);
+ writeQNodes[i].params[0].p = pda; /* param 1 (bufPtr) filled in by xor node */
+ writeQNodes[i].params[1].p = qNodes[i].results[0]; /* buffer pointer for parity write operation */
+ writeQNodes[i].params[2].v = parityStripeID;
+ writeQNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+
+ if (lu_flag) {
+ /* initialize node to unlock the disk queue */
+ rf_InitNode(&unlockQNodes[i], rf_wait, RF_FALSE, rf_DiskUnlockFunc, rf_DiskUnlockUndoFunc, rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h, "Unq", allocList);
+ unlockQNodes[i].params[0].p = pda; /* physical disk addr desc */
+ unlockQNodes[i].params[1].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, lu_flag, which_ru);
+ }
+
+ pda = pda->next;
+ }
+ }
+
+ /* Step 4. connect the nodes */
+
+ /* connect header to block node */
+ dag_h->succedents[0] = blockNode;
+
+ /* connect block node to read old data nodes */
+ RF_ASSERT(blockNode->numSuccedents == (numDataNodes + (numParityNodes * nfaults)));
+ for (i = 0; i < numDataNodes; i++) {
+ blockNode->succedents[i] = &readDataNodes[i];
+ RF_ASSERT(readDataNodes[i].numAntecedents == 1);
+ readDataNodes[i].antecedents[0]= blockNode;
+ readDataNodes[i].antType[0] = rf_control;
+ }
+
+ /* connect block node to read old parity nodes */
+ for (i = 0; i < numParityNodes; i++) {
+ blockNode->succedents[numDataNodes + i] = &readParityNodes[i];
+ RF_ASSERT(readParityNodes[i].numAntecedents == 1);
+ readParityNodes[i].antecedents[0] = blockNode;
+ readParityNodes[i].antType[0] = rf_control;
+ }
+
+ /* connect block node to read old Q nodes */
+ if (nfaults == 2)
+ for (i = 0; i < numParityNodes; i++) {
+ blockNode->succedents[numDataNodes + numParityNodes + i] = &readQNodes[i];
+ RF_ASSERT(readQNodes[i].numAntecedents == 1);
+ readQNodes[i].antecedents[0] = blockNode;
+ readQNodes[i].antType[0] = rf_control;
+ }
+
+ /* connect read old data nodes to write new data nodes */
+ for (i = 0; i < numDataNodes; i++) {
+ RF_ASSERT(readDataNodes[i].numSuccedents == ((nfaults * numParityNodes) + 1));
+ RF_ASSERT(writeDataNodes[i].numAntecedents == 1);
+ readDataNodes[i].succedents[0] = &writeDataNodes[i];
+ writeDataNodes[i].antecedents[0] = &readDataNodes[i];
+ writeDataNodes[i].antType[0] = rf_antiData;
+ }
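+
+ /* This Rod -> Wnd arc is an anti-dependency: with no commit node it is
+ * the only ordering constraint on the data write, ensuring the old data
+ * has been read (for the parity update) before the same sectors are
+ * overwritten. */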
+
+ /* connect read old data nodes to xor nodes */
+ for (i = 0; i < numDataNodes; i++) {
+ for (j = 0; j < numParityNodes; j++){
+ RF_ASSERT(xorNodes[j].numAntecedents == numDataNodes + numParityNodes);
+ readDataNodes[i].succedents[1 + j] = &xorNodes[j];
+ xorNodes[j].antecedents[i] = &readDataNodes[i];
+ xorNodes[j].antType[i] = rf_trueData;
+ }
+ }
+
+ /* connect read old data nodes to q nodes */
+ if (nfaults == 2)
+ for (i = 0; i < numDataNodes; i++)
+ for (j = 0; j < numParityNodes; j++){
+ RF_ASSERT(qNodes[j].numAntecedents == numDataNodes + numParityNodes);
+ readDataNodes[i].succedents[1 + numParityNodes + j] = &qNodes[j];
+ qNodes[j].antecedents[i] = &readDataNodes[i];
+ qNodes[j].antType[i] = rf_trueData;
+ }
+
+ /* connect read old parity nodes to xor nodes */
+ for (i = 0; i < numParityNodes; i++) {
+ for (j = 0; j < numParityNodes; j++) {
+ RF_ASSERT(readParityNodes[i].numSuccedents == numParityNodes);
+ readParityNodes[i].succedents[j] = &xorNodes[j];
+ xorNodes[j].antecedents[numDataNodes + i] = &readParityNodes[i];
+ xorNodes[j].antType[numDataNodes + i] = rf_trueData;
+ }
+ }
+
+ /* connect read old q nodes to q nodes */
+ if (nfaults == 2)
+ for (i = 0; i < numParityNodes; i++) {
+ for (j = 0; j < numParityNodes; j++) {
+ RF_ASSERT(readQNodes[i].numSuccedents == numParityNodes);
+ readQNodes[i].succedents[j] = &qNodes[j];
+ qNodes[j].antecedents[numDataNodes + i] = &readQNodes[i];
+ qNodes[j].antType[numDataNodes + i] = rf_trueData;
+ }
+ }
+
+ /* connect xor nodes to the write new parity nodes */
+ for (i = 0; i < numParityNodes; i++) {
+ RF_ASSERT(writeParityNodes[i].numAntecedents == numParityNodes);
+ for (j = 0; j < numParityNodes; j++) {
+ RF_ASSERT(xorNodes[j].numSuccedents == numParityNodes);
+ xorNodes[i].succedents[j] = &writeParityNodes[j];
+ writeParityNodes[j].antecedents[i] = &xorNodes[i];
+ writeParityNodes[j].antType[i] = rf_trueData;
+ }
+ }
+
+ /* connect q nodes to the write new q nodes */
+ if (nfaults == 2)
+ for (i = 0; i < numParityNodes; i++) {
+ RF_ASSERT(writeQNodes[i].numAntecedents == numParityNodes);
+ for (j = 0; j < numParityNodes; j++) {
+ RF_ASSERT(qNodes[j].numSuccedents == numParityNodes);
+ qNodes[i].succedents[j] = &writeQNodes[j];
+ writeQNodes[j].antecedents[i] = &qNodes[i];
+ writeQNodes[j].antType[i] = rf_trueData;
+ }
+ }
+
+ RF_ASSERT(termNode->numAntecedents == (numDataNodes + (nfaults * numParityNodes)));
+ RF_ASSERT(termNode->numSuccedents == 0);
+ for (i = 0; i < numDataNodes; i++) {
+ if (lu_flag) {
+ /* connect write new data nodes to unlock nodes */
+ RF_ASSERT(writeDataNodes[i].numSuccedents == 1);
+ RF_ASSERT(unlockDataNodes[i].numAntecedents == 1);
+ writeDataNodes[i].succedents[0] = &unlockDataNodes[i];
+ unlockDataNodes[i].antecedents[0] = &writeDataNodes[i];
+ unlockDataNodes[i].antType[0] = rf_control;
+
+ /* connect unlock nodes to term node */
+ RF_ASSERT(unlockDataNodes[i].numSuccedents == 1);
+ unlockDataNodes[i].succedents[0] = termNode;
+ termNode->antecedents[i] = &unlockDataNodes[i];
+ termNode->antType[i] = rf_control;
+ }
+ else {
+ /* connect write new data nodes to term node */
+ RF_ASSERT(writeDataNodes[i].numSuccedents == 1);
+ RF_ASSERT(termNode->numAntecedents == (numDataNodes + (nfaults * numParityNodes)));
+ writeDataNodes[i].succedents[0] = termNode;
+ termNode->antecedents[i] = &writeDataNodes[i];
+ termNode->antType[i] = rf_control;
+ }
+ }
+
+ for (i = 0; i < numParityNodes; i++) {
+ if (lu_flag) {
+ /* connect write new parity nodes to unlock nodes */
+ RF_ASSERT(writeParityNodes[i].numSuccedents == 1);
+ RF_ASSERT(unlockParityNodes[i].numAntecedents == 1);
+ writeParityNodes[i].succedents[0] = &unlockParityNodes[i];
+ unlockParityNodes[i].antecedents[0] = &writeParityNodes[i];
+ unlockParityNodes[i].antType[0] = rf_control;
+
+ /* connect unlock nodes to term node */
+ RF_ASSERT(unlockParityNodes[i].numSuccedents == 1);
+ unlockParityNodes[i].succedents[0] = termNode;
+ termNode->antecedents[numDataNodes + i] = &unlockParityNodes[i];
+ termNode->antType[numDataNodes + i] = rf_control;
+ }
+ else {
+ RF_ASSERT(writeParityNodes[i].numSuccedents == 1);
+ writeParityNodes[i].succedents[0] = termNode;
+ termNode->antecedents[numDataNodes + i] = &writeParityNodes[i];
+ termNode->antType[numDataNodes + i] = rf_control;
+ }
+ }
+
+ if (nfaults == 2)
+ for (i = 0; i < numParityNodes; i++) {
+ if (lu_flag) {
+ /* connect write new Q nodes to unlock nodes */
+ RF_ASSERT(writeQNodes[i].numSuccedents == 1);
+ RF_ASSERT(unlockQNodes[i].numAntecedents == 1);
+ writeQNodes[i].succedents[0] = &unlockQNodes[i];
+ unlockQNodes[i].antecedents[0] = &writeQNodes[i];
+ unlockQNodes[i].antType[0] = rf_control;
+
+ /* connect unlock nodes to term node */
+ RF_ASSERT(unlockQNodes[i].numSuccedents == 1);
+ unlockQNodes[i].succedents[0] = termNode;
+ termNode->antecedents[numDataNodes + numParityNodes + i] = &unlockQNodes[i];
+ termNode->antType[numDataNodes + numParityNodes + i] = rf_control;
+ }
+ else {
+ RF_ASSERT(writeQNodes[i].numSuccedents == 1);
+ writeQNodes[i].succedents[0] = termNode;
+ termNode->antecedents[numDataNodes + numParityNodes + i] = &writeQNodes[i];
+ termNode->antType[numDataNodes + numParityNodes + i] = rf_control;
+ }
+ }
+}
+
+
+
+/******************************************************************************
+ * create a write graph (fault-free or degraded) for RAID level 1
+ *
+ * Hdr -> Nil -> Wpd -> Nil -> Trm
+ *             -> Wsd ->
+ *
+ * The "Wpd" node writes data to the primary copy in the mirror pair
+ * The "Wsd" node writes data to the secondary copy in the mirror pair
+ *
+ * Parameters: raidPtr - description of the physical array
+ * asmap - logical & physical addresses for this access
+ * bp - buffer ptr (holds write data)
+ * flags - general flags (e.g. disk locking)
+ * allocList - list of memory allocated in DAG creation
+ *****************************************************************************/
+
+void rf_CreateRaidOneWriteDAGFwd(
+ RF_Raid_t *raidPtr,
+ RF_AccessStripeMap_t *asmap,
+ RF_DagHeader_t *dag_h,
+ void *bp,
+ RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t *allocList)
+{
+ RF_DagNode_t *blockNode, *unblockNode, *termNode;
+ RF_DagNode_t *nodes, *wndNode, *wmirNode;
+ int nWndNodes, nWmirNodes, i;
+ RF_ReconUnitNum_t which_ru;
+ RF_PhysDiskAddr_t *pda, *pdaP;
+ RF_StripeNum_t parityStripeID;
+
+ parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout),
+ asmap->raidAddress, &which_ru);
+ if (rf_dagDebug) {
+ printf("[Creating RAID level 1 write DAG]\n");
+ }
+
+ nWmirNodes = (asmap->parityInfo->next) ? 2 : 1; /* 2 implies access not SU aligned */
+ nWndNodes = (asmap->physInfo->next) ? 2 : 1;
+
+ /* alloc the Wnd nodes and the Wmir node */
+ if (asmap->numDataFailed == 1)
+ nWndNodes--;
+ if (asmap->numParityFailed == 1)
+ nWmirNodes--;
+
+ /* total number of nodes = nWndNodes + nWmirNodes + (block + unblock + terminator) */
+ RF_CallocAndAdd(nodes, nWndNodes + nWmirNodes + 3, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList);
+ i = 0;
+ wndNode = &nodes[i]; i += nWndNodes;
+ wmirNode = &nodes[i]; i += nWmirNodes;
+ blockNode = &nodes[i]; i += 1;
+ unblockNode = &nodes[i]; i += 1;
+ termNode = &nodes[i]; i += 1;
+ RF_ASSERT(i == (nWndNodes + nWmirNodes + 3));
+
+ /* this dag can commit immediately */
+ dag_h->numCommitNodes = 0;
+ dag_h->numCommits = 0;
+ dag_h->numSuccedents = 1;
+
+ /* initialize the block, unblock, and term nodes */
+ rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, (nWndNodes + nWmirNodes), 0, 0, 0, dag_h, "Nil", allocList);
+ rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, 1, (nWndNodes + nWmirNodes), 0, 0, dag_h, "Nil", allocList);
+ rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
+
+ /* initialize the wnd nodes */
+ if (nWndNodes > 0) {
+ pda = asmap->physInfo;
+ for (i = 0; i < nWndNodes; i++) {
+ rf_InitNode(&wndNode[i], rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wpd", allocList);
+ RF_ASSERT(pda != NULL);
+ wndNode[i].params[0].p = pda;
+ wndNode[i].params[1].p = pda->bufPtr;
+ wndNode[i].params[2].v = parityStripeID;
+ wndNode[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ pda = pda->next;
+ }
+ RF_ASSERT(pda == NULL);
+ }
+
+ /* initialize the mirror nodes */
+ if (nWmirNodes > 0) {
+ pda = asmap->physInfo;
+ pdaP = asmap->parityInfo;
+ for (i = 0; i < nWmirNodes; i++) {
+ rf_InitNode(&wmirNode[i], rf_wait, RF_FALSE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wsd", allocList);
+ RF_ASSERT(pda != NULL);
+ wmirNode[i].params[0].p = pdaP;
+ wmirNode[i].params[1].p = pda->bufPtr;
+ wmirNode[i].params[2].v = parityStripeID;
+ wmirNode[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ pda = pda->next;
+ pdaP = pdaP->next;
+ }
+ RF_ASSERT(pda == NULL);
+ RF_ASSERT(pdaP == NULL);
+ }
+
+ /* link the header node to the block node */
+ RF_ASSERT(dag_h->numSuccedents == 1);
+ RF_ASSERT(blockNode->numAntecedents == 0);
+ dag_h->succedents[0] = blockNode;
+
+ /* link the block node to the write nodes */
+ RF_ASSERT(blockNode->numSuccedents == (nWndNodes + nWmirNodes));
+ for (i = 0; i < nWndNodes; i++) {
+ RF_ASSERT(wndNode[i].numAntecedents == 1);
+ blockNode->succedents[i] = &wndNode[i];
+ wndNode[i].antecedents[0] = blockNode;
+ wndNode[i].antType[0] = rf_control;
+ }
+ for (i = 0; i < nWmirNodes; i++) {
+ RF_ASSERT(wmirNode[i].numAntecedents == 1);
+ blockNode->succedents[i + nWndNodes] = &wmirNode[i];
+ wmirNode[i].antecedents[0] = blockNode;
+ wmirNode[i].antType[0] = rf_control;
+ }
+
+ /* link the write nodes to the unblock node */
+ RF_ASSERT(unblockNode->numAntecedents == (nWndNodes + nWmirNodes));
+ for (i = 0; i < nWndNodes; i++) {
+ RF_ASSERT(wndNode[i].numSuccedents == 1);
+ wndNode[i].succedents[0] = unblockNode;
+ unblockNode->antecedents[i] = &wndNode[i];
+ unblockNode->antType[i] = rf_control;
+ }
+ for (i = 0; i < nWmirNodes; i++) {
+ RF_ASSERT(wmirNode[i].numSuccedents == 1);
+ wmirNode[i].succedents[0] = unblockNode;
+ unblockNode->antecedents[i + nWndNodes] = &wmirNode[i];
+ unblockNode->antType[i + nWndNodes] = rf_control;
+ }
+
+ /* link the unblock node to the term node */
+ RF_ASSERT(unblockNode->numSuccedents == 1);
+ RF_ASSERT(termNode->numAntecedents == 1);
+ RF_ASSERT(termNode->numSuccedents == 0);
+ unblockNode->succedents[0] = termNode;
+ termNode->antecedents[0] = unblockNode;
+ termNode->antType[0] = rf_control;
+
+ return;
+}
diff --git a/sys/dev/raidframe/rf_dagffwr.h b/sys/dev/raidframe/rf_dagffwr.h
new file mode 100644
index 00000000000..69c7fdf4832
--- /dev/null
+++ b/sys/dev/raidframe/rf_dagffwr.h
@@ -0,0 +1,103 @@
+/* $OpenBSD: rf_dagffwr.h,v 1.1 1999/01/11 14:29:10 niklas Exp $ */
+/* $NetBSD: rf_dagffwr.h,v 1.1 1998/11/13 04:20:27 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland, Daniel Stodolsky, William V. Courtright II
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*
+ * :
+ * Log: rf_dagffwr.h,v
+ * Revision 1.6 1996/07/31 15:35:29 jimz
+ * evenodd changes; bugfixes for double-degraded archs, generalize
+ * some formerly PQ-only functions
+ *
+ * Revision 1.5 1996/07/22 19:52:16 jimz
+ * switched node params to RF_DagParam_t, a union of
+ * a 64-bit int and a void *, for better portability
+ * attempted hpux port, but failed partway through for
+ * lack of a single C compiler capable of compiling all
+ * source files
+ *
+ * Revision 1.4 1996/06/10 22:25:28 wvcii
+ * added write dags which do not have a commit node and are
+ * used in forward and backward error recovery experiments.
+ *
+ * Revision 1.3 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.2 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.1 1996/05/03 19:20:18 wvcii
+ * Initial revision
+ *
+ */
+
+#ifndef _RF__RF_DAGFFWR_H_
+#define _RF__RF_DAGFFWR_H_
+
+#include "rf_types.h"
+
+/* fault-free write DAG creation routines */
+void rf_CreateNonRedundantWriteDAG(RF_Raid_t *raidPtr,
+ RF_AccessStripeMap_t *asmap, RF_DagHeader_t *dag_h, void *bp,
+ RF_RaidAccessFlags_t flags, RF_AllocListElem_t *allocList,
+ RF_IoType_t type);
+void rf_CreateRAID0WriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
+ RF_DagHeader_t *dag_h, void *bp, RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t *allocList, RF_IoType_t type);
+void rf_CreateSmallWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
+ RF_DagHeader_t *dag_h, void *bp, RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t *allocList);
+void rf_CreateLargeWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
+ RF_DagHeader_t *dag_h, void *bp, RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t *allocList);
+void rf_CommonCreateLargeWriteDAG(RF_Raid_t *raidPtr,
+ RF_AccessStripeMap_t *asmap, RF_DagHeader_t *dag_h, void *bp,
+ RF_RaidAccessFlags_t flags, RF_AllocListElem_t *allocList, int nfaults,
+ int (*redFunc)(RF_DagNode_t *), int allowBufferRecycle);
+void rf_CommonCreateLargeWriteDAGFwd(RF_Raid_t *raidPtr,
+ RF_AccessStripeMap_t *asmap, RF_DagHeader_t *dag_h, void *bp,
+ RF_RaidAccessFlags_t flags, RF_AllocListElem_t *allocList, int nfaults,
+ int (*redFunc)(RF_DagNode_t *), int allowBufferRecycle);
+void rf_CommonCreateSmallWriteDAG(RF_Raid_t *raidPtr,
+ RF_AccessStripeMap_t *asmap, RF_DagHeader_t *dag_h, void *bp,
+ RF_RaidAccessFlags_t flags, RF_AllocListElem_t *allocList,
+ RF_RedFuncs_t *pfuncs, RF_RedFuncs_t *qfuncs);
+void rf_CommonCreateSmallWriteDAGFwd(RF_Raid_t *raidPtr,
+ RF_AccessStripeMap_t *asmap, RF_DagHeader_t *dag_h, void *bp,
+ RF_RaidAccessFlags_t flags, RF_AllocListElem_t *allocList,
+ RF_RedFuncs_t *pfuncs, RF_RedFuncs_t *qfuncs);
+void rf_CreateRaidOneWriteDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
+ RF_DagHeader_t *dag_h, void *bp, RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t *allocList);
+void rf_CreateRaidOneWriteDAGFwd(RF_Raid_t *raidPtr,
+ RF_AccessStripeMap_t *asmap, RF_DagHeader_t *dag_h, void *bp,
+ RF_RaidAccessFlags_t flags, RF_AllocListElem_t *allocList);
+
+#endif /* !_RF__RF_DAGFFWR_H_ */
diff --git a/sys/dev/raidframe/rf_dagflags.h b/sys/dev/raidframe/rf_dagflags.h
new file mode 100644
index 00000000000..ac6f5ec5705
--- /dev/null
+++ b/sys/dev/raidframe/rf_dagflags.h
@@ -0,0 +1,86 @@
+/* $OpenBSD: rf_dagflags.h,v 1.1 1999/01/11 14:29:10 niklas Exp $ */
+/* $NetBSD: rf_dagflags.h,v 1.1 1998/11/13 04:20:27 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/**************************************************************************************
+ *
+ * dagflags.h -- flags that can be given to DoAccess
+ * I pulled these out of dag.h because routines that call DoAccess may need these flags,
+ * but certainly do not need the declarations related to the DAG data structures.
+ *
+ **************************************************************************************/
+
+/* :
+ * Log: rf_dagflags.h,v
+ * Revision 1.10 1996/06/13 19:08:23 jimz
+ * remove unused BD flag
+ *
+ * Revision 1.9 1996/05/30 11:29:41 jimz
+ * Numerous bug fixes. Stripe lock release code disagreed with the taking code
+ * about when stripes should be locked (I made it consistent: no parity, no lock)
+ * There was a lot of extra serialization of I/Os which I've removed- a lot of
+ * it was to calculate values for the cache code, which is no longer with us.
+ * More types, function, macro cleanup. Added code to properly quiesce the array
+ * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
+ * before. Fixed memory allocation, freeing bugs.
+ *
+ * Revision 1.8 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.7 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.6 1995/12/01 15:59:40 root
+ * added copyright info
+ *
+ */
+
+#ifndef _RF__RF_DAGFLAGS_H_
+#define _RF__RF_DAGFLAGS_H_
+
+/*
+ * Bitmasks for the "flags" parameter (RF_RaidAccessFlags_t) used
+ * by DoAccess, SelectAlgorithm, and the DAG creation routines.
+ *
+ * If USE_DAG or USE_ASM is specified, neither the DAG nor the ASM
+ * will be modified, which means that you can't SUPPRESS if you
+ * specify USE_DAG.
+ */
+
+#define RF_DAG_FLAGS_NONE 0 /* no flags */
+#define RF_DAG_SUPPRESS_LOCKS (1<<0) /* suppress all stripe locks in the DAG */
+#define RF_DAG_RETURN_ASM (1<<1) /* create an ASM and return it instead of freeing it */
+#define RF_DAG_RETURN_DAG (1<<2) /* create a DAG and return it instead of freeing it */
+#define RF_DAG_NONBLOCKING_IO (1<<3) /* cause DoAccess to be non-blocking */
+#define RF_DAG_ACCESS_COMPLETE (1<<4) /* the access is complete */
+#define RF_DAG_DISPATCH_RETURNED (1<<5) /* used to handle the case where the dag invokes no I/O */
+#define RF_DAG_TEST_ACCESS (1<<6) /* this access came through rf_ioctl instead of rf_strategy */
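+
+/*
+ * Example (an illustrative sketch, not taken from the original sources):
+ * a caller that wants a non-blocking access with stripe locking suppressed
+ * would OR the corresponding bits together before handing them to DoAccess:
+ *
+ *      RF_RaidAccessFlags_t flags;
+ *
+ *      flags = RF_DAG_NONBLOCKING_IO | RF_DAG_SUPPRESS_LOCKS;
+ */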
+
+#endif /* !_RF__RF_DAGFLAGS_H_ */
diff --git a/sys/dev/raidframe/rf_dagfuncs.c b/sys/dev/raidframe/rf_dagfuncs.c
new file mode 100644
index 00000000000..78e23ed1d95
--- /dev/null
+++ b/sys/dev/raidframe/rf_dagfuncs.c
@@ -0,0 +1,1050 @@
+/* $OpenBSD: rf_dagfuncs.c,v 1.1 1999/01/11 14:29:10 niklas Exp $ */
+/* $NetBSD: rf_dagfuncs.c,v 1.1 1998/11/13 04:20:28 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland, William V. Courtright II
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*
+ * dagfuncs.c -- DAG node execution routines
+ *
+ * Rules:
+ * 1. Every DAG execution function must eventually cause node->status to
+ * get set to "good" or "bad", and "FinishNode" to be called. In the
+ * case of nodes that complete immediately (xor, NullNodeFunc, etc),
+ * the node execution function can do these two things directly. In
+ * the case of nodes that have to wait for some event (a disk read to
+ * complete, a lock to be released, etc) to occur before they can
+ * complete, this is typically achieved by having whatever module
+ * is doing the operation call GenericWakeupFunc upon completion.
+ * 2. DAG execution functions should check the status in the DAG header
+ * and NOP out their operations if the status is not "enable". However,
+ * execution functions that release resources must be sure to release
+ * them even when they NOP out the function that would use them.
+ * Functions that acquire resources should go ahead and acquire them
+ * even when they NOP, so that a downstream release node will not have
+ * to check to find out whether or not the acquire was suppressed.
+ */
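+
+/*
+ * Illustrative sketch (not part of the original RAIDframe sources): the
+ * minimal shape of a node execution function that follows the two rules
+ * above.  The important part is that a terminal status is always set and
+ * rf_FinishNode() is always called, even when the DAG header is not
+ * enabled and the real work is NOPed out.  "do_the_real_work" is a
+ * hypothetical helper standing in for the node's actual operation.
+ *
+ *      int example_node_func(node)
+ *          RF_DagNode_t *node;
+ *      {
+ *          int error = 0;
+ *
+ *          if (node->dagHdr->status == rf_enable) {
+ *              error = do_the_real_work(node);    (hypothetical helper)
+ *          }
+ *          node->status = error ? rf_bad : rf_good;
+ *          return (rf_FinishNode(node, RF_THREAD_CONTEXT));
+ *      }
+ */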
+
+/* :
+ * Log: rf_dagfuncs.c,v
+ * Revision 1.64 1996/07/31 16:29:26 jimz
+ * LONGSHIFT -> RF_LONGSHIFT, defined in rf_types.h
+ *
+ * Revision 1.63 1996/07/30 04:00:20 jimz
+ * define LONGSHIFT for mips
+ *
+ * Revision 1.62 1996/07/28 20:31:39 jimz
+ * i386netbsd port
+ * true/false fixup
+ *
+ * Revision 1.61 1996/07/27 23:36:08 jimz
+ * Solaris port of simulator
+ *
+ * Revision 1.60 1996/07/22 19:52:16 jimz
+ * switched node params to RF_DagParam_t, a union of
+ * a 64-bit int and a void *, for better portability
+ * attempted hpux port, but failed partway through for
+ * lack of a single C compiler capable of compiling all
+ * source files
+ *
+ * Revision 1.59 1996/07/18 22:57:14 jimz
+ * port simulator to AIX
+ *
+ * Revision 1.58 1996/07/17 21:00:58 jimz
+ * clean up timer interface, tracing
+ *
+ * Revision 1.57 1996/07/15 17:22:18 jimz
+ * nit-pick code cleanup
+ * resolve stdlib problems on DEC OSF
+ *
+ * Revision 1.56 1996/06/11 01:27:50 jimz
+ * Fixed bug where diskthread shutdown would crash or hang. This
+ * turned out to be two distinct bugs:
+ * (1) [crash] The thread shutdown code wasn't properly waiting for
+ * all the diskthreads to complete. This caused diskthreads that were
+ * exiting+cleaning up to unlock a destroyed mutex.
+ * (2) [hang] TerminateDiskQueues wasn't locking, and DiskIODequeue
+ * only checked for termination _after_ a wakeup if the queues were
+ * empty. This was a race where the termination wakeup could be lost
+ * by the dequeueing thread, and the system would hang waiting for the
+ * thread to exit, while the thread waited for an I/O or a signal to
+ * check the termination flag.
+ *
+ * Revision 1.55 1996/06/10 22:23:18 wvcii
+ * disk and xor funcs now optionally support undo logging
+ * for backward error recovery experiments
+ *
+ * Revision 1.54 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.53 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.52 1996/06/06 17:28:44 jimz
+ * add new read mirror partition func, rename old read mirror
+ * to rf_DiskReadMirrorIdleFunc
+ *
+ * Revision 1.51 1996/06/03 23:28:26 jimz
+ * more bugfixes
+ * check in tree to sync for IPDS runs with current bugfixes
+ * there still may be a problem with threads in the script test
+ * getting I/Os stuck- not trivially reproducible (runs ~50 times
+ * in a row without getting stuck)
+ *
+ * Revision 1.50 1996/06/02 17:31:48 jimz
+ * Moved a lot of global stuff into array structure, where it belongs.
+ * Fixed up paritylogging, pss modules in this manner. Some general
+ * code cleanup. Removed lots of dead code, some dead files.
+ *
+ * Revision 1.49 1996/05/31 22:26:54 jimz
+ * fix a lot of mapping problems, memory allocation problems
+ * found some weird lock issues, fixed 'em
+ * more code cleanup
+ *
+ * Revision 1.48 1996/05/30 12:59:18 jimz
+ * make etimer happier, more portable
+ *
+ * Revision 1.47 1996/05/30 11:29:41 jimz
+ * Numerous bug fixes. Stripe lock release code disagreed with the taking code
+ * about when stripes should be locked (I made it consistent: no parity, no lock)
+ * There was a lot of extra serialization of I/Os which I've removed- a lot of
+ * it was to calculate values for the cache code, which is no longer with us.
+ * More types, function, macro cleanup. Added code to properly quiesce the array
+ * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
+ * before. Fixed memory allocation, freeing bugs.
+ *
+ * Revision 1.46 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.45 1996/05/24 04:28:55 jimz
+ * release cleanup ckpt
+ *
+ * Revision 1.44 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.43 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.42 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.41 1996/05/08 21:01:24 jimz
+ * fixed up enum type names that were conflicting with other
+ * enums and function names (ie, "panic")
+ * future naming trends will be towards RF_ and rf_ for
+ * everything raidframe-related
+ *
+ * Revision 1.40 1996/05/08 15:24:14 wvcii
+ * modified GenericWakeupFunc to use recover, undone, and panic node states
+ *
+ * Revision 1.39 1996/05/02 17:18:01 jimz
+ * fix up headers for user-land, following ccmn cleanup
+ *
+ * Revision 1.38 1996/05/01 16:26:51 jimz
+ * don't include rf_ccmn.h (get ready to phase out)
+ *
+ * Revision 1.37 1995/12/12 18:10:06 jimz
+ * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT
+ * fix 80-column brain damage in comments
+ *
+ * Revision 1.36 1995/12/04 19:19:09 wvcii
+ * modified DiskReadMirrorFunc
+ * - added fifth parameter, physical disk address of mirror copy
+ * - SelectIdleDisk conditionally swaps parameters 0 & 4
+ *
+ * Revision 1.35 1995/12/01 15:58:33 root
+ * added copyright info
+ *
+ * Revision 1.34 1995/11/17 18:12:17 amiri
+ * Changed DiskReadMirrorFunc to use the generic mapping routines
+ * to find the mirror of the data, function was assuming RAID level 1.
+ *
+ * Revision 1.33 1995/11/17 15:15:59 wvcii
+ * changes in DiskReadMirrorFunc
+ * - added ASSERTs
+ * - added call to MapParityRAID1
+ *
+ * Revision 1.32 1995/11/07 16:25:50 wvcii
+ * added DiskUnlockFuncForThreads
+ * general debugging of undo functions (first time they were used)
+ *
+ * Revision 1.31 1995/09/06 19:23:36 wvcii
+ * fixed tracing for parity logging nodes
+ *
+ * Revision 1.30 95/07/07 00:13:01 wvcii
+ * added 4th parameter to ParityLogAppend
+ *
+ */
+
+#ifdef _KERNEL
+#define KERNEL
+#endif
+
+#ifndef KERNEL
+#include <errno.h>
+#endif /* !KERNEL */
+
+#include <sys/ioctl.h>
+#include <sys/param.h>
+
+#include "rf_archs.h"
+#include "rf_raid.h"
+#include "rf_dag.h"
+#include "rf_layout.h"
+#include "rf_etimer.h"
+#include "rf_acctrace.h"
+#include "rf_diskqueue.h"
+#include "rf_dagfuncs.h"
+#include "rf_general.h"
+#include "rf_engine.h"
+#include "rf_dagutils.h"
+
+#ifdef KERNEL
+#include "rf_kintf.h"
+#endif /* KERNEL */
+
+#if RF_INCLUDE_PARITYLOGGING > 0
+#include "rf_paritylog.h"
+#endif /* RF_INCLUDE_PARITYLOGGING > 0 */
+
+int (*rf_DiskReadFunc)(RF_DagNode_t *);
+int (*rf_DiskWriteFunc)(RF_DagNode_t *);
+int (*rf_DiskReadUndoFunc)(RF_DagNode_t *);
+int (*rf_DiskWriteUndoFunc)(RF_DagNode_t *);
+int (*rf_DiskUnlockFunc)(RF_DagNode_t *);
+int (*rf_DiskUnlockUndoFunc)(RF_DagNode_t *);
+int (*rf_RegularXorUndoFunc)(RF_DagNode_t *);
+int (*rf_SimpleXorUndoFunc)(RF_DagNode_t *);
+int (*rf_RecoveryXorUndoFunc)(RF_DagNode_t *);
+
+/*****************************************************************************************
+ * main (only) configuration routine for this module
+ ****************************************************************************************/
+int rf_ConfigureDAGFuncs(listp)
+ RF_ShutdownList_t **listp;
+{
+ RF_ASSERT( ((sizeof(long)==8) && RF_LONGSHIFT==3) || ((sizeof(long)==4) && RF_LONGSHIFT==2) );
+ rf_DiskReadFunc = rf_DiskReadFuncForThreads;
+ rf_DiskReadUndoFunc = rf_DiskUndoFunc;
+ rf_DiskWriteFunc = rf_DiskWriteFuncForThreads;
+ rf_DiskWriteUndoFunc = rf_DiskUndoFunc;
+ rf_DiskUnlockFunc = rf_DiskUnlockFuncForThreads;
+ rf_DiskUnlockUndoFunc = rf_NullNodeUndoFunc;
+ rf_RegularXorUndoFunc = rf_NullNodeUndoFunc;
+ rf_SimpleXorUndoFunc = rf_NullNodeUndoFunc;
+ rf_RecoveryXorUndoFunc = rf_NullNodeUndoFunc;
+ return(0);
+}
+
+
+
+/*****************************************************************************************
+ * the execution function associated with a terminate node
+ ****************************************************************************************/
+int rf_TerminateFunc(node)
+ RF_DagNode_t *node;
+{
+ RF_ASSERT(node->dagHdr->numCommits == node->dagHdr->numCommitNodes);
+ node->status = rf_good;
+ return(rf_FinishNode(node, RF_THREAD_CONTEXT));
+}
+
+int rf_TerminateUndoFunc(node)
+ RF_DagNode_t *node;
+{
+ return(0);
+}
+
+
+/*****************************************************************************************
+ * execution functions associated with a mirror node
+ *
+ * parameters:
+ *
+ * 0 - physical disk address of data
+ * 1 - buffer for holding read data
+ * 2 - parity stripe ID
+ * 3 - flags
+ * 4 - physical disk address of mirror (parity)
+ *
+ ****************************************************************************************/
+
+int rf_DiskReadMirrorIdleFunc(node)
+ RF_DagNode_t *node;
+{
+ /* select the mirror copy with the shortest queue and fill in node parameters
+ with physical disk address */
+
+ rf_SelectMirrorDiskIdle(node);
+ return(rf_DiskReadFunc(node));
+}
+
+int rf_DiskReadMirrorPartitionFunc(node)
+ RF_DagNode_t *node;
+{
+ /* select the mirror copy with the shortest queue and fill in node parameters
+ with physical disk address */
+
+ rf_SelectMirrorDiskPartition(node);
+ return(rf_DiskReadFunc(node));
+}
+
+int rf_DiskReadMirrorUndoFunc(node)
+ RF_DagNode_t *node;
+{
+ return(0);
+}
+
+
+
+#if RF_INCLUDE_PARITYLOGGING > 0
+/*****************************************************************************************
+ * the execution function associated with a parity log update node
+ ****************************************************************************************/
+int rf_ParityLogUpdateFunc(node)
+ RF_DagNode_t *node;
+{
+ RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
+ caddr_t buf = (caddr_t) node->params[1].p;
+ RF_ParityLogData_t *logData;
+ RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
+ RF_Etimer_t timer;
+
+ if (node->dagHdr->status == rf_enable)
+ {
+ RF_ETIMER_START(timer);
+ logData = rf_CreateParityLogData(RF_UPDATE, pda, buf,
+ (RF_Raid_t *) (node->dagHdr->raidPtr),
+ node->wakeFunc, (void *) node,
+ node->dagHdr->tracerec, timer);
+ if (logData)
+ rf_ParityLogAppend(logData, RF_FALSE, NULL, RF_FALSE);
+ else
+ {
+ RF_ETIMER_STOP(timer); RF_ETIMER_EVAL(timer); tracerec->plog_us += RF_ETIMER_VAL_US(timer);
+ (node->wakeFunc)(node, ENOMEM);
+ }
+ }
+ return(0);
+}
+
+
+/*****************************************************************************************
+ * the execution function associated with a parity log overwrite node
+ ****************************************************************************************/
+int rf_ParityLogOverwriteFunc(node)
+ RF_DagNode_t *node;
+{
+ RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
+ caddr_t buf = (caddr_t) node->params[1].p;
+ RF_ParityLogData_t *logData;
+ RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
+ RF_Etimer_t timer;
+
+ if (node->dagHdr->status == rf_enable)
+ {
+ RF_ETIMER_START(timer);
+ logData = rf_CreateParityLogData(RF_OVERWRITE, pda, buf, (RF_Raid_t *) (node->dagHdr->raidPtr),
+ node->wakeFunc, (void *) node, node->dagHdr->tracerec, timer);
+ if (logData)
+ rf_ParityLogAppend(logData, RF_FALSE, NULL, RF_FALSE);
+ else
+ {
+ RF_ETIMER_STOP(timer); RF_ETIMER_EVAL(timer); tracerec->plog_us += RF_ETIMER_VAL_US(timer);
+ (node->wakeFunc)(node, ENOMEM);
+ }
+ }
+ return(0);
+}
+
+#else /* RF_INCLUDE_PARITYLOGGING > 0 */
+
+int rf_ParityLogUpdateFunc(node)
+ RF_DagNode_t *node;
+{
+ return(0);
+}
+int rf_ParityLogOverwriteFunc(node)
+ RF_DagNode_t *node;
+{
+ return(0);
+}
+
+#endif /* RF_INCLUDE_PARITYLOGGING > 0 */
+
+int rf_ParityLogUpdateUndoFunc(node)
+ RF_DagNode_t *node;
+{
+ return(0);
+}
+
+int rf_ParityLogOverwriteUndoFunc(node)
+ RF_DagNode_t *node;
+{
+ return(0);
+}
+
+/*****************************************************************************************
+ * the execution function associated with a NOP node
+ ****************************************************************************************/
+int rf_NullNodeFunc(node)
+ RF_DagNode_t *node;
+{
+ node->status = rf_good;
+ return(rf_FinishNode(node, RF_THREAD_CONTEXT));
+}
+
+int rf_NullNodeUndoFunc(node)
+ RF_DagNode_t *node;
+{
+ node->status = rf_undone;
+ return(rf_FinishNode(node, RF_THREAD_CONTEXT));
+}
+
+
+/*****************************************************************************************
+ * the execution function associated with a disk-read node
+ ****************************************************************************************/
+int rf_DiskReadFuncForThreads(node)
+ RF_DagNode_t *node;
+{
+ RF_DiskQueueData_t *req;
+ RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *)node->params[0].p;
+ caddr_t buf = (caddr_t)node->params[1].p;
+ RF_StripeNum_t parityStripeID = (RF_StripeNum_t)node->params[2].v;
+ unsigned priority = RF_EXTRACT_PRIORITY(node->params[3].v);
+ unsigned lock = RF_EXTRACT_LOCK_FLAG(node->params[3].v);
+ unsigned unlock = RF_EXTRACT_UNLOCK_FLAG(node->params[3].v);
+ unsigned which_ru = RF_EXTRACT_RU(node->params[3].v);
+ RF_DiskQueueDataFlags_t flags = 0;
+ RF_IoType_t iotype = (node->dagHdr->status == rf_enable) ? RF_IO_TYPE_READ : RF_IO_TYPE_NOP;
+ RF_DiskQueue_t **dqs = ((RF_Raid_t *) (node->dagHdr->raidPtr))->Queues;
+ void *b_proc = NULL;
+#if RF_BACKWARD > 0
+ caddr_t undoBuf;
+#endif
+
+#ifdef KERNEL
+ if (node->dagHdr->bp) b_proc = (void *) ((struct buf *) node->dagHdr->bp)->b_proc;
+#endif /* KERNEL */
+
+ RF_ASSERT( !(lock && unlock) );
+ flags |= (lock) ? RF_LOCK_DISK_QUEUE : 0;
+ flags |= (unlock) ? RF_UNLOCK_DISK_QUEUE : 0;
+#if RF_BACKWARD > 0
+ /* allocate and zero the undo buffer.
+ * this is equivalent to copying the original buffer's contents to the undo buffer
+ * prior to performing the disk read.
+ * XXX hardcoded 512 bytes per sector!
+ */
+ if (node->dagHdr->allocList == NULL)
+ rf_MakeAllocList(node->dagHdr->allocList);
+ RF_CallocAndAdd(undoBuf, 1, 512 * pda->numSector, (caddr_t), node->dagHdr->allocList);
+#endif /* RF_BACKWARD > 0 */
+ req = rf_CreateDiskQueueData(iotype, pda->startSector, pda->numSector,
+ buf, parityStripeID, which_ru,
+ (int (*)(void *,int)) node->wakeFunc,
+ node, NULL, node->dagHdr->tracerec,
+ (void *)(node->dagHdr->raidPtr), flags, b_proc);
+ if (!req) {
+ (node->wakeFunc)(node, ENOMEM);
+ } else {
+ node->dagFuncData = (void *) req;
+ rf_DiskIOEnqueue( &(dqs[pda->row][pda->col]), req, priority );
+ }
+ return(0);
+}
+
+
+/*****************************************************************************************
+ * the execution function associated with a disk-write node
+ ****************************************************************************************/
+int rf_DiskWriteFuncForThreads(node)
+ RF_DagNode_t *node;
+{
+ RF_DiskQueueData_t *req;
+ RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *)node->params[0].p;
+ caddr_t buf = (caddr_t)node->params[1].p;
+ RF_StripeNum_t parityStripeID = (RF_StripeNum_t)node->params[2].v;
+ unsigned priority = RF_EXTRACT_PRIORITY(node->params[3].v);
+ unsigned lock = RF_EXTRACT_LOCK_FLAG(node->params[3].v);
+ unsigned unlock = RF_EXTRACT_UNLOCK_FLAG(node->params[3].v);
+ unsigned which_ru = RF_EXTRACT_RU(node->params[3].v);
+ RF_DiskQueueDataFlags_t flags = 0;
+ RF_IoType_t iotype = (node->dagHdr->status == rf_enable) ? RF_IO_TYPE_WRITE : RF_IO_TYPE_NOP;
+ RF_DiskQueue_t **dqs = ((RF_Raid_t *) (node->dagHdr->raidPtr))->Queues;
+ void *b_proc = NULL;
+#if RF_BACKWARD > 0
+ caddr_t undoBuf;
+#endif
+
+#ifdef KERNEL
+ if (node->dagHdr->bp) b_proc = (void *) ((struct buf *) node->dagHdr->bp)->b_proc;
+#endif /* KERNEL */
+
+#if RF_BACKWARD > 0
+ /* This area is used only for backward error recovery experiments
+ * First, schedule allocate a buffer and schedule a pre-read of the disk
+ * After the pre-read, proceed with the normal disk write
+ */
+ if (node->status == rf_bwd2) {
+ /* just finished undo logging, now perform real function */
+ node->status = rf_fired;
+ RF_ASSERT( !(lock && unlock) );
+ flags |= (lock) ? RF_LOCK_DISK_QUEUE : 0;
+ flags |= (unlock) ? RF_UNLOCK_DISK_QUEUE : 0;
+ req = rf_CreateDiskQueueData(iotype,
+ pda->startSector, pda->numSector, buf, parityStripeID, which_ru,
+ node->wakeFunc, (void *) node, NULL, node->dagHdr->tracerec,
+ (void *) (node->dagHdr->raidPtr), flags, b_proc);
+
+ if (!req) {
+ (node->wakeFunc)(node, ENOMEM);
+ } else {
+ node->dagFuncData = (void *) req;
+ rf_DiskIOEnqueue( &(dqs[pda->row][pda->col]), req, priority );
+ }
+ }
+
+ else {
+ /* node status should be rf_fired */
+ /* schedule a disk pre-read */
+ node->status = rf_bwd1;
+ RF_ASSERT( !(lock && unlock) );
+ flags |= (lock) ? RF_LOCK_DISK_QUEUE : 0;
+ flags |= (unlock) ? RF_UNLOCK_DISK_QUEUE : 0;
+ if (node->dagHdr->allocList == NULL)
+ rf_MakeAllocList(node->dagHdr->allocList);
+ RF_CallocAndAdd(undoBuf, 1, 512 * pda->numSector, (caddr_t), node->dagHdr->allocList);
+ req = rf_CreateDiskQueueData(RF_IO_TYPE_READ,
+ pda->startSector, pda->numSector, undoBuf, parityStripeID, which_ru,
+ node->wakeFunc, (void *) node, NULL, node->dagHdr->tracerec,
+ (void *) (node->dagHdr->raidPtr), flags, b_proc);
+
+ if (!req) {
+ (node->wakeFunc)(node, ENOMEM);
+ } else {
+ node->dagFuncData = (void *) req;
+ rf_DiskIOEnqueue( &(dqs[pda->row][pda->col]), req, priority );
+ }
+ }
+ return(0);
+#endif /* RF_BACKWARD > 0 */
+
+ /* normal processing (rollaway or forward recovery) begins here */
+ RF_ASSERT( !(lock && unlock) );
+ flags |= (lock) ? RF_LOCK_DISK_QUEUE : 0;
+ flags |= (unlock) ? RF_UNLOCK_DISK_QUEUE : 0;
+ req = rf_CreateDiskQueueData(iotype, pda->startSector, pda->numSector,
+ buf, parityStripeID, which_ru,
+ (int (*)(void *,int)) node->wakeFunc,
+ (void *) node, NULL,
+ node->dagHdr->tracerec,
+ (void *) (node->dagHdr->raidPtr),
+ flags, b_proc);
+
+ if (!req) {
+ (node->wakeFunc)(node, ENOMEM);
+ } else {
+ node->dagFuncData = (void *) req;
+ rf_DiskIOEnqueue( &(dqs[pda->row][pda->col]), req, priority );
+ }
+
+ return(0);
+}
+
+/*****************************************************************************************
+ * the undo function for disk nodes
+ * Note: this is not a proper undo of a write node, only locks are released.
+ * old data is not restored to disk!
+ ****************************************************************************************/
+int rf_DiskUndoFunc(node)
+ RF_DagNode_t *node;
+{
+ RF_DiskQueueData_t *req;
+ RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *)node->params[0].p;
+ RF_DiskQueue_t **dqs = ((RF_Raid_t *) (node->dagHdr->raidPtr))->Queues;
+
+ req = rf_CreateDiskQueueData(RF_IO_TYPE_NOP,
+ 0L, 0, NULL, 0L, 0,
+ (int (*)(void *,int)) node->wakeFunc,
+ (void *) node,
+ NULL, node->dagHdr->tracerec,
+ (void *) (node->dagHdr->raidPtr),
+ RF_UNLOCK_DISK_QUEUE, NULL);
+ if (!req)
+ (node->wakeFunc)(node, ENOMEM);
+ else {
+ node->dagFuncData = (void *) req;
+ rf_DiskIOEnqueue( &(dqs[pda->row][pda->col]), req, RF_IO_NORMAL_PRIORITY );
+ }
+
+ return(0);
+}
+
+/*****************************************************************************************
+ * the execution function associated with an "unlock disk queue" node
+ ****************************************************************************************/
+int rf_DiskUnlockFuncForThreads(node)
+ RF_DagNode_t *node;
+{
+ RF_DiskQueueData_t *req;
+ RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *)node->params[0].p;
+ RF_DiskQueue_t **dqs = ((RF_Raid_t *) (node->dagHdr->raidPtr))->Queues;
+
+ req = rf_CreateDiskQueueData(RF_IO_TYPE_NOP,
+ 0L, 0, NULL, 0L, 0,
+ (int (*)(void *,int)) node->wakeFunc,
+ (void *) node,
+ NULL, node->dagHdr->tracerec,
+ (void *) (node->dagHdr->raidPtr),
+ RF_UNLOCK_DISK_QUEUE, NULL);
+ if (!req)
+ (node->wakeFunc)(node, ENOMEM);
+ else {
+ node->dagFuncData = (void *) req;
+ rf_DiskIOEnqueue( &(dqs[pda->row][pda->col]), req, RF_IO_NORMAL_PRIORITY );
+ }
+
+ return(0);
+}
+
+/*****************************************************************************************
+ * Callback routine for DiskRead and DiskWrite nodes. When the disk op completes,
+ * the routine is called to set the node status and inform the execution engine that
+ * the node has fired.
+ ****************************************************************************************/
+int rf_GenericWakeupFunc(node, status)
+ RF_DagNode_t *node;
+ int status;
+{
+ switch (node->status) {
+ case rf_bwd1 :
+ node->status = rf_bwd2;
+ if (node->dagFuncData)
+ rf_FreeDiskQueueData((RF_DiskQueueData_t *) node->dagFuncData);
+ return(rf_DiskWriteFuncForThreads(node));
+ break;
+ case rf_fired :
+ if (status) node->status = rf_bad;
+ else node->status = rf_good;
+ break;
+ case rf_recover :
+ /* probably should never reach this case */
+ if (status) node->status = rf_panic;
+ else node->status = rf_undone;
+ break;
+ default :
+ RF_PANIC();
+ break;
+ }
+ if (node->dagFuncData)
+ rf_FreeDiskQueueData((RF_DiskQueueData_t *) node->dagFuncData);
+ return(rf_FinishNode(node, RF_INTR_CONTEXT));
+}
+
+
+/*****************************************************************************************
+ * there are three distinct types of xor nodes
+ * A "regular xor" is used in the fault-free case where the access spans a complete
+ * stripe unit. It assumes that the result buffer is one full stripe unit in size,
+ * and uses the stripe-unit-offset values that it computes from the PDAs to determine
+ * where within the stripe unit to XOR each argument buffer.
+ *
+ * A "simple xor" is used in the fault-free case where the access touches only a portion
+ * of one (or two, in some cases) stripe unit(s). It assumes that all the argument
+ * buffers are of the same size and have the same stripe unit offset.
+ *
+ * A "recovery xor" is used in the degraded-mode case. It's similar to the regular
+ * xor function except that it takes the failed PDA as an additional parameter, and
+ * uses it to determine what portions of the argument buffers need to be xor'd into
+ * the result buffer, and where in the result buffer they should go.
+ ****************************************************************************************/
+
+/* xor the params together and store the result in the result field.
+ * assume the result field points to a buffer that is the size of one SU,
+ * and use the pda params to determine where within the buffer to XOR
+ * the input buffers.
+ */
+int rf_RegularXorFunc(node)
+ RF_DagNode_t *node;
+{
+ RF_Raid_t *raidPtr = (RF_Raid_t *)node->params[node->numParams-1].p;
+ RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
+ RF_Etimer_t timer;
+ int i, retcode;
+#if RF_BACKWARD > 0
+ RF_PhysDiskAddr_t *pda;
+ caddr_t undoBuf;
+#endif
+
+ retcode = 0;
+ if (node->dagHdr->status == rf_enable) {
+ /* don't do the XOR if the input is the same as the output */
+ RF_ETIMER_START(timer);
+ for (i=0; i<node->numParams-1; i+=2) if (node->params[i+1].p != node->results[0]) {
+#if RF_BACKWARD > 0
+ /* This section mimics undo logging for backward error recovery experiments by
+ * allocating and initializing a buffer
+ * XXX 512 byte sector size is hard coded!
+ */
+ pda = node->params[i].p;
+ if (node->dagHdr->allocList == NULL)
+ rf_MakeAllocList(node->dagHdr->allocList);
+ RF_CallocAndAdd(undoBuf, 1, 512 * pda->numSector, (caddr_t), node->dagHdr->allocList);
+#endif /* RF_BACKWARD > 0 */
+ retcode = rf_XorIntoBuffer(raidPtr, (RF_PhysDiskAddr_t *) node->params[i].p,
+ (char *)node->params[i+1].p, (char *) node->results[0], node->dagHdr->bp);
+ }
+ RF_ETIMER_STOP(timer); RF_ETIMER_EVAL(timer); tracerec->xor_us += RF_ETIMER_VAL_US(timer);
+ }
+ return(rf_GenericWakeupFunc(node, retcode)); /* call wake func explicitly since no I/O in this node */
+}
+
+/* xor the inputs into the result buffer, ignoring placement issues */
+int rf_SimpleXorFunc(node)
+ RF_DagNode_t *node;
+{
+ RF_Raid_t *raidPtr = (RF_Raid_t *)node->params[node->numParams-1].p;
+ int i, retcode = 0;
+ RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
+ RF_Etimer_t timer;
+#if RF_BACKWARD > 0
+ RF_PhysDiskAddr_t *pda;
+ caddr_t undoBuf;
+#endif
+
+ if (node->dagHdr->status == rf_enable) {
+ RF_ETIMER_START(timer);
+ /* don't do the XOR if the input is the same as the output */
+ for (i=0; i<node->numParams-1; i+=2) if (node->params[i+1].p != node->results[0]) {
+#if RF_BACKWARD > 0
+ /* This section mimics undo logging for backward error recovery experiments by
+ * allocating and initializing a buffer
+ * XXX 512 byte sector size is hard coded!
+ */
+ pda = node->params[i].p;
+ if (node->dagHdr->allocList == NULL)
+ rf_MakeAllocList(node->dagHdr->allocList);
+ RF_CallocAndAdd(undoBuf, 1, 512 * pda->numSector, (caddr_t), node->dagHdr->allocList);
+#endif /* RF_BACKWARD > 0 */
+ retcode = rf_bxor((char *)node->params[i+1].p, (char *) node->results[0],
+ rf_RaidAddressToByte(raidPtr, ((RF_PhysDiskAddr_t *)node->params[i].p)->numSector),
+ (struct buf *) node->dagHdr->bp);
+ }
+ RF_ETIMER_STOP(timer); RF_ETIMER_EVAL(timer); tracerec->xor_us += RF_ETIMER_VAL_US(timer);
+ }
+
+ return(rf_GenericWakeupFunc(node, retcode)); /* call wake func explicitly since no I/O in this node */
+}
+
+/* this xor is used by the degraded-mode dag functions to recover lost data.
+ * the second-to-last parameter is the PDA for the failed portion of the access.
+ * the code here looks at this PDA and assumes that the xor target buffer is
+ * equal in size to the number of sectors in the failed PDA. It then uses
+ * the other PDAs in the parameter list to determine where within the target
+ * buffer the corresponding data should be xored.
+ */
+int rf_RecoveryXorFunc(node)
+ RF_DagNode_t *node;
+{
+ RF_Raid_t *raidPtr = (RF_Raid_t *)node->params[node->numParams-1].p;
+ RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &raidPtr->Layout;
+ RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *)node->params[node->numParams-2].p;
+ int i, retcode = 0;
+ RF_PhysDiskAddr_t *pda;
+ int suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr,failedPDA->startSector);
+ char *srcbuf, *destbuf;
+ RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
+ RF_Etimer_t timer;
+#if RF_BACKWARD > 0
+ caddr_t undoBuf;
+#endif
+
+ if (node->dagHdr->status == rf_enable) {
+ RF_ETIMER_START(timer);
+ for (i=0; i<node->numParams-2; i+=2) if (node->params[i+1].p != node->results[0]) {
+ pda = (RF_PhysDiskAddr_t *)node->params[i].p;
+#if RF_BACKWARD > 0
+ /* This section mimics undo logging for backward error recovery experiments by
+ * allocating and initializing a buffer
+ * XXX 512 byte sector size is hard coded!
+ */
+ if (node->dagHdr->allocList == NULL)
+ rf_MakeAllocList(node->dagHdr->allocList);
+ RF_CallocAndAdd(undoBuf, 1, 512 * pda->numSector, (caddr_t), node->dagHdr->allocList);
+#endif /* RF_BACKWARD > 0 */
+ srcbuf = (char *)node->params[i+1].p;
+ suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
+ destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr,suoffset-failedSUOffset);
+ retcode = rf_bxor(srcbuf, destbuf, rf_RaidAddressToByte(raidPtr, pda->numSector), node->dagHdr->bp);
+ }
+ RF_ETIMER_STOP(timer); RF_ETIMER_EVAL(timer); tracerec->xor_us += RF_ETIMER_VAL_US(timer);
+ }
+ return (rf_GenericWakeupFunc(node, retcode));
+}
+
+/*****************************************************************************************
+ * The next three functions are utilities used by the above xor-execution functions.
+ ****************************************************************************************/
+
+
+/*
+ * this is just a glorified buffer xor. targbuf points to a buffer that is one full stripe unit
+ * in size. srcbuf points to a buffer that may be less than 1 SU, but never more. When the
+ * access described by pda is one SU in size (which by implication means it's SU-aligned),
+ * all that happens is (targbuf) <- (srcbuf ^ targbuf). When the access is less than one
+ * SU in size the XOR occurs on only the portion of targbuf identified in the pda.
+ */
+
+int rf_XorIntoBuffer(raidPtr, pda, srcbuf, targbuf, bp)
+ RF_Raid_t *raidPtr;
+ RF_PhysDiskAddr_t *pda;
+ char *srcbuf;
+ char *targbuf;
+ void *bp;
+{
+ char *targptr;
+ int sectPerSU = raidPtr->Layout.sectorsPerStripeUnit;
+ int SUOffset = pda->startSector % sectPerSU;
+ int length, retcode = 0;
+
+ RF_ASSERT(pda->numSector <= sectPerSU);
+
+ targptr = targbuf + rf_RaidAddressToByte(raidPtr, SUOffset);
+ length = rf_RaidAddressToByte(raidPtr, pda->numSector);
+ retcode = rf_bxor(srcbuf, targptr, length, bp);
+ return(retcode);
+}
+
+/* it really should be the case that the buffer pointers (returned by malloc)
+ * are aligned to the natural word size of the machine, so this is the only
+ * case we optimize for. The length should always be a multiple of the sector
+ * size, so there should be no problem with leftover bytes at the end.
+ */
+int rf_bxor(src, dest, len, bp)
+ char *src;
+ char *dest;
+ int len;
+ void *bp;
+{
+ unsigned mask = sizeof(long) -1, retcode = 0;
+
+ if ( !(((unsigned long) src) & mask) && !(((unsigned long) dest) & mask) && !(len&mask) ) {
+ retcode = rf_longword_bxor((unsigned long *) src, (unsigned long *) dest, len>>RF_LONGSHIFT, bp);
+ } else {
+ RF_ASSERT(0);
+ }
+ return(retcode);
+}
+
+/* map a user buffer into kernel space, if necessary */
+#ifdef KERNEL
+#if defined(__NetBSD__) || defined(__OpenBSD__)
+/* XXX Not a clue if this is even close.. */
+#define REMAP_VA(_bp,x,y) (y) = (x)
+#else
+#define REMAP_VA(_bp,x,y) (y) = (unsigned long *) ((IS_SYS_VA(x)) ? (unsigned long *)(x) : (unsigned long *) rf_MapToKernelSpace((struct buf *) (_bp), (caddr_t)(x)))
+#endif /* __NetBSD__ || __OpenBSD__ */
+#else /* KERNEL */
+#define REMAP_VA(_bp,x,y) (y) = (x)
+#endif /* KERNEL */
+
+/* When XORing in kernel mode, we need to map each user page to kernel space before we can access it.
+ * We don't want to assume anything about which input buffers are in kernel/user
+ * space, nor about their alignment, so in each loop we compute the maximum number
+ * of bytes that we can xor without crossing any page boundaries, and do only this many
+ * bytes before the next remap.
+ */
+int rf_longword_bxor(src, dest, len, bp)
+ register unsigned long *src;
+ register unsigned long *dest;
+ int len; /* longwords */
+ void *bp;
+{
+ register unsigned long *end = src+len;
+ register unsigned long d0, d1, d2, d3, s0, s1, s2, s3; /* temps */
+ register unsigned long *pg_src, *pg_dest; /* per-page source/dest pointers */
+ int longs_this_time; /* # longwords to xor in the current iteration */
+
+ REMAP_VA(bp, src, pg_src);
+ REMAP_VA(bp, dest, pg_dest);
+ if (!pg_src || !pg_dest) return(EFAULT);
+
+ while (len >= 4 ) {
+ longs_this_time = RF_MIN(len, RF_MIN(RF_BLIP(pg_src), RF_BLIP(pg_dest)) >> RF_LONGSHIFT); /* note len in longwords */
+ src += longs_this_time; dest+= longs_this_time; len -= longs_this_time;
+ while (longs_this_time >= 4) {
+ d0 = pg_dest[0];
+ d1 = pg_dest[1];
+ d2 = pg_dest[2];
+ d3 = pg_dest[3];
+ s0 = pg_src[0];
+ s1 = pg_src[1];
+ s2 = pg_src[2];
+ s3 = pg_src[3];
+ pg_dest[0] = d0 ^ s0;
+ pg_dest[1] = d1 ^ s1;
+ pg_dest[2] = d2 ^ s2;
+ pg_dest[3] = d3 ^ s3;
+ pg_src += 4;
+ pg_dest += 4;
+ longs_this_time -= 4;
+ }
+ while (longs_this_time > 0) { /* cannot cross any page boundaries here */
+ *pg_dest++ ^= *pg_src++;
+ longs_this_time--;
+ }
+
+ /* either we're done, or we've reached a page boundary on one (or possibly both) of the pointers */
+ if (len) {
+ if (RF_PAGE_ALIGNED(src)) REMAP_VA(bp, src, pg_src);
+ if (RF_PAGE_ALIGNED(dest)) REMAP_VA(bp, dest, pg_dest);
+ if (!pg_src || !pg_dest) return(EFAULT);
+ }
+ }
+ while (src < end) {
+ *pg_dest++ ^= *pg_src++;
+ src++; dest++; len--;
+ if (RF_PAGE_ALIGNED(src)) REMAP_VA(bp, src, pg_src);
+ if (RF_PAGE_ALIGNED(dest)) REMAP_VA(bp, dest, pg_dest);
+ }
+ RF_ASSERT(len == 0);
+ return(0);
+}
+
+
+/*
+ dst = a ^ b ^ c;
+ a may equal dst
+ see comment above longword_bxor
+*/
+int rf_longword_bxor3(dst,a,b,c,len, bp)
+ register unsigned long *dst;
+ register unsigned long *a;
+ register unsigned long *b;
+ register unsigned long *c;
+ int len; /* length in longwords */
+ void *bp;
+{
+ unsigned long a0,a1,a2,a3, b0,b1,b2,b3;
+ register unsigned long *pg_a, *pg_b, *pg_c, *pg_dst; /* per-page source/dest pointers */
+ int longs_this_time; /* # longs to xor in the current iteration */
+ char dst_is_a = 0;
+
+ REMAP_VA(bp, a, pg_a);
+ REMAP_VA(bp, b, pg_b);
+ REMAP_VA(bp, c, pg_c);
+ if (a == dst) {pg_dst = pg_a; dst_is_a = 1;} else { REMAP_VA(bp, dst, pg_dst); }
+
+ /* align dest to cache line. Can't cross a pg boundary on dst here. */
+ while ((((unsigned long) pg_dst) & 0x1f)) {
+ *pg_dst++ = *pg_a++ ^ *pg_b++ ^ *pg_c++;
+ dst++; a++; b++; c++;
+ if (RF_PAGE_ALIGNED(a)) {REMAP_VA(bp, a, pg_a); if (!pg_a) return(EFAULT);}
+ if (RF_PAGE_ALIGNED(b)) {REMAP_VA(bp, b, pg_b); if (!pg_b) return(EFAULT);}
+ if (RF_PAGE_ALIGNED(c)) {REMAP_VA(bp, c, pg_c); if (!pg_c) return(EFAULT);}
+ len--;
+ }
+
+ while (len > 4 ) {
+ longs_this_time = RF_MIN(len, RF_MIN(RF_BLIP(a), RF_MIN(RF_BLIP(b), RF_MIN(RF_BLIP(c), RF_BLIP(dst)))) >> RF_LONGSHIFT);
+ a+= longs_this_time; b+= longs_this_time; c+= longs_this_time; dst+=longs_this_time; len-=longs_this_time;
+ while (longs_this_time >= 4) {
+ a0 = pg_a[0]; longs_this_time -= 4;
+
+ a1 = pg_a[1];
+ a2 = pg_a[2];
+
+ a3 = pg_a[3]; pg_a += 4;
+
+ b0 = pg_b[0];
+ b1 = pg_b[1];
+
+ b2 = pg_b[2];
+ b3 = pg_b[3];
+ /* start dual issue */
+ a0 ^= b0; b0 = pg_c[0];
+
+ pg_b += 4; a1 ^= b1;
+
+ a2 ^= b2; a3 ^= b3;
+
+ b1 = pg_c[1]; a0 ^= b0;
+
+ b2 = pg_c[2]; a1 ^= b1;
+
+ b3 = pg_c[3]; a2 ^= b2;
+
+ pg_dst[0] = a0; a3 ^= b3;
+ pg_dst[1] = a1; pg_c += 4;
+ pg_dst[2] = a2;
+ pg_dst[3] = a3; pg_dst += 4;
+ }
+ while (longs_this_time > 0) { /* cannot cross any page boundaries here */
+ *pg_dst++ = *pg_a++ ^ *pg_b++ ^ *pg_c++;
+ longs_this_time--;
+ }
+
+ if (len) {
+ if (RF_PAGE_ALIGNED(a)) {REMAP_VA(bp, a, pg_a); if (!pg_a) return(EFAULT); if (dst_is_a) pg_dst = pg_a;}
+ if (RF_PAGE_ALIGNED(b)) {REMAP_VA(bp, b, pg_b); if (!pg_b) return(EFAULT);}
+ if (RF_PAGE_ALIGNED(c)) {REMAP_VA(bp, c, pg_c); if (!pg_c) return(EFAULT);}
+ if (!dst_is_a) if (RF_PAGE_ALIGNED(dst)) {REMAP_VA(bp, dst, pg_dst); if (!pg_dst) return(EFAULT);}
+ }
+ }
+ while (len) {
+ *pg_dst++ = *pg_a++ ^ *pg_b++ ^ *pg_c++;
+ dst++; a++; b++; c++;
+ if (RF_PAGE_ALIGNED(a)) {REMAP_VA(bp, a, pg_a); if (!pg_a) return(EFAULT); if (dst_is_a) pg_dst = pg_a;}
+ if (RF_PAGE_ALIGNED(b)) {REMAP_VA(bp, b, pg_b); if (!pg_b) return(EFAULT);}
+ if (RF_PAGE_ALIGNED(c)) {REMAP_VA(bp, c, pg_c); if (!pg_c) return(EFAULT);}
+ if (!dst_is_a) if (RF_PAGE_ALIGNED(dst)) {REMAP_VA(bp, dst, pg_dst); if (!pg_dst) return(EFAULT);}
+ len--;
+ }
+ return(0);
+}
+
+int rf_bxor3(dst,a,b,c,len, bp)
+ register unsigned char *dst;
+ register unsigned char *a;
+ register unsigned char *b;
+ register unsigned char *c;
+ unsigned long len;
+ void *bp;
+{
+ RF_ASSERT(((RF_UL(dst)|RF_UL(a)|RF_UL(b)|RF_UL(c)|len) & 0x7) == 0);
+
+ return(rf_longword_bxor3((unsigned long *)dst, (unsigned long *)a,
+ (unsigned long *)b, (unsigned long *)c, len>>RF_LONGSHIFT, bp));
+}
diff --git a/sys/dev/raidframe/rf_dagfuncs.h b/sys/dev/raidframe/rf_dagfuncs.h
new file mode 100644
index 00000000000..ab19b712421
--- /dev/null
+++ b/sys/dev/raidframe/rf_dagfuncs.h
@@ -0,0 +1,138 @@
+/* $OpenBSD: rf_dagfuncs.h,v 1.1 1999/01/11 14:29:11 niklas Exp $ */
+/* $NetBSD: rf_dagfuncs.h,v 1.1 1998/11/13 04:20:28 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland, William V. Courtright II, Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*****************************************************************************************
+ *
+ * dagfuncs.h -- header file for DAG node execution routines
+ *
+ ****************************************************************************************/
+
+/*
+ * :
+ * Log: rf_dagfuncs.h,v
+ * Revision 1.17 1996/07/22 19:52:16 jimz
+ * switched node params to RF_DagParam_t, a union of
+ * a 64-bit int and a void *, for better portability
+ * attempted hpux port, but failed partway through for
+ * lack of a single C compiler capable of compiling all
+ * source files
+ *
+ * Revision 1.16 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.15 1996/06/06 17:27:20 jimz
+ * added another read mirror func (partitioning), changed names so dag
+ * creation routines can use the appropriate one
+ *
+ * Revision 1.14 1996/05/30 11:29:41 jimz
+ * Numerous bug fixes. Stripe lock release code disagreed with the taking code
+ * about when stripes should be locked (I made it consistent: no parity, no lock)
+ * There was a lot of extra serialization of I/Os which I've removed- a lot of
+ * it was to calculate values for the cache code, which is no longer with us.
+ * More types, function, macro cleanup. Added code to properly quiesce the array
+ * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
+ * before. Fixed memory allocation, freeing bugs.
+ *
+ * Revision 1.13 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.12 1996/05/24 04:28:55 jimz
+ * release cleanup ckpt
+ *
+ * Revision 1.11 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.10 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.9 1995/12/01 15:56:46 root
+ * added copyright info
+ *
+ * Revision 1.8 1995/11/07 16:25:23 wvcii
+ * added DiskUnlockFuncForThreads
+ *
+ */
+
+#ifndef _RF__RF_DAGFUNCS_H_
+#define _RF__RF_DAGFUNCS_H_
+
+int rf_ConfigureDAGFuncs(RF_ShutdownList_t **listp);
+int rf_TerminateFunc(RF_DagNode_t *node);
+int rf_TerminateUndoFunc(RF_DagNode_t *node);
+int rf_DiskReadMirrorIdleFunc(RF_DagNode_t *node);
+int rf_DiskReadMirrorPartitionFunc(RF_DagNode_t *node);
+int rf_DiskReadMirrorUndoFunc(RF_DagNode_t *node);
+int rf_ParityLogUpdateFunc(RF_DagNode_t *node);
+int rf_ParityLogOverwriteFunc(RF_DagNode_t *node);
+int rf_ParityLogUpdateUndoFunc(RF_DagNode_t *node);
+int rf_ParityLogOverwriteUndoFunc(RF_DagNode_t *node);
+int rf_NullNodeFunc(RF_DagNode_t *node);
+int rf_NullNodeUndoFunc(RF_DagNode_t *node);
+int rf_DiskReadFuncForThreads(RF_DagNode_t *node);
+int rf_DiskWriteFuncForThreads(RF_DagNode_t *node);
+int rf_DiskUndoFunc(RF_DagNode_t *node);
+int rf_DiskUnlockFuncForThreads(RF_DagNode_t *node);
+int rf_GenericWakeupFunc(RF_DagNode_t *node, int status);
+int rf_RegularXorFunc(RF_DagNode_t *node);
+int rf_SimpleXorFunc(RF_DagNode_t *node);
+int rf_RecoveryXorFunc(RF_DagNode_t *node);
+int rf_XorIntoBuffer(RF_Raid_t *raidPtr, RF_PhysDiskAddr_t *pda, char *srcbuf,
+ char *targbuf, void *bp);
+int rf_bxor(char *src, char *dest, int len, void *bp);
+int rf_longword_bxor(register unsigned long *src, register unsigned long *dest,
+ int len, void *bp);
+int rf_longword_bxor3(register unsigned long *dest, register unsigned long *a,
+ register unsigned long *b, register unsigned long *c, int len, void *bp);
+int rf_bxor3(unsigned char *dst, unsigned char *a, unsigned char *b,
+ unsigned char *c, unsigned long len, void *bp);
+
+/* function ptrs defined in rf_ConfigureDAGFuncs() */
+extern int (*rf_DiskReadFunc)(RF_DagNode_t *);
+extern int (*rf_DiskWriteFunc)(RF_DagNode_t *);
+extern int (*rf_DiskReadUndoFunc)(RF_DagNode_t *);
+extern int (*rf_DiskWriteUndoFunc)(RF_DagNode_t *);
+extern int (*rf_DiskUnlockFunc)(RF_DagNode_t *);
+extern int (*rf_DiskUnlockUndoFunc)(RF_DagNode_t *);
+extern int (*rf_SimpleXorUndoFunc)(RF_DagNode_t *);
+extern int (*rf_RegularXorUndoFunc)(RF_DagNode_t *);
+extern int (*rf_RecoveryXorUndoFunc)(RF_DagNode_t *);
+
+/* macros for manipulating the param[3] in a read or write node */
+#define RF_CREATE_PARAM3(pri, lk, unlk, wru) (((RF_uint64)(((wru&0xFFFFFF)<<8)|((lk)?0x10:0)|((unlk)?0x20:0)|((pri)&0xF)) ))
+#define RF_EXTRACT_PRIORITY(_x_) ((((unsigned) ((unsigned long)(_x_))) >> 0) & 0x0F)
+#define RF_EXTRACT_LOCK_FLAG(_x_) ((((unsigned) ((unsigned long)(_x_))) >> 4) & 0x1)
+#define RF_EXTRACT_UNLOCK_FLAG(_x_) ((((unsigned) ((unsigned long)(_x_))) >> 5) & 0x1)
+#define RF_EXTRACT_RU(_x_) ((((unsigned) ((unsigned long)(_x_))) >> 8) & 0xFFFFFF)
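+
+/*
+ * Example (hypothetical values): packing priority 2, a taken stripe lock,
+ * no unlock, and RU number 5 into param[3], then unpacking it with the
+ * macros above:
+ *
+ *	RF_uint64 p3 = RF_CREATE_PARAM3(2, 1, 0, 5);	-- p3 == 0x512
+ *	RF_EXTRACT_PRIORITY(p3)    == 2		-- bits 0-3
+ *	RF_EXTRACT_LOCK_FLAG(p3)   == 1		-- bit 4
+ *	RF_EXTRACT_UNLOCK_FLAG(p3) == 0		-- bit 5
+ *	RF_EXTRACT_RU(p3)          == 5		-- bits 8-31
+ */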
+
+#endif /* !_RF__RF_DAGFUNCS_H_ */
diff --git a/sys/dev/raidframe/rf_dagutils.c b/sys/dev/raidframe/rf_dagutils.c
new file mode 100644
index 00000000000..b050b832af6
--- /dev/null
+++ b/sys/dev/raidframe/rf_dagutils.c
@@ -0,0 +1,1406 @@
+/* $OpenBSD: rf_dagutils.c,v 1.1 1999/01/11 14:29:11 niklas Exp $ */
+/* $NetBSD: rf_dagutils.c,v 1.1 1998/11/13 04:20:28 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Authors: Mark Holland, William V. Courtright II, Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/******************************************************************************
+ *
+ * rf_dagutils.c -- utility routines for manipulating dags
+ *
+ *****************************************************************************/
+
+/*
+ * :
+ * Log: rf_dagutils.c,v
+ * Revision 1.55 1996/08/22 14:39:47 jimz
+ * reduce v/k fraction (better load balancing)
+ *
+ * Revision 1.54 1996/08/21 04:14:12 jimz
+ * minor workload shift tweaking
+ *
+ * Revision 1.53 1996/08/20 23:41:16 jimz
+ * fix up workload shift computation
+ *
+ * Revision 1.52 1996/08/20 22:34:16 jimz
+ * first cut at fixing workload shift
+ * needs work
+ *
+ * Revision 1.51 1996/08/20 16:51:16 jimz
+ * comment more verbosely compute_workload_shift()
+ *
+ * Revision 1.50 1996/08/11 00:40:50 jimz
+ * fix up broken comment
+ *
+ * Revision 1.49 1996/07/27 23:36:08 jimz
+ * Solaris port of simulator
+ *
+ * Revision 1.48 1996/07/27 18:40:01 jimz
+ * cleanup sweep
+ *
+ * Revision 1.47 1996/07/22 19:52:16 jimz
+ * switched node params to RF_DagParam_t, a union of
+ * a 64-bit int and a void *, for better portability
+ * attempted hpux port, but failed partway through for
+ * lack of a single C compiler capable of compiling all
+ * source files
+ *
+ * Revision 1.46 1996/07/18 22:57:14 jimz
+ * port simulator to AIX
+ *
+ * Revision 1.45 1996/06/17 03:24:59 jimz
+ * include shutdown.h for define of now-macroized ShutdownCreate
+ *
+ * Revision 1.44 1996/06/10 12:50:57 jimz
+ * Add counters to freelists to track number of allocations, frees,
+ * grows, max size, etc. Adjust a couple sets of PRIME params based
+ * on the results.
+ *
+ * Revision 1.43 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.42 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.41 1996/06/06 17:28:58 jimz
+ * make PrintNodeInfoString aware of new mirroring funcs
+ *
+ * Revision 1.40 1996/06/05 18:06:02 jimz
+ * Major code cleanup. The Great Renaming is now done.
+ * Better modularity. Better typing. Fixed a bunch of
+ * synchronization bugs. Made a lot of global stuff
+ * per-desc or per-array. Removed dead code.
+ *
+ * Revision 1.39 1996/06/03 23:28:26 jimz
+ * more bugfixes
+ * check in tree to sync for IPDS runs with current bugfixes
+ * there still may be a problem with threads in the script test
+ * getting I/Os stuck- not trivially reproducible (runs ~50 times
+ * in a row without getting stuck)
+ *
+ * Revision 1.38 1996/06/02 17:31:48 jimz
+ * Moved a lot of global stuff into array structure, where it belongs.
+ * Fixed up paritylogging, pss modules in this manner. Some general
+ * code cleanup. Removed lots of dead code, some dead files.
+ *
+ * Revision 1.37 1996/05/31 22:26:54 jimz
+ * fix a lot of mapping problems, memory allocation problems
+ * found some weird lock issues, fixed 'em
+ * more code cleanup
+ *
+ * Revision 1.36 1996/05/30 23:22:16 jimz
+ * bugfixes of serialization, timing problems
+ * more cleanup
+ *
+ * Revision 1.35 1996/05/30 11:29:41 jimz
+ * Numerous bug fixes. Stripe lock release code disagreed with the taking code
+ * about when stripes should be locked (I made it consistent: no parity, no lock)
+ * There was a lot of extra serialization of I/Os which I've removed- a lot of
+ * it was to calculate values for the cache code, which is no longer with us.
+ * More types, function, macro cleanup. Added code to properly quiesce the array
+ * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
+ * before. Fixed memory allocation, freeing bugs.
+ *
+ * Revision 1.34 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.33 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.32 1996/05/24 04:28:55 jimz
+ * release cleanup ckpt
+ *
+ * Revision 1.31 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.30 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.29 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.28 1996/05/16 23:05:52 jimz
+ * changed InitNode() to use dag_ptrs field of node when appropriate
+ * (see rf_dag.h or comments within InitNode() for details)
+ *
+ * Revision 1.27 1996/05/16 15:37:19 jimz
+ * convert to RF_FREELIST stuff for dag headers
+ *
+ * Revision 1.26 1996/05/08 21:01:24 jimz
+ * fixed up enum type names that were conflicting with other
+ * enums and function names (ie, "panic")
+ * future naming trends will be towards RF_ and rf_ for
+ * everything raidframe-related
+ *
+ * Revision 1.25 1996/05/03 19:56:15 wvcii
+ * added misc routines from old dag creation files
+ *
+ * Revision 1.24 1995/12/12 18:10:06 jimz
+ * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT
+ * fix 80-column brain damage in comments
+ *
+ * Revision 1.23 1995/12/01 15:59:50 root
+ * added copyright info
+ *
+ * Revision 1.22 1995/11/17 15:14:12 wvcii
+ * PrintDAG now processes DiskReadMirrorFunc nodes
+ *
+ * Revision 1.21 1995/11/07 16:22:38 wvcii
+ * InitNode and InitNodeFromBuf now initialize commit fields
+ * beefed up ValidateDag
+ * prettied up PrintDAGList
+ *
+ */
+
+#include "rf_archs.h"
+#include "rf_types.h"
+#include "rf_threadstuff.h"
+#include "rf_raid.h"
+#include "rf_dag.h"
+#include "rf_dagutils.h"
+#include "rf_dagfuncs.h"
+#include "rf_general.h"
+#include "rf_freelist.h"
+#include "rf_map.h"
+#include "rf_shutdown.h"
+#include "rf_sys.h"
+
+#define SNUM_DIFF(_a_,_b_) (((_a_)>(_b_))?((_a_)-(_b_)):((_b_)-(_a_)))
+
+RF_RedFuncs_t rf_xorFuncs = {
+ rf_RegularXorFunc, "Reg Xr",
+ rf_SimpleXorFunc, "Simple Xr"};
+
+RF_RedFuncs_t rf_xorRecoveryFuncs = {
+ rf_RecoveryXorFunc, "Recovery Xr",
+ rf_RecoveryXorFunc, "Recovery Xr"};
+
+static void rf_RecurPrintDAG(RF_DagNode_t *, int, int);
+static void rf_PrintDAG(RF_DagHeader_t *);
+static int rf_ValidateBranch(RF_DagNode_t *, int *, int *,
+ RF_DagNode_t **, int );
+static void rf_ValidateBranchVisitedBits(RF_DagNode_t *, int, int);
+static void rf_ValidateVisitedBits(RF_DagHeader_t *);
+
+/******************************************************************************
+ *
+ * InitNode - initialize a dag node
+ *
+ * the size of the propList array is always the same as that of the
+ * successors array.
+ *
+ *****************************************************************************/
+void rf_InitNode(
+ RF_DagNode_t *node,
+ RF_NodeStatus_t initstatus,
+ int commit,
+ int (*doFunc)(RF_DagNode_t *node),
+ int (*undoFunc)(RF_DagNode_t *node),
+ int (*wakeFunc)(RF_DagNode_t *node,int status),
+ int nSucc,
+ int nAnte,
+ int nParam,
+ int nResult,
+ RF_DagHeader_t *hdr,
+ char *name,
+ RF_AllocListElem_t *alist)
+{
+ void **ptrs;
+ int nptrs;
+
+ if (nAnte > RF_MAX_ANTECEDENTS)
+ RF_PANIC();
+ node->status = initstatus;
+ node->commitNode = commit;
+ node->doFunc = doFunc;
+ node->undoFunc = undoFunc;
+ node->wakeFunc = wakeFunc;
+ node->numParams = nParam;
+ node->numResults = nResult;
+ node->numAntecedents = nAnte;
+ node->numAntDone = 0;
+ node->next = NULL;
+ node->numSuccedents = nSucc;
+ node->name = name;
+ node->dagHdr = hdr;
+ node->visited = 0;
+
+ /* allocate all the pointers with one call to malloc */
+ nptrs = nSucc+nAnte+nResult+nSucc;
+
+ if (nptrs <= RF_DAG_PTRCACHESIZE) {
+ /*
+ * The dag_ptrs field of the node is basically some scribble
+ * space to be used here. We could get rid of it, and always
+ * allocate the range of pointers, but that's expensive. So,
+ * we pick a "common case" size for the pointer cache. Hopefully,
+ * we'll find that:
+		 * (1) Generally, nptrs doesn't exceed RF_DAG_PTRCACHESIZE by
+		 *     just a little bit (the least efficient case: we just
+		 *     barely miss fitting in the cache, so the cache space is
+		 *     wasted and a full array is malloc'd anyway)
+		 * (2) Generally, nptrs isn't a lot less than RF_DAG_PTRCACHESIZE
+ * (wasted memory)
+ */
+ ptrs = (void **)node->dag_ptrs;
+ }
+ else {
+ RF_CallocAndAdd(ptrs, nptrs, sizeof(void *), (void **), alist);
+ }
+ node->succedents = (nSucc) ? (RF_DagNode_t **) ptrs : NULL;
+ node->antecedents = (nAnte) ? (RF_DagNode_t **) (ptrs+nSucc) : NULL;
+ node->results = (nResult) ? (void **) (ptrs+nSucc+nAnte) : NULL;
+ node->propList = (nSucc) ? (RF_PropHeader_t **) (ptrs+nSucc+nAnte+nResult) : NULL;
+
+ if (nParam) {
+ if (nParam <= RF_DAG_PARAMCACHESIZE) {
+ node->params = (RF_DagParam_t *)node->dag_params;
+ }
+ else {
+ RF_CallocAndAdd(node->params, nParam, sizeof(RF_DagParam_t), (RF_DagParam_t *), alist);
+ }
+ }
+ else {
+ node->params = NULL;
+ }
+}
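+
+/*
+ * Example (hypothetical node): a DAG creation routine initializes each node
+ * with a do/undo/wakeup triple plus the counts that size the arrays set up
+ * above; e.g. a read-type node with one succedent, one antecedent, four
+ * parameters and no results:
+ *
+ *	rf_InitNode(node, rf_wait, 0, rf_DiskReadFunc, rf_DiskReadUndoFunc,
+ *	    rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Rod", allocList);
+ */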
+
+
+
+/******************************************************************************
+ *
+ * allocation and deallocation routines
+ *
+ *****************************************************************************/
+
+void rf_FreeDAG(dag_h)
+ RF_DagHeader_t *dag_h;
+{
+ RF_AccessStripeMapHeader_t *asmap, *t_asmap;
+ RF_DagHeader_t *nextDag;
+ int i;
+
+ while (dag_h) {
+ nextDag = dag_h->next;
+		for (i=0; i < RF_MAXCHUNKS && dag_h->memChunk[i]; i++) {
+ /* release mem chunks */
+ rf_ReleaseMemChunk(dag_h->memChunk[i]);
+ dag_h->memChunk[i] = NULL;
+ }
+
+ RF_ASSERT(i == dag_h->chunkIndex);
+ if (dag_h->xtraChunkCnt > 0) {
+ /* free xtraMemChunks */
+			for (i=0; i < dag_h->xtraChunkIndex && dag_h->xtraMemChunk[i]; i++) {
+ rf_ReleaseMemChunk(dag_h->xtraMemChunk[i]);
+ dag_h->xtraMemChunk[i] = NULL;
+ }
+ RF_ASSERT(i == dag_h->xtraChunkIndex);
+ /* free ptrs to xtraMemChunks */
+ RF_Free(dag_h->xtraMemChunk, dag_h->xtraChunkCnt * sizeof(RF_ChunkDesc_t *));
+ }
+ rf_FreeAllocList(dag_h->allocList);
+ for (asmap = dag_h->asmList; asmap;) {
+ t_asmap = asmap;
+ asmap = asmap->next;
+ rf_FreeAccessStripeMap(t_asmap);
+ }
+ rf_FreeDAGHeader(dag_h);
+ dag_h = nextDag;
+ }
+}
+
+RF_PropHeader_t *rf_MakePropListEntry(
+ RF_DagHeader_t *dag_h,
+ int resultNum,
+ int paramNum,
+ RF_PropHeader_t *next,
+ RF_AllocListElem_t *allocList)
+{
+ RF_PropHeader_t *p;
+
+ RF_CallocAndAdd(p, 1, sizeof(RF_PropHeader_t),
+ (RF_PropHeader_t *), allocList);
+ p->resultNum = resultNum;
+ p->paramNum = paramNum;
+ p->next = next;
+ return(p);
+}
+
+static RF_FreeList_t *rf_dagh_freelist;
+
+#define RF_MAX_FREE_DAGH 128
+#define RF_DAGH_INC 16
+#define RF_DAGH_INITIAL 32
+
+static void rf_ShutdownDAGs(void *);
+static void rf_ShutdownDAGs(ignored)
+ void *ignored;
+{
+ RF_FREELIST_DESTROY(rf_dagh_freelist,next,(RF_DagHeader_t *));
+}
+
+int rf_ConfigureDAGs(listp)
+ RF_ShutdownList_t **listp;
+{
+ int rc;
+
+ RF_FREELIST_CREATE(rf_dagh_freelist, RF_MAX_FREE_DAGH,
+ RF_DAGH_INC, sizeof(RF_DagHeader_t));
+ if (rf_dagh_freelist == NULL)
+ return(ENOMEM);
+ rc = rf_ShutdownCreate(listp, rf_ShutdownDAGs, NULL);
+ if (rc) {
+ RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n",
+ __FILE__, __LINE__, rc);
+ rf_ShutdownDAGs(NULL);
+ return(rc);
+ }
+ RF_FREELIST_PRIME(rf_dagh_freelist, RF_DAGH_INITIAL,next,
+ (RF_DagHeader_t *));
+ return(0);
+}
+
+RF_DagHeader_t *rf_AllocDAGHeader()
+{
+ RF_DagHeader_t *dh;
+
+ RF_FREELIST_GET(rf_dagh_freelist,dh,next,(RF_DagHeader_t *));
+ if (dh) {
+ bzero((char *)dh, sizeof(RF_DagHeader_t));
+ }
+ return(dh);
+}
+
+void rf_FreeDAGHeader(RF_DagHeader_t *dh)
+{
+ RF_FREELIST_FREE(rf_dagh_freelist,dh,next);
+}
+
+/* allocates a buffer big enough to hold the data described by pda */
+void *rf_AllocBuffer(
+ RF_Raid_t *raidPtr,
+ RF_DagHeader_t *dag_h,
+ RF_PhysDiskAddr_t *pda,
+ RF_AllocListElem_t *allocList)
+{
+ char *p;
+
+ RF_MallocAndAdd(p, pda->numSector << raidPtr->logBytesPerSector,
+ (char *), allocList);
+ return((void *)p);
+}
+
+/******************************************************************************
+ *
+ * debug routines
+ *
+ *****************************************************************************/
+
+char *rf_NodeStatusString(RF_DagNode_t *node)
+{
+ switch (node->status) {
+ case rf_wait: return("wait");
+ case rf_fired: return("fired");
+ case rf_good: return("good");
+ case rf_bad: return("bad");
+ default: return("?");
+ }
+}
+
+void rf_PrintNodeInfoString(RF_DagNode_t *node)
+{
+ RF_PhysDiskAddr_t *pda;
+ int (*df)(RF_DagNode_t *) = node->doFunc;
+ int i, lk, unlk;
+ void *bufPtr;
+
+ if ((df==rf_DiskReadFunc) || (df==rf_DiskWriteFunc)
+ || (df==rf_DiskReadMirrorIdleFunc)
+ || (df == rf_DiskReadMirrorPartitionFunc))
+ {
+ pda = (RF_PhysDiskAddr_t *)node->params[0].p;
+ bufPtr = (void *)node->params[1].p;
+ lk = RF_EXTRACT_LOCK_FLAG(node->params[3].v);
+ unlk = RF_EXTRACT_UNLOCK_FLAG(node->params[3].v);
+ RF_ASSERT( !(lk && unlk) );
+ printf("r %d c %d offs %ld nsect %d buf 0x%lx %s\n", pda->row, pda->col,
+ (long)pda->startSector, (int) pda->numSector, (long)bufPtr,
+ (lk) ? "LOCK" : ((unlk) ? "UNLK" : " "));
+ return;
+ }
+
+ if (df == rf_DiskUnlockFunc) {
+ pda = (RF_PhysDiskAddr_t *)node->params[0].p;
+ lk = RF_EXTRACT_LOCK_FLAG(node->params[3].v);
+ unlk = RF_EXTRACT_UNLOCK_FLAG(node->params[3].v);
+ RF_ASSERT( !(lk && unlk) );
+ printf("r %d c %d %s\n", pda->row, pda->col,
+ (lk) ? "LOCK" : ((unlk) ? "UNLK" : "nop"));
+ return;
+ }
+
+ if ((df==rf_SimpleXorFunc) || (df==rf_RegularXorFunc)
+ || (df==rf_RecoveryXorFunc))
+ {
+ printf("result buf 0x%lx\n",(long) node->results[0]);
+ for (i=0; i<node->numParams-1; i+=2) {
+ pda = (RF_PhysDiskAddr_t *)node->params[i].p;
+ bufPtr = (RF_PhysDiskAddr_t *)node->params[i+1].p;
+ printf(" buf 0x%lx r%d c%d offs %ld nsect %d\n",
+ (long)bufPtr, pda->row, pda->col,
+ (long)pda->startSector, (int)pda->numSector);
+ }
+ return;
+ }
+
+#if RF_INCLUDE_PARITYLOGGING > 0
+ if (df==rf_ParityLogOverwriteFunc || df==rf_ParityLogUpdateFunc) {
+ for (i=0; i<node->numParams-1; i+=2) {
+ pda = (RF_PhysDiskAddr_t *)node->params[i].p;
+ bufPtr = (RF_PhysDiskAddr_t *)node->params[i+1].p;
+ printf(" r%d c%d offs %ld nsect %d buf 0x%lx\n",
+ pda->row, pda->col, (long) pda->startSector,
+ (int) pda->numSector, (long) bufPtr);
+ }
+ return;
+ }
+#endif /* RF_INCLUDE_PARITYLOGGING > 0 */
+
+ if ((df==rf_TerminateFunc) || (df==rf_NullNodeFunc)) {
+ printf("\n");
+ return;
+ }
+
+ printf("?\n");
+}
+
+static void rf_RecurPrintDAG(node, depth, unvisited)
+ RF_DagNode_t *node;
+ int depth;
+ int unvisited;
+{
+ char *anttype;
+ int i;
+
+ node->visited = (unvisited) ? 0 : 1;
+ printf("(%d) %d C%d %s: %s,s%d %d/%d,a%d/%d,p%d,r%d S{", depth,
+ node->nodeNum, node->commitNode, node->name, rf_NodeStatusString(node),
+ node->numSuccedents, node->numSuccFired, node->numSuccDone,
+ node->numAntecedents, node->numAntDone, node->numParams,node->numResults);
+ for (i=0; i<node->numSuccedents; i++) {
+ printf("%d%s", node->succedents[i]->nodeNum,
+ ((i==node->numSuccedents-1) ? "\0" : " "));
+ }
+ printf("} A{");
+ for (i=0; i<node->numAntecedents; i++) {
+ switch (node->antType[i]) {
+ case rf_trueData :
+ anttype = "T";
+ break;
+ case rf_antiData :
+ anttype = "A";
+ break;
+ case rf_outputData :
+ anttype = "O";
+ break;
+ case rf_control :
+ anttype = "C";
+ break;
+ default :
+ anttype = "?";
+ break;
+ }
+ printf("%d(%s)%s", node->antecedents[i]->nodeNum, anttype, (i==node->numAntecedents-1) ? "\0" : " ");
+ }
+ printf("}; ");
+ rf_PrintNodeInfoString(node);
+ for (i=0; i<node->numSuccedents; i++) {
+ if (node->succedents[i]->visited == unvisited)
+ rf_RecurPrintDAG(node->succedents[i], depth+1, unvisited);
+ }
+}
+
+static void rf_PrintDAG(dag_h)
+ RF_DagHeader_t *dag_h;
+{
+ int unvisited, i;
+ char *status;
+
+ /* set dag status */
+ switch (dag_h->status) {
+ case rf_enable :
+ status = "enable";
+ break;
+ case rf_rollForward :
+ status = "rollForward";
+ break;
+ case rf_rollBackward :
+ status = "rollBackward";
+ break;
+ default :
+ status = "illegal!";
+ break;
+ }
+ /* find out if visited bits are currently set or clear */
+ unvisited = dag_h->succedents[0]->visited;
+
+ printf("DAG type: %s\n", dag_h->creator);
+ printf("format is (depth) num commit type: status,nSucc nSuccFired/nSuccDone,nAnte/nAnteDone,nParam,nResult S{x} A{x(type)}; info\n");
+ printf("(0) %d Hdr: %s, s%d, (commit %d/%d) S{", dag_h->nodeNum,
+ status, dag_h->numSuccedents, dag_h->numCommitNodes, dag_h->numCommits);
+ for (i=0; i<dag_h->numSuccedents; i++) {
+ printf("%d%s", dag_h->succedents[i]->nodeNum,
+ ((i==dag_h->numSuccedents-1) ? "\0" : " "));
+ }
+ printf("};\n");
+ for (i=0; i<dag_h->numSuccedents; i++) {
+ if (dag_h->succedents[i]->visited == unvisited)
+ rf_RecurPrintDAG(dag_h->succedents[i], 1, unvisited);
+ }
+}
+
+/* assigns node numbers */
+int rf_AssignNodeNums(RF_DagHeader_t *dag_h)
+{
+ int unvisited, i, nnum;
+ RF_DagNode_t *node;
+
+ nnum = 0;
+ unvisited = dag_h->succedents[0]->visited;
+
+ dag_h->nodeNum = nnum++;
+ for (i=0; i<dag_h->numSuccedents; i++) {
+ node = dag_h->succedents[i];
+ if (node->visited == unvisited) {
+ nnum = rf_RecurAssignNodeNums(dag_h->succedents[i], nnum, unvisited);
+ }
+ }
+ return(nnum);
+}
+
+int rf_RecurAssignNodeNums(node, num, unvisited)
+ RF_DagNode_t *node;
+ int num;
+ int unvisited;
+{
+ int i;
+
+ node->visited = (unvisited) ? 0 : 1;
+
+ node->nodeNum = num++;
+ for (i=0; i<node->numSuccedents; i++) {
+ if (node->succedents[i]->visited == unvisited) {
+ num = rf_RecurAssignNodeNums(node->succedents[i], num, unvisited);
+ }
+ }
+ return(num);
+}
+
+/* set the header pointers in each node to "newptr" */
+void rf_ResetDAGHeaderPointers(dag_h, newptr)
+ RF_DagHeader_t *dag_h;
+ RF_DagHeader_t *newptr;
+{
+ int i;
+ for (i=0; i<dag_h->numSuccedents; i++)
+ if (dag_h->succedents[i]->dagHdr != newptr)
+ rf_RecurResetDAGHeaderPointers(dag_h->succedents[i], newptr);
+}
+
+void rf_RecurResetDAGHeaderPointers(node, newptr)
+ RF_DagNode_t *node;
+ RF_DagHeader_t *newptr;
+{
+ int i;
+ node->dagHdr = newptr;
+ for (i=0; i<node->numSuccedents; i++)
+ if (node->succedents[i]->dagHdr != newptr)
+ rf_RecurResetDAGHeaderPointers(node->succedents[i], newptr);
+}
+
+
+void rf_PrintDAGList(RF_DagHeader_t *dag_h)
+{
+ int i=0;
+
+ for (; dag_h; dag_h=dag_h->next) {
+ rf_AssignNodeNums(dag_h);
+ printf("\n\nDAG %d IN LIST:\n",i++);
+ rf_PrintDAG(dag_h);
+ }
+}
+
+static int rf_ValidateBranch(node, scount, acount, nodes, unvisited)
+ RF_DagNode_t *node;
+ int *scount;
+ int *acount;
+ RF_DagNode_t **nodes;
+ int unvisited;
+{
+ int i, retcode = 0;
+
+ /* construct an array of node pointers indexed by node num */
+ node->visited = (unvisited) ? 0 : 1;
+ nodes[ node->nodeNum ] = node;
+
+ if (node->next != NULL) {
+ printf("INVALID DAG: next pointer in node is not NULL\n");
+ retcode = 1;
+ }
+ if (node->status != rf_wait) {
+ printf("INVALID DAG: Node status is not wait\n");
+ retcode = 1;
+ }
+ if (node->numAntDone != 0) {
+ printf("INVALID DAG: numAntDone is not zero\n");
+ retcode = 1;
+ }
+ if (node->doFunc == rf_TerminateFunc) {
+ if (node->numSuccedents != 0) {
+ printf("INVALID DAG: Terminator node has succedents\n");
+ retcode = 1;
+ }
+ } else {
+ if (node->numSuccedents == 0) {
+ printf("INVALID DAG: Non-terminator node has no succedents\n");
+ retcode = 1;
+ }
+ }
+	for (i=0; i<node->numSuccedents; i++) {
+		if (!node->succedents[i]) {
+			printf("INVALID DAG: succedent %d of node %s is NULL\n",i,node->name);
+			retcode = 1;
+			continue; /* don't dereference the NULL succedent below */
+		}
+		scount[ node->succedents[i]->nodeNum ]++;
+	}
+	for (i=0; i<node->numAntecedents; i++) {
+		if (!node->antecedents[i]) {
+			printf("INVALID DAG: antecedent %d of node %s is NULL\n",i,node->name);
+			retcode = 1;
+			continue; /* don't dereference the NULL antecedent below */
+		}
+		acount[ node->antecedents[i]->nodeNum ]++;
+	}
+ for (i=0; i<node->numSuccedents; i++) {
+ if (node->succedents[i]->visited == unvisited) {
+ if (rf_ValidateBranch(node->succedents[i], scount,
+ acount, nodes, unvisited))
+ {
+ retcode = 1;
+ }
+ }
+ }
+ return(retcode);
+}
+
+static void rf_ValidateBranchVisitedBits(node, unvisited, rl)
+ RF_DagNode_t *node;
+ int unvisited;
+ int rl;
+{
+ int i;
+
+ RF_ASSERT(node->visited == unvisited);
+ for (i=0; i<node->numSuccedents; i++) {
+ if (node->succedents[i] == NULL) {
+ printf("node=%lx node->succedents[%d] is NULL\n", (long)node, i);
+ RF_ASSERT(0);
+ }
+ rf_ValidateBranchVisitedBits(node->succedents[i],unvisited, rl+1);
+ }
+}
+
+/* NOTE: never call this on a big dag, because it is exponential
+ * in execution time
+ */
+static void rf_ValidateVisitedBits(dag)
+ RF_DagHeader_t *dag;
+{
+ int i, unvisited;
+
+ unvisited = dag->succedents[0]->visited;
+
+ for (i=0; i<dag->numSuccedents; i++) {
+ if (dag->succedents[i] == NULL) {
+ printf("dag=%lx dag->succedents[%d] is NULL\n", (long) dag, i);
+ RF_ASSERT(0);
+ }
+ rf_ValidateBranchVisitedBits(dag->succedents[i],unvisited,0);
+ }
+}
+
+/* validate a DAG. _at entry_ verify that:
+ * -- numNodesCompleted is zero
+ * -- node queue is null
+ * -- dag status is rf_enable
+ * -- next pointer is null on every node
+ * -- all nodes have status wait
+ * -- numAntDone is zero in all nodes
+ * -- terminator node has zero successors
+ * -- no other node besides terminator has zero successors
+ * -- no successor or antecedent pointer in a node is NULL
+ * -- number of times that each node appears as a successor of another node
+ * is equal to the antecedent count on that node
+ * -- number of times that each node appears as an antecedent of another node
+ * is equal to the succedent count on that node
+ * -- what else?
+ */
+int rf_ValidateDAG(dag_h)
+ RF_DagHeader_t *dag_h;
+{
+ int i, nodecount;
+ int *scount, *acount; /* per-node successor and antecedent counts */
+ RF_DagNode_t **nodes; /* array of ptrs to nodes in dag */
+ int retcode = 0;
+ int unvisited;
+ int commitNodeCount = 0;
+
+ if (rf_validateVisitedDebug)
+ rf_ValidateVisitedBits(dag_h);
+
+ if (dag_h->numNodesCompleted != 0) {
+ printf("INVALID DAG: num nodes completed is %d, should be 0\n",dag_h->numNodesCompleted);
+ retcode = 1; goto validate_dag_bad;
+ }
+ if (dag_h->status != rf_enable) {
+ printf("INVALID DAG: not enabled\n");
+ retcode = 1; goto validate_dag_bad;
+ }
+ if (dag_h->numCommits != 0) {
+ printf("INVALID DAG: numCommits != 0 (%d)\n",dag_h->numCommits);
+ retcode = 1; goto validate_dag_bad;
+ }
+ if (dag_h->numSuccedents != 1) {
+ /* currently, all dags must have only one succedent */
+		printf("INVALID DAG: numSuccedents != 1 (%d)\n",dag_h->numSuccedents);
+ retcode = 1; goto validate_dag_bad;
+ }
+ nodecount = rf_AssignNodeNums(dag_h);
+
+ unvisited = dag_h->succedents[0]->visited;
+
+ RF_Calloc(scount, nodecount, sizeof(int), (int *));
+ RF_Calloc(acount, nodecount, sizeof(int), (int *));
+ RF_Calloc(nodes, nodecount, sizeof(RF_DagNode_t *), (RF_DagNode_t **));
+ for (i=0; i<dag_h->numSuccedents; i++) {
+ if ((dag_h->succedents[i]->visited == unvisited)
+ && rf_ValidateBranch(dag_h->succedents[i], scount,
+ acount, nodes, unvisited))
+ {
+ retcode = 1;
+ }
+ }
+ /* start at 1 to skip the header node */
+ for (i=1; i<nodecount; i++) {
+ if ( nodes[i]->commitNode )
+ commitNodeCount++;
+ if ( nodes[i]->doFunc == NULL ) {
+ printf("INVALID DAG: node %s has an undefined doFunc\n", nodes[i]->name);
+ retcode = 1;
+ goto validate_dag_out;
+ }
+ if ( nodes[i]->undoFunc == NULL ) {
+			printf("INVALID DAG: node %s has an undefined undoFunc\n", nodes[i]->name);
+ retcode = 1;
+ goto validate_dag_out;
+ }
+ if ( nodes[i]->numAntecedents != scount[ nodes[i]->nodeNum ] ) {
+ printf("INVALID DAG: node %s has %d antecedents but appears as a succedent %d times\n",
+ nodes[i]->name, nodes[i]->numAntecedents, scount[nodes[i]->nodeNum]);
+ retcode = 1;
+ goto validate_dag_out;
+ }
+ if ( nodes[i]->numSuccedents != acount[ nodes[i]->nodeNum ] ) {
+ printf("INVALID DAG: node %s has %d succedents but appears as an antecedent %d times\n",
+ nodes[i]->name, nodes[i]->numSuccedents, acount[nodes[i]->nodeNum]);
+ retcode = 1;
+ goto validate_dag_out;
+ }
+ }
+
+ if ( dag_h->numCommitNodes != commitNodeCount ) {
+		printf("INVALID DAG: incorrect commit node count. hdr->numCommitNodes (%d) but found (%d) commit nodes in graph\n",
+ dag_h->numCommitNodes, commitNodeCount);
+ retcode = 1;
+ goto validate_dag_out;
+ }
+
+validate_dag_out:
+ RF_Free(scount, nodecount*sizeof(int));
+ RF_Free(acount, nodecount*sizeof(int));
+ RF_Free(nodes, nodecount*sizeof(RF_DagNode_t *));
+ if (retcode)
+ rf_PrintDAGList(dag_h);
+
+ if (rf_validateVisitedDebug)
+ rf_ValidateVisitedBits(dag_h);
+
+ return(retcode);
+
+validate_dag_bad:
+ rf_PrintDAGList(dag_h);
+ return(retcode);
+}
+
+
+/******************************************************************************
+ *
+ * misc construction routines
+ *
+ *****************************************************************************/
+
+void rf_redirect_asm(
+ RF_Raid_t *raidPtr,
+ RF_AccessStripeMap_t *asmap)
+{
+ int ds = (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) ? 1 : 0;
+ int row = asmap->physInfo->row;
+ int fcol = raidPtr->reconControl[row]->fcol;
+ int srow = raidPtr->reconControl[row]->spareRow;
+ int scol = raidPtr->reconControl[row]->spareCol;
+ RF_PhysDiskAddr_t *pda;
+
+ RF_ASSERT( raidPtr->status[row] == rf_rs_reconstructing );
+ for (pda = asmap->physInfo; pda; pda=pda->next) {
+ if (pda->col == fcol) {
+ if (rf_dagDebug) {
+ if (!rf_CheckRUReconstructed(raidPtr->reconControl[row]->reconMap,
+ pda->startSector))
+ {
+ RF_PANIC();
+ }
+ }
+ /*printf("Remapped data for large write\n");*/
+ if (ds) {
+ raidPtr->Layout.map->MapSector(raidPtr, pda->raidAddress,
+ &pda->row, &pda->col, &pda->startSector, RF_REMAP);
+ }
+ else {
+ pda->row = srow; pda->col = scol;
+ }
+ }
+ }
+ for (pda = asmap->parityInfo; pda; pda=pda->next) {
+ if (pda->col == fcol) {
+ if (rf_dagDebug) {
+ if (!rf_CheckRUReconstructed(raidPtr->reconControl[row]->reconMap, pda->startSector)) {
+ RF_PANIC();
+ }
+ }
+ }
+ if (ds) {
+ (raidPtr->Layout.map->MapParity)(raidPtr, pda->raidAddress, &pda->row, &pda->col, &pda->startSector, RF_REMAP);
+ }
+ else {
+ pda->row = srow; pda->col = scol;
+ }
+ }
+}
+
+
+/* this routine allocates read buffers and generates stripe maps for the
+ * regions of the array from the start of the stripe to the start of the
+ * access, and from the end of the access to the end of the stripe. It also
+ * computes and returns the number of DAG nodes needed to read all this data.
+ * Note that this routine does the wrong thing if the access is fully
+ * contained within one stripe unit, so we RF_ASSERT against this case at the
+ * start.
+ */
+void rf_MapUnaccessedPortionOfStripe(
+ RF_Raid_t *raidPtr,
+ RF_RaidLayout_t *layoutPtr, /* in: layout information */
+ RF_AccessStripeMap_t *asmap, /* in: access stripe map */
+ RF_DagHeader_t *dag_h, /* in: header of the dag to create */
+ RF_AccessStripeMapHeader_t **new_asm_h, /* in: ptr to array of 2 headers, to be filled in */
+ int *nRodNodes, /* out: num nodes to be generated to read unaccessed data */
+ char **sosBuffer, /* out: pointers to newly allocated buffer */
+ char **eosBuffer,
+ RF_AllocListElem_t *allocList)
+{
+ RF_RaidAddr_t sosRaidAddress, eosRaidAddress;
+ RF_SectorNum_t sosNumSector, eosNumSector;
+
+ RF_ASSERT( asmap->numStripeUnitsAccessed > (layoutPtr->numDataCol/2) );
+ /* generate an access map for the region of the array from start of stripe
+ * to start of access */
+ new_asm_h[0] = new_asm_h[1] = NULL; *nRodNodes = 0;
+ if (!rf_RaidAddressStripeAligned(layoutPtr, asmap->raidAddress)) {
+ sosRaidAddress = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
+ sosNumSector = asmap->raidAddress - sosRaidAddress;
+ RF_MallocAndAdd(*sosBuffer, rf_RaidAddressToByte(raidPtr, sosNumSector), (char *), allocList);
+ new_asm_h[0] = rf_MapAccess(raidPtr, sosRaidAddress, sosNumSector, *sosBuffer, RF_DONT_REMAP);
+ new_asm_h[0]->next = dag_h->asmList;
+ dag_h->asmList = new_asm_h[0];
+ *nRodNodes += new_asm_h[0]->stripeMap->numStripeUnitsAccessed;
+
+ RF_ASSERT(new_asm_h[0]->stripeMap->next == NULL);
+ /* we're totally within one stripe here */
+ if (asmap->flags & RF_ASM_REDIR_LARGE_WRITE)
+ rf_redirect_asm(raidPtr, new_asm_h[0]->stripeMap);
+ }
+ /* generate an access map for the region of the array from end of access
+ * to end of stripe */
+ if (!rf_RaidAddressStripeAligned(layoutPtr, asmap->endRaidAddress)) {
+ eosRaidAddress = asmap->endRaidAddress;
+ eosNumSector = rf_RaidAddressOfNextStripeBoundary(layoutPtr, eosRaidAddress) - eosRaidAddress;
+ RF_MallocAndAdd(*eosBuffer, rf_RaidAddressToByte(raidPtr, eosNumSector), (char *), allocList);
+ new_asm_h[1] = rf_MapAccess(raidPtr, eosRaidAddress, eosNumSector, *eosBuffer, RF_DONT_REMAP);
+ new_asm_h[1]->next = dag_h->asmList;
+ dag_h->asmList = new_asm_h[1];
+ *nRodNodes += new_asm_h[1]->stripeMap->numStripeUnitsAccessed;
+
+ RF_ASSERT(new_asm_h[1]->stripeMap->next == NULL);
+ /* we're totally within one stripe here */
+ if (asmap->flags & RF_ASM_REDIR_LARGE_WRITE)
+ rf_redirect_asm(raidPtr, new_asm_h[1]->stripeMap);
+ }
+}
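+
+/*
+ * Example (hypothetical geometry): with 4 data columns of 16 sectors each,
+ * the stripe containing RAID address 72 spans addresses 64..127.  An access
+ * covering 72..119 is stripe-aligned at neither end, so the routine above
+ * builds an SOS map for sectors 64..71 and an EOS map for sectors 120..127,
+ * each with a freshly allocated read buffer, and adds their stripe-unit
+ * counts to *nRodNodes.
+ */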
+
+
+
+/* returns non-zero if the indicated ranges of stripe unit offsets overlap */
+int rf_PDAOverlap(
+ RF_RaidLayout_t *layoutPtr,
+ RF_PhysDiskAddr_t *src,
+ RF_PhysDiskAddr_t *dest)
+{
+ RF_SectorNum_t soffs = rf_StripeUnitOffset(layoutPtr, src->startSector);
+ RF_SectorNum_t doffs = rf_StripeUnitOffset(layoutPtr, dest->startSector);
+ /* use -1 to be sure we stay within SU */
+ RF_SectorNum_t send = rf_StripeUnitOffset(layoutPtr, src->startSector + src->numSector-1);
+ RF_SectorNum_t dend = rf_StripeUnitOffset(layoutPtr, dest->startSector + dest->numSector-1);
+ return( (RF_MAX(soffs,doffs) <= RF_MIN(send,dend)) ? 1 : 0 );
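+
+/*
+ * Example (hypothetical offsets): within a 64-sector stripe unit, a src PDA
+ * covering SU offsets 10..29 and a dest PDA covering offsets 20..49 give
+ * RF_MAX(10,20) = 20 <= RF_MIN(29,49) = 29, so the routine returns 1.  If
+ * dest instead covered offsets 30..49, RF_MAX(10,30) = 30 > 29 and it would
+ * return 0.
+ */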
+}
+
+
+/* GenerateFailedAccessASMs
+ *
+ * this routine figures out what portion of the stripe needs to be read
+ * to effect the degraded read or write operation. Its primary function
+ * is to identify everything required to recover the data, and then
+ * eliminate anything that is already being accessed by the user.
+ *
+ * The main result is two new ASMs, one for the region from the start of the
+ * stripe to the start of the access, and one for the region from the end of
+ * the access to the end of the stripe. These ASMs describe everything that
+ * needs to be read to effect the degraded access. Other results are:
+ * nXorBufs -- the total number of buffers that need to be XORed together to
+ * recover the lost data,
+ * rpBufPtr -- ptr to a newly-allocated buffer to hold the parity. If NULL
+ * at entry, not allocated.
+ * overlappingPDAs --
+ * describes which of the non-failed PDAs in the user access
+ * overlap data that needs to be read to effect recovery.
+ * overlappingPDAs[i]==1 if and only if, neglecting the failed
+ * PDA, the ith pda in the input asm overlaps data that needs
+ * to be read for recovery.
+ */
+ /* in: asm - ASM for the actual access, one stripe only */
+ /* in: failedPDA - which component of the access has failed */
+ /* in: dag_h - header of the DAG we're going to create */
+ /* out: new_asm_h - the two new ASMs */
+ /* out: nXorBufs - the total number of xor bufs required */
+ /* out: rpBufPtr - a buffer for the parity read */
+void rf_GenerateFailedAccessASMs(
+ RF_Raid_t *raidPtr,
+ RF_AccessStripeMap_t *asmap,
+ RF_PhysDiskAddr_t *failedPDA,
+ RF_DagHeader_t *dag_h,
+ RF_AccessStripeMapHeader_t **new_asm_h,
+ int *nXorBufs,
+ char **rpBufPtr,
+ char *overlappingPDAs,
+ RF_AllocListElem_t *allocList)
+{
+ RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
+
+ /* s=start, e=end, s=stripe, a=access, f=failed, su=stripe unit */
+ RF_RaidAddr_t sosAddr, sosEndAddr, eosStartAddr, eosAddr;
+
+ RF_SectorCount_t numSect[2], numParitySect;
+ RF_PhysDiskAddr_t *pda;
+ char *rdBuf, *bufP;
+ int foundit, i;
+
+ bufP = NULL;
+ foundit = 0;
+ /* first compute the following raid addresses:
+ start of stripe, (sosAddr)
+ MIN(start of access, start of failed SU), (sosEndAddr)
+ MAX(end of access, end of failed SU), (eosStartAddr)
+ end of stripe (i.e. start of next stripe) (eosAddr)
+ */
+ sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
+ sosEndAddr = RF_MIN(asmap->raidAddress, rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr,failedPDA->raidAddress));
+ eosStartAddr = RF_MAX(asmap->endRaidAddress, rf_RaidAddressOfNextStripeUnitBoundary(layoutPtr, failedPDA->raidAddress));
+ eosAddr = rf_RaidAddressOfNextStripeBoundary(layoutPtr, asmap->raidAddress);
+
+ /* now generate access stripe maps for each of the above regions of the
+ * stripe. Use a dummy (NULL) buf ptr for now */
+
+ new_asm_h[0] = (sosAddr != sosEndAddr) ? rf_MapAccess(raidPtr, sosAddr, sosEndAddr-sosAddr, NULL, RF_DONT_REMAP) : NULL;
+ new_asm_h[1] = (eosStartAddr != eosAddr) ? rf_MapAccess(raidPtr, eosStartAddr, eosAddr-eosStartAddr, NULL, RF_DONT_REMAP) : NULL;
+
+ /* walk through the PDAs and range-restrict each SU to the region of the
+ * SU touched on the failed PDA. also compute total data buffer space
+ * requirements in this step. Ignore the parity for now. */
+
+ numSect[0] = numSect[1] = 0;
+ if (new_asm_h[0]) {
+ new_asm_h[0]->next = dag_h->asmList; dag_h->asmList = new_asm_h[0];
+ for (pda = new_asm_h[0]->stripeMap->physInfo; pda; pda = pda->next) {
+ rf_RangeRestrictPDA(raidPtr,failedPDA, pda, RF_RESTRICT_NOBUFFER, 0); numSect[0] += pda->numSector;
+ }
+ }
+ if (new_asm_h[1]) {
+ new_asm_h[1]->next = dag_h->asmList; dag_h->asmList = new_asm_h[1];
+ for (pda = new_asm_h[1]->stripeMap->physInfo; pda; pda = pda->next) {
+ rf_RangeRestrictPDA(raidPtr,failedPDA, pda, RF_RESTRICT_NOBUFFER, 0); numSect[1] += pda->numSector;
+ }
+ }
+ numParitySect = failedPDA->numSector;
+
+ /* allocate buffer space for the data & parity we have to read to recover
+ * from the failure */
+
+ if (numSect[0]+numSect[1]+ ((rpBufPtr) ? numParitySect : 0)) { /* don't allocate parity buf if not needed */
+ RF_MallocAndAdd(rdBuf, rf_RaidAddressToByte(raidPtr,numSect[0]+numSect[1]+numParitySect), (char *), allocList);
+ bufP = rdBuf;
+ if (rf_degDagDebug) printf("Newly allocated buffer (%d bytes) is 0x%lx\n",
+ (int)rf_RaidAddressToByte(raidPtr,numSect[0]+numSect[1]+numParitySect), (unsigned long) bufP);
+ }
+
+ /* now walk through the pdas one last time and assign buffer pointers
+ * (ugh!). Again, ignore the parity. also, count nodes to find out how
+ * many bufs need to be xored together */
+ (*nXorBufs) = 1; /* in read case, 1 is for parity. In write case, 1 is for failed data */
+ if (new_asm_h[0]) {
+ for (pda=new_asm_h[0]->stripeMap->physInfo; pda; pda=pda->next) {pda->bufPtr = bufP; bufP += rf_RaidAddressToByte(raidPtr,pda->numSector);}
+ *nXorBufs += new_asm_h[0]->stripeMap->numStripeUnitsAccessed;
+ }
+ if (new_asm_h[1]) {
+ for (pda=new_asm_h[1]->stripeMap->physInfo; pda; pda=pda->next) {pda->bufPtr = bufP; bufP += rf_RaidAddressToByte(raidPtr,pda->numSector);}
+ (*nXorBufs) += new_asm_h[1]->stripeMap->numStripeUnitsAccessed;
+ }
+ if (rpBufPtr) *rpBufPtr = bufP; /* the rest of the buffer is for parity */
+
+ /* the last step is to figure out how many more distinct buffers need to
+ * get xor'd to produce the missing unit. there's one for each user-data
+ * read node that overlaps the portion of the failed unit being accessed */
+
+ for (foundit=i=0,pda=asmap->physInfo; pda; i++,pda=pda->next) {
+ if (pda == failedPDA) {i--; foundit=1; continue;}
+ if (rf_PDAOverlap(layoutPtr, pda, failedPDA)) {
+ overlappingPDAs[i] = 1;
+ (*nXorBufs)++;
+ }
+ }
+ if (!foundit) {RF_ERRORMSG("GenerateFailedAccessASMs: did not find failedPDA in asm list\n"); RF_ASSERT(0);}
+
+ if (rf_degDagDebug) {
+ if (new_asm_h[0]) {
+ printf("First asm:\n"); rf_PrintFullAccessStripeMap(new_asm_h[0], 1);
+ }
+ if (new_asm_h[1]) {
+ printf("Second asm:\n"); rf_PrintFullAccessStripeMap(new_asm_h[1], 1);
+ }
+ }
+}
+
+
+/* adjusts the offset and number of sectors in the destination pda so that
+ * it covers at most the region of the SU covered by the source PDA. This
+ * is exclusively a restriction: the number of sectors indicated by the
+ * target PDA can only shrink.
+ *
+ * For example: s = sectors within SU indicated by source PDA
+ * d = sectors within SU indicated by dest PDA
+ * r = results, stored in dest PDA
+ *
+ * |--------------- one stripe unit ---------------------|
+ * | sssssssssssssssssssssssssssssssss |
+ * | ddddddddddddddddddddddddddddddddddddddddddddd |
+ * | rrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrr |
+ *
+ * Another example:
+ *
+ * |--------------- one stripe unit ---------------------|
+ * | sssssssssssssssssssssssssssssssss |
+ * | ddddddddddddddddddddddd |
+ * | rrrrrrrrrrrrrrrr |
+ *
+ */
+void rf_RangeRestrictPDA(
+ RF_Raid_t *raidPtr,
+ RF_PhysDiskAddr_t *src,
+ RF_PhysDiskAddr_t *dest,
+ int dobuffer,
+ int doraidaddr)
+{
+ RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
+ RF_SectorNum_t soffs = rf_StripeUnitOffset(layoutPtr, src->startSector);
+ RF_SectorNum_t doffs = rf_StripeUnitOffset(layoutPtr, dest->startSector);
+ RF_SectorNum_t send = rf_StripeUnitOffset(layoutPtr, src->startSector + src->numSector-1); /* use -1 to be sure we stay within SU */
+ RF_SectorNum_t dend = rf_StripeUnitOffset(layoutPtr, dest->startSector + dest->numSector-1);
+ RF_SectorNum_t subAddr = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, dest->startSector); /* stripe unit boundary */
+
+ dest->startSector = subAddr + RF_MAX(soffs,doffs);
+ dest->numSector = subAddr + RF_MIN(send,dend) + 1 - dest->startSector;
+
+ if (dobuffer)
+ dest->bufPtr += (soffs > doffs) ? rf_RaidAddressToByte(raidPtr,soffs-doffs) : 0;
+ if (doraidaddr) {
+ dest->raidAddress = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, dest->raidAddress) +
+ rf_StripeUnitOffset(layoutPtr, dest->startSector);
+ }
+}
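+
+/*
+ * Example (hypothetical offsets): for a 64-sector stripe unit, if src covers
+ * SU offsets 16..47 and dest covers the whole unit (offsets 0..63), the
+ * restriction above moves dest->startSector up to offset 16, shrinks
+ * dest->numSector to 32, and (when dobuffer is set) advances dest->bufPtr
+ * by 16 sectors' worth of bytes.
+ */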
+
+/*
+ * Want the highest of these primes to be the largest one
+ * less than the max expected number of columns (won't hurt
+ * to be too small or too large, but won't be optimal, either)
+ * --jimz
+ */
+#define NLOWPRIMES 8
+static int lowprimes[NLOWPRIMES] = {2,3,5,7,11,13,17,19};
+
+/*****************************************************************************
+ * compute the workload shift factor. (chained declustering)
+ *
+ * return nonzero if access should shift to secondary, otherwise,
+ * access is to primary
+ *****************************************************************************/
+int rf_compute_workload_shift(
+ RF_Raid_t *raidPtr,
+ RF_PhysDiskAddr_t *pda)
+{
+ /*
+ * variables:
+ * d = column of disk containing primary
+ * f = column of failed disk
+ * n = number of disks in array
+ * sd = "shift distance" (number of columns that d is to the right of f)
+ * row = row of array the access is in
+ * v = numerator of redirection ratio
+ * k = denominator of redirection ratio
+ */
+ RF_RowCol_t d, f, sd, row, n;
+ int k, v, ret, i;
+
+ row = pda->row;
+ n = raidPtr->numCol;
+
+ /* assign column of primary copy to d */
+ d = pda->col;
+
+ /* assign column of dead disk to f */
+	for(f=0;((f<n)&&(!RF_DEAD_DISK(raidPtr->Disks[row][f].status)));f++);
+
+ RF_ASSERT(f < n);
+ RF_ASSERT(f != d);
+
+ sd = (f > d) ? (n + d - f) : (d - f);
+ RF_ASSERT(sd < n);
+
+ /*
+ * v of every k accesses should be redirected
+ *
+ * v/k := (n-1-sd)/(n-1)
+ */
+ v = (n-1-sd);
+ k = (n-1);
+
+#if 1
+ /*
+ * XXX
+ * Is this worth it?
+ *
+ * Now reduce the fraction, by repeatedly factoring
+ * out primes (just like they teach in elementary school!)
+ */
+ for(i=0;i<NLOWPRIMES;i++) {
+ if (lowprimes[i] > v)
+ break;
+ while (((v%lowprimes[i])==0) && ((k%lowprimes[i])==0)) {
+ v /= lowprimes[i];
+ k /= lowprimes[i];
+ }
+ }
+#endif
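+
+	/*
+	 * Worked example (hypothetical array): n = 5 columns, failed disk
+	 * f = 1, primary copy on d = 3.  Then sd = d - f = 2 and v/k =
+	 * (n-1-sd)/(n-1) = 2/4, which reduces to 1/2: every other access to
+	 * this primary is redirected to its secondary copy by the counter
+	 * logic below.
+	 */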
+
+ raidPtr->hist_diskreq[row][d]++;
+ if (raidPtr->hist_diskreq[row][d] > v) {
+ ret = 0; /* do not redirect */
+ }
+ else {
+ ret = 1; /* redirect */
+ }
+
+#if 0
+ printf("d=%d f=%d sd=%d v=%d k=%d ret=%d h=%d\n", d, f, sd, v, k, ret,
+ raidPtr->hist_diskreq[row][d]);
+#endif
+
+ if (raidPtr->hist_diskreq[row][d] >= k) {
+ /* reset counter */
+ raidPtr->hist_diskreq[row][d] = 0;
+ }
+
+ return(ret);
+}
+
+/*
+ * Disk selection routines
+ */
+
+/*
+ * Selects the disk with the shortest queue from a mirror pair.
+ * Both the disk I/Os queued in RAIDframe as well as those at the physical
+ * disk are counted as members of the "queue"
+ */
+void rf_SelectMirrorDiskIdle(RF_DagNode_t *node)
+{
+ RF_Raid_t *raidPtr = (RF_Raid_t *) node->dagHdr->raidPtr;
+ RF_RowCol_t rowData, colData, rowMirror, colMirror;
+ int dataQueueLength, mirrorQueueLength, usemirror;
+ RF_PhysDiskAddr_t *data_pda = (RF_PhysDiskAddr_t *)node->params[0].p;
+ RF_PhysDiskAddr_t *mirror_pda = (RF_PhysDiskAddr_t *)node->params[4].p;
+ RF_PhysDiskAddr_t *tmp_pda;
+ RF_RaidDisk_t **disks = raidPtr->Disks;
+ RF_DiskQueue_t **dqs = raidPtr->Queues, *dataQueue, *mirrorQueue;
+
+ /* return the [row col] of the disk with the shortest queue */
+ rowData = data_pda->row;
+ colData = data_pda->col;
+ rowMirror = mirror_pda->row;
+ colMirror = mirror_pda->col;
+ dataQueue = &(dqs[rowData][colData]);
+ mirrorQueue = &(dqs[rowMirror][colMirror]);
+
+#ifdef RF_LOCK_QUEUES_TO_READ_LEN
+ RF_LOCK_QUEUE_MUTEX(dataQueue, "SelectMirrorDiskIdle");
+#endif /* RF_LOCK_QUEUES_TO_READ_LEN */
+ dataQueueLength = dataQueue->queueLength + dataQueue->numOutstanding;
+#ifdef RF_LOCK_QUEUES_TO_READ_LEN
+ RF_UNLOCK_QUEUE_MUTEX(dataQueue, "SelectMirrorDiskIdle");
+ RF_LOCK_QUEUE_MUTEX(mirrorQueue, "SelectMirrorDiskIdle");
+#endif /* RF_LOCK_QUEUES_TO_READ_LEN */
+ mirrorQueueLength = mirrorQueue->queueLength + mirrorQueue->numOutstanding;
+#ifdef RF_LOCK_QUEUES_TO_READ_LEN
+ RF_UNLOCK_QUEUE_MUTEX(mirrorQueue, "SelectMirrorDiskIdle");
+#endif /* RF_LOCK_QUEUES_TO_READ_LEN */
+
+ usemirror = 0;
+ if (RF_DEAD_DISK(disks[rowMirror][colMirror].status)) {
+ usemirror = 0;
+ }
+ else if (RF_DEAD_DISK(disks[rowData][colData].status)) {
+ usemirror = 1;
+ }
+ else if (dataQueueLength < mirrorQueueLength) {
+ usemirror = 0;
+ }
+ else if (mirrorQueueLength < dataQueueLength) {
+ usemirror = 1;
+ }
+ else {
+ /* queues are equal length. attempt cleverness. */
+ if (SNUM_DIFF(dataQueue->last_deq_sector,data_pda->startSector)
+ <= SNUM_DIFF(mirrorQueue->last_deq_sector,mirror_pda->startSector))
+ {
+ usemirror = 0;
+ }
+ else {
+ usemirror = 1;
+ }
+ }
+
+ if (usemirror) {
+ /* use mirror (parity) disk, swap params 0 & 4 */
+ tmp_pda = data_pda;
+ node->params[0].p = mirror_pda;
+ node->params[4].p = tmp_pda;
+ }
+ else {
+ /* use data disk, leave param 0 unchanged */
+ }
+ /* printf("dataQueueLength %d, mirrorQueueLength %d\n",dataQueueLength, mirrorQueueLength); */
+}
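+
+/*
+ * Example (hypothetical queue depths): if the data arm has 3 queued and 1
+ * outstanding I/Os (length 4) and its mirror has 2 queued and 1 outstanding
+ * (length 3), the mirror is selected.  On equal lengths, the arm whose last
+ * dequeued sector is closer to the new request's start sector wins, with
+ * ties going to the data arm.
+ */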
+
+/*
+ * Do simple partitioning. This assumes that
+ * the data and parity disks are laid out identically.
+ */
+void rf_SelectMirrorDiskPartition(RF_DagNode_t *node)
+{
+ RF_Raid_t *raidPtr = (RF_Raid_t *) node->dagHdr->raidPtr;
+ RF_RowCol_t rowData, colData, rowMirror, colMirror;
+ RF_PhysDiskAddr_t *data_pda = (RF_PhysDiskAddr_t *)node->params[0].p;
+ RF_PhysDiskAddr_t *mirror_pda = (RF_PhysDiskAddr_t *)node->params[4].p;
+ RF_PhysDiskAddr_t *tmp_pda;
+ RF_RaidDisk_t **disks = raidPtr->Disks;
+ RF_DiskQueue_t **dqs = raidPtr->Queues, *dataQueue, *mirrorQueue;
+ int usemirror;
+
+ /* return the [row col] of the disk with the shortest queue */
+ rowData = data_pda->row;
+ colData = data_pda->col;
+ rowMirror = mirror_pda->row;
+ colMirror = mirror_pda->col;
+ dataQueue = &(dqs[rowData][colData]);
+ mirrorQueue = &(dqs[rowMirror][colMirror]);
+
+ usemirror = 0;
+ if (RF_DEAD_DISK(disks[rowMirror][colMirror].status)) {
+ usemirror = 0;
+ }
+ else if (RF_DEAD_DISK(disks[rowData][colData].status)) {
+ usemirror = 1;
+ }
+ else if (data_pda->startSector < (disks[rowData][colData].numBlocks / 2)) {
+ usemirror = 0;
+ }
+ else {
+ usemirror = 1;
+ }
+
+ if (usemirror) {
+ /* use mirror (parity) disk, swap params 0 & 4 */
+ tmp_pda = data_pda;
+ node->params[0].p = mirror_pda;
+ node->params[4].p = tmp_pda;
+ }
+ else {
+ /* use data disk, leave param 0 unchanged */
+ }
+}
diff --git a/sys/dev/raidframe/rf_dagutils.h b/sys/dev/raidframe/rf_dagutils.h
new file mode 100644
index 00000000000..cb732879230
--- /dev/null
+++ b/sys/dev/raidframe/rf_dagutils.h
@@ -0,0 +1,192 @@
+/* $OpenBSD: rf_dagutils.h,v 1.1 1999/01/11 14:29:12 niklas Exp $ */
+/* $NetBSD: rf_dagutils.h,v 1.1 1998/11/13 04:20:28 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland, William V. Courtright II
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*************************************************************************
+ *
+ * rf_dagutils.h -- header file for utility routines for manipulating DAGs
+ *
+ *************************************************************************/
+
+/*
+ * :
+ * Log: rf_dagutils.h,v
+ * Revision 1.19 1996/07/22 19:52:16 jimz
+ * switched node params to RF_DagParam_t, a union of
+ * a 64-bit int and a void *, for better portability
+ * attempted hpux port, but failed partway through for
+ * lack of a single C compiler capable of compiling all
+ * source files
+ *
+ * Revision 1.18 1996/07/15 17:22:18 jimz
+ * nit-pick code cleanup
+ * resolve stdlib problems on DEC OSF
+ *
+ * Revision 1.17 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.16 1996/06/06 17:27:46 jimz
+ * added another select mirror func (partitioning), changed names so dag
+ * creation routines can use the appropriate one
+ *
+ * fixed old idle mirror func to pick closest arm if queue lengths are equal
+ *
+ * Revision 1.15 1996/06/03 23:28:26 jimz
+ * more bugfixes
+ * check in tree to sync for IPDS runs with current bugfixes
+ * there still may be a problem with threads in the script test
+ * getting I/Os stuck- not trivially reproducible (runs ~50 times
+ * in a row without getting stuck)
+ *
+ * Revision 1.14 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.13 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.12 1996/05/24 04:28:55 jimz
+ * release cleanup ckpt
+ *
+ * Revision 1.11 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.10 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.9 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.8 1996/05/08 21:01:24 jimz
+ * fixed up enum type names that were conflicting with other
+ * enums and function names (ie, "panic")
+ * future naming trends will be towards RF_ and rf_ for
+ * everything raidframe-related
+ *
+ * Revision 1.7 1996/05/03 19:55:27 wvcii
+ * added misc routines from old dag creation files
+ *
+ * Revision 1.6 1995/12/01 15:57:28 root
+ * added copyright info
+ *
+ * Revision 1.5 1995/11/07 16:21:36 wvcii
+ * modified InitNode and InitNodeFromBuf prototypes
+ *
+ */
+
+#include "rf_types.h"
+#include "rf_dagfuncs.h"
+#include "rf_general.h"
+
+#ifndef _RF__RF_DAGUTILS_H_
+#define _RF__RF_DAGUTILS_H_
+
+struct RF_RedFuncs_s {
+ int (*regular)(RF_DagNode_t *);
+ char *RegularName;
+ int (*simple)(RF_DagNode_t *);
+ char *SimpleName;
+};
+
+extern RF_RedFuncs_t rf_xorFuncs;
+extern RF_RedFuncs_t rf_xorRecoveryFuncs;
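+
+/*
+ * Example (illustrative): DAG creation code picks one of the pairs above
+ * and wires the ->regular or ->simple member in as an XOR node's doFunc:
+ *
+ *	int (*xorfunc)(RF_DagNode_t *) = rf_xorFuncs.regular;	-- rf_RegularXorFunc
+ */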
+
+void rf_InitNode(RF_DagNode_t *node, RF_NodeStatus_t initstatus,
+ int commit,
+ int (*doFunc)(RF_DagNode_t *node),
+ int (*undoFunc)(RF_DagNode_t *node),
+ int (*wakeFunc)(RF_DagNode_t *node, int status),
+ int nSucc, int nAnte, int nParam, int nResult,
+ RF_DagHeader_t *hdr, char *name, RF_AllocListElem_t *alist);
+
+void rf_FreeDAG(RF_DagHeader_t *dag_h);
+
+RF_PropHeader_t *rf_MakePropListEntry(RF_DagHeader_t *dag_h, int resultNum,
+ int paramNum, RF_PropHeader_t *next, RF_AllocListElem_t *allocList);
+
+int rf_ConfigureDAGs(RF_ShutdownList_t **listp);
+
+RF_DagHeader_t *rf_AllocDAGHeader(void);
+
+void rf_FreeDAGHeader(RF_DagHeader_t *dh);
+
+void *rf_AllocBuffer(RF_Raid_t *raidPtr, RF_DagHeader_t *dag_h,
+ RF_PhysDiskAddr_t *pda, RF_AllocListElem_t *allocList);
+
+char *rf_NodeStatusString(RF_DagNode_t *node);
+
+void rf_PrintNodeInfoString(RF_DagNode_t *node);
+
+int rf_AssignNodeNums(RF_DagHeader_t *dag_h);
+
+int rf_RecurAssignNodeNums(RF_DagNode_t *node, int num, int unvisited);
+
+void rf_ResetDAGHeaderPointers(RF_DagHeader_t *dag_h, RF_DagHeader_t *newptr);
+
+void rf_RecurResetDAGHeaderPointers(RF_DagNode_t *node, RF_DagHeader_t *newptr);
+
+void rf_PrintDAGList(RF_DagHeader_t *dag_h);
+
+int rf_ValidateDAG(RF_DagHeader_t *dag_h);
+
+void rf_redirect_asm(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap);
+
+void rf_MapUnaccessedPortionOfStripe(RF_Raid_t *raidPtr,
+ RF_RaidLayout_t *layoutPtr,
+ RF_AccessStripeMap_t *asmap, RF_DagHeader_t *dag_h,
+ RF_AccessStripeMapHeader_t **new_asm_h, int *nRodNodes, char **sosBuffer,
+ char **eosBuffer, RF_AllocListElem_t *allocList);
+
+int rf_PDAOverlap(RF_RaidLayout_t *layoutPtr, RF_PhysDiskAddr_t *src,
+ RF_PhysDiskAddr_t *dest);
+
+void rf_GenerateFailedAccessASMs(RF_Raid_t *raidPtr,
+ RF_AccessStripeMap_t *asmap, RF_PhysDiskAddr_t *failedPDA,
+ RF_DagHeader_t *dag_h, RF_AccessStripeMapHeader_t **new_asm_h,
+ int *nXorBufs, char **rpBufPtr, char *overlappingPDAs,
+ RF_AllocListElem_t *allocList);
+
+/* flags used by RangeRestrictPDA */
+#define RF_RESTRICT_NOBUFFER 0
+#define RF_RESTRICT_DOBUFFER 1
+
+void rf_RangeRestrictPDA(RF_Raid_t *raidPtr, RF_PhysDiskAddr_t *src,
+ RF_PhysDiskAddr_t *dest, int dobuffer, int doraidaddr);
+
+int rf_compute_workload_shift(RF_Raid_t *raidPtr, RF_PhysDiskAddr_t *pda);
+void rf_SelectMirrorDiskIdle(RF_DagNode_t *node);
+void rf_SelectMirrorDiskPartition(RF_DagNode_t *node);
+
+#endif /* !_RF__RF_DAGUTILS_H_ */
diff --git a/sys/dev/raidframe/rf_debugMem.c b/sys/dev/raidframe/rf_debugMem.c
new file mode 100644
index 00000000000..7d32463a11a
--- /dev/null
+++ b/sys/dev/raidframe/rf_debugMem.c
@@ -0,0 +1,578 @@
+/* $OpenBSD: rf_debugMem.c,v 1.1 1999/01/11 14:29:12 niklas Exp $ */
+/* $NetBSD: rf_debugMem.c,v 1.1 1998/11/13 04:20:28 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Daniel Stodolsky, Mark Holland, Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* debugMem.c: memory usage debugging stuff.
+ * Malloc, Calloc, and Free are #defined everywhere
+ * to do_malloc, do_calloc, and do_free.
+ *
+ * if RF_UTILITY is nonzero, it means we're compiling one of the
+ * raidframe utility programs, such as rfctrl or smd. In this
+ * case, we eliminate all references to the threads package
+ * and to the allocation list stuff.
+ */
+
+/* :
+ * Log: rf_debugMem.c,v
+ * Revision 1.38 1996/08/20 14:45:43 jimz
+ * add debugging to track memory allocated (amount only, w/out
+ * excessive sanity checking)
+ *
+ * Revision 1.37 1996/07/27 23:36:08 jimz
+ * Solaris port of simulator
+ *
+ * Revision 1.36 1996/07/18 22:57:14 jimz
+ * port simulator to AIX
+ *
+ * Revision 1.35 1996/06/13 08:55:38 jimz
+ * make error messages refer to file, line of original
+ * allocation
+ *
+ * Revision 1.34 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.33 1996/06/09 02:36:46 jimz
+ * lots of little crufty cleanup- fixup whitespace
+ * issues, comment #ifdefs, improve typing in some
+ * places (esp size-related)
+ *
+ * Revision 1.32 1996/06/05 18:06:02 jimz
+ * Major code cleanup. The Great Renaming is now done.
+ * Better modularity. Better typing. Fixed a bunch of
+ * synchronization bugs. Made a lot of global stuff
+ * per-desc or per-array. Removed dead code.
+ *
+ * Revision 1.31 1996/05/30 23:22:16 jimz
+ * bugfixes of serialization, timing problems
+ * more cleanup
+ *
+ * Revision 1.30 1996/05/30 11:29:41 jimz
+ * Numerous bug fixes. Stripe lock release code disagreed with the taking code
+ * about when stripes should be locked (I made it consistent: no parity, no lock)
+ * There was a lot of extra serialization of I/Os which I've removed- a lot of
+ * it was to calculate values for the cache code, which is no longer with us.
+ * More types, function, macro cleanup. Added code to properly quiesce the array
+ * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
+ * before. Fixed memory allocation, freeing bugs.
+ *
+ * Revision 1.29 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.28 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.27 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.26 1996/05/21 18:53:46 jimz
+ * return NULL for failed allocations, not panic
+ *
+ * Revision 1.25 1996/05/20 16:14:19 jimz
+ * switch to rf_{mutex,cond}_{init,destroy}
+ *
+ * Revision 1.24 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.23 1996/05/17 12:42:35 jimz
+ * wrap get_threadid stuff in #ifndef UTILITY for utils which use
+ * redzone allocation stuff
+ *
+ * Revision 1.22 1996/05/16 23:06:09 jimz
+ * don't warn about NULL alists
+ *
+ * Revision 1.21 1996/05/16 22:25:02 jimz
+ * show allocations for [MC]allocAndAdd
+ *
+ * Revision 1.20 1996/05/15 18:30:22 jimz
+ * print memory allocation as well as frees if memDebug > 1
+ *
+ * Revision 1.19 1996/05/07 17:41:17 jimz
+ * add "level 2" for memDebug, which will print freed address ranges
+ *
+ * Revision 1.18 1996/05/02 20:41:53 jimz
+ * really fix malloc problem out-of-kernel in memory_hash_insert()
+ *
+ * Revision 1.17 1996/05/02 20:04:29 jimz
+ * fixed malloc deadlock previous change introduced
+ *
+ * Revision 1.16 1996/05/01 16:27:26 jimz
+ * get rid of ALLOCMH
+ * stop using ccmn_ memory management
+ *
+ * Revision 1.15 1995/12/12 18:10:06 jimz
+ * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT
+ * fix 80-column brain damage in comments
+ *
+ * Revision 1.14 1995/12/01 15:56:17 root
+ * added copyright info
+ *
+ */
+
+#include "rf_types.h"
+#include "rf_sys.h"
+
+#if RF_UTILITY == 0
+#include "rf_threadstuff.h"
+#include "rf_threadid.h"
+#include "rf_options.h"
+#else /* RF_UTILITY == 0 */
+#include "rf_utility.h"
+#endif /* RF_UTILITY == 0 */
+
+#ifndef KERNEL
+#include <stdio.h>
+#include <assert.h>
+#endif /* !KERNEL */
+#include "rf_debugMem.h"
+#include "rf_general.h"
+
+static long tot_mem_in_use = 0, max_mem = 0;
+
+/* Hash table of information about memory allocations */
+#define RF_MH_TABLESIZE 1000
+
+struct mh_struct {
+ void *address;
+ int size;
+ int line;
+ char *filen;
+ char allocated;
+ struct mh_struct *next;
+};
+static struct mh_struct *mh_table[RF_MH_TABLESIZE];
+RF_DECLARE_MUTEX(rf_debug_mem_mutex)
+static int mh_table_initialized=0;
+
+static void memory_hash_insert(void *addr, int size, int line, char *filen);
+static int memory_hash_remove(void *addr, int sz);
+
+#ifndef KERNEL /* no redzones or "real_" routines in the kernel */
+
+static void rf_redzone_free_failed(void *ptr, int size, int line, char *file);
+
+void *rf_real_redzone_malloc(_size_)
+ int _size_;
+{
+ char *p;
+
+ rf_validate_mh_table();
+ p = malloc((_size_)+16);
+ if (p == NULL)
+ return(p);
+ RF_ASSERT (p);
+ *((long *) p) = (_size_) ;
+ ((char *) p)[(_size_)+8] = '!';
+ ((char *) p)[(_size_)+15] = '!';
+ p += 8;
+ return(p);
+}
+
+void *rf_real_redzone_calloc(_n_,_size_)
+int _n_,_size_;
+{
+ char *p;
+ int _sz_;
+
+ rf_validate_mh_table();
+ _sz_ = (_n_) * (_size_);
+ p = malloc((_sz_)+16);
+ if (p == NULL)
+ return(p);
+ bzero(p,(_sz_)+16);
+ *((long *) p) = (_sz_) ;
+ ((char *) p)[(_sz_)+8] = '!';
+ ((char *) p)[(_sz_)+15] = '!';
+ p += 8;
+ return(p);
+}
+
+void rf_real_redzone_free(p, line, filen)
+char *p;
+int line;
+char *filen;
+{
+ unsigned long _size_;
+
+ rf_validate_mh_table();
+ p -= 8;
+ _size_ = *((long *) p);
+ if ((((char *) p)[(_size_)+8] != '!') || (((char *) p)[(_size_)+15] != '!'))
+ rf_redzone_free_failed(p,(_size_),line,filen);
+ free(p);
+}
+
+unsigned long rf_mem_alloc = 0;
+
+char *rf_real_Malloc(size, line, file)
+ int size;
+ int line;
+ char *file;
+{
+ void *pp;
+ char *p;
+ int tid;
+
+ RF_LOCK_MUTEX(rf_debug_mem_mutex);
+ rf_redzone_malloc(pp, size);
+ p = pp;
+ if (p == NULL) {
+ RF_ERRORMSG3("Unable to malloc %d bytes at line %d file %s\n", size,
+ line, file);
+ }
+ if (rf_memAmtDebug) {
+ rf_mem_alloc += size;
+ printf("%lu size %d %s:%d\n", rf_mem_alloc, size, file, line);
+ }
+#if RF_UTILITY == 0
+ if (rf_memDebug > 1) {
+ rf_get_threadid(tid);
+ printf("[%d] malloc 0x%lx - 0x%lx (%d) %s %d\n", tid, p, p+size, size,
+ file, line);
+ }
+#endif /* RF_UTILITY == 0 */
+ if (rf_memDebug)
+ rf_record_malloc(p, size, line, file);
+ RF_UNLOCK_MUTEX(rf_debug_mem_mutex);
+ return(p);
+}
+
+#if RF_UTILITY == 0
+char *rf_real_MallocAndAdd(size, alist, line, file)
+ int size;
+ RF_AllocListElem_t *alist;
+ int line;
+ char *file;
+{
+ void *pp;
+ char *p;
+ int tid;
+
+ RF_LOCK_MUTEX(rf_debug_mem_mutex);
+ rf_redzone_malloc(pp, size);
+ p = pp;
+ if (p == NULL) {
+ RF_ERRORMSG3("Unable to malloc %d bytes at line %d file %s\n", size,
+ line, file);
+ }
+ if (rf_memAmtDebug) {
+ rf_mem_alloc += size;
+ printf("%lu size %d %s:%d\n", rf_mem_alloc, size, file, line);
+ }
+ if (rf_memDebug > 1) {
+ rf_get_threadid(tid);
+ printf("[%d] malloc+add 0x%lx - 0x%lx (%d) %s %d\n", tid, p, p+size,
+ size, file, line);
+ }
+ if (alist) {
+ rf_real_AddToAllocList(alist, pp, size, 0);
+ }
+ if (rf_memDebug)
+ rf_record_malloc(p, size, line, file);
+ RF_UNLOCK_MUTEX(rf_debug_mem_mutex);
+ return(p);
+}
+#endif /* RF_UTILITY == 0 */
+
+char *rf_real_Calloc(nel, elsz, line, file)
+ int nel;
+ int elsz;
+ int line;
+ char *file;
+{
+ int tid, size;
+ void *pp;
+ char *p;
+
+ size = nel * elsz;
+ RF_LOCK_MUTEX(rf_debug_mem_mutex);
+ rf_redzone_calloc(pp, nel, elsz);
+ p = pp;
+ if (p == NULL) {
+ RF_ERRORMSG4("Unable to calloc %d objects of size %d at line %d file %s\n",
+ nel, elsz, line, file);
+ return(NULL);
+ }
+ if (rf_memAmtDebug) {
+ rf_mem_alloc += size;
+ printf("%lu size %d %s:%d\n", rf_mem_alloc, size, file, line);
+ }
+#if RF_UTILITY == 0
+ if (rf_memDebug > 1) {
+ rf_get_threadid(tid);
+ printf("[%d] calloc 0x%lx - 0x%lx (%d,%d) %s %d\n", tid, p, p+size, nel,
+ elsz, file, line);
+ }
+#endif /* RF_UTILITY == 0 */
+ if (rf_memDebug) {
+ rf_record_malloc(p, size, line, file);
+ }
+ RF_UNLOCK_MUTEX(rf_debug_mem_mutex);
+ return(p);
+}
+
+#if RF_UTILITY == 0
+char *rf_real_CallocAndAdd(nel, elsz, alist, line, file)
+ int nel;
+ int elsz;
+ RF_AllocListElem_t *alist;
+ int line;
+ char *file;
+{
+ int tid, size;
+ void *pp;
+ char *p;
+
+ size = nel * elsz;
+ RF_LOCK_MUTEX(rf_debug_mem_mutex);
+ rf_redzone_calloc(pp, nel, elsz);
+ p = pp;
+ if (p == NULL) {
+ RF_ERRORMSG4("Unable to calloc %d objs of size %d at line %d file %s\n",
+ nel, elsz, line, file);
+ return(NULL);
+ }
+ if (rf_memAmtDebug) {
+ rf_mem_alloc += size;
+ printf("%lu size %d %s:%d\n", rf_mem_alloc, size, file, line);
+ }
+ if (rf_memDebug > 1) {
+ rf_get_threadid(tid);
+ printf("[%d] calloc+add 0x%lx - 0x%lx (%d,%d) %s %d\n", tid, p,
+ p+size, nel, elsz, file, line);
+ }
+ if (alist) {
+ rf_real_AddToAllocList(alist, pp, size, 0);
+ }
+ if (rf_memDebug)
+ rf_record_malloc(p, size, line, file);
+ RF_UNLOCK_MUTEX(rf_debug_mem_mutex);
+ return(p);
+}
+#endif /* RF_UTILITY == 0 */
+
+void rf_real_Free(p, sz, line, file)
+ void *p;
+ int sz;
+ int line;
+ char *file;
+{
+ int tid;
+
+#if RF_UTILITY == 0
+ if (rf_memDebug > 1) {
+ rf_get_threadid(tid);
+ printf("[%d] free 0x%lx - 0x%lx (%d) %s %d\n", tid, p, ((char *)p)+sz, sz,
+ file, line);
+ }
+#endif /* RF_UTILITY == 0 */
+ RF_LOCK_MUTEX(rf_debug_mem_mutex);
+ if (rf_memAmtDebug) {
+ rf_mem_alloc -= sz;
+ printf("%lu - size %d %s:%d\n", rf_mem_alloc, sz, file, line);
+ }
+ if (rf_memDebug) {
+ rf_unrecord_malloc(p,sz);
+ }
+ rf_redzone_free(p);
+ RF_UNLOCK_MUTEX(rf_debug_mem_mutex);
+}
+
+void rf_validate_mh_table()
+{
+ int i, size;
+ struct mh_struct *p;
+ char *cp;
+
+	/* Validation is disabled by default: this early return skips the table
+	 * walk below.  Remove it to check every recorded redzone on each call. */
+	return;
+ for (i=0; i<RF_MH_TABLESIZE; i++) {
+ for (p=mh_table[i]; p; p=p->next) if (p->allocated) {
+ cp = ((char *) p->address) - 8;
+ size = *((long *) cp);
+ if ((((char *) cp)[(size)+8] != '!') || (((char *) cp)[(size)+15] != '!')) {
+ rf_redzone_free_failed(cp,(size),__LINE__,__FILE__);
+ }
+ }
+ }
+}
+
+static void rf_redzone_free_failed(ptr,size,line,file)
+ void *ptr;
+ int size;
+ int line;
+ char *file;
+{
+ RF_ERRORMSG4("Free of 0x%lx (recorded size %d) at %d of %s detected redzone overrun\n",ptr,size,line,file);
+ RF_ASSERT(0);
+}
+
+#endif /* !KERNEL */
+
+void rf_record_malloc(p, size, line, filen)
+void *p;
+int size, line;
+char *filen;
+{
+ RF_ASSERT(size != 0);
+
+ /*RF_LOCK_MUTEX(rf_debug_mem_mutex);*/
+ memory_hash_insert(p, size, line, filen);
+	tot_mem_in_use += size;
+	if (tot_mem_in_use > max_mem)
+		max_mem = tot_mem_in_use;	/* track the peak reported by rf_ReportMaxMem() */
+ /*RF_UNLOCK_MUTEX(rf_debug_mem_mutex);*/
+ if ( (long) p == rf_memDebugAddress) {
+ printf("Allocate: debug address allocated from line %d file %s\n",line,filen);
+ }
+}
+
+void rf_unrecord_malloc(p, sz)
+void *p;
+int sz;
+{
+ int size;
+
+ /*RF_LOCK_MUTEX(rf_debug_mem_mutex);*/
+ size = memory_hash_remove(p, sz);
+ tot_mem_in_use -= size;
+ /*RF_UNLOCK_MUTEX(rf_debug_mem_mutex);*/
+ if ( (long) p == rf_memDebugAddress) {
+ printf("Free: Found debug address\n"); /* this is really only a flag line for gdb */
+ }
+}
+
+void rf_print_unfreed()
+{
+ int i, foundone=0;
+ struct mh_struct *p;
+
+ for (i=0; i<RF_MH_TABLESIZE; i++) {
+ for (p=mh_table[i]; p; p=p->next) if (p->allocated) {
+ if (!foundone) printf("\n\nThere are unfreed memory locations at program shutdown:\n");
+ foundone = 1;
+ printf("Addr 0x%lx Size %d line %d file %s\n",
+ (long)p->address,p->size,p->line,p->filen);
+ }
+ }
+ if (tot_mem_in_use) {
+ printf("%ld total bytes in use\n", tot_mem_in_use);
+ }
+}
+
+int rf_ConfigureDebugMem(listp)
+ RF_ShutdownList_t **listp;
+{
+ int i, rc;
+
+ rc = rf_create_managed_mutex(listp, &rf_debug_mem_mutex);
+ if (rc) {
+ RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ return(rc);
+ }
+ if (rf_memDebug) {
+ for (i=0; i<RF_MH_TABLESIZE; i++)
+ mh_table[i] = NULL;
+ mh_table_initialized=1;
+ }
+ return(0);
+}
+
+#define HASHADDR(_a_) ( (((unsigned long) _a_)>>3) % RF_MH_TABLESIZE )
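+
+/*
+ * A quick illustration (hypothetical address, not taken from the sources):
+ * an allocation at 0x1040 hashes to bucket (0x1040 >> 3) % RF_MH_TABLESIZE
+ * = 520 % 1000 = 520, so its record is chained off mh_table[520].  The >>3
+ * discards the low bits, which are identical for every 8-byte-aligned
+ * allocation and would otherwise crowd the entries into a few buckets.
+ */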
+
+static void memory_hash_insert(addr, size, line, filen)
+void *addr;
+int size, line;
+char *filen;
+{
+ unsigned long bucket = HASHADDR(addr);
+ struct mh_struct *p;
+
+ RF_ASSERT(mh_table_initialized);
+
+ /* search for this address in the hash table */
+ for (p=mh_table[bucket]; p && (p->address != addr); p=p->next);
+ if (!p) {
+#ifdef KERNEL
+ RF_Malloc(p,sizeof(struct mh_struct),(struct mh_struct *));
+#else /* KERNEL */
+ p = (struct mh_struct *)malloc(sizeof(struct mh_struct));
+#endif /* KERNEL */
+ RF_ASSERT(p);
+ p->next = mh_table[bucket];
+ mh_table[bucket] = p;
+ p->address = addr;
+ p->allocated = 0;
+ }
+ if (p->allocated) {
+ printf("ERROR: reallocated address 0x%lx from line %d, file %s without intervening free\n",(long) addr, line, filen);
+ printf(" last allocated from line %d file %s\n",p->line, p->filen);
+ RF_ASSERT(0);
+ }
+ p->size = size; p->line = line; p->filen = filen;
+ p->allocated = 1;
+}
+
+static int memory_hash_remove(addr, sz)
+void *addr;
+int sz;
+{
+ unsigned long bucket = HASHADDR(addr);
+ struct mh_struct *p;
+
+ RF_ASSERT(mh_table_initialized);
+ for (p=mh_table[bucket]; p && (p->address != addr); p=p->next);
+ if (!p) {
+ printf("ERROR: freeing never-allocated address 0x%lx\n",(long) addr);
+ RF_PANIC();
+ }
+ if (!p->allocated) {
+ printf("ERROR: freeing unallocated address 0x%lx. Last allocation line %d file %s\n",(long) addr, p->line, p->filen);
+ RF_PANIC();
+ }
+ if (sz > 0 && p->size != sz) { /* you can suppress this error by using a negative value as the size to free */
+ printf("ERROR: incorrect size at free for address 0x%lx: is %d should be %d. Alloc at line %d of file %s\n",(unsigned long) addr, sz, p->size,p->line, p->filen);
+ RF_PANIC();
+ }
+ p->allocated = 0;
+ return(p->size);
+}
+
+void rf_ReportMaxMem()
+{
+ printf("Max memory used: %d bytes\n",(int)max_mem);
+#ifndef KERNEL
+ fflush(stdout);
+	fprintf(stderr,"Max memory used: %d bytes\n",(int)max_mem);
+ fflush(stderr);
+#endif /* !KERNEL */
+}
diff --git a/sys/dev/raidframe/rf_debugMem.h b/sys/dev/raidframe/rf_debugMem.h
new file mode 100644
index 00000000000..2b5f1545d12
--- /dev/null
+++ b/sys/dev/raidframe/rf_debugMem.h
@@ -0,0 +1,263 @@
+/* $OpenBSD: rf_debugMem.h,v 1.1 1999/01/11 14:29:12 niklas Exp $ */
+/* $NetBSD: rf_debugMem.h,v 1.1 1998/11/13 04:20:28 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Daniel Stodolsky, Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*
+ * rf_debugMem.h -- memory leak debugging module
+ *
+ * IMPORTANT: if you put the lock/unlock mutex stuff back in here, you
+ * need to take it out of the routines in debugMem.c
+ *
+ * Log: rf_debugMem.h,v
+ * Revision 1.27 1996/07/18 22:57:14 jimz
+ * port simulator to AIX
+ *
+ * Revision 1.26 1996/06/11 13:46:43 jimz
+ * make bracing consistent around memory allocation macros
+ *
+ * Revision 1.25 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.24 1996/06/05 18:06:02 jimz
+ * Major code cleanup. The Great Renaming is now done.
+ * Better modularity. Better typing. Fixed a bunch of
+ * synchronization bugs. Made a lot of global stuff
+ * per-desc or per-array. Removed dead code.
+ *
+ * Revision 1.23 1996/05/30 11:29:41 jimz
+ * Numerous bug fixes. Stripe lock release code disagreed with the taking code
+ * about when stripes should be locked (I made it consistent: no parity, no lock)
+ * There was a lot of extra serialization of I/Os which I've removed- a lot of
+ * it was to calculate values for the cache code, which is no longer with us.
+ * More types, function, macro cleanup. Added code to properly quiesce the array
+ * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
+ * before. Fixed memory allocation, freeing bugs.
+ *
+ * Revision 1.22 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.21 1996/05/23 22:17:40 jimz
+ * fix alloclist macro names for kernel
+ *
+ * Revision 1.20 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.19 1996/05/23 13:18:23 jimz
+ * include rf_options.h
+ *
+ * Revision 1.18 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.17 1996/05/21 18:51:54 jimz
+ * cleaned up macro args
+ *
+ * Revision 1.16 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.15 1996/05/01 16:26:22 jimz
+ * get rid of old ccmn stuff
+ *
+ * Revision 1.14 1995/12/01 15:58:09 root
+ * added copyright info
+ *
+ * Revision 1.13 1995/10/11 15:26:03 jimz
+ * zero memory after allocation in kernel (hide effects
+ * of uninitialized structs)
+ *
+ * Revision 1.12 1995/10/06 17:04:15 jimz
+ * make Malloc and Free in kernel use kernel malloc package, not cam
+ * dbufs (which is gross, and was exhausting cam zalloc limit)
+ *
+ * Revision 1.11 1995/05/01 13:28:00 holland
+ * parity range locks, locking disk requests, recon+parityscan in kernel, etc.
+ *
+ * Revision 1.10 1995/04/24 13:25:51 holland
+ * rewrite to move disk queues, recon, & atomic RMW to kernel
+ *
+ * Revision 1.9 1995/02/17 19:39:56 holland
+ * added size param to all calls to Free().
+ * this is ignored at user level, but necessary in the kernel.
+ *
+ * Revision 1.8 1995/02/10 17:34:10 holland
+ * kernelization changes
+ *
+ * Revision 1.7 1995/02/03 22:31:36 holland
+ * many changes related to kernelization
+ *
+ * Revision 1.6 1995/02/01 15:13:05 holland
+ * moved #include of general.h out of raid.h and into each file
+ *
+ * Revision 1.5 1995/02/01 14:25:19 holland
+ * began changes for kernelization:
+ * changed all instances of mutex_t and cond_t to DECLARE macros
+ * converted configuration code to use config structure
+ *
+ * Revision 1.4 1995/01/11 19:27:02 holland
+ * many changes related to performance tuning
+ *
+ * Revision 1.3 1994/11/29 21:34:56 danner
+ * Changed type of redzone_calloc and malloc to void *.
+ *
+ * Revision 1.2 1994/11/28 22:13:23 danner
+ * Many macros converted to functions.
+ *
+ */
+
+#ifndef _RF__RF_DEBUGMEM_H_
+#define _RF__RF_DEBUGMEM_H_
+
+#include "rf_archs.h"
+#include "rf_alloclist.h"
+#include "rf_options.h"
+
+#ifndef KERNEL
+
+#if !defined(__NetBSD__) && !defined(__OpenBSD__)
+void *malloc(), *calloc();
+#endif
+RF_DECLARE_EXTERN_MUTEX(rf_debug_mem_mutex)
+
+/*
+ * redzone malloc, calloc, and free allocate an extra 16 bytes on each
+ * malloc/calloc call to allow tracking of overflows on free.
+ */
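+
+/*
+ * For illustration, the layout produced by rf_real_redzone_malloc(size)
+ * in rf_debugMem.c is:
+ *
+ *   bytes 0..7              the requested size, stored as a long
+ *   bytes 8..size+7         the buffer actually returned to the caller
+ *   bytes size+8..size+15   trailing redzone; '!' sentinels are written
+ *                           at offsets size+8 and size+15
+ *
+ * rf_real_redzone_free() steps back 8 bytes, rereads the size, and checks
+ * both sentinels before handing the block to free().
+ */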
+
+#if RF_MEMORY_REDZONES > 0
+#define rf_redzone_malloc(_p_,_size_) _p_ = rf_real_redzone_malloc(_size_)
+#define rf_redzone_calloc(_p_,_n_,_size_) _p_ = rf_real_redzone_calloc(_n_,_size_)
+#define rf_redzone_free(_p_) rf_real_redzone_free(_p_, __LINE__, __FILE__)
+#else /* RF_MEMORY_REDZONES > 0 */
+#define rf_redzone_malloc(_p_,_size_) _p_ = malloc(_size_)
+#define rf_redzone_calloc(_p_,_nel_,_size_) _p_ = calloc(_nel_,_size_)
+#define rf_redzone_free(_ptr_) free(_ptr_)
+#endif /* RF_MEMORY_REDZONES > 0 */
+
+#define RF_Malloc(_p_, _size_, _cast_) { \
+ _p_ = _cast_ rf_real_Malloc(_size_, __LINE__, __FILE__); \
+}
+
+#define RF_MallocAndAdd(_p_, _size_, _cast_, _alist_) { \
+ _p_ = _cast_ rf_real_MallocAndAdd(_size_, _alist_, __LINE__, __FILE__); \
+}
+
+#define RF_Calloc(_p_, _nel_, _elsz_, _cast_) { \
+ _p_ = _cast_ rf_real_Calloc(_nel_, _elsz_, __LINE__, __FILE__); \
+}
+
+#define RF_CallocAndAdd(_p_, _nel_, _elsz_, _cast_, _alist_) { \
+ _p_ = _cast_ rf_real_CallocAndAdd(_nel_, _elsz_, _alist_, __LINE__, __FILE__); \
+}
+
+#define RF_Free(__p_, _sz_) { \
+ rf_real_Free(__p_, _sz_, __LINE__, __FILE__); \
+}
+
+#else /* KERNEL */
+
+#include <sys/types.h>
+#if defined(__NetBSD__) || defined(__OpenBSD__)
+typedef u_int32_t U32;
+#else
+#include <io/common/iotypes.h> /* just to get defn of U32 */
+#endif /* __NetBSD__ || __OpenBSD__ */
+#include <sys/malloc.h>
+
+
+#if defined(__NetBSD__) || defined(__OpenBSD__)
+
+#define RF_Malloc(_p_, _size_, _cast_) \
+ { \
+ _p_ = _cast_ malloc((u_long)_size_, M_DEVBUF, M_WAITOK); \
+ bzero((char *)_p_, _size_); \
+ if (rf_memDebug) rf_record_malloc(_p_, _size_, __LINE__, __FILE__); \
+ }
+
+#else
+
+#define RF_Malloc(_p_, _size_, _cast_) \
+ { \
+ _p_ = _cast_ malloc((u_long)_size_, BUCKETINDEX(_size_), M_DEVBUF, M_WAITOK); \
+ bzero((char *)_p_, _size_); \
+ if (rf_memDebug) rf_record_malloc(_p_, _size_, __LINE__, __FILE__); \
+ }
+#endif /* __NetBSD__ || __OpenBSD__ */
+
+#define RF_MallocAndAdd(__p_, __size_, __cast_, __alist_) \
+ { \
+ RF_Malloc(__p_, __size_, __cast_); \
+ if (__alist_) rf_AddToAllocList(__alist_, __p_, __size_); \
+ }
+
+#define RF_Calloc(_p_, _nel_, _elsz_, _cast_) \
+ { \
+ RF_Malloc( _p_, (_nel_) * (_elsz_), _cast_); \
+ bzero( (_p_), (_nel_) * (_elsz_) ); \
+ }
+
+#define RF_CallocAndAdd(__p,__nel,__elsz,__cast,__alist) \
+ { \
+ RF_Calloc(__p, __nel, __elsz, __cast); \
+ if (__alist) rf_AddToAllocList(__alist, __p, (__nel)*(__elsz)); \
+ }
+
+#define RF_Free(_p_, _sz_) \
+ { \
+ free((void *)(_p_), M_DEVBUF); \
+ if (rf_memDebug) rf_unrecord_malloc(_p_, (U32) (_sz_)); \
+ }
+
+#endif /* KERNEL */
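+
+/*
+ * Typical usage of the allocation macros (a minimal sketch; struct foo is
+ * only a placeholder).  The cast argument is applied to the pointer the
+ * underlying allocator returns, and the size passed to RF_Free should
+ * match the size that was allocated:
+ *
+ *   struct foo *p;
+ *   RF_Malloc(p, sizeof(struct foo), (struct foo *));
+ *   ...
+ *   RF_Free(p, sizeof(struct foo));
+ */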
+
+#ifndef KERNEL
+void *rf_real_redzone_malloc(int size);
+void *rf_real_redzone_calloc(int n, int size);
+void rf_real_redzone_free(char *p, int line, char *filen);
+char *rf_real_Malloc(int size, int line, char *file);
+char *rf_real_Calloc(int nel, int elsz, int line, char *file);
+void rf_real_Free(void *p, int sz, int line, char *file);
+void rf_validate_mh_table(void);
+#if RF_UTILITY == 0
+char *rf_real_MallocAndAdd(int size, RF_AllocListElem_t *alist, int line, char *file);
+char *rf_real_CallocAndAdd(int nel, int elsz, RF_AllocListElem_t *alist, int line, char *file);
+#endif /* RF_UTILITY == 0 */
+#endif /* !KERNEL */
+
+void rf_record_malloc(void *p, int size, int line, char *filen);
+void rf_unrecord_malloc(void *p, int sz);
+void rf_print_unfreed(void);
+int rf_ConfigureDebugMem(RF_ShutdownList_t **listp);
+void rf_ReportMaxMem(void);
+
+#endif /* !_RF__RF_DEBUGMEM_H_ */
diff --git a/sys/dev/raidframe/rf_debugprint.c b/sys/dev/raidframe/rf_debugprint.c
new file mode 100644
index 00000000000..573d53ae71a
--- /dev/null
+++ b/sys/dev/raidframe/rf_debugprint.c
@@ -0,0 +1,186 @@
+/* $OpenBSD: rf_debugprint.c,v 1.1 1999/01/11 14:29:13 niklas Exp $ */
+/* $NetBSD: rf_debugprint.c,v 1.1 1998/11/13 04:20:28 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*
+ * Code to do debug printfs. Calls to rf_debug_printf cause the corresponding
+ * information to be printed to a circular buffer rather than the screen.
+ * The point is to try and minimize the timing variations induced by the
+ * printfs, and to capture only the printfs immediately preceding a failure.
+ */
+
+/* :
+ * Log: rf_debugprint.c,v
+ * Revision 1.13 1996/08/07 21:08:31 jimz
+ * remove bogus ; from mutex decl
+ *
+ * Revision 1.12 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.11 1996/06/05 18:06:02 jimz
+ * Major code cleanup. The Great Renaming is now done.
+ * Better modularity. Better typing. Fixed a bunch of
+ * synchronization bugs. Made a lot of global stuff
+ * per-desc or per-array. Removed dead code.
+ *
+ * Revision 1.10 1996/05/30 23:22:16 jimz
+ * bugfixes of serialization, timing problems
+ * more cleanup
+ *
+ * Revision 1.9 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.8 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.7 1996/05/20 16:16:06 jimz
+ * switch to rf_{mutex,cond}_{init,destroy}
+ *
+ * Revision 1.6 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.5 1995/12/01 16:00:45 root
+ * added copyright info
+ *
+ */
+
+#include "rf_types.h"
+#include "rf_threadstuff.h"
+#include "rf_debugprint.h"
+#include "rf_general.h"
+#include "rf_options.h"
+
+#include <sys/param.h>
+
+struct RF_Entry_s {
+ char *cstring;
+ void *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8;
+};
+
+/* space for 1k lines */
+#define BUFSHIFT 10
+#define BUFSIZE (1<<BUFSHIFT)
+#define BUFMASK (BUFSIZE-1)
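+
+/*
+ * BUFSIZE is a power of two, so index wraparound is a mask rather than a
+ * modulus: for example, (1023 + 1) & BUFMASK == 0, which returns the
+ * write index to the start of the circular buffer.
+ */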
+
+static struct RF_Entry_s rf_debugprint_buf[BUFSIZE];
+static int rf_debugprint_index = 0;
+RF_DECLARE_STATIC_MUTEX(rf_debug_print_mutex)
+
+int rf_ConfigureDebugPrint(listp)
+ RF_ShutdownList_t **listp;
+{
+ int rc;
+
+ rc = rf_create_managed_mutex(listp, &rf_debug_print_mutex);
+ if (rc) {
+ RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ return(rc);
+ }
+ rf_clear_debug_print_buffer();
+ return(0);
+}
+
+void rf_clear_debug_print_buffer()
+{
+ int i;
+
+ for (i=0; i<BUFSIZE; i++)
+ rf_debugprint_buf[i].cstring = NULL;
+ rf_debugprint_index = 0;
+}
+
+void rf_debug_printf(s,a1,a2,a3,a4,a5,a6,a7,a8)
+char *s;
+void *a1,*a2,*a3,*a4,*a5,*a6,*a7,*a8;
+{
+ int idx;
+
+ if (rf_debugPrintUseBuffer) {
+
+ RF_LOCK_MUTEX(rf_debug_print_mutex);
+ idx = rf_debugprint_index;
+ rf_debugprint_index = (rf_debugprint_index+1) & BUFMASK;
+ RF_UNLOCK_MUTEX(rf_debug_print_mutex);
+
+ rf_debugprint_buf[idx].cstring = s;
+ rf_debugprint_buf[idx].a1 = a1;
+ rf_debugprint_buf[idx].a2 = a2;
+ rf_debugprint_buf[idx].a3 = a3;
+ rf_debugprint_buf[idx].a4 = a4;
+ rf_debugprint_buf[idx].a5 = a5;
+ rf_debugprint_buf[idx].a6 = a6;
+ rf_debugprint_buf[idx].a7 = a7;
+ rf_debugprint_buf[idx].a8 = a8;
+ }
+ else {
+ printf(s,a1,a2,a3,a4,a5,a6,a7,a8);
+ }
+}
+
+void rf_print_debug_buffer()
+{
+ rf_spill_debug_buffer(NULL);
+}
+
+void rf_spill_debug_buffer(fname)
+ char *fname;
+{
+ int i;
+#ifndef KERNEL
+ FILE *fp;
+#endif /* !KERNEL */
+
+ if (!rf_debugPrintUseBuffer)
+ return;
+
+ RF_LOCK_MUTEX(rf_debug_print_mutex);
+#ifndef KERNEL
+	fp = (fname) ? fopen(fname,"w") : stdout;
+	if (!fp) {
+		printf("Unable to open file %s for writing\n",fname);
+		RF_UNLOCK_MUTEX(rf_debug_print_mutex);
+		return;
+	}
+	/* Dump every used slot of the circular buffer: start one past the current
+	 * index (masked to stay in range) and finish with the slot at the current
+	 * index itself, skipping entries that were never written. */
+	for (i=(rf_debugprint_index+1)&BUFMASK; i != rf_debugprint_index; i = (i+1)&BUFMASK)
+		if (rf_debugprint_buf[i].cstring)
+			fprintf(fp,rf_debugprint_buf[i].cstring,rf_debugprint_buf[i].a1,rf_debugprint_buf[i].a2,rf_debugprint_buf[i].a3,
+			    rf_debugprint_buf[i].a4,rf_debugprint_buf[i].a5,rf_debugprint_buf[i].a6,rf_debugprint_buf[i].a7,rf_debugprint_buf[i].a8);
+	if (rf_debugprint_buf[i].cstring)
+		fprintf(fp,rf_debugprint_buf[i].cstring,rf_debugprint_buf[i].a1,rf_debugprint_buf[i].a2,rf_debugprint_buf[i].a3,
+		    rf_debugprint_buf[i].a4,rf_debugprint_buf[i].a5,rf_debugprint_buf[i].a6,rf_debugprint_buf[i].a7,rf_debugprint_buf[i].a8);
+	if (fname)
+		fclose(fp);
+#else /* !KERNEL */
+	for (i=(rf_debugprint_index+1)&BUFMASK; i != rf_debugprint_index; i = (i+1)&BUFMASK)
+		if (rf_debugprint_buf[i].cstring)
+			printf(rf_debugprint_buf[i].cstring,rf_debugprint_buf[i].a1,rf_debugprint_buf[i].a2,rf_debugprint_buf[i].a3,
+			    rf_debugprint_buf[i].a4,rf_debugprint_buf[i].a5,rf_debugprint_buf[i].a6,rf_debugprint_buf[i].a7,rf_debugprint_buf[i].a8);
+	if (rf_debugprint_buf[i].cstring)
+		printf(rf_debugprint_buf[i].cstring,rf_debugprint_buf[i].a1,rf_debugprint_buf[i].a2,rf_debugprint_buf[i].a3,
+		    rf_debugprint_buf[i].a4,rf_debugprint_buf[i].a5,rf_debugprint_buf[i].a6,rf_debugprint_buf[i].a7,rf_debugprint_buf[i].a8);
+#endif /* !KERNEL */
+ RF_UNLOCK_MUTEX(rf_debug_print_mutex);
+}
diff --git a/sys/dev/raidframe/rf_debugprint.h b/sys/dev/raidframe/rf_debugprint.h
new file mode 100644
index 00000000000..6810fd0a6ee
--- /dev/null
+++ b/sys/dev/raidframe/rf_debugprint.h
@@ -0,0 +1,64 @@
+/* $OpenBSD: rf_debugprint.h,v 1.1 1999/01/11 14:29:13 niklas Exp $ */
+/* $NetBSD: rf_debugprint.h,v 1.1 1998/11/13 04:20:28 oster Exp $ */
+/*
+ * rf_debugprint.h
+ */
+/*
+ * Copyright (c) 1996 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+/*
+ * :
+ * Log: rf_debugprint.h,v
+ * Revision 1.4 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.3 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.2 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.1 1996/05/18 19:55:43 jimz
+ * Initial revision
+ *
+ */
+
+#ifndef _RF__RF_DEBUGPRINT_H_
+#define _RF__RF_DEBUGPRINT_H_
+
+int rf_ConfigureDebugPrint(RF_ShutdownList_t **listp);
+void rf_clear_debug_print_buffer(void);
+void rf_debug_printf(char *s, void *a1, void *a2, void *a3, void *a4,
+ void *a5, void *a6, void *a7, void *a8);
+void rf_print_debug_buffer(void);
+void rf_spill_debug_buffer(char *fname);
+
+#endif /* !_RF__RF_DEBUGPRINT_H_ */
diff --git a/sys/dev/raidframe/rf_decluster.c b/sys/dev/raidframe/rf_decluster.c
new file mode 100644
index 00000000000..11cff33143a
--- /dev/null
+++ b/sys/dev/raidframe/rf_decluster.c
@@ -0,0 +1,847 @@
+/* $OpenBSD: rf_decluster.c,v 1.1 1999/01/11 14:29:14 niklas Exp $ */
+/* $NetBSD: rf_decluster.c,v 1.1 1998/11/13 04:20:28 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*----------------------------------------------------------------------
+ *
+ * rf_decluster.c -- code related to the declustered layout
+ *
+ * Created 10-21-92 (MCH)
+ *
+ * Nov 93: adding support for distributed sparing. This code is a little
+ * complex: the basic layout used is as follows:
+ * let F = (v-1)/GCD(r,v-1). The spare space for each set of
+ * F consecutive fulltables is grouped together and placed after
+ * that set of tables.
+ * +------------------------------+
+ * | F fulltables |
+ * | Spare Space |
+ * | F fulltables |
+ * | Spare Space |
+ * | ... |
+ * +------------------------------+
+ *
+ *--------------------------------------------------------------------*/
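+
+/*
+ * A worked example with illustrative parameters (not taken from any
+ * particular block design): for v = 21 and r = 5, F = (v-1)/GCD(r,v-1)
+ * = 20/5 = 4, so the spare space for every group of 4 consecutive
+ * fulltables is placed immediately after that group; this is the
+ * FullTablesPerSpareRegion value computed in rf_ConfigureDeclustered().
+ */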
+
+/*
+ * :
+ * Log: rf_decluster.c,v
+ * Revision 1.51 1996/08/21 19:47:10 jimz
+ * fix bogus return values from config
+ *
+ * Revision 1.50 1996/08/20 22:41:42 jimz
+ * better diagnostics for bad blockdesigns
+ *
+ * Revision 1.49 1996/07/31 16:56:18 jimz
+ * dataBytesPerStripe, sectorsPerDisk init arch-indep.
+ *
+ * Revision 1.48 1996/07/29 14:05:12 jimz
+ * fix numPUs/numRUs confusion (everything is now numRUs)
+ * clean up some commenting, return values
+ *
+ * Revision 1.47 1996/07/27 23:36:08 jimz
+ * Solaris port of simulator
+ *
+ * Revision 1.46 1996/07/27 18:40:11 jimz
+ * cleanup sweep
+ *
+ * Revision 1.45 1996/07/18 22:57:14 jimz
+ * port simulator to AIX
+ *
+ * Revision 1.44 1996/07/13 00:00:59 jimz
+ * sanitized generalized reconstruction architecture
+ * cleaned up head sep, rbuf problems
+ *
+ * Revision 1.43 1996/06/19 17:53:48 jimz
+ * move GetNumSparePUs, InstallSpareTable ops into layout switch
+ *
+ * Revision 1.42 1996/06/17 03:23:48 jimz
+ * switch DeclusteredDS typing
+ *
+ * Revision 1.41 1996/06/11 08:55:15 jimz
+ * improved error-checking at configuration time
+ *
+ * Revision 1.40 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.39 1996/06/09 02:36:46 jimz
+ * lots of little crufty cleanup- fixup whitespace
+ * issues, comment #ifdefs, improve typing in some
+ * places (esp size-related)
+ *
+ * Revision 1.38 1996/06/07 22:26:27 jimz
+ * type-ify which_ru (RF_ReconUnitNum_t)
+ *
+ * Revision 1.37 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.36 1996/06/03 23:28:26 jimz
+ * more bugfixes
+ * check in tree to sync for IPDS runs with current bugfixes
+ * there still may be a problem with threads in the script test
+ * getting I/Os stuck- not trivially reproducible (runs ~50 times
+ * in a row without getting stuck)
+ *
+ * Revision 1.35 1996/06/02 17:31:48 jimz
+ * Moved a lot of global stuff into array structure, where it belongs.
+ * Fixed up paritylogging, pss modules in this manner. Some general
+ * code cleanup. Removed lots of dead code, some dead files.
+ *
+ * Revision 1.34 1996/05/30 23:22:16 jimz
+ * bugfixes of serialization, timing problems
+ * more cleanup
+ *
+ * Revision 1.33 1996/05/30 11:29:41 jimz
+ * Numerous bug fixes. Stripe lock release code disagreed with the taking code
+ * about when stripes should be locked (I made it consistent: no parity, no lock)
+ * There was a lot of extra serialization of I/Os which I've removed- a lot of
+ * it was to calculate values for the cache code, which is no longer with us.
+ * More types, function, macro cleanup. Added code to properly quiesce the array
+ * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
+ * before. Fixed memory allocation, freeing bugs.
+ *
+ * Revision 1.32 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.31 1996/05/24 01:59:45 jimz
+ * another checkpoint in code cleanup for release
+ * time to sync kernel tree
+ *
+ * Revision 1.30 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.29 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.28 1995/12/12 18:10:06 jimz
+ * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT
+ * fix 80-column brain damage in comments
+ *
+ * Revision 1.27 1995/12/01 16:00:08 root
+ * added copyright info
+ *
+ * Revision 1.26 1995/11/28 21:35:12 amiri
+ * set the RF_BD_DECLUSTERED flag
+ *
+ * Revision 1.25 1995/11/17 18:56:00 wvcii
+ * added prototyping to MapParity
+ *
+ * Revision 1.24 1995/07/04 22:25:33 holland
+ * increased default num bufs
+ *
+ * Revision 1.23 1995/07/03 20:23:51 holland
+ * changed floating recon bufs & head sep yet again
+ *
+ * Revision 1.22 1995/07/03 18:12:14 holland
+ * changed the way the number of floating recon bufs & the head sep
+ * limit are set
+ *
+ * Revision 1.21 1995/07/02 15:07:42 holland
+ * bug fixes related to getting distributed sparing numbers
+ *
+ * Revision 1.20 1995/06/23 13:41:28 robby
+ * updeated to prototypes in rf_layout.h
+ *
+ */
+
+#ifdef _KERNEL
+#define KERNEL
+#endif
+
+
+#include "rf_types.h"
+#include "rf_raid.h"
+#include "rf_raidframe.h"
+#include "rf_configure.h"
+#include "rf_decluster.h"
+#include "rf_debugMem.h"
+#include "rf_utils.h"
+#include "rf_alloclist.h"
+#include "rf_general.h"
+#include "rf_shutdown.h"
+#include "rf_sys.h"
+
+extern int rf_copyback_in_progress; /* debug only */
+
+/* found in rf_kintf.c */
+int rf_GetSpareTableFromDaemon(RF_SparetWait_t *req);
+
+/* configuration code */
+
+int rf_ConfigureDeclustered(
+ RF_ShutdownList_t **listp,
+ RF_Raid_t *raidPtr,
+ RF_Config_t *cfgPtr)
+{
+ RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
+ int b, v, k, r, lambda; /* block design params */
+ int i, j;
+ RF_RowCol_t *first_avail_slot;
+ RF_StripeCount_t complete_FT_count, numCompleteFullTablesPerDisk;
+ RF_DeclusteredConfigInfo_t *info;
+ RF_StripeCount_t PUsPerDisk, spareRegionDepthInPUs, numCompleteSpareRegionsPerDisk, extraPUsPerDisk;
+ RF_StripeCount_t totSparePUsPerDisk;
+ RF_SectorNum_t diskOffsetOfLastFullTableInSUs;
+ RF_SectorCount_t SpareSpaceInSUs;
+ char *cfgBuf = (char *) (cfgPtr->layoutSpecific);
+ RF_StripeNum_t l, SUID;
+
+ SUID = l = 0;
+ numCompleteSpareRegionsPerDisk = 0;
+
+ /* 1. create layout specific structure */
+ RF_MallocAndAdd(info, sizeof(RF_DeclusteredConfigInfo_t), (RF_DeclusteredConfigInfo_t *), raidPtr->cleanupList);
+ if (info == NULL)
+ return(ENOMEM);
+ layoutPtr->layoutSpecificInfo = (void *) info;
+ info->SpareTable = NULL;
+
+ /* 2. extract parameters from the config structure */
+ if (layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) {
+ (void) bcopy(cfgBuf, info->sparemap_fname, RF_SPAREMAP_NAME_LEN);
+ }
+ cfgBuf += RF_SPAREMAP_NAME_LEN;
+
+ b = *( (int *) cfgBuf); cfgBuf += sizeof(int);
+ v = *( (int *) cfgBuf); cfgBuf += sizeof(int);
+ k = *( (int *) cfgBuf); cfgBuf += sizeof(int);
+ r = *( (int *) cfgBuf); cfgBuf += sizeof(int);
+ lambda = *( (int *) cfgBuf); cfgBuf += sizeof(int);
+ raidPtr->noRotate = *( (int *) cfgBuf); cfgBuf += sizeof(int);
+
+ /* the sparemaps are generated assuming that parity is rotated, so we issue
+ * a warning if both distributed sparing and no-rotate are on at the same time
+ */
+ if ((layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) && raidPtr->noRotate) {
+ RF_ERRORMSG("Warning: distributed sparing specified without parity rotation.\n");
+ }
+
+ if (raidPtr->numCol != v) {
+ RF_ERRORMSG2("RAID: config error: table element count (%d) not equal to no. of cols (%d)\n", v, raidPtr->numCol);
+ return(EINVAL);
+ }
+
+ /* 3. set up the values used in the mapping code */
+ info->BlocksPerTable = b;
+ info->Lambda = lambda;
+ info->NumParityReps = info->groupSize = k;
+ info->SUsPerTable = b * (k-1) * layoutPtr->SUsPerPU;/* b blks, k-1 SUs each */
+ info->SUsPerFullTable = k * info->SUsPerTable; /* rot k times */
+ info->PUsPerBlock = k-1;
+ info->SUsPerBlock = info->PUsPerBlock * layoutPtr->SUsPerPU;
+ info->TableDepthInPUs = (b*k) / v;
+ info->FullTableDepthInPUs = info->TableDepthInPUs * k; /* k repetitions */
+
+ /* used only in distributed sparing case */
+ info->FullTablesPerSpareRegion = (v-1) / rf_gcd(r, v-1); /* (v-1)/gcd fulltables */
+ info->TablesPerSpareRegion = k * info->FullTablesPerSpareRegion;
+ info->SpareSpaceDepthPerRegionInSUs = (r * info->TablesPerSpareRegion / (v-1)) * layoutPtr->SUsPerPU;
+
+ /* check to make sure the block design is sufficiently small */
+ if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) {
+ if (info->FullTableDepthInPUs * layoutPtr->SUsPerPU + info->SpareSpaceDepthPerRegionInSUs > layoutPtr->stripeUnitsPerDisk) {
+ RF_ERRORMSG3("RAID: config error: Full Table depth (%d) + Spare Space (%d) larger than disk size (%d) (BD too big)\n",
+ (int)info->FullTableDepthInPUs,
+ (int)info->SpareSpaceDepthPerRegionInSUs,
+ (int)layoutPtr->stripeUnitsPerDisk);
+ return(EINVAL);
+ }
+ } else {
+ if (info->TableDepthInPUs * layoutPtr->SUsPerPU > layoutPtr->stripeUnitsPerDisk) {
+ RF_ERRORMSG2("RAID: config error: Table depth (%d) larger than disk size (%d) (BD too big)\n",
+				(int)(info->TableDepthInPUs * layoutPtr->SUsPerPU),
+ (int)layoutPtr->stripeUnitsPerDisk);
+ return(EINVAL);
+ }
+ }
+
+
+ /* compute the size of each disk, and the number of tables in the last fulltable (which
+ * need not be complete)
+ */
+ if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
+
+ PUsPerDisk = layoutPtr->stripeUnitsPerDisk / layoutPtr->SUsPerPU;
+ spareRegionDepthInPUs = (info->TablesPerSpareRegion * info->TableDepthInPUs +
+ (info->TablesPerSpareRegion * info->TableDepthInPUs) / (v-1));
+ info->SpareRegionDepthInSUs = spareRegionDepthInPUs * layoutPtr->SUsPerPU;
+
+ numCompleteSpareRegionsPerDisk = PUsPerDisk / spareRegionDepthInPUs;
+ info->NumCompleteSRs = numCompleteSpareRegionsPerDisk;
+ extraPUsPerDisk = PUsPerDisk % spareRegionDepthInPUs;
+
+ /* assume conservatively that we need the full amount of spare space in one region in order
+ * to provide spares for the partial spare region at the end of the array. We set "i" to
+ * the number of tables in the partial spare region. This may actually include some fulltables.
+ */
+ extraPUsPerDisk -= (info->SpareSpaceDepthPerRegionInSUs / layoutPtr->SUsPerPU);
+ if (extraPUsPerDisk <= 0) i = 0;
+ else i = extraPUsPerDisk/info->TableDepthInPUs;
+
+ complete_FT_count = raidPtr->numRow * (numCompleteSpareRegionsPerDisk * (info->TablesPerSpareRegion/k) + i/k);
+ info->FullTableLimitSUID = complete_FT_count * info->SUsPerFullTable;
+ info->ExtraTablesPerDisk = i % k;
+
+ /* note that in the last spare region, the spare space is complete even though data/parity space is not */
+ totSparePUsPerDisk = (numCompleteSpareRegionsPerDisk+1) * (info->SpareSpaceDepthPerRegionInSUs / layoutPtr->SUsPerPU);
+ info->TotSparePUsPerDisk = totSparePUsPerDisk;
+
+ layoutPtr->stripeUnitsPerDisk =
+ ((complete_FT_count/raidPtr->numRow) * info->FullTableDepthInPUs + /* data & parity space */
+ info->ExtraTablesPerDisk * info->TableDepthInPUs +
+ totSparePUsPerDisk /* spare space */
+ ) * layoutPtr->SUsPerPU;
+ layoutPtr->dataStripeUnitsPerDisk =
+ (complete_FT_count * info->FullTableDepthInPUs + info->ExtraTablesPerDisk * info->TableDepthInPUs)
+ * layoutPtr->SUsPerPU * (k-1) / k;
+
+ } else {
+ /* non-dist spare case: force each disk to contain an integral number of tables */
+ layoutPtr->stripeUnitsPerDisk /= (info->TableDepthInPUs * layoutPtr->SUsPerPU);
+ layoutPtr->stripeUnitsPerDisk *= (info->TableDepthInPUs * layoutPtr->SUsPerPU);
+
+ /* compute the number of tables in the last fulltable, which need not be complete */
+ complete_FT_count =
+ ((layoutPtr->stripeUnitsPerDisk/layoutPtr->SUsPerPU) / info->FullTableDepthInPUs) * raidPtr->numRow;
+
+ info->FullTableLimitSUID = complete_FT_count * info->SUsPerFullTable;
+ info->ExtraTablesPerDisk =
+ ((layoutPtr->stripeUnitsPerDisk/layoutPtr->SUsPerPU) / info->TableDepthInPUs) % k;
+ }
+
+ raidPtr->sectorsPerDisk = layoutPtr->stripeUnitsPerDisk * layoutPtr->sectorsPerStripeUnit;
+
+ /* find the disk offset of the stripe unit where the last fulltable starts */
+ numCompleteFullTablesPerDisk = complete_FT_count / raidPtr->numRow;
+ diskOffsetOfLastFullTableInSUs = numCompleteFullTablesPerDisk * info->FullTableDepthInPUs * layoutPtr->SUsPerPU;
+ if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
+ SpareSpaceInSUs = numCompleteSpareRegionsPerDisk * info->SpareSpaceDepthPerRegionInSUs;
+ diskOffsetOfLastFullTableInSUs += SpareSpaceInSUs;
+ info->DiskOffsetOfLastSpareSpaceChunkInSUs =
+ diskOffsetOfLastFullTableInSUs + info->ExtraTablesPerDisk * info->TableDepthInPUs * layoutPtr->SUsPerPU;
+ }
+ info->DiskOffsetOfLastFullTableInSUs = diskOffsetOfLastFullTableInSUs;
+ info->numCompleteFullTablesPerDisk = numCompleteFullTablesPerDisk;
+
+ /* 4. create and initialize the lookup tables */
+ info->LayoutTable = rf_make_2d_array(b, k, raidPtr->cleanupList);
+ if (info->LayoutTable == NULL)
+ return(ENOMEM);
+ info->OffsetTable = rf_make_2d_array(b, k, raidPtr->cleanupList);
+ if (info->OffsetTable == NULL)
+ return(ENOMEM);
+ info->BlockTable = rf_make_2d_array(info->TableDepthInPUs*layoutPtr->SUsPerPU, raidPtr->numCol, raidPtr->cleanupList);
+ if (info->BlockTable == NULL)
+ return(ENOMEM);
+
+ first_avail_slot = rf_make_1d_array(v, NULL);
+ if (first_avail_slot == NULL)
+ return(ENOMEM);
+
+ for (i=0; i<b; i++)
+ for (j=0; j<k; j++)
+ info->LayoutTable[i][j] = *cfgBuf++;
+
+ /* initialize offset table */
+ for (i=0; i<b; i++) for (j=0; j<k; j++) {
+ info->OffsetTable[i][j] = first_avail_slot[ info->LayoutTable[i][j] ];
+ first_avail_slot[ info->LayoutTable[i][j] ]++;
+ }
+
+ /* initialize block table */
+ for (SUID=l=0; l<layoutPtr->SUsPerPU; l++) {
+ for (i=0; i<b; i++) {
+ for (j=0; j<k; j++) {
+ info->BlockTable[ (info->OffsetTable[i][j] * layoutPtr->SUsPerPU) + l ]
+ [ info->LayoutTable[i][j] ] = SUID;
+ }
+ SUID++;
+ }
+ }
+
+ rf_free_1d_array(first_avail_slot, v);
+
+ /* 5. set up the remaining redundant-but-useful parameters */
+
+ raidPtr->totalSectors = (k*complete_FT_count + raidPtr->numRow*info->ExtraTablesPerDisk) *
+ info->SUsPerTable * layoutPtr->sectorsPerStripeUnit;
+ layoutPtr->numStripe = (raidPtr->totalSectors / layoutPtr->sectorsPerStripeUnit) / (k-1);
+
+ /* strange evaluation order below to try and minimize overflow problems */
+
+ layoutPtr->dataSectorsPerStripe = (k-1) * layoutPtr->sectorsPerStripeUnit;
+ layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit << raidPtr->logBytesPerSector;
+ layoutPtr->numDataCol = k-1;
+ layoutPtr->numParityCol = 1;
+
+ return(0);
+}
+
+/* declustering with distributed sparing */
+static void rf_ShutdownDeclusteredDS(RF_ThreadArg_t);
+static void rf_ShutdownDeclusteredDS(arg)
+ RF_ThreadArg_t arg;
+{
+ RF_DeclusteredConfigInfo_t *info;
+ RF_Raid_t *raidPtr;
+
+ raidPtr = (RF_Raid_t *)arg;
+ info = (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
+ if (info->SpareTable)
+ rf_FreeSpareTable(raidPtr);
+}
+
+int rf_ConfigureDeclusteredDS(
+ RF_ShutdownList_t **listp,
+ RF_Raid_t *raidPtr,
+ RF_Config_t *cfgPtr)
+{
+ int rc;
+
+ rc = rf_ConfigureDeclustered(listp, raidPtr, cfgPtr);
+ if (rc)
+ return(rc);
+ rc = rf_ShutdownCreate(listp, rf_ShutdownDeclusteredDS, raidPtr);
+ if (rc) {
+ RF_ERRORMSG1("Got %d adding shutdown event for DeclusteredDS\n", rc);
+ rf_ShutdownDeclusteredDS(raidPtr);
+ return(rc);
+ }
+ return(0);
+}
+
+void rf_MapSectorDeclustered(raidPtr, raidSector, row, col, diskSector, remap)
+ RF_Raid_t *raidPtr;
+ RF_RaidAddr_t raidSector;
+ RF_RowCol_t *row;
+ RF_RowCol_t *col;
+ RF_SectorNum_t *diskSector;
+ int remap;
+{
+ RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
+ RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
+ RF_StripeNum_t SUID = raidSector / layoutPtr->sectorsPerStripeUnit;
+ RF_StripeNum_t FullTableID, FullTableOffset, TableID, TableOffset;
+ RF_StripeNum_t BlockID, BlockOffset, RepIndex;
+ RF_StripeCount_t sus_per_fulltable = info->SUsPerFullTable;
+ RF_StripeCount_t fulltable_depth = info->FullTableDepthInPUs * layoutPtr->SUsPerPU;
+ RF_StripeNum_t base_suid = 0, outSU, SpareRegion=0, SpareSpace=0;
+
+ rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable, &fulltable_depth, &base_suid);
+
+ FullTableID = SUID / sus_per_fulltable; /* fulltable ID within array (across rows) */
+ if (raidPtr->numRow == 1) *row = 0; /* avoid a mod and a div in the common case */
+ else {
+ *row = FullTableID % raidPtr->numRow;
+ FullTableID /= raidPtr->numRow; /* convert to fulltable ID on this disk */
+ }
+ if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
+ SpareRegion = FullTableID / info->FullTablesPerSpareRegion;
+ SpareSpace = SpareRegion * info->SpareSpaceDepthPerRegionInSUs;
+ }
+ FullTableOffset = SUID % sus_per_fulltable;
+ TableID = FullTableOffset / info->SUsPerTable;
+ TableOffset = FullTableOffset - TableID * info->SUsPerTable;
+ BlockID = TableOffset / info->PUsPerBlock;
+ BlockOffset = TableOffset - BlockID * info->PUsPerBlock;
+ BlockID %= info->BlocksPerTable;
+ RepIndex = info->PUsPerBlock - TableID;
+ if (!raidPtr->noRotate) BlockOffset += ((BlockOffset >= RepIndex) ? 1 : 0);
+ *col = info->LayoutTable[BlockID][BlockOffset];
+
+ /* remap to distributed spare space if indicated */
+ if (remap) {
+ RF_ASSERT( raidPtr->Disks[*row][*col].status == rf_ds_reconstructing || raidPtr->Disks[*row][*col].status == rf_ds_dist_spared ||
+ (rf_copyback_in_progress && raidPtr->Disks[*row][*col].status == rf_ds_optimal));
+ rf_remap_to_spare_space(layoutPtr, info, *row, FullTableID, TableID, BlockID, (base_suid) ? 1 : 0, SpareRegion, col, &outSU);
+ } else {
+
+ outSU = base_suid;
+ outSU += FullTableID * fulltable_depth; /* offs to strt of FT */
+ outSU += SpareSpace; /* skip rsvd spare space */
+ outSU += TableID * info->TableDepthInPUs * layoutPtr->SUsPerPU; /* offs to strt of tble */
+ outSU += info->OffsetTable[BlockID][BlockOffset] * layoutPtr->SUsPerPU; /* offs to the PU */
+ }
+ outSU += TableOffset / (info->BlocksPerTable * info->PUsPerBlock); /* offs to the SU within a PU */
+
+ /* convert SUs to sectors, and, if not aligned to SU boundary, add in offset to sector. */
+ *diskSector = outSU*layoutPtr->sectorsPerStripeUnit + (raidSector % layoutPtr->sectorsPerStripeUnit);
+
+ RF_ASSERT( *col != -1 );
+}
+
+
+/* prototyping this inexplicably causes the compile of the layout table (rf_layout.c) to fail */
+void rf_MapParityDeclustered(
+ RF_Raid_t *raidPtr,
+ RF_RaidAddr_t raidSector,
+ RF_RowCol_t *row,
+ RF_RowCol_t *col,
+ RF_SectorNum_t *diskSector,
+ int remap)
+{
+ RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
+ RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
+ RF_StripeNum_t SUID = raidSector / layoutPtr->sectorsPerStripeUnit;
+ RF_StripeNum_t FullTableID, FullTableOffset, TableID, TableOffset;
+ RF_StripeNum_t BlockID, BlockOffset, RepIndex;
+ RF_StripeCount_t sus_per_fulltable = info->SUsPerFullTable;
+ RF_StripeCount_t fulltable_depth = info->FullTableDepthInPUs * layoutPtr->SUsPerPU;
+ RF_StripeNum_t base_suid = 0, outSU, SpareRegion=0, SpareSpace=0;
+
+ rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable, &fulltable_depth, &base_suid);
+
+ /* compute row & (possibly) spare space exactly as before */
+ FullTableID = SUID / sus_per_fulltable;
+ if (raidPtr->numRow == 1) *row = 0; /* avoid a mod and a div in the common case */
+ else {
+ *row = FullTableID % raidPtr->numRow;
+ FullTableID /= raidPtr->numRow; /* convert to fulltable ID on this disk */
+ }
+ if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) {
+ SpareRegion = FullTableID / info->FullTablesPerSpareRegion;
+ SpareSpace = SpareRegion * info->SpareSpaceDepthPerRegionInSUs;
+ }
+
+ /* compute BlockID and RepIndex exactly as before */
+ FullTableOffset = SUID % sus_per_fulltable;
+ TableID = FullTableOffset / info->SUsPerTable;
+ TableOffset = FullTableOffset - TableID * info->SUsPerTable;
+ /*TableOffset = FullTableOffset % info->SUsPerTable;*/
+ /*BlockID = (TableOffset / info->PUsPerBlock) % info->BlocksPerTable;*/
+ BlockID = TableOffset / info->PUsPerBlock;
+ /*BlockOffset = TableOffset % info->PUsPerBlock;*/
+ BlockOffset = TableOffset - BlockID * info->PUsPerBlock;
+ BlockID %= info->BlocksPerTable;
+
+ /* the parity block is in the position indicated by RepIndex */
+ RepIndex = (raidPtr->noRotate) ? info->PUsPerBlock : info->PUsPerBlock - TableID;
+ *col = info->LayoutTable[BlockID][RepIndex];
+
+ if (remap) {
+ RF_ASSERT( raidPtr->Disks[*row][*col].status == rf_ds_reconstructing || raidPtr->Disks[*row][*col].status == rf_ds_dist_spared ||
+ (rf_copyback_in_progress && raidPtr->Disks[*row][*col].status == rf_ds_optimal));
+ rf_remap_to_spare_space(layoutPtr, info, *row, FullTableID, TableID, BlockID, (base_suid) ? 1 : 0, SpareRegion, col, &outSU);
+ } else {
+
+ /* compute sector as before, except use RepIndex instead of BlockOffset */
+ outSU = base_suid;
+ outSU += FullTableID * fulltable_depth;
+ outSU += SpareSpace; /* skip rsvd spare space */
+ outSU += TableID * info->TableDepthInPUs * layoutPtr->SUsPerPU;
+ outSU += info->OffsetTable[BlockID][RepIndex] * layoutPtr->SUsPerPU;
+ }
+
+ outSU += TableOffset / (info->BlocksPerTable * info->PUsPerBlock);
+ *diskSector = outSU*layoutPtr->sectorsPerStripeUnit + (raidSector % layoutPtr->sectorsPerStripeUnit);
+
+ RF_ASSERT( *col != -1 );
+}
+
+/* returns an array of ints identifying the disks that comprise the stripe containing the indicated address.
+ * the caller must _never_ attempt to modify this array.
+ */
+void rf_IdentifyStripeDeclustered(
+ RF_Raid_t *raidPtr,
+ RF_RaidAddr_t addr,
+ RF_RowCol_t **diskids,
+ RF_RowCol_t *outRow)
+{
+ RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
+ RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
+ RF_StripeCount_t sus_per_fulltable = info->SUsPerFullTable;
+ RF_StripeCount_t fulltable_depth = info->FullTableDepthInPUs * layoutPtr->SUsPerPU;
+ RF_StripeNum_t base_suid = 0;
+ RF_StripeNum_t SUID = rf_RaidAddressToStripeUnitID(layoutPtr, addr);
+ RF_StripeNum_t stripeID, FullTableID;
+ int tableOffset;
+
+ rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable, &fulltable_depth, &base_suid);
+ FullTableID = SUID / sus_per_fulltable; /* fulltable ID within array (across rows) */
+ *outRow = FullTableID % raidPtr->numRow;
+ stripeID = rf_StripeUnitIDToStripeID(layoutPtr, SUID); /* find stripe offset into array */
+ tableOffset = (stripeID % info->BlocksPerTable); /* find offset into block design table */
+ *diskids = info->LayoutTable[tableOffset];
+}
+
+/* This returns the default head-separation limit, which is measured
+ * in "required units for reconstruction". Each time a disk fetches
+ * a unit, it bumps a counter. The head-sep code prohibits any disk
+ * from getting more than headSepLimit counter values ahead of any
+ * other.
+ *
+ * We assume here that the number of floating recon buffers is already
+ * set. There are r stripes to be reconstructed in each table, and so
+ * if we have a total of B buffers, we can have at most B/r tables
+ * under recon at any one time. In each table, lambda units are required
+ * from each disk, so given B buffers, the head sep limit has to be
+ * (lambda*B)/r units. We subtract one to avoid weird boundary cases.
+ *
+ * for example, suppose we're given 50 buffers, r=19, and lambda=4 as in
+ * the 20.5 design. There are 19 stripes/table to be reconstructed, so
+ * we can have 50/19 tables concurrently under reconstruction, which means
+ * we can allow the fastest disk to get 50/19 tables ahead of the slowest
+ * disk. There are lambda "required units" for each disk, so the fastest
+ * disk can get 4*50/19 = 10 counter values ahead of the slowest.
+ *
+ * If numBufsToAccumulate is not 1, we need to limit the head sep further
+ * because multiple bufs will be required for each stripe under recon.
+ */
+RF_HeadSepLimit_t rf_GetDefaultHeadSepLimitDeclustered(
+ RF_Raid_t *raidPtr)
+{
+ RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
+
+ return(info->Lambda * raidPtr->numFloatingReconBufs / info->TableDepthInPUs / rf_numBufsToAccumulate);
+}
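
A minimal standalone sketch, not part of the imported files, of the head-separation arithmetic described in the comment above; the numbers (B=50 buffers, r=19 stripes per table, lambda=4) are the hypothetical 20.5-design values from that comment, while the driver itself pulls them from the layout structures.

/* head-separation arithmetic sketch; all values are hypothetical */
#include <stdio.h>

int
main(void)
{
	int B = 50;		/* floating reconstruction buffers */
	int r = 19;		/* stripes to reconstruct per table */
	int lambda = 4;		/* required units per disk per table */

	/* the fastest disk may get (lambda * B) / r counter values ahead */
	printf("head-sep limit ~= %d\n", (lambda * B) / r);	/* prints 10 */
	return (0);
}
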
+
+/* returns the default number of recon buffers to use. The value
+ * is somewhat arbitrary...it's intended to be large enough to allow
+ * for a reasonably large head-sep limit, but small enough that you
+ * don't use up all your system memory with buffers.
+ */
+int rf_GetDefaultNumFloatingReconBuffersDeclustered(RF_Raid_t *raidPtr)
+{
+ return(100 * rf_numBufsToAccumulate);
+}
+
+/* sectors in the last fulltable of the array need to be handled
+ * specially since this fulltable can be incomplete. this function
+ * changes the values of certain params to handle this.
+ *
+ * the idea here is that MapSector et al. figure out which disk the
+ * addressed unit lives on by computing the modulos of the unit number
+ * with the number of units per fulltable, table, etc. In the last
+ * fulltable, there are fewer units per fulltable, so we need to adjust
+ * the number of user data units per fulltable to reflect this.
+ *
+ * so, we (1) convert the fulltable size and depth parameters to
+ * the size of the partial fulltable at the end, (2) compute the
+ * disk sector offset where this fulltable starts, and (3) convert
+ * the user's stripe unit number from an offset into the array to
+ * an offset into the last fulltable.
+ */
+void rf_decluster_adjust_params(
+ RF_RaidLayout_t *layoutPtr,
+ RF_StripeNum_t *SUID,
+ RF_StripeCount_t *sus_per_fulltable,
+ RF_StripeCount_t *fulltable_depth,
+ RF_StripeNum_t *base_suid)
+{
+ RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
+#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL)
+ /* Nothing! */
+#else
+ char pc = layoutPtr->map->parityConfig;
+#endif
+
+ if (*SUID >= info->FullTableLimitSUID) {
+ /* new full table size is size of last full table on disk */
+ *sus_per_fulltable = info->ExtraTablesPerDisk * info->SUsPerTable;
+
+ /* new full table depth is corresponding depth */
+ *fulltable_depth = info->ExtraTablesPerDisk * info->TableDepthInPUs * layoutPtr->SUsPerPU;
+
+ /* set up the new base offset */
+ *base_suid = info->DiskOffsetOfLastFullTableInSUs;
+
+    /* convert the user's array address to an offset into the last fulltable */
+ *SUID -= info->FullTableLimitSUID;
+ }
+}
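
A self-contained sketch, with made-up layout numbers, of the rebasing performed above when the addressed unit falls in the trailing partial fulltable; the real function reads these fields from RF_DeclusteredConfigInfo_t and also rescales fulltable_depth, which is omitted here for brevity.

/* partial-fulltable rebasing sketch; every constant below is hypothetical */
#include <stdio.h>

int
main(void)
{
	long SUID = 1234;			/* unit addressed by the caller */
	long FullTableLimitSUID = 1200;		/* SU where partial fulltables start */
	long ExtraTablesPerDisk = 2, SUsPerTable = 60;
	long DiskOffsetOfLastFullTableInSUs = 5000;
	long sus_per_fulltable = 300;		/* value for a complete fulltable */
	long base_suid = 0;

	if (SUID >= FullTableLimitSUID) {
		sus_per_fulltable = ExtraTablesPerDisk * SUsPerTable;	/* 120 */
		base_suid = DiskOffsetOfLastFullTableInSUs;
		SUID -= FullTableLimitSUID;	/* 34: offset into the last fulltable */
	}
	printf("SUID=%ld per_ft=%ld base=%ld\n", SUID, sus_per_fulltable, base_suid);
	return (0);
}
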
+
+/*
+ * map a stripe ID to a parity stripe ID.
+ * See comment above RaidAddressToParityStripeID in layout.c.
+ */
+void rf_MapSIDToPSIDDeclustered(
+ RF_RaidLayout_t *layoutPtr,
+ RF_StripeNum_t stripeID,
+ RF_StripeNum_t *psID,
+ RF_ReconUnitNum_t *which_ru)
+{
+ RF_DeclusteredConfigInfo_t *info;
+
+ info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
+
+ *psID = (stripeID / (layoutPtr->SUsPerPU * info->BlocksPerTable))
+ * info->BlocksPerTable + (stripeID % info->BlocksPerTable);
+ *which_ru = (stripeID % (info->BlocksPerTable * layoutPtr->SUsPerPU))
+ / info->BlocksPerTable;
+ RF_ASSERT( (*which_ru) < layoutPtr->SUsPerPU/layoutPtr->SUsPerRU);
+}
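
A worked instance of the mapping above with hypothetical layout values (SUsPerPU=4, BlocksPerTable=5), showing how one stripe ID splits into a parity stripe ID and a reconstruction-unit index.

/* SID -> PSID sketch; SUsPerPU=4 and BlocksPerTable=5 are made up */
#include <stdio.h>

int
main(void)
{
	long SUsPerPU = 4, BlocksPerTable = 5;
	long stripeID = 13;
	long psID, which_ru;

	psID = (stripeID / (SUsPerPU * BlocksPerTable)) * BlocksPerTable
	    + (stripeID % BlocksPerTable);			/* 0*5 + 3 = 3 */
	which_ru = (stripeID % (BlocksPerTable * SUsPerPU))
	    / BlocksPerTable;					/* 13/5 = 2 */
	printf("psID=%ld which_ru=%ld\n", psID, which_ru);
	return (0);
}
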
+
+/*
+ * Called from MapSector and MapParity to retarget an access at the spare unit.
+ * Modifies the "col" and "outSU" parameters only.
+ */
+void rf_remap_to_spare_space(
+ RF_RaidLayout_t *layoutPtr,
+ RF_DeclusteredConfigInfo_t *info,
+ RF_RowCol_t row,
+ RF_StripeNum_t FullTableID,
+ RF_StripeNum_t TableID,
+ RF_SectorNum_t BlockID,
+ RF_StripeNum_t base_suid,
+ RF_StripeNum_t SpareRegion,
+ RF_RowCol_t *outCol,
+ RF_StripeNum_t *outSU)
+{
+ RF_StripeNum_t ftID, spareTableStartSU, TableInSpareRegion, lastSROffset, which_ft;
+
+ /*
+ * note that FullTableID and hence SpareRegion may have gotten
+ * tweaked by rf_decluster_adjust_params. We detect this by
+ * noticing that base_suid is not 0.
+ */
+ if (base_suid == 0) {
+ ftID = FullTableID;
+ }
+ else {
+ /*
+ * There may be > 1.0 full tables in the last (i.e. partial)
+ * spare region. find out which of these we're in.
+ */
+ lastSROffset = info->NumCompleteSRs * info->SpareRegionDepthInSUs;
+ which_ft = (info->DiskOffsetOfLastFullTableInSUs - lastSROffset) / (info->FullTableDepthInPUs * layoutPtr->SUsPerPU);
+
+ /* compute the actual full table ID */
+ ftID = info->DiskOffsetOfLastFullTableInSUs / (info->FullTableDepthInPUs * layoutPtr->SUsPerPU) + which_ft;
+ SpareRegion = info->NumCompleteSRs;
+ }
+ TableInSpareRegion = (ftID * info->NumParityReps + TableID) % info->TablesPerSpareRegion;
+
+ *outCol = info->SpareTable[TableInSpareRegion][BlockID].spareDisk;
+ RF_ASSERT( *outCol != -1);
+
+ spareTableStartSU = (SpareRegion == info->NumCompleteSRs) ?
+ info->DiskOffsetOfLastFullTableInSUs + info->ExtraTablesPerDisk * info->TableDepthInPUs * layoutPtr->SUsPerPU :
+ (SpareRegion+1) * info->SpareRegionDepthInSUs - info->SpareSpaceDepthPerRegionInSUs;
+ *outSU = spareTableStartSU + info->SpareTable[TableInSpareRegion][BlockID].spareBlockOffsetInSUs;
+ if (*outSU >= layoutPtr->stripeUnitsPerDisk) {
+ printf("rf_remap_to_spare_space: invalid remapped disk SU offset %ld\n",(long)*outSU);
+ }
+}
+
+int rf_InstallSpareTable(
+ RF_Raid_t *raidPtr,
+ RF_RowCol_t frow,
+ RF_RowCol_t fcol)
+{
+ RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
+ RF_SparetWait_t *req;
+ int retcode;
+
+ RF_Malloc(req, sizeof(*req), (RF_SparetWait_t *));
+ req->C = raidPtr->numCol;
+ req->G = raidPtr->Layout.numDataCol + raidPtr->Layout.numParityCol;
+ req->fcol = fcol;
+ req->SUsPerPU = raidPtr->Layout.SUsPerPU;
+ req->TablesPerSpareRegion = info->TablesPerSpareRegion;
+ req->BlocksPerTable = info->BlocksPerTable;
+ req->TableDepthInPUs = info->TableDepthInPUs;
+ req->SpareSpaceDepthPerRegionInSUs = info->SpareSpaceDepthPerRegionInSUs;
+
+#ifndef KERNEL
+ info->SpareTable = rf_ReadSpareTable(req, info->sparemap_fname);
+ RF_Free(req, sizeof(*req));
+ retcode = (info->SpareTable) ? 0 : 1;
+#else /* !KERNEL */
+ retcode = rf_GetSpareTableFromDaemon(req);
+ RF_ASSERT(!retcode); /* XXX -- fix this to recover gracefully -- XXX */
+#endif /* !KERNEL */
+
+ return(retcode);
+}
+
+#ifdef KERNEL
+/*
+ * Invoked via ioctl to install a spare table in the kernel.
+ */
+int rf_SetSpareTable(raidPtr, data)
+ RF_Raid_t *raidPtr;
+ void *data;
+{
+ RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
+ RF_SpareTableEntry_t **ptrs;
+ int i, retcode;
+
+ /* what we need to copyin is a 2-d array, so first copyin the user pointers to the rows in the table */
+ RF_Malloc(ptrs, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *), (RF_SpareTableEntry_t **));
+ retcode = copyin((caddr_t) data, (caddr_t) ptrs, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *));
+
+ if (retcode) return(retcode);
+
+ /* now allocate kernel space for the row pointers */
+ RF_Malloc(info->SpareTable, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *), (RF_SpareTableEntry_t **));
+
+ /* now allocate kernel space for each row in the table, and copy it in from user space */
+ for (i=0; i<info->TablesPerSpareRegion; i++) {
+ RF_Malloc(info->SpareTable[i], info->BlocksPerTable * sizeof(RF_SpareTableEntry_t), (RF_SpareTableEntry_t *));
+ retcode = copyin(ptrs[i], info->SpareTable[i], info->BlocksPerTable * sizeof(RF_SpareTableEntry_t));
+ if (retcode) {
+ info->SpareTable = NULL; /* blow off the memory we've allocated */
+ return(retcode);
+ }
+ }
+
+ /* free up the temporary array we used */
+ RF_Free(ptrs, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *));
+
+ return(0);
+}
+#endif /* KERNEL */
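
The copyin sequence above expects user space to hand in an array of row pointers, each pointing at BlocksPerTable entries. The following user-side sketch shows one way such a table could be marshalled before being passed down; the struct mirrors RF_SpareTableEntry_s from rf_decluster.h, and the helper name and ioctl plumbing are placeholders, not part of the driver's interface.

/* user-side marshalling sketch; struct mirrors RF_SpareTableEntry_s */
#include <stdlib.h>

struct spare_entry {
	unsigned int spareDisk;
	unsigned int spareBlockOffsetInSUs;
};

/* builds the row-pointer array the kernel side copies in row by row */
struct spare_entry **
build_spare_table(int tablesPerSpareRegion, int blocksPerTable)
{
	struct spare_entry **rows;
	int i;

	rows = calloc(tablesPerSpareRegion, sizeof(*rows));
	if (rows == NULL)
		return (NULL);
	for (i = 0; i < tablesPerSpareRegion; i++) {
		rows[i] = calloc(blocksPerTable, sizeof(**rows));
		if (rows[i] == NULL)
			return (NULL);	/* (cleanup omitted in this sketch) */
	}
	/*
	 * caller fills rows[i][j].spareDisk / .spareBlockOffsetInSUs,
	 * then passes "rows" as the ioctl data pointer.
	 */
	return (rows);
}
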
+
+RF_ReconUnitCount_t rf_GetNumSpareRUsDeclustered(raidPtr)
+ RF_Raid_t *raidPtr;
+{
+ RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
+
+ return( ((RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo)->TotSparePUsPerDisk );
+}
+
+
+void rf_FreeSpareTable(raidPtr)
+ RF_Raid_t *raidPtr;
+{
+ long i;
+ RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
+ RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
+ RF_SpareTableEntry_t **table = info->SpareTable;
+
+ for (i=0; i<info->TablesPerSpareRegion; i++) {RF_Free(table[i], info->BlocksPerTable * sizeof(RF_SpareTableEntry_t));}
+ RF_Free(table, info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *));
+ info->SpareTable = (RF_SpareTableEntry_t **) NULL;
+}
diff --git a/sys/dev/raidframe/rf_decluster.h b/sys/dev/raidframe/rf_decluster.h
new file mode 100644
index 00000000000..5e08fa12a55
--- /dev/null
+++ b/sys/dev/raidframe/rf_decluster.h
@@ -0,0 +1,182 @@
+/* $OpenBSD: rf_decluster.h,v 1.1 1999/01/11 14:29:14 niklas Exp $ */
+/* $NetBSD: rf_decluster.h,v 1.1 1998/11/13 04:20:28 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*----------------------------------------------------------------------
+ *
+ * decluster.h -- header file for declustered layout code
+ *
+ * Adapted from raidSim version July 1994
+ * Created 10-21-92 (MCH)
+ *
+ *--------------------------------------------------------------------*/
+
+/*
+ * :
+ * Log: rf_decluster.h,v
+ * Revision 1.20 1996/07/29 14:05:12 jimz
+ * fix numPUs/numRUs confusion (everything is now numRUs)
+ * clean up some commenting, return values
+ *
+ * Revision 1.19 1996/07/13 00:00:59 jimz
+ * sanitized generalized reconstruction architecture
+ * cleaned up head sep, rbuf problems
+ *
+ * Revision 1.18 1996/06/19 17:53:48 jimz
+ * move GetNumSparePUs, InstallSpareTable ops into layout switch
+ *
+ * Revision 1.17 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.16 1996/06/09 02:36:46 jimz
+ * lots of little crufty cleanup- fixup whitespace
+ * issues, comment #ifdefs, improve typing in some
+ * places (esp size-related)
+ *
+ * Revision 1.15 1996/06/07 22:26:27 jimz
+ * type-ify which_ru (RF_ReconUnitNum_t)
+ *
+ * Revision 1.14 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.13 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.12 1996/05/24 01:59:45 jimz
+ * another checkpoint in code cleanup for release
+ * time to sync kernel tree
+ *
+ * Revision 1.11 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.10 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.9 1995/12/01 15:58:23 root
+ * added copyright info
+ *
+ * Revision 1.8 1995/11/17 18:57:02 wvcii
+ * added prototyping to MapParity
+ *
+ * Revision 1.7 1995/07/02 15:08:31 holland
+ * bug fixes related to getting distributed sparing numbers
+ *
+ * Revision 1.6 1995/06/23 13:41:18 robby
+ * updated to prototypes in rf_layout.h
+ *
+ */
+
+#ifndef _RF__RF_DECLUSTER_H_
+#define _RF__RF_DECLUSTER_H_
+
+#include "rf_types.h"
+
+/*
+ * These structures define the tables used to locate the spare unit
+ * associated with a particular data or parity unit, and to perform
+ * the associated inverse mapping.
+ */
+struct RF_SpareTableEntry_s {
+ u_int spareDisk; /* disk to which this block is spared */
+ u_int spareBlockOffsetInSUs; /* offset into spare table for that disk */
+};
+
+#define RF_SPAREMAP_NAME_LEN 128
+
+/* this is the layout-specific info structure for the declustered layout.
+ */
+struct RF_DeclusteredConfigInfo_s {
+ RF_StripeCount_t groupSize; /* no. of stripe units per parity stripe */
+ RF_RowCol_t **LayoutTable; /* the block design table */
+ RF_RowCol_t **OffsetTable; /* the sector offset table */
+ RF_RowCol_t **BlockTable; /* the block membership table */
+ RF_StripeCount_t SUsPerFullTable; /* stripe units per full table */
+ RF_StripeCount_t SUsPerTable; /* stripe units per table */
+ RF_StripeCount_t PUsPerBlock; /* parity units per block */
+ RF_StripeCount_t SUsPerBlock; /* stripe units per block */
+ RF_StripeCount_t BlocksPerTable; /* block design tuples per table */
+ RF_StripeCount_t NumParityReps; /* tables per full table */
+ RF_StripeCount_t TableDepthInPUs; /* PUs on one disk in 1 table */
+ RF_StripeCount_t FullTableDepthInPUs; /* PUs on one disk in 1 fulltable */
+ RF_StripeCount_t FullTableLimitSUID; /* SU where partial fulltables start */
+ RF_StripeCount_t ExtraTablesPerDisk; /* # of tables in last fulltable */
+ RF_SectorNum_t DiskOffsetOfLastFullTableInSUs; /* disk offs of partial ft, if any */
+ RF_StripeCount_t numCompleteFullTablesPerDisk; /* ft identifier of partial ft, if any */
+ u_int Lambda; /* the pair count in the block design */
+
+ /* these are used only in the distributed-sparing case */
+ RF_StripeCount_t FullTablesPerSpareRegion; /* # of ft's comprising 1 spare region */
+ RF_StripeCount_t TablesPerSpareRegion; /* # of tables */
+ RF_SectorCount_t SpareSpaceDepthPerRegionInSUs; /* spare space/disk/region */
+ RF_SectorCount_t SpareRegionDepthInSUs; /* # of units/disk/region */
+ RF_SectorNum_t DiskOffsetOfLastSpareSpaceChunkInSUs; /* locates sp space after partial ft */
+ RF_StripeCount_t TotSparePUsPerDisk; /* total number of spare PUs per disk */
+ RF_StripeCount_t NumCompleteSRs;
+ RF_SpareTableEntry_t **SpareTable; /* remap table for spare space */
+ char sparemap_fname[RF_SPAREMAP_NAME_LEN]; /* where to find sparemap. not used in kernel */
+};
+
+int rf_ConfigureDeclustered(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr,
+ RF_Config_t *cfgPtr);
+int rf_ConfigureDeclusteredDS(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr,
+ RF_Config_t *cfgPtr);
+
+void rf_MapSectorDeclustered(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector,
+ RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap);
+void rf_MapParityDeclustered(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector,
+ RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap);
+void rf_IdentifyStripeDeclustered(RF_Raid_t *raidPtr, RF_RaidAddr_t addr,
+ RF_RowCol_t **diskids, RF_RowCol_t *outRow);
+void rf_MapSIDToPSIDDeclustered(RF_RaidLayout_t *layoutPtr,
+ RF_StripeNum_t stripeID, RF_StripeNum_t *psID,
+ RF_ReconUnitNum_t *which_ru);
+int rf_InstallSpareTable(RF_Raid_t *raidPtr, RF_RowCol_t frow, RF_RowCol_t fcol);
+void rf_FreeSpareTable(RF_Raid_t *raidPtr);
+
+RF_HeadSepLimit_t rf_GetDefaultHeadSepLimitDeclustered(RF_Raid_t *raidPtr);
+int rf_GetDefaultNumFloatingReconBuffersDeclustered(RF_Raid_t *raidPtr);
+
+void rf_decluster_adjust_params(RF_RaidLayout_t *layoutPtr,
+ RF_StripeNum_t *SUID, RF_StripeCount_t *sus_per_fulltable,
+ RF_StripeCount_t *fulltable_depth, RF_StripeNum_t *base_suid);
+void rf_remap_to_spare_space(
+RF_RaidLayout_t *layoutPtr,
+RF_DeclusteredConfigInfo_t *info, RF_RowCol_t row, RF_StripeNum_t FullTableID,
+ RF_StripeNum_t TableID, RF_SectorNum_t BlockID, RF_StripeNum_t base_suid,
+ RF_StripeNum_t SpareRegion, RF_RowCol_t *outCol, RF_StripeNum_t *outSU);
+int rf_SetSpareTable(RF_Raid_t *raidPtr, void *data);
+RF_ReconUnitCount_t rf_GetNumSpareRUsDeclustered(RF_Raid_t *raidPtr);
+
+#endif /* !_RF__RF_DECLUSTER_H_ */
diff --git a/sys/dev/raidframe/rf_declusterPQ.c b/sys/dev/raidframe/rf_declusterPQ.c
new file mode 100644
index 00000000000..75acfa32670
--- /dev/null
+++ b/sys/dev/raidframe/rf_declusterPQ.c
@@ -0,0 +1,589 @@
+/* $OpenBSD: rf_declusterPQ.c,v 1.1 1999/01/11 14:29:14 niklas Exp $ */
+/* $NetBSD: rf_declusterPQ.c,v 1.1 1998/11/13 04:20:28 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Authors: Daniel Stodolsky, Mark Holland, Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*--------------------------------------------------
+ * rf_declusterPQ.c
+ *
+ * mapping code for declustered P & Q or declustered EvenOdd
+ * much code borrowed from rf_decluster.c
+ *
+ *--------------------------------------------------*/
+
+/*
+ * $Header: /cvs/OpenBSD/src/sys/dev/raidframe/Attic/rf_declusterPQ.c,v 1.1 1999/01/11 14:29:14 niklas Exp $
+ *
+ * Log: rf_declusterPQ.c,v
+ * Revision 1.34 1996/08/21 19:47:14 jimz
+ * fix bogus return values from config
+ *
+ * Revision 1.33 1996/08/21 15:09:16 jimz
+ * cleanup debugging spoo
+ *
+ * Revision 1.32 1996/08/21 04:13:36 jimz
+ * debug with EvenOdd
+ *
+ * Revision 1.31 1996/08/20 22:41:54 jimz
+ * 2 parity disks, not 1
+ *
+ * Revision 1.30 1996/07/31 16:56:18 jimz
+ * dataBytesPerStripe, sectorsPerDisk init arch-indep.
+ *
+ * Revision 1.29 1996/07/18 22:57:14 jimz
+ * port simulator to AIX
+ *
+ * Revision 1.28 1996/07/13 00:00:59 jimz
+ * sanitized generalized reconstruction architecture
+ * cleaned up head sep, rbuf problems
+ *
+ * Revision 1.27 1996/06/11 08:45:12 jimz
+ * improved error-checking on array configuration
+ *
+ * Revision 1.26 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.25 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.24 1996/06/02 17:31:48 jimz
+ * Moved a lot of global stuff into array structure, where it belongs.
+ * Fixed up paritylogging, pss modules in this manner. Some general
+ * code cleanup. Removed lots of dead code, some dead files.
+ *
+ * Revision 1.23 1996/05/30 23:22:16 jimz
+ * bugfixes of serialization, timing problems
+ * more cleanup
+ *
+ * Revision 1.22 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.21 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.20 1996/05/24 01:59:45 jimz
+ * another checkpoint in code cleanup for release
+ * time to sync kernel tree
+ *
+ * Revision 1.19 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.18 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.17 1996/05/17 00:52:56 jimz
+ * RepIndex was not being initialized before the computation of
+ * RepIndexQ in MapQDeclusteredPQ(). I copied the initialization
+ * from MapParityDeclusteredPQ(). Hope that was right.
+ *
+ * Revision 1.16 1995/12/12 18:10:06 jimz
+ * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT
+ * fix 80-column brain damage in comments
+ *
+ * Revision 1.15 1995/12/01 15:57:46 root
+ * added copyright info
+ *
+ * Revision 1.14 1995/11/17 19:00:13 wvcii
+ * added prototyping to MapParity
+ * created MapQ
+ *
+ * Revision 1.13 1995/10/05 22:20:48 jimz
+ * free_1d_array() takes two args; provide them both
+ *
+ * Revision 1.12 1995/09/06 19:26:33 wvcii
+ * offset cfgBuf by sparemap length (ConfigureDeclusteredPQ)
+ *
+ * Revision 1.11 95/06/23 13:41:11 robby
+ * updated to prototypes in rf_layout.h
+ *
+ * Revision 1.10 1995/05/02 22:46:53 holland
+ * minor code cleanups.
+ *
+ * Revision 1.9 1995/03/15 20:45:23 holland
+ * distr sparing changes.
+ *
+ * Revision 1.8 1995/03/01 20:25:48 holland
+ * kernelization changes
+ *
+ * Revision 1.7 1995/02/17 19:39:56 holland
+ * added size param to all calls to Free().
+ * this is ignored at user level, but necessary in the kernel.
+ *
+ * Revision 1.6 1995/02/10 17:34:10 holland
+ * kernelization changes
+ *
+ * Revision 1.5 1995/02/03 22:31:36 holland
+ * many changes related to kernelization
+ *
+ * Revision 1.4 1995/02/01 15:13:05 holland
+ * moved #include of general.h out of raid.h and into each file
+ *
+ * Revision 1.3 1995/02/01 14:25:19 holland
+ * began changes for kernelization:
+ * changed all instances of mutex_t and cond_t to DECLARE macros
+ * converted configuration code to use config structure
+ *
+ * Revision 1.2 1994/11/28 22:13:56 danner
+ * corrected some mapping bugs.
+ *
+ */
+
+#include "rf_types.h"
+#include "rf_raid.h"
+#include "rf_configure.h"
+#include "rf_decluster.h"
+#include "rf_declusterPQ.h"
+#include "rf_debugMem.h"
+#include "rf_utils.h"
+#include "rf_alloclist.h"
+#include "rf_general.h"
+
+/* configuration code */
+
+int rf_ConfigureDeclusteredPQ(
+ RF_ShutdownList_t **listp,
+ RF_Raid_t *raidPtr,
+ RF_Config_t *cfgPtr)
+{
+ RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
+ int b, v, k, r, lambda; /* block design params */
+ int i, j, l;
+ int *first_avail_slot;
+ int complete_FT_count, SUID;
+ RF_DeclusteredConfigInfo_t *info;
+ int numCompleteFullTablesPerDisk;
+ int PUsPerDisk, spareRegionDepthInPUs, numCompleteSpareRegionsPerDisk = 0, extraPUsPerDisk;
+ int totSparePUsPerDisk;
+ int diskOffsetOfLastFullTableInSUs, SpareSpaceInSUs;
+ char *cfgBuf = (char *) (cfgPtr->layoutSpecific);
+
+ cfgBuf += RF_SPAREMAP_NAME_LEN;
+
+ b = *( (int *) cfgBuf); cfgBuf += sizeof(int);
+ v = *( (int *) cfgBuf); cfgBuf += sizeof(int);
+ k = *( (int *) cfgBuf); cfgBuf += sizeof(int);
+ r = *( (int *) cfgBuf); cfgBuf += sizeof(int);
+ lambda = *( (int *) cfgBuf); cfgBuf += sizeof(int);
+ raidPtr->noRotate = *( (int *) cfgBuf); cfgBuf += sizeof(int);
+
+ if (k <= 2) {
+ printf("RAIDFRAME: k=%d, minimum value 2\n", k);
+ return(EINVAL);
+ }
+
+ /* 1. create layout specific structure */
+ RF_MallocAndAdd(info, sizeof(RF_DeclusteredConfigInfo_t), (RF_DeclusteredConfigInfo_t *), raidPtr->cleanupList);
+ if (info == NULL)
+ return(ENOMEM);
+ layoutPtr->layoutSpecificInfo = (void *) info;
+
+ /* the sparemaps are generated assuming that parity is rotated, so we issue
+ * a warning if both distributed sparing and no-rotate are on at the same time
+ */
+ if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) && raidPtr->noRotate) {
+ RF_ERRORMSG("Warning: distributed sparing specified without parity rotation.\n");
+ }
+
+ if (raidPtr->numCol != v) {
+ RF_ERRORMSG2("RAID: config error: table element count (%d) not equal to no. of cols (%d)\n", v, raidPtr->numCol);
+ return(EINVAL);
+ }
+
+ /* 3. set up the values used in devRaidMap */
+ info->BlocksPerTable = b;
+ info->NumParityReps = info->groupSize = k;
+ info->PUsPerBlock = k-2; /* PQ */
+  info->SUsPerTable = b * info->PUsPerBlock * layoutPtr->SUsPerPU; /* b blks, k-2 data PUs each */
+ info->SUsPerFullTable = k * info->SUsPerTable; /* rot k times */
+ info->SUsPerBlock = info->PUsPerBlock * layoutPtr->SUsPerPU;
+ info->TableDepthInPUs = (b*k) / v;
+ info->FullTableDepthInPUs = info->TableDepthInPUs * k; /* k repetitions */
+
+ /* used only in distributed sparing case */
+ info->FullTablesPerSpareRegion = (v-1) / rf_gcd(r, v-1); /* (v-1)/gcd fulltables */
+ info->TablesPerSpareRegion = k * info->FullTablesPerSpareRegion;
+ info->SpareSpaceDepthPerRegionInSUs = (r * info->TablesPerSpareRegion / (v-1)) * layoutPtr->SUsPerPU;
+
+ /* check to make sure the block design is sufficiently small */
+ if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) {
+ if (info->FullTableDepthInPUs * layoutPtr->SUsPerPU + info->SpareSpaceDepthPerRegionInSUs > layoutPtr->stripeUnitsPerDisk) {
+ RF_ERRORMSG3("RAID: config error: Full Table depth (%d) + Spare Space (%d) larger than disk size (%d) (BD too big)\n",
+ (int)info->FullTableDepthInPUs,
+ (int)info->SpareSpaceDepthPerRegionInSUs,
+ (int)layoutPtr->stripeUnitsPerDisk);
+ return(EINVAL);
+ }
+ } else {
+ if (info->TableDepthInPUs * layoutPtr->SUsPerPU > layoutPtr->stripeUnitsPerDisk) {
+ RF_ERRORMSG2("RAID: config error: Table depth (%d) larger than disk size (%d) (BD too big)\n",
+ (int)(info->TableDepthInPUs * layoutPtr->SUsPerPU),
+ (int)layoutPtr->stripeUnitsPerDisk);
+ return(EINVAL);
+ }
+ }
+
+
+ /* compute the size of each disk, and the number of tables in the last fulltable (which
+ * need not be complete)
+ */
+ if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) {
+
+ PUsPerDisk = layoutPtr->stripeUnitsPerDisk / layoutPtr->SUsPerPU;
+ spareRegionDepthInPUs = (info->TablesPerSpareRegion * info->TableDepthInPUs +
+ (info->TablesPerSpareRegion * info->TableDepthInPUs) / (v-1));
+ info->SpareRegionDepthInSUs = spareRegionDepthInPUs * layoutPtr->SUsPerPU;
+
+ numCompleteSpareRegionsPerDisk = PUsPerDisk / spareRegionDepthInPUs;
+ info->NumCompleteSRs = numCompleteSpareRegionsPerDisk;
+ extraPUsPerDisk = PUsPerDisk % spareRegionDepthInPUs;
+
+ /* assume conservatively that we need the full amount of spare space in one region in order
+ * to provide spares for the partial spare region at the end of the array. We set "i" to
+ * the number of tables in the partial spare region. This may actually include some fulltables.
+ */
+ extraPUsPerDisk -= (info->SpareSpaceDepthPerRegionInSUs / layoutPtr->SUsPerPU);
+ if (extraPUsPerDisk <= 0) i = 0;
+ else i = extraPUsPerDisk/info->TableDepthInPUs;
+
+ complete_FT_count = raidPtr->numRow * (numCompleteSpareRegionsPerDisk * (info->TablesPerSpareRegion/k) + i/k);
+ info->FullTableLimitSUID = complete_FT_count * info->SUsPerFullTable;
+ info->ExtraTablesPerDisk = i % k;
+
+ /* note that in the last spare region, the spare space is complete even though data/parity space is not */
+ totSparePUsPerDisk = (numCompleteSpareRegionsPerDisk+1) * (info->SpareSpaceDepthPerRegionInSUs / layoutPtr->SUsPerPU);
+ info->TotSparePUsPerDisk = totSparePUsPerDisk;
+
+ layoutPtr->stripeUnitsPerDisk =
+ ((complete_FT_count/raidPtr->numRow) * info->FullTableDepthInPUs + /* data & parity space */
+ info->ExtraTablesPerDisk * info->TableDepthInPUs +
+ totSparePUsPerDisk /* spare space */
+ ) * layoutPtr->SUsPerPU;
+ layoutPtr->dataStripeUnitsPerDisk =
+ (complete_FT_count * info->FullTableDepthInPUs + info->ExtraTablesPerDisk * info->TableDepthInPUs)
+ * layoutPtr->SUsPerPU * (k-1) / k;
+
+ } else {
+ /* non-dist spare case: force each disk to contain an integral number of tables */
+ layoutPtr->stripeUnitsPerDisk /= (info->TableDepthInPUs * layoutPtr->SUsPerPU);
+ layoutPtr->stripeUnitsPerDisk *= (info->TableDepthInPUs * layoutPtr->SUsPerPU);
+
+ /* compute the number of tables in the last fulltable, which need not be complete */
+ complete_FT_count =
+ ((layoutPtr->stripeUnitsPerDisk/layoutPtr->SUsPerPU) / info->FullTableDepthInPUs) * raidPtr->numRow;
+
+ info->FullTableLimitSUID = complete_FT_count * info->SUsPerFullTable;
+ info->ExtraTablesPerDisk =
+ ((layoutPtr->stripeUnitsPerDisk/layoutPtr->SUsPerPU) / info->TableDepthInPUs) % k;
+ }
+
+ raidPtr->sectorsPerDisk = layoutPtr->stripeUnitsPerDisk * layoutPtr->sectorsPerStripeUnit;
+
+ /* find the disk offset of the stripe unit where the last fulltable starts */
+ numCompleteFullTablesPerDisk = complete_FT_count / raidPtr->numRow;
+ diskOffsetOfLastFullTableInSUs = numCompleteFullTablesPerDisk * info->FullTableDepthInPUs * layoutPtr->SUsPerPU;
+ if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) {
+ SpareSpaceInSUs = numCompleteSpareRegionsPerDisk * info->SpareSpaceDepthPerRegionInSUs;
+ diskOffsetOfLastFullTableInSUs += SpareSpaceInSUs;
+ info->DiskOffsetOfLastSpareSpaceChunkInSUs =
+ diskOffsetOfLastFullTableInSUs + info->ExtraTablesPerDisk * info->TableDepthInPUs * layoutPtr->SUsPerPU;
+ }
+ info->DiskOffsetOfLastFullTableInSUs = diskOffsetOfLastFullTableInSUs;
+ info->numCompleteFullTablesPerDisk = numCompleteFullTablesPerDisk;
+
+ /* 4. create and initialize the lookup tables */
+ info->LayoutTable = rf_make_2d_array(b, k, raidPtr->cleanupList);
+ if (info->LayoutTable == NULL)
+ return(ENOMEM);
+ info->OffsetTable = rf_make_2d_array(b, k, raidPtr->cleanupList);
+ if (info->OffsetTable == NULL)
+ return(ENOMEM);
+ info->BlockTable = rf_make_2d_array(info->TableDepthInPUs*layoutPtr->SUsPerPU, raidPtr->numCol, raidPtr->cleanupList);
+ if (info->BlockTable == NULL)
+ return(ENOMEM);
+
+ first_avail_slot = (int *) rf_make_1d_array(v, NULL);
+ if (first_avail_slot == NULL)
+ return(ENOMEM);
+
+ for (i=0; i<b; i++)
+ for (j=0; j<k; j++)
+ info->LayoutTable[i][j] = *cfgBuf++;
+
+ /* initialize offset table */
+ for (i=0; i<b; i++) for (j=0; j<k; j++) {
+ info->OffsetTable[i][j] = first_avail_slot[ info->LayoutTable[i][j] ];
+ first_avail_slot[ info->LayoutTable[i][j] ]++;
+ }
+
+ /* initialize block table */
+ for (SUID=l=0; l<layoutPtr->SUsPerPU; l++) {
+ for (i=0; i<b; i++) {
+ for (j=0; j<k; j++) {
+ info->BlockTable[ (info->OffsetTable[i][j] * layoutPtr->SUsPerPU) + l ]
+ [ info->LayoutTable[i][j] ] = SUID;
+ }
+ SUID++;
+ }
+ }
+
+ rf_free_1d_array(first_avail_slot, v);
+
+ /* 5. set up the remaining redundant-but-useful parameters */
+
+ raidPtr->totalSectors = (k*complete_FT_count + raidPtr->numRow*info->ExtraTablesPerDisk) *
+ info->SUsPerTable * layoutPtr->sectorsPerStripeUnit;
+ layoutPtr->numStripe = (raidPtr->totalSectors / layoutPtr->sectorsPerStripeUnit) / (k-2);
+
+ /* strange evaluation order below to try and minimize overflow problems */
+
+ layoutPtr->dataSectorsPerStripe = (k-2) * layoutPtr->sectorsPerStripeUnit;
+ layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit << raidPtr->logBytesPerSector;
+ layoutPtr->numDataCol = k-2;
+ layoutPtr->numParityCol = 2;
+
+ return(0);
+}
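
Based on the parsing at the top of rf_ConfigureDeclusteredPQ, the layout-specific configuration buffer begins with the sparemap file name, followed by six ints (b, v, k, r, lambda, noRotate) and then the b*k block-design entries, one byte each. The packing helper below only illustrates that layout; the function name, and the assumption that the sparemap name field may be left zeroed, are not taken from RAIDframe's own configuration tools.

/* layout-specific config buffer packing sketch (illustrative only) */
#include <stdlib.h>
#include <string.h>

#define SPAREMAP_NAME_LEN 128	/* mirrors RF_SPAREMAP_NAME_LEN */

char *
pack_declusterPQ_config(int b, int v, int k, int r, int lambda, int noRotate,
    const unsigned char *layoutTable /* b*k disk numbers, row-major */,
    size_t *lenp)
{
	size_t len = SPAREMAP_NAME_LEN + 6 * sizeof(int) + (size_t)b * k;
	char *buf, *p;
	int ints[6];

	if ((buf = calloc(1, len)) == NULL)
		return (NULL);
	p = buf + SPAREMAP_NAME_LEN;	/* sparemap name (left zeroed) comes first */
	ints[0] = b; ints[1] = v; ints[2] = k;
	ints[3] = r; ints[4] = lambda; ints[5] = noRotate;
	memcpy(p, ints, sizeof(ints));
	p += sizeof(ints);
	memcpy(p, layoutTable, (size_t)b * k);	/* one byte per table entry */
	*lenp = len;
	return (buf);
}
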
+
+int rf_GetDefaultNumFloatingReconBuffersPQ(RF_Raid_t *raidPtr)
+{
+ int def_decl;
+
+ def_decl = rf_GetDefaultNumFloatingReconBuffersDeclustered(raidPtr);
+ return(RF_MAX(3 * raidPtr->numCol, def_decl));
+}
+
+void rf_MapSectorDeclusteredPQ(
+ RF_Raid_t *raidPtr,
+ RF_RaidAddr_t raidSector,
+ RF_RowCol_t *row,
+ RF_RowCol_t *col,
+ RF_SectorNum_t *diskSector,
+ int remap)
+{
+ RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
+ RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
+ RF_StripeNum_t SUID = raidSector / layoutPtr->sectorsPerStripeUnit;
+ RF_StripeNum_t FullTableID, FullTableOffset, TableID, TableOffset;
+ RF_StripeNum_t BlockID, BlockOffset, RepIndex;
+ RF_StripeCount_t sus_per_fulltable = info->SUsPerFullTable;
+ RF_StripeCount_t fulltable_depth = info->FullTableDepthInPUs * layoutPtr->SUsPerPU;
+ RF_StripeNum_t base_suid = 0, outSU, SpareRegion=0, SpareSpace=0;
+
+ rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable, &fulltable_depth, &base_suid);
+
+ FullTableID = SUID / sus_per_fulltable; /* fulltable ID within array (across rows) */
+ *row = FullTableID % raidPtr->numRow;
+ FullTableID /= raidPtr->numRow; /* convert to fulltable ID on this disk */
+ if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) {
+ SpareRegion = FullTableID / info->FullTablesPerSpareRegion;
+ SpareSpace = SpareRegion * info->SpareSpaceDepthPerRegionInSUs;
+ }
+ FullTableOffset = SUID % sus_per_fulltable;
+ TableID = FullTableOffset / info->SUsPerTable;
+ TableOffset = FullTableOffset - TableID * info->SUsPerTable;
+ BlockID = TableOffset / info->PUsPerBlock;
+ BlockOffset = TableOffset - BlockID * info->PUsPerBlock;
+ BlockID %= info->BlocksPerTable;
+ RF_ASSERT(BlockOffset < info->groupSize-2 );
+ /*
+ TableIDs go from 0 .. GroupSize-1 inclusive.
+ PUsPerBlock is k-2.
+ We want the tableIDs to rotate from the
+ right, so use GroupSize
+ */
+ RepIndex = info->groupSize - 1 - TableID;
+ RF_ASSERT(RepIndex >= 0);
+ if (!raidPtr->noRotate)
+ {
+ if (TableID==0)
+ BlockOffset++; /* P on last drive, Q on first */
+ else
+ BlockOffset += ((BlockOffset >= RepIndex) ? 2 : 0); /* skip over PQ */
+ RF_ASSERT(BlockOffset < info->groupSize);
+ *col = info->LayoutTable[BlockID][BlockOffset];
+ }
+
+ /* remap to distributed spare space if indicated */
+ if (remap) {
+ rf_remap_to_spare_space(layoutPtr, info, *row, FullTableID, TableID, BlockID, (base_suid) ? 1 : 0, SpareRegion, col, &outSU);
+ } else {
+
+ outSU = base_suid;
+ outSU += FullTableID * fulltable_depth; /* offs to strt of FT */
+ outSU += SpareSpace; /* skip rsvd spare space */
+ outSU += TableID * info->TableDepthInPUs * layoutPtr->SUsPerPU; /* offs to strt of tble */
+ outSU += info->OffsetTable[BlockID][BlockOffset] * layoutPtr->SUsPerPU; /* offs to the PU */
+ }
+ outSU += TableOffset / (info->BlocksPerTable * info->PUsPerBlock); /* offs to the SU within a PU */
+
+ /* convert SUs to sectors, and, if not aligned to SU boundary, add in offset to sector */
+ *diskSector = outSU*layoutPtr->sectorsPerStripeUnit + (raidSector % layoutPtr->sectorsPerStripeUnit);
+}
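
A worked trace, for a hypothetical groupSize of k=5 (so k-2=3 data PUs per block), of the BlockOffset adjustment above when rotation is enabled: RepIndex = k-1-TableID marks the P column, Q sits in the next slot (wrapping to column 0 when TableID is 0), and data offsets at or past RepIndex are shifted over both.

/* P/Q rotation-skip sketch; k=5 is hypothetical */
#include <stdio.h>

int
main(void)
{
	int k = 5;		/* groupSize; data PUs per block = k-2 = 3 */
	int TableID, BlockOffset, RepIndex, adj;

	for (TableID = 0; TableID < k; TableID++) {
		RepIndex = k - 1 - TableID;	/* column holding P */
		for (BlockOffset = 0; BlockOffset < k - 2; BlockOffset++) {
			if (TableID == 0)
				adj = BlockOffset + 1;	/* P on last drive, Q on first */
			else
				adj = BlockOffset + ((BlockOffset >= RepIndex) ? 2 : 0);
			printf("TableID=%d off=%d -> col index %d\n",
			    TableID, BlockOffset, adj);
		}
	}
	return (0);
}
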
+
+
+void rf_MapParityDeclusteredPQ(
+ RF_Raid_t *raidPtr,
+ RF_RaidAddr_t raidSector,
+ RF_RowCol_t *row,
+ RF_RowCol_t *col,
+ RF_SectorNum_t *diskSector,
+ int remap)
+{
+ RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
+ RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
+ RF_StripeNum_t SUID = raidSector / layoutPtr->sectorsPerStripeUnit;
+ RF_StripeNum_t FullTableID, FullTableOffset, TableID, TableOffset;
+ RF_StripeNum_t BlockID, BlockOffset, RepIndex;
+ RF_StripeCount_t sus_per_fulltable = info->SUsPerFullTable;
+ RF_StripeCount_t fulltable_depth = info->FullTableDepthInPUs * layoutPtr->SUsPerPU;
+ RF_StripeNum_t base_suid = 0, outSU, SpareRegion, SpareSpace=0;
+
+ rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable, &fulltable_depth, &base_suid);
+
+ /* compute row & (possibly) spare space exactly as before */
+ FullTableID = SUID / sus_per_fulltable;
+ *row = FullTableID % raidPtr->numRow;
+ FullTableID /= raidPtr->numRow; /* convert to fulltable ID on this disk */
+ if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) {
+ SpareRegion = FullTableID / info->FullTablesPerSpareRegion;
+ SpareSpace = SpareRegion * info->SpareSpaceDepthPerRegionInSUs;
+ }
+
+ /* compute BlockID and RepIndex exactly as before */
+ FullTableOffset = SUID % sus_per_fulltable;
+ TableID = FullTableOffset / info->SUsPerTable;
+ TableOffset = FullTableOffset - TableID * info->SUsPerTable;
+ BlockID = TableOffset / info->PUsPerBlock;
+ BlockOffset = TableOffset - BlockID * info->PUsPerBlock;
+ BlockID %= info->BlocksPerTable;
+
+ /* the parity block is in the position indicated by RepIndex */
+ RepIndex = (raidPtr->noRotate) ? info->PUsPerBlock : info->groupSize - 1 - TableID;
+ *col = info->LayoutTable[BlockID][RepIndex];
+
+ if (remap)
+ RF_PANIC();
+
+ /* compute sector as before, except use RepIndex instead of BlockOffset */
+ outSU = base_suid;
+ outSU += FullTableID * fulltable_depth;
+ outSU += SpareSpace; /* skip rsvd spare space */
+ outSU += TableID * info->TableDepthInPUs * layoutPtr->SUsPerPU;
+ outSU += info->OffsetTable[BlockID][RepIndex] * layoutPtr->SUsPerPU;
+ outSU += TableOffset / (info->BlocksPerTable * info->PUsPerBlock);
+
+ *diskSector = outSU*layoutPtr->sectorsPerStripeUnit + (raidSector % layoutPtr->sectorsPerStripeUnit);
+}
+
+void rf_MapQDeclusteredPQ(
+ RF_Raid_t *raidPtr,
+ RF_RaidAddr_t raidSector,
+ RF_RowCol_t *row,
+ RF_RowCol_t *col,
+ RF_SectorNum_t *diskSector,
+ int remap)
+{
+ RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
+ RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
+ RF_StripeNum_t SUID = raidSector / layoutPtr->sectorsPerStripeUnit;
+ RF_StripeNum_t FullTableID, FullTableOffset, TableID, TableOffset;
+ RF_StripeNum_t BlockID, BlockOffset, RepIndex, RepIndexQ;
+ RF_StripeCount_t sus_per_fulltable = info->SUsPerFullTable;
+ RF_StripeCount_t fulltable_depth = info->FullTableDepthInPUs * layoutPtr->SUsPerPU;
+ RF_StripeNum_t base_suid = 0, outSU, SpareRegion, SpareSpace=0;
+
+ rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable, &fulltable_depth, &base_suid);
+
+ /* compute row & (possibly) spare space exactly as before */
+ FullTableID = SUID / sus_per_fulltable;
+ *row = FullTableID % raidPtr->numRow;
+ FullTableID /= raidPtr->numRow; /* convert to fulltable ID on this disk */
+ if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) {
+ SpareRegion = FullTableID / info->FullTablesPerSpareRegion;
+ SpareSpace = SpareRegion * info->SpareSpaceDepthPerRegionInSUs;
+ }
+
+ /* compute BlockID and RepIndex exactly as before */
+ FullTableOffset = SUID % sus_per_fulltable;
+ TableID = FullTableOffset / info->SUsPerTable;
+ TableOffset = FullTableOffset - TableID * info->SUsPerTable;
+ BlockID = TableOffset / info->PUsPerBlock;
+ BlockOffset = TableOffset - BlockID * info->PUsPerBlock;
+ BlockID %= info->BlocksPerTable;
+
+  /* the Q block is in the position indicated by RepIndexQ (the slot after P, wrapping to 0) */
+ RepIndex = (raidPtr->noRotate) ? info->PUsPerBlock : info->groupSize - 1 - TableID;
+ RepIndexQ = ((RepIndex == (info->groupSize-1)) ? 0 : RepIndex+1);
+ *col = info->LayoutTable[BlockID][RepIndexQ];
+
+ if (remap)
+ RF_PANIC();
+
+ /* compute sector as before, except use RepIndex instead of BlockOffset */
+ outSU = base_suid;
+ outSU += FullTableID * fulltable_depth;
+ outSU += SpareSpace; /* skip rsvd spare space */
+ outSU += TableID * info->TableDepthInPUs * layoutPtr->SUsPerPU;
+ outSU += TableOffset / (info->BlocksPerTable * info->PUsPerBlock);
+
+ outSU += info->OffsetTable[BlockID][RepIndexQ] * layoutPtr->SUsPerPU;
+ *diskSector = outSU*layoutPtr->sectorsPerStripeUnit + (raidSector % layoutPtr->sectorsPerStripeUnit);
+}
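
For completeness, the Q placement used above is simply the slot after P, wrapping to column 0 when P occupies the last slot; a quick check of that wrap with a hypothetical groupSize of 5:

/* Q-placement wrap sketch; groupSize=5 is hypothetical */
#include <stdio.h>

int
main(void)
{
	int groupSize = 5, TableID, RepIndex, RepIndexQ;

	for (TableID = 0; TableID < groupSize; TableID++) {
		RepIndex = groupSize - 1 - TableID;
		RepIndexQ = (RepIndex == groupSize - 1) ? 0 : RepIndex + 1;
		printf("TableID=%d P=%d Q=%d\n", TableID, RepIndex, RepIndexQ);
	}
	return (0);
}
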
+
+/* returns an array of ints identifying the disks that comprise the stripe containing the indicated address.
+ * the caller must _never_ attempt to modify this array.
+ */
+void rf_IdentifyStripeDeclusteredPQ(
+ RF_Raid_t *raidPtr,
+ RF_RaidAddr_t addr,
+ RF_RowCol_t **diskids,
+ RF_RowCol_t *outRow)
+{
+ RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
+ RF_DeclusteredConfigInfo_t *info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
+ RF_StripeCount_t sus_per_fulltable = info->SUsPerFullTable;
+ RF_StripeCount_t fulltable_depth = info->FullTableDepthInPUs * layoutPtr->SUsPerPU;
+ RF_StripeNum_t base_suid = 0;
+ RF_StripeNum_t SUID = rf_RaidAddressToStripeUnitID(layoutPtr, addr);
+ RF_StripeNum_t stripeID, FullTableID;
+ int tableOffset;
+
+ rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable, &fulltable_depth, &base_suid);
+ FullTableID = SUID / sus_per_fulltable; /* fulltable ID within array (across rows) */
+ *outRow = FullTableID % raidPtr->numRow;
+ stripeID = rf_StripeUnitIDToStripeID(layoutPtr, SUID); /* find stripe offset into array */
+ tableOffset = (stripeID % info->BlocksPerTable); /* find offset into block design table */
+ *diskids = info->LayoutTable[tableOffset];
+}
diff --git a/sys/dev/raidframe/rf_declusterPQ.h b/sys/dev/raidframe/rf_declusterPQ.h
new file mode 100644
index 00000000000..2ef5d4c220e
--- /dev/null
+++ b/sys/dev/raidframe/rf_declusterPQ.h
@@ -0,0 +1,100 @@
+/* $OpenBSD: rf_declusterPQ.h,v 1.1 1999/01/11 14:29:14 niklas Exp $ */
+/* $NetBSD: rf_declusterPQ.h,v 1.1 1998/11/13 04:20:28 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Daniel Stodolsky, Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* :
+ * Log: rf_declusterPQ.h,v
+ * Revision 1.13 1996/08/20 22:42:08 jimz
+ * missing prototype of IdentifyStripeDeclusteredPQ added
+ *
+ * Revision 1.12 1996/07/13 00:00:59 jimz
+ * sanitized generalized reconstruction architecture
+ * cleaned up head sep, rbuf problems
+ *
+ * Revision 1.11 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.10 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.9 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.8 1996/05/24 01:59:45 jimz
+ * another checkpoint in code cleanup for release
+ * time to sync kernel tree
+ *
+ * Revision 1.7 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.6 1995/12/01 15:59:20 root
+ * added copyright info
+ *
+ * Revision 1.5 1995/11/17 19:08:23 wvcii
+ * added prototyping to MapParity
+ *
+ * Revision 1.4 1995/11/07 15:30:33 wvcii
+ * changed PQDagSelect prototype
+ * function no longer generates numHdrSucc, numTermAnt
+ * removed ParityLoggingDagSelect prototype
+ *
+ * Revision 1.3 1995/06/23 13:40:57 robby
+ * updated to prototypes in rf_layout.h
+ *
+ * Revision 1.2 1995/05/02 22:46:53 holland
+ * minor code cleanups.
+ *
+ * Revision 1.1 1994/11/19 20:26:57 danner
+ * Initial revision
+ *
+ */
+
+#ifndef _RF__RF_DECLUSTERPQ_H_
+#define _RF__RF_DECLUSTERPQ_H_
+
+#include "rf_types.h"
+
+int rf_ConfigureDeclusteredPQ(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr,
+ RF_Config_t *cfgPtr);
+int rf_GetDefaultNumFloatingReconBuffersPQ(RF_Raid_t *raidPtr);
+void rf_MapSectorDeclusteredPQ(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector,
+ RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap);
+void rf_MapParityDeclusteredPQ(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector,
+ RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap);
+void rf_MapQDeclusteredPQ(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector,
+ RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap);
+void rf_IdentifyStripeDeclusteredPQ(RF_Raid_t *raidPtr, RF_RaidAddr_t addr,
+ RF_RowCol_t **diskids, RF_RowCol_t *outRow);
+
+#endif /* !_RF__RF_DECLUSTERPQ_H_ */
diff --git a/sys/dev/raidframe/rf_demo.c b/sys/dev/raidframe/rf_demo.c
new file mode 100644
index 00000000000..91212482c37
--- /dev/null
+++ b/sys/dev/raidframe/rf_demo.c
@@ -0,0 +1,506 @@
+/* $OpenBSD: rf_demo.c,v 1.1 1999/01/11 14:29:15 niklas Exp $ */
+/* $NetBSD: rf_demo.c,v 1.1 1998/11/13 04:20:28 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland, Khalil Amiri
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/**********************************************************************************
+ *
+ * rf_demo.c -- code for supporting demos. this is not actually part of the driver.
+ *
+ **********************************************************************************/
+
+/* :
+ * Log: rf_demo.c,v
+ * Revision 1.24 1996/06/17 14:38:33 jimz
+ * properly #if out RF_DEMO code
+ * fix bug in MakeConfig that was causing weird behavior
+ * in configuration routines (config was not zeroed at start)
+ * clean up genplot handling of stacks
+ *
+ * Revision 1.23 1996/06/17 03:23:09 jimz
+ * explicitly do pthread stuff (for join)
+ * NOTE: this should be changed!
+ *
+ * Revision 1.22 1996/06/14 23:15:38 jimz
+ * attempt to deal with thread GC problem
+ *
+ * Revision 1.21 1996/06/09 02:36:46 jimz
+ * lots of little crufty cleanup- fixup whitespace
+ * issues, comment #ifdefs, improve typing in some
+ * places (esp size-related)
+ *
+ * Revision 1.20 1996/06/05 18:06:02 jimz
+ * Major code cleanup. The Great Renaming is now done.
+ * Better modularity. Better typing. Fixed a bunch of
+ * synchronization bugs. Made a lot of global stuff
+ * per-desc or per-array. Removed dead code.
+ *
+ * Revision 1.19 1996/05/30 23:22:16 jimz
+ * bugfixes of serialization, timing problems
+ * more cleanup
+ *
+ * Revision 1.18 1996/05/30 11:29:41 jimz
+ * Numerous bug fixes. Stripe lock release code disagreed with the taking code
+ * about when stripes should be locked (I made it consistent: no parity, no lock)
+ * There was a lot of extra serialization of I/Os which I've removed- a lot of
+ * it was to calculate values for the cache code, which is no longer with us.
+ * More types, function, macro cleanup. Added code to properly quiesce the array
+ * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
+ * before. Fixed memory allocation, freeing bugs.
+ *
+ * Revision 1.17 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.16 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.15 1996/05/20 16:14:08 jimz
+ * switch to rf_{mutex,cond}_{init,destroy}
+ *
+ * Revision 1.14 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.13 1995/12/01 15:56:07 root
+ * added copyright info
+ *
+ */
+
+#include "rf_archs.h"
+
+#if RF_DEMO > 0
+
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <strings.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include <signal.h>
+
+#include "rf_threadstuff.h"
+#include "rf_demo.h"
+#include "rf_utils.h"
+#include "rf_general.h"
+#include "rf_options.h"
+
+#ifdef SIMULATE
+#include "rf_diskevent.h"
+#endif /* SIMULATE */
+
+static int doMax = 0; /* currently no way to set this */
+
+/****************************************************************************************
+ * fault-free demo code
+ ***************************************************************************************/
+
+static int user_iops_meter = -1;
+static int disk_iops_meter = -1;
+static int max_user_meter = -1;
+static int max_disk_meter = -1;
+static int recon_pctg_meter = -1;
+static int avg_resp_time_meter = -1;
+static int recon_time_meter = -1;
+static int ff_avg_resp_time_meter = -1;
+static int deg_avg_resp_time_meter = -1;
+static int recon_avg_resp_time_meter = -1;
+static int user_ios_ff=0;
+static int user_ios_deg=0;
+static int user_ios_recon=0;
+static long user_resp_time_sum_ff = 0;
+static long user_resp_time_sum_deg = 0;
+static long user_resp_time_sum_recon = 0;
+
+int rf_demo_op_mode = 0;
+
+RF_DECLARE_STATIC_MUTEX(iops_mutex)
+static int user_ios_so_far, disk_ios_so_far, max_user, max_disk;
+static long user_resp_time_sum_ms;
+static int recon_pctg;
+static struct timeval iops_starttime;
+#ifndef SIMULATE
+static RF_Thread_t update_thread_desc;
+#endif /* !SIMULATE */
+static int meter_update_terminate;
+
+static int meter_update_interval = 2; /* seconds between meter updates */
+static int iops_initialized = 0, recon_initialized = 0;
+
+static char *demoMeterTags[] = {"FF", "Degr", "Recon"};
+
+static int vpos=0;
+
+static int rf_CreateMeter(char *title, char *geom, char *color);
+static void rf_UpdateMeter(int meterid, int value);
+static void rf_DestroyMeter(int meterid, int killproc);
+
+void rf_startup_iops_demo(meter_vpos, C, G)
+ int meter_vpos;
+ int C;
+ int G;
+{
+ char buf[100], title[100];
+ int rc;
+
+ vpos = meter_vpos;
+ sprintf(buf, "%dx%d-0+%d",RF_DEMO_METER_WIDTH, RF_DEMO_METER_HEIGHT, vpos * (RF_DEMO_METER_HEIGHT+RF_DEMO_METER_VSPACE));
+ sprintf(title,"%s %d/%d User IOs/sec",demoMeterTags[rf_demoMeterTag],C,G);
+ user_iops_meter = rf_CreateMeter(title, buf, "black");
+ sprintf(buf, "%dx%d-%d+%d",RF_DEMO_METER_WIDTH, RF_DEMO_METER_HEIGHT, RF_DEMO_METER_WIDTH+RF_DEMO_METER_SPACING,vpos * (RF_DEMO_METER_HEIGHT+RF_DEMO_METER_VSPACE));
+ sprintf(title,"%s %d/%d Disk IOs/sec",demoMeterTags[rf_demoMeterTag],C,G);
+ disk_iops_meter = rf_CreateMeter(title, buf, "red");
+ if (doMax) {
+ sprintf(buf, "%dx%d-%d+%d",RF_DEMO_METER_WIDTH, RF_DEMO_METER_HEIGHT, 2*(RF_DEMO_METER_WIDTH+RF_DEMO_METER_SPACING),vpos * (RF_DEMO_METER_HEIGHT+RF_DEMO_METER_VSPACE));
+ sprintf(title,"%s %d/%d Avg User IOs/s",demoMeterTags[rf_demoMeterTag],C,G);
+ max_user_meter = rf_CreateMeter(title, buf, "black");
+ sprintf(buf, "%dx%d-%d+%d",RF_DEMO_METER_WIDTH, RF_DEMO_METER_HEIGHT, 3*(RF_DEMO_METER_WIDTH+RF_DEMO_METER_SPACING), vpos * (RF_DEMO_METER_HEIGHT+RF_DEMO_METER_VSPACE));
+ sprintf(title,"%s %d/%d Avg Disk IOs/s",demoMeterTags[rf_demoMeterTag],C,G);
+ max_disk_meter = rf_CreateMeter(title, buf, "red");
+ sprintf(buf, "%dx%d-%d+%d",RF_DEMO_METER_WIDTH, RF_DEMO_METER_HEIGHT, 4*(RF_DEMO_METER_WIDTH+RF_DEMO_METER_SPACING), vpos * (RF_DEMO_METER_HEIGHT+RF_DEMO_METER_VSPACE));
+ } else {
+ sprintf(buf, "%dx%d-%d+%d",RF_DEMO_METER_WIDTH, RF_DEMO_METER_HEIGHT, 2*(RF_DEMO_METER_WIDTH+RF_DEMO_METER_SPACING), vpos * (RF_DEMO_METER_HEIGHT+RF_DEMO_METER_VSPACE));
+ }
+ sprintf(title,"%s %d/%d Avg User Resp Time (ms)",demoMeterTags[rf_demoMeterTag],C,G);
+ avg_resp_time_meter = rf_CreateMeter(title, buf, "blue");
+ rc = rf_mutex_init(&iops_mutex);
+ if (rc) {
+ RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ return;
+ }
+ user_ios_so_far = disk_ios_so_far = max_user = max_disk = 0;
+ user_resp_time_sum_ms = 0;
+
+ meter_update_terminate = 0;
+#ifndef SIMULATE
+ pthread_create(&update_thread_desc, raidframe_attr_default, (pthread_startroutine_t)rf_meter_update_thread, NULL);
+#endif /* !SIMULATE */
+ gettimeofday(&iops_starttime, NULL);
+ iops_initialized = 1;
+}
+
+
+void rf_update_user_stats(resptime)
+ int resptime;
+{
+ if (!iops_initialized && !recon_initialized) return;
+ RF_LOCK_MUTEX(iops_mutex);
+ user_ios_so_far++;
+ user_resp_time_sum_ms += resptime;
+ RF_UNLOCK_MUTEX(iops_mutex);
+}
+
+void rf_update_disk_iops(val)
+ int val;
+{
+ if (!iops_initialized) return;
+ RF_LOCK_MUTEX(iops_mutex);
+ disk_ios_so_far += val;
+ RF_UNLOCK_MUTEX(iops_mutex);
+}
+
+void rf_meter_update_thread()
+{
+ struct timeval now, diff;
+ int iops, resptime;
+ float secs;
+
+#ifndef SIMULATE
+ while (!meter_update_terminate) {
+ gettimeofday(&now, NULL);
+ RF_TIMEVAL_DIFF(&iops_starttime, &now, &diff);
+ secs = ((float) diff.tv_sec) + ((float) diff.tv_usec)/1000000.0;
+#else /* !SIMULATE */
+ secs = rf_cur_time;
+#endif /* !SIMULATE */
+ if (user_iops_meter >= 0) {
+ iops = (secs!=0.0) ? (int) (((float) user_ios_so_far) / secs) : 0;
+ rf_UpdateMeter(user_iops_meter, iops);
+ if (max_user_meter && iops > max_user) {max_user = iops; rf_UpdateMeter(max_user_meter, iops);}
+ }
+
+ if (disk_iops_meter >= 0) {
+ iops = (secs!=0.0) ? (int) (((float) disk_ios_so_far) / secs) : 0;
+ rf_UpdateMeter(disk_iops_meter, iops);
+ if (max_disk_meter && iops > max_disk) {max_disk = iops; rf_UpdateMeter(max_disk_meter, iops);}
+ }
+
+ if (recon_pctg_meter >= 0) {
+ rf_UpdateMeter(recon_pctg_meter, recon_pctg);
+ }
+
+ switch (rf_demo_op_mode){
+ case RF_DEMO_FAULT_FREE:
+ resptime = (user_ios_so_far != 0) ? user_resp_time_sum_ms / user_ios_so_far : 0;
+ if (resptime && (ff_avg_resp_time_meter >=0))
+ rf_UpdateMeter(ff_avg_resp_time_meter, resptime);
+ user_ios_ff += user_ios_so_far;
+ user_resp_time_sum_ff += user_resp_time_sum_ms;
+ break;
+ case RF_DEMO_DEGRADED:
+ resptime = (user_ios_so_far != 0) ? user_resp_time_sum_ms / user_ios_so_far : 0;
+ if (resptime &&(deg_avg_resp_time_meter >=0))
+ rf_UpdateMeter(deg_avg_resp_time_meter, resptime);
+ user_ios_deg += user_ios_so_far;
+ user_resp_time_sum_deg += user_resp_time_sum_ms;
+ case RF_DEMO_RECON:
+ resptime = (user_ios_so_far != 0) ? user_resp_time_sum_ms / user_ios_so_far : 0;
+ if (resptime && (recon_avg_resp_time_meter >= 0))
+ rf_UpdateMeter(recon_avg_resp_time_meter, resptime);
+ user_ios_recon += user_ios_so_far;
+ user_resp_time_sum_recon += user_resp_time_sum_ms;
+ break;
+ default: printf("WARNING: demo meter update thread: Invalid op mode! \n");
+ }
+ user_ios_so_far = 0;
+ user_resp_time_sum_ms = 0;
+#ifndef SIMULATE
+ RF_DELAY_THREAD(1,0);
+ }
+#endif /* !SIMULATE */
+}
+
+void rf_finish_iops_demo()
+{
+ long status;
+
+ if (!iops_initialized) return;
+ iops_initialized = 0; /* make sure any subsequent update calls don't do anything */
+ meter_update_terminate = 1;
+#ifndef SIMULATE
+ pthread_join(update_thread_desc, (pthread_addr_t)&status);
+#endif /* !SIMULATE */
+
+ rf_DestroyMeter(user_iops_meter, (doMax) ? 1 : 0);
+ rf_DestroyMeter(disk_iops_meter, (doMax) ? 1 : 0);
+ rf_DestroyMeter(max_user_meter, 0);
+ rf_DestroyMeter(max_disk_meter, 0);
+ rf_DestroyMeter(avg_resp_time_meter, 0);
+ rf_mutex_destroy(&iops_mutex);
+}
+
+void rf_demo_update_mode(arg_mode)
+ int arg_mode;
+{
+ int hpos;
+ char buf[100], title[100];
+
+ switch (rf_demo_op_mode = arg_mode) {
+ case RF_DEMO_DEGRADED:
+
+ /* freeze fault-free response time meter; create degraded mode meter */
+ hpos=rf_demoMeterHpos+2;
+ sprintf(buf, "%dx%d-%d+%d",RF_DEMO_METER_WIDTH, RF_DEMO_METER_HEIGHT, hpos * (RF_DEMO_METER_WIDTH+RF_DEMO_METER_SPACING), vpos * (RF_DEMO_METER_HEIGHT+RF_DEMO_METER_VSPACE));
+ sprintf(title,"Degraded Mode Average Response Time (ms)",demoMeterTags[rf_demoMeterTag]);
+ deg_avg_resp_time_meter = rf_CreateMeter(title, buf, "purple");
+ rf_UpdateMeter(ff_avg_resp_time_meter, (user_ios_ff == 0)? 0: user_resp_time_sum_ff/user_ios_ff);
+ break;
+
+ case RF_DEMO_RECON:
+
+ /* freeze degraded mode response time meter; create recon meters */
+ hpos = rf_demoMeterHpos+1;
+ sprintf(buf, "%dx%d-%d+%d",RF_DEMO_METER_WIDTH, RF_DEMO_METER_HEIGHT, hpos * (RF_DEMO_METER_WIDTH+RF_DEMO_METER_SPACING), vpos * (RF_DEMO_METER_HEIGHT+RF_DEMO_METER_VSPACE));
+ sprintf(title,"Reconstruction Average Response Time (ms)");
+ recon_avg_resp_time_meter = rf_CreateMeter(title, buf, "darkgreen");
+ sprintf(buf, "%dx%d-%d+%d",RF_DEMO_METER_WIDTH, RF_DEMO_METER_HEIGHT, (rf_demoMeterHpos) * (RF_DEMO_METER_WIDTH + RF_DEMO_METER_SPACING), vpos * (RF_DEMO_METER_HEIGHT+RF_DEMO_METER_VSPACE));
+ sprintf(title,"Percent Complete / Recon Time");
+ recon_pctg_meter = rf_CreateMeter(title,buf,"red");
+ rf_UpdateMeter(deg_avg_resp_time_meter, (user_ios_deg == 0)? 0: user_resp_time_sum_deg/user_ios_deg);
+ break;
+
+ default: /*do nothing -- finish_recon_demo will update rest of meters */;
+ }
+
+}
+
+
+/****************************************************************************************
+ * reconstruction demo code
+ ***************************************************************************************/
+
+
+void rf_startup_recon_demo(meter_vpos, C, G, init)
+ int meter_vpos;
+ int C;
+ int G;
+ int init;
+{
+ char buf[100], title[100];
+ int rc;
+
+ vpos = meter_vpos;
+ if (init) {
+ /* init demo -- display ff resp time meter */
+ sprintf(buf, "%dx%d-%d+%d",RF_DEMO_METER_WIDTH, RF_DEMO_METER_HEIGHT, (rf_demoMeterHpos+3) * (RF_DEMO_METER_WIDTH+RF_DEMO_METER_SPACING), vpos * (RF_DEMO_METER_HEIGHT+RF_DEMO_METER_VSPACE));
+ sprintf(title,"%s %d/%d Fault-Free Avg User Resp Time (ms)",demoMeterTags[rf_demoMeterTag],C,G);
+ ff_avg_resp_time_meter = rf_CreateMeter(title, buf, "blue");
+ }
+ rc = rf_mutex_init(&iops_mutex);
+ if (rc) {
+ RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ }
+
+ meter_update_terminate = 0;
+#ifndef SIMULATE
+ pthread_create(&update_thread_desc, raidframe_attr_default, (pthread_startroutine_t)rf_meter_update_thread, NULL);
+#endif /* !SIMULATE */
+ gettimeofday(&iops_starttime, NULL);
+ recon_initialized = 1;
+}
+
+void rf_update_recon_meter(val)
+ int val;
+{
+ recon_pctg = val;
+}
+
+
+void rf_finish_recon_demo(etime)
+ struct timeval *etime;
+{
+ long status;
+ int hpos;
+
+ hpos = rf_demoMeterHpos;
+
+ recon_initialized = 0; /* make sure any subsequent
+ update calls don't do anything */
+ recon_pctg = etime->tv_sec; /* display recon time on meter */
+
+ rf_UpdateMeter(recon_avg_resp_time_meter, (user_ios_recon == 0)? 0: user_resp_time_sum_recon/user_ios_recon);
+
+ rf_UpdateMeter(recon_pctg_meter, etime->tv_sec);
+
+ meter_update_terminate = 1;
+
+#ifndef SIMULATE
+ pthread_join(update_thread_desc, (pthread_addr_t)&status); /* join the meter update thread */
+#endif /* !SIMULATE */
+ rf_DestroyMeter(recon_pctg_meter, 0);
+ rf_DestroyMeter(ff_avg_resp_time_meter, 0);
+ rf_DestroyMeter(deg_avg_resp_time_meter, 0);
+ rf_DestroyMeter(recon_avg_resp_time_meter, 0);
+ rf_mutex_destroy(&iops_mutex);
+}
+
+
+/****************************************************************************************
+ * meter manipulation code
+ ***************************************************************************************/
+
+#define MAXMETERS 50
+static struct meter_info { int sd; int pid; char name[100]; } minfo[MAXMETERS];
+static int meter_num = 0;
+
+int rf_ConfigureMeters()
+{
+ int i;
+ for (i=0; i<MAXMETERS; i++)
+ minfo[i].sd = -1;
+ return(0);
+}
+
+/* forks a dmeter process to create a 4-digit meter window
+ * "title" appears in the title bar of the meter window
+ * returns an integer handle (really a socket descriptor) by which
+ * the new meter can be accessed.
+ */
+static int rf_CreateMeter(title, geom, color)
+ char *title;
+ char *geom;
+ char *color;
+{
+ char geombuf[100], *clr;
+ int sd, pid, i, status;
+ struct sockaddr sa;
+
+ if (!geom) sprintf(geombuf,"120x40-0+%d", 50*meter_num); else sprintf(geombuf, "%s", geom);
+ clr = (color) ? color : "black";
+ sprintf(minfo[meter_num].name,"/tmp/xm_%d",meter_num);
+ unlink(minfo[meter_num].name);
+
+ if ( !(pid = fork()) ) {
+ execlp("dmeter","dmeter","-noscroll","-t",title,"-geometry",geombuf,"-sa",minfo[meter_num].name,"-fg",clr,NULL);
+ perror("rf_CreateMeter: exec failed");
+ exit(1); /* in the child: exec failed, don't fall back into the caller's code path */
+ }
+
+ sd = socket(AF_UNIX,SOCK_STREAM,0);
+ sa.sa_family = AF_UNIX;
+ strcpy(sa.sa_data, minfo[meter_num].name);
+ for (i=0; i<50; i++) { /* this gives us 25 seconds to get the meter running */
+ if ( (status = connect(sd,&sa,sizeof(sa))) != -1) break;
+#ifdef SIMULATE
+ sleep (1);
+#else /* SIMULATE */
+ RF_DELAY_THREAD(0, 500);
+#endif /* SIMULATE */
+ }
+ if (status == -1) {
+ perror("Unable to connect to meter");
+ exit(1);
+ }
+ minfo[meter_num].sd = sd;
+ minfo[meter_num].pid = pid;
+ return(meter_num++);
+}
+
+/* causes the meter to display the given value */
+void rf_UpdateMeter(meterid, value)
+ int meterid;
+ int value;
+{
+ if (write(minfo[meterid].sd, &value, sizeof(int)) != sizeof(int)) {
+ fprintf(stderr,"Unable to write to meter %d\n",meterid);
+ }
+}
+
+void rf_DestroyMeter(meterid, killproc)
+ int meterid;
+ int killproc;
+{
+ close(minfo[meterid].sd);
+ if (killproc) kill(minfo[meterid].pid, SIGTERM);
+ minfo[meterid].sd = -1;
+}
+
+int rf_ShutdownAllMeters()
+{
+ int i;
+
+ for (i=0; i<MAXMETERS; i++)
+ if (minfo[i].sd >= 0)
+ rf_DestroyMeter(i, 0);
+ return(0);
+}
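+
+/*
+ * Minimal usage sketch for the meter interface above (illustrative only;
+ * the loop, sample values, and meter title are hypothetical).  A meter is
+ * created once, fed integer samples with rf_UpdateMeter(), and torn down
+ * with rf_DestroyMeter():
+ */
+#if 0
+static void example_meter_usage()
+{
+ int m, i;
+
+ m = rf_CreateMeter("Example IOs/sec", NULL, "blue"); /* NULL geom -> default geometry */
+ if (m < 0)
+ return;
+ for (i = 0; i < 10; i++) {
+ rf_UpdateMeter(m, i * 100); /* display the current sample */
+ sleep(1);
+ }
+ rf_DestroyMeter(m, 1); /* close the socket and kill the dmeter process */
+}
+#endif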
+
+#endif /* RF_DEMO > 0 */
diff --git a/sys/dev/raidframe/rf_demo.h b/sys/dev/raidframe/rf_demo.h
new file mode 100644
index 00000000000..90a20935d57
--- /dev/null
+++ b/sys/dev/raidframe/rf_demo.h
@@ -0,0 +1,83 @@
+/* $OpenBSD: rf_demo.h,v 1.1 1999/01/11 14:29:15 niklas Exp $ */
+/* $NetBSD: rf_demo.h,v 1.1 1998/11/13 04:20:28 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland, Khalil Amiri
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* rf_demo.h
+ * some constants for demo'ing software
+ */
+
+/* :
+ * Log: rf_demo.h,v
+ * Revision 1.8 1996/06/14 23:15:38 jimz
+ * attempt to deal with thread GC problem
+ *
+ * Revision 1.7 1996/06/09 02:36:46 jimz
+ * lots of little crufty cleanup- fixup whitespace
+ * issues, comment #ifdefs, improve typing in some
+ * places (esp size-related)
+ *
+ * Revision 1.6 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.5 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.4 1995/12/01 15:58:53 root
+ * added copyright info
+ *
+ */
+
+#ifndef _RF__RF_DEMO_H_
+#define _RF__RF_DEMO_H_
+
+#include "rf_types.h"
+
+#define RF_DEMO_METER_WIDTH 300 /* how wide each meter is */
+#define RF_DEMO_METER_HEIGHT 150 /* how tall */
+#define RF_DEMO_METER_SPACING 15 /* how much space between horizontally */
+#define RF_DEMO_METER_VSPACE 20 /* how much space between vertically */
+#define RF_DEMO_FAULT_FREE 0
+#define RF_DEMO_DEGRADED 1
+#define RF_DEMO_RECON 2
+
+void rf_startup_iops_demo(int meter_vpos, int C, int G);
+void rf_update_user_stats(int resptime);
+void rf_update_disk_iops(int val);
+void rf_meter_update_thread(void);
+void rf_finish_iops_demo(void);
+void rf_demo_update_mode(int arg_mode);
+void rf_startup_recon_demo(int meter_vpos, int C, int G, int init);
+void rf_update_recon_meter(int val);
+void rf_finish_recon_demo(struct timeval *etime);
+
+extern int rf_demo_op_mode;
+
+#endif /* !_RF__RF_DEMO_H_ */
diff --git a/sys/dev/raidframe/rf_desc.h b/sys/dev/raidframe/rf_desc.h
new file mode 100644
index 00000000000..a1a8e4f3684
--- /dev/null
+++ b/sys/dev/raidframe/rf_desc.h
@@ -0,0 +1,181 @@
+/* $OpenBSD: rf_desc.h,v 1.1 1999/01/11 14:29:15 niklas Exp $ */
+/* $NetBSD: rf_desc.h,v 1.1 1998/11/13 04:20:28 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*
+ * :
+ * Log: rf_desc.h,v
+ * Revision 1.29 1996/07/22 19:52:16 jimz
+ * switched node params to RF_DagParam_t, a union of
+ * a 64-bit int and a void *, for better portability
+ * attempted hpux port, but failed partway through for
+ * lack of a single C compiler capable of compiling all
+ * source files
+ *
+ * Revision 1.28 1996/06/07 22:49:22 jimz
+ * fix up raidPtr typing
+ *
+ * Revision 1.27 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.26 1996/06/05 18:06:02 jimz
+ * Major code cleanup. The Great Renaming is now done.
+ * Better modularity. Better typing. Fixed a bunch of
+ * synchronization bugs. Made a lot of global stuff
+ * per-desc or per-array. Removed dead code.
+ *
+ * Revision 1.25 1996/06/02 17:31:48 jimz
+ * Moved a lot of global stuff into array structure, where it belongs.
+ * Fixed up paritylogging, pss modules in this manner. Some general
+ * code cleanup. Removed lots of dead code, some dead files.
+ *
+ * Revision 1.24 1996/05/30 11:29:41 jimz
+ * Numerous bug fixes. Stripe lock release code disagreed with the taking code
+ * about when stripes should be locked (I made it consistent: no parity, no lock)
+ * There was a lot of extra serialization of I/Os which I've removed- a lot of
+ * it was to calculate values for the cache code, which is no longer with us.
+ * More types, function, macro cleanup. Added code to properly quiesce the array
+ * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
+ * before. Fixed memory allocation, freeing bugs.
+ *
+ * Revision 1.23 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.22 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.21 1996/05/24 04:28:55 jimz
+ * release cleanup ckpt
+ *
+ * Revision 1.20 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.19 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.18 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.17 1995/12/01 15:58:43 root
+ * added copyright info
+ *
+ * Revision 1.16 1995/11/19 16:31:30 wvcii
+ * descriptors now contain an array of dag lists as opposed to a dag header
+ *
+ * Revision 1.15 1995/11/07 16:24:17 wvcii
+ * updated def of _AccessState
+ *
+ */
+
+#ifndef _RF__RF_DESC_H_
+#define _RF__RF_DESC_H_
+
+#include "rf_archs.h"
+#include "rf_types.h"
+#include "rf_etimer.h"
+#include "rf_dag.h"
+
+struct RF_RaidReconDesc_s {
+ RF_Raid_t *raidPtr; /* raid device descriptor */
+ RF_RowCol_t row; /* row of failed disk */
+ RF_RowCol_t col; /* col of failed disk */
+ int state; /* how far along the reconstruction operation has gotten */
+ RF_RaidDisk_t *spareDiskPtr; /* describes target disk for recon (not used in dist sparing) */
+ int numDisksDone; /* the number of surviving disks that have completed their work */
+ RF_RowCol_t srow; /* row ID of the spare disk (not used in dist sparing) */
+ RF_RowCol_t scol; /* col ID of the spare disk (not used in dist sparing) */
+#ifdef KERNEL
+ /*
+ * Prevent recon from hogging CPU
+ */
+ RF_Etimer_t recon_exec_timer;
+ RF_uint64 reconExecTimerRunning;
+ RF_uint64 reconExecTicks;
+ RF_uint64 maxReconExecTicks;
+#endif /* KERNEL */
+
+#if RF_RECON_STATS > 0
+ RF_uint64 hsStallCount; /* head sep stall count */
+ RF_uint64 numReconExecDelays;
+ RF_uint64 numReconEventWaits;
+#endif /* RF_RECON_STATS > 0 */
+ RF_RaidReconDesc_t *next;
+};
+
+struct RF_RaidAccessDesc_s {
+ RF_Raid_t *raidPtr; /* raid device descriptor */
+ RF_IoType_t type; /* read or write */
+ RF_RaidAddr_t raidAddress; /* starting address in raid address space */
+ RF_SectorCount_t numBlocks; /* number of blocks (sectors) to transfer */
+ RF_StripeCount_t numStripes; /* number of stripes involved in access */
+ caddr_t bufPtr; /* pointer to data buffer */
+
+#if !defined(KERNEL) && !defined(SIMULATE)
+ caddr_t obufPtr; /* real pointer to data buffer */
+#endif /* !KERNEL && !SIMULATE */
+
+ RF_RaidAccessFlags_t flags; /* flags controlling operation */
+ int state; /* index into states telling how far along the RAID operation has gotten */
+ RF_AccessState_t *states; /* array of states to be run */
+ int status; /* pass/fail status of the last operation */
+ RF_DagList_t *dagArray; /* array of dag lists, one list per stripe */
+ RF_AccessStripeMapHeader_t *asmap; /* the asm for this I/O */
+ void *bp; /* buf pointer for this RAID acc. ignored outside the kernel */
+ RF_DagHeader_t **paramDAG; /* allows the DAG to be returned to the caller after I/O completion */
+ RF_AccessStripeMapHeader_t **paramASM; /* allows the ASM to be returned to the caller after I/O completion */
+ RF_AccTraceEntry_t tracerec; /* perf monitoring information for a user access (not for dag stats) */
+ void (*callbackFunc)(RF_CBParam_t); /* callback function for this I/O */
+ void *callbackArg; /* arg to give to callback func */
+ int tid; /* debug only, user-level only: thread id of thr that did this access */
+
+ RF_AllocListElem_t *cleanupList; /* memory to be freed at the end of the access */
+
+ RF_RaidAccessDesc_t *next;
+ RF_RaidAccessDesc_t *head;
+
+ int numPending;
+
+ RF_DECLARE_MUTEX(mutex) /* these are used to implement blocking I/O */
+ RF_DECLARE_COND(cond)
+
+#ifdef SIMULATE
+ RF_Owner_t owner;
+ int async_flag;
+#endif /* SIMULATE */
+
+ RF_Etimer_t timer; /* used for timing this access */
+};
+
+#endif /* !_RF__RF_DESC_H_ */
diff --git a/sys/dev/raidframe/rf_diskevent.c b/sys/dev/raidframe/rf_diskevent.c
new file mode 100644
index 00000000000..927f9ef0e29
--- /dev/null
+++ b/sys/dev/raidframe/rf_diskevent.c
@@ -0,0 +1,291 @@
+/* $OpenBSD: rf_diskevent.c,v 1.1 1999/01/11 14:29:16 niklas Exp $ */
+/* $NetBSD: rf_diskevent.c,v 1.1 1998/11/13 04:20:28 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Rachad Youssef
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*
+ * rf_diskevent.c -- support for the disk-device module, managing a heap of future events
+ * adapted from original code by David Kotz, Song Bac Toh (1994)
+ */
+
+/* :
+ * Log: rf_diskevent.c,v
+ * Revision 1.18 1996/07/28 20:31:39 jimz
+ * i386netbsd port
+ * true/false fixup
+ *
+ * Revision 1.17 1996/07/27 16:05:19 jimz
+ * return ENOMEM if DDEventInit fails its call to InitHeap
+ *
+ * Revision 1.16 1996/06/10 12:06:24 jimz
+ * fix spelling errors
+ *
+ * Revision 1.15 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.14 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.13 1996/06/02 17:31:48 jimz
+ * Moved a lot of global stuff into array structure, where it belongs.
+ * Fixed up paritylogging, pss modules in this manner. Some general
+ * code cleanup. Removed lots of dead code, some dead files.
+ *
+ * Revision 1.12 1996/05/30 11:29:41 jimz
+ * Numerous bug fixes. Stripe lock release code disagreed with the taking code
+ * about when stripes should be locked (I made it consistent: no parity, no lock)
+ * There was a lot of extra serialization of I/Os which I've removed- a lot of
+ * it was to calculate values for the cache code, which is no longer with us.
+ * More types, function, macro cleanup. Added code to properly quiesce the array
+ * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
+ * before. Fixed memory allocation, freeing bugs.
+ *
+ * Revision 1.11 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.10 1996/05/24 04:28:55 jimz
+ * release cleanup ckpt
+ *
+ * Revision 1.9 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.8 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.7 1995/12/01 15:57:56 root
+ * added copyright info
+ *
+ */
+
+#include "rf_types.h"
+#include "rf_heap.h"
+#include "rf_diskevent.h"
+#include "rf_general.h"
+#include "rf_dag.h"
+#include "rf_diskthreads.h"
+#include "rf_states.h"
+#include "rf_shutdown.h"
+
+/* trace printing can be turned on/off in the Makefile */
+
+RF_TICS_t rf_cur_time;
+static RF_Owner_t cur_owner;
+static RF_Heap_t heap;
+
+static void rf_DDEventShutdown(ignored)
+ void *ignored;
+{
+ rf_FreeHeap(heap);
+}
+
+/* ======================================================================== */
+/* DDEventInit
+ *
+ * Initialize the event heap.
+ */
+int rf_DDEventInit(listp)
+ RF_ShutdownList_t **listp;
+{
+ int rc;
+
+ heap = rf_InitHeap(RF_HEAP_MAX); /* initialize the heap */
+ if (heap == NULL)
+ return(ENOMEM);
+ rc = rf_ShutdownCreate(listp, rf_DDEventShutdown, NULL);
+ if (rc) {
+ RF_ERRORMSG3("RAIDFRAME: failed creating shutdown event file %s line %d rc=%d\n",
+ __FILE__, __LINE__, rc);
+ rf_FreeHeap(heap);
+ return(rc);
+ }
+ rf_cur_time=(RF_TICS_t)0;
+ return(0);
+}
+
+
+
+/* DDEventRequest
+ *
+ * Put an event request into the event heap.
+ */
+void rf_DDEventRequest(
+ RF_TICS_t eventTime,
+ int (*CompleteFunc)(),
+ void *argument,
+ RF_Owner_t owner,
+ RF_RowCol_t row,
+ RF_RowCol_t col,
+ RF_Raid_t *raidPtr,
+ void *diskid)
+{
+ RF_HeapData_t *hpdat;
+
+ RF_Malloc(hpdat,sizeof(RF_HeapData_t),(RF_HeapData_t *) );
+ if (hpdat == NULL) {
+ fprintf(stderr, "DDEventRequest: malloc failed\n");
+ RF_PANIC();
+ }
+
+ hpdat->eventTime = eventTime;
+ hpdat->CompleteFunc = CompleteFunc;
+ hpdat->argument = argument;
+ hpdat->owner = owner;
+ hpdat->row = row;
+ hpdat->col = col;
+ hpdat->raidPtr = raidPtr;
+ hpdat->diskid = diskid;
+ rf_AddHeap(heap, hpdat, (hpdat->eventTime));
+}
+
+void rf_DAGEventRequest(
+ RF_TICS_t eventTime,
+ RF_Owner_t owner,
+ RF_RowCol_t row,
+ RF_RowCol_t col,
+ RF_RaidAccessDesc_t *desc,
+ RF_Raid_t *raidPtr)
+{
+ RF_HeapData_t *hpdat;
+
+ RF_Malloc(hpdat,sizeof(RF_HeapData_t),(RF_HeapData_t *) );
+ if (hpdat == NULL) {
+ fprintf(stderr, "DAGEventRequest: malloc failed\n");
+ RF_PANIC();
+ }
+
+ hpdat->eventTime = eventTime;
+ hpdat->CompleteFunc = NULL;
+ hpdat->argument = NULL;
+ hpdat->owner = owner;
+ hpdat->row = row;
+ hpdat->col = col;
+ hpdat->desc=desc;
+ hpdat->raidPtr = raidPtr;
+
+ rf_AddHeap(heap, hpdat, (hpdat->eventTime));
+}
+
+
+/* ------------------------------------------------------------------------ */
+/* @SUBTITLE "Print out the request queue" */
+/* There is only 1 request queue so no argument is needed for this
+ function */
+void rf_DDPrintRequests()
+{
+ RF_HeapData_t *Hpdat;
+ RF_HeapKey_t Hpkey;
+ RF_Heap_t tempHp;
+
+ printf("Events on heap:\n");
+
+ tempHp = rf_InitHeap(RF_HEAP_MAX);
+ while (rf_RemHeap(heap, &Hpdat, &Hpkey) != RF_HEAP_NONE)
+ {
+ printf ("at %5g HpKey there is: something for owner %d at disk %d %d\n",Hpkey,
+ Hpdat->owner,Hpdat->row,Hpdat->col);
+ rf_AddHeap(tempHp, Hpdat, Hpdat->eventTime);
+ }
+
+ printf("END heap:\n");
+ rf_FreeHeap(heap); /* free the empty old heap */
+
+ heap = tempHp; /* restore the recycled heap */
+}
+/* ------------------------------------------------------------------------ */
+
+int rf_ProcessEvent()
+{
+ RF_HeapData_t *Hpdat;
+ RF_HeapKey_t Hpkey;
+ int retcode;
+
+ retcode = rf_RemHeap(heap, &Hpdat, &Hpkey);
+
+ if (retcode==RF_HEAP_FOUND) {
+ if (rf_eventDebug) {
+ rf_DDPrintRequests();
+ printf ("Now processing: at %5g something for owner %d at disk %d %d\n",
+ Hpkey, Hpdat->owner, Hpdat->row, Hpdat->col);
+ }
+ rf_cur_time=Hpkey;
+
+ rf_SetCurrentOwner(Hpdat->owner);
+
+ if (Hpdat->row>=0) { /* disk event: I/O completion for a node of an ongoing DAG */
+ rf_SetDiskIdle (Hpdat->raidPtr, Hpdat->row, Hpdat->col);
+ if (Hpdat->diskid != NULL) {
+ rf_simulator_complete_io(Hpdat->diskid);
+ }
+ retcode=(Hpdat->CompleteFunc)(Hpdat->argument,0);
+ if (retcode==RF_HEAP_FOUND)
+ (((RF_DagNode_t *) (Hpdat->argument))->dagHdr->cbFunc)(((RF_DagNode_t *) (Hpdat->argument))->dagHdr->cbArg);
+ RF_Free(Hpdat,sizeof(RF_HeapData_t));
+ return(retcode);
+ }
+ else {
+ /* this is a dag event or reconstruction event */
+ if (Hpdat->row==RF_DD_DAGEVENT_ROW){ /* dag event */
+ rf_ContinueRaidAccess(Hpdat->desc);
+ retcode = RF_FALSE;
+ RF_Free(Hpdat,sizeof(RF_HeapData_t));
+ return (RF_FALSE);
+ }
+ else {
+ /* recon event */
+ retcode=(Hpdat->CompleteFunc)(Hpdat->argument,0);
+ retcode = RF_FALSE;
+ RF_Free(Hpdat,sizeof(RF_HeapData_t));
+ return (RF_FALSE);
+ }
+ }
+ }
+ if (rf_eventDebug)
+ printf("HEAP is empty\n");
+ return(RF_DD_NOTHING_THERE);
+}
+
+RF_Owner_t rf_GetCurrentOwner()
+{
+ return(cur_owner);
+}
+
+void rf_SetCurrentOwner(RF_Owner_t owner)
+{
+ cur_owner=owner;
+}
+
+RF_TICS_t rf_CurTime()
+{
+ return(rf_cur_time);
+}
diff --git a/sys/dev/raidframe/rf_diskevent.h b/sys/dev/raidframe/rf_diskevent.h
new file mode 100644
index 00000000000..103ddde7d13
--- /dev/null
+++ b/sys/dev/raidframe/rf_diskevent.h
@@ -0,0 +1,97 @@
+/* $OpenBSD: rf_diskevent.h,v 1.1 1999/01/11 14:29:16 niklas Exp $ */
+/* $NetBSD: rf_diskevent.h,v 1.1 1998/11/13 04:20:28 oster Exp $ */
+/*
+ * rf_diskevent.h
+ * Adapted from original code by David Kotz (1994)
+ *
+ * The disk-device module is event driven. This module keeps the event
+ * request mechanism, which is based on proteus SimRequests,
+ * abstracted away from the bulk of the disk device code.
+ *
+ * Functions
+ * rf_DDEventInit
+ * rf_DDEventRequest
+ * rf_DAGEventRequest
+ * rf_DDPrintRequests
+ * rf_ProcessEvent
+ */
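+
+/*
+ * Typical use (illustrative sketch; the surrounding simulator loop is
+ * hypothetical): events are posted with rf_DDEventRequest() or
+ * rf_DAGEventRequest(), and the simulator drains the heap by calling
+ * rf_ProcessEvent() until it reports an empty heap, advancing
+ * rf_cur_time as each event is handled:
+ *
+ *	while (rf_ProcessEvent() != RF_DD_NOTHING_THERE)
+ *		;
+ */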
+
+/* :
+ * Log: rf_diskevent.h,v
+ * Revision 1.10 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.9 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.8 1996/06/02 17:31:48 jimz
+ * Moved a lot of global stuff into array structure, where it belongs.
+ * Fixed up paritylogging, pss modules in this manner. Some general
+ * code cleanup. Removed lots of dead code, some dead files.
+ *
+ * Revision 1.7 1996/05/30 11:29:41 jimz
+ * Numerous bug fixes. Stripe lock release code disagreed with the taking code
+ * about when stripes should be locked (I made it consistent: no parity, no lock)
+ * There was a lot of extra serialization of I/Os which I've removed- a lot of
+ * it was to calculate values for the cache code, which is no longer with us.
+ * More types, function, macro cleanup. Added code to properly quiesce the array
+ * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
+ * before. Fixed memory allocation, freeing bugs.
+ *
+ * Revision 1.6 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.5 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.4 1995/12/01 15:57:16 root
+ * added copyright info
+ *
+ */
+
+#ifndef _RF__RF_DISKEVENT_H_
+#define _RF__RF_DISKEVENT_H_
+
+#include "rf_types.h"
+#include "rf_heap.h"
+#if !defined(__NetBSD__) && !defined(__OpenBSD__)
+#include "time.h"
+#endif
+
+#define RF_DD_NOTHING_THERE (-1)
+#define RF_DD_DAGEVENT_ROW (-3)
+#define RF_DD_DAGEVENT_COL RF_DD_DAGEVENT_ROW
+
+extern RF_TICS_t rf_cur_time;
+
+/*
+ * list of disk-device request types,
+ * initialized in diskdevice.c,
+ * used in diskevent.c
+ */
+typedef void (*RF_DDhandler)(int disk, RF_TICS_t eventTime);
+struct RF_dd_handlers_s {
+ RF_DDhandler handler; /* function implementing this event type */
+ char name[20]; /* name of that event type */
+};
+extern struct RF_dd_handlers_s rf_DDhandlers[];
+
+int rf_DDEventInit(RF_ShutdownList_t **listp);
+void rf_DDEventRequest(RF_TICS_t eventTime, int (*CompleteFunc)(),
+ void *argument, RF_Owner_t owner, RF_RowCol_t row, RF_RowCol_t col,
+ RF_Raid_t *raidPtr, void *diskid);
+void rf_DAGEventRequest(RF_TICS_t eventTime, RF_Owner_t owner,
+ RF_RowCol_t row, RF_RowCol_t col, RF_RaidAccessDesc_t *desc,
+ RF_Raid_t *raidPtr);
+void rf_DDPrintRequests(void);
+int rf_ProcessEvent(void);
+RF_Owner_t rf_GetCurrentOwner(void);
+void rf_SetCurrentOwner(RF_Owner_t owner);
+RF_TICS_t rf_CurTime(void);
+
+#endif /* !_RF__RF_DISKEVENT_H_ */
diff --git a/sys/dev/raidframe/rf_diskqueue.c b/sys/dev/raidframe/rf_diskqueue.c
new file mode 100644
index 00000000000..cd01f3c531f
--- /dev/null
+++ b/sys/dev/raidframe/rf_diskqueue.c
@@ -0,0 +1,929 @@
+/* $OpenBSD: rf_diskqueue.c,v 1.1 1999/01/11 14:29:17 niklas Exp $ */
+/* $NetBSD: rf_diskqueue.c,v 1.2 1998/12/03 14:58:24 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/****************************************************************************************
+ *
+ * rf_diskqueue.c -- higher-level disk queue code
+ *
+ * the routines here are a generic wrapper around the actual queueing
+ * routines. The code here implements thread scheduling, synchronization,
+ * and locking ops (see below) on top of the lower-level queueing code.
+ *
+ * to support atomic RMW, we implement "locking operations". When a locking op
+ * is dispatched to the lower levels of the driver, the queue is locked, and no further
+ * I/Os are dispatched until the queue receives & completes a corresponding "unlocking
+ * operation". This code relies on the higher layers to guarantee that a locking
+ * op will always be eventually followed by an unlocking op. The model is that
+ * the higher layers are structured so locking and unlocking ops occur in pairs, i.e.
+ * an unlocking op cannot be generated until after a locking op reports completion.
+ * There is no good way to check to see that an unlocking op "corresponds" to the
+ * op that currently has the queue locked, so we make no such attempt. Since by
+ * definition there can be only one locking op outstanding on a disk, this should
+ * not be a problem.
+ *
+ * In the kernel, we allow multiple I/Os to be concurrently dispatched to the disk
+ * driver. In order to support locking ops in this environment, when we decide to
+ * do a locking op, we stop dispatching new I/Os and wait until all dispatched I/Os
+ * have completed before dispatching the locking op.
+ *
+ * Unfortunately, the code is different in the 3 different operating states
+ * (user level, kernel, simulator). In the kernel, I/O is non-blocking, and
+ * we have no disk threads to dispatch for us. Therefore, we have to dispatch
+ * new I/Os to the scsi driver at the time of enqueue, and also at the time
+ * of completion. At user level, I/O is blocking, and so only the disk threads
+ * may dispatch I/Os. Thus at user level, all we can do at enqueue time is
+ * enqueue and wake up the disk thread to do the dispatch.
+ *
+ ***************************************************************************************/
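+
+/*
+ * Illustrative sketch of the locking-op protocol described above (not part
+ * of the driver proper; the flag names RF_LOCK_DISK_QUEUE and
+ * RF_UNLOCK_DISK_QUEUE, the request variables, and the waiting steps are
+ * assumed here for illustration).  A read-modify-write on one disk is
+ * bracketed by a locking op and a matching unlocking op, and the unlocking
+ * op is only issued after the locking op has reported completion:
+ */
+#if 0
+static void example_locked_rmw(RF_DiskQueue_t *q, RF_DiskQueueData_t *lockReq,
+ RF_DiskQueueData_t *rdReq, RF_DiskQueueData_t *wrReq, RF_DiskQueueData_t *unlockReq)
+{
+ /* lockReq/unlockReq carry the locking/unlocking flags in their flags field */
+ rf_DiskIOEnqueue(q, lockReq, RF_IO_NORMAL_PRIORITY); /* queue locks when this dispatches */
+ /* ... wait for lockReq's CompleteFunc before going on ... */
+ rf_DiskIOEnqueue(q, rdReq, RF_IO_NORMAL_PRIORITY); /* read old data */
+ rf_DiskIOEnqueue(q, wrReq, RF_IO_NORMAL_PRIORITY); /* write new data */
+ /* ... wait for both to complete ... */
+ rf_DiskIOEnqueue(q, unlockReq, RF_IO_NORMAL_PRIORITY); /* unlocks the queue on completion */
+}
+#endif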
+
+/*
+ * :
+ *
+ * Log: rf_diskqueue.c,v
+ * Revision 1.50 1996/08/07 21:08:38 jimz
+ * b_proc -> kb_proc
+ *
+ * Revision 1.49 1996/07/05 20:36:14 jimz
+ * make rf_ConfigureDiskQueueSystem return 0
+ *
+ * Revision 1.48 1996/06/18 20:53:11 jimz
+ * fix up disk queueing (remove configure routine,
+ * add shutdown list arg to create routines)
+ *
+ * Revision 1.47 1996/06/14 14:16:36 jimz
+ * fix handling of bogus queue type
+ *
+ * Revision 1.46 1996/06/13 20:41:44 jimz
+ * add scan, cscan, random queueing
+ *
+ * Revision 1.45 1996/06/11 01:27:50 jimz
+ * Fixed bug where diskthread shutdown would crash or hang. This
+ * turned out to be two distinct bugs:
+ * (1) [crash] The thread shutdown code wasn't properly waiting for
+ * all the diskthreads to complete. This caused diskthreads that were
+ * exiting+cleaning up to unlock a destroyed mutex.
+ * (2) [hang] TerminateDiskQueues wasn't locking, and DiskIODequeue
+ * only checked for termination _after_ a wakeup if the queues were
+ * empty. This was a race where the termination wakeup could be lost
+ * by the dequeueing thread, and the system would hang waiting for the
+ * thread to exit, while the thread waited for an I/O or a signal to
+ * check the termination flag.
+ *
+ * Revision 1.44 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.43 1996/06/09 02:36:46 jimz
+ * lots of little crufty cleanup- fixup whitespace
+ * issues, comment #ifdefs, improve typing in some
+ * places (esp size-related)
+ *
+ * Revision 1.42 1996/06/07 22:26:27 jimz
+ * type-ify which_ru (RF_ReconUnitNum_t)
+ *
+ * Revision 1.41 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.40 1996/06/06 17:28:04 jimz
+ * track sector number of last I/O dequeued
+ *
+ * Revision 1.39 1996/06/06 01:14:13 jimz
+ * fix crashing bug when tracerec is NULL (ie, from copyback)
+ * initialize req->queue
+ *
+ * Revision 1.38 1996/06/05 19:38:32 jimz
+ * fixed up disk queueing types config
+ * added sstf disk queueing
+ * fixed exit bug on diskthreads (ref-ing bad mem)
+ *
+ * Revision 1.37 1996/06/05 18:06:02 jimz
+ * Major code cleanup. The Great Renaming is now done.
+ * Better modularity. Better typing. Fixed a bunch of
+ * synchronization bugs. Made a lot of global stuff
+ * per-desc or per-array. Removed dead code.
+ *
+ * Revision 1.36 1996/05/30 23:22:16 jimz
+ * bugfixes of serialization, timing problems
+ * more cleanup
+ *
+ * Revision 1.35 1996/05/30 12:59:18 jimz
+ * make etimer happier, more portable
+ *
+ * Revision 1.34 1996/05/30 11:29:41 jimz
+ * Numerous bug fixes. Stripe lock release code disagreed with the taking code
+ * about when stripes should be locked (I made it consistent: no parity, no lock)
+ * There was a lot of extra serialization of I/Os which I've removed- a lot of
+ * it was to calculate values for the cache code, which is no longer with us.
+ * More types, function, macro cleanup. Added code to properly quiesce the array
+ * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
+ * before. Fixed memory allocation, freeing bugs.
+ *
+ * Revision 1.33 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.32 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.31 1996/05/24 01:59:45 jimz
+ * another checkpoint in code cleanup for release
+ * time to sync kernel tree
+ *
+ * Revision 1.30 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.29 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.28 1996/05/20 16:14:29 jimz
+ * switch to rf_{mutex,cond}_{init,destroy}
+ *
+ * Revision 1.27 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.26 1996/05/16 19:21:49 wvcii
+ * fixed typo in init_dqd
+ *
+ * Revision 1.25 1996/05/16 16:02:51 jimz
+ * switch to RF_FREELIST stuff for DiskQueueData
+ *
+ * Revision 1.24 1996/05/10 16:24:14 jimz
+ * new cvscan function names
+ *
+ * Revision 1.23 1996/05/01 16:27:54 jimz
+ * don't use ccmn bp management
+ *
+ * Revision 1.22 1995/12/12 18:10:06 jimz
+ * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT
+ * fix 80-column brain damage in comments
+ *
+ * Revision 1.21 1995/12/01 15:59:59 root
+ * added copyright info
+ *
+ * Revision 1.20 1995/11/07 16:27:20 wvcii
+ * added Peek() function to diskqueuesw
+ * non-locking accesses are never blocked (assume clients enforce proper
+ * respect for lock acquisition)
+ *
+ * Revision 1.19 1995/10/05 18:56:52 jimz
+ * fix req handling in IOComplete
+ *
+ * Revision 1.18 1995/10/04 20:13:50 wvcii
+ * added asserts to monitor numOutstanding queueLength
+ *
+ * Revision 1.17 1995/10/04 07:43:52 wvcii
+ * queue->numOutstanding now valid for user & sim
+ * added queue->queueLength
+ * user tested & verified, sim untested
+ *
+ * Revision 1.16 1995/09/12 00:21:19 wvcii
+ * added support for tracing disk queue time
+ *
+ */
+
+#include "rf_types.h"
+#include "rf_threadstuff.h"
+#include "rf_threadid.h"
+#include "rf_raid.h"
+#include "rf_diskqueue.h"
+#include "rf_alloclist.h"
+#include "rf_acctrace.h"
+#include "rf_etimer.h"
+#include "rf_configure.h"
+#include "rf_general.h"
+#include "rf_freelist.h"
+#include "rf_debugprint.h"
+#include "rf_shutdown.h"
+#include "rf_cvscan.h"
+#include "rf_sstf.h"
+#include "rf_fifo.h"
+
+#ifdef SIMULATE
+#include "rf_diskevent.h"
+#endif /* SIMULATE */
+
+#if !defined(__NetBSD__) && !defined(__OpenBSD__)
+extern struct buf *ubc_bufget();
+#endif
+
+static int init_dqd(RF_DiskQueueData_t *);
+static void clean_dqd(RF_DiskQueueData_t *);
+static void rf_ShutdownDiskQueueSystem(void *);
+/* From rf_kintf.c */
+int rf_DispatchKernelIO(RF_DiskQueue_t *,RF_DiskQueueData_t *);
+
+
+#define Dprintf1(s,a) if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),NULL,NULL,NULL,NULL,NULL,NULL,NULL)
+#define Dprintf2(s,a,b) if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),NULL,NULL,NULL,NULL,NULL,NULL)
+#define Dprintf3(s,a,b,c) if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),NULL,NULL,NULL,NULL,NULL)
+#define Dprintf4(s,a,b,c,d) if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),NULL,NULL,NULL,NULL)
+#define Dprintf5(s,a,b,c,d,e) if (rf_queueDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),NULL,NULL,NULL)
+
+#if !defined(KERNEL) && !defined(SIMULATE)
+
+/* queue must be locked before invoking this */
+#define SIGNAL_DISK_QUEUE(_q_,_wh_) \
+{ \
+ if ( (_q_)->numWaiting > 0) { \
+ (_q_)->numWaiting--; \
+ RF_SIGNAL_COND( ((_q_)->cond) ); \
+ } \
+}
+
+/* queue must be locked before invoking this */
+#define WAIT_DISK_QUEUE(_q_,_wh_) \
+{ \
+ (_q_)->numWaiting++; \
+ RF_WAIT_COND( ((_q_)->cond), ((_q_)->mutex) ); \
+}
+
+#else /* !defined(KERNEL) && !defined(SIMULATE) */
+
+#define SIGNAL_DISK_QUEUE(_q_,_wh_)
+#define WAIT_DISK_QUEUE(_q_,_wh_)
+
+#endif /* !defined(KERNEL) && !defined(SIMULATE) */
+
+/*****************************************************************************************
+ *
+ * the disk queue switch defines all the functions used in the different queueing
+ * disciplines
+ * queue ID, init routine, enqueue routine, dequeue routine
+ *
+ ****************************************************************************************/
+
+static RF_DiskQueueSW_t diskqueuesw[] = {
+ {"fifo", /* FIFO */
+ rf_FifoCreate,
+ rf_FifoEnqueue,
+ rf_FifoDequeue,
+ rf_FifoPeek,
+ rf_FifoPromote},
+
+ {"cvscan", /* cvscan */
+ rf_CvscanCreate,
+ rf_CvscanEnqueue,
+ rf_CvscanDequeue,
+ rf_CvscanPeek,
+ rf_CvscanPromote },
+
+ {"sstf", /* shortest seek time first */
+ rf_SstfCreate,
+ rf_SstfEnqueue,
+ rf_SstfDequeue,
+ rf_SstfPeek,
+ rf_SstfPromote},
+
+ {"scan", /* SCAN (two-way elevator) */
+ rf_ScanCreate,
+ rf_SstfEnqueue,
+ rf_ScanDequeue,
+ rf_ScanPeek,
+ rf_SstfPromote},
+
+ {"cscan", /* CSCAN (one-way elevator) */
+ rf_CscanCreate,
+ rf_SstfEnqueue,
+ rf_CscanDequeue,
+ rf_CscanPeek,
+ rf_SstfPromote},
+
+#if !defined(KERNEL) && RF_INCLUDE_QUEUE_RANDOM > 0
+ /* to make a point to Chris :-> */
+ {"random", /* random */
+ rf_FifoCreate,
+ rf_FifoEnqueue,
+ rf_RandomDequeue,
+ rf_RandomPeek,
+ rf_FifoPromote},
+#endif /* !KERNEL && RF_INCLUDE_QUEUE_RANDOM > 0 */
+};
+#define NUM_DISK_QUEUE_TYPES (sizeof(diskqueuesw)/sizeof(RF_DiskQueueSW_t))
+
+static RF_FreeList_t *rf_dqd_freelist;
+
+#define RF_MAX_FREE_DQD 256
+#define RF_DQD_INC 16
+#define RF_DQD_INITIAL 64
+
+#if defined(__NetBSD__) || defined(__OpenBSD__)
+#ifdef _KERNEL
+#include <sys/buf.h>
+#endif
+#endif
+
+static int init_dqd(dqd)
+ RF_DiskQueueData_t *dqd;
+{
+#ifdef KERNEL
+#if defined(__NetBSD__) || defined(__OpenBSD__)
+ /* XXX not sure if the following malloc is appropriate... probably not quite... */
+ dqd->bp = (struct buf *) malloc( sizeof(struct buf), M_DEVBUF, M_NOWAIT);
+ if (dqd->bp != NULL) /* M_NOWAIT allocations can fail; don't memset a NULL pointer */
+ memset(dqd->bp,0,sizeof(struct buf)); /* if you don't do it, nobody else will.. */
+ /* XXX */
+ /* printf("NEED TO IMPLEMENT THIS BETTER!\n"); */
+#else
+ dqd->bp = ubc_bufget();
+#endif
+ if (dqd->bp == NULL) {
+ return(ENOMEM);
+ }
+#endif /* KERNEL */
+ return(0);
+}
+
+static void clean_dqd(dqd)
+ RF_DiskQueueData_t *dqd;
+{
+#ifdef KERNEL
+#if defined(__NetBSD__) || defined(__OpenBSD__)
+ /* printf("NEED TO IMPLEMENT THIS BETTER(2)!\n"); */
+ /* XXX ? */
+ free( dqd->bp, M_DEVBUF );
+#else
+ ubc_buffree(dqd->bp);
+#endif
+
+#endif /* KERNEL */
+}
+
+/* configures a single disk queue */
+static int config_disk_queue(
+ RF_Raid_t *raidPtr,
+ RF_DiskQueue_t *diskqueue,
+ RF_RowCol_t r, /* row & col -- debug only. BZZT not any more... */
+ RF_RowCol_t c,
+ RF_DiskQueueSW_t *p,
+ RF_SectorCount_t sectPerDisk,
+ dev_t dev,
+ int maxOutstanding,
+ RF_ShutdownList_t **listp,
+ RF_AllocListElem_t *clList)
+{
+ int rc;
+
+ diskqueue->row = r;
+ diskqueue->col = c;
+ diskqueue->qPtr = p;
+ diskqueue->qHdr = (p->Create)(sectPerDisk, clList, listp);
+ diskqueue->dev = dev;
+ diskqueue->numOutstanding = 0;
+ diskqueue->queueLength = 0;
+ diskqueue->maxOutstanding = maxOutstanding;
+ diskqueue->curPriority = RF_IO_NORMAL_PRIORITY;
+ diskqueue->nextLockingOp = NULL;
+ diskqueue->unlockingOp = NULL;
+ diskqueue->numWaiting=0;
+ diskqueue->flags = 0;
+ diskqueue->raidPtr = raidPtr;
+#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL)
+ diskqueue->rf_cinfo = &raidPtr->raid_cinfo[r][c];
+#endif
+ rc = rf_create_managed_mutex(listp, &diskqueue->mutex);
+ if (rc) {
+ RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ return(rc);
+ }
+ rc = rf_create_managed_cond(listp, &diskqueue->cond);
+ if (rc) {
+ RF_ERRORMSG3("Unable to init cond file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ return(rc);
+ }
+ return(0);
+}
+
+static void rf_ShutdownDiskQueueSystem(ignored)
+ void *ignored;
+{
+ RF_FREELIST_DESTROY_CLEAN(rf_dqd_freelist,next,(RF_DiskQueueData_t *),clean_dqd);
+}
+
+int rf_ConfigureDiskQueueSystem(listp)
+ RF_ShutdownList_t **listp;
+{
+ int rc;
+
+ RF_FREELIST_CREATE(rf_dqd_freelist, RF_MAX_FREE_DQD,
+ RF_DQD_INC, sizeof(RF_DiskQueueData_t));
+ if (rf_dqd_freelist == NULL)
+ return(ENOMEM);
+ rc = rf_ShutdownCreate(listp, rf_ShutdownDiskQueueSystem, NULL);
+ if (rc) {
+ RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n",
+ __FILE__, __LINE__, rc);
+ rf_ShutdownDiskQueueSystem(NULL);
+ return(rc);
+ }
+ RF_FREELIST_PRIME_INIT(rf_dqd_freelist, RF_DQD_INITIAL,next,
+ (RF_DiskQueueData_t *),init_dqd);
+ return(0);
+}
+
+#ifndef KERNEL
+/* this is called prior to shutdown to wakeup everyone waiting on a disk queue
+ * and tell them to exit
+ */
+void rf_TerminateDiskQueues(raidPtr)
+ RF_Raid_t *raidPtr;
+{
+ RF_RowCol_t r, c;
+
+ raidPtr->terminate_disk_queues = 1;
+ for (r=0; r<raidPtr->numRow; r++) {
+ for (c=0; c<raidPtr->numCol + ((r==0) ? raidPtr->numSpare : 0); c++) {
+ RF_LOCK_QUEUE_MUTEX(&raidPtr->Queues[r][c], "TerminateDiskQueues");
+ RF_BROADCAST_COND(raidPtr->Queues[r][c].cond);
+ RF_UNLOCK_QUEUE_MUTEX(&raidPtr->Queues[r][c], "TerminateDiskQueues");
+ }
+ }
+}
+#endif /* !KERNEL */
+
+int rf_ConfigureDiskQueues(
+ RF_ShutdownList_t **listp,
+ RF_Raid_t *raidPtr,
+ RF_Config_t *cfgPtr)
+{
+ RF_DiskQueue_t **diskQueues, *spareQueues;
+ RF_DiskQueueSW_t *p;
+ RF_RowCol_t r, c;
+ int rc, i;
+
+ raidPtr->maxQueueDepth = cfgPtr->maxOutstandingDiskReqs;
+
+ for(p=NULL,i=0;i<NUM_DISK_QUEUE_TYPES;i++) {
+ if (!strcmp(diskqueuesw[i].queueType, cfgPtr->diskQueueType)) {
+ p = &diskqueuesw[i];
+ break;
+ }
+ }
+ if (p == NULL) {
+ RF_ERRORMSG2("Unknown queue type \"%s\". Using %s\n",cfgPtr->diskQueueType, diskqueuesw[0].queueType);
+ p = &diskqueuesw[0];
+ }
+
+ RF_CallocAndAdd(diskQueues, raidPtr->numRow, sizeof(RF_DiskQueue_t *), (RF_DiskQueue_t **), raidPtr->cleanupList);
+ if (diskQueues == NULL) {
+ return(ENOMEM);
+ }
+ raidPtr->Queues = diskQueues;
+ for (r=0; r<raidPtr->numRow; r++) {
+ RF_CallocAndAdd(diskQueues[r], raidPtr->numCol + ((r==0) ? raidPtr->numSpare : 0), sizeof(RF_DiskQueue_t), (RF_DiskQueue_t *), raidPtr->cleanupList);
+ if (diskQueues[r] == NULL)
+ return(ENOMEM);
+ for (c=0; c<raidPtr->numCol; c++) {
+ rc = config_disk_queue(raidPtr, &diskQueues[r][c], r, c, p,
+ raidPtr->sectorsPerDisk, raidPtr->Disks[r][c].dev,
+ cfgPtr->maxOutstandingDiskReqs, listp, raidPtr->cleanupList);
+ if (rc)
+ return(rc);
+ }
+ }
+
+ spareQueues = &raidPtr->Queues[0][raidPtr->numCol];
+ for (r=0; r<raidPtr->numSpare; r++) {
+ rc = config_disk_queue(raidPtr, &spareQueues[r],
+ 0, raidPtr->numCol+r, p,
+ raidPtr->sectorsPerDisk,
+ raidPtr->Disks[0][raidPtr->numCol+r].dev,
+ cfgPtr->maxOutstandingDiskReqs, listp,
+ raidPtr->cleanupList);
+ if (rc)
+ return(rc);
+ }
+ return(0);
+}
+
+/* Enqueue a disk I/O
+ *
+ * Unfortunately, we have to do things differently in the different
+ * environments (simulator, user-level, kernel).
+ * At user level, all I/O is blocking, so we have 1 or more threads/disk
+ * and the thread that enqueues is different from the thread that dequeues.
+ * In the kernel, I/O is non-blocking and so we'd like to have multiple
+ * I/Os outstanding on the physical disks when possible.
+ *
+ * when any request arrives at a queue, we have two choices:
+ * dispatch it to the lower levels
+ * queue it up
+ *
+ * kernel rules for when to do what:
+ * locking request: queue empty => dispatch and lock queue,
+ * else queue it
+ * unlocking req : always dispatch it
+ * normal req : queue empty => dispatch it & set priority
+ * queue not full & priority is ok => dispatch it
+ * else queue it
+ *
+ * user-level rules:
+ * always enqueue. In the special case of an unlocking op, enqueue
+ * in a special way that will cause the unlocking op to be the next
+ * thing dequeued.
+ *
+ * simulator rules:
+ * Do the same as at user level, with the sleeps and wakeups suppressed.
+ */
+void rf_DiskIOEnqueue(queue, req, pri)
+ RF_DiskQueue_t *queue;
+ RF_DiskQueueData_t *req;
+ int pri;
+{
+ int tid;
+
+ RF_ETIMER_START(req->qtime);
+ rf_get_threadid(tid);
+ RF_ASSERT(req->type == RF_IO_TYPE_NOP || req->numSector);
+ req->priority = pri;
+
+ if (rf_queueDebug && (req->numSector == 0)) {
+ printf("Warning: Enqueueing zero-sector access\n");
+ }
+
+#ifdef KERNEL
+ /*
+ * kernel
+ */
+ RF_LOCK_QUEUE_MUTEX( queue, "DiskIOEnqueue" );
+ /* locking request */
+ if (RF_LOCKING_REQ(req)) {
+ if (RF_QUEUE_EMPTY(queue)) {
+ Dprintf3("Dispatching pri %d locking op to r %d c %d (queue empty)\n",pri,queue->row, queue->col);
+ RF_LOCK_QUEUE(queue);
+ rf_DispatchKernelIO(queue, req);
+ } else {
+ queue->queueLength++; /* increment count of number of requests waiting in this queue */
+ Dprintf3("Enqueueing pri %d locking op to r %d c %d (queue not empty)\n",pri,queue->row, queue->col);
+ req->queue = (void *)queue;
+ (queue->qPtr->Enqueue)(queue->qHdr, req, pri);
+ }
+ }
+ /* unlocking request */
+ else if (RF_UNLOCKING_REQ(req)) { /* we'll do the actual unlock when this I/O completes */
+ Dprintf3("Dispatching pri %d unlocking op to r %d c %d\n",pri,queue->row, queue->col);
+ RF_ASSERT(RF_QUEUE_LOCKED(queue));
+ rf_DispatchKernelIO(queue, req);
+ }
+ /* normal request */
+ else if (RF_OK_TO_DISPATCH(queue, req)) {
+ Dprintf3("Dispatching pri %d regular op to r %d c %d (ok to dispatch)\n",pri,queue->row, queue->col);
+ rf_DispatchKernelIO(queue, req);
+ } else {
+ queue->queueLength++; /* increment count of number of requests waiting in this queue */
+ Dprintf3("Enqueueing pri %d regular op to r %d c %d (not ok to dispatch)\n",pri,queue->row, queue->col);
+ req->queue = (void *)queue;
+ (queue->qPtr->Enqueue)(queue->qHdr, req, pri);
+ }
+ RF_UNLOCK_QUEUE_MUTEX( queue, "DiskIOEnqueue" );
+
+#else /* KERNEL */
+ /*
+ * user-level
+ */
+ RF_LOCK_QUEUE_MUTEX( queue, "DiskIOEnqueue" );
+ queue->queueLength++; /* increment count of number of requests waiting in this queue */
+ /* unlocking request */
+ if (RF_UNLOCKING_REQ(req)) {
+ Dprintf4("[%d] enqueueing pri %d unlocking op & signalling r %d c %d\n", tid, pri, queue->row, queue->col);
+ RF_ASSERT(RF_QUEUE_LOCKED(queue) && queue->unlockingOp == NULL);
+ queue->unlockingOp = req;
+ }
+ /* locking and normal requests */
+ else {
+ req->queue = (void *)queue;
+ Dprintf5("[%d] enqueueing pri %d %s op & signalling r %d c %d\n", tid, pri,
+ (RF_LOCKING_REQ(req)) ? "locking" : "regular",queue->row,queue->col);
+ (queue->qPtr->Enqueue)(queue->qHdr, req, pri);
+ }
+ SIGNAL_DISK_QUEUE( queue, "DiskIOEnqueue");
+ RF_UNLOCK_QUEUE_MUTEX( queue, "DiskIOEnqueue" );
+#endif /* KERNEL */
+}
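+
+/*
+ * Illustrative sketch of the enqueue path described above (hypothetical
+ * caller; example_enqueue_read(), my_io_done(), and the use of
+ * RF_IO_TYPE_READ with zero stripe/flags arguments are assumptions made
+ * for the example only).  A request is built with rf_CreateDiskQueueData()
+ * and handed to rf_DiskIOEnqueue(); the completion routine is responsible
+ * for freeing it with rf_FreeDiskQueueData():
+ */
+#if 0
+static int my_io_done(void *arg, int status)
+{
+ /* called when the I/O completes (see rf_DiskIOComplete in the kernel) */
+ rf_FreeDiskQueueData((RF_DiskQueueData_t *) arg);
+ return(0);
+}
+
+static void example_enqueue_read(RF_DiskQueue_t *queue, RF_Raid_t *raidPtr,
+ RF_SectorNum_t ssect, RF_SectorCount_t nsect, caddr_t buf)
+{
+ RF_DiskQueueData_t *req;
+
+ req = rf_CreateDiskQueueData(RF_IO_TYPE_READ, ssect, nsect, buf,
+ 0L, 0, my_io_done, NULL, NULL, NULL, (void *) raidPtr, 0, NULL);
+ req->argument = req; /* let the completion callback find its request */
+ rf_DiskIOEnqueue(queue, req, RF_IO_NORMAL_PRIORITY);
+}
+#endif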
+
+#if !defined(KERNEL) && !defined(SIMULATE)
+/* user-level only: tell all threads to wake up & recheck the queue */
+void rf_BroadcastOnQueue(queue)
+ RF_DiskQueue_t *queue;
+{
+ int i;
+
+ if (queue->maxOutstanding > 1) for (i=0; i<queue->maxOutstanding; i++) {
+ SIGNAL_DISK_QUEUE(queue, "BroadcastOnQueue" );
+ }
+}
+#endif /* !KERNEL && !SIMULATE */
+
+#ifndef KERNEL /* not used in kernel */
+
+RF_DiskQueueData_t *rf_DiskIODequeue(queue)
+ RF_DiskQueue_t *queue;
+{
+ RF_DiskQueueData_t *p, *headItem;
+ int tid;
+
+ rf_get_threadid(tid);
+ RF_LOCK_QUEUE_MUTEX( queue, "DiskIODequeue" );
+ for (p=NULL; !p; ) {
+ if (queue->unlockingOp) {
+ /* unlocking request */
+ RF_ASSERT(RF_QUEUE_LOCKED(queue));
+ p = queue->unlockingOp;
+ queue->unlockingOp = NULL;
+ Dprintf4("[%d] dequeueing pri %d unlocking op r %d c %d\n", tid, p->priority, queue->row,queue->col);
+ }
+ else {
+ headItem = (queue->qPtr->Peek)(queue->qHdr);
+ if (headItem) {
+ if (RF_LOCKING_REQ(headItem)) {
+ /* locking request */
+ if (!RF_QUEUE_LOCKED(queue)) {
+ /* queue isn't locked, so dequeue the request & lock the queue */
+ p = (queue->qPtr->Dequeue)( queue->qHdr );
+ if (p)
+ Dprintf4("[%d] dequeueing pri %d locking op r %d c %d\n", tid, p->priority, queue->row, queue->col);
+ else
+ Dprintf3("[%d] no dequeue -- raw queue empty r %d c %d\n", tid, queue->row, queue->col);
+ }
+ else {
+ /* queue already locked, no dequeue occurs */
+ Dprintf3("[%d] no dequeue -- queue is locked r %d c %d\n", tid, queue->row, queue->col);
+ p = NULL;
+ }
+ }
+ else {
+ /* normal request, always dequeue and assume caller already has lock (if needed) */
+ p = (queue->qPtr->Dequeue)( queue->qHdr );
+ if (p)
+ Dprintf4("[%d] dequeueing pri %d regular op r %d c %d\n", tid, p->priority, queue->row, queue->col);
+ else
+ Dprintf3("[%d] no dequeue -- raw queue empty r %d c %d\n", tid, queue->row, queue->col);
+ }
+ }
+ else {
+ Dprintf3("[%d] no dequeue -- raw queue empty r %d c %d\n", tid, queue->row, queue->col);
+ }
+ }
+
+ if (queue->raidPtr->terminate_disk_queues) {
+ p = NULL;
+ break;
+ }
+#ifdef SIMULATE
+ break; /* in simulator, return NULL on empty queue instead of blocking */
+#else /* SIMULATE */
+ if (!p) {
+ Dprintf3("[%d] nothing to dequeue: waiting r %d c %d\n", tid, queue->row, queue->col);
+ WAIT_DISK_QUEUE( queue, "DiskIODequeue" );
+ }
+#endif /* SIMULATE */
+ }
+
+ if (p) {
+ queue->queueLength--; /* decrement count of number of requests waiting in this queue */
+ RF_ASSERT(queue->queueLength >= 0);
+ queue->numOutstanding++;
+ queue->last_deq_sector = p->sectorOffset;
+ /* record the amount of time this request spent in the disk queue */
+ RF_ETIMER_STOP(p->qtime);
+ RF_ETIMER_EVAL(p->qtime);
+ if (p->tracerec)
+ p->tracerec->diskqueue_us += RF_ETIMER_VAL_US(p->qtime);
+ }
+
+ if (p && RF_LOCKING_REQ(p)) {
+ RF_ASSERT(!RF_QUEUE_LOCKED(queue));
+ Dprintf3("[%d] locking queue r %d c %d\n",tid,queue->row,queue->col);
+ RF_LOCK_QUEUE(queue);
+ }
+ RF_UNLOCK_QUEUE_MUTEX( queue, "DiskIODequeue" );
+
+ return(p);
+}
+
+#else /* !KERNEL */
+
+/* get the next set of I/Os started, kernel version only */
+void rf_DiskIOComplete(queue, req, status)
+ RF_DiskQueue_t *queue;
+ RF_DiskQueueData_t *req;
+ int status;
+{
+ int done=0;
+
+ RF_LOCK_QUEUE_MUTEX( queue, "DiskIOComplete" );
+
+ /* unlock the queue:
+ (1) after an unlocking req completes
+ (2) after a locking req fails
+ */
+ if (RF_UNLOCKING_REQ(req) || (RF_LOCKING_REQ(req) && status)) {
+ Dprintf2("DiskIOComplete: unlocking queue at r %d c %d\n", queue->row, queue->col);
+ RF_ASSERT(RF_QUEUE_LOCKED(queue) && (queue->unlockingOp == NULL));
+ RF_UNLOCK_QUEUE(queue);
+ }
+
+ queue->numOutstanding--;
+ RF_ASSERT(queue->numOutstanding >= 0);
+
+ /* dispatch requests to the disk until we find one that we can't. */
+ /* no reason to continue once we've filled up the queue */
+ /* no reason to even start if the queue is locked */
+
+ while (!done && !RF_QUEUE_FULL(queue) && !RF_QUEUE_LOCKED(queue)) {
+ if (queue->nextLockingOp) {
+ req = queue->nextLockingOp; queue->nextLockingOp = NULL;
+ Dprintf3("DiskIOComplete: a pri %d locking req was pending at r %d c %d\n",req->priority,queue->row, queue->col);
+ } else {
+ req = (queue->qPtr->Dequeue)( queue->qHdr );
+ if (req != NULL) {
+ Dprintf3("DiskIOComplete: extracting pri %d req from queue at r %d c %d\n",req->priority,queue->row, queue->col);
+ } else {
+ Dprintf1("DiskIOComplete: no more requests to extract.\n","");
+ }
+ }
+ if (req) {
+ queue->queueLength--; /* decrement count of number of requests waiting in this queue */
+ RF_ASSERT(queue->queueLength >= 0);
+ }
+ if (!req) done=1;
+ else if (RF_LOCKING_REQ(req)) {
+ if (RF_QUEUE_EMPTY(queue)) { /* dispatch it */
+ Dprintf3("DiskIOComplete: dispatching pri %d locking req to r %d c %d (queue empty)\n",req->priority,queue->row, queue->col);
+ RF_LOCK_QUEUE(queue);
+ rf_DispatchKernelIO(queue, req);
+ done = 1;
+ } else { /* put it aside to wait for the queue to drain */
+ Dprintf3("DiskIOComplete: postponing pri %d locking req to r %d c %d\n",req->priority,queue->row, queue->col);
+ RF_ASSERT(queue->nextLockingOp == NULL);
+ queue->nextLockingOp = req;
+ done = 1;
+ }
+ } else if (RF_UNLOCKING_REQ(req)) { /* should not happen: unlocking ops should not get queued */
+ RF_ASSERT(RF_QUEUE_LOCKED(queue)); /* support it anyway for the future */
+ Dprintf3("DiskIOComplete: dispatching pri %d unl req to r %d c %d (SHOULD NOT SEE THIS)\n",req->priority,queue->row, queue->col);
+ rf_DispatchKernelIO(queue, req);
+ done = 1;
+ } else if (RF_OK_TO_DISPATCH(queue, req)) {
+ Dprintf3("DiskIOComplete: dispatching pri %d regular req to r %d c %d (ok to dispatch)\n",req->priority,queue->row, queue->col);
+ rf_DispatchKernelIO(queue, req);
+ } else { /* we can't dispatch it, so just re-enqueue it. */
+ /* potential trouble here if disk queues batch reqs */
+ Dprintf3("DiskIOComplete: re-enqueueing pri %d regular req to r %d c %d\n",req->priority,queue->row, queue->col);
+ queue->queueLength++;
+ (queue->qPtr->Enqueue)(queue->qHdr, req, req->priority);
+ done = 1;
+ }
+ }
+
+ RF_UNLOCK_QUEUE_MUTEX( queue, "DiskIOComplete" );
+}
+#endif /* !KERNEL */
+
+/* promotes accesses tagged with the given parityStripeID from low priority
+ * to normal priority. This promotion is optional, meaning that a queue
+ * need not implement it. If there is no promotion routine associated with
+ * a queue, this routine does nothing and returns -1.
+ */
+int rf_DiskIOPromote(queue, parityStripeID, which_ru)
+ RF_DiskQueue_t *queue;
+ RF_StripeNum_t parityStripeID;
+ RF_ReconUnitNum_t which_ru;
+{
+ int retval;
+
+ if (!queue->qPtr->Promote)
+ return(-1);
+ RF_LOCK_QUEUE_MUTEX( queue, "DiskIOPromote" );
+ retval = (queue->qPtr->Promote)( queue->qHdr, parityStripeID, which_ru );
+ RF_UNLOCK_QUEUE_MUTEX( queue, "DiskIOPromote" );
+ return(retval);
+}
+
+RF_DiskQueueData_t *rf_CreateDiskQueueData(
+ RF_IoType_t typ,
+ RF_SectorNum_t ssect,
+ RF_SectorCount_t nsect,
+ caddr_t buf,
+ RF_StripeNum_t parityStripeID,
+ RF_ReconUnitNum_t which_ru,
+ int (*wakeF)(void *,int),
+ void *arg,
+ RF_DiskQueueData_t *next,
+ RF_AccTraceEntry_t *tracerec,
+ void *raidPtr,
+ RF_DiskQueueDataFlags_t flags,
+ void *kb_proc)
+{
+ RF_DiskQueueData_t *p;
+
+ RF_FREELIST_GET_INIT(rf_dqd_freelist,p,next,(RF_DiskQueueData_t *),init_dqd);
+
+ p->sectorOffset = ssect + rf_protectedSectors;
+ p->numSector = nsect;
+ p->type = typ;
+ p->buf = buf;
+ p->parityStripeID= parityStripeID;
+ p->which_ru = which_ru;
+ p->CompleteFunc = wakeF;
+ p->argument = arg;
+ p->next = next;
+ p->tracerec = tracerec;
+ p->priority = RF_IO_NORMAL_PRIORITY;
+ p->AuxFunc = NULL;
+ p->buf2 = NULL;
+#ifdef SIMULATE
+ p->owner = rf_GetCurrentOwner();
+#endif /* SIMULATE */
+ p->raidPtr = raidPtr;
+ p->flags = flags;
+#ifdef KERNEL
+ p->b_proc = kb_proc;
+#endif /* KERNEL */
+ return(p);
+}
+
+RF_DiskQueueData_t *rf_CreateDiskQueueDataFull(
+ RF_IoType_t typ,
+ RF_SectorNum_t ssect,
+ RF_SectorCount_t nsect,
+ caddr_t buf,
+ RF_StripeNum_t parityStripeID,
+ RF_ReconUnitNum_t which_ru,
+ int (*wakeF)(void *,int),
+ void *arg,
+ RF_DiskQueueData_t *next,
+ RF_AccTraceEntry_t *tracerec,
+ int priority,
+ int (*AuxFunc)(void *,...),
+ caddr_t buf2,
+ void *raidPtr,
+ RF_DiskQueueDataFlags_t flags,
+ void *kb_proc)
+{
+ RF_DiskQueueData_t *p;
+
+ RF_FREELIST_GET_INIT(rf_dqd_freelist,p,next,(RF_DiskQueueData_t *),init_dqd);
+
+ p->sectorOffset = ssect + rf_protectedSectors;
+ p->numSector = nsect;
+ p->type = typ;
+ p->buf = buf;
+ p->parityStripeID= parityStripeID;
+ p->which_ru = which_ru;
+ p->CompleteFunc = wakeF;
+ p->argument = arg;
+ p->next = next;
+ p->tracerec = tracerec;
+ p->priority = priority;
+ p->AuxFunc = AuxFunc;
+ p->buf2 = buf2;
+#ifdef SIMULATE
+ p->owner = rf_GetCurrentOwner();
+#endif /* SIMULATE */
+ p->raidPtr = raidPtr;
+ p->flags = flags;
+#ifdef KERNEL
+ p->b_proc = kb_proc;
+#endif /* KERNEL */
+ return(p);
+}
+
+void rf_FreeDiskQueueData(p)
+ RF_DiskQueueData_t *p;
+{
+ RF_FREELIST_FREE_CLEAN(rf_dqd_freelist,p,next,clean_dqd);
+}
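+
+/*
+ * Editor's note (not part of the imported sources): a rough sketch of the
+ * lifecycle of a queue entry, assuming the RF_IO_TYPE_READ constant and the
+ * wakeup_func/ctx names used below; create, enqueue, then free once the
+ * completion callback has run.
+ *
+ *	RF_DiskQueueData_t *dqd;
+ *
+ *	dqd = rf_CreateDiskQueueData(RF_IO_TYPE_READ, start_sect, num_sect,
+ *	    databuf, psid, which_ru, wakeup_func, ctx, NULL, tracerec,
+ *	    (void *) raidPtr, 0, kb_proc);
+ *	rf_DiskIOEnqueue(queue, dqd, RF_IO_NORMAL_PRIORITY);
+ *	...
+ *	rf_FreeDiskQueueData(dqd);	// after wakeup_func has been called
+ */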
diff --git a/sys/dev/raidframe/rf_diskqueue.h b/sys/dev/raidframe/rf_diskqueue.h
new file mode 100644
index 00000000000..20878553479
--- /dev/null
+++ b/sys/dev/raidframe/rf_diskqueue.h
@@ -0,0 +1,315 @@
+/* $OpenBSD: rf_diskqueue.h,v 1.1 1999/01/11 14:29:17 niklas Exp $ */
+/* $NetBSD: rf_diskqueue.h,v 1.1 1998/11/13 04:20:29 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*****************************************************************************************
+ *
+ * rf_diskqueue.h -- header file for disk queues
+ *
+ * see comments in rf_diskqueue.c
+ *
+ ****************************************************************************************/
+/*
+ *
+ * :
+ *
+ * Log: rf_diskqueue.h,v
+ * Revision 1.31 1996/08/07 21:08:49 jimz
+ * b_proc -> kb_proc (IRIX complained)
+ *
+ * Revision 1.30 1996/06/18 20:53:11 jimz
+ * fix up disk queueing (remove configure routine,
+ * add shutdown list arg to create routines)
+ *
+ * Revision 1.29 1996/06/13 20:38:19 jimz
+ * fix queue type in DiskQueueData
+ *
+ * Revision 1.28 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.27 1996/06/07 22:26:27 jimz
+ * type-ify which_ru (RF_ReconUnitNum_t)
+ *
+ * Revision 1.26 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.25 1996/06/06 17:29:12 jimz
+ * track arm position of last I/O dequeued
+ *
+ * Revision 1.24 1996/06/05 18:06:02 jimz
+ * Major code cleanup. The Great Renaming is now done.
+ * Better modularity. Better typing. Fixed a bunch of
+ * synchronization bugs. Made a lot of global stuff
+ * per-desc or per-array. Removed dead code.
+ *
+ * Revision 1.23 1996/06/02 17:31:48 jimz
+ * Moved a lot of global stuff into array structure, where it belongs.
+ * Fixed up paritylogging, pss modules in this manner. Some general
+ * code cleanup. Removed lots of dead code, some dead files.
+ *
+ * Revision 1.22 1996/05/30 23:22:16 jimz
+ * bugfixes of serialization, timing problems
+ * more cleanup
+ *
+ * Revision 1.21 1996/05/30 11:29:41 jimz
+ * Numerous bug fixes. Stripe lock release code disagreed with the taking code
+ * about when stripes should be locked (I made it consistent: no parity, no lock)
+ * There was a lot of extra serialization of I/Os which I've removed- a lot of
+ * it was to calculate values for the cache code, which is no longer with us.
+ * More types, function, macro cleanup. Added code to properly quiesce the array
+ * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
+ * before. Fixed memory allocation, freeing bugs.
+ *
+ * Revision 1.20 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.19 1996/05/24 01:59:45 jimz
+ * another checkpoint in code cleanup for release
+ * time to sync kernel tree
+ *
+ * Revision 1.18 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.17 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.16 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.15 1996/05/10 19:39:31 jimz
+ * add prev pointer to DiskQueueData
+ *
+ * Revision 1.14 1996/05/10 16:24:04 jimz
+ * mark old defines as deprecated, add RF_ defines
+ *
+ * Revision 1.13 1995/12/01 15:59:04 root
+ * added copyright info
+ *
+ * Revision 1.12 1995/11/07 16:26:44 wvcii
+ * added Peek() function to diskqueuesw
+ *
+ * Revision 1.11 1995/10/05 02:33:15 jimz
+ * made queue lens longs (less instructions to read :-)
+ *
+ * Revision 1.10 1995/10/04 07:07:07 wvcii
+ * queue->numOutstanding now valid for user & sim
+ * user tested & verified, sim untested
+ *
+ * Revision 1.9 1995/09/12 00:21:37 wvcii
+ * added support for tracing disk queue time
+ *
+ * Revision 1.8 95/04/24 13:25:51 holland
+ * rewrite to move disk queues, recon, & atomic RMW to kernel
+ *
+ * Revision 1.6.10.2 1995/04/03 20:13:56 holland
+ * added numOutstanding and maxOutstanding to support moving
+ * disk queues into kernel code
+ *
+ * Revision 1.6.10.1 1995/04/03 20:03:56 holland
+ * initial checkin on branch
+ *
+ * Revision 1.6 1995/03/03 18:34:33 rachad
+ * Simulator mechanism added
+ *
+ * Revision 1.5 1995/03/01 20:25:48 holland
+ * kernelization changes
+ *
+ * Revision 1.4 1995/02/03 22:31:36 holland
+ * many changes related to kernelization
+ *
+ * Revision 1.3 1995/02/01 14:25:19 holland
+ * began changes for kernelization:
+ * changed all instances of mutex_t and cond_t to DECLARE macros
+ * converted configuration code to use config structure
+ *
+ * Revision 1.2 1994/11/29 20:36:02 danner
+ * Added symbolic constants for io_type (e.g,IO_TYPE_READ)
+ * and support for READ_OP_WRITE
+ *
+ */
+
+
+#ifndef _RF__RF_DISKQUEUE_H_
+#define _RF__RF_DISKQUEUE_H_
+
+#include "rf_threadstuff.h"
+#include "rf_acctrace.h"
+#include "rf_alloclist.h"
+#include "rf_types.h"
+#include "rf_etimer.h"
+
+
+#ifdef _KERNEL
+#if defined(__NetBSD__)
+#include "rf_netbsd.h"
+#elif defined(__OpenBSD__)
+#include "rf_openbsd.h"
+#endif
+#endif
+
+
+#define RF_IO_NORMAL_PRIORITY 1
+#define RF_IO_LOW_PRIORITY 0
+
+/* the data held by a disk queue entry */
+struct RF_DiskQueueData_s {
+ RF_SectorNum_t sectorOffset; /* sector offset into the disk */
+ RF_SectorCount_t numSector; /* number of sectors to read/write */
+ RF_IoType_t type; /* read/write/nop */
+ caddr_t buf; /* buffer pointer */
+ RF_StripeNum_t parityStripeID; /* the RAID parity stripe ID this access is for */
+ RF_ReconUnitNum_t which_ru; /* which RU within this parity stripe */
+ int priority; /* the priority of this request */
+ int (*CompleteFunc)(void *,int);/* function to be called upon completion */
+ int (*AuxFunc)(void *,...); /* function called upon completion of the first I/O of a Read_Op_Write pair*/
+ void *argument; /* argument to be passed to CompleteFunc */
+#ifdef SIMULATE
+ RF_Owner_t owner; /* which task is responsible for this request */
+#endif /* SIMULATE */
+ void *raidPtr; /* needed for simulation */
+ RF_AccTraceEntry_t *tracerec; /* perf mon only */
+ RF_Etimer_t qtime; /* perf mon only - time request is in queue */
+ long entryTime;
+ RF_DiskQueueData_t *next;
+ RF_DiskQueueData_t *prev;
+ caddr_t buf2; /* for read-op-write */
+ dev_t dev; /* the device number for in-kernel version */
+ RF_DiskQueue_t *queue; /* the disk queue to which this req is targeted */
+ RF_DiskQueueDataFlags_t flags; /* flags controlling operation */
+
+#ifdef KERNEL
+ struct proc *b_proc; /* the b_proc from the original bp passed into the driver for this I/O */
+ struct buf *bp; /* a bp to use to get this I/O done */
+#endif /* KERNEL */
+};
+
+#define RF_LOCK_DISK_QUEUE 0x01
+#define RF_UNLOCK_DISK_QUEUE 0x02
+
+/* note: "Create" returns type-specific queue header pointer cast to (void *) */
+struct RF_DiskQueueSW_s {
+ RF_DiskQueueType_t queueType;
+ void *(*Create)(RF_SectorCount_t, RF_AllocListElem_t *, RF_ShutdownList_t **); /* creation routine -- one call per queue in system */
+ void (*Enqueue)(void *,RF_DiskQueueData_t * ,int); /* enqueue routine */
+ RF_DiskQueueData_t *(*Dequeue)(void *); /* dequeue routine */
+ RF_DiskQueueData_t *(*Peek)(void *); /* peek at head of queue */
+
+ /* the rest are optional: they improve performance, but the driver works correctly if they are not provided */
+ int (*Promote)(void *, RF_StripeNum_t, RF_ReconUnitNum_t); /* promotes priority of tagged accesses */
+};
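+
+/*
+ * Editor's note (not part of the imported sources): a queueing discipline
+ * registers itself by supplying one of these switch entries.  The sketch
+ * below is only indicative; it assumes the fifo module's routine names and
+ * that queue types are identified by name strings.
+ *
+ *	static RF_DiskQueueSW_t fifo_sw = {
+ *		"fifo",			// queueType
+ *		rf_FifoCreate,		// Create
+ *		rf_FifoEnqueue,		// Enqueue
+ *		rf_FifoDequeue,		// Dequeue
+ *		rf_FifoPeek,		// Peek
+ *		rf_FifoPromote		// Promote (optional)
+ *	};
+ */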
+
+struct RF_DiskQueue_s {
+ RF_DiskQueueSW_t *qPtr; /* access point to queue functions */
+ void *qHdr; /* queue header, of whatever type */
+ RF_DECLARE_MUTEX(mutex) /* mutex locking data structures */
+ RF_DECLARE_COND(cond) /* condition variable for synchronization */
+ long numOutstanding; /* number of I/Os currently outstanding on disk */
+ long maxOutstanding; /* max # of I/Os that can be outstanding on a disk (in-kernel only) */
+ int curPriority; /* the priority of all accesses currently outstanding */
+ long queueLength; /* number of requests in queue */
+ RF_DiskQueueData_t *nextLockingOp; /* a locking op that has arrived at the head of the queue & is waiting for drainage */
+ RF_DiskQueueData_t *unlockingOp; /* used at user level to communicate an unlocking op between the user (or dag exec) and disk threads */
+ int numWaiting; /* number of threads waiting on this variable. user-level only */
+ RF_DiskQueueFlags_t flags; /* terminate, locked */
+ RF_Raid_t *raidPtr; /* associated array */
+ dev_t dev; /* device number for kernel version */
+ RF_SectorNum_t last_deq_sector; /* last sector number dequeued or dispatched */
+ int row, col; /* debug only */
+#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL)
+ struct raidcinfo *rf_cinfo; /* this disk's component info */
+#endif
+};
+
+#define RF_DQ_LOCKED 0x02 /* no new accs allowed until queue is explicitly unlocked */
+
+/* macros setting & returning information about queues and requests */
+#define RF_QUEUE_LOCKED(_q) ((_q)->flags & RF_DQ_LOCKED)
+#define RF_QUEUE_EMPTY(_q) (((_q)->numOutstanding == 0) && ((_q)->nextLockingOp == NULL) && !RF_QUEUE_LOCKED(_q))
+#define RF_QUEUE_FULL(_q) ((_q)->numOutstanding == (_q)->maxOutstanding)
+
+#define RF_LOCK_QUEUE(_q) (_q)->flags |= RF_DQ_LOCKED
+#define RF_UNLOCK_QUEUE(_q) (_q)->flags &= ~RF_DQ_LOCKED
+
+#define RF_LOCK_QUEUE_MUTEX(_q_,_wh_) RF_LOCK_MUTEX((_q_)->mutex)
+#define RF_UNLOCK_QUEUE_MUTEX(_q_,_wh_) RF_UNLOCK_MUTEX((_q_)->mutex)
+
+#define RF_LOCKING_REQ(_r) ((_r)->flags & RF_LOCK_DISK_QUEUE)
+#define RF_UNLOCKING_REQ(_r) ((_r)->flags & RF_UNLOCK_DISK_QUEUE)
+
+/* whether it is ok to dispatch a regular request */
+#define RF_OK_TO_DISPATCH(_q_,_r_) \
+ (RF_QUEUE_EMPTY(_q_) || \
+ (!RF_QUEUE_FULL(_q_) && ((_r_)->priority >= (_q_)->curPriority)))
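+
+/*
+ * Editor's note: worked example of the dispatch rule above, with a
+ * hypothetical maxOutstanding of 6.  A regular request goes out immediately
+ * when the queue is empty; otherwise it is dispatched only while fewer than
+ * 6 I/Os are outstanding and its priority is at least curPriority, i.e. no
+ * currently outstanding access outranks it.  Requests that fail this test
+ * wait until rf_DiskIOComplete() re-examines the queue.
+ */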
+
+int rf_ConfigureDiskQueueSystem(RF_ShutdownList_t **listp);
+
+void rf_TerminateDiskQueues(RF_Raid_t *raidPtr);
+
+int rf_ConfigureDiskQueues(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr,
+ RF_Config_t *cfgPtr);
+
+void rf_DiskIOEnqueue(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req, int pri);
+
+#if !defined(KERNEL) && !defined(SIMULATE)
+void rf_BroadcastOnQueue(RF_DiskQueue_t *queue);
+#endif /* !KERNEL && !SIMULATE */
+
+#ifndef KERNEL
+RF_DiskQueueData_t *rf_DiskIODequeue(RF_DiskQueue_t *queue);
+#else /* !KERNEL */
+void rf_DiskIOComplete(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req, int status);
+#endif /* !KERNEL */
+
+int rf_DiskIOPromote(RF_DiskQueue_t *queue, RF_StripeNum_t parityStripeID,
+ RF_ReconUnitNum_t which_ru);
+
+RF_DiskQueueData_t *rf_CreateDiskQueueData(RF_IoType_t typ,
+ RF_SectorNum_t ssect, RF_SectorCount_t nsect, caddr_t buf,
+ RF_StripeNum_t parityStripeID, RF_ReconUnitNum_t which_ru,
+ int (*wakeF)(void *, int),
+ void *arg, RF_DiskQueueData_t *next, RF_AccTraceEntry_t *tracerec,
+ void *raidPtr, RF_DiskQueueDataFlags_t flags, void *kb_proc);
+
+RF_DiskQueueData_t *rf_CreateDiskQueueDataFull(RF_IoType_t typ,
+ RF_SectorNum_t ssect, RF_SectorCount_t nsect, caddr_t buf,
+ RF_StripeNum_t parityStripeID, RF_ReconUnitNum_t which_ru,
+ int (*wakeF)(void *, int),
+ void *arg, RF_DiskQueueData_t *next, RF_AccTraceEntry_t *tracerec,
+ int priority, int (*AuxFunc)(void *,...), caddr_t buf2,
+ void *raidPtr, RF_DiskQueueDataFlags_t flags, void *kb_proc);
+
+void rf_FreeDiskQueueData(RF_DiskQueueData_t *p);
+
+#endif /* !_RF__RF_DISKQUEUE_H_ */
diff --git a/sys/dev/raidframe/rf_disks.c b/sys/dev/raidframe/rf_disks.c
new file mode 100644
index 00000000000..fc89d407f47
--- /dev/null
+++ b/sys/dev/raidframe/rf_disks.c
@@ -0,0 +1,651 @@
+/* $OpenBSD: rf_disks.c,v 1.1 1999/01/11 14:29:17 niklas Exp $ */
+/* $NetBSD: rf_disks.c,v 1.2 1998/12/03 15:06:25 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/***************************************************************
+ * rf_disks.c -- code to perform operations on the actual disks
+ ***************************************************************/
+
+/* :
+ * Log: rf_disks.c,v
+ * Revision 1.32 1996/07/27 18:40:24 jimz
+ * cleanup sweep
+ *
+ * Revision 1.31 1996/07/22 19:52:16 jimz
+ * switched node params to RF_DagParam_t, a union of
+ * a 64-bit int and a void *, for better portability
+ * attempted hpux port, but failed partway through for
+ * lack of a single C compiler capable of compiling all
+ * source files
+ *
+ * Revision 1.30 1996/07/19 16:11:21 jimz
+ * pass devname to DoReadCapacity
+ *
+ * Revision 1.29 1996/07/18 22:57:14 jimz
+ * port simulator to AIX
+ *
+ * Revision 1.28 1996/07/10 22:28:38 jimz
+ * get rid of obsolete row statuses (dead,degraded2)
+ *
+ * Revision 1.27 1996/06/10 12:06:14 jimz
+ * don't do any SCSI op stuff in simulator at all
+ *
+ * Revision 1.26 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.25 1996/06/09 02:36:46 jimz
+ * lots of little crufty cleanup- fixup whitespace
+ * issues, comment #ifdefs, improve typing in some
+ * places (esp size-related)
+ *
+ * Revision 1.24 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.23 1996/06/03 23:28:26 jimz
+ * more bugfixes
+ * check in tree to sync for IPDS runs with current bugfixes
+ * there still may be a problem with threads in the script test
+ * getting I/Os stuck- not trivially reproducible (runs ~50 times
+ * in a row without getting stuck)
+ *
+ * Revision 1.22 1996/06/02 17:31:48 jimz
+ * Moved a lot of global stuff into array structure, where it belongs.
+ * Fixed up paritylogging, pss modules in this manner. Some general
+ * code cleanup. Removed lots of dead code, some dead files.
+ *
+ * Revision 1.21 1996/05/30 23:22:16 jimz
+ * bugfixes of serialization, timing problems
+ * more cleanup
+ *
+ * Revision 1.20 1996/05/30 11:29:41 jimz
+ * Numerous bug fixes. Stripe lock release code disagreed with the taking code
+ * about when stripes should be locked (I made it consistent: no parity, no lock)
+ * There was a lot of extra serialization of I/Os which I've removed- a lot of
+ * it was to calculate values for the cache code, which is no longer with us.
+ * More types, function, macro cleanup. Added code to properly quiesce the array
+ * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
+ * before. Fixed memory allocation, freeing bugs.
+ *
+ * Revision 1.19 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.18 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.17 1996/05/24 01:59:45 jimz
+ * another checkpoint in code cleanup for release
+ * time to sync kernel tree
+ *
+ * Revision 1.16 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.15 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.14 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.13 1996/05/02 14:57:43 jimz
+ * initialize sectorMask
+ *
+ * Revision 1.12 1995/12/01 15:57:04 root
+ * added copyright info
+ *
+ */
+
+#include "rf_types.h"
+#include "rf_raid.h"
+#include "rf_alloclist.h"
+#include "rf_utils.h"
+#include "rf_configure.h"
+#include "rf_general.h"
+#if !defined(__NetBSD__) && !defined(__OpenBSD__)
+#include "rf_camlayer.h"
+#endif
+#include "rf_options.h"
+#include "rf_sys.h"
+
+#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL)
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/ioctl.h>
+#include <sys/fcntl.h>
+#ifdef __NETBSD__
+#include <sys/vnode.h>
+#endif
+
+int raidlookup __P((char *, struct proc *p, struct vnode **));
+#endif
+
+#ifdef SIMULATE
+static char disk_db_file_name[120], disk_type_name[120];
+static double init_offset;
+#endif /* SIMULATE */
+
+#define DPRINTF6(a,b,c,d,e,f) if (rf_diskDebug) printf(a,b,c,d,e,f)
+#define DPRINTF7(a,b,c,d,e,f,g) if (rf_diskDebug) printf(a,b,c,d,e,f,g)
+
+#include "rf_ccmn.h"
+
+/****************************************************************************************
+ *
+ * initialize the disks comprising the array
+ *
+ * We want the spare disks to have regular row,col numbers so that we can easily
+ * substitute a spare for a failed disk. But, the driver code assumes throughout
+ * that the array contains numRow by numCol _non-spare_ disks, so it's not clear
+ * how to fit in the spares. This is an unfortunate holdover from raidSim. The
+ * quick and dirty fix is to make row zero bigger than the rest, and put all the
+ * spares in it. This probably needs to get changed eventually.
+ *
+ ***************************************************************************************/
+int rf_ConfigureDisks(
+ RF_ShutdownList_t **listp,
+ RF_Raid_t *raidPtr,
+ RF_Config_t *cfgPtr)
+{
+ RF_RaidDisk_t **disks;
+ RF_SectorCount_t min_numblks = (RF_SectorCount_t)0x7FFFFFFFFFFFLL;
+ RF_RowCol_t r, c;
+ int bs, ret;
+ unsigned i, count, foundone=0, numFailuresThisRow;
+ RF_DiskOp_t *rdcap_op = NULL, *tur_op = NULL;
+ int num_rows_done,num_cols_done;
+
+#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL)
+ struct proc *proc = 0;
+#endif
+#ifndef SIMULATE
+#if !defined(__NetBSD__) && !defined(__OpenBSD__)
+ ret = rf_SCSI_AllocReadCapacity(&rdcap_op);
+ if (ret)
+ goto fail;
+ ret = rf_SCSI_AllocTUR(&tur_op);
+ if (ret)
+ goto fail;
+#endif /* !__NetBSD__ && !__OpenBSD__ */
+#endif /* !SIMULATE */
+
+ num_rows_done = 0;
+ num_cols_done = 0;
+
+
+ RF_CallocAndAdd(disks, raidPtr->numRow, sizeof(RF_RaidDisk_t *), (RF_RaidDisk_t **), raidPtr->cleanupList);
+ if (disks == NULL) {
+ ret = ENOMEM;
+ goto fail;
+ }
+ raidPtr->Disks = disks;
+
+#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL)
+
+ proc = raidPtr->proc; /* Blah XXX */
+
+ /* get space for the device-specific stuff... */
+ RF_CallocAndAdd(raidPtr->raid_cinfo, raidPtr->numRow,
+ sizeof(struct raidcinfo *), (struct raidcinfo **),
+ raidPtr->cleanupList);
+ if (raidPtr->raid_cinfo == NULL) {
+ ret = ENOMEM;
+ goto fail;
+ }
+#endif
+
+ for (r=0; r<raidPtr->numRow; r++) {
+ numFailuresThisRow = 0;
+ RF_CallocAndAdd(disks[r], raidPtr->numCol + ((r==0) ? raidPtr->numSpare : 0), sizeof(RF_RaidDisk_t), (RF_RaidDisk_t *), raidPtr->cleanupList);
+ if (disks[r] == NULL) {
+ ret = ENOMEM;
+ goto fail;
+ }
+
+ /* get more space for device specific stuff.. */
+ RF_CallocAndAdd(raidPtr->raid_cinfo[r],
+ raidPtr->numCol + ((r==0) ? raidPtr->numSpare : 0),
+ sizeof(struct raidcinfo), (struct raidcinfo *),
+ raidPtr->cleanupList);
+ if (raidPtr->raid_cinfo[r] == NULL) {
+ ret = ENOMEM;
+ goto fail;
+ }
+
+
+ for (c=0; c<raidPtr->numCol; c++) {
+ ret = rf_ConfigureDisk(raidPtr,&cfgPtr->devnames[r][c][0],
+ &disks[r][c], rdcap_op, tur_op,
+ cfgPtr->devs[r][c],r,c);
+ if (ret)
+ goto fail;
+ if (disks[r][c].status != rf_ds_optimal) {
+ numFailuresThisRow++;
+ }
+ else {
+ if (disks[r][c].numBlocks < min_numblks)
+ min_numblks = disks[r][c].numBlocks;
+ DPRINTF7("Disk at row %d col %d: dev %s numBlocks %ld blockSize %d (%ld MB)\n",
+ r,c,disks[r][c].devname,
+ (long int) disks[r][c].numBlocks,
+ disks[r][c].blockSize,
+ (long int) disks[r][c].numBlocks * disks[r][c].blockSize / 1024 / 1024);
+ }
+ num_cols_done++;
+ }
+ /* XXX fix for n-fault tolerant */
+ if (numFailuresThisRow > 0)
+ raidPtr->status[r] = rf_rs_degraded;
+ num_rows_done++;
+ }
+#ifndef SIMULATE
+#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL)
+ /* we do nothing */
+#else
+ rf_SCSI_FreeDiskOp(rdcap_op, 1); rdcap_op = NULL;
+ rf_SCSI_FreeDiskOp(tur_op, 0); tur_op = NULL;
+#endif
+#endif /* !SIMULATE */
+ /* all disks must be the same size & have the same block size, bs must be a power of 2 */
+ bs = 0;
+ for (foundone=r=0; !foundone && r<raidPtr->numRow; r++) {
+ for (c=0; !foundone && c<raidPtr->numCol; c++) {
+ if (disks[r][c].status == rf_ds_optimal) {
+ bs = disks[r][c].blockSize;
+ foundone = 1;
+ }
+ }
+ }
+ if (!foundone) {
+ RF_ERRORMSG("RAIDFRAME: Did not find any live disks in the array.\n");
+ ret = EINVAL;
+ goto fail;
+ }
+ for (count = 0, i = 1; i; i <<= 1)
+ if (bs & i)
+ count++;
+ if (count != 1) {
+ RF_ERRORMSG1("Error: block size on disks (%d) must be a power of 2\n",bs);
+ ret = EINVAL;
+ goto fail;
+ }
+ for (r=0; r<raidPtr->numRow; r++) {
+ for (c=0; c<raidPtr->numCol; c++) {
+ if (disks[r][c].status == rf_ds_optimal) {
+ if (disks[r][c].blockSize != bs) {
+ RF_ERRORMSG2("Error: block size of disk at r %d c %d different from disk at r 0 c 0\n",r,c);
+ ret = EINVAL;
+ goto fail;
+ }
+ if (disks[r][c].numBlocks != min_numblks) {
+ RF_ERRORMSG3("WARNING: truncating disk at r %d c %d to %d blocks\n",
+ r,c,(int) min_numblks);
+ disks[r][c].numBlocks = min_numblks;
+ }
+ }
+ }
+ }
+
+ raidPtr->sectorsPerDisk = min_numblks;
+ raidPtr->logBytesPerSector = ffs(bs) - 1;
+ raidPtr->bytesPerSector = bs;
+ raidPtr->sectorMask = bs-1;
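+ /*
+ * Editor's example: for the common bs = 512 this gives
+ * logBytesPerSector = ffs(512) - 1 = 9 and sectorMask = 0x1ff,
+ * which is why bs is required to be a power of two above.
+ */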
+ return(0);
+
+fail:
+
+#ifndef SIMULATE
+#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL)
+
+ for(r=0;r<raidPtr->numRow;r++) {
+ for(c=0;c<raidPtr->numCol;c++) {
+ /* Cleanup.. */
+#ifdef DEBUG
+ printf("Cleaning up row: %d col: %d\n",r,c);
+#endif
+ if (raidPtr->raid_cinfo[r][c].ci_vp) {
+ (void)vn_close(raidPtr->raid_cinfo[r][c].ci_vp,
+ FREAD|FWRITE, proc->p_ucred, proc);
+ }
+ }
+ }
+ /* Space allocated for raid_vpp will get cleaned up at some other point */
+ /* XXX Need more #ifdefs in the above... */
+
+#else
+
+ if (rdcap_op) rf_SCSI_FreeDiskOp(rdcap_op, 1);
+ if (tur_op) rf_SCSI_FreeDiskOp(tur_op, 0);
+
+#endif
+#endif /* !SIMULATE */
+ return(ret);
+}
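+
+/*
+ * Editor's note: because row zero is over-allocated above, spare j is
+ * addressed as raidPtr->Disks[0][raidPtr->numCol + j]; the spare
+ * configuration code below relies on exactly this layout.
+ */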
+
+
+/****************************************************************************************
+ * set up the data structures describing the spare disks in the array
+ * recall from the above comment that the spare disk descriptors are stored
+ * in row zero, which is specially expanded to hold them.
+ ***************************************************************************************/
+int rf_ConfigureSpareDisks(
+ RF_ShutdownList_t **listp,
+ RF_Raid_t *raidPtr,
+ RF_Config_t *cfgPtr)
+{
+ char buf[256];
+ int r,c,i, ret;
+ RF_DiskOp_t *rdcap_op = NULL, *tur_op = NULL;
+ unsigned bs;
+ RF_RaidDisk_t *disks;
+ int num_spares_done;
+
+#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL)
+ struct proc *proc;
+#endif
+
+#ifndef SIMULATE
+#if !defined(__NetBSD__) && !defined(__OpenBSD__)
+ ret = rf_SCSI_AllocReadCapacity(&rdcap_op);
+ if (ret)
+ goto fail;
+ ret = rf_SCSI_AllocTUR(&tur_op);
+ if (ret)
+ goto fail;
+#endif /* !__NetBSD__ && !__OpenBSD__ */
+#endif /* !SIMULATE */
+
+ num_spares_done = 0;
+
+#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL)
+ proc = raidPtr->proc;
+ /* The space for the spares should have already been
+ allocated by ConfigureDisks() */
+#endif
+
+ disks = &raidPtr->Disks[0][raidPtr->numCol];
+ for (i=0; i<raidPtr->numSpare; i++) {
+ ret = rf_ConfigureDisk(raidPtr,&cfgPtr->spare_names[i][0],
+ &disks[i], rdcap_op, tur_op,
+ cfgPtr->spare_devs[i],0,raidPtr->numCol+i);
+ if (ret)
+ goto fail;
+ if (disks[i].status != rf_ds_optimal) {
+ RF_ERRORMSG1("Warning: spare disk %s failed TUR\n",buf);
+ } else {
+ disks[i].status = rf_ds_spare; /* change status to spare */
+ DPRINTF6("Spare Disk %d: dev %s numBlocks %ld blockSize %d (%ld MB)\n",i,
+ disks[i].devname,
+ (long int) disks[i].numBlocks,disks[i].blockSize,
+ (long int) disks[i].numBlocks * disks[i].blockSize / 1024 / 1024);
+ }
+ num_spares_done++;
+ }
+#ifndef SIMULATE
+#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL)
+
+#else
+ rf_SCSI_FreeDiskOp(rdcap_op, 1); rdcap_op = NULL;
+ rf_SCSI_FreeDiskOp(tur_op, 0); tur_op = NULL;
+#endif
+#endif /* !SIMULATE */
+
+ /* check sizes and block sizes on spare disks */
+ bs = 1 << raidPtr->logBytesPerSector;
+ for (i=0; i<raidPtr->numSpare; i++) {
+ if (disks[i].blockSize != bs) {
+ RF_ERRORMSG3("Block size of %d on spare disk %s is not the same as on other disks (%d)\n",disks[i].blockSize, disks[i].devname, bs);
+ ret = EINVAL;
+ goto fail;
+ }
+ if (disks[i].numBlocks < raidPtr->sectorsPerDisk) {
+ RF_ERRORMSG3("Spare disk %s (%d blocks) is too small to serve as a spare (need %ld blocks)\n",
+ disks[i].devname, (int) disks[i].numBlocks, (long int)raidPtr->sectorsPerDisk);
+ ret = EINVAL;
+ goto fail;
+ } else if (disks[i].numBlocks > raidPtr->sectorsPerDisk) {
+ RF_ERRORMSG2("Warning: truncating spare disk %s to %ld blocks\n",disks[i].devname, (long int) raidPtr->sectorsPerDisk);
+
+ disks[i].numBlocks = raidPtr->sectorsPerDisk;
+ }
+ }
+
+ return(0);
+
+fail:
+#ifndef SIMULATE
+#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL)
+
+ /* Release the hold on the main components. We've failed to allocate a
+ spare, and since we're failing, we need to free things.. */
+
+ for(r=0;r<raidPtr->numRow;r++) {
+ for(c=0;c<raidPtr->numCol;c++) {
+ /* Cleanup.. */
+#ifdef DEBUG
+ printf("Cleaning up row: %d col: %d\n",r,c);
+#endif
+ if (raidPtr->raid_cinfo[r][c].ci_vp) {
+ (void)vn_close(raidPtr->raid_cinfo[r][c].ci_vp,
+ FREAD|FWRITE, proc->p_ucred, proc);
+ }
+ }
+ }
+
+ for(i=0;i<raidPtr->numSpare;i++) {
+ /* Cleanup.. */
+#ifdef DEBUG
+ printf("Cleaning up spare: %d\n",i);
+#endif
+ if (raidPtr->raid_cinfo[0][raidPtr->numCol+i].ci_vp) {
+ (void)vn_close(raidPtr->raid_cinfo[0][raidPtr->numCol+i].ci_vp,
+ FREAD|FWRITE, proc->p_ucred, proc);
+ }
+ }
+
+#else
+
+ if (rdcap_op) rf_SCSI_FreeDiskOp(rdcap_op, 1);
+ if (tur_op) rf_SCSI_FreeDiskOp(tur_op, 0);
+
+#endif
+
+#endif /* !SIMULATE */
+ return(ret);
+}
+
+
+
+/* configure a single disk in the array */
+int rf_ConfigureDisk(raidPtr, buf, diskPtr, rdcap_op, tur_op, dev, row, col)
+ RF_Raid_t *raidPtr; /* We need this down here too!! GO */
+ char *buf;
+ RF_RaidDisk_t *diskPtr;
+ RF_DiskOp_t *rdcap_op;
+ RF_DiskOp_t *tur_op;
+ dev_t dev; /* device number used only in kernel */
+ RF_RowCol_t row;
+ RF_RowCol_t col;
+{
+ char *p;
+#ifdef SIMULATE
+ double init_offset;
+#else /* SIMULATE */
+#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL)
+ int retcode;
+#else
+ int busid, targid, lun, retcode;
+#endif
+#endif /* SIMULATE */
+
+#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL)
+ struct partinfo dpart;
+ struct vnode *vp;
+ struct vattr va;
+ struct proc *proc;
+ int error;
+#endif
+
+retcode = 0;
+ p = rf_find_non_white(buf);
+ if (p[strlen(p)-1] == '\n') {
+ /* strip off the newline */
+ p[strlen(p)-1] = '\0';
+ }
+ (void) strcpy(diskPtr->devname, p);
+
+#ifdef SIMULATE
+
+ init_offset = 0.0;
+ rf_InitDisk(&diskPtr->diskState, disk_db_file_name,diskPtr->devname,0,0,init_offset,row,col);
+ rf_GeometryDoReadCapacity(&diskPtr->diskState, &diskPtr->numBlocks, &diskPtr->blockSize);
+ diskPtr->numBlocks = diskPtr->numBlocks * rf_sizePercentage / 100;
+
+ /* we allow the user to specify that only a fraction of the disks should be used;
+ * this is just for debugging: it speeds up the parity scan
+ */
+
+#else /* SIMULATE */
+#if !defined(__NetBSD__) && !defined(__OpenBSD__)
+ /* get bus, target, lun */
+ retcode = rf_extract_ids(p, &busid, &targid, &lun);
+ if (retcode)
+ return(retcode);
+
+ /* required in kernel, nop at user level */
+ retcode = rf_SCSI_OpenUnit(dev);
+ if (retcode)
+ return(retcode);
+
+ diskPtr->dev = dev;
+ if (rf_SCSI_DoTUR(tur_op, (u_char)busid, (u_char)targid, (u_char)lun, dev)) {
+ RF_ERRORMSG1("Disk %s failed TUR. Marked as dead.\n",diskPtr->devname);
+ diskPtr->status = rf_ds_failed;
+ } else {
+ diskPtr->status = rf_ds_optimal;
+ retcode = rf_SCSI_DoReadCapacity(raidPtr,rdcap_op, busid, targid, lun, dev,
+ &diskPtr->numBlocks, &diskPtr->blockSize, diskPtr->devname);
+ if (retcode)
+ return(retcode);
+
+ /* we allow the user to specify that only a fraction of the disks should be used;
+ * this is just for debugging: it speeds up the parity scan
+ */
+ diskPtr->numBlocks = diskPtr->numBlocks * rf_sizePercentage / 100;
+ }
+#endif
+#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL)
+
+ proc = raidPtr->proc; /* XXX Yes, this is not nice.. */
+
+ /* Let's start by claiming the component is fine and well... */
+ /* XXX not the case if the disk is toast.. */
+ diskPtr->status = rf_ds_optimal;
+
+
+ raidPtr->raid_cinfo[row][col].ci_vp = NULL;
+ raidPtr->raid_cinfo[row][col].ci_dev = NULL;
+
+ error = raidlookup(diskPtr->devname, proc, &vp);
+ if (error) {
+ printf("raidlookup on device: %s failed!\n",diskPtr->devname);
+ if (error == ENXIO) {
+ /* XXX the component isn't there... must be dead :-( */
+ diskPtr->status = rf_ds_failed;
+ } else {
+ return(error);
+ }
+ }
+
+ if (diskPtr->status == rf_ds_optimal) {
+
+ if ((error = VOP_GETATTR(vp, &va, proc->p_ucred, proc)) != 0) {
+ return(error);
+ }
+
+ error = VOP_IOCTL(vp, DIOCGPART, (caddr_t)&dpart,
+ FREAD, proc->p_ucred, proc);
+ if (error) {
+ return(error);
+ }
+
+
+ diskPtr->blockSize = dpart.disklab->d_secsize;
+
+ diskPtr->numBlocks = dpart.part->p_size - rf_protectedSectors;
+
+ raidPtr->raid_cinfo[row][col].ci_vp = vp;
+ raidPtr->raid_cinfo[row][col].ci_dev = va.va_rdev;
+
+#if 0
+ diskPtr->dev = dev;
+#endif
+
+ diskPtr->dev = va.va_rdev; /* XXX or the above? */
+
+ /* we allow the user to specify that only a fraction of the disks should be used;
+ * this is just for debugging: it speeds up the parity scan
+ */
+ diskPtr->numBlocks = diskPtr->numBlocks * rf_sizePercentage / 100;
+
+ }
+
+#endif /* (__NetBSD__ || __OpenBSD__) && _KERNEL */
+#endif /* SIMULATE */
+
+ return(0);
+}
+
+#ifdef SIMULATE
+
+void rf_default_disk_names()
+{
+ sprintf(disk_db_file_name,"disk.db");
+ sprintf(disk_type_name,"HP2247");
+}
+
+void rf_set_disk_db_name(s)
+ char *s;
+{
+ strcpy(disk_db_file_name,s);
+}
+
+void rf_set_disk_type_name(s)
+ char *s;
+{
+ strcpy(disk_type_name,s);
+}
+
+#endif /* SIMULATE */
diff --git a/sys/dev/raidframe/rf_disks.h b/sys/dev/raidframe/rf_disks.h
new file mode 100644
index 00000000000..8857391a8bd
--- /dev/null
+++ b/sys/dev/raidframe/rf_disks.h
@@ -0,0 +1,161 @@
+/* $OpenBSD: rf_disks.h,v 1.1 1999/01/11 14:29:18 niklas Exp $ */
+/* $NetBSD: rf_disks.h,v 1.1 1998/11/13 04:20:29 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*
+ * rf_disks.h -- header file for code related to physical disks
+ */
+
+/* :
+ * Log: rf_disks.h,v
+ * Revision 1.15 1996/08/20 23:05:13 jimz
+ * add nreads, nwrites to RaidDisk
+ *
+ * Revision 1.14 1996/06/17 03:20:15 jimz
+ * increase devname len to 56
+ *
+ * Revision 1.13 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.12 1996/06/09 02:36:46 jimz
+ * lots of little crufty cleanup- fixup whitespace
+ * issues, comment #ifdefs, improve typing in some
+ * places (esp size-related)
+ *
+ * Revision 1.11 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.10 1996/05/30 11:29:41 jimz
+ * Numerous bug fixes. Stripe lock release code disagreed with the taking code
+ * about when stripes should be locked (I made it consistent: no parity, no lock)
+ * There was a lot of extra serialization of I/Os which I've removed- a lot of
+ * it was to calculate values for the cache code, which is no longer with us.
+ * More types, function, macro cleanup. Added code to properly quiesce the array
+ * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
+ * before. Fixed memory allocation, freeing bugs.
+ *
+ * Revision 1.9 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.8 1996/05/24 01:59:45 jimz
+ * another checkpoint in code cleanup for release
+ * time to sync kernel tree
+ *
+ * Revision 1.7 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.6 1996/05/02 22:06:57 jimz
+ * add RF_RaidDisk_t
+ *
+ * Revision 1.5 1995/12/01 15:56:53 root
+ * added copyright info
+ *
+ */
+
+#ifndef _RF__RF_DISKS_H_
+#define _RF__RF_DISKS_H_
+
+#include <sys/types.h>
+
+#include "rf_archs.h"
+#include "rf_types.h"
+#ifdef SIMULATE
+#include "rf_geometry.h"
+#endif /* SIMULATE */
+
+/*
+ * A physical disk can be in one of several states:
+ * IF YOU ADD A STATE, CHECK TO SEE IF YOU NEED TO MODIFY RF_DEAD_DISK() BELOW.
+ */
+enum RF_DiskStatus_e {
+ rf_ds_optimal, /* no problems */
+ rf_ds_failed, /* disk has failed; not yet reconstructed */
+ rf_ds_reconstructing, /* reconstruction ongoing */
+ rf_ds_dist_spared, /* reconstruction complete to distributed spare space, dead disk not yet replaced */
+ rf_ds_spared, /* reconstruction complete to a spare disk, dead disk not yet replaced */
+ rf_ds_spare, /* an available spare disk */
+ rf_ds_used_spare /* a spare which has been used, and hence is not available */
+};
+typedef enum RF_DiskStatus_e RF_DiskStatus_t;
+
+struct RF_RaidDisk_s {
+ char devname[56]; /* name of device file */
+ RF_DiskStatus_t status; /* whether it is up or down */
+ RF_RowCol_t spareRow; /* if in status "spared", this identifies the spare disk */
+ RF_RowCol_t spareCol; /* if in status "spared", this identifies the spare disk */
+ RF_SectorCount_t numBlocks; /* number of blocks, obtained via READ CAPACITY */
+ int blockSize;
+ /* XXX the following is needed since we seem to need SIMULATE defined
+ in order to get user-land stuff to compile, but we *don't* want
+ this in the structure for the user-land utilities, as the
+ kernel doesn't know about it!! (and it messes up the size of
+ the structure, so there is a communication problem between
+ the kernel and the userland utils :-( GO */
+#if defined(SIMULATE) && !defined(RF_UTILITY)
+ RF_DiskState_t diskState; /* the name of the disk as used in the disk module */
+#endif /* SIMULATE */
+#if RF_KEEP_DISKSTATS > 0
+ RF_uint64 nreads;
+ RF_uint64 nwrites;
+#endif /* RF_KEEP_DISKSTATS > 0 */
+ dev_t dev;
+};
+
+/*
+ * An RF_DiskOp_t ptr is really a pointer to a UAGT_CCB, but I want
+ * to isolate the cam layer from all other layers, so I typecast to/from
+ * RF_DiskOp_t * (i.e. void *) at the interfaces.
+ */
+typedef void RF_DiskOp_t;
+
+/* if a disk is in any of these states, it is inaccessible */
+#define RF_DEAD_DISK(_dstat_) (((_dstat_) == rf_ds_spared) || \
+ ((_dstat_) == rf_ds_reconstructing) || ((_dstat_) == rf_ds_failed) || \
+ ((_dstat_) == rf_ds_dist_spared))
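+
+/*
+ * Editor's note: a typical use is
+ *	if (RF_DEAD_DISK(raidPtr->Disks[row][col].status)) ...
+ * to decide whether an access must be redirected (to a spare, or served in
+ * degraded mode) rather than sent to the original disk.
+ */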
+
+int rf_ConfigureDisks(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr,
+ RF_Config_t *cfgPtr);
+int rf_ConfigureSpareDisks(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr,
+ RF_Config_t *cfgPtr);
+int rf_ConfigureDisk(RF_Raid_t *raidPtr, char *buf, RF_RaidDisk_t *diskPtr,
+ RF_DiskOp_t *rdcap_op, RF_DiskOp_t *tur_op, dev_t dev,
+ RF_RowCol_t row, RF_RowCol_t col);
+
+#ifdef SIMULATE
+void rf_default_disk_names(void);
+void rf_set_disk_db_name(char *s);
+void rf_set_disk_type_name(char *s);
+#endif /* SIMULATE */
+
+#endif /* !_RF__RF_DISKS_H_ */
diff --git a/sys/dev/raidframe/rf_diskthreads.h b/sys/dev/raidframe/rf_diskthreads.h
new file mode 100644
index 00000000000..60181759b6d
--- /dev/null
+++ b/sys/dev/raidframe/rf_diskthreads.h
@@ -0,0 +1,103 @@
+/* $OpenBSD: rf_diskthreads.h,v 1.1 1999/01/11 14:29:18 niklas Exp $ */
+/* $NetBSD: rf_diskthreads.h,v 1.1 1998/11/13 04:20:29 oster Exp $ */
+/*
+ * rf_diskthreads.h
+ */
+/*
+ * Copyright (c) 1996 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+/*
+ * :
+ * Log: rf_diskthreads.h,v
+ * Revision 1.7 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.6 1996/06/09 02:36:46 jimz
+ * lots of little crufty cleanup- fixup whitespace
+ * issues, comment #ifdefs, improve typing in some
+ * places (esp size-related)
+ *
+ * Revision 1.5 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.4 1996/05/30 23:22:16 jimz
+ * bugfixes of serialization, timing problems
+ * more cleanup
+ *
+ * Revision 1.3 1996/05/30 11:29:41 jimz
+ * Numerous bug fixes. Stripe lock release code disagreed with the taking code
+ * about when stripes should be locked (I made it consistent: no parity, no lock)
+ * There was a lot of extra serialization of I/Os which I've removed- a lot of
+ * it was to calculate values for the cache code, which is no longer with us.
+ * More types, function, macro cleanup. Added code to properly quiesce the array
+ * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
+ * before. Fixed memory allocation, freeing bugs.
+ *
+ * Revision 1.2 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.1 1996/05/18 19:55:58 jimz
+ * Initial revision
+ *
+ */
+/*
+ * rf_diskthreads.h -- types and prototypes for disk thread system
+ */
+
+#ifndef _RF__RF_DISKTHREADS_H_
+#define _RF__RF_DISKTHREADS_H_
+
+#include "rf_types.h"
+
+/* this is the information that a disk thread needs to do its job */
+struct RF_DiskId_s {
+ RF_DiskQueue_t *queue;
+ RF_Raid_t *raidPtr;
+ RF_RaidDisk_t *disk;
+ int fd; /* file descriptor */
+ RF_RowCol_t row, col; /* debug only */
+#ifdef SIMULATE
+ int state;
+#endif /* SIMULATE */
+};
+
+int rf_ConfigureDiskThreads(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr,
+ RF_Config_t *cfgPtr);
+
+#ifdef SIMULATE
+int rf_SetDiskIdle(RF_Raid_t *raidPtr, RF_RowCol_t r, RF_RowCol_t c);
+int rf_ScanDiskQueues(RF_Raid_t *raidPtr);
+void rf_simulator_complete_io(RF_DiskId_t *id);
+void rf_PrintDiskStat(RF_Raid_t *raidPtr);
+#else /* SIMULATE */
+int rf_ShutdownDiskThreads(RF_Raid_t *raidPtr);
+#endif /* SIMULATE */
+
+#endif /* !_RF__RF_DISKTHREADS_H_ */
diff --git a/sys/dev/raidframe/rf_driver.c b/sys/dev/raidframe/rf_driver.c
new file mode 100644
index 00000000000..f8db8f5baf0
--- /dev/null
+++ b/sys/dev/raidframe/rf_driver.c
@@ -0,0 +1,1765 @@
+/* $OpenBSD: rf_driver.c,v 1.1 1999/01/11 14:29:18 niklas Exp $ */
+/* $NetBSD: rf_driver.c,v 1.2 1998/11/13 13:45:15 drochner Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland, Khalil Amiri, Claudson Bornstein, William V. Courtright II,
+ * Robby Findler, Daniel Stodolsky, Rachad Youssef, Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/******************************************************************************
+ *
+ * rf_driver.c -- main setup, teardown, and access routines for the RAID driver
+ *
+ * all routines are prefixed with rf_ (raidframe), to avoid conflicts.
+ *
+ ******************************************************************************/
+
+/*
+ * :
+ * Log: rf_driver.c,v
+ * Revision 1.147 1996/08/21 04:12:46 jimz
+ * added hook for starting out req_hist w/ more distributed values
+ * (currently not done)
+ *
+ * Revision 1.146 1996/07/29 14:05:12 jimz
+ * fix numPUs/numRUs confusion (everything is now numRUs)
+ * clean up some commenting, return values
+ *
+ * Revision 1.145 1996/07/28 20:31:39 jimz
+ * i386netbsd port
+ * true/false fixup
+ *
+ * Revision 1.144 1996/07/27 18:40:24 jimz
+ * cleanup sweep
+ *
+ * Revision 1.143 1996/07/22 21:11:53 jimz
+ * fix formatting on DoAccess error msg
+ *
+ * Revision 1.142 1996/07/19 16:10:06 jimz
+ * added call to rf_ResetDebugOptions() in rf_ConfigureDebug()
+ *
+ * Revision 1.141 1996/07/18 22:57:14 jimz
+ * port simulator to AIX
+ *
+ * Revision 1.140 1996/07/17 21:00:58 jimz
+ * clean up timer interface, tracing
+ *
+ * Revision 1.139 1996/07/15 05:40:41 jimz
+ * some recon datastructure cleanup
+ * better handling of multiple failures
+ * added undocumented double-recon test
+ *
+ * Revision 1.138 1996/07/11 19:08:00 jimz
+ * generalize reconstruction mechanism
+ * allow raid1 reconstructs via copyback (done with array
+ * quiesced, not online, therefore not disk-directed)
+ *
+ * Revision 1.137 1996/07/10 22:28:00 jimz
+ * get rid of obsolete row statuses (dead,degraded2)
+ *
+ * Revision 1.136 1996/06/17 14:38:33 jimz
+ * properly #if out RF_DEMO code
+ * fix bug in MakeConfig that was causing weird behavior
+ * in configuration routines (config was not zeroed at start)
+ * clean up genplot handling of stacks
+ *
+ * Revision 1.135 1996/06/17 03:20:32 jimz
+ * move out raidframe_attr_default
+ * don't monkey with stack sizes
+ *
+ * Revision 1.134 1996/06/14 23:15:38 jimz
+ * attempt to deal with thread GC problem
+ *
+ * Revision 1.133 1996/06/14 21:24:08 jimz
+ * new ConfigureEtimer init
+ * moved out timer vars
+ *
+ * Revision 1.132 1996/06/14 16:19:03 jimz
+ * remove include of pdllib.h (beginning of PDL cleanup)
+ *
+ * Revision 1.131 1996/06/14 14:35:24 jimz
+ * clean up dfstrace protection
+ *
+ * Revision 1.130 1996/06/14 14:16:09 jimz
+ * engine config is now array-specific
+ *
+ * Revision 1.129 1996/06/13 19:08:10 jimz
+ * add debug var to force keep_acc_totals on
+ *
+ * Revision 1.128 1996/06/11 10:57:08 jimz
+ * init recon_done_proc_mutex
+ *
+ * Revision 1.127 1996/06/10 14:18:58 jimz
+ * move user, throughput stats into per-array structure
+ *
+ * Revision 1.126 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.125 1996/06/09 02:36:46 jimz
+ * lots of little crufty cleanup- fixup whitespace
+ * issues, comment #ifdefs, improve typing in some
+ * places (esp size-related)
+ *
+ * Revision 1.124 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.123 1996/06/05 19:38:32 jimz
+ * fixed up disk queueing types config
+ * added sstf disk queueing
+ * fixed exit bug on diskthreads (ref-ing bad mem)
+ *
+ * Revision 1.122 1996/06/05 18:06:02 jimz
+ * Major code cleanup. The Great Renaming is now done.
+ * Better modularity. Better typing. Fixed a bunch of
+ * synchronization bugs. Made a lot of global stuff
+ * per-desc or per-array. Removed dead code.
+ *
+ * Revision 1.121 1996/06/03 23:28:26 jimz
+ * more bugfixes
+ * check in tree to sync for IPDS runs with current bugfixes
+ * there still may be a problem with threads in the script test
+ * getting I/Os stuck- not trivially reproducible (runs ~50 times
+ * in a row without getting stuck)
+ *
+ * Revision 1.120 1996/06/02 17:31:48 jimz
+ * Moved a lot of global stuff into array structure, where it belongs.
+ * Fixed up paritylogging, pss modules in this manner. Some general
+ * code cleanup. Removed lots of dead code, some dead files.
+ *
+ * Revision 1.119 1996/05/31 22:26:54 jimz
+ * fix a lot of mapping problems, memory allocation problems
+ * found some weird lock issues, fixed 'em
+ * more code cleanup
+ *
+ * Revision 1.118 1996/05/30 23:22:16 jimz
+ * bugfixes of serialization, timing problems
+ * more cleanup
+ *
+ * Revision 1.117 1996/05/30 16:28:33 jimz
+ * typo in rf_SignalQuiescenceLock() fixed
+ *
+ * Revision 1.116 1996/05/30 12:59:18 jimz
+ * make etimer happier, more portable
+ *
+ * Revision 1.115 1996/05/30 11:29:41 jimz
+ * Numerous bug fixes. Stripe lock release code disagreed with the taking code
+ * about when stripes should be locked (I made it consistent: no parity, no lock)
+ * There was a lot of extra serialization of I/Os which I've removed- a lot of
+ * it was to calculate values for the cache code, which is no longer with us.
+ * More types, function, macro cleanup. Added code to properly quiesce the array
+ * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
+ * before. Fixed memory allocation, freeing bugs.
+ *
+ * Revision 1.114 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.113 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.112 1996/05/24 04:28:55 jimz
+ * release cleanup ckpt
+ *
+ * Revision 1.111 1996/05/24 01:59:45 jimz
+ * another checkpoint in code cleanup for release
+ * time to sync kernel tree
+ *
+ * Revision 1.110 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.109 1996/05/23 00:39:56 jimz
+ * demoMode -> rf_demoMode
+ *
+ * Revision 1.108 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.107 1996/05/21 14:30:04 jimz
+ * idler_desc_mutex should be ifndef SIMULATE
+ *
+ * Revision 1.106 1996/05/20 19:31:12 jimz
+ * add atomic debug (mutex and cond leak finder) stuff
+ *
+ * Revision 1.105 1996/05/20 16:12:45 jimz
+ * switch to rf_{mutex,cond}_{init,destroy}
+ *
+ * Revision 1.104 1996/05/18 20:09:41 jimz
+ * bit of cleanup to compile cleanly in kernel, once again
+ *
+ * Revision 1.103 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.102 1996/05/16 21:20:51 jimz
+ * use FREELIST stuff to manage access descriptors
+ *
+ * Revision 1.101 1996/05/16 14:21:10 jimz
+ * remove bogus copies from write path on user
+ *
+ * Revision 1.100 1996/05/15 22:33:54 jimz
+ * appropriately #ifdef cache stuff
+ *
+ * Revision 1.99 1996/05/08 21:34:41 jimz
+ * #if 0 ShutdownCache() and ConfigureCache()
+ *
+ * Revision 1.98 1996/05/08 21:01:24 jimz
+ * fixed up enum type names that were conflicting with other
+ * enums and function names (ie, "panic")
+ * future naming trends will be towards RF_ and rf_ for
+ * everything raidframe-related
+ *
+ * Revision 1.97 1996/05/07 19:02:58 wvcii
+ * corrected header comment of rf_DoAccess()
+ * reordered free of desc in FreeRaidAccDesc() The desc is now
+ * freed last.
+ *
+ * Revision 1.96 1996/05/07 17:40:50 jimz
+ * add doDebug
+ *
+ * Revision 1.95 1996/05/06 21:35:23 jimz
+ * fixed ordering of cleanup and removed extra decrement of configureCount
+ *
+ * Revision 1.94 1996/05/06 18:44:14 jimz
+ * reorder cleanup to not blow alloclist out from under various modules
+ * zero raidPtr contents on config
+ *
+ * Revision 1.93 1996/05/04 17:06:53 jimz
+ * Fail the I/O with ENOSPC if reading past end of the array in the kernel.
+ *
+ * Revision 1.92 1996/05/03 19:44:22 wvcii
+ * debug vars degDagDebug and enableAtomicRMW now defined
+ * in this file.
+ *
+ * Revision 1.91 1995/12/12 18:10:06 jimz
+ * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT
+ * fix 80-column brain damage in comments
+ *
+ * Revision 1.90 1995/12/08 15:07:03 arw
+ * cache code cleanup
+ *
+ * Revision 1.89 1995/12/06 20:53:58 wvcii
+ * created debug var forceParityLogReint
+ * this variable forces reintegration of all parity logs at shutdown
+ *
+ * Revision 1.88 1995/12/01 15:59:10 root
+ * added copyright info
+ *
+ * Revision 1.87 1995/11/28 21:34:02 amiri
+ * modified SetReconfiguredMode so that it installs the
+ * spare table only if arch is declustered based on block designs
+ *
+ * Revision 1.86 1995/11/21 23:06:11 amiri
+ * added division by zero check in printing
+ * throughput stats.
+ *
+ * Revision 1.85 1995/11/19 16:27:25 wvcii
+ * disableParityVerify now defined locally, only read from config
+ * file for !KERNEL compiles
+ *
+ * Revision 1.84 1995/11/17 15:08:31 wvcii
+ * added debug var disableParityVerify
+ * used in RealLoopTest to disable parity verification
+ *
+ * Revision 1.83 1995/11/07 15:48:43 wvcii
+ * deleted debug vars: suppressAtomicRMW, enableRollAway, concatDagDebug
+ * deleted debug vars: debugSelectUnit, debugSelectBlock
+ * added debug var: enableAtomicRMW
+ *
+ * Revision 1.82 1995/10/18 19:28:45 amiri
+ * added support for reconstruction demos in the
+ * simulator, by updating some simulator
+ * variables in Faildisk.
+ *
+ * Revision 1.81 1995/10/09 18:36:33 jimz
+ * move rf_StopThroughputStats() into FreeAccDesc()
+ * changed throughput output print format
+ * added user-level copy to write path to emulate kernel hack
+ *
+ * Revision 1.80 1995/10/09 18:07:47 wvcii
+ * moved call to rf_StopThroughputStats to rf_states.c
+ *
+ * Revision 1.79 1995/10/09 17:38:53 jimz
+ * quiesce an array for user-level testing before shutting it down
+ * (should this also be done in the kernel?)
+ *
+ * Revision 1.78 1995/10/09 15:35:43 wvcii
+ * added code to measure throughput in user mode
+ *
+ * Revision 1.77 1995/10/05 06:18:59 jimz
+ * Changed DDEventRequest() to take additional arg, used by simulator
+ * to cache diskid so queue length can be decremented on io complete
+ * (this is a hack to get around the fact that the event mechanism
+ * assumes it can dereference arbitrary handles on enqueued events)
+ *
+ * Revision 1.76 1995/10/04 07:25:10 jimz
+ * turn off bigstacks by default
+ *
+ * Revision 1.75 1995/10/04 07:24:34 jimz
+ * code for bigstacks in user process
+ *
+ * Revision 1.74 1995/09/26 21:42:51 wvcii
+ * removed calls to ConfigureCache, ShutdownCache when building kernel
+ * kernel currently does not support any cached architectures
+ *
+ * Revision 1.73 1995/09/20 21:05:35 jimz
+ * add missing unit arg to IO_BUF_ERR() in non-kernel case
+ *
+ * Revision 1.72 1995/09/19 23:02:44 jimz
+ * call RF_DKU_END_IO in the appropriate places
+ *
+ * Revision 1.71 1995/09/07 19:02:31 jimz
+ * mods to get raidframe to compile and link
+ * in kernel environment
+ *
+ * Revision 1.70 1995/09/06 19:24:01 wvcii
+ * added debug vars enableRollAway and debugRecovery
+ *
+ * Revision 1.69 1995/08/24 19:25:36 rachad
+ * Fixes to LSS GC in the simulator
+ *
+ * Revision 1.68 1995/07/28 21:43:42 robby
+ * checkin after leaving for Rice. Bye
+ *
+ * Revision 1.67 1995/07/26 18:06:52 cfb
+ * *** empty log message ***
+ *
+ * Revision 1.66 1995/07/26 03:25:24 robby
+ * fixed accesses mutex and updated call to ConfigureCache
+ *
+ * Revision 1.65 1995/07/25 14:36:52 rachad
+ * *** empty log message ***
+ *
+ * Revision 1.64 1995/07/21 19:29:05 robby
+ * added total_accesses
+ *
+ * Revision 1.63 1995/07/20 19:43:35 cfb
+ * *** empty log message ***
+ *
+ * Revision 1.62 1995/07/20 16:10:24 rachad
+ * *** empty log message ***
+ *
+ * Revision 1.61 1995/07/20 03:36:53 rachad
+ * Added support for cache warming
+ *
+ * Revision 1.60 1995/07/17 22:31:31 cfb
+ * *** empty log message ***
+ *
+ * Revision 1.59 1995/07/16 17:02:23 cfb
+ * *** empty log message ***
+ *
+ * Revision 1.58 1995/07/16 15:19:27 cfb
+ * *** empty log message ***
+ *
+ * Revision 1.57 1995/07/16 03:17:01 cfb
+ * *** empty log message ***
+ *
+ * Revision 1.56 1995/07/13 16:11:59 cfb
+ * *** empty log message ***
+ *
+ * Revision 1.55 1995/07/13 15:42:40 cfb
+ * added cacheDebug variable ...
+ *
+ * Revision 1.54 1995/07/13 14:28:27 rachad
+ * *** empty log message ***
+ *
+ * Revision 1.53 1995/07/10 21:48:52 robby
+ * added virtualStripingWarnings
+ *
+ * Revision 1.52 1995/07/10 20:41:13 rachad
+ * *** empty log message ***
+ *
+ * Revision 1.51 1995/07/09 19:46:49 cfb
+ * Added cache Shutdown
+ *
+ * Revision 1.50 1995/07/08 21:38:53 rachad
+ * Added support for interactive traces
+ * in the simulator
+ *
+ * Revision 1.49 1995/07/08 18:05:39 rachad
+ * Linked up Claudson's code with the real cache
+ *
+ * Revision 1.48 1995/07/07 16:00:22 cfb
+ * Added initialization of cacheDesc to AllocRaidAccDesc
+ *
+ * Revision 1.47 1995/07/06 14:22:37 rachad
+ * Merge complete
+ *
+ * Revision 1.46.50.2 1995/06/21 17:48:30 robby
+ * test
+ *
+ * Revision 1.46.50.1 1995/06/21 17:34:49 robby
+ * branching to work on "meta-dag" capabilities
+ *
+ * Revision 1.46.10.5 1995/07/03 21:58:34 holland
+ * added support for suppressing both stripe locks & large writes
+ *
+ * Revision 1.46.10.4 1995/06/27 03:42:48 holland
+ * typo fix
+ *
+ * Revision 1.46.10.3 1995/06/27 03:31:42 holland
+ * prototypes
+ *
+ * Revision 1.46.10.2 1995/06/27 03:17:57 holland
+ * fixed callback bug in kernel rf_DoAccess
+ *
+ * Revision 1.46.10.1 1995/06/25 14:32:44 holland
+ * initial checkin on new branch
+ *
+ * Revision 1.46 1995/06/13 17:52:41 holland
+ * added UserStats stuff
+ *
+ * Revision 1.45 1995/06/13 16:03:41 rachad
+ * *** empty log message ***
+ *
+ * Revision 1.44 1995/06/12 15:54:40 rachad
+ * Added garbage collection for log-structured storage
+ *
+ * Revision 1.43 1995/06/09 18:01:09 holland
+ * various changes related to in-kernel recon, multiple-row arrays,
+ * trace extraction from kernel, etc.
+ *
+ * Revision 1.42 1995/06/08 19:52:28 rachad
+ * *** empty log message ***
+ *
+ * Revision 1.41 1995/06/08 00:11:49 robby
+ * added a debug variable -- showVirtualSizeRequirements
+ *
+ * Revision 1.40 1995/06/05 00:33:30 holland
+ * protectedSectors bug fix
+ *
+ * Revision 1.39 1995/06/01 22:45:03 holland
+ * made compilation of parity logging and virtual striping
+ * stuff conditional on some constants defined in rf_archs.h
+ *
+ * Revision 1.38 1995/06/01 21:52:37 holland
+ * replaced NULL sizes in calls to Free() by -1, and caused this
+ * to suppress the size-mismatch error
+ *
+ * Revision 1.37 1995/05/26 20:04:54 wvcii
+ * modified parity logging debug vars
+ *
+ * Revision 1.36 95/05/21 15:32:41 wvcii
+ * added debug vars: parityLogDebug, numParityRegions, numParityLogs,
+ * numReintegrationThreads
+ *
+ * Revision 1.35 95/05/19 20:58:21 holland
+ * cleanups on error cases in rf_DoAccess
+ *
+ * Revision 1.34 1995/05/16 17:35:53 holland
+ * added rf_copyback_in_progress. this is debug-only.
+ *
+ * Revision 1.33 1995/05/15 12:25:35 holland
+ * bug fix in test code: no stripe locks were getting acquired in RAID0 mode
+ *
+ * Revision 1.32 1995/05/10 18:54:12 holland
+ * bug fixes related to deadlock problem at time of disk failure
+ * eliminated read-op-write code
+ * beefed up parity checking in loop test
+ * various small changes & new ASSERTs
+ *
+ * Revision 1.31 1995/05/02 22:49:02 holland
+ * add shutdown calls for each architecture
+ *
+ * Revision 1.30 1995/05/01 14:43:37 holland
+ * merged changes from Bill
+ *
+ * Revision 1.29 1995/05/01 13:28:00 holland
+ * parity range locks, locking disk requests, recon+parityscan in kernel, etc.
+ *
+ * Revision 1.28 1995/04/24 13:25:51 holland
+ * rewrite to move disk queues, recon, & atomic RMW to kernel
+ *
+ * Revision 1.27 1995/04/06 14:47:56 rachad
+ * merge completed
+ *
+ * Revision 1.26 1995/04/03 20:32:35 rachad
+ * added reconstruction to simulator
+ *
+ * Revision 1.25.10.2 1995/04/03 20:41:00 holland
+ * misc changes related to distributed sparing
+ *
+ * Revision 1.25.10.1 1995/03/17 20:04:01 holland
+ * initial checkin on new branch
+ *
+ * Revision 1.25 1995/03/15 20:34:30 holland
+ * changes for distributed sparing.
+ *
+ * Revision 1.24 1995/03/09 19:53:05 rachad
+ * *** empty log message ***
+ *
+ * Revision 1.23 1995/03/03 18:36:16 rachad
+ * Simulator mechanism added
+ *
+ * Revision 1.22 1995/03/01 20:25:48 holland
+ * kernelization changes
+ *
+ * Revision 1.21 1995/02/17 19:39:56 holland
+ * added size param to all calls to Free().
+ * this is ignored at user level, but necessary in the kernel.
+ *
+ * Revision 1.20 1995/02/17 13:37:49 holland
+ * kernelization changes -- not yet complete
+ *
+ * Revision 1.19 1995/02/10 18:08:07 holland
+ * fixed a few things I broke during kernelization
+ *
+ * Revision 1.18 1995/02/10 17:34:10 holland
+ * kernelization changes
+ *
+ * Revision 1.17 1995/02/04 15:51:35 holland
+ * kernelization changes
+ *
+ * Revision 1.16 1995/02/03 22:31:36 holland
+ * many changes related to kernelization
+ *
+ * Revision 1.15 1995/02/01 15:13:05 holland
+ * moved #include of general.h out of raid.h and into each file
+ *
+ * Revision 1.14 1995/02/01 14:25:19 holland
+ * began changes for kernelization:
+ * changed all instances of mutex_t and cond_t to DECLARE macros
+ * converted configuration code to use config structure
+ *
+ * Revision 1.13 1995/01/30 14:53:46 holland
+ * extensive changes related to making DoIO non-blocking
+ *
+ * Revision 1.12 1995/01/25 00:26:21 holland
+ * eliminated support for aio
+ *
+ * Revision 1.11 1995/01/24 23:58:46 holland
+ * multi-way recon XOR, plus various small changes
+ *
+ * Revision 1.10 1995/01/11 19:27:02 holland
+ * various changes related to performance tuning
+ *
+ * Revision 1.9 1994/12/05 15:29:09 holland
+ * added trace run time limitation (maxTraceRunTimeSec)
+ *
+ * Revision 1.8 1994/12/05 04:18:12 holland
+ * various new control vars in the config file
+ *
+ * Revision 1.7 1994/11/29 23:11:36 holland
+ * tracerec bug on dag retry fixed
+ *
+ * Revision 1.6 1994/11/29 22:11:38 danner
+ * holland updates
+ *
+ * Revision 1.5 1994/11/29 21:09:47 danner
+ * Detailed tracing support (holland).
+ *
+ * Revision 1.4 1994/11/29 20:36:02 danner
+ * Added suppressAtomicRMW option.
+ *
+ * Revision 1.3 1994/11/21 15:34:06 danner
+ * Added ConfigureAllocList() call.
+ *
+ */
+
+#ifdef _KERNEL
+#define KERNEL
+#endif
+
+#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL)
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/ioctl.h>
+#include <sys/fcntl.h>
+#include <sys/vnode.h>
+#endif
+
+#ifdef KERNEL
+#if !defined(__NetBSD__) && !defined(__OpenBSD__)
+#include <dkusage.h>
+#include <dfstrace.h>
+#endif /* !__NetBSD__ && !__OpenBSD__ */
+#endif /* KERNEL */
+
+#include "rf_archs.h"
+#include "rf_threadstuff.h"
+
+#ifndef KERNEL
+#include <stdio.h>
+#include <stdlib.h>
+#endif /* !KERNEL */
+
+#include <sys/errno.h>
+
+#include "rf_raid.h"
+#include "rf_dag.h"
+#include "rf_aselect.h"
+#include "rf_diskqueue.h"
+#include "rf_parityscan.h"
+#include "rf_alloclist.h"
+#include "rf_threadid.h"
+#include "rf_dagutils.h"
+#include "rf_utils.h"
+#include "rf_etimer.h"
+#include "rf_acctrace.h"
+#include "rf_configure.h"
+#include "rf_general.h"
+#include "rf_desc.h"
+#include "rf_states.h"
+#include "rf_freelist.h"
+#include "rf_decluster.h"
+#include "rf_map.h"
+#include "rf_diskthreads.h"
+#include "rf_revent.h"
+#include "rf_callback.h"
+#include "rf_engine.h"
+#include "rf_memchunk.h"
+#include "rf_mcpair.h"
+#include "rf_nwayxor.h"
+#include "rf_debugprint.h"
+#include "rf_copyback.h"
+#if !defined(__NetBSD__) && !defined(__OpenBSD__)
+#include "rf_camlayer.h"
+#endif
+#include "rf_driver.h"
+#include "rf_options.h"
+#include "rf_shutdown.h"
+#include "rf_sys.h"
+#include "rf_cpuutil.h"
+
+#ifdef SIMULATE
+#include "rf_diskevent.h"
+#endif /* SIMULATE */
+
+#ifdef KERNEL
+#include <sys/buf.h>
+#if !defined(__NetBSD__) && !defined(__OpenBSD__)
+#include <io/common/devdriver.h>
+#endif /* !__NetBSD__ && !__OpenBSD__ */
+
+#if DFSTRACE > 0
+#include <sys/dfs_log.h>
+#include <sys/dfstracebuf.h>
+#endif /* DFSTRACE > 0 */
+
+#if DKUSAGE > 0
+#include <sys/dkusage.h>
+#include <io/common/iotypes.h>
+#include <io/cam/dec_cam.h>
+#include <io/cam/cam.h>
+#include <io/cam/pdrv.h>
+#endif /* DKUSAGE > 0 */
+#endif /* KERNEL */
+
+#if RF_DEMO > 0
+#include "rf_demo.h"
+#endif /* RF_DEMO > 0 */
+
+/* rad == RF_RaidAccessDesc_t */
+static RF_FreeList_t *rf_rad_freelist;
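+/*
+ * Freelist sizing for access descriptors: RF_MAX_FREE_RAD caps how many
+ * descriptors are kept on the freelist, RF_RAD_INC is the allocation
+ * increment, and RF_RAD_INITIAL is the number primed at configure time
+ * (see rf_ConfigureRDFreeList below).
+ */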
+#define RF_MAX_FREE_RAD 128
+#define RF_RAD_INC 16
+#define RF_RAD_INITIAL 32
+
+/* debug variables */
+char rf_panicbuf[2048]; /* a buffer to hold an error msg when we panic */
+
+/* main configuration routines */
+static int raidframe_booted = 0;
+
+static void rf_ConfigureDebug(RF_Config_t *cfgPtr);
+static void set_debug_option(char *name, long val);
+static void rf_UnconfigureArray(void);
+static int init_rad(RF_RaidAccessDesc_t *);
+static void clean_rad(RF_RaidAccessDesc_t *);
+static void rf_ShutdownRDFreeList(void *);
+static int rf_ConfigureRDFreeList(RF_ShutdownList_t **);
+
+
+RF_DECLARE_MUTEX(rf_printf_mutex) /* debug only: avoids interleaved printfs by different stripes */
+RF_DECLARE_GLOBAL_THREADID /* declarations for threadid.h */
+
+#if !defined(KERNEL) && !defined(SIMULATE)
+static int rf_InitThroughputStats(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr, RF_Config_t *cfgPtr);
+static void rf_StopThroughputStats(RF_Raid_t *raidPtr);
+static void rf_PrintThroughputStats(RF_Raid_t *raidPtr);
+#endif /* !KERNEL && !SIMULATE */
+
+#ifdef KERNEL
+#define SIGNAL_QUIESCENT_COND(_raid_) wakeup(&((_raid_)->accesses_suspended))
+#if !defined(__NetBSD__) && !defined(__OpenBSD__)
+#define WAIT_FOR_QUIESCENCE(_raid_) \
+ mpsleep(&((_raid_)->accesses_suspended), PZERO, "raidframe quiesce", 0, \
+ (void *) simple_lock_addr((_raid_)->access_suspend_mutex), MS_LOCK_SIMPLE)
+#else
+#define WAIT_FOR_QUIESCENCE(_raid_) \
+ tsleep(&((_raid_)->accesses_suspended),PRIBIO|PCATCH,"raidframe quiesce", 0);
+
+#endif
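+/*
+ * IO_BUF_ERR fails a struct buf: flag the error on the buffer, report that
+ * nothing was transferred, record the error code, end the disk-usage
+ * accounting via RF_DKU_END_IO, and complete the buffer with biodone().
+ */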
+#if DKUSAGE > 0
+#define IO_BUF_ERR(bp, err, unit) { \
+ bp->b_flags |= B_ERROR; \
+ bp->b_resid = bp->b_bcount; \
+ bp->b_error = err; \
+ RF_DKU_END_IO(unit, bp); \
+ biodone(bp); \
+}
+#else
+#define IO_BUF_ERR(bp, err, unit) { \
+ bp->b_flags |= B_ERROR; \
+ bp->b_resid = bp->b_bcount; \
+ bp->b_error = err; \
+ RF_DKU_END_IO(unit); \
+ biodone(bp); \
+}
+#endif /* DKUSAGE > 0 */
+#else /* KERNEL */
+
+#define SIGNAL_QUIESCENT_COND(_raid_) RF_SIGNAL_COND((_raid_)->quiescent_cond)
+#define WAIT_FOR_QUIESCENCE(_raid_) RF_WAIT_COND((_raid_)->quiescent_cond, (_raid_)->access_suspend_mutex)
+#define IO_BUF_ERR(bp, err, unit)
+
+#endif /* KERNEL */
+
+static int configureCount=0; /* number of active configurations */
+static int isconfigged=0; /* is basic raidframe (non per-array) stuff configged */
+RF_DECLARE_STATIC_MUTEX(configureMutex) /* used to lock the configuration stuff */
+
+static RF_ShutdownList_t *globalShutdown; /* non array-specific stuff */
+
+static int rf_ConfigureRDFreeList(RF_ShutdownList_t **listp);
+
+/* called at system boot time */
+int rf_BootRaidframe()
+{
+#if 0
+ long stacksize;
+#endif
+ int rc;
+
+ if (raidframe_booted)
+ return(EBUSY);
+ raidframe_booted = 1;
+
+#if RF_DEBUG_ATOMIC > 0
+ rf_atent_init();
+#endif /* RF_DEBUG_ATOMIC > 0 */
+
+ rf_setup_threadid();
+ rf_assign_threadid();
+
+#if !defined(KERNEL) && !defined(SIMULATE)
+ if (RF_THREAD_ATTR_CREATE(raidframe_attr_default)) {
+ fprintf(stderr, "Unable to create default thread attr\n");
+ exit(1);
+ }
+#if 0
+ stacksize = RF_THREAD_ATTR_GETSTACKSIZE(raidframe_attr_default);
+ if (stacksize < 0) {
+ fprintf(stderr, "Unable to get stack size of default thread attr\n");
+ exit(1);
+ }
+ stacksize += 16384;
+ rc = RF_THREAD_ATTR_SETSTACKSIZE(raidframe_attr_default, stacksize);
+ if (rc) {
+ fprintf(stderr, "Unable to set stack size of default thread attr\n");
+ exit(1);
+ }
+#endif /* 0 */
+#endif /* !KERNEL && !SIMULATE */
+ rc = rf_mutex_init(&configureMutex);
+ if (rc) {
+ RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ RF_PANIC();
+ }
+ configureCount = 0;
+ isconfigged = 0;
+ globalShutdown = NULL;
+ return(0);
+}
+
+/*
+ * This function is really just for debugging user-level stuff: it
+ * frees up all memory and other RAIDframe resources that might otherwise
+ * be kept around. This is used with systems like "sentinel" to detect
+ * memory leaks.
+ */
+int rf_UnbootRaidframe()
+{
+ int rc;
+
+ RF_LOCK_MUTEX(configureMutex);
+ if (configureCount) {
+ RF_UNLOCK_MUTEX(configureMutex);
+ return(EBUSY);
+ }
+ raidframe_booted = 0;
+ RF_UNLOCK_MUTEX(configureMutex);
+ rc = rf_mutex_destroy(&configureMutex);
+ if (rc) {
+ RF_ERRORMSG3("Unable to destroy mutex file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ RF_PANIC();
+ }
+#if RF_DEBUG_ATOMIC > 0
+ rf_atent_shutdown();
+#endif /* RF_DEBUG_ATOMIC > 0 */
+ return(0);
+}
+
+/*
+ * Called whenever an array is shutdown
+ */
+static void rf_UnconfigureArray()
+{
+ int rc;
+
+ RF_LOCK_MUTEX(configureMutex);
+ if (--configureCount == 0) { /* if no active configurations, shut everything down */
+ isconfigged = 0;
+
+ rc = rf_ShutdownList(&globalShutdown);
+ if (rc) {
+ RF_ERRORMSG1("RAIDFRAME: unable to do global shutdown, rc=%d\n", rc);
+ }
+
+ rf_shutdown_threadid();
+
+ /*
+ * We must wait until now, because the AllocList module
+ * uses the DebugMem module.
+ */
+ if (rf_memDebug)
+ rf_print_unfreed();
+ }
+ RF_UNLOCK_MUTEX(configureMutex);
+}
+
+/*
+ * Called to shut down an array.
+ */
+int rf_Shutdown(raidPtr)
+ RF_Raid_t *raidPtr;
+{
+#if !defined(__NetBSD__) && !defined(__OpenBSD__) && !defined(_KERNEL)
+ int rc;
+#endif
+ int r,c;
+
+#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL)
+ struct proc *p;
+#endif
+
+ if (!raidPtr->valid) {
+ RF_ERRORMSG("Attempt to shut down unconfigured RAIDframe driver. Aborting shutdown\n");
+ return(EINVAL);
+ }
+
+ /*
+ * wait for outstanding IOs to land
+ * As described in rf_raid.h, we use the rad_freelist lock
+ * to protect the per-array info about outstanding descs
+ * since we need to do freelist locking anyway, and this
+ * cuts down on the amount of serialization we've got going
+ * on.
+ */
+ RF_FREELIST_DO_LOCK(rf_rad_freelist);
+ if (raidPtr->waitShutdown) {
+ RF_FREELIST_DO_UNLOCK(rf_rad_freelist);
+ return(EBUSY);
+ }
+ raidPtr->waitShutdown = 1;
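+	/*
+	 * With waitShutdown set, rf_AllocRaidAccDesc() refuses new
+	 * descriptors; wait here until rf_FreeRaidAccDesc() retires the
+	 * last outstanding access and signals outstandingCond.
+	 */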
+ while (raidPtr->nAccOutstanding) {
+ RF_WAIT_COND(raidPtr->outstandingCond, RF_FREELIST_MUTEX_OF(rf_rad_freelist));
+ }
+ RF_FREELIST_DO_UNLOCK(rf_rad_freelist);
+
+#if !defined(KERNEL) && !defined(SIMULATE)
+ rf_PrintThroughputStats(raidPtr);
+#endif /* !KERNEL && !SIMULATE */
+
+ raidPtr->valid = 0;
+
+#if !defined(KERNEL) && !defined(SIMULATE)
+ rf_TerminateDiskQueues(raidPtr); /* tell all disk queues to release any waiting threads */
+ rf_ShutdownDiskThreads(raidPtr); /* wait for all threads to exit */
+#endif /* !KERNEL && !SIMULATE */
+
+#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL)
+ /* We take this opportunity to close the vnodes like we should.. */
+
+ p = raidPtr->proc; /* XXX */
+
+ for(r=0;r<raidPtr->numRow;r++) {
+ for(c=0;c<raidPtr->numCol;c++) {
+ printf("Closing vnode for row: %d col: %d\n",r,c);
+ if (raidPtr->raid_cinfo[r][c].ci_vp) {
+ (void)vn_close(raidPtr->raid_cinfo[r][c].ci_vp,
+ FREAD|FWRITE, p->p_ucred, p);
+ } else {
+ printf("vnode was NULL\n");
+ }
+
+ }
+ }
+ for(r=0;r<raidPtr->numSpare;r++) {
+ printf("Closing vnode for spare: %d\n",r);
+ if (raidPtr->raid_cinfo[0][raidPtr->numCol+r].ci_vp) {
+ (void)vn_close(raidPtr->raid_cinfo[0][raidPtr->numCol+r].ci_vp,
+ FREAD|FWRITE, p->p_ucred, p);
+ } else {
+ printf("vnode was NULL\n");
+ }
+ }
+
+
+#endif
+
+ rf_ShutdownList(&raidPtr->shutdownList);
+
+ rf_UnconfigureArray();
+
+ return(0);
+}
+
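+/*
+ * Helper macros for rf_Configure(): each runs one setup step and, on
+ * failure, tears down whatever has already been configured before
+ * returning the error to the caller.
+ */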
+#define DO_INIT_CONFIGURE(f) { \
+ rc = f (&globalShutdown); \
+ if (rc) { \
+ RF_ERRORMSG2("RAIDFRAME: failed %s with %d\n", RF_STRING(f), rc); \
+ rf_ShutdownList(&globalShutdown); \
+ configureCount--; \
+ RF_UNLOCK_MUTEX(configureMutex); \
+ return(rc); \
+ } \
+}
+
+#define DO_RAID_FAIL() { \
+ rf_ShutdownList(&raidPtr->shutdownList); \
+ rf_UnconfigureArray(); \
+}
+
+#define DO_RAID_INIT_CONFIGURE(f) { \
+ rc = f (&raidPtr->shutdownList, raidPtr, cfgPtr); \
+ if (rc) { \
+ RF_ERRORMSG2("RAIDFRAME: failed %s with %d\n", RF_STRING(f), rc); \
+ DO_RAID_FAIL(); \
+ return(rc); \
+ } \
+}
+
+#define DO_RAID_MUTEX(_m_) { \
+ rc = rf_create_managed_mutex(&raidPtr->shutdownList, (_m_)); \
+ if (rc) { \
+ RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", \
+ __FILE__, __LINE__, rc); \
+ DO_RAID_FAIL(); \
+ return(rc); \
+ } \
+}
+
+#define DO_RAID_COND(_c_) { \
+ rc = rf_create_managed_cond(&raidPtr->shutdownList, (_c_)); \
+ if (rc) { \
+ RF_ERRORMSG3("Unable to init cond file %s line %d rc=%d\n", \
+ __FILE__, __LINE__, rc); \
+ DO_RAID_FAIL(); \
+ return(rc); \
+ } \
+}
+
+int rf_Configure(raidPtr, cfgPtr)
+ RF_Raid_t *raidPtr;
+ RF_Config_t *cfgPtr;
+{
+ RF_RowCol_t row, col;
+ int i, rc;
+ int unit;
+ struct proc *p;
+
+ if (raidPtr->valid) {
+ RF_ERRORMSG("RAIDframe configuration not shut down. Aborting configure.\n");
+ return(EINVAL);
+ }
+
+ RF_LOCK_MUTEX(configureMutex);
+ configureCount++;
+ if (isconfigged == 0) {
+ rc = rf_create_managed_mutex(&globalShutdown, &rf_printf_mutex);
+ if (rc) {
+ RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ rf_ShutdownList(&globalShutdown);
+ return(rc);
+ }
+
+ /* initialize globals */
+ printf("RAIDFRAME: protectedSectors is %ld\n",rf_protectedSectors);
+
+ rf_clear_debug_print_buffer();
+
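+		/*
+		 * System-wide module initialization: this runs only on the
+		 * first array configuration (guarded by isconfigged) and is
+		 * torn down through globalShutdown.
+		 */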
+ DO_INIT_CONFIGURE(rf_ConfigureAllocList);
+ DO_INIT_CONFIGURE(rf_ConfigureEtimer);
+ /*
+ * Yes, this does make debugging general to the whole system instead
+ * of being array specific. Bummer, drag.
+ */
+ rf_ConfigureDebug(cfgPtr);
+ DO_INIT_CONFIGURE(rf_ConfigureDebugMem);
+#ifdef SIMULATE
+ rf_default_disk_names();
+ DO_INIT_CONFIGURE(rf_DDEventInit);
+#endif /* SIMULATE */
+ DO_INIT_CONFIGURE(rf_ConfigureAccessTrace);
+ DO_INIT_CONFIGURE(rf_ConfigureMapModule);
+ DO_INIT_CONFIGURE(rf_ConfigureReconEvent);
+ DO_INIT_CONFIGURE(rf_ConfigureCallback);
+ DO_INIT_CONFIGURE(rf_ConfigureMemChunk);
+ DO_INIT_CONFIGURE(rf_ConfigureRDFreeList);
+ DO_INIT_CONFIGURE(rf_ConfigureNWayXor);
+ DO_INIT_CONFIGURE(rf_ConfigureStripeLockFreeList);
+ DO_INIT_CONFIGURE(rf_ConfigureMCPair);
+#ifndef SIMULATE
+#if !defined(__NetBSD__) && !defined(__OpenBSD__)
+ DO_INIT_CONFIGURE(rf_ConfigureCamLayer);
+#endif
+#endif /* !SIMULATE */
+ DO_INIT_CONFIGURE(rf_ConfigureDAGs);
+ DO_INIT_CONFIGURE(rf_ConfigureDAGFuncs);
+ DO_INIT_CONFIGURE(rf_ConfigureDebugPrint);
+ DO_INIT_CONFIGURE(rf_ConfigureReconstruction);
+ DO_INIT_CONFIGURE(rf_ConfigureCopyback);
+ DO_INIT_CONFIGURE(rf_ConfigureDiskQueueSystem);
+ DO_INIT_CONFIGURE(rf_ConfigureCpuMonitor);
+ isconfigged = 1;
+ }
+ RF_UNLOCK_MUTEX(configureMutex);
+
+ /*
+ * Null out the entire raid descriptor to avoid problems when we reconfig.
+ * This also clears the valid bit.
+ */
+ /* XXX this clearing should be moved UP to outside of here.... that, or
+ rf_Configure() needs to take more arguments... XXX */
+#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL)
+ unit = raidPtr->raidid;
+ p = raidPtr->proc; /* XXX save these... */
+#endif
+ bzero((char *)raidPtr, sizeof(RF_Raid_t));
+#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL)
+ raidPtr->raidid = unit;
+ raidPtr->proc = p; /* XXX and then recover them..*/
+#endif
+ DO_RAID_MUTEX(&raidPtr->mutex);
+	/* set up the cleanup list. Do this after ConfigureDebug so that the value of rf_memDebug will be set */
+
+ rf_MakeAllocList(raidPtr->cleanupList);
+ if (raidPtr->cleanupList == NULL) {
+ DO_RAID_FAIL();
+ return(ENOMEM);
+ }
+
+ rc = rf_ShutdownCreate(&raidPtr->shutdownList,
+ (void (*)(void *))rf_FreeAllocList,
+ raidPtr->cleanupList);
+ if (rc) {
+ RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n",
+ __FILE__, __LINE__, rc);
+ DO_RAID_FAIL();
+ return(rc);
+ }
+
+ raidPtr->numRow = cfgPtr->numRow;
+ raidPtr->numCol = cfgPtr->numCol;
+ raidPtr->numSpare = cfgPtr->numSpare;
+
+#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL)
+ /* XXX we don't even pretend to support more than one row
+ in the kernel... */
+ if (raidPtr->numRow != 1) {
+ RF_ERRORMSG("Only one row supported in kernel.\n");
+ DO_RAID_FAIL();
+ return(EINVAL);
+ }
+#endif
+
+
+
+ RF_CallocAndAdd(raidPtr->status, raidPtr->numRow, sizeof(RF_RowStatus_t),
+ (RF_RowStatus_t *), raidPtr->cleanupList);
+ if (raidPtr->status == NULL) {
+ DO_RAID_FAIL();
+ return(ENOMEM);
+ }
+
+ RF_CallocAndAdd(raidPtr->reconControl, raidPtr->numRow,
+ sizeof(RF_ReconCtrl_t *), (RF_ReconCtrl_t **), raidPtr->cleanupList);
+ if (raidPtr->reconControl == NULL) {
+ DO_RAID_FAIL();
+ return(ENOMEM);
+ }
+ for (i=0; i<raidPtr->numRow; i++) {
+ raidPtr->status[i] = rf_rs_optimal;
+ raidPtr->reconControl[i] = NULL;
+ }
+
+ DO_RAID_INIT_CONFIGURE(rf_ConfigureEngine);
+#if !defined(KERNEL) && !defined(SIMULATE)
+ DO_RAID_INIT_CONFIGURE(rf_InitThroughputStats);
+#endif /* !KERNEL && !SIMULATE */
+
+ DO_RAID_INIT_CONFIGURE(rf_ConfigureStripeLocks);
+
+ DO_RAID_COND(&raidPtr->outstandingCond);
+
+ raidPtr->nAccOutstanding = 0;
+ raidPtr->waitShutdown = 0;
+
+ DO_RAID_MUTEX(&raidPtr->access_suspend_mutex);
+ DO_RAID_COND(&raidPtr->quiescent_cond);
+
+ DO_RAID_COND(&raidPtr->waitForReconCond);
+
+ DO_RAID_MUTEX(&raidPtr->recon_done_proc_mutex);
+ DO_RAID_INIT_CONFIGURE(rf_ConfigureDisks);
+ DO_RAID_INIT_CONFIGURE(rf_ConfigureSpareDisks);
+ /* do this after ConfigureDisks & ConfigureSpareDisks to be sure dev no. is set */
+ DO_RAID_INIT_CONFIGURE(rf_ConfigureDiskQueues);
+#ifndef KERNEL
+ DO_RAID_INIT_CONFIGURE(rf_ConfigureDiskThreads);
+#endif /* !KERNEL */
+
+ DO_RAID_INIT_CONFIGURE(rf_ConfigureLayout);
+
+ DO_RAID_INIT_CONFIGURE(rf_ConfigurePSStatus);
+
+ for(row=0;row<raidPtr->numRow;row++) {
+ for(col=0;col<raidPtr->numCol;col++) {
+ /*
+ * XXX better distribution
+ */
+ raidPtr->hist_diskreq[row][col] = 0;
+ }
+ }
+
+ if (rf_keepAccTotals) {
+ raidPtr->keep_acc_totals = 1;
+ }
+
+ rf_StartUserStats(raidPtr);
+
+ raidPtr->valid = 1;
+ return(0);
+}
+
+static int init_rad(desc)
+ RF_RaidAccessDesc_t *desc;
+{
+ int rc;
+
+ rc = rf_mutex_init(&desc->mutex);
+ if (rc) {
+ RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ return(rc);
+ }
+ rc = rf_cond_init(&desc->cond);
+ if (rc) {
+ RF_ERRORMSG3("Unable to init cond file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ rf_mutex_destroy(&desc->mutex);
+ return(rc);
+ }
+ return(0);
+}
+
+static void clean_rad(desc)
+ RF_RaidAccessDesc_t *desc;
+{
+ rf_mutex_destroy(&desc->mutex);
+ rf_cond_destroy(&desc->cond);
+}
+
+static void rf_ShutdownRDFreeList(ignored)
+ void *ignored;
+{
+ RF_FREELIST_DESTROY_CLEAN(rf_rad_freelist,next,(RF_RaidAccessDesc_t *),clean_rad);
+}
+
+static int rf_ConfigureRDFreeList(listp)
+ RF_ShutdownList_t **listp;
+{
+ int rc;
+
+ RF_FREELIST_CREATE(rf_rad_freelist, RF_MAX_FREE_RAD,
+ RF_RAD_INC, sizeof(RF_RaidAccessDesc_t));
+ if (rf_rad_freelist == NULL) {
+ return(ENOMEM);
+ }
+ rc = rf_ShutdownCreate(listp, rf_ShutdownRDFreeList, NULL);
+ if (rc) {
+ RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ rf_ShutdownRDFreeList(NULL);
+ return(rc);
+ }
+ RF_FREELIST_PRIME_INIT(rf_rad_freelist, RF_RAD_INITIAL,next,
+ (RF_RaidAccessDesc_t *),init_rad);
+ return(0);
+}
+
+RF_RaidAccessDesc_t *rf_AllocRaidAccDesc(
+ RF_Raid_t *raidPtr,
+ RF_IoType_t type,
+ RF_RaidAddr_t raidAddress,
+ RF_SectorCount_t numBlocks,
+ caddr_t bufPtr,
+ void *bp,
+ RF_DagHeader_t **paramDAG,
+ RF_AccessStripeMapHeader_t **paramASM,
+ RF_RaidAccessFlags_t flags,
+ void (*cbF)(struct buf *),
+ void *cbA,
+ RF_AccessState_t *states)
+{
+ RF_RaidAccessDesc_t *desc;
+
+ RF_FREELIST_GET_INIT_NOUNLOCK(rf_rad_freelist,desc,next,(RF_RaidAccessDesc_t *),init_rad);
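+	/*
+	 * The NOUNLOCK variant leaves the freelist lock held, so the
+	 * waitShutdown check and the nAccOutstanding increment below are
+	 * atomic with respect to rf_Shutdown().
+	 */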
+ if (raidPtr->waitShutdown) {
+ /*
+ * Actually, we're shutting the array down. Free the desc
+ * and return NULL.
+ */
+ RF_FREELIST_DO_UNLOCK(rf_rad_freelist);
+ RF_FREELIST_FREE_CLEAN(rf_rad_freelist,desc,next,clean_rad);
+ return(NULL);
+ }
+ raidPtr->nAccOutstanding++;
+ RF_FREELIST_DO_UNLOCK(rf_rad_freelist);
+
+ desc->raidPtr = (void*)raidPtr;
+ desc->type = type;
+ desc->raidAddress = raidAddress;
+ desc->numBlocks = numBlocks;
+ desc->bufPtr = bufPtr;
+ desc->bp = bp;
+ desc->paramDAG = paramDAG;
+ desc->paramASM = paramASM;
+ desc->flags = flags;
+	desc->states = states;
+	desc->state = 0;
+
+ desc->status = 0;
+ bzero((char *)&desc->tracerec, sizeof(RF_AccTraceEntry_t));
+ desc->callbackFunc= (void (*)(RF_CBParam_t))cbF; /* XXX */
+ desc->callbackArg = cbA;
+ desc->next = NULL;
+ desc->head = desc;
+ desc->numPending = 0;
+ desc->cleanupList = NULL;
+ rf_MakeAllocList(desc->cleanupList);
+ rf_get_threadid(desc->tid);
+#ifdef SIMULATE
+ desc->owner = rf_GetCurrentOwner();
+#endif /* SIMULATE */
+ return(desc);
+}
+
+void rf_FreeRaidAccDesc(RF_RaidAccessDesc_t *desc)
+{
+ RF_Raid_t *raidPtr = desc->raidPtr;
+
+ RF_ASSERT(desc);
+
+#if !defined(KERNEL) && !defined(SIMULATE)
+ rf_StopThroughputStats(raidPtr);
+#endif /* !KERNEL && !SIMULATE */
+
+ rf_FreeAllocList(desc->cleanupList);
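+	/*
+	 * Return the descriptor while still holding the freelist lock, so
+	 * the nAccOutstanding decrement and the shutdown wakeup below are
+	 * atomic with respect to rf_Shutdown().
+	 */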
+ RF_FREELIST_FREE_CLEAN_NOUNLOCK(rf_rad_freelist,desc,next,clean_rad);
+ raidPtr->nAccOutstanding--;
+ if (raidPtr->waitShutdown) {
+ RF_SIGNAL_COND(raidPtr->outstandingCond);
+ }
+ RF_FREELIST_DO_UNLOCK(rf_rad_freelist);
+}
+
+#ifdef JIMZ
+#define THREAD_NUMDESC 1024
+#define THREAD_NUM 600
+static RF_RaidAccessDesc_t *dest_hist[THREAD_NUM*THREAD_NUMDESC];
+int jimz_access_num[THREAD_NUM];
+#endif /* JIMZ */
+
+/*********************************************************************
+ * Main routine for performing an access.
+ * Accesses are retried until a DAG cannot be selected. This occurs
+ * when either the DAG library is incomplete or there are too many
+ * failures in a parity group.
+ ********************************************************************/
+int rf_DoAccess(
+ RF_Raid_t *raidPtr,
+ RF_IoType_t type,
+ int async_flag,
+ RF_RaidAddr_t raidAddress,
+ RF_SectorCount_t numBlocks,
+ caddr_t bufPtr,
+ void *bp_in,
+ RF_DagHeader_t **paramDAG,
+ RF_AccessStripeMapHeader_t **paramASM,
+ RF_RaidAccessFlags_t flags,
+ RF_RaidAccessDesc_t **paramDesc,
+ void (*cbF)(struct buf *),
+ void *cbA)
+/*
+ * type should be read or write
+ * async_flag should be RF_TRUE or RF_FALSE
+ * bp_in is a buf pointer.  void * to facilitate ignoring it outside the kernel
+ */
+{
+ int tid;
+ RF_RaidAccessDesc_t *desc;
+ caddr_t lbufPtr = bufPtr;
+#ifdef KERNEL
+ struct buf *bp = (struct buf *) bp_in;
+#if DFSTRACE > 0
+ struct { RF_uint64 raidAddr; int numBlocks; char type;} dfsrecord;
+#endif /* DFSTRACE > 0 */
+#else /* KERNEL */
+ void *bp = bp_in;
+#endif /* KERNEL */
+
+ raidAddress += rf_raidSectorOffset;
+
+ if (!raidPtr->valid) {
+ RF_ERRORMSG("RAIDframe driver not successfully configured. Rejecting access.\n");
+ IO_BUF_ERR(bp, EINVAL, raidPtr->raidid);
+ return(EINVAL);
+ }
+
+#if defined(KERNEL) && DFSTRACE > 0
+ if (rf_DFSTraceAccesses) {
+ dfsrecord.raidAddr = raidAddress;
+ dfsrecord.numBlocks = numBlocks;
+ dfsrecord.type = type;
+ dfs_log(DFS_NOTE, (char *) &dfsrecord, sizeof(dfsrecord), 0);
+ }
+#endif /* KERNEL && DFSTRACE > 0 */
+
+ rf_get_threadid(tid);
+ if (rf_accessDebug) {
+
+ printf("logBytes is: %d %d %d\n",raidPtr->raidid,
+ raidPtr->logBytesPerSector,
+ (int)rf_RaidAddressToByte(raidPtr,numBlocks));
+ printf("[%d] %s raidAddr %d (stripeid %d-%d) numBlocks %d (%d bytes) buf 0x%lx\n",tid,
+ (type==RF_IO_TYPE_READ) ? "READ":"WRITE", (int)raidAddress,
+ (int) rf_RaidAddressToStripeID(&raidPtr->Layout, raidAddress),
+ (int) rf_RaidAddressToStripeID(&raidPtr->Layout, raidAddress+numBlocks-1),
+ (int) numBlocks,
+ (int) rf_RaidAddressToByte(raidPtr,numBlocks),
+ (long) bufPtr);
+ }
+
+ if (raidAddress + numBlocks > raidPtr->totalSectors) {
+
+ printf("DoAccess: raid addr %lu too large to access %lu sectors. Max legal addr is %lu\n",
+ (u_long)raidAddress,(u_long)numBlocks,(u_long)raidPtr->totalSectors);
+
+#ifdef KERNEL
+		/* reads and writes past the end of the array both fail with ENOSPC */
+		IO_BUF_ERR(bp, ENOSPC, raidPtr->raidid);
+		return(ENOSPC);
+#elif defined(SIMULATE)
+ RF_PANIC();
+#else /* SIMULATE */
+ return(EIO);
+#endif /* SIMULATE */
+ }
+
+#if !defined(KERNEL) && !defined(SIMULATE)
+ rf_StartThroughputStats(raidPtr);
+#endif /* !KERNEL && !SIMULATE */
+
+ desc = rf_AllocRaidAccDesc(raidPtr, type, raidAddress,
+ numBlocks, lbufPtr, bp, paramDAG, paramASM,
+ flags, cbF, cbA, raidPtr->Layout.map->states);
+
+ if (desc == NULL) {
+ return(ENOMEM);
+ }
+#ifdef JIMZ
+	dest_hist[(tid*THREAD_NUMDESC)+jimz_access_num[tid]] = desc; jimz_access_num[tid]++;
+#endif /* JIMZ */
+
+ RF_ETIMER_START(desc->tracerec.tot_timer);
+
+#ifdef SIMULATE
+ /* simulator uses paramDesc to continue dag from test function */
+ desc->async_flag=async_flag;
+
+ *paramDesc=desc;
+
+ return(0);
+#endif /* SIMULATE */
+
+ rf_ContinueRaidAccess(desc);
+
+#ifndef KERNEL
+ if (!(flags & RF_DAG_NONBLOCKING_IO)) {
+ RF_LOCK_MUTEX(desc->mutex);
+ while (!(desc->flags & RF_DAG_ACCESS_COMPLETE)) {
+ RF_WAIT_COND(desc->cond, desc->mutex);
+ }
+ RF_UNLOCK_MUTEX(desc->mutex);
+ rf_FreeRaidAccDesc(desc);
+ }
+#endif /* !KERNEL */
+
+ return(0);
+}
+
+/* force the array into reconfigured mode without doing reconstruction */
+int rf_SetReconfiguredMode(raidPtr, row, col)
+ RF_Raid_t *raidPtr;
+ int row;
+ int col;
+{
+ if (!(raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) {
+ printf("Can't set reconfigured mode in dedicated-spare array\n");
+ RF_PANIC();
+ }
+ RF_LOCK_MUTEX(raidPtr->mutex);
+ raidPtr->numFailures++;
+ raidPtr->Disks[row][col].status = rf_ds_dist_spared;
+ raidPtr->status[row] = rf_rs_reconfigured;
+ /* install spare table only if declustering + distributed sparing architecture. */
+ if ( raidPtr->Layout.map->flags & RF_BD_DECLUSTERED )
+ rf_InstallSpareTable(raidPtr, row, col);
+ RF_UNLOCK_MUTEX(raidPtr->mutex);
+ return(0);
+}
+
+extern int fail_row, fail_col, fail_time;
+extern int delayed_recon;
+
+int rf_FailDisk(
+ RF_Raid_t *raidPtr,
+ int frow,
+ int fcol,
+ int initRecon)
+{
+ int tid;
+
+ rf_get_threadid(tid);
+ printf("[%d] Failing disk r%d c%d\n",tid,frow,fcol);
+ RF_LOCK_MUTEX(raidPtr->mutex);
+ raidPtr->numFailures++;
+ raidPtr->Disks[frow][fcol].status = rf_ds_failed;
+ raidPtr->status[frow] = rf_rs_degraded;
+ RF_UNLOCK_MUTEX(raidPtr->mutex);
+#ifdef SIMULATE
+#if RF_DEMO > 0
+ if (rf_demoMode) {
+ rf_demo_update_mode (RF_DEMO_DEGRADED);
+ fail_col = fcol; fail_row = frow;
+ fail_time = rf_CurTime(); /* XXX */
+ if (initRecon)
+ delayed_recon = RF_TRUE;
+ }
+ else {
+ if (initRecon)
+ rf_ReconstructFailedDisk(raidPtr, frow, fcol);
+ }
+#else /* RF_DEMO > 0 */
+ if (initRecon)
+ rf_ReconstructFailedDisk(raidPtr, frow, fcol);
+#endif /* RF_DEMO > 0 */
+#else /* SIMULATE */
+ if (initRecon)
+ rf_ReconstructFailedDisk(raidPtr, frow, fcol);
+#endif /* SIMULATE */
+ return(0);
+}
+
+#ifdef SIMULATE
+extern RF_Owner_t recon_owner;
+
+void rf_ScheduleContinueReconstructFailedDisk(reconDesc)
+ RF_RaidReconDesc_t *reconDesc;
+{
+ rf_DDEventRequest(rf_CurTime(), rf_ContinueReconstructFailedDisk,
+ (void *) reconDesc, recon_owner, -4, -4, reconDesc->raidPtr, NULL);
+}
+#endif /* SIMULATE */
+
+/* releases a thread that is waiting for the array to become quiesced.
+ * access_suspend_mutex should be locked upon calling this
+ */
+void rf_SignalQuiescenceLock(raidPtr, reconDesc)
+ RF_Raid_t *raidPtr;
+ RF_RaidReconDesc_t *reconDesc;
+{
+ int tid;
+
+ if (rf_quiesceDebug) {
+ rf_get_threadid(tid);
+ printf("[%d] Signalling quiescence lock\n", tid);
+ }
+ raidPtr->access_suspend_release = 1;
+
+ if (raidPtr->waiting_for_quiescence) {
+#ifndef SIMULATE
+ SIGNAL_QUIESCENT_COND(raidPtr);
+#else /* !SIMULATE */
+ if (reconDesc) {
+ rf_ScheduleContinueReconstructFailedDisk(reconDesc);
+ }
+#endif /* !SIMULATE */
+ }
+}
+
+/* suspends all new requests to the array. No effect on accesses that are in flight. */
+int rf_SuspendNewRequestsAndWait(raidPtr)
+ RF_Raid_t *raidPtr;
+{
+ if (rf_quiesceDebug)
+ printf("Suspending new reqs\n");
+
+ RF_LOCK_MUTEX(raidPtr->access_suspend_mutex);
+ raidPtr->accesses_suspended++;
+ raidPtr->waiting_for_quiescence = (raidPtr->accs_in_flight == 0) ? 0 : 1;
+
+#ifndef SIMULATE
+ if (raidPtr->waiting_for_quiescence) {
+ raidPtr->access_suspend_release=0;
+ while (!raidPtr->access_suspend_release) {
+ printf("Suspending: Waiting for Quiescence\n");
+ WAIT_FOR_QUIESCENCE(raidPtr);
+ raidPtr->waiting_for_quiescence = 0;
+ }
+ }
+ printf("Quiescence reached..\n");
+#endif /* !SIMULATE */
+
+ RF_UNLOCK_MUTEX(raidPtr->access_suspend_mutex);
+ return (raidPtr->waiting_for_quiescence);
+}
+
+/* wake up everyone waiting for quiescence to be released */
+void rf_ResumeNewRequests(raidPtr)
+ RF_Raid_t *raidPtr;
+{
+ RF_CallbackDesc_t *t, *cb;
+
+ if (rf_quiesceDebug)
+ printf("Resuming new reqs\n");
+
+ RF_LOCK_MUTEX(raidPtr->access_suspend_mutex);
+ raidPtr->accesses_suspended--;
+ if (raidPtr->accesses_suspended == 0)
+ cb = raidPtr->quiesce_wait_list;
+ else
+ cb = NULL;
+ raidPtr->quiesce_wait_list = NULL;
+ RF_UNLOCK_MUTEX(raidPtr->access_suspend_mutex);
+
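+	/*
+	 * Now, outside the lock, run the callbacks of everyone who queued
+	 * up waiting for the quiescence to be released.
+	 */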
+ while (cb) {
+ t = cb;
+ cb = cb->next;
+ (t->callbackFunc)(t->callbackArg);
+ rf_FreeCallbackDesc(t);
+ }
+}
+
+/*****************************************************************************************
+ *
+ * debug routines
+ *
+ ****************************************************************************************/
+
+static void set_debug_option(name, val)
+ char *name;
+ long val;
+{
+ RF_DebugName_t *p;
+
+ for (p = rf_debugNames; p->name; p++) {
+ if (!strcmp(p->name, name)) {
+ *(p->ptr) = val;
+ printf("[Set debug variable %s to %ld]\n",name,val);
+ return;
+ }
+ }
+ RF_ERRORMSG1("Unknown debug string \"%s\"\n",name);
+}
+
+
+/* would like to use sscanf here, but apparently not available in kernel */
+/*ARGSUSED*/
+static void rf_ConfigureDebug(cfgPtr)
+ RF_Config_t *cfgPtr;
+{
+ char *val_p, *name_p, *white_p;
+ long val;
+ int i;
+
+ rf_ResetDebugOptions();
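+	/*
+	 * Each debugVars entry has the form "<name> <value>", where the
+	 * value may be decimal or 0x-prefixed hex.
+	 */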
+	for (i = 0; i < RF_MAXDBGV && cfgPtr->debugVars[i][0]; i++) {
+ name_p = rf_find_non_white(&cfgPtr->debugVars[i][0]);
+ white_p = rf_find_white(name_p); /* skip to start of 2nd word */
+ val_p = rf_find_non_white(white_p);
+ if (*val_p == '0' && *(val_p+1) == 'x') val = rf_htoi(val_p+2);
+ else val = rf_atoi(val_p);
+ *white_p = '\0';
+ set_debug_option(name_p, val);
+ }
+}
+
+/* performance monitoring stuff */
+
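+/* convert a struct timeval into a single microsecond count */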
+#define TIMEVAL_TO_US(t) (((long) t.tv_sec) * 1000000L + (long) t.tv_usec)
+
+#if !defined(KERNEL) && !defined(SIMULATE)
+
+/*
+ * Throughput stats currently only used in user-level RAIDframe
+ */
+
+static int rf_InitThroughputStats(
+ RF_ShutdownList_t **listp,
+ RF_Raid_t *raidPtr,
+ RF_Config_t *cfgPtr)
+{
+ int rc;
+
+ /* these used by user-level raidframe only */
+ rc = rf_create_managed_mutex(listp, &raidPtr->throughputstats.mutex);
+ if (rc) {
+ RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ return(rc);
+ }
+ raidPtr->throughputstats.sum_io_us = 0;
+ raidPtr->throughputstats.num_ios = 0;
+ raidPtr->throughputstats.num_out_ios = 0;
+ return(0);
+}
+
+void rf_StartThroughputStats(RF_Raid_t *raidPtr)
+{
+ RF_LOCK_MUTEX(raidPtr->throughputstats.mutex);
+ raidPtr->throughputstats.num_ios++;
+ raidPtr->throughputstats.num_out_ios++;
+ if (raidPtr->throughputstats.num_out_ios == 1)
+ RF_GETTIME(raidPtr->throughputstats.start);
+ RF_UNLOCK_MUTEX(raidPtr->throughputstats.mutex);
+}
+
+static void rf_StopThroughputStats(RF_Raid_t *raidPtr)
+{
+ struct timeval diff;
+
+ RF_LOCK_MUTEX(raidPtr->throughputstats.mutex);
+ raidPtr->throughputstats.num_out_ios--;
+ if (raidPtr->throughputstats.num_out_ios == 0) {
+ RF_GETTIME(raidPtr->throughputstats.stop);
+ RF_TIMEVAL_DIFF(&raidPtr->throughputstats.start, &raidPtr->throughputstats.stop, &diff);
+ raidPtr->throughputstats.sum_io_us += TIMEVAL_TO_US(diff);
+ }
+ RF_UNLOCK_MUTEX(raidPtr->throughputstats.mutex);
+}
+
+static void rf_PrintThroughputStats(RF_Raid_t *raidPtr)
+{
+ RF_ASSERT(raidPtr->throughputstats.num_out_ios == 0);
+ if ( raidPtr->throughputstats.sum_io_us != 0 ) {
+		printf("[Throughput: %8.2f IOs/second]\n", raidPtr->throughputstats.num_ios
+ / (raidPtr->throughputstats.sum_io_us / 1000000.0));
+ }
+}
+
+#endif /* !KERNEL && !SIMULATE */
+
+void rf_StartUserStats(RF_Raid_t *raidPtr)
+{
+ RF_GETTIME(raidPtr->userstats.start);
+ raidPtr->userstats.sum_io_us = 0;
+ raidPtr->userstats.num_ios = 0;
+ raidPtr->userstats.num_sect_moved = 0;
+}
+
+void rf_StopUserStats(RF_Raid_t *raidPtr)
+{
+ RF_GETTIME(raidPtr->userstats.stop);
+}
+
+void rf_UpdateUserStats(raidPtr, rt, numsect)
+ RF_Raid_t *raidPtr;
+ int rt; /* resp time in us */
+ int numsect; /* number of sectors for this access */
+{
+ raidPtr->userstats.sum_io_us += rt;
+ raidPtr->userstats.num_ios++;
+ raidPtr->userstats.num_sect_moved += numsect;
+}
+
+void rf_PrintUserStats(RF_Raid_t *raidPtr)
+{
+ long elapsed_us, mbs, mbs_frac;
+ struct timeval diff;
+
+ RF_TIMEVAL_DIFF(&raidPtr->userstats.start, &raidPtr->userstats.stop, &diff);
+ elapsed_us = TIMEVAL_TO_US(diff);
+
+	/* 2000 sectors per megabyte, 1000000 microseconds per second */
+	if (elapsed_us >= 1000000)
+		mbs = (raidPtr->userstats.num_sect_moved / 2000) / (elapsed_us / 1000000);
+	else
+		mbs = 0;
+
+ /* this computes only the first digit of the fractional mb/s moved */
+	if (elapsed_us >= 1000000) {
+ mbs_frac = ((raidPtr->userstats.num_sect_moved / 200) / (elapsed_us / 1000000))
+ - (mbs * 10);
+ }
+ else {
+ mbs_frac = 0;
+ }
+
+ printf("Number of I/Os: %ld\n",raidPtr->userstats.num_ios);
+ printf("Elapsed time (us): %ld\n",elapsed_us);
+ printf("User I/Os per second: %ld\n",RF_DB0_CHECK(raidPtr->userstats.num_ios, (elapsed_us/1000000)));
+ printf("Average user response time: %ld us\n",RF_DB0_CHECK(raidPtr->userstats.sum_io_us, raidPtr->userstats.num_ios));
+ printf("Total sectors moved: %ld\n",raidPtr->userstats.num_sect_moved);
+ printf("Average access size (sect): %ld\n",RF_DB0_CHECK(raidPtr->userstats.num_sect_moved, raidPtr->userstats.num_ios));
+ printf("Achieved data rate: %ld.%ld MB/sec\n",mbs,mbs_frac);
+}
diff --git a/sys/dev/raidframe/rf_driver.h b/sys/dev/raidframe/rf_driver.h
new file mode 100644
index 00000000000..7c9a1c4084b
--- /dev/null
+++ b/sys/dev/raidframe/rf_driver.h
@@ -0,0 +1,126 @@
+/* $OpenBSD: rf_driver.h,v 1.1 1999/01/11 14:29:19 niklas Exp $ */
+/* $NetBSD: rf_driver.h,v 1.1 1998/11/13 04:20:29 oster Exp $ */
+/*
+ * rf_driver.h
+ */
+/*
+ * Copyright (c) 1996 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+/*
+ * :
+ * Log: rf_driver.h,v
+ * Revision 1.11 1996/07/11 19:08:00 jimz
+ * generalize reconstruction mechanism
+ * allow raid1 reconstructs via copyback (done with array
+ * quiesced, not online, therefore not disk-directed)
+ *
+ * Revision 1.10 1996/06/10 14:18:58 jimz
+ * move user, throughput stats into per-array structure
+ *
+ * Revision 1.9 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.8 1996/06/05 18:06:02 jimz
+ * Major code cleanup. The Great Renaming is now done.
+ * Better modularity. Better typing. Fixed a bunch of
+ * synchronization bugs. Made a lot of global stuff
+ * per-desc or per-array. Removed dead code.
+ *
+ * Revision 1.7 1996/05/30 11:29:41 jimz
+ * Numerous bug fixes. Stripe lock release code disagreed with the taking code
+ * about when stripes should be locked (I made it consistent: no parity, no lock)
+ * There was a lot of extra serialization of I/Os which I've removed- a lot of
+ * it was to calculate values for the cache code, which is no longer with us.
+ * More types, function, macro cleanup. Added code to properly quiesce the array
+ * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
+ * before. Fixed memory allocation, freeing bugs.
+ *
+ * Revision 1.6 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.5 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.4 1996/05/24 04:28:55 jimz
+ * release cleanup ckpt
+ *
+ * Revision 1.3 1996/05/24 01:59:45 jimz
+ * another checkpoint in code cleanup for release
+ * time to sync kernel tree
+ *
+ * Revision 1.2 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.1 1996/05/18 19:56:10 jimz
+ * Initial revision
+ *
+ */
+
+#ifndef _RF__RF_DRIVER_H_
+#define _RF__RF_DRIVER_H_
+
+#include "rf_threadstuff.h"
+#include "rf_types.h"
+
+RF_DECLARE_EXTERN_MUTEX(rf_printf_mutex)
+
+int rf_BootRaidframe(void);
+int rf_UnbootRaidframe(void);
+int rf_Shutdown(RF_Raid_t *raidPtr);
+int rf_Configure(RF_Raid_t *raidPtr, RF_Config_t *cfgPtr);
+RF_RaidAccessDesc_t *rf_AllocRaidAccDesc(RF_Raid_t *raidPtr, RF_IoType_t type,
+ RF_RaidAddr_t raidAddress, RF_SectorCount_t numBlocks, caddr_t bufPtr,
+ void *bp, RF_DagHeader_t **paramDAG, RF_AccessStripeMapHeader_t **paramASM,
+ RF_RaidAccessFlags_t flags, void (*cbF)(struct buf *), void *cbA,
+ RF_AccessState_t *states);
+void rf_FreeRaidAccDesc(RF_RaidAccessDesc_t *desc);
+int rf_DoAccess(RF_Raid_t *raidPtr, RF_IoType_t type, int async_flag,
+ RF_RaidAddr_t raidAddress, RF_SectorCount_t numBlocks, caddr_t bufPtr,
+ void *bp_in, RF_DagHeader_t **paramDAG,
+ RF_AccessStripeMapHeader_t **paramASM, RF_RaidAccessFlags_t flags,
+ RF_RaidAccessDesc_t **paramDesc, void (*cbF)(struct buf *), void *cbA);
+int rf_SetReconfiguredMode(RF_Raid_t *raidPtr, RF_RowCol_t row,
+ RF_RowCol_t col);
+int rf_FailDisk(RF_Raid_t *raidPtr, RF_RowCol_t frow, RF_RowCol_t fcol,
+ int initRecon);
+#ifdef SIMULATE
+void rf_ScheduleContinueReconstructFailedDisk(RF_RaidReconDesc_t *reconDesc);
+#endif /* SIMULATE */
+void rf_SignalQuiescenceLock(RF_Raid_t *raidPtr, RF_RaidReconDesc_t *reconDesc);
+int rf_SuspendNewRequestsAndWait(RF_Raid_t *raidPtr);
+void rf_ResumeNewRequests(RF_Raid_t *raidPtr);
+void rf_StartThroughputStats(RF_Raid_t *raidPtr);
+void rf_StartUserStats(RF_Raid_t *raidPtr);
+void rf_StopUserStats(RF_Raid_t *raidPtr);
+void rf_UpdateUserStats(RF_Raid_t *raidPtr, int rt, int numsect);
+void rf_PrintUserStats(RF_Raid_t *raidPtr);
+
+#endif /* !_RF__RF_DRIVER_H_ */
diff --git a/sys/dev/raidframe/rf_engine.c b/sys/dev/raidframe/rf_engine.c
new file mode 100644
index 00000000000..c99782cbed5
--- /dev/null
+++ b/sys/dev/raidframe/rf_engine.c
@@ -0,0 +1,1096 @@
+/* $OpenBSD: rf_engine.c,v 1.1 1999/01/11 14:29:19 niklas Exp $ */
+/* $NetBSD: rf_engine.c,v 1.2 1998/11/13 11:48:26 simonb Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: William V. Courtright II, Mark Holland, Rachad Youssef
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/****************************************************************************
+ * *
+ * engine.c -- code for DAG execution engine *
+ * *
+ * Modified to work as follows (holland): *
+ * A user-thread calls into DispatchDAG, which fires off the nodes that *
+ * are direct successors to the header node. DispatchDAG then returns, *
+ * and the rest of the I/O continues asynchronously. As each node *
+ * completes, the node execution function calls FinishNode(). FinishNode *
+ * scans the list of successors to the node and increments the antecedent *
+ * counts. Each node that becomes enabled is placed on a central node *
+ * queue. A dedicated dag-execution thread grabs nodes off of this *
+ * queue and fires them. *
+ * *
+ * NULL nodes are never fired. *
+ * *
+ * Terminator nodes are never fired, but rather cause the callback *
+ * associated with the DAG to be invoked. *
+ * *
+ * If a node fails, the dag either rolls forward to the completion or *
+ * rolls back, undoing previously-completed nodes and fails atomically. *
+ * The direction of recovery is determined by the location of the failed *
+ * node in the graph. If the failure occurred before the commit node in *
+ * the graph, backward recovery is used. Otherwise, forward recovery is *
+ * used. *
+ * *
+ ****************************************************************************/
+
+/*
+ * :
+ *
+ * Log: rf_engine.c,v
+ * Revision 1.56 1996/07/28 20:31:39 jimz
+ * i386netbsd port
+ * true/false fixup
+ *
+ * Revision 1.55 1996/07/22 19:52:16 jimz
+ * switched node params to RF_DagParam_t, a union of
+ * a 64-bit int and a void *, for better portability
+ * attempted hpux port, but failed partway through for
+ * lack of a single C compiler capable of compiling all
+ * source files
+ *
+ * Revision 1.54 1996/07/17 21:00:58 jimz
+ * clean up timer interface, tracing
+ *
+ * Revision 1.53 1996/07/15 17:22:18 jimz
+ * nit-pick code cleanup
+ * resolve stdlib problems on DEC OSF
+ *
+ * Revision 1.52 1996/06/17 03:17:08 jimz
+ * correctly shut down engine thread in kernel
+ *
+ * Revision 1.51 1996/06/14 15:02:10 jimz
+ * make new engine code happy in simulator
+ *
+ * Revision 1.50 1996/06/14 14:19:48 jimz
+ * use diskgroup to control engine thread, make all engine-thread-related
+ * stuff per-array
+ *
+ * Revision 1.49 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.48 1996/06/09 02:36:46 jimz
+ * lots of little crufty cleanup- fixup whitespace
+ * issues, comment #ifdefs, improve typing in some
+ * places (esp size-related)
+ *
+ * Revision 1.47 1996/06/06 01:23:23 jimz
+ * fix bug in node traversal when firing multiple nodes simultaneously
+ *
+ * Revision 1.46 1996/06/05 18:06:02 jimz
+ * Major code cleanup. The Great Renaming is now done.
+ * Better modularity. Better typing. Fixed a bunch of
+ * synchronization bugs. Made a lot of global stuff
+ * per-desc or per-array. Removed dead code.
+ *
+ * Revision 1.45 1996/05/30 12:59:18 jimz
+ * make etimer happier, more portable
+ *
+ * Revision 1.44 1996/05/30 11:29:41 jimz
+ * Numerous bug fixes. Stripe lock release code disagreed with the taking code
+ * about when stripes should be locked (I made it consistent: no parity, no lock)
+ * There was a lot of extra serialization of I/Os which I've removed- a lot of
+ * it was to calculate values for the cache code, which is no longer with us.
+ * More types, function, macro cleanup. Added code to properly quiesce the array
+ * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
+ * before. Fixed memory allocation, freeing bugs.
+ *
+ * Revision 1.43 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.42 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.41 1996/05/24 04:28:55 jimz
+ * release cleanup ckpt
+ *
+ * Revision 1.40 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.39 1996/05/20 16:15:17 jimz
+ * switch to rf_{mutex,cond}_{init,destroy}
+ *
+ * Revision 1.38 1996/05/18 20:09:54 jimz
+ * bit of cleanup to compile cleanly in kernel, once again
+ *
+ * Revision 1.37 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.36 1996/05/15 20:24:19 wvcii
+ * fixed syntax bug in SIMULATE clause above ProcessNode
+ *
+ * Revision 1.35 1996/05/08 21:01:24 jimz
+ * fixed up enum type names that were conflicting with other
+ * enums and function names (ie, "panic")
+ * future naming trends will be towards RF_ and rf_ for
+ * everything raidframe-related
+ *
+ * Revision 1.34 1996/05/08 15:25:28 wvcii
+ * eliminated dead code
+ * merged common cases (sim/user/kernel)
+ * entire node lists (arrays) now fired atomically
+ * reordered source code for readability
+ * beefed-up & corrected comments
+ *
+ * Revision 1.33 1996/05/07 19:39:40 jimz
+ * 1. fixed problems in PropogateResults() with nodes being referenced
+ * after they were no longer valid
+ * 2. fixed problems in PropogateResults() with the node list being
+ * incorrectly threaded
+ *
+ * Revision 1.32 1996/05/07 19:03:56 wvcii
+ * in PropagateResults, fixed a bug in the rollBackward case:
+ * node data is copied before the call to FinishNode which
+ * frees the node and destroys its data.
+ *
+ * Revision 1.31 1996/05/07 17:45:17 jimz
+ * remove old #if 0 code from PropogateResults() (was kept in
+ * previous version for archival purposes (rcsdiff))
+ *
+ * Revision 1.30 1996/05/07 17:44:19 jimz
+ * fix threading of nodes to be fired in PropagateResults()
+ * fix iteration through skiplist in PropagateResults()
+ * fix incorrect accesses to freed memory (dereferencing a
+ * node that was freed by the action of calling FinishNode()
+ * on it, which in turn completed its DAG) in PropagateResults()
+ *
+ * Revision 1.29 1996/05/02 15:04:15 wvcii
+ * fixed bad array index in PropagateResults
+ *
+ * Revision 1.28 1995/12/12 18:10:06 jimz
+ * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT
+ * fix 80-column brain damage in comments
+ *
+ * Revision 1.27 1995/12/08 15:07:03 arw
+ * cache code cleanup
+ *
+ * Revision 1.26 1995/11/07 16:18:01 wvcii
+ * numerous changes associated with roll-away error recovery
+ * when a node fails, dag enters rollForward or rollBackward state
+ *
+ * Revision 1.25 1995/09/06 19:27:17 wvcii
+ * added debug vars enableRollAway and debugRecovery
+ *
+ */
+
+#ifdef _KERNEL
+#define KERNEL
+#endif
+
+#include "rf_threadstuff.h"
+
+#ifndef KERNEL
+#include <stdio.h>
+#include <stdlib.h>
+#endif /* !KERNEL */
+
+#include <sys/errno.h>
+
+#include "rf_dag.h"
+#include "rf_engine.h"
+#include "rf_threadid.h"
+#include "rf_etimer.h"
+#include "rf_general.h"
+#include "rf_dagutils.h"
+#include "rf_shutdown.h"
+#include "rf_raid.h"
+
+#ifndef SIMULATE
+static void DAGExecutionThread(RF_ThreadArg_t arg);
+#endif /* !SIMULATE */
+
+#define DO_INIT(_l_,_r_) { \
+ int _rc; \
+ _rc = rf_create_managed_mutex(_l_,&(_r_)->node_queue_mutex); \
+ if (_rc) { \
+ return(_rc); \
+ } \
+ _rc = rf_create_managed_cond(_l_,&(_r_)->node_queue_cond); \
+ if (_rc) { \
+ return(_rc); \
+ } \
+}
+
+/* synchronization primitives for this file. DO_WAIT should be enclosed in a while loop. */
+#ifndef KERNEL
+
+#define DO_LOCK(_r_) RF_LOCK_MUTEX((_r_)->node_queue_mutex)
+#define DO_UNLOCK(_r_) RF_UNLOCK_MUTEX((_r_)->node_queue_mutex)
+#define DO_WAIT(_r_) RF_WAIT_COND((_r_)->node_queue_cond, (_r_)->node_queue_mutex)
+#define DO_SIGNAL(_r_) RF_SIGNAL_COND((_r_)->node_queue_cond)
+
+#else /* !KERNEL */
+
+/*
+ * XXX Is this spl-ing really necessary?
+ */
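+/*
+ * Note that the kernel variants stash the saved SPL in a local variable
+ * "ks", which each calling function is expected to declare.
+ */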
+#define DO_LOCK(_r_) { ks = splbio(); RF_LOCK_MUTEX((_r_)->node_queue_mutex); }
+#define DO_UNLOCK(_r_) { RF_UNLOCK_MUTEX((_r_)->node_queue_mutex); splx(ks); }
+#if !defined(__NetBSD__) && !defined(__OpenBSD__)
+#define DO_WAIT(_r_) mpsleep(&(_r_)->node_queue, PZERO, "raidframe nq", 0, (void *) simple_lock_addr((_r_)->node_queue_mutex), MS_LOCK_SIMPLE)
+#else
+#define DO_WAIT(_r_) tsleep(&(_r_)->node_queue, PRIBIO | PCATCH, "raidframe nq",0)
+#endif
+#define DO_SIGNAL(_r_) wakeup(&(_r_)->node_queue)
+
+#endif /* !KERNEL */
+
+static void rf_ShutdownEngine(void *);
+
+static void rf_ShutdownEngine(arg)
+ void *arg;
+{
+ RF_Raid_t *raidPtr;
+
+ raidPtr = (RF_Raid_t *)arg;
+#ifndef SIMULATE
+ raidPtr->shutdown_engine = 1;
+ DO_SIGNAL(raidPtr);
+ /* XXX something is missing here... */
+#ifdef DEBUG
+ printf("IGNORING WAIT_STOP\n");
+#endif
+#if 0
+ RF_THREADGROUP_WAIT_STOP(&raidPtr->engine_tg);
+#endif
+#endif /* !SIMULATE */
+}
+
+int rf_ConfigureEngine(
+ RF_ShutdownList_t **listp,
+ RF_Raid_t *raidPtr,
+ RF_Config_t *cfgPtr)
+{
+ int rc, tid=0;
+
+ if (rf_engineDebug) {
+ rf_get_threadid(tid);
+ }
+
+ DO_INIT(listp,raidPtr);
+
+ raidPtr->node_queue = NULL;
+ raidPtr->dags_in_flight = 0;
+
+#ifndef SIMULATE
+ rc = rf_init_managed_threadgroup(listp, &raidPtr->engine_tg);
+ if (rc)
+ return(rc);
+
+	/* Create the DAG execution thread for this array.  If thread
+	 * creation fails, we return ENOMEM and the configuration is unwound.
+	 */
+ if (rf_engineDebug) {
+ printf("[%d] Creating engine thread\n", tid);
+ }
+
+ if (RF_CREATE_THREAD(raidPtr->engine_thread, DAGExecutionThread, raidPtr)) {
+ RF_ERRORMSG("RAIDFRAME: Unable to create engine thread\n");
+ return(ENOMEM);
+ }
+ if (rf_engineDebug) {
+ printf("[%d] Created engine thread\n", tid);
+ }
+ RF_THREADGROUP_STARTED(&raidPtr->engine_tg);
+ /* XXX something is missing here... */
+#ifdef DEBUG
+ printf("Skipping the WAIT_START!!\n");
+#endif
+#if 0
+ RF_THREADGROUP_WAIT_START(&raidPtr->engine_tg);
+#endif
+ /* engine thread is now running and waiting for work */
+ if (rf_engineDebug) {
+ printf("[%d] Engine thread running and waiting for events\n", tid);
+ }
+#endif /* !SIMULATE */
+
+ rc = rf_ShutdownCreate(listp, rf_ShutdownEngine, raidPtr);
+ if (rc) {
+ RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ rf_ShutdownEngine(NULL);
+ }
+
+ return(rc);
+}
+
+static int BranchDone(RF_DagNode_t *node)
+{
+ int i;
+
+	/* return true if forward execution is completed for a node and its succedents */
+ switch (node->status) {
+ case rf_wait :
+ /* should never be called in this state */
+ RF_PANIC();
+ break;
+ case rf_fired :
+ /* node is currently executing, so we're not done */
+ return(RF_FALSE);
+ case rf_good :
+ for (i = 0; i < node->numSuccedents; i++) /* for each succedent */
+ if (!BranchDone(node->succedents[i])) /* recursively check branch */
+ return RF_FALSE;
+ return RF_TRUE; /* neither this node nor any succedent branch is still in the fired state */
+ break;
+ case rf_bad :
+ /* succedents can't fire */
+ return(RF_TRUE);
+ case rf_recover :
+ /* should never be called in this state */
+ RF_PANIC();
+ break;
+ case rf_undone :
+ case rf_panic :
+ /* XXX need to fix this case */
+ /* for now, assume that we're done */
+ return(RF_TRUE);
+ break;
+ default :
+ /* illegal node status */
+ RF_PANIC();
+ break;
+ }
+}
+
+#ifdef SIMULATE
+/* this is only ifdef SIMULATE because nothing else needs it */
+/* recursively determine if a DAG has completed execution */
+static int DAGDone(RF_DagHeader_t *dag)
+{
+ int i;
+
+ for (i = 0; i < dag->numSuccedents; i++)
+ if (!BranchDone(dag->succedents[i]))
+ return RF_FALSE;
+ return RF_TRUE;
+}
+#endif /* SIMULATE */
+
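+/*
+ * NodeReady: in the forward (enable/rollForward) direction a node is ready
+ * once all of its antecedents have completed; in the rollBackward direction a
+ * node (still marked rf_good) is ready once all of its succedents have been
+ * undone.
+ */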
+static int NodeReady(RF_DagNode_t *node)
+{
+ int ready;
+
+ switch (node->dagHdr->status) {
+ case rf_enable :
+ case rf_rollForward :
+ if ((node->status == rf_wait) && (node->numAntecedents == node->numAntDone))
+ ready = RF_TRUE;
+ else
+ ready = RF_FALSE;
+ break;
+ case rf_rollBackward :
+ RF_ASSERT(node->numSuccDone <= node->numSuccedents);
+ RF_ASSERT(node->numSuccFired <= node->numSuccedents);
+ RF_ASSERT(node->numSuccFired <= node->numSuccDone);
+ if ((node->status == rf_good) && (node->numSuccDone == node->numSuccedents))
+ ready = RF_TRUE;
+ else
+ ready = RF_FALSE;
+ break;
+ default :
+ printf("Execution engine found illegal DAG status in NodeReady\n");
+ RF_PANIC();
+ break;
+ }
+
+ return(ready);
+}
+
+
+
+/* user context and dag-exec-thread context:
+ * Fire a node. The node's status field determines which function, do or undo,
+ * is fired.
+ * This routine assumes that the node's status field has already been set to
+ * "fired" or "recover" to indicate the direction of execution.
+ */
+static void FireNode(RF_DagNode_t *node)
+{
+ int tid;
+
+ switch (node->status) {
+ case rf_fired :
+ /* fire the do function of a node */
+ if (rf_engineDebug) {
+ rf_get_threadid(tid);
+ printf("[%d] Firing node 0x%lx (%s)\n",tid,(unsigned long) node, node->name);
+ }
+#ifdef KERNEL
+ if (node->flags & RF_DAGNODE_FLAG_YIELD) {
+#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL)
+ /* thread_block(); */
+ /* printf("Need to block the thread here...\n"); */
+ /* XXX thread_block is actually mentioned in
+ /usr/include/vm/vm_extern.h */
+#else
+ thread_block();
+#endif
+ }
+#endif /* KERNEL */
+ (*(node->doFunc)) (node);
+ break;
+ case rf_recover :
+ /* fire the undo function of a node */
+ if (rf_engineDebug || 1) {
+ rf_get_threadid(tid);
+ printf("[%d] Firing (undo) node 0x%lx (%s)\n",tid,(unsigned long) node, node->name);
+ }
+#ifdef KERNEL
+ if (node->flags & RF_DAGNODE_FLAG_YIELD) {
+#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL)
+ /* thread_block(); */
+ /* printf("Need to block the thread here...\n"); */
+ /* XXX thread_block is actually mentioned in
+ /usr/include/vm/vm_extern.h */
+#else
+ thread_block();
+#endif
+ }
+#endif /* KERNEL */
+ (*(node->undoFunc)) (node);
+ break;
+ default :
+ RF_PANIC();
+ break;
+ }
+}
+
+
+
+/* user context:
+ * Attempt to fire each node in a linear array.
+ * The entire list is fired atomically.
+ */
+static void FireNodeArray(
+ int numNodes,
+ RF_DagNode_t **nodeList)
+{
+ RF_DagStatus_t dstat;
+ RF_DagNode_t *node;
+ int i, j;
+
+ /* first, mark all nodes which are ready to be fired */
+ for (i = 0; i < numNodes; i++) {
+ node = nodeList[i];
+ dstat = node->dagHdr->status;
+ RF_ASSERT((node->status == rf_wait) || (node->status == rf_good));
+ if (NodeReady(node)) {
+ if ((dstat == rf_enable) || (dstat == rf_rollForward)) {
+ RF_ASSERT(node->status == rf_wait);
+ if (node->commitNode)
+ node->dagHdr->numCommits++;
+ node->status = rf_fired;
+ for (j = 0; j < node->numAntecedents; j++)
+ node->antecedents[j]->numSuccFired++;
+ }
+ else {
+ RF_ASSERT(dstat == rf_rollBackward);
+ RF_ASSERT(node->status == rf_good);
+ RF_ASSERT(node->commitNode == RF_FALSE); /* only one commit node per graph */
+ node->status = rf_recover;
+ }
+ }
+ }
+ /* now, fire the nodes */
+ for (i = 0; i < numNodes; i++) {
+ if ((nodeList[i]->status == rf_fired) || (nodeList[i]->status == rf_recover))
+ FireNode(nodeList[i]);
+ }
+}
+
+
+#ifndef SIMULATE
+/* user context:
+ * Attempt to fire each node in a linked list.
+ * The entire list is fired atomically.
+ */
+static void FireNodeList(RF_DagNode_t *nodeList)
+{
+ RF_DagNode_t *node, *next;
+ RF_DagStatus_t dstat;
+ int j;
+
+ if (nodeList) {
+ /* first, mark all nodes which are ready to be fired */
+ for (node = nodeList; node; node = next) {
+ next = node->next;
+ dstat = node->dagHdr->status;
+ RF_ASSERT((node->status == rf_wait) || (node->status == rf_good));
+ if (NodeReady(node)) {
+ if ((dstat == rf_enable) || (dstat == rf_rollForward)) {
+ RF_ASSERT(node->status == rf_wait);
+ if (node->commitNode)
+ node->dagHdr->numCommits++;
+ node->status = rf_fired;
+ for (j = 0; j < node->numAntecedents; j++)
+ node->antecedents[j]->numSuccFired++;
+ }
+ else {
+ RF_ASSERT(dstat == rf_rollBackward);
+ RF_ASSERT(node->status == rf_good);
+ RF_ASSERT(node->commitNode == RF_FALSE); /* only one commit node per graph */
+ node->status = rf_recover;
+ }
+ }
+ }
+ /* now, fire the nodes */
+ for (node = nodeList; node; node = next) {
+ next = node->next;
+ if ((node->status == rf_fired) || (node->status == rf_recover))
+ FireNode(node);
+ }
+ }
+}
+#endif /* !SIMULATE */
+
+
+
+/* interrupt context:
+ * for each succedent
+ * propagate required results from node to succedent
+ * increment succedent's numAntDone
+ * place newly-enabled nodes on the node queue for firing
+ *
+ * To save context switches, we don't place NIL nodes on the node queue,
+ * but rather just process them as if they had fired. Note that NIL nodes
+ * that are the direct successors of the header will actually get fired by
+ * DispatchDAG, which is fine because no context switches are involved.
+ *
+ * Important: when running at user level, this can be called by any
+ * disk thread, and so the increment and check of the antecedent count
+ * must be locked. I used the node queue mutex and locked down the
+ * entire function, but this is certainly overkill.
+ */
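+/*
+ * In the non-SIMULATE case below, newly-enabled succedents are sorted onto
+ * three local lists: "finishlist" for NIL (null-function) nodes, which are
+ * completed in place; "skiplist" for nodes with a failed trueData antecedent,
+ * which are marked rf_skipped; and "firelist" (or the engine's node queue when
+ * called at interrupt context) for nodes that will actually be fired.
+ */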
+static void PropagateResults(
+ RF_DagNode_t *node,
+ int context)
+{
+ RF_DagNode_t *s, *a;
+ RF_Raid_t *raidPtr;
+ int tid, i, ks;
+#ifdef SIMULATE
+ RF_PropHeader_t *p; /* prop list for succedent i */
+#else /* SIMULATE */
+ RF_DagNode_t *finishlist = NULL; /* a list of NIL nodes to be finished */
+ RF_DagNode_t *skiplist = NULL; /* list of nodes with failed truedata antecedents */
+ RF_DagNode_t *firelist = NULL; /* a list of nodes to be fired */
+ RF_DagNode_t *q = NULL, *qh = NULL, *next;
+ int j, skipNode;
+#endif /* SIMULATE */
+
+ rf_get_threadid(tid);
+
+ raidPtr = node->dagHdr->raidPtr;
+
+ DO_LOCK(raidPtr);
+
+ /* debug - validate fire counts */
+ for (i = 0; i < node->numAntecedents; i++) {
+ a = *(node->antecedents + i);
+ RF_ASSERT(a->numSuccFired >= a->numSuccDone);
+ RF_ASSERT(a->numSuccFired <= a->numSuccedents);
+ a->numSuccDone++;
+ }
+
+ switch (node->dagHdr->status) {
+ case rf_enable :
+ case rf_rollForward :
+#ifdef SIMULATE
+ /* currently we never propagate results unless in simulation */
+ for (i = 0; i < node->numSuccedents; i++) {
+ s = *(node->succedents + i);
+ RF_ASSERT(s->status == rf_wait);
+ (s->numAntDone)++;
+ if (node->propList == NULL)
+ /* null propList implies no results to be propagated */
+ p = NULL;
+ else
+ /* p=head of prop list for succedent i */
+ p = *(node->propList + i);
+ while (p != NULL) {
+ /* bind node results to succedent's parameters */
+#if 0
+ *(s->params + p->paramNum) = *(node->results + p->resultNum);
+#else
+ s->params[p->paramNum].p = node->results[p->resultNum];
+#endif
+ p = p->next;
+ }
+ }
+#else /* SIMULATE */
+ for (i = 0; i < node->numSuccedents; i++) {
+ s = *(node->succedents + i);
+ RF_ASSERT(s->status == rf_wait);
+ (s->numAntDone)++;
+ if (s->numAntDone == s->numAntecedents) {
+ /* look for NIL nodes */
+ if (s->doFunc == rf_NullNodeFunc) {
+ /* don't fire NIL nodes, just process them */
+ s->next = finishlist;
+ finishlist = s;
+ }
+ else {
+ /* look to see if the node is to be skipped */
+ skipNode = RF_FALSE;
+ for (j = 0; j < s->numAntecedents; j++)
+ if ((s->antType[j] == rf_trueData) && (s->antecedents[j]->status == rf_bad))
+ skipNode = RF_TRUE;
+ if (skipNode) {
+ /* this node has one or more failed true data dependencies, so skip it */
+ s->next = skiplist;
+ skiplist = s;
+ }
+ else
+ /* add s to list of nodes (q) to execute */
+ if (context != RF_INTR_CONTEXT) {
+ /* we only have to enqueue if we're at intr context */
+ s->next = firelist; /* put node on a list to be fired after we unlock */
+ firelist = s;
+ } else { /* enqueue the node for the dag exec thread to fire */
+ RF_ASSERT(NodeReady(s));
+ if (q) {
+ q->next = s;
+ q = s;
+ }
+ else {
+ qh = q = s;
+ qh->next = NULL;
+ }
+ }
+ }
+ }
+ }
+
+ if (q) {
+ /* xfer our local list of nodes to the node queue */
+ q->next = raidPtr->node_queue; raidPtr->node_queue = qh;
+ DO_SIGNAL(raidPtr);
+ }
+ DO_UNLOCK(raidPtr);
+
+ for (; skiplist; skiplist = next) {
+ next = skiplist->next;
+ skiplist->status = rf_skipped;
+ for (i = 0; i < skiplist->numAntecedents; i++) {
+ skiplist->antecedents[i]->numSuccFired++;
+ }
+ if (skiplist->commitNode) {
+ skiplist->dagHdr->numCommits++;
+ }
+ rf_FinishNode(skiplist, context);
+ }
+ for (; finishlist; finishlist = next) {
+ /* NIL nodes: no need to fire them */
+ next = finishlist->next;
+ finishlist->status = rf_good;
+ for (i = 0; i < finishlist->numAntecedents; i++) {
+ finishlist->antecedents[i]->numSuccFired++;
+ }
+ if (finishlist->commitNode)
+ finishlist->dagHdr->numCommits++;
+ /*
+ * Okay, here we're calling rf_FinishNode() on nodes that
+ * have the null function as their work proc. Such a node
+ * could be the terminal node in a DAG. If so, it will
+ * cause the DAG to complete, which will in turn free
+ * memory used by the DAG, which includes the node in
+ * question. Thus, we must avoid referencing the node
+ * at all after calling rf_FinishNode() on it.
+ */
+ rf_FinishNode(finishlist, context); /* recursive call */
+ }
+ /* fire all nodes in firelist */
+ FireNodeList(firelist);
+#endif /* SIMULATE */
+ break;
+
+ case rf_rollBackward :
+#ifdef SIMULATE
+#else /* SIMULATE */
+ for (i = 0; i < node->numAntecedents; i++) {
+ a = *(node->antecedents + i);
+ RF_ASSERT(a->status == rf_good);
+ RF_ASSERT(a->numSuccDone <= a->numSuccedents);
+ RF_ASSERT(a->numSuccDone <= a->numSuccFired);
+
+ if (a->numSuccDone == a->numSuccFired) {
+ if (a->undoFunc == rf_NullNodeFunc) {
+ /* don't fire NIL nodes, just process them */
+ a->next = finishlist;
+ finishlist = a;
+ } else {
+ if (context != RF_INTR_CONTEXT) {
+ /* we only have to enqueue if we're at intr context */
+ a->next = firelist; /* put node on a list to be fired after we unlock */
+ firelist = a;
+ } else { /* enqueue the node for the dag exec thread to fire */
+ RF_ASSERT(NodeReady(a));
+ if (q) {
+ q->next = a;
+ q = a;
+ }
+ else {
+ qh = q = a;
+ qh->next = NULL;
+ }
+ }
+ }
+ }
+
+ }
+ if (q) {
+ /* xfer our local list of nodes to the node queue */
+ q->next = raidPtr->node_queue; raidPtr->node_queue = qh;
+ DO_SIGNAL(raidPtr);
+ }
+ DO_UNLOCK(raidPtr);
+ for (; finishlist; finishlist = next) { /* NIL nodes: no need to fire them */
+ next = finishlist->next;
+ finishlist->status = rf_good;
+ /*
+ * Okay, here we're calling rf_FinishNode() on nodes that
+ * have the null function as their work proc. Such a node
+ * could be the first node in a DAG. If so, it will
+ * cause the DAG to complete, which will in turn free
+ * memory used by the DAG, which includes the node in
+ * question. Thus, we must avoid referencing the node
+ * at all after calling rf_FinishNode() on it.
+ */
+ rf_FinishNode(finishlist, context); /* recursive call */
+ }
+ /* fire all nodes in firelist */
+ FireNodeList(firelist);
+#endif /* SIMULATE */
+
+ break;
+ default :
+ printf("Engine found illegal DAG status in PropagateResults()\n");
+ RF_PANIC();
+ break;
+ }
+}
+
+
+
+/*
+ * Process a fired node which has completed
+ */
+static void ProcessNode(
+ RF_DagNode_t *node,
+ int context)
+{
+ RF_Raid_t *raidPtr;
+ int tid;
+
+ raidPtr = node->dagHdr->raidPtr;
+
+ switch (node->status) {
+ case rf_good :
+ /* normal case, don't need to do anything */
+ break;
+ case rf_bad :
+ if ((node->dagHdr->numCommits > 0) || (node->dagHdr->numCommitNodes == 0)) {
+ node->dagHdr->status = rf_rollForward; /* crossed commit barrier */
+ if (rf_engineDebug || 1) {
+ rf_get_threadid(tid);
+ printf("[%d] node (%s) returned fail, rolling forward\n", tid, node->name);
+ }
+ }
+ else {
+ node->dagHdr->status = rf_rollBackward; /* never reached commit barrier */
+ if (rf_engineDebug || 1) {
+ rf_get_threadid(tid);
+ printf("[%d] node (%s) returned fail, rolling backward\n", tid, node->name);
+ }
+ }
+ break;
+ case rf_undone :
+ /* normal rollBackward case, don't need to do anything */
+ break;
+ case rf_panic :
+ /* an undo node failed!!! */
+ printf("UNDO of a node failed!!!/n");
+ break;
+ default :
+ printf("node finished execution with an illegal status!!!\n");
+ RF_PANIC();
+ break;
+ }
+
+#ifdef SIMULATE
+ /* simulator fires nodes here.
+ * user/kernel rely upon PropagateResults to do this.
+ * XXX seems like this code should be merged so that the same thing happens in
+ * the sim, user, and kernel environments. -wvcii
+ */
+ switch (node->dagHdr->status) {
+ case rf_enable :
+ case rf_rollForward :
+ if (node->numSuccedents == 0) {
+ /* process terminal node */
+ if (rf_engineDebug) if (!DAGDone(node->dagHdr)) {
+ rf_get_threadid(tid);
+ printf("[%d] ProcessNode: !!!done but dag still in flight\n",tid);
+ RF_PANIC();
+ }
+ if (rf_engineDebug) printf("[%d] ProcessNode: !!!done will return true\n",tid);
+ /* Mark dag as done */
+ (node->dagHdr)->done=RF_TRUE;
+ raidPtr->dags_in_flight--;
+ }
+ else {
+ PropagateResults(node, context);
+ FireNodeArray(node->numSuccedents, node->succedents);
+ }
+ break;
+ case rf_rollBackward :
+ if (node->numAntecedents == 0) {
+ /* reached head of dag, we're done */
+ if (rf_engineDebug) if (!DAGDone(node->dagHdr)) {
+ rf_get_threadid(tid);
+ printf("[%d] ProcessNode: !!!done but dag still in flight\n",tid);
+ RF_PANIC();
+ }
+ if (rf_engineDebug) printf("[%d] ProcessNode: !!!done will return true\n",tid);
+ /* Mark dag as done */
+ (node->dagHdr)->done=RF_TRUE;
+ raidPtr->dags_in_flight--;
+ }
+ else {
+ PropagateResults(node, context);
+ FireNodeArray(node->numAntecedents, node->antecedents);
+ }
+ break;
+ default :
+ RF_PANIC();
+ break;
+ }
+
+
+#else /* SIMULATE */
+ /* enqueue node's succedents (antecedents if rollBackward) for execution */
+ PropagateResults(node, context);
+#endif /* SIMULATE */
+}
+
+
+
+/* user context or dag-exec-thread context:
+ * This is the first step in post-processing a newly-completed node.
+ * This routine is called by each node execution function to mark the node
+ * as complete and fire off any successors that have been enabled.
+ */
+int rf_FinishNode(
+ RF_DagNode_t *node,
+ int context)
+{
+ /* as far as I can tell, retcode is not used -wvcii */
+ int retcode = RF_FALSE;
+ node->dagHdr->numNodesCompleted++;
+ ProcessNode(node, context);
+
+#ifdef SIMULATE
+ if ((node->dagHdr)->done == RF_TRUE)
+ retcode = RF_TRUE;
+#endif /* SIMULATE */
+
+ return(retcode);
+}
+
+
+/* user context:
+ * submit dag for execution, return non-zero if we have to wait for completion.
+ * if and only if we return non-zero, we'll cause cbFunc to get invoked with
+ * cbArg when the DAG has completed.
+ *
+ * for now we always return 1. If the DAG does not cause any I/O, then the callback
+ * may get invoked before DispatchDAG returns. There's code in state 5 of ContinueRaidAccess
+ * to handle this.
+ *
+ * All we do here is fire the direct successors of the header node. The
+ * DAG execution thread does the rest of the dag processing.
+ */
+int rf_DispatchDAG(
+ RF_DagHeader_t *dag,
+ void (*cbFunc)(void *),
+ void *cbArg)
+{
+ RF_Raid_t *raidPtr;
+ int tid;
+
+ raidPtr = dag->raidPtr;
+ if (dag->tracerec) {
+ RF_ETIMER_START(dag->tracerec->timer);
+ }
+
+ if (rf_engineDebug || rf_validateDAGDebug) {
+ if (rf_ValidateDAG(dag))
+ RF_PANIC();
+ }
+ if (rf_engineDebug) {
+ rf_get_threadid(tid);
+ printf("[%d] Entering DispatchDAG\n",tid);
+ }
+
+ raidPtr->dags_in_flight++; /* debug only: blow off proper locking */
+ dag->cbFunc = cbFunc;
+ dag->cbArg = cbArg;
+ dag->numNodesCompleted = 0;
+ dag->status = rf_enable;
+ FireNodeArray(dag->numSuccedents, dag->succedents);
+ return(1);
+}
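+/*
+ * Usage sketch for rf_DispatchDAG() (hypothetical caller and names, cf. the
+ * comment above):
+ *
+ *     if (rf_DispatchDAG(dag, MyCompletionFunc, desc))
+ *         wait for MyCompletionFunc(desc) before reusing or freeing the DAG;
+ *
+ * Note that the callback may run before rf_DispatchDAG() itself returns if the
+ * DAG performs no I/O, so the caller must tolerate early completion.
+ */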
+
+/* dedicated kernel thread:
+ * the thread that handles all DAG node firing.
+ * To minimize locking and unlocking, we grab a copy of the entire node queue and then set the
+ * node queue to NULL before doing any firing of nodes. This way we only have to release the
+ * lock once. Of course, it's probably rare that there's more than one node in the queue at
+ * any one time, but it sometimes happens.
+ *
+ * In the kernel, this thread runs at spl0 and is not swappable. I copied these
+ * characteristics from the aio_completion_thread.
+ */
+
+#ifndef SIMULATE
+static void DAGExecutionThread(RF_ThreadArg_t arg)
+{
+ RF_DagNode_t *nd, *local_nq, *term_nq, *fire_nq;
+ RF_Raid_t *raidPtr;
+ int ks, tid;
+ int s;
+#if !defined(__NetBSD__) && !defined(__OpenBSD__)
+ RF_Thread_t thread;
+#endif
+
+ raidPtr = (RF_Raid_t *)arg;
+
+ rf_assign_threadid();
+ if (rf_engineDebug) {
+ rf_get_threadid(tid);
+ printf("[%d] Engine thread is running\n", tid);
+ }
+
+#ifdef KERNEL
+#if !defined(__NetBSD__) && !defined(__OpenBSD__)
+ thread = current_thread();
+ thread_swappable(thread, RF_FALSE);
+ thread->priority = thread->sched_pri = BASEPRI_SYSTEM;
+ s = spl0();
+#endif
+ /* XXX what to put here XXX */
+
+ s=splbio();
+
+#endif /* KERNEL */
+
+ RF_THREADGROUP_RUNNING(&raidPtr->engine_tg);
+
+ DO_LOCK(raidPtr);
+ while (!raidPtr->shutdown_engine) {
+
+ while (raidPtr->node_queue != NULL) {
+ local_nq = raidPtr->node_queue;
+ fire_nq = NULL;
+ term_nq = NULL;
+ raidPtr->node_queue = NULL;
+ DO_UNLOCK(raidPtr);
+
+ /* first, strip out the terminal nodes */
+ while (local_nq) {
+ nd = local_nq;
+ local_nq = local_nq->next;
+ switch(nd->dagHdr->status) {
+ case rf_enable :
+ case rf_rollForward :
+ if (nd->numSuccedents == 0) {
+ /* end of the dag, add to callback list */
+ nd->next = term_nq;
+ term_nq = nd;
+ }
+ else {
+ /* not the end, add to the fire queue */
+ nd->next = fire_nq;
+ fire_nq = nd;
+ }
+ break;
+ case rf_rollBackward :
+ if (nd->numAntecedents == 0) {
+ /* end of the dag, add to the callback list */
+ nd->next = term_nq;
+ term_nq = nd;
+ }
+ else {
+ /* not the end, add to the fire queue */
+ nd->next = fire_nq;
+ fire_nq = nd;
+ }
+ break;
+ default :
+ RF_PANIC();
+ break;
+ }
+ }
+
+ /* execute callback of dags which have reached the terminal node */
+ while (term_nq) {
+ nd = term_nq;
+ term_nq = term_nq->next;
+ nd->next = NULL;
+ (nd->dagHdr->cbFunc)(nd->dagHdr->cbArg);
+ raidPtr->dags_in_flight--; /* debug only */
+ }
+
+ /* fire remaining nodes */
+ FireNodeList(fire_nq);
+
+ DO_LOCK(raidPtr);
+ }
+ while (!raidPtr->shutdown_engine && raidPtr->node_queue == NULL)
+ DO_WAIT(raidPtr);
+ }
+ DO_UNLOCK(raidPtr);
+
+ RF_THREADGROUP_DONE(&raidPtr->engine_tg);
+#ifdef KERNEL
+#if defined(__NetBSD__) || defined(__OpenBSD__)
+ splx(s);
+ kthread_exit(0);
+#else
+ splx(s);
+ thread_terminate(thread);
+ thread_halt_self();
+#endif
+#endif /* KERNEL */
+}
+
+#endif /* !SIMULATE */
diff --git a/sys/dev/raidframe/rf_engine.h b/sys/dev/raidframe/rf_engine.h
new file mode 100644
index 00000000000..c3186aa791f
--- /dev/null
+++ b/sys/dev/raidframe/rf_engine.h
@@ -0,0 +1,75 @@
+/* $OpenBSD: rf_engine.h,v 1.1 1999/01/11 14:29:19 niklas Exp $ */
+/* $NetBSD: rf_engine.h,v 1.1 1998/11/13 04:20:29 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: William V. Courtright II, Mark Holland, Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/**********************************************************
+ * *
+ * engine.h -- header file for execution engine functions *
+ * *
+ **********************************************************/
+
+/* :
+ * Log: rf_engine.h,v
+ * Revision 1.11 1996/06/14 14:16:22 jimz
+ * new decl of ConfigureEngine
+ *
+ * Revision 1.10 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.9 1996/05/30 12:59:18 jimz
+ * make etimer happier, more portable
+ *
+ * Revision 1.8 1996/05/24 04:28:55 jimz
+ * release cleanup ckpt
+ *
+ * Revision 1.7 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.6 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.5 1995/12/01 18:12:17 root
+ * added copyright info
+ *
+ */
+
+#ifndef _RF__RF_ENGINE_H_
+#define _RF__RF_ENGINE_H_
+
+int rf_ConfigureEngine(RF_ShutdownList_t **listp,
+ RF_Raid_t *raidPtr, RF_Config_t *cfgPtr);
+
+int rf_FinishNode(RF_DagNode_t *node, int context); /* return finished node to engine */
+
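+/*
+ * rf_DispatchDAG() returns non-zero iff cbFunc(cbArg) will be invoked when the
+ * DAG completes (see the comment in rf_engine.c); the callback may fire before
+ * rf_DispatchDAG() itself returns.
+ */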
+int rf_DispatchDAG(RF_DagHeader_t *dag, void (*cbFunc)(void *), void *cbArg); /* execute dag */
+
+#endif /* !_RF__RF_ENGINE_H_ */
diff --git a/sys/dev/raidframe/rf_etimer.h b/sys/dev/raidframe/rf_etimer.h
new file mode 100644
index 00000000000..5d78b80eac2
--- /dev/null
+++ b/sys/dev/raidframe/rf_etimer.h
@@ -0,0 +1,353 @@
+/* $OpenBSD: rf_etimer.h,v 1.1 1999/01/11 14:29:20 niklas Exp $ */
+/* $NetBSD: rf_etimer.h,v 1.1 1998/11/13 04:20:29 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* rf_etimer.h -- header file for code related to accurate timing
+ * This code currently assumes that the elapsed time between START_TIMER
+ * and STOP_TIMER is less than the period of the cycle counter. This
+ * means the events you want to time must be less than:
+ * clock speed max time
+ * ---------- --------
+ * 175 MHz 24 sec
+ * 150 MHz 28 sec
+ * 125 MHz 34 sec
+ *
+ *
+ * :
+ * Log: rf_etimer.h,v
+ * Revision 1.32 1996/08/13 18:11:09 jimz
+ * want MACH&&!__osf__, not just MACH for mach timing (MACH defined under OSF/1)
+ *
+ * Revision 1.31 1996/08/12 20:11:38 jimz
+ * use read_real_time() on AIX4+
+ *
+ * Revision 1.30 1996/08/09 18:48:12 jimz
+ * for now, use gettimeofday() on MACH
+ * (should eventually use better clock stuff)
+ *
+ * Revision 1.29 1996/08/07 21:09:08 jimz
+ * add IRIX as a gettimeofday system
+ *
+ * Revision 1.28 1996/08/06 22:25:23 jimz
+ * add LINUX_I386
+ *
+ * Revision 1.27 1996/07/30 04:45:53 jimz
+ * add ultrix stuff
+ *
+ * Revision 1.26 1996/07/28 20:31:39 jimz
+ * i386netbsd port
+ * true/false fixup
+ *
+ * Revision 1.25 1996/07/27 23:36:08 jimz
+ * Solaris port of simulator
+ *
+ * Revision 1.24 1996/07/27 18:40:24 jimz
+ * cleanup sweep
+ *
+ * Revision 1.23 1996/07/22 19:52:16 jimz
+ * switched node params to RF_DagParam_t, a union of
+ * a 64-bit int and a void *, for better portability
+ * attempted hpux port, but failed partway through for
+ * lack of a single C compiler capable of compiling all
+ * source files
+ *
+ * Revision 1.22 1996/07/18 22:57:14 jimz
+ * port simulator to AIX
+ *
+ * Revision 1.21 1996/07/17 21:00:58 jimz
+ * clean up timer interface, tracing
+ *
+ * Revision 1.20 1996/07/17 14:26:28 jimz
+ * rf_scc -> rf_rpcc
+ *
+ * Revision 1.19 1996/06/14 21:24:48 jimz
+ * move out ConfigureEtimer
+ *
+ * Revision 1.18 1996/06/03 23:28:26 jimz
+ * more bugfixes
+ * check in tree to sync for IPDS runs with current bugfixes
+ * there still may be a problem with threads in the script test
+ * getting I/Os stuck- not trivially reproducible (runs ~50 times
+ * in a row without getting stuck)
+ *
+ * Revision 1.17 1996/05/30 23:22:16 jimz
+ * bugfixes of serialization, timing problems
+ * more cleanup
+ *
+ * Revision 1.16 1996/05/30 12:59:18 jimz
+ * make etimer happier, more portable
+ *
+ * Revision 1.15 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.14 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.13 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.12 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.11 1995/12/01 18:10:40 root
+ * added copyright info
+ *
+ * Revision 1.10 1995/09/29 14:27:32 wvcii
+ * removed printfs from ConfigureEtimer()
+ *
+ * Revision 1.9 95/09/19 22:57:31 jimz
+ * added kernel version of ConfigureEtimer
+ *
+ * Revision 1.8 1995/09/14 13:03:04 amiri
+ * set default CPU speed to 125Mhz to avoid divide by zero problems.
+ *
+ * Revision 1.7 1995/09/11 19:04:36 wvcii
+ * timer autoconfigs using pdl routine to check cpu speed
+ * value may still be overridden via config debug var timerTicksPerSec
+ *
+ */
+
+
+#ifndef _RF__RF_TIMER_H_
+#define _RF__RF_TIMER_H_
+
+#include "rf_options.h"
+
+#ifdef _KERNEL
+#define KERNEL
+#endif
+
+#if defined(__NetBSD__) || defined(__OpenBSD__)
+
+#ifdef KERNEL
+extern unsigned int rpcc(void);
+#define rf_read_cycle_counter rpcc
+#else /* KERNEL */
+#if !defined(__NetBSD__) && !defined(__OpenBSD__)
+/* XXX does this function even exist anywhere??? GO */
+extern unsigned int rf_rpcc();
+#endif
+#define rf_read_cycle_counter rf_rpcc
+#endif /* KERNEL */
+
+#define RF_DEF_TIMER_MAX_VAL 0xFFFFFFFF
+
+typedef struct RF_EtimerVal_s {
+ unsigned ccnt; /* cycle count */
+} RF_EtimerVal_t;
+
+struct RF_Etimer_s {
+ RF_EtimerVal_t st;
+ RF_EtimerVal_t et;
+ unsigned long ticks; /* elapsed time in ticks */
+};
+
+extern long rf_timer_max_val;
+extern long rf_timer_ticks_per_second;
+extern unsigned long rf_timer_ticks_per_usec;
+
+#define RF_ETIMER_TICKS2US(_tcks_) ( (_tcks_) / rf_timer_ticks_per_usec )
+#define RF_ETIMER_START(_t_) { (_t_).st.ccnt = rf_read_cycle_counter(); }
+#define RF_ETIMER_STOP(_t_) { (_t_).et.ccnt = rf_read_cycle_counter(); }
+#define RF_ETIMER_EVAL(_t_) { \
+ if ((_t_).st.ccnt < (_t_).et.ccnt) \
+ (_t_).ticks = (_t_).et.ccnt - (_t_).st.ccnt; \
+ else \
+ (_t_).ticks = rf_timer_max_val - ((_t_).st.ccnt - (_t_).et.ccnt); \
+}
+
+#define RF_ETIMER_VAL_TICKS(_t_) ((_t_).ticks)
+#define RF_ETIMER_VAL_US(_t_) (RF_ETIMER_TICKS2US((_t_).ticks))
+#define RF_ETIMER_VAL_MS(_t_) (RF_ETIMER_TICKS2US((_t_).ticks)/1000)
+
+#endif /* __NetBSD__ || __OpenBSD__ */
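+/*
+ * Typical timer usage (a sketch; this is the pattern the tracing code uses):
+ *
+ *     RF_Etimer_t timer;
+ *     RF_ETIMER_START(timer);
+ *     ... timed section ...
+ *     RF_ETIMER_STOP(timer);
+ *     RF_ETIMER_EVAL(timer);
+ *     us = RF_ETIMER_VAL_US(timer);
+ */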
+
+
+#if defined(__alpha) && !defined(__NetBSD__) && !defined(__OpenBSD__)
+
+#ifdef KERNEL
+extern unsigned int rpcc();
+#define rf_read_cycle_counter rpcc
+#else /* KERNEL */
+extern unsigned int rf_rpcc();
+#define rf_read_cycle_counter rf_rpcc
+#endif /* KERNEL */
+
+#define RF_DEF_TIMER_MAX_VAL 0xFFFFFFFF
+
+typedef struct RF_EtimerVal_s {
+ unsigned ccnt; /* cycle count */
+} RF_EtimerVal_t;
+
+struct RF_Etimer_s {
+ RF_EtimerVal_t st;
+ RF_EtimerVal_t et;
+ unsigned long ticks; /* elapsed time in ticks */
+};
+
+extern long rf_timer_max_val;
+extern long rf_timer_ticks_per_second;
+extern unsigned long rf_timer_ticks_per_usec;
+
+#define RF_ETIMER_TICKS2US(_tcks_) ( (_tcks_) / rf_timer_ticks_per_usec )
+#define RF_ETIMER_START(_t_) { (_t_).st.ccnt = rf_read_cycle_counter(); }
+#define RF_ETIMER_STOP(_t_) { (_t_).et.ccnt = rf_read_cycle_counter(); }
+#define RF_ETIMER_EVAL(_t_) { \
+ if ((_t_).st.ccnt < (_t_).et.ccnt) \
+ (_t_).ticks = (_t_).et.ccnt - (_t_).st.ccnt; \
+ else \
+ (_t_).ticks = rf_timer_max_val - ((_t_).st.ccnt - (_t_).et.ccnt); \
+}
+
+#define RF_ETIMER_VAL_TICKS(_t_) ((_t_).ticks)
+#define RF_ETIMER_VAL_US(_t_) (RF_ETIMER_TICKS2US((_t_).ticks))
+#define RF_ETIMER_VAL_MS(_t_) (RF_ETIMER_TICKS2US((_t_).ticks)/1000)
+
+#endif /* __alpha */
+
+#ifdef _IBMR2
+
+extern void rf_rtclock(unsigned int *secs, unsigned int *nsecs);
+
+#define RF_MSEC_PER_SEC 1000
+#define RF_USEC_PER_SEC 1000000
+#define RF_NSEC_PER_SEC 1000000000
+
+typedef struct RF_EtimerVal_s {
+ unsigned int secs;
+ unsigned int nsecs;
+} RF_EtimerVal_t;
+
+struct RF_Etimer_s {
+ RF_EtimerVal_t start;
+ RF_EtimerVal_t end;
+ RF_EtimerVal_t elapsed;
+};
+
+#if RF_AIXVERS >= 4
+
+#include <sys/time.h>
+
+#define RF_ETIMER_START(_t_) { \
+ timebasestruct_t tb; \
+ tb.flag = 1; \
+ read_real_time(&tb, TIMEBASE_SZ); \
+ (_t_).start.secs = tb.tb_high; \
+ (_t_).start.nsecs = tb.tb_low; \
+}
+
+#define RF_ETIMER_STOP(_t_) { \
+ timebasestruct_t tb; \
+ tb.flag = 1; \
+ read_real_time(&tb, TIMEBASE_SZ); \
+ (_t_).end.secs = tb.tb_high; \
+ (_t_).end.nsecs = tb.tb_low; \
+}
+
+#else /* RF_AIXVERS >= 4 */
+
+#define RF_ETIMER_START(_t_) { \
+ rf_rtclock(&((_t_).start.secs), &((_t_).start.nsecs)); \
+}
+
+#define RF_ETIMER_STOP(_t_) { \
+ rf_rtclock(&((_t_).end.secs), &((_t_).end.nsecs)); \
+}
+
+#endif /* RF_AIXVERS >= 4 */
+
+#define RF_ETIMER_EVAL(_t_) { \
+ if ((_t_).end.nsecs >= (_t_).start.nsecs) { \
+ (_t_).elapsed.nsecs = (_t_).end.nsecs - (_t_).start.nsecs; \
+ (_t_).elapsed.secs = (_t_).end.secs - (_t_).start.secs; \
+ } \
+ else { \
+ (_t_).elapsed.nsecs = RF_NSEC_PER_SEC + (_t_).end.nsecs; \
+ (_t_).elapsed.nsecs -= (_t_).start.nsecs; \
+ (_t_).elapsed.secs = (_t_).end.secs - (_t_).start.secs + 1; \
+ } \
+}
+
+#define RF_ETIMER_VAL_US(_t_) (((_t_).elapsed.secs*RF_USEC_PER_SEC)+((_t_).elapsed.nsecs/1000))
+#define RF_ETIMER_VAL_MS(_t_) (((_t_).elapsed.secs*RF_MSEC_PER_SEC)+((_t_).elapsed.nsecs/1000000))
+
+#endif /* _IBMR2 */
+
+/*
+ * XXX investigate better timing for these
+ */
+#if defined(hpux) || defined(sun) || defined(NETBSD_I386) || defined(OPENBSD_I386) || defined(ultrix) || defined(LINUX_I386) || defined(IRIX) || (defined(MACH) && !defined(__osf__))
+#include <sys/time.h>
+
+#define RF_MSEC_PER_SEC 1000 /* used by RF_ETIMER_VAL_MS below */
+#define RF_USEC_PER_SEC 1000000
+
+struct RF_Etimer_s {
+ struct timeval start;
+ struct timeval end;
+ struct timeval elapsed;
+};
+#if !defined(__NetBSD__) && !defined(__OpenBSD__)
+#define RF_ETIMER_START(_t_) { \
+ gettimeofday(&((_t_).start), NULL); \
+}
+
+#define RF_ETIMER_STOP(_t_) { \
+ gettimeofday(&((_t_).end), NULL); \
+}
+
+#else
+#define RF_ETIMER_START(_t_) { \
+}
+/* XXX these just drop off the end of the world... */
+#define RF_ETIMER_STOP(_t_) { \
+}
+#endif
+
+#define RF_ETIMER_EVAL(_t_) { \
+ if ((_t_).end.tv_usec >= (_t_).start.tv_usec) { \
+ (_t_).elapsed.tv_usec = (_t_).end.tv_usec - (_t_).start.tv_usec; \
+ (_t_).elapsed.tv_sec = (_t_).end.tv_sec - (_t_).start.tv_sec; \
+ } \
+ else { \
+ (_t_).elapsed.tv_usec = RF_USEC_PER_SEC + (_t_).end.tv_usec; \
+ (_t_).elapsed.tv_usec -= (_t_).start.tv_usec; \
+ (_t_).elapsed.tv_sec = (_t_).end.tv_sec - (_t_).start.tv_sec + 1; \
+ } \
+}
+
+#define RF_ETIMER_VAL_US(_t_) (((_t_).elapsed.tv_sec*RF_USEC_PER_SEC)+(_t_).elapsed.tv_usec)
+#define RF_ETIMER_VAL_MS(_t_) (((_t_).elapsed.tv_sec*RF_MSEC_PER_SEC)+((_t_).elapsed.tv_usec/1000))
+
+#endif /* hpux || sun || NETBSD_I386 || OPENBSD_I386 || ultrix || LINUX_I386 || IRIX || (MACH && !__osf__) */
+
+#endif /* !_RF__RF_TIMER_H_ */
diff --git a/sys/dev/raidframe/rf_evenodd.c b/sys/dev/raidframe/rf_evenodd.c
new file mode 100644
index 00000000000..90d18653cda
--- /dev/null
+++ b/sys/dev/raidframe/rf_evenodd.c
@@ -0,0 +1,556 @@
+/* $OpenBSD: rf_evenodd.c,v 1.1 1999/01/11 14:29:21 niklas Exp $ */
+/* $NetBSD: rf_evenodd.c,v 1.1 1998/11/13 04:20:29 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Chang-Ming Wu
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*****************************************************************************************
+ *
+ * rf_evenodd.c -- implements EVENODD array architecture
+ *
+ ****************************************************************************************/
+
+#include "rf_archs.h"
+
+#if RF_INCLUDE_EVENODD > 0
+
+#include "rf_types.h"
+#include "rf_raid.h"
+#include "rf_dag.h"
+#include "rf_dagffrd.h"
+#include "rf_dagffwr.h"
+#include "rf_dagdegrd.h"
+#include "rf_dagdegwr.h"
+#include "rf_dagutils.h"
+#include "rf_dagfuncs.h"
+#include "rf_threadid.h"
+#include "rf_etimer.h"
+#include "rf_general.h"
+#include "rf_evenodd.h"
+#include "rf_configure.h"
+#include "rf_parityscan.h"
+#include "rf_utils.h"
+#include "rf_map.h"
+#include "rf_pq.h"
+#include "rf_mcpair.h"
+#include "rf_sys.h"
+#include "rf_evenodd.h"
+#include "rf_evenodd_dagfuncs.h"
+#include "rf_evenodd_dags.h"
+#include "rf_engine.h"
+
+typedef struct RF_EvenOddConfigInfo_s {
+ RF_RowCol_t **stripeIdentifier; /* filled in at config time & used by IdentifyStripe */
+} RF_EvenOddConfigInfo_t;
+
+int rf_ConfigureEvenOdd(listp, raidPtr, cfgPtr)
+ RF_ShutdownList_t **listp;
+ RF_Raid_t *raidPtr;
+ RF_Config_t *cfgPtr;
+{
+ RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
+ RF_EvenOddConfigInfo_t *info;
+ RF_RowCol_t i, j, startdisk;
+
+ RF_MallocAndAdd(info, sizeof(RF_EvenOddConfigInfo_t), (RF_EvenOddConfigInfo_t *), raidPtr->cleanupList);
+ layoutPtr->layoutSpecificInfo = (void *) info;
+
+ RF_ASSERT(raidPtr->numRow == 1);
+
+ info->stripeIdentifier = rf_make_2d_array(raidPtr->numCol, raidPtr->numCol, raidPtr->cleanupList);
+ startdisk = 0;
+ for (i=0; i<raidPtr->numCol; i++) {
+ for (j=0; j<raidPtr->numCol; j++) {
+ info->stripeIdentifier[i][j] = (startdisk + j) % raidPtr->numCol;
+ }
+ if ((startdisk -= 2) < 0) startdisk += raidPtr->numCol;
+ }
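+ /*
+ * Each successive row of stripeIdentifier starts two disks earlier
+ * (startdisk -= 2); this matches the layout shown before rf_MapParityEvenOdd,
+ * where the E and P units shift left by two columns from one stripe to the next.
+ */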
+
+ /* fill in the remaining layout parameters */
+ layoutPtr->numStripe = layoutPtr->stripeUnitsPerDisk;
+ layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit << raidPtr->logBytesPerSector;
+ layoutPtr->numDataCol = raidPtr->numCol-2; /* ORIG: layoutPtr->numDataCol = raidPtr->numCol-1; */
+#if RF_EO_MATRIX_DIM > 17
+ if (raidPtr->numCol <= 17){
+ printf("Number of stripe units in a parity stripe is smaller than 17. Please\n");
+ printf("define the macro RF_EO_MATRIX_DIM in file rf_evenodd_dagfuncs.h to \n");
+ printf("be 17 to increase performance. \n");
+ return(EINVAL);
+ }
+#elif RF_EO_MATRIX_DIM == 17
+ if (raidPtr->numCol > 17) {
+ printf("Number of stripe units in a parity stripe is bigger than 17. Please\n");
+ printf("define the macro RF_EO_MATRIX_DIM in file rf_evenodd_dagfuncs.h to \n");
+ printf("be 257 for encoding and decoding functions to work. \n");
+ return(EINVAL);
+ }
+#endif
+ layoutPtr->dataSectorsPerStripe = layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit;
+ layoutPtr->numParityCol = 2;
+ layoutPtr->dataStripeUnitsPerDisk = layoutPtr->stripeUnitsPerDisk;
+ raidPtr->sectorsPerDisk = layoutPtr->stripeUnitsPerDisk * layoutPtr->sectorsPerStripeUnit;
+
+ raidPtr->totalSectors = layoutPtr->stripeUnitsPerDisk * layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit;
+
+ return(0);
+}
+
+int rf_GetDefaultNumFloatingReconBuffersEvenOdd(RF_Raid_t *raidPtr)
+{
+ return(20);
+}
+
+RF_HeadSepLimit_t rf_GetDefaultHeadSepLimitEvenOdd(RF_Raid_t *raidPtr)
+{
+ return(10);
+}
+
+void rf_IdentifyStripeEvenOdd(
+ RF_Raid_t *raidPtr,
+ RF_RaidAddr_t addr,
+ RF_RowCol_t **diskids,
+ RF_RowCol_t *outRow)
+{
+ RF_StripeNum_t stripeID = rf_RaidAddressToStripeID(&raidPtr->Layout, addr);
+ RF_EvenOddConfigInfo_t *info = (RF_EvenOddConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
+
+ *outRow = 0;
+ *diskids = info->stripeIdentifier[ stripeID % raidPtr->numCol ];
+}
+
+/* The layout of stripe units on the disks is:   c0 c1 c2 c3 c4
+
+ 0  1  2  E  P
+ 5  E  P  3  4
+ P  6  7  8  E
+ 10 11  E  P  9
+ E  P  12 13 14
+ ....
+
+ We use MapSectorRAID5 to map the data stripe units because that routine can be shown to
+ produce exactly the data layout above, even though we now carry two redundancy units per stripe.
+ For E and P, however, we use rf_MapEEvenOdd and rf_MapParityEvenOdd, which differ from the RAID-5 method.
+*/
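+/*
+ * Worked example (derived from the mapping functions below): with numCol = 5
+ * (so numDataCol = 3), data SUID 4 lies in the second stripe (SUIDs 3..5), so
+ * endSUIDofthisStrip = 5; parity then maps to column (5+2)%5 = 2 and E to
+ * column (5+1)%5 = 1, matching the second row of the table above.
+ */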
+
+
+void rf_MapParityEvenOdd(
+ RF_Raid_t *raidPtr,
+ RF_RaidAddr_t raidSector,
+ RF_RowCol_t *row,
+ RF_RowCol_t *col,
+ RF_SectorNum_t *diskSector,
+ int remap)
+{
+ RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit;
+ RF_StripeNum_t endSUIDofthisStrip = (SUID/raidPtr->Layout.numDataCol + 1)*raidPtr->Layout.numDataCol - 1;
+
+ *row = 0;
+ *col = ( endSUIDofthisStrip + 2)%raidPtr->numCol;
+ *diskSector = (SUID / (raidPtr->Layout.numDataCol)) * raidPtr->Layout.sectorsPerStripeUnit +
+ (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
+}
+
+void rf_MapEEvenOdd(
+ RF_Raid_t *raidPtr,
+ RF_RaidAddr_t raidSector,
+ RF_RowCol_t *row,
+ RF_RowCol_t *col,
+ RF_SectorNum_t *diskSector,
+ int remap)
+{
+ RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit;
+ RF_StripeNum_t endSUIDofthisStrip = (SUID/raidPtr->Layout.numDataCol + 1)*raidPtr->Layout.numDataCol - 1;
+
+ *row = 0;
+ *col = ( endSUIDofthisStrip + 1)%raidPtr->numCol;
+ *diskSector = (SUID / (raidPtr->Layout.numDataCol)) * raidPtr->Layout.sectorsPerStripeUnit +
+ (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
+}
+
+void rf_EODagSelect(
+ RF_Raid_t *raidPtr,
+ RF_IoType_t type,
+ RF_AccessStripeMap_t *asmap,
+ RF_VoidFuncPtr *createFunc)
+{
+ RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
+ unsigned ndfail = asmap->numDataFailed;
+ unsigned npfail = asmap->numParityFailed +asmap->numQFailed;
+ unsigned ntfail = npfail + ndfail;
+
+ RF_ASSERT(RF_IO_IS_R_OR_W(type));
+ if (ntfail > 2)
+ {
+ RF_ERRORMSG("more than two disks failed in a single group! Aborting I/O operation.\n");
+ /* *infoFunc = */ *createFunc = NULL;
+ return;
+ }
+
+ /* ok, we can do this I/O */
+ if (type == RF_IO_TYPE_READ)
+ {
+ switch (ndfail)
+ {
+ case 0:
+ /* fault free read */
+ *createFunc = (RF_VoidFuncPtr)rf_CreateFaultFreeReadDAG; /* same as raid 5 */
+ break;
+ case 1:
+ /* lost a single data unit */
+ /* two cases:
+ (1) parity is not lost.
+ do a normal raid 5 reconstruct read.
+ (2) parity is lost.
+ do a reconstruct read using "e".
+ */
+ if (ntfail == 2) /* also lost redundancy */
+ {
+ if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_PARITY)
+ *createFunc = (RF_VoidFuncPtr)rf_EO_110_CreateReadDAG;
+ else
+ *createFunc = (RF_VoidFuncPtr)rf_EO_101_CreateReadDAG;
+ }
+ else
+ {
+ /* P and E are ok. But is there a failure
+ in some unaccessed data unit?
+ */
+ if (rf_NumFailedDataUnitsInStripe(raidPtr,asmap)==2)
+ *createFunc = (RF_VoidFuncPtr)rf_EO_200_CreateReadDAG;
+ else
+ *createFunc = (RF_VoidFuncPtr)rf_EO_100_CreateReadDAG;
+ }
+ break;
+ case 2:
+ /* *createFunc = rf_EO_200_CreateReadDAG; */
+ *createFunc = NULL;
+ break;
+ }
+ return;
+ }
+
+ /* a write */
+ switch (ntfail)
+ {
+ case 0: /* fault free */
+ if (rf_suppressLocksAndLargeWrites ||
+ (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) && (layoutPtr->numDataCol != 1)) ||
+ (asmap->parityInfo->next != NULL) || (asmap->qInfo->next != NULL) || rf_CheckStripeForFailures(raidPtr, asmap))) {
+
+ *createFunc = (RF_VoidFuncPtr)rf_EOCreateSmallWriteDAG;
+ }
+ else {
+ *createFunc = (RF_VoidFuncPtr)rf_EOCreateLargeWriteDAG;
+ }
+ break;
+
+ case 1: /* single disk fault */
+ if (npfail==1)
+ {
+ RF_ASSERT ((asmap->failedPDAs[0]->type == RF_PDA_TYPE_PARITY) || (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q));
+ if (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q)
+ { /* q died, treat like normal mode raid5 write.*/
+ if (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) || (asmap->numStripeUnitsAccessed == 1))
+ || (asmap->parityInfo->next!=NULL) || rf_NumFailedDataUnitsInStripe(raidPtr,asmap))
+ *createFunc = (RF_VoidFuncPtr)rf_EO_001_CreateSmallWriteDAG;
+ else
+ *createFunc = (RF_VoidFuncPtr)rf_EO_001_CreateLargeWriteDAG;
+ }
+ else
+ { /* parity died, small write only updating Q */
+ if (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) || (asmap->numStripeUnitsAccessed == 1))
+ || (asmap->qInfo->next!=NULL) || rf_NumFailedDataUnitsInStripe(raidPtr,asmap))
+ *createFunc = (RF_VoidFuncPtr)rf_EO_010_CreateSmallWriteDAG;
+ else
+ *createFunc = (RF_VoidFuncPtr)rf_EO_010_CreateLargeWriteDAG;
+ }
+ }
+ else
+ { /* data missing.
+ Do a P reconstruct write if only a single data unit
+ is lost in the stripe, otherwise a reconstruct
+ write employing both P and E units. */
+ if (rf_NumFailedDataUnitsInStripe(raidPtr,asmap)==2)
+ {
+ if (asmap->numStripeUnitsAccessed == 1)
+ *createFunc = (RF_VoidFuncPtr)rf_EO_200_CreateWriteDAG;
+ else
+ *createFunc = NULL; /* No direct support for this case now, like that in Raid-5 */
+ }
+ else
+ {
+ if (asmap->numStripeUnitsAccessed != 1 && asmap->failedPDAs[0]->numSector != layoutPtr->sectorsPerStripeUnit)
+ *createFunc = NULL; /* No direct support for this case now, like that in Raid-5 */
+ else *createFunc = (RF_VoidFuncPtr)rf_EO_100_CreateWriteDAG;
+ }
+ }
+ break;
+
+ case 2: /* two disk faults */
+ switch (npfail)
+ {
+ case 2: /* both p and q dead */
+ *createFunc = (RF_VoidFuncPtr)rf_EO_011_CreateWriteDAG;
+ break;
+ case 1: /* either p or q and dead data */
+ RF_ASSERT(asmap->failedPDAs[0]->type == RF_PDA_TYPE_DATA);
+ RF_ASSERT ((asmap->failedPDAs[1]->type == RF_PDA_TYPE_PARITY) || (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q));
+ if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q)
+ {
+ if(asmap->numStripeUnitsAccessed != 1 && asmap->failedPDAs[0]->numSector != layoutPtr->sectorsPerStripeUnit)
+ *createFunc = NULL; /* In both PQ and EvenOdd, no direct support for this case now, like that in Raid-5 */
+ else
+ *createFunc = (RF_VoidFuncPtr)rf_EO_101_CreateWriteDAG;
+ }
+ else
+ {
+ if (asmap->numStripeUnitsAccessed != 1 && asmap->failedPDAs[0]->numSector != layoutPtr->sectorsPerStripeUnit)
+ *createFunc = NULL; /* No direct support for this case, like that in Raid-5 */
+ else
+ *createFunc = (RF_VoidFuncPtr)rf_EO_110_CreateWriteDAG;
+ }
+ break;
+ case 0: /* double data loss */
+ /* if(asmap->failedPDAs[0]->numSector + asmap->failedPDAs[1]->numSector == 2 * layoutPtr->sectorsPerStripeUnit )
+ *createFunc = rf_EOCreateLargeWriteDAG;
+ else */
+ *createFunc = NULL; /* currently, in Evenodd, No support for simultaneous access of both failed SUs */
+ break;
+ }
+ break;
+
+ default: /* more than 2 disk faults */
+ *createFunc = NULL;
+ RF_PANIC();
+ }
+ return;
+}
+
+
+int rf_VerifyParityEvenOdd(raidPtr, raidAddr, parityPDA, correct_it, flags)
+ RF_Raid_t *raidPtr;
+ RF_RaidAddr_t raidAddr;
+ RF_PhysDiskAddr_t *parityPDA;
+ int correct_it;
+ RF_RaidAccessFlags_t flags;
+{
+ RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
+ RF_RaidAddr_t startAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, raidAddr);
+ RF_SectorCount_t numsector = parityPDA->numSector;
+ int numbytes = rf_RaidAddressToByte(raidPtr, numsector);
+ int bytesPerStripe = numbytes * layoutPtr->numDataCol;
+ RF_DagHeader_t *rd_dag_h, *wr_dag_h; /* read, write dag */
+ RF_DagNode_t *blockNode, *unblockNode, *wrBlock, *wrUnblock;
+ RF_AccessStripeMapHeader_t *asm_h;
+ RF_AccessStripeMap_t *asmap;
+ RF_AllocListElem_t *alloclist;
+ RF_PhysDiskAddr_t *pda;
+ char *pbuf, *buf, *end_p, *p;
+ char *redundantbuf2;
+ int redundantTwoErr = 0, redundantOneErr = 0;
+ int parity_cant_correct = RF_FALSE, red2_cant_correct = RF_FALSE, parity_corrected = RF_FALSE, red2_corrected = RF_FALSE;
+ int i, retcode;
+ RF_ReconUnitNum_t which_ru;
+ RF_StripeNum_t psID = rf_RaidAddressToParityStripeID(layoutPtr, raidAddr, &which_ru);
+ int stripeWidth = layoutPtr->numDataCol + layoutPtr->numParityCol;
+ RF_AccTraceEntry_t tracerec;
+ RF_MCPair_t *mcpair;
+
+ retcode = RF_PARITY_OKAY;
+
+ mcpair = rf_AllocMCPair();
+ rf_MakeAllocList(alloclist);
+ RF_MallocAndAdd(buf, numbytes * (layoutPtr->numDataCol + layoutPtr->numParityCol), (char *), alloclist);
+ RF_CallocAndAdd(pbuf, 1, numbytes, (char *), alloclist); /* use calloc to make sure buffer is zeroed */
+ end_p = buf + bytesPerStripe;
+ RF_CallocAndAdd(redundantbuf2, 1, numbytes, (char *), alloclist); /* use calloc to make sure buffer is zeroed */
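+ /*
+ * Layout of buf after the read DAG completes: numDataCol data stripe units,
+ * followed by the on-disk parity unit at buf[bytesPerStripe] and the on-disk
+ * E unit at buf[bytesPerStripe + numbytes]; pbuf and redundantbuf2 accumulate
+ * the recomputed parity and E for comparison.
+ */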
+
+ rd_dag_h = rf_MakeSimpleDAG(raidPtr, stripeWidth, numbytes, buf, rf_DiskReadFunc, rf_DiskReadUndoFunc,
+ "Rod", alloclist, flags, RF_IO_NORMAL_PRIORITY);
+ blockNode = rd_dag_h->succedents[0];
+ unblockNode = blockNode->succedents[0]->succedents[0];
+
+ /* map the stripe and fill in the PDAs in the dag */
+ asm_h = rf_MapAccess(raidPtr, startAddr, layoutPtr->dataSectorsPerStripe, buf, RF_DONT_REMAP);
+ asmap = asm_h->stripeMap;
+
+ for (pda=asmap->physInfo,i=0; i<layoutPtr->numDataCol; i++,pda=pda->next) {
+ RF_ASSERT(pda);
+ rf_RangeRestrictPDA(raidPtr, parityPDA, pda, 0, 1);
+ RF_ASSERT(pda->numSector != 0);
+ if (rf_TryToRedirectPDA(raidPtr, pda, 0)) goto out; /* no way to verify parity if disk is dead. return w/ good status */
+ blockNode->succedents[i]->params[0].p = pda;
+ blockNode->succedents[i]->params[2].v = psID;
+ blockNode->succedents[i]->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ }
+
+ RF_ASSERT(!asmap->parityInfo->next);
+ rf_RangeRestrictPDA(raidPtr, parityPDA, asmap->parityInfo, 0, 1);
+ RF_ASSERT(asmap->parityInfo->numSector != 0);
+ if (rf_TryToRedirectPDA(raidPtr, asmap->parityInfo, 1))
+ goto out;
+ blockNode->succedents[ layoutPtr->numDataCol ]->params[0].p = asmap->parityInfo;
+
+ RF_ASSERT(!asmap->qInfo->next);
+ rf_RangeRestrictPDA(raidPtr, parityPDA, asmap->qInfo, 0, 1);
+ RF_ASSERT(asmap->qInfo->numSector != 0);
+ if (rf_TryToRedirectPDA(raidPtr, asmap->qInfo, 1)) goto out;
+ /*
+ * If the disk is dead, rf_TryToRedirectPDA always returns one (no
+ * reconstruction is implemented right now), which makes us jump to
+ * "out" and return with good status.
+ */
+ blockNode->succedents[ layoutPtr->numDataCol +1 ]->params[0].p = asmap->qInfo;
+
+ /* fire off the DAG */
+ bzero((char *)&tracerec,sizeof(tracerec));
+ rd_dag_h->tracerec = &tracerec;
+
+ if (rf_verifyParityDebug) {
+ printf("Parity verify read dag:\n");
+ rf_PrintDAGList(rd_dag_h);
+ }
+
+ RF_LOCK_MUTEX(mcpair->mutex);
+ mcpair->flag = 0;
+ rf_DispatchDAG(rd_dag_h, (void (*)(void *))rf_MCPairWakeupFunc,
+ (void *) mcpair);
+ while (!mcpair->flag) RF_WAIT_COND(mcpair->cond, mcpair->mutex);
+ RF_UNLOCK_MUTEX(mcpair->mutex);
+ if (rd_dag_h->status != rf_enable) {
+ RF_ERRORMSG("Unable to verify parity: can't read the stripe\n");
+ retcode = RF_PARITY_COULD_NOT_VERIFY;
+ goto out;
+ }
+
+ for (p=buf, i=0; p<end_p; p+=numbytes, i++) {
+ rf_e_encToBuf(raidPtr, i, p, RF_EO_MATRIX_DIM - 2, redundantbuf2, numsector);
+ /*
+ * the corresponding columns in the EvenOdd encoding matrix for these p pointers, which point
+ * into the data buffer of a full stripe, run sequentially from 0 to layoutPtr->numDataCol-1
+ */
+ rf_bxor(p, pbuf, numbytes, NULL);
+ }
+ RF_ASSERT(i==layoutPtr->numDataCol);
+
+ for (i=0; i<numbytes; i++) {
+ if (pbuf[i] != buf[bytesPerStripe+i]) {
+ if (!correct_it) {
+ RF_ERRORMSG3("Parity verify error: byte %d of parity is 0x%x should be 0x%x\n",
+ i,(u_char) buf[bytesPerStripe+i],(u_char) pbuf[i]);
+ }
+ redundantOneErr = 1;
+ break;
+ }
+ }
+
+ for (i=0; i<numbytes; i++) {
+ if (redundantbuf2[i] != buf[bytesPerStripe+numbytes+i]) {
+ if (!correct_it) {
+ RF_ERRORMSG3("Parity verify error: byte %d of second redundant information is 0x%x should be 0x%x\n",
+ i,(u_char) buf[bytesPerStripe+numbytes+i],(u_char) redundantbuf2[i]);
+ }
+ redundantTwoErr = 1;
+ break;
+ }
+ }
+ if (redundantOneErr || redundantTwoErr )
+ retcode = RF_PARITY_BAD;
+
+ /* correct the first redundant disk, i.e. parity, if it is in error */
+ if (redundantOneErr && correct_it) {
+ wr_dag_h = rf_MakeSimpleDAG(raidPtr, 1, numbytes, pbuf, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
+ "Wnp", alloclist, flags, RF_IO_NORMAL_PRIORITY);
+ wrBlock = wr_dag_h->succedents[0]; wrUnblock = wrBlock->succedents[0]->succedents[0];
+ wrBlock->succedents[0]->params[0].p = asmap->parityInfo;
+ wrBlock->succedents[0]->params[2].v = psID;
+ wrBlock->succedents[0]->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ bzero((char *)&tracerec,sizeof(tracerec));
+ wr_dag_h->tracerec = &tracerec;
+ if (rf_verifyParityDebug) {
+ printf("Parity verify write dag:\n");
+ rf_PrintDAGList(wr_dag_h);
+ }
+ RF_LOCK_MUTEX(mcpair->mutex);
+ mcpair->flag = 0;
+ rf_DispatchDAG(wr_dag_h, (void (*)(void *))rf_MCPairWakeupFunc,
+ (void *) mcpair);
+ while (!mcpair->flag)
+ RF_WAIT_COND(mcpair->cond, mcpair->mutex);
+ RF_UNLOCK_MUTEX(mcpair->mutex);
+ if (wr_dag_h->status != rf_enable) {
+ RF_ERRORMSG("Unable to correct parity in VerifyParity: can't write the stripe\n");
+ parity_cant_correct = RF_TRUE;
+ } else {
+ parity_corrected = RF_TRUE;
+ }
+ rf_FreeDAG(wr_dag_h);
+ }
+
+ if (redundantTwoErr && correct_it) {
+ wr_dag_h = rf_MakeSimpleDAG(raidPtr, 1, numbytes, redundantbuf2, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
+ "Wnred2", alloclist, flags, RF_IO_NORMAL_PRIORITY);
+ wrBlock = wr_dag_h->succedents[0]; wrUnblock = wrBlock->succedents[0]->succedents[0];
+ wrBlock->succedents[0]->params[0].p = asmap->qInfo;
+ wrBlock->succedents[0]->params[2].v = psID;
+ wrBlock->succedents[0]->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ bzero((char *)&tracerec,sizeof(tracerec));
+ wr_dag_h->tracerec = &tracerec;
+ if (rf_verifyParityDebug) {
+ printf("Dag of write new second redundant information in parity verify :\n");
+ rf_PrintDAGList(wr_dag_h);
+ }
+ RF_LOCK_MUTEX(mcpair->mutex);
+ mcpair->flag = 0;
+ rf_DispatchDAG(wr_dag_h, (void (*)(void *))rf_MCPairWakeupFunc,
+ (void *) mcpair);
+ while (!mcpair->flag)
+ RF_WAIT_COND(mcpair->cond, mcpair->mutex);
+ RF_UNLOCK_MUTEX(mcpair->mutex);
+ if (wr_dag_h->status != rf_enable) {
+ RF_ERRORMSG("Unable to correct second redundant information in VerifyParity: can't write the stripe\n");
+ red2_cant_correct = RF_TRUE;
+ } else {
+ red2_corrected = RF_TRUE;
+ }
+ rf_FreeDAG(wr_dag_h);
+ }
+ if ( (redundantOneErr && parity_cant_correct) ||
+ (redundantTwoErr && red2_cant_correct ))
+ retcode = RF_PARITY_COULD_NOT_CORRECT;
+ if ( (retcode == RF_PARITY_BAD) && parity_corrected && red2_corrected )
+ retcode = RF_PARITY_CORRECTED;
+
+
+out:
+ rf_FreeAccessStripeMap(asm_h);
+ rf_FreeAllocList(alloclist);
+ rf_FreeDAG(rd_dag_h);
+ rf_FreeMCPair(mcpair);
+ return(retcode);
+}
+
+#endif /* RF_INCLUDE_EVENODD > 0 */
diff --git a/sys/dev/raidframe/rf_evenodd.h b/sys/dev/raidframe/rf_evenodd.h
new file mode 100644
index 00000000000..24e5a811447
--- /dev/null
+++ b/sys/dev/raidframe/rf_evenodd.h
@@ -0,0 +1,49 @@
+/* $OpenBSD: rf_evenodd.h,v 1.1 1999/01/11 14:29:21 niklas Exp $ */
+/* $NetBSD: rf_evenodd.h,v 1.1 1998/11/13 04:20:29 oster Exp $ */
+/*
+ * Copyright (c) 1995, 1996 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Chang-Ming Wu
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+#ifndef _RF__RF_EVENODD_H_
+#define _RF__RF_EVENODD_H_
+
+/* extern declarations of the failure mode functions. */
+int rf_ConfigureEvenOdd(RF_ShutdownList_t **shutdownListp, RF_Raid_t *raidPtr,
+ RF_Config_t *cfgPtr);
+int rf_GetDefaultNumFloatingReconBuffersEvenOdd(RF_Raid_t *raidPtr);
+RF_HeadSepLimit_t rf_GetDefaultHeadSepLimitEvenOdd(RF_Raid_t *raidPtr);
+void rf_IdentifyStripeEvenOdd(RF_Raid_t *raidPtr, RF_RaidAddr_t addr,
+ RF_RowCol_t **diskids, RF_RowCol_t *outrow);
+void rf_MapParityEvenOdd(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector,
+ RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap);
+void rf_MapEEvenOdd(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector,
+ RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap);
+void rf_EODagSelect(RF_Raid_t *raidPtr, RF_IoType_t type,
+ RF_AccessStripeMap_t *asmap, RF_VoidFuncPtr *createFunc);
+int rf_VerifyParityEvenOdd(RF_Raid_t *raidPtr, RF_RaidAddr_t raidAddr,
+ RF_PhysDiskAddr_t *parityPDA, int correct_it, RF_RaidAccessFlags_t flags);
+
+#endif /* !_RF__RF_EVENODD_H_ */
diff --git a/sys/dev/raidframe/rf_evenodd_dagfuncs.c b/sys/dev/raidframe/rf_evenodd_dagfuncs.c
new file mode 100644
index 00000000000..2762ac725af
--- /dev/null
+++ b/sys/dev/raidframe/rf_evenodd_dagfuncs.c
@@ -0,0 +1,887 @@
+/* $OpenBSD: rf_evenodd_dagfuncs.c,v 1.1 1999/01/11 14:29:21 niklas Exp $ */
+/* $NetBSD: rf_evenodd_dagfuncs.c,v 1.1 1998/11/13 04:20:29 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: ChangMing Wu
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*
+ * Code for RAID-EVENODD architecture.
+ */
+
+#include "rf_types.h"
+#include "rf_raid.h"
+#include "rf_dag.h"
+#include "rf_dagffrd.h"
+#include "rf_dagffwr.h"
+#include "rf_dagdegrd.h"
+#include "rf_dagdegwr.h"
+#include "rf_dagutils.h"
+#include "rf_dagfuncs.h"
+#include "rf_threadid.h"
+#include "rf_etimer.h"
+#include "rf_general.h"
+#include "rf_configure.h"
+#include "rf_parityscan.h"
+#include "rf_sys.h"
+#include "rf_evenodd.h"
+#include "rf_evenodd_dagfuncs.h"
+
+/* These redundant functions are used for small writes */
+RF_RedFuncs_t rf_EOSmallWritePFuncs = { rf_RegularXorFunc, "Regular Old-New P", rf_SimpleXorFunc, "Simple Old-New P" };
+RF_RedFuncs_t rf_EOSmallWriteEFuncs = { rf_RegularONEFunc, "Regular Old-New E", rf_SimpleONEFunc, "Simple Old-New E" };
+
+/* These redundant functions are used for degraded reads */
+RF_RedFuncs_t rf_eoPRecoveryFuncs = { rf_RecoveryXorFunc, "Recovery Xr", rf_RecoveryXorFunc, "Recovery Xr"};
+RF_RedFuncs_t rf_eoERecoveryFuncs = { rf_RecoveryEFunc, "Recovery E Func", rf_RecoveryEFunc, "Recovery E Func" };
+
+/**********************************************************************************************
+ * The following encoding node function is used in EO_000_CreateLargeWriteDAG.
+ **********************************************************************************************/
+int rf_RegularPEFunc(node)
+ RF_DagNode_t *node;
+{
+ rf_RegularESubroutine(node,node->results[1]);
+ rf_RegularXorFunc(node); /* does the wakeup here! */
+#if 1
+ return(0); /* XXX This was missing... GO */
+#endif
+}
+
+
+/************************************************************************************************
+ * For EO_001_CreateSmallWriteDAG, either (i) RegularONEFunc() or (ii) SimpleONEFunc() is
+ * used.  The former is used when the write accesses at least a full stripe unit's worth of
+ * sectors.  The latter is used when the write accesses two stripe units but the total number
+ * of sectors is less than the sectors per SU.  In that case the accesses to parity and 'E'
+ * appear as disconnected areas within their stripe units, and the parity write and the 'E'
+ * write are each divided into two distinct writes (four in total).  These simple old-new and
+ * regular old-new writes proceed as in RAID-5.
+ ************************************************************************************************/
+
+/* Algorithm:
+     1. Store the difference of the old data and the new data in the Rod buffer.
+     2. Then encode this buffer into the buffer which already holds the old 'E' information;
+	the result can be shown to be the new 'E' information.
+     3. Xor the Wnd buffer into the difference buffer to recover the original old data.
+   An alternative would be to allocate a temporary buffer for the difference of the old and
+   new data and then encode that into the old 'E' buffer to form the new 'E', but this
+   approach runs at the same speed and needs more memory.
+*/
+int rf_RegularONEFunc(node)
+ RF_DagNode_t *node;
+{
+ RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams-1].p;
+ RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &raidPtr->Layout;
+ int EpdaIndex = (node->numParams-1)/2 - 1; /* the parameter of node where you can find e-pda */
+ int i, k, retcode = 0;
+ int suoffset, length;
+ RF_RowCol_t scol;
+ char *srcbuf, *destbuf;
+ RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
+ RF_Etimer_t timer;
+ RF_PhysDiskAddr_t *pda, *EPDA = (RF_PhysDiskAddr_t *) node->params[EpdaIndex].p;
+ int ESUOffset = rf_StripeUnitOffset(layoutPtr, EPDA->startSector); /* generally zero */
+
+ RF_ASSERT( EPDA->type == RF_PDA_TYPE_Q );
+ RF_ASSERT(ESUOffset == 0);
+
+ RF_ETIMER_START(timer);
+
+	/* Xor the Wnd buffer into the Rod buffer; the difference of the old and new data ends up in the Rod buffer */
+ for( k=0; k< EpdaIndex; k += 2) {
+ length = rf_RaidAddressToByte(raidPtr, ((RF_PhysDiskAddr_t *)node->params[k].p)->numSector );
+ retcode = rf_bxor( node->params[k+EpdaIndex+3].p, node->params[k+1].p, length, node->dagHdr->bp);
+ }
+	/* Start encoding the buffer holding the difference of the old and new data into the 'E' buffer */
+ for (i=0; i<EpdaIndex; i+=2) if (node->params[i+1].p != node->results[0]) { /* results[0] is buf ptr of E */
+ pda = (RF_PhysDiskAddr_t *) node->params[i].p;
+ srcbuf = (char *) node->params[i+1].p;
+ scol = rf_EUCol(layoutPtr, pda->raidAddress );
+ suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
+ destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr,suoffset);
+ rf_e_encToBuf(raidPtr, scol, srcbuf, RF_EO_MATRIX_DIM - 2, destbuf, pda->numSector);
+ }
+	/* Recover the original old data to be used by the parity encoding function in the XorNode */
+ for( k=0; k< EpdaIndex; k += 2) {
+ length = rf_RaidAddressToByte(raidPtr, ((RF_PhysDiskAddr_t *)node->params[k].p)->numSector );
+ retcode = rf_bxor( node->params[k+EpdaIndex+3].p, node->params[k+1].p, length, node->dagHdr->bp);
+ }
+ RF_ETIMER_STOP(timer);
+ RF_ETIMER_EVAL(timer);
+ tracerec->q_us += RF_ETIMER_VAL_US(timer);
+ rf_GenericWakeupFunc(node, 0);
+#if 1
+ return(0); /* XXX this was missing.. GO */
+#endif
+}
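+
+/*
+ * Illustrative sketch (not part of the driver): the delta-XOR idea used by the
+ * small-write functions above, shown with plain parity bytes.  The function and
+ * variable names below are hypothetical and exist only for this example; the E
+ * column is updated in the same spirit, except that the difference buffer is run
+ * through the EvenOdd encoder (rf_e_encToBuf) instead of being XORed directly.
+ */
+#if 0
+static void
+example_delta_update(unsigned char *olddata, unsigned char *newdata,
+    unsigned char *parity, int len)
+{
+	int i;
+	unsigned char diff;
+
+	for (i = 0; i < len; i++) {
+		/* difference of the old and new data */
+		diff = olddata[i] ^ newdata[i];
+		/* P_new = P_old ^ (D_old ^ D_new), as in a RAID-5 small write */
+		parity[i] ^= diff;
+	}
+}
+#endif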
+
+int rf_SimpleONEFunc(node)
+ RF_DagNode_t *node;
+{
+ RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams-1].p;
+ RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &raidPtr->Layout;
+ RF_PhysDiskAddr_t *pda = (RF_PhysDiskAddr_t *) node->params[0].p;
+ int retcode = 0;
+ char *srcbuf, *destbuf;
+ RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
+ int length;
+ RF_RowCol_t scol;
+ RF_Etimer_t timer;
+
+ RF_ASSERT( ((RF_PhysDiskAddr_t *)node->params[2].p)->type == RF_PDA_TYPE_Q );
+ if (node->dagHdr->status == rf_enable) {
+ RF_ETIMER_START(timer);
+ length = rf_RaidAddressToByte(raidPtr, ((RF_PhysDiskAddr_t *)node->params[4].p)->numSector );/* this is a pda of writeDataNodes */
+ /* bxor to buffer of readDataNodes */
+ retcode = rf_bxor( node->params[5].p, node->params[1].p, length, node->dagHdr->bp);
+		/* find the corresponding column in the encoding matrix for the write column to be encoded into the redundant disk 'E' */
+ scol = rf_EUCol(layoutPtr, pda->raidAddress );
+ srcbuf = node->params[1].p;
+ destbuf = node->params[3].p;
+ /* Start encoding process */
+ rf_e_encToBuf(raidPtr, scol, srcbuf, RF_EO_MATRIX_DIM - 2, destbuf, pda->numSector);
+ rf_bxor( node->params[5].p, node->params[1].p, length, node->dagHdr->bp);
+ RF_ETIMER_STOP(timer); RF_ETIMER_EVAL(timer); tracerec->q_us += RF_ETIMER_VAL_US(timer);
+
+ }
+ return(rf_GenericWakeupFunc(node, retcode)); /* call wake func explicitly since no I/O in this node */
+}
+
+
+/****** called by rf_RegularPEFunc(node) and rf_RegularEFunc(node) in the fault-free large write ********/
+void rf_RegularESubroutine(node, ebuf)
+ RF_DagNode_t *node;
+ char *ebuf;
+{
+ RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams-1].p;
+ RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &raidPtr->Layout;
+ RF_PhysDiskAddr_t *pda;
+ int i, suoffset;
+ RF_RowCol_t scol;
+ char *srcbuf, *destbuf;
+ RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
+ RF_Etimer_t timer;
+
+ RF_ETIMER_START(timer);
+ for (i=0; i<node->numParams-2; i+=2) {
+ RF_ASSERT( node->params[i+1].p != ebuf );
+ pda = (RF_PhysDiskAddr_t *) node->params[i].p;
+ suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
+ scol = rf_EUCol(layoutPtr, pda->raidAddress );
+ srcbuf = (char *) node->params[i+1].p;
+ destbuf = ebuf + rf_RaidAddressToByte(raidPtr, suoffset );
+ rf_e_encToBuf(raidPtr, scol, srcbuf, RF_EO_MATRIX_DIM - 2, destbuf, pda->numSector);
+ }
+ RF_ETIMER_STOP(timer);
+ RF_ETIMER_EVAL(timer);
+ tracerec->xor_us += RF_ETIMER_VAL_US(timer);
+}
+
+
+/*******************************************************************************************
+ * Used in EO_001_CreateLargeWriteDAG
+ ******************************************************************************************/
+int rf_RegularEFunc(node)
+ RF_DagNode_t *node;
+{
+ rf_RegularESubroutine(node, node->results[0]);
+ rf_GenericWakeupFunc(node, 0);
+#if 1
+ return(0); /* XXX this was missing?.. GO */
+#endif
+}
+
+/*******************************************************************************************
+ * This degraded function allows only two cases:
+ *  1. when the write accesses the full failed stripe unit, the access may span more
+ *     than one stripe unit.
+ *  2. when the write accesses only part of the failed SU, we assume that accesses of
+ *     more than one stripe unit are not allowed, so that the write can be dealt with
+ *     like a large write.
+ *  The following function is based on these assumptions.  So except in the second case,
+ *  it looks the same as a large write encoding function.  But this is not exactly the
+ *  normal way of doing a degraded write, since RAIDframe has to break accesses other
+ *  than the above two cases into smaller accesses.  We may have to change
+ *  DegrESubroutine in the future.
+ *******************************************************************************************/
+void rf_DegrESubroutine(node, ebuf)
+ RF_DagNode_t *node;
+ char *ebuf;
+{
+ RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams-1].p;
+ RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &raidPtr->Layout;
+ RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams-2].p;
+ RF_PhysDiskAddr_t *pda;
+ int i, suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr, failedPDA->startSector);
+ RF_RowCol_t scol;
+ char *srcbuf, *destbuf;
+ RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
+ RF_Etimer_t timer;
+
+ RF_ETIMER_START(timer);
+ for (i=0; i<node->numParams-2; i+=2) {
+ RF_ASSERT( node->params[i+1].p != ebuf );
+ pda = (RF_PhysDiskAddr_t *) node->params[i].p;
+ suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
+ scol = rf_EUCol(layoutPtr, pda->raidAddress );
+ srcbuf = (char *) node->params[i+1].p;
+ destbuf = ebuf + rf_RaidAddressToByte(raidPtr, suoffset-failedSUOffset);
+ rf_e_encToBuf(raidPtr, scol, srcbuf, RF_EO_MATRIX_DIM - 2, destbuf, pda->numSector);
+ }
+
+ RF_ETIMER_STOP(timer); RF_ETIMER_EVAL(timer); tracerec->q_us += RF_ETIMER_VAL_US(timer);
+}
+
+
+/**************************************************************************************
+ * This function is used in the case where one data disk has failed and both redundant
+ * disks are alive.  It is used in EO_100_CreateWriteDAG.  Note: if another disk in the
+ * stripe has failed but is not accessed at this time, then we should instead use
+ * rf_EOWriteDoubleRecoveryFunc().
+ **************************************************************************************/
+int rf_Degraded_100_EOFunc(node)
+ RF_DagNode_t *node;
+{
+ rf_DegrESubroutine(node, node->results[1]);
+ rf_RecoveryXorFunc(node); /* does the wakeup here! */
+#if 1
+ return(0); /* XXX this was missing... SHould these be void functions??? GO */
+#endif
+}
+
+/**************************************************************************************
+ * This function encodes one sector of one of the data disks onto the E disk.
+ * However, in EvenOdd this function can also be used as a decoding function to recover
+ * data from a dead disk in the case of a parity failure plus a single data failure.
+ **************************************************************************************/
+void rf_e_EncOneSect(
+ RF_RowCol_t srcLogicCol,
+ char *srcSecbuf,
+ RF_RowCol_t destLogicCol,
+ char *destSecbuf,
+ int bytesPerSector)
+{
+	int S_index;	/* index of the EU in the src col which needs to be Xored into all EUs in a dest sector */
+ int numRowInEncMatix = (RF_EO_MATRIX_DIM) -1;
+	RF_RowCol_t j, indexInDest, /* row index of an encoding unit in the destination column of the encoding matrix */
+	    indexInSrc; /* row index of an encoding unit in the source column used for recovery */
+ int bytesPerEU = bytesPerSector/numRowInEncMatix;
+
+#if RF_EO_MATRIX_DIM > 17
+ int shortsPerEU = bytesPerEU/sizeof(short);
+ short *destShortBuf, *srcShortBuf1, *srcShortBuf2;
+ register short temp1;
+#elif RF_EO_MATRIX_DIM == 17
+ int longsPerEU = bytesPerEU/sizeof(long);
+ long *destLongBuf, *srcLongBuf1, *srcLongBuf2;
+ register long temp1;
+#endif
+
+#if RF_EO_MATRIX_DIM > 17
+ RF_ASSERT( sizeof(short) == 2 || sizeof(short) == 1 );
+ RF_ASSERT( bytesPerEU % sizeof(short) == 0 );
+#elif RF_EO_MATRIX_DIM == 17
+ RF_ASSERT( sizeof(long) == 8 || sizeof(long) == 4 );
+ RF_ASSERT( bytesPerEU % sizeof(long) == 0);
+#endif
+
+ S_index = rf_EO_Mod( ( RF_EO_MATRIX_DIM -1 + destLogicCol - srcLogicCol), RF_EO_MATRIX_DIM);
+#if RF_EO_MATRIX_DIM > 17
+ srcShortBuf1 = (short *)(srcSecbuf + S_index * bytesPerEU);
+#elif RF_EO_MATRIX_DIM == 17
+ srcLongBuf1 = (long *)(srcSecbuf + S_index * bytesPerEU);
+#endif
+
+ for( indexInDest = 0; indexInDest < numRowInEncMatix ; indexInDest++){
+ indexInSrc = rf_EO_Mod( (indexInDest + destLogicCol - srcLogicCol), RF_EO_MATRIX_DIM );
+
+#if RF_EO_MATRIX_DIM > 17
+ destShortBuf = (short *)(destSecbuf + indexInDest * bytesPerEU);
+ srcShortBuf2 = (short *)(srcSecbuf + indexInSrc * bytesPerEU);
+ for(j=0; j < shortsPerEU; j++) {
+ temp1 = destShortBuf[j]^srcShortBuf1[j];
+ /* note: S_index won't be at the end row for any src col! */
+ if(indexInSrc != RF_EO_MATRIX_DIM -1) destShortBuf[j] = (srcShortBuf2[j])^temp1;
+ /* if indexInSrc is at the end row, ie. RF_EO_MATRIX_DIM -1, then all elements are zero! */
+ else destShortBuf[j] = temp1;
+ }
+
+#elif RF_EO_MATRIX_DIM == 17
+ destLongBuf = (long *)(destSecbuf + indexInDest * bytesPerEU);
+ srcLongBuf2 = (long *)(srcSecbuf + indexInSrc * bytesPerEU);
+ for(j=0; j < longsPerEU; j++) {
+ temp1 = destLongBuf[j]^srcLongBuf1[j];
+ if(indexInSrc != RF_EO_MATRIX_DIM -1) destLongBuf[j] = (srcLongBuf2[j])^temp1;
+ else destLongBuf[j] = temp1;
+ }
+#endif
+ }
+}
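+
+/*
+ * Worked example of the index arithmetic above (illustrative only, assuming the
+ * default RF_EO_MATRIX_DIM of 17 and encoding data column 3 into the E column,
+ * i.e. srcLogicCol = 3 and destLogicCol = RF_EO_MATRIX_DIM - 2 = 15):
+ * S_index = rf_EO_Mod(16 + 15 - 3, 17) = 28 % 17 = 11, so encoding unit 11 of the
+ * source sector is XORed into every destination EU.  The in-diagonal source EU for
+ * indexInDest = 0 is rf_EO_Mod(0 + 15 - 3, 17) = 12, for indexInDest = 5 it is
+ * rf_EO_Mod(5 + 15 - 3, 17) = 0, and for indexInDest = 4 it is 16, the imaginary
+ * all-zero bottom row, in which case only the S_index unit is folded in.
+ */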
+
+void rf_e_encToBuf(
+ RF_Raid_t *raidPtr,
+ RF_RowCol_t srcLogicCol,
+ char *srcbuf,
+ RF_RowCol_t destLogicCol,
+ char *destbuf,
+ int numSector)
+{
+ int i, bytesPerSector = rf_RaidAddressToByte(raidPtr, 1);
+
+ for (i=0; i < numSector; i++)
+ {
+ rf_e_EncOneSect( srcLogicCol, srcbuf, destLogicCol, destbuf, bytesPerSector);
+ srcbuf += bytesPerSector;
+ destbuf += bytesPerSector;
+ }
+}
+
+/**************************************************************************************
+ * When the parity disk and one data disk die, we use the second redundant information,
+ * 'E', to recover the data on the dead disk.  This function is used in the recovery
+ * node of EO_110_CreateReadDAG.
+ **************************************************************************************/
+int rf_RecoveryEFunc(node)
+ RF_DagNode_t *node;
+{
+ RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams-1].p;
+ RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &raidPtr->Layout;
+ RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams-2].p;
+ RF_RowCol_t scol, /*source logical column*/
+ fcol = rf_EUCol(layoutPtr, failedPDA->raidAddress ); /* logical column of failed SU */
+ int i;
+ RF_PhysDiskAddr_t *pda;
+ int suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr,failedPDA->startSector);
+ char *srcbuf, *destbuf;
+ RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
+ RF_Etimer_t timer;
+
+ bzero( (char *)node->results[0], rf_RaidAddressToByte(raidPtr,failedPDA->numSector));
+ if (node->dagHdr->status == rf_enable) {
+ RF_ETIMER_START(timer);
+ for (i=0; i<node->numParams-2; i+=2) if (node->params[i+1].p != node->results[0]) {
+ pda = (RF_PhysDiskAddr_t *) node->params[i].p;
+			if( i == node->numParams - 4 ) scol = RF_EO_MATRIX_DIM - 2; /* the column of the redundant E */
+ else scol = rf_EUCol(layoutPtr, pda->raidAddress );
+ srcbuf = (char *) node->params[i+1].p;
+ suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
+ destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr,suoffset-failedSUOffset);
+ rf_e_encToBuf(raidPtr, scol, srcbuf, fcol, destbuf, pda->numSector);
+ }
+ RF_ETIMER_STOP(timer);
+ RF_ETIMER_EVAL(timer);
+ tracerec->xor_us += RF_ETIMER_VAL_US(timer);
+ }
+ return (rf_GenericWakeupFunc(node, 0)); /* node execute successfully */
+}
+
+/**************************************************************************************
+ * This function is used in the case where one data disk and the parity have failed.
+ * (used in EO_110_CreateWriteDAG)
+ **************************************************************************************/
+int rf_EO_DegradedWriteEFunc(RF_DagNode_t *node)
+{
+ rf_DegrESubroutine(node, node->results[0]);
+ rf_GenericWakeupFunc(node, 0);
+#if 1
+ return(0); /* XXX Yet another one!! GO */
+#endif
+}
+
+
+
+/**************************************************************************************
+ * THE FOLLOWING FUNCTION IS USED IN THE DOUBLE-DEGRADED READ AND WRITE CASES
+ **************************************************************************************/
+
+void rf_doubleEOdecode(
+ RF_Raid_t *raidPtr,
+ char **rrdbuf,
+ char **dest,
+ RF_RowCol_t *fcol,
+ char *pbuf,
+ char *ebuf)
+{
+ RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &(raidPtr->Layout);
+ int i, j, k, f1, f2, row;
+ int rrdrow, erow, count = 0;
+ int bytesPerSector = rf_RaidAddressToByte(raidPtr, 1 );
+ int numRowInEncMatix = (RF_EO_MATRIX_DIM) -1;
+#if 0
+ int pcol = (RF_EO_MATRIX_DIM) - 1;
+#endif
+ int ecol = (RF_EO_MATRIX_DIM) - 2;
+ int bytesPerEU = bytesPerSector/numRowInEncMatix;
+ int numDataCol = layoutPtr->numDataCol;
+#if RF_EO_MATRIX_DIM > 17
+ int shortsPerEU = bytesPerEU/sizeof(short);
+ short *rrdbuf_current, *pbuf_current, *ebuf_current;
+ short *dest_smaller, *dest_smaller_current, *dest_larger, *dest_larger_current;
+ register short *temp;
+ short *P;
+
+ RF_ASSERT( bytesPerEU % sizeof(short) == 0);
+ RF_Malloc(P, bytesPerEU, (short *));
+ RF_Malloc(temp, bytesPerEU, (short *));
+#elif RF_EO_MATRIX_DIM == 17
+ int longsPerEU = bytesPerEU/sizeof(long);
+ long *rrdbuf_current, *pbuf_current, *ebuf_current;
+ long *dest_smaller, *dest_smaller_current, *dest_larger, *dest_larger_current;
+ register long *temp;
+ long *P;
+
+ RF_ASSERT( bytesPerEU % sizeof(long) == 0);
+ RF_Malloc(P, bytesPerEU, (long *));
+ RF_Malloc(temp, bytesPerEU, (long *));
+#endif
+ RF_ASSERT( *((long *)dest[0]) == 0);
+ RF_ASSERT( *((long *)dest[1]) == 0);
+ bzero((char *)P, bytesPerEU);
+ bzero((char *)temp, bytesPerEU);
+ RF_ASSERT( *P == 0 );
+	/* calculate the 'P' parameter, which is not the parity but the Xor of all elements in
+	   the last two columns, i.e. the 'E' and parity columns; see the reference paper by Blaum et al., 1993 */
+ for( i=0; i< numRowInEncMatix; i++)
+ for( k=0; k< longsPerEU; k++) {
+#if RF_EO_MATRIX_DIM > 17
+ ebuf_current = ((short *)ebuf) + i*shortsPerEU + k;
+ pbuf_current = ((short *)pbuf) + i*shortsPerEU + k;
+#elif RF_EO_MATRIX_DIM == 17
+ ebuf_current = ((long *)ebuf) + i*longsPerEU + k;
+ pbuf_current = ((long *)pbuf) + i*longsPerEU + k;
+#endif
+ P[k] ^= *ebuf_current;
+ P[k] ^= *pbuf_current;
+ }
+ RF_ASSERT( fcol[0] != fcol[1] );
+ if( fcol[0] < fcol[1] ) {
+#if RF_EO_MATRIX_DIM > 17
+ dest_smaller = (short *)(dest[0]);
+ dest_larger = (short *)(dest[1]);
+#elif RF_EO_MATRIX_DIM == 17
+ dest_smaller = (long *)(dest[0]);
+ dest_larger = (long *)(dest[1]);
+#endif
+ f1 = fcol[0];
+ f2 = fcol[1];
+ }
+ else {
+#if RF_EO_MATRIX_DIM > 17
+ dest_smaller = (short *)(dest[1]);
+ dest_larger = (short *)(dest[0]);
+#elif RF_EO_MATRIX_DIM == 17
+ dest_smaller = (long *)(dest[1]);
+ dest_larger = (long *)(dest[0]);
+#endif
+ f1 = fcol[1];
+ f2 = fcol[0];
+ }
+ row = (RF_EO_MATRIX_DIM) -1;
+ while( (row = rf_EO_Mod( (row+f1-f2), RF_EO_MATRIX_DIM )) != ( (RF_EO_MATRIX_DIM) -1) )
+ {
+#if RF_EO_MATRIX_DIM > 17
+ dest_larger_current = dest_larger + row*shortsPerEU;
+ dest_smaller_current = dest_smaller + row*shortsPerEU;
+#elif RF_EO_MATRIX_DIM == 17
+ dest_larger_current = dest_larger + row*longsPerEU;
+ dest_smaller_current = dest_smaller + row*longsPerEU;
+#endif
+		/** Do the diagonal recovery.  Initially, temp[k] = (failed 1),
+		    which is the failed data in the column with the smaller col index. **/
+ /* step 1: ^(SUM of nonfailed in-diagonal A(rrdrow,0..m-3)) */
+ for( j=0; j< numDataCol; j++)
+ {
+ if( j == f1 || j == f2 ) continue;
+ rrdrow = rf_EO_Mod( (row+f2-j), RF_EO_MATRIX_DIM );
+ if ( rrdrow != (RF_EO_MATRIX_DIM) -1 ) {
+#if RF_EO_MATRIX_DIM > 17
+ rrdbuf_current = (short *)(rrdbuf[j]) + rrdrow * shortsPerEU;
+ for (k=0; k< shortsPerEU; k++) temp[k] ^= *(rrdbuf_current + k);
+#elif RF_EO_MATRIX_DIM == 17
+ rrdbuf_current = (long *)(rrdbuf[j]) + rrdrow * longsPerEU;
+ for (k=0; k< longsPerEU; k++) temp[k] ^= *(rrdbuf_current + k);
+#endif
+ }
+ }
+		/* step 2: ^E(erow,m-2).  If erow is at the bottom row, don't Xor into it.
+		   E(erow,m-2) = (principal diagonal) ^ (failed 1) ^ (failed 2)
+ ^ ( SUM of nonfailed in-diagonal A(rrdrow,0..m-3) )
+		   After this step, temp[k] = (principal diagonal) ^ (failed 2) */
+
+ erow = rf_EO_Mod( (row+f2-ecol), (RF_EO_MATRIX_DIM) );
+ if ( erow != (RF_EO_MATRIX_DIM) -1) {
+#if RF_EO_MATRIX_DIM > 17
+ ebuf_current = (short *)ebuf + shortsPerEU * erow;
+ for (k=0; k< shortsPerEU; k++) temp[k] ^= *(ebuf_current+k);
+#elif RF_EO_MATRIX_DIM == 17
+ ebuf_current = (long *)ebuf + longsPerEU * erow;
+ for (k=0; k< longsPerEU; k++) temp[k] ^= *(ebuf_current+k);
+#endif
+ }
+ /* step 3: ^P to obtain the failed data (failed 2).
+		   P can be shown to actually be the (principal diagonal).
+ After this step, temp[k] = (failed 2), the failed data to be recovered */
+#if RF_EO_MATRIX_DIM > 17
+ for (k=0; k< shortsPerEU; k++) temp[k] ^= P[k];
+ /* Put the data to the destination buffer */
+ for (k=0; k< shortsPerEU; k++) dest_larger_current[k] = temp[k];
+#elif RF_EO_MATRIX_DIM == 17
+ for (k=0; k< longsPerEU; k++) temp[k] ^= P[k];
+ /* Put the data to the destination buffer */
+ for (k=0; k< longsPerEU; k++) dest_larger_current[k] = temp[k];
+#endif
+
+		/** THE FOLLOWING DOES THE HORIZONTAL XOR **/
+		/* step 1: ^(SUM of A(row,0..m-3)), i.e. all nonfailed data columns */
+ for (j=0; j< numDataCol; j++)
+ {
+ if( j == f1 || j == f2 ) continue;
+#if RF_EO_MATRIX_DIM > 17
+ rrdbuf_current = (short *)(rrdbuf[j]) + row * shortsPerEU;
+ for (k=0; k< shortsPerEU; k++) temp[k] ^= *(rrdbuf_current+k);
+#elif RF_EO_MATRIX_DIM == 17
+ rrdbuf_current = (long *)(rrdbuf[j]) + row * longsPerEU;
+ for (k=0; k< longsPerEU; k++) temp[k] ^= *(rrdbuf_current+k);
+#endif
+ }
+ /* step 2: ^A(row,m-1) */
+ /* step 3: Put the data to the destination buffer */
+#if RF_EO_MATRIX_DIM > 17
+ pbuf_current = (short *)pbuf + shortsPerEU * row;
+ for (k=0; k< shortsPerEU; k++) temp[k] ^= *(pbuf_current+k);
+ for (k=0; k< shortsPerEU; k++) dest_smaller_current[k] = temp[k];
+#elif RF_EO_MATRIX_DIM == 17
+ pbuf_current = (long *)pbuf + longsPerEU * row;
+ for (k=0; k< longsPerEU; k++) temp[k] ^= *(pbuf_current+k);
+ for (k=0; k< longsPerEU; k++) dest_smaller_current[k] = temp[k];
+#endif
+ count++;
+ }
+	/* Check that all encoding units in the data buffer have been decoded;
+	   according to EvenOdd theory, if "RF_EO_MATRIX_DIM" is a prime number,
+	   this algorithm will have covered the whole buffer */
+ RF_ASSERT( count == numRowInEncMatix );
+ RF_Free((char *)P, bytesPerEU);
+ RF_Free((char *)temp, bytesPerEU);
+}
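+
+/*
+ * Worked example of the diagonal traversal above (illustrative only, using a
+ * hypothetical prime dimension of 5 and failed logical columns f1 = 0, f2 = 2):
+ * the loop starts at row 4 and visits rf_EO_Mod(4 + 0 - 2, 5) = 2, then 0, then
+ * rf_EO_Mod(0 - 2, 5) = 3, then 1, and stops when the next value is again 4
+ * (the dimension minus one).  Each row 0..3 is visited exactly once, so count
+ * ends up equal to numRowInEncMatix, which is exactly what the assertion above
+ * checks; full coverage is guaranteed because the dimension is prime.
+ */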
+
+
+/***************************************************************************************
+* This function is called by the double-degraded read DAG,
+* EO_200_CreateReadDAG.
+*
+***************************************************************************************/
+int rf_EvenOddDoubleRecoveryFunc(node)
+ RF_DagNode_t *node;
+{
+ int ndataParam = 0;
+ int np = node->numParams;
+ RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np-1].p;
+ RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np-2].p;
+ RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &(raidPtr->Layout);
+ int i, prm, sector, nresults = node->numResults;
+ RF_SectorCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
+ unsigned sosAddr;
+ int two = 0, mallc_one= 0, mallc_two = 0; /* flags to indicate if memory is allocated */
+ int bytesPerSector = rf_RaidAddressToByte(raidPtr, 1 );
+ RF_PhysDiskAddr_t *ppda,*ppda2,*epda,*epda2,*pda, *pda0, *pda1, npda;
+ RF_RowCol_t fcol[2], fsuoff[2], fsuend[2], numDataCol = layoutPtr->numDataCol;
+ char **buf, *ebuf, *pbuf, *dest[2];
+ long *suoff=NULL, *suend=NULL, *prmToCol=NULL, psuoff, esuoff;
+ RF_SectorNum_t startSector, endSector;
+ RF_Etimer_t timer;
+ RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
+
+ RF_ETIMER_START(timer);
+
+ /* Find out the number of parameters which are pdas for data information */
+	for (i = 0; i < np; i++)
+ if( ((RF_PhysDiskAddr_t *)node->params[i].p)->type != RF_PDA_TYPE_DATA) {ndataParam = i ; break; }
+
+ RF_Malloc(buf, numDataCol*sizeof(char *), (char **));
+ if (ndataParam != 0 ){
+ RF_Malloc(suoff, ndataParam*sizeof(long), (long *) );
+ RF_Malloc(suend, ndataParam*sizeof(long), (long *) );
+ RF_Malloc(prmToCol, ndataParam*sizeof(long), (long *) );
+ }
+
+ if (asmap->failedPDAs[1] &&
+ (asmap->failedPDAs[1]->numSector + asmap->failedPDAs[0]->numSector < secPerSU)) {
+ RF_ASSERT(0); /* currently, no support for this situation */
+ ppda = node->params[np-6].p;
+ ppda2 = node->params[np-5].p;
+ RF_ASSERT( ppda2->type == RF_PDA_TYPE_PARITY );
+ epda = node->params[np-4].p;
+ epda2 = node->params[np-3].p;
+ RF_ASSERT( epda2->type == RF_PDA_TYPE_Q );
+ two = 1;
+ }
+ else {
+ ppda = node->params[np-4].p;
+ epda = node->params[np-3].p;
+ psuoff = rf_StripeUnitOffset(layoutPtr, ppda->startSector);
+ esuoff = rf_StripeUnitOffset(layoutPtr, epda->startSector);
+ RF_ASSERT( psuoff == esuoff );
+ }
+ /*
+	  the following has three goals:
+	  1. determine the startSector at which to begin decoding and the endSector at which to end decoding.
+	  2. determine the column numbers of the two failed disks.
+	  3. determine the offset and end offset of the access within each failed stripe unit.
+ */
+ if( nresults == 1 ) {
+ /* find the startSector to begin decoding */
+ pda = node->results[0];
+ bzero(pda->bufPtr, bytesPerSector*pda->numSector );
+ fsuoff[0] = rf_StripeUnitOffset(layoutPtr, pda->startSector );
+ fsuend[0] = fsuoff[0] + pda->numSector;
+ startSector = fsuoff[0];
+ endSector = fsuend[0];
+
+		/* find out the column of the failed disk being accessed */
+ fcol[0] = rf_EUCol(layoutPtr, pda->raidAddress );
+
+		/* find out the other failed column, which is not accessed */
+ sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
+ for (i=0; i < numDataCol; i++) {
+ npda.raidAddress = sosAddr + (i * secPerSU);
+ (raidPtr->Layout.map->MapSector)(raidPtr, npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0);
+ /* skip over dead disks */
+ if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
+ if (i != fcol[0]) break;
+ }
+ RF_ASSERT (i < numDataCol);
+ fcol[1] = i;
+ }
+ else {
+ RF_ASSERT ( nresults == 2 );
+ pda0 = node->results[0]; bzero(pda0->bufPtr, bytesPerSector*pda0->numSector );
+ pda1 = node->results[1]; bzero(pda1->bufPtr, bytesPerSector*pda1->numSector );
+		/* determine the failed column numbers of the two failed disks. */
+ fcol[0] = rf_EUCol(layoutPtr, pda0->raidAddress );
+ fcol[1] = rf_EUCol(layoutPtr, pda1->raidAddress );
+ /* determine the offset and end offset of the access within each failed stripe unit. */
+ fsuoff[0] = rf_StripeUnitOffset(layoutPtr, pda0->startSector );
+ fsuend[0] = fsuoff[0] + pda0->numSector;
+ fsuoff[1] = rf_StripeUnitOffset(layoutPtr, pda1->startSector );
+ fsuend[1] = fsuoff[1] + pda1->numSector;
+ /* determine the startSector to begin decoding */
+ startSector = RF_MIN( pda0->startSector, pda1->startSector );
+ /* determine the endSector to end decoding */
+ endSector = RF_MAX( fsuend[0], fsuend[1] );
+ }
+ /*
+	  assign the beginning sector and the end sector for each parameter, and
+	  find out the corresponding column # for each parameter
+ */
+ for( prm=0; prm < ndataParam; prm++ ) {
+ pda = node->params[prm].p;
+ suoff[prm] = rf_StripeUnitOffset(layoutPtr, pda->startSector);
+ suend[prm] = suoff[prm] + pda->numSector;
+ prmToCol[prm] = rf_EUCol(layoutPtr, pda->raidAddress );
+ }
+	/* 'sector' is the sector currently being decoded.  For each sector in the failed SU:
+	   1. find the parameters that cover the current sector and that are needed for
+	      decoding this sector of the failed SU;
+	   2. find out whether the sector is in the shadow of any accessed failed SU, and if
+	      not, malloc temporary space of one sector in size.
+	 */
+ for( sector = startSector; sector < endSector; sector++ ){
+ if ( nresults == 2 )
+ if( !(fsuoff[0]<=sector && sector<fsuend[0]) && !(fsuoff[1]<=sector && sector<fsuend[1]) )continue;
+ for( prm=0; prm < ndataParam; prm++ )
+ if( suoff[prm] <= sector && sector < suend[prm] )
+ buf[(prmToCol[prm])] = ((RF_PhysDiskAddr_t *)node->params[prm].p)->bufPtr +
+ rf_RaidAddressToByte(raidPtr, sector-suoff[prm]);
+		/* find out if the sector is in the shadow of any accessed failed SU.  If so, point dest[0] and dest[1]
+		   at the appropriate positions in the buffers corresponding to the failed SUs; if not, malloc temporary
+		   space of one sector in size as the destination of the decoding.
+		*/
+ RF_ASSERT( nresults == 1 || nresults == 2 );
+ if ( nresults == 1) {
+ dest[0] = ((RF_PhysDiskAddr_t *)node->results[0])->bufPtr + rf_RaidAddressToByte(raidPtr, sector-fsuoff[0]);
+ /* Always malloc temp buffer to dest[1] */
+ RF_Malloc( dest[1], bytesPerSector, (char *) );
+ bzero(dest[1],bytesPerSector); mallc_two = 1; }
+ else {
+ if( fsuoff[0] <= sector && sector < fsuend[0] )
+ dest[0] = ((RF_PhysDiskAddr_t *)node->results[0])->bufPtr + rf_RaidAddressToByte(raidPtr, sector-fsuoff[0]);
+ else { RF_Malloc( dest[0], bytesPerSector, (char *) );
+ bzero(dest[0],bytesPerSector); mallc_one = 1; }
+ if( fsuoff[1] <= sector && sector < fsuend[1] )
+ dest[1] = ((RF_PhysDiskAddr_t *)node->results[1])->bufPtr + rf_RaidAddressToByte(raidPtr, sector-fsuoff[1]);
+ else { RF_Malloc( dest[1], bytesPerSector, (char *) );
+ bzero(dest[1],bytesPerSector); mallc_two = 1; }
+ RF_ASSERT( mallc_one == 0 || mallc_two == 0 );
+ }
+ pbuf = ppda->bufPtr + rf_RaidAddressToByte(raidPtr, sector-psuoff );
+ ebuf = epda->bufPtr + rf_RaidAddressToByte(raidPtr, sector-esuoff );
+ /*
+		 * After finding all needed sectors, call the doubleEOdecode function to decode
+		 * one sector to the destination.
+ */
+ rf_doubleEOdecode(raidPtr, buf, dest, fcol, pbuf, ebuf );
+		/* free any allocated temporary memory, and clear the flags to indicate that no memory is allocated */
+ if( mallc_one == 1) RF_Free( dest[0], bytesPerSector );
+ if( mallc_two == 1) RF_Free( dest[1], bytesPerSector );
+ mallc_one = mallc_two = 0;
+ }
+ RF_Free(buf, numDataCol*sizeof(char *));
+ if (ndataParam != 0){
+ RF_Free(suoff, ndataParam*sizeof(long));
+ RF_Free(suend, ndataParam*sizeof(long));
+ RF_Free(prmToCol, ndataParam*sizeof(long));
+ }
+
+ RF_ETIMER_STOP(timer);
+ RF_ETIMER_EVAL(timer);
+ if (tracerec) {
+ tracerec->q_us += RF_ETIMER_VAL_US(timer);
+ }
+ rf_GenericWakeupFunc(node,0);
+#if 1
+ return(0); /* XXX is this even close!!?!?!!? GO */
+#endif
+}
+
+
+/* Currently, only access to one of the two failed SUs is allowed in this function.
+ * Also, asmap->numStripeUnitsAccessed is limited to one; RAIDframe will break a large access into
+ * many accesses of a single stripe unit.
+ */
+
+int rf_EOWriteDoubleRecoveryFunc(node)
+ RF_DagNode_t *node;
+{
+ int np = node->numParams;
+ RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np-1].p;
+ RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np-2].p;
+ RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &(raidPtr->Layout);
+ RF_SectorNum_t sector;
+ RF_RowCol_t col, scol;
+ int prm, i, j;
+ RF_SectorCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
+ unsigned sosAddr;
+ unsigned bytesPerSector = rf_RaidAddressToByte(raidPtr, 1 );
+ RF_int64 numbytes;
+ RF_SectorNum_t startSector, endSector;
+ RF_PhysDiskAddr_t *ppda,*epda,*pda, *fpda, npda;
+ RF_RowCol_t fcol[2], numDataCol = layoutPtr->numDataCol;
+ char **buf; /* buf[0], buf[1], buf[2], ...etc. point to buffer storing data read from col0, col1, col2 */
+ char *ebuf, *pbuf, *dest[2], *olddata[2];
+ RF_Etimer_t timer;
+ RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
+
+	RF_ASSERT( asmap->numDataFailed == 1 );	/* currently we only support this case; the other failed SU is not being accessed */
+ RF_ETIMER_START(timer);
+ RF_Malloc(buf, numDataCol*sizeof(char *), (char **));
+
+ ppda = node->results[0]; /* Instead of being buffers, node->results[0] and [1] are Ppda and Epda */
+ epda = node->results[1];
+ fpda = asmap->failedPDAs[0];
+
+	/* First, recover the failed old SU using EvenOdd double decoding */
+ /* determine the startSector and endSector for decoding */
+ startSector = rf_StripeUnitOffset(layoutPtr, fpda->startSector );
+ endSector = startSector + fpda->numSector;
+	/* Assign the buf[col] pointers to point to each non-failed column, and initialize pbuf
+	   and ebuf to point at the beginning of the source and destination buffers */
+ for( prm=0; prm < numDataCol-2; prm++ ) {
+ pda = (RF_PhysDiskAddr_t *)node->params[prm].p;
+ col = rf_EUCol(layoutPtr, pda->raidAddress );
+ buf[col] = pda->bufPtr;
+ }
+ /* pbuf and ebuf: they will change values as double recovery decoding goes on */
+ pbuf = ppda->bufPtr;
+ ebuf = epda->bufPtr;
+	/* find out the logical column numbers in the encoding matrix of the two failed columns */
+ fcol[0] = rf_EUCol(layoutPtr, fpda->raidAddress );
+
+	/* find out the other failed column, which is not accessed this time */
+ sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
+ for (i=0; i < numDataCol; i++) {
+ npda.raidAddress = sosAddr + (i * secPerSU);
+ (raidPtr->Layout.map->MapSector)(raidPtr, npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0);
+ /* skip over dead disks */
+ if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
+ if (i != fcol[0]) break;
+ }
+ RF_ASSERT (i < numDataCol);
+ fcol[1] = i;
+ /* assign temporary space to put recovered failed SU */
+ numbytes = fpda->numSector * bytesPerSector;
+ RF_Malloc(olddata[0], numbytes, (char *) );
+ RF_Malloc(olddata[1], numbytes, (char *) );
+ dest[0] = olddata[0];
+ dest[1] = olddata[1];
+ bzero(olddata[0], numbytes);
+ bzero(olddata[1], numbytes);
+	/* Begin the recovery decoding; initially buf[j], ebuf, pbuf and dest[j] already
+	   point at the beginning of the source and destination buffers */
+ for( sector = startSector, i=0; sector < endSector; sector++ , i++){
+ rf_doubleEOdecode(raidPtr, buf, dest, fcol, pbuf, ebuf );
+ for (j=0; j < numDataCol; j++)
+ if( ( j != fcol[0]) && ( j != fcol[1] ) ) buf[j] += bytesPerSector;
+ dest[0] += bytesPerSector;
+ dest[1] += bytesPerSector;
+ ebuf += bytesPerSector;
+ pbuf += bytesPerSector;
+ }
+	/* After recovery, the buffer pointed to by olddata[0] holds the old failed data.
+	   With the newly written data and this old data, use the small-write method to
+	   calculate the new redundancy information.
+	 */
+ /* node->params[ 0, ... PDAPerDisk * (numDataCol - 2)-1 ] are Pdas of Rrd;
+ params[ PDAPerDisk*(numDataCol - 2), ... PDAPerDisk*numDataCol -1 ] are Pdas of Rp, ( Rp2 ), Re, ( Re2 ) ;
+ params[ PDAPerDisk*numDataCol, ... PDAPerDisk*numDataCol +asmap->numStripeUnitsAccessed -asmap->numDataFailed-1]
+ are Pdas of wudNodes;
+	   For the current implementation, we assume the simplest case:
+	   asmap->numStripeUnitsAccessed == 1 and asmap->numDataFailed == 1, i.e. PDAPerDisk = 1;
+	   then node->params[numDataCol] must be the new data to be written to the failed disk.  We first bxor the new
+	   data into the old recovered data, then do the same thing as a small write.
+ */
+
+ rf_bxor( ((RF_PhysDiskAddr_t *)node->params[numDataCol].p)->bufPtr, olddata[0], numbytes, node->dagHdr->bp);
+ /* do new 'E' calculation */
+	/* find the corresponding column in the encoding matrix for the write column to be encoded into the redundant disk 'E' */
+ scol = rf_EUCol(layoutPtr, fpda->raidAddress );
+ /* olddata[0] now is source buffer pointer; epda->bufPtr is the dest buffer pointer */
+ rf_e_encToBuf(raidPtr, scol, olddata[0], RF_EO_MATRIX_DIM - 2, epda->bufPtr, fpda->numSector);
+
+ /* do new 'P' calculation */
+ rf_bxor( olddata[0], ppda->bufPtr, numbytes, node->dagHdr->bp);
+ /* Free the allocated buffer */
+ RF_Free( olddata[0], numbytes );
+ RF_Free( olddata[1], numbytes );
+ RF_Free( buf, numDataCol*sizeof(char *));
+
+ RF_ETIMER_STOP(timer);
+ RF_ETIMER_EVAL(timer);
+ if (tracerec) {
+ tracerec->q_us += RF_ETIMER_VAL_US(timer);
+ }
+
+ rf_GenericWakeupFunc(node,0);
+ return(0);
+}
diff --git a/sys/dev/raidframe/rf_evenodd_dagfuncs.h b/sys/dev/raidframe/rf_evenodd_dagfuncs.h
new file mode 100644
index 00000000000..9773e57cedd
--- /dev/null
+++ b/sys/dev/raidframe/rf_evenodd_dagfuncs.h
@@ -0,0 +1,77 @@
+/* $OpenBSD: rf_evenodd_dagfuncs.h,v 1.1 1999/01/11 14:29:22 niklas Exp $ */
+/* $NetBSD: rf_evenodd_dagfuncs.h,v 1.1 1998/11/13 04:20:29 oster Exp $ */
+/*
+ * rf_evenodd_dagfuncs.h
+ */
+/*
+ * Copyright (c) 1996 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Chang-Ming Wu
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+#ifndef _RF__RF_EVENODD_DAGFUNCS_H_
+#define _RF__RF_EVENODD_DAGFUNCS_H_
+
+extern RF_RedFuncs_t rf_EOSmallWriteEFuncs;
+extern RF_RedFuncs_t rf_EOSmallWritePFuncs;
+extern RF_RedFuncs_t rf_eoERecoveryFuncs;
+extern RF_RedFuncs_t rf_eoPRecoveryFuncs;
+
+int rf_RegularPEFunc(RF_DagNode_t *node);
+int rf_RegularONEFunc(RF_DagNode_t *node);
+int rf_SimpleONEFunc(RF_DagNode_t *node);
+void rf_RegularESubroutine(RF_DagNode_t *node, char *ebuf);
+int rf_RegularEFunc(RF_DagNode_t *node);
+void rf_DegrESubroutine(RF_DagNode_t *node, char *ebuf);
+int rf_Degraded_100_EOFunc(RF_DagNode_t *node);
+void rf_e_EncOneSect(RF_RowCol_t srcLogicCol, char *srcSecbuf,
+ RF_RowCol_t destLogicCol, char *destSecbuf, int bytesPerSector);
+void rf_e_encToBuf(RF_Raid_t *raidPtr, RF_RowCol_t srcLogicCol,
+ char *srcbuf, RF_RowCol_t destLogicCol, char *destbuf, int numSector);
+int rf_RecoveryEFunc(RF_DagNode_t *node);
+int rf_EO_DegradedWriteEFunc(RF_DagNode_t *node);
+void rf_doubleEOdecode(RF_Raid_t *raidPtr, char **rrdbuf, char **dest,
+ RF_RowCol_t *fcol, char *pbuf, char *ebuf);
+int rf_EvenOddDoubleRecoveryFunc(RF_DagNode_t *node);
+int rf_EOWriteDoubleRecoveryFunc(RF_DagNode_t *node);
+
+#define rf_EUCol(_layoutPtr_, _addr_ ) \
+( (_addr_)%( (_layoutPtr_)->dataSectorsPerStripe ) )/((_layoutPtr_)->sectorsPerStripeUnit)
+
+#define rf_EO_Mod( _int1_, _int2_ ) \
+( ((_int1_) < 0)? (((_int1_)+(_int2_))%(_int2_)) : (_int1_)%(_int2_) )
+
+#define rf_OffsetOfNextEUBoundary(_offset_, sec_per_eu) ((_offset_)/(sec_per_eu) + 1)*(sec_per_eu)
+
+#define RF_EO_MATRIX_DIM 17
+
+/*
+ * RF_EO_MATRIX_DIM should be a prime number, and "bytesPerSector" should be
+ * divisible by ( RF_EO_MATRIX_DIM - 1 ) to fully encode and utilize the space
+ * in a sector; this number could also be 17.  The latter case does not apply
+ * to disk arrays of more than 17 columns in total.
+ */
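+
+/*
+ * Worked examples of the macros above (illustrative only; the numbers are
+ * hypothetical): with sectorsPerStripeUnit = 32 and four data columns,
+ * dataSectorsPerStripe = 128, so rf_EUCol(layoutPtr, 200) = (200 % 128) / 32 = 2,
+ * i.e. the address falls in data column 2 of its stripe.  rf_EO_Mod() is a
+ * modulus that stays non-negative for negative arguments: rf_EO_Mod(-3, 17) = 14
+ * while rf_EO_Mod(5, 17) = 5.  rf_OffsetOfNextEUBoundary(7, 4) = (7/4 + 1) * 4 = 8,
+ * the first encoding-unit boundary strictly beyond offset 7.
+ */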
+
+#endif /* !_RF__RF_EVENODD_DAGFUNCS_H_ */
diff --git a/sys/dev/raidframe/rf_evenodd_dags.c b/sys/dev/raidframe/rf_evenodd_dags.c
new file mode 100644
index 00000000000..775fd5008f9
--- /dev/null
+++ b/sys/dev/raidframe/rf_evenodd_dags.c
@@ -0,0 +1,199 @@
+/* $OpenBSD: rf_evenodd_dags.c,v 1.1 1999/01/11 14:29:22 niklas Exp $ */
+/* $NetBSD: rf_evenodd_dags.c,v 1.1 1998/11/13 04:20:29 oster Exp $ */
+/*
+ * rf_evenodd_dags.c
+ */
+/*
+ * Copyright (c) 1996 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Chang-Ming Wu
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+#include "rf_archs.h"
+
+#if RF_INCLUDE_EVENODD > 0
+
+#include "rf_types.h"
+#include "rf_raid.h"
+#include "rf_dag.h"
+#include "rf_dagfuncs.h"
+#include "rf_dagutils.h"
+#include "rf_etimer.h"
+#include "rf_acctrace.h"
+#include "rf_general.h"
+#include "rf_evenodd_dags.h"
+#include "rf_evenodd.h"
+#include "rf_evenodd_dagfuncs.h"
+#include "rf_pq.h"
+#include "rf_dagdegrd.h"
+#include "rf_dagdegwr.h"
+#include "rf_dagffwr.h"
+
+
+/*
+ * Lost one data unit.
+ * Use P to reconstruct the missing data.
+ */
+RF_CREATE_DAG_FUNC_DECL(rf_EO_100_CreateReadDAG)
+{
+ rf_CreateDegradedReadDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_eoPRecoveryFuncs);
+}
+
+/*
+ * Lost a data unit + E.
+ * Use P to reconstruct the missing data.
+ */
+RF_CREATE_DAG_FUNC_DECL(rf_EO_101_CreateReadDAG)
+{
+ rf_CreateDegradedReadDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_eoPRecoveryFuncs);
+}
+
+/*
+ * Lost a data unit + P.
+ * Make E look like P, substitute the E recovery functions for the Xor
+ * functions, and we can use the degraded read DAG.
+ */
+RF_CREATE_DAG_FUNC_DECL(rf_EO_110_CreateReadDAG)
+{
+ RF_PhysDiskAddr_t *temp;
+ /* swap P and E pointers to fake out the DegradedReadDAG code */
+ temp = asmap->parityInfo; asmap->parityInfo = asmap->qInfo; asmap->qInfo = temp;
+ rf_CreateDegradedReadDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_eoERecoveryFuncs);
+}
+
+/*
+ * Lost two data units.
+ */
+RF_CREATE_DAG_FUNC_DECL(rf_EOCreateDoubleDegradedReadDAG)
+{
+ rf_EO_DoubleDegRead(raidPtr, asmap, dag_h, bp, flags, allocList);
+}
+
+/*
+ * Lost two data units.
+ */
+RF_CREATE_DAG_FUNC_DECL(rf_EO_200_CreateReadDAG)
+{
+ rf_EOCreateDoubleDegradedReadDAG(raidPtr, asmap, dag_h, bp, flags, allocList);
+}
+
+RF_CREATE_DAG_FUNC_DECL(rf_EO_100_CreateWriteDAG)
+{
+ if (asmap->numStripeUnitsAccessed != 1 &&
+ asmap->failedPDAs[0]->numSector != raidPtr->Layout.sectorsPerStripeUnit)
+ RF_PANIC();
+ rf_CommonCreateSimpleDegradedWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 2, (int (*)(RF_DagNode_t *))rf_Degraded_100_EOFunc, RF_TRUE);
+}
+
+/*
+ * E is dead. Small write.
+ */
+RF_CREATE_DAG_FUNC_DECL(rf_EO_001_CreateSmallWriteDAG)
+{
+ rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_EOSmallWritePFuncs, NULL);
+}
+
+/*
+ * E is dead. Large write.
+ */
+RF_CREATE_DAG_FUNC_DECL(rf_EO_001_CreateLargeWriteDAG)
+{
+ rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 1, rf_RegularPFunc, RF_TRUE);
+}
+
+/*
+ * P is dead. Small write.
+ * Swap E + P, use single-degraded stuff.
+ */
+RF_CREATE_DAG_FUNC_DECL(rf_EO_010_CreateSmallWriteDAG)
+{
+ RF_PhysDiskAddr_t *temp;
+ /* swap P and E pointers to fake out the DegradedReadDAG code */
+ temp = asmap->parityInfo; asmap->parityInfo = asmap->qInfo; asmap->qInfo = temp;
+ rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_EOSmallWriteEFuncs, NULL);
+}
+
+/*
+ * P is dead. Large write.
+ * Swap E + P, use single-degraded stuff.
+ */
+RF_CREATE_DAG_FUNC_DECL(rf_EO_010_CreateLargeWriteDAG)
+{
+ RF_PhysDiskAddr_t *temp;
+ /* swap P and E pointers to fake out the code */
+ temp = asmap->parityInfo; asmap->parityInfo = asmap->qInfo; asmap->qInfo = temp;
+ rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 1, rf_RegularEFunc, RF_FALSE);
+}
+
+RF_CREATE_DAG_FUNC_DECL(rf_EO_011_CreateWriteDAG)
+{
+ rf_CreateNonRedundantWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList,
+ RF_IO_TYPE_WRITE);
+}
+
+RF_CREATE_DAG_FUNC_DECL(rf_EO_110_CreateWriteDAG)
+{
+ RF_PhysDiskAddr_t *temp;
+
+ if (asmap->numStripeUnitsAccessed != 1 &&
+ asmap->failedPDAs[0]->numSector != raidPtr->Layout.sectorsPerStripeUnit)
+ {
+ RF_PANIC();
+ }
+ /* swap P and E to fake out parity code */
+ temp = asmap->parityInfo; asmap->parityInfo = asmap->qInfo; asmap->qInfo = temp;
+ rf_CommonCreateSimpleDegradedWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList,1, (int (*)(RF_DagNode_t *))rf_EO_DegradedWriteEFunc, RF_FALSE);
+ /* is the regular E func the right one to call? */
+}
+
+RF_CREATE_DAG_FUNC_DECL(rf_EO_101_CreateWriteDAG)
+{
+ if (asmap->numStripeUnitsAccessed != 1 &&
+ asmap->failedPDAs[0]->numSector != raidPtr->Layout.sectorsPerStripeUnit)
+ RF_PANIC();
+ rf_CommonCreateSimpleDegradedWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList,1, rf_RecoveryXorFunc, RF_TRUE);
+}
+
+RF_CREATE_DAG_FUNC_DECL(rf_EO_DoubleDegRead)
+{
+ rf_DoubleDegRead(raidPtr, asmap, dag_h, bp, flags, allocList,
+ "Re", "EvenOddRecovery", rf_EvenOddDoubleRecoveryFunc);
+}
+
+RF_CREATE_DAG_FUNC_DECL(rf_EOCreateSmallWriteDAG)
+{
+ rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_pFuncs, &rf_EOSmallWriteEFuncs);
+}
+
+RF_CREATE_DAG_FUNC_DECL(rf_EOCreateLargeWriteDAG)
+{
+ rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 2, rf_RegularPEFunc, RF_FALSE);
+}
+
+RF_CREATE_DAG_FUNC_DECL(rf_EO_200_CreateWriteDAG)
+{
+ rf_DoubleDegSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList, "Re", "We", "EOWrDDRecovery", rf_EOWriteDoubleRecoveryFunc);
+}
+
+#endif /* RF_INCLUDE_EVENODD > 0 */
diff --git a/sys/dev/raidframe/rf_evenodd_dags.h b/sys/dev/raidframe/rf_evenodd_dags.h
new file mode 100644
index 00000000000..3d125e8aa25
--- /dev/null
+++ b/sys/dev/raidframe/rf_evenodd_dags.h
@@ -0,0 +1,64 @@
+/* $OpenBSD: rf_evenodd_dags.h,v 1.1 1999/01/11 14:29:22 niklas Exp $ */
+/* $NetBSD: rf_evenodd_dags.h,v 1.1 1998/11/13 04:20:29 oster Exp $ */
+/*
+ * rf_evenodd_dags.h
+ */
+/*
+ * Copyright (c) 1996 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Chang-Ming Wu
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+#ifndef _RF__RF_EVENODD_DAGS_H_
+#define _RF__RF_EVENODD_DAGS_H_
+
+#include "rf_types.h"
+
+#if RF_UTILITY == 0
+#include "rf_dag.h"
+
+/* extern decl's of the failure mode EO functions.
+ * swiped from rf_pqdeg.h
+ */
+
+RF_CREATE_DAG_FUNC_DECL(rf_EO_100_CreateReadDAG);
+RF_CREATE_DAG_FUNC_DECL(rf_EO_101_CreateReadDAG);
+RF_CREATE_DAG_FUNC_DECL(rf_EO_110_CreateReadDAG);
+RF_CREATE_DAG_FUNC_DECL(rf_EO_200_CreateReadDAG);
+RF_CREATE_DAG_FUNC_DECL(rf_EOCreateDoubleDegradedReadDAG);
+RF_CREATE_DAG_FUNC_DECL(rf_EO_100_CreateWriteDAG);
+RF_CREATE_DAG_FUNC_DECL(rf_EO_010_CreateSmallWriteDAG);
+RF_CREATE_DAG_FUNC_DECL(rf_EO_001_CreateSmallWriteDAG);
+RF_CREATE_DAG_FUNC_DECL(rf_EO_010_CreateLargeWriteDAG);
+RF_CREATE_DAG_FUNC_DECL(rf_EO_001_CreateLargeWriteDAG);
+RF_CREATE_DAG_FUNC_DECL(rf_EO_011_CreateWriteDAG);
+RF_CREATE_DAG_FUNC_DECL(rf_EO_110_CreateWriteDAG);
+RF_CREATE_DAG_FUNC_DECL(rf_EO_101_CreateWriteDAG);
+RF_CREATE_DAG_FUNC_DECL(rf_EO_DoubleDegRead);
+RF_CREATE_DAG_FUNC_DECL(rf_EOCreateSmallWriteDAG);
+RF_CREATE_DAG_FUNC_DECL(rf_EOCreateLargeWriteDAG);
+RF_CREATE_DAG_FUNC_DECL(rf_EO_200_CreateWriteDAG);
+#endif /* RF_UTILITY == 0 */
+
+#endif /* !_RF__RF_EVENODD_DAGS_H_ */
diff --git a/sys/dev/raidframe/rf_fifo.c b/sys/dev/raidframe/rf_fifo.c
new file mode 100644
index 00000000000..63367aeb4ab
--- /dev/null
+++ b/sys/dev/raidframe/rf_fifo.c
@@ -0,0 +1,371 @@
+/* $OpenBSD: rf_fifo.c,v 1.1 1999/01/11 14:29:22 niklas Exp $ */
+/* $NetBSD: rf_fifo.c,v 1.1 1998/11/13 04:20:29 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/***************************************************
+ *
+ * rf_fifo.c -- prioritized fifo queue code.
+ * There are only two priority levels: hi and lo.
+ *
+ * Aug 4, 1994, adapted from raidSim version (MCH)
+ *
+ ***************************************************/
+
+/*
+ * :
+ * Log: rf_fifo.c,v
+ * Revision 1.20 1996/06/18 20:53:11 jimz
+ * fix up disk queueing (remove configure routine,
+ * add shutdown list arg to create routines)
+ *
+ * Revision 1.19 1996/06/14 00:08:21 jimz
+ * make happier in all environments
+ *
+ * Revision 1.18 1996/06/13 20:41:24 jimz
+ * add random queueing
+ *
+ * Revision 1.17 1996/06/09 02:36:46 jimz
+ * lots of little crufty cleanup- fixup whitespace
+ * issues, comment #ifdefs, improve typing in some
+ * places (esp size-related)
+ *
+ * Revision 1.16 1996/06/07 22:26:27 jimz
+ * type-ify which_ru (RF_ReconUnitNum_t)
+ *
+ * Revision 1.15 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.14 1996/06/06 01:15:02 jimz
+ * added debugging
+ *
+ * Revision 1.13 1996/05/30 23:22:16 jimz
+ * bugfixes of serialization, timing problems
+ * more cleanup
+ *
+ * Revision 1.12 1996/05/30 11:29:41 jimz
+ * Numerous bug fixes. Stripe lock release code disagreed with the taking code
+ * about when stripes should be locked (I made it consistent: no parity, no lock)
+ * There was a lot of extra serialization of I/Os which I've removed- a lot of
+ * it was to calculate values for the cache code, which is no longer with us.
+ * More types, function, macro cleanup. Added code to properly quiesce the array
+ * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
+ * before. Fixed memory allocation, freeing bugs.
+ *
+ * Revision 1.11 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.10 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.9 1995/12/12 18:10:06 jimz
+ * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT
+ * fix 80-column brain damage in comments
+ *
+ * Revision 1.8 1995/12/01 18:22:15 root
+ * added copyright info
+ *
+ * Revision 1.7 1995/11/07 15:32:16 wvcii
+ * added function FifoPeek()
+ *
+ */
+
+#include "rf_types.h"
+#include "rf_alloclist.h"
+#include "rf_stripelocks.h"
+#include "rf_layout.h"
+#include "rf_diskqueue.h"
+#include "rf_fifo.h"
+#include "rf_debugMem.h"
+#include "rf_general.h"
+#include "rf_threadid.h"
+#include "rf_options.h"
+
+#if !defined(KERNEL) && RF_INCLUDE_QUEUE_RANDOM > 0
+#include "rf_randmacros.h"
+RF_DECLARE_STATIC_RANDOM
+#endif /* !KERNEL && RF_INCLUDE_QUEUE_RANDOM > 0 */
+
+/* just malloc a header, zero it (via calloc), and return it */
+/*ARGSUSED*/
+void *rf_FifoCreate(sectPerDisk, clList, listp)
+ RF_SectorCount_t sectPerDisk;
+ RF_AllocListElem_t *clList;
+ RF_ShutdownList_t **listp;
+{
+ RF_FifoHeader_t *q;
+
+#if !defined(KERNEL) && RF_INCLUDE_QUEUE_RANDOM > 0
+ RF_INIT_STATIC_RANDOM(1);
+#endif /* !KERNEL && RF_INCLUDE_QUEUE_RANDOM > 0 */
+ RF_CallocAndAdd(q, 1, sizeof(RF_FifoHeader_t), (RF_FifoHeader_t *), clList);
+ q->hq_count = q->lq_count = 0;
+#if !defined(KERNEL) && RF_INCLUDE_QUEUE_RANDOM > 0
+ q->rval = (long)RF_STATIC_RANDOM();
+#endif /* !KERNEL && RF_INCLUDE_QUEUE_RANDOM > 0 */
+ return((void *)q);
+}
+
+void rf_FifoEnqueue(q_in, elem, priority)
+ void *q_in;
+ RF_DiskQueueData_t *elem;
+ int priority;
+{
+ RF_FifoHeader_t *q = (RF_FifoHeader_t *)q_in;
+
+ RF_ASSERT(priority == RF_IO_NORMAL_PRIORITY || priority == RF_IO_LOW_PRIORITY);
+
+ elem->next = NULL;
+ if (priority == RF_IO_NORMAL_PRIORITY) {
+ if (!q->hq_tail) {
+ RF_ASSERT(q->hq_count == 0 && q->hq_head == NULL);
+ q->hq_head = q->hq_tail = elem;
+ } else {
+ RF_ASSERT(q->hq_count != 0 && q->hq_head != NULL);
+ q->hq_tail->next = elem;
+ q->hq_tail = elem;
+ }
+ q->hq_count++;
+ }
+ else {
+ RF_ASSERT(elem->next == NULL);
+ if (rf_fifoDebug) {
+ int tid;
+ rf_get_threadid(tid);
+ printf("[%d] fifo: ENQ lopri\n", tid);
+ }
+ if (!q->lq_tail) {
+ RF_ASSERT(q->lq_count == 0 && q->lq_head == NULL);
+ q->lq_head = q->lq_tail = elem;
+ } else {
+ RF_ASSERT(q->lq_count != 0 && q->lq_head != NULL);
+ q->lq_tail->next = elem;
+ q->lq_tail = elem;
+ }
+ q->lq_count++;
+ }
+ if ((q->hq_count + q->lq_count)!= elem->queue->queueLength) {
+ printf("Queue lengths differ!: %d %d %d\n",
+ q->hq_count, q->lq_count, (int)elem->queue->queueLength);
+ printf("%d %d %d %d\n",
+ (int)elem->queue->numOutstanding,
+ (int)elem->queue->maxOutstanding,
+ (int)elem->queue->row,
+ (int)elem->queue->col);
+ }
+ RF_ASSERT((q->hq_count + q->lq_count) == elem->queue->queueLength);
+}
+
+RF_DiskQueueData_t *rf_FifoDequeue(q_in)
+ void *q_in;
+{
+ RF_FifoHeader_t *q = (RF_FifoHeader_t *) q_in;
+ RF_DiskQueueData_t *nd;
+
+ RF_ASSERT(q);
+ if (q->hq_head) {
+ RF_ASSERT(q->hq_count != 0 && q->hq_tail != NULL);
+ nd = q->hq_head; q->hq_head = q->hq_head->next;
+ if (!q->hq_head) q->hq_tail = NULL;
+ nd->next = NULL;
+ q->hq_count--;
+ } else if (q->lq_head) {
+ RF_ASSERT(q->lq_count != 0 && q->lq_tail != NULL);
+ nd = q->lq_head; q->lq_head = q->lq_head->next;
+ if (!q->lq_head) q->lq_tail = NULL;
+ nd->next = NULL;
+ q->lq_count--;
+ if (rf_fifoDebug) {
+ int tid;
+ rf_get_threadid(tid);
+ printf("[%d] fifo: DEQ lopri %lx\n", tid, (long)nd);
+ }
+ } else {
+ RF_ASSERT(q->hq_count == 0 && q->lq_count == 0 && q->hq_tail == NULL && q->lq_tail == NULL);
+ nd = NULL;
+ }
+ return(nd);
+}
+
+/* This never gets used!! No loss (I hope) if we don't include it... GO */
+#if !defined(__NetBSD__) && !defined(__OpenBSD__) && !defined(_KERNEL)
+
+static RF_DiskQueueData_t *n_in_q(headp, tailp, countp, n, deq)
+ RF_DiskQueueData_t **headp;
+ RF_DiskQueueData_t **tailp;
+ int *countp;
+ int n;
+ int deq;
+{
+ RF_DiskQueueData_t *r, *s;
+ int i;
+
+ for(s=NULL,i=n,r=*headp;r;s=r,r=r->next) {
+ if (i == 0)
+ break;
+ i--;
+ }
+ RF_ASSERT(r != NULL);
+ if (deq == 0)
+ return(r);
+ if (s) {
+ s->next = r->next;
+ }
+ else {
+ *headp = r->next;
+ }
+ if (*tailp == r)
+ *tailp = s;
+ (*countp)--;
+ return(r);
+}
+#endif
+
+#if !defined(KERNEL) && RF_INCLUDE_QUEUE_RANDOM > 0
+RF_DiskQueueData_t *rf_RandomPeek(q_in)
+ void *q_in;
+{
+ RF_FifoHeader_t *q = (RF_FifoHeader_t *) q_in;
+ RF_DiskQueueData_t *req;
+ int n;
+
+ if (q->hq_head) {
+ n = q->rval % q->hq_count;
+ req = n_in_q(&q->hq_head, &q->hq_tail, &q->hq_count, n, 0);
+ }
+ else {
+ RF_ASSERT(q->hq_count == 0);
+ if (q->lq_head == NULL) {
+ RF_ASSERT(q->lq_count == 0);
+ return(NULL);
+ }
+ n = q->rval % q->lq_count;
+ req = n_in_q(&q->lq_head, &q->lq_tail, &q->lq_count, n, 0);
+ }
+ RF_ASSERT((q->hq_count + q->lq_count) == req->queue->queueLength);
+ RF_ASSERT(req != NULL);
+ return(req);
+}
+
+RF_DiskQueueData_t *rf_RandomDequeue(q_in)
+ void *q_in;
+{
+ RF_FifoHeader_t *q = (RF_FifoHeader_t *) q_in;
+ RF_DiskQueueData_t *req;
+ int n;
+
+ if (q->hq_head) {
+ n = q->rval % q->hq_count;
+ q->rval = (long)RF_STATIC_RANDOM();
+ req = n_in_q(&q->hq_head, &q->hq_tail, &q->hq_count, n, 1);
+ }
+ else {
+ RF_ASSERT(q->hq_count == 0);
+ if (q->lq_head == NULL) {
+ RF_ASSERT(q->lq_count == 0);
+ return(NULL);
+ }
+ n = q->rval % q->lq_count;
+ q->rval = (long)RF_STATIC_RANDOM();
+ req = n_in_q(&q->lq_head, &q->lq_tail, &q->lq_count, n, 1);
+ }
+ RF_ASSERT((q->hq_count + q->lq_count) == (req->queue->queueLength-1));
+ return(req);
+}
+#endif /* !KERNEL && RF_INCLUDE_QUEUE_RANDOM > 0 */
+
+/* Return ptr to item at head of queue. Used to examine request
+ * info without actually dequeueing the request.
+ */
+RF_DiskQueueData_t *rf_FifoPeek(void *q_in)
+{
+ RF_DiskQueueData_t *headElement = NULL;
+ RF_FifoHeader_t *q = (RF_FifoHeader_t *) q_in;
+
+ RF_ASSERT(q);
+ if (q->hq_head)
+ headElement = q->hq_head;
+ else if (q->lq_head)
+ headElement = q->lq_head;
+ return(headElement);
+}
+
+/* We sometimes need to promote a low priority access to a regular priority access.
+ * Currently, this is only used when the user wants to write a stripe which is currently
+ * under reconstruction.
+ * This routine will promote all accesses tagged with the indicated parityStripeID from
+ * the low priority queue to the end of the normal priority queue.
+ * We assume the queue is locked upon entry.
+ */
+int rf_FifoPromote(q_in, parityStripeID, which_ru)
+ void *q_in;
+ RF_StripeNum_t parityStripeID;
+ RF_ReconUnitNum_t which_ru;
+{
+ RF_FifoHeader_t *q = (RF_FifoHeader_t *) q_in;
+ RF_DiskQueueData_t *lp = q->lq_head, *pt = NULL; /* lp = lo-pri queue pointer, pt = trailer */
+ int retval = 0;
+
+ while (lp) {
+
+ /* search for the indicated parity stripe in the low-pri queue */
+ if (lp->parityStripeID == parityStripeID && lp->which_ru == which_ru) {
+ /*printf("FifoPromote: promoting access for psid %ld\n",parityStripeID);*/
+ if (pt) pt->next = lp->next; /* delete an entry other than the first */
+ else q->lq_head = lp->next; /* delete the head entry */
+
+ if (!q->lq_head) q->lq_tail = NULL; /* we deleted the only entry */
+ else if (lp == q->lq_tail) q->lq_tail = pt; /* we deleted the tail entry */
+
+ lp->next = NULL;
+ q->lq_count--;
+
+ if (q->hq_tail) {q->hq_tail->next = lp; q->hq_tail = lp;} /* append to hi-priority queue */
+ else {q->hq_head = q->hq_tail = lp;}
+ q->hq_count++;
+
+ /*UpdateShortestSeekFinishTimeForced(lp->requestPtr, lp->diskState);*/ /* deal with this later, if ever */
+
+ lp = (pt) ? pt->next : q->lq_head; /* reset low-pri pointer and continue */
+ retval++;
+
+ } else {pt = lp; lp = lp->next;}
+ }
+
+ /* sanity check. delete this if you ever put more than one entry in the low-pri queue */
+ RF_ASSERT(retval == 0 || retval == 1);
+ if (rf_fifoDebug) {
+ int tid;
+ rf_get_threadid(tid);
+ printf("[%d] fifo: promote %d\n", tid, retval);
+ }
+ return(retval);
+}
diff --git a/sys/dev/raidframe/rf_fifo.h b/sys/dev/raidframe/rf_fifo.h
new file mode 100644
index 00000000000..44d2cc577f4
--- /dev/null
+++ b/sys/dev/raidframe/rf_fifo.h
@@ -0,0 +1,115 @@
+/* $OpenBSD: rf_fifo.h,v 1.1 1999/01/11 14:29:23 niklas Exp $ */
+/* $NetBSD: rf_fifo.h,v 1.1 1998/11/13 04:20:30 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*
+ * rf_fifo.h -- prioritized FIFO queue code.
+ *
+ * 4-9-93 Created (MCH)
+ */
+
+/*
+ * :
+ * Log: rf_fifo.h,v
+ * Revision 1.12 1996/06/18 20:53:11 jimz
+ * fix up disk queueing (remove configure routine,
+ * add shutdown list arg to create routines)
+ *
+ * Revision 1.11 1996/06/13 20:41:28 jimz
+ * add random queueing
+ *
+ * Revision 1.10 1996/06/13 20:38:28 jimz
+ * add random dequeue, peek
+ *
+ * Revision 1.9 1996/06/09 02:36:46 jimz
+ * lots of little crufty cleanup- fixup whitespace
+ * issues, comment #ifdefs, improve typing in some
+ * places (esp size-related)
+ *
+ * Revision 1.8 1996/06/07 22:26:27 jimz
+ * type-ify which_ru (RF_ReconUnitNum_t)
+ *
+ * Revision 1.7 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.6 1996/05/30 23:22:16 jimz
+ * bugfixes of serialization, timing problems
+ * more cleanup
+ *
+ * Revision 1.5 1996/05/30 11:29:41 jimz
+ * Numerous bug fixes. Stripe lock release code disagreed with the taking code
+ * about when stripes should be locked (I made it consistent: no parity, no lock)
+ * There was a lot of extra serialization of I/Os which I've removed- a lot of
+ * it was to calculate values for the cache code, which is no longer with us.
+ * More types, function, macro cleanup. Added code to properly quiesce the array
+ * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
+ * before. Fixed memory allocation, freeing bugs.
+ *
+ * Revision 1.4 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.3 1995/12/01 18:22:26 root
+ * added copyright info
+ *
+ * Revision 1.2 1995/11/07 15:31:57 wvcii
+ * added Peek() function
+ *
+ */
+
+#ifndef _RF__RF_FIFO_H_
+#define _RF__RF_FIFO_H_
+
+#include "rf_archs.h"
+#include "rf_types.h"
+#include "rf_diskqueue.h"
+
+typedef struct RF_FifoHeader_s {
+ RF_DiskQueueData_t *hq_head, *hq_tail; /* high priority requests */
+ RF_DiskQueueData_t *lq_head, *lq_tail; /* low priority requests */
+ int hq_count, lq_count; /* debug only */
+#if !defined(KERNEL) && RF_INCLUDE_QUEUE_RANDOM > 0
+ long rval; /* next random number (random qpolicy) */
+#endif /* !KERNEL && RF_INCLUDE_QUEUE_RANDOM > 0 */
+} RF_FifoHeader_t;
+
+extern void *rf_FifoCreate(RF_SectorCount_t sectPerDisk,
+ RF_AllocListElem_t *clList, RF_ShutdownList_t **listp);
+extern void rf_FifoEnqueue(void *q_in, RF_DiskQueueData_t *elem,
+ int priority);
+extern RF_DiskQueueData_t *rf_FifoDequeue(void *q_in);
+extern RF_DiskQueueData_t *rf_FifoPeek(void *q_in);
+extern int rf_FifoPromote(void *q_in, RF_StripeNum_t parityStripeID,
+ RF_ReconUnitNum_t which_ru);
+#if !defined(KERNEL) && RF_INCLUDE_QUEUE_RANDOM > 0
+extern RF_DiskQueueData_t *rf_RandomDequeue(void *q_in);
+extern RF_DiskQueueData_t *rf_RandomPeek(void *q_in);
+#endif /* !KERNEL && RF_INCLUDE_QUEUE_RANDOM > 0 */
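+
+/*
+ * Illustrative usage sketch.  The variable names below (sectorsPerDisk,
+ * allocList, shutdownListP, req) are hypothetical; in RAIDframe these
+ * entry points are intended to be reached through the disk queue code
+ * (rf_diskqueue.c) rather than called directly:
+ *
+ *   void *q = rf_FifoCreate(sectorsPerDisk, allocList, shutdownListP);
+ *
+ *   rf_FifoEnqueue(q, req, RF_IO_NORMAL_PRIORITY);
+ *   if (rf_FifoPeek(q) != NULL) {
+ *           RF_DiskQueueData_t *next = rf_FifoDequeue(q);
+ *           ... issue next to the disk ...
+ *   }
+ */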
+
+#endif /* !_RF__RF_FIFO_H_ */
diff --git a/sys/dev/raidframe/rf_freelist.h b/sys/dev/raidframe/rf_freelist.h
new file mode 100644
index 00000000000..8f8e4f5120d
--- /dev/null
+++ b/sys/dev/raidframe/rf_freelist.h
@@ -0,0 +1,734 @@
+/* $OpenBSD: rf_freelist.h,v 1.1 1999/01/11 14:29:23 niklas Exp $ */
+/* $NetBSD: rf_freelist.h,v 1.1 1998/11/13 04:20:30 oster Exp $ */
+/*
+ * rf_freelist.h
+ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+/*
+ * :
+ * Log: rf_freelist.h,v
+ * Revision 1.13 1996/06/10 12:50:57 jimz
+ * Add counters to freelists to track number of allocations, frees,
+ * grows, max size, etc. Adjust a couple sets of PRIME params based
+ * on the results.
+ *
+ * Revision 1.12 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.11 1996/06/05 18:06:02 jimz
+ * Major code cleanup. The Great Renaming is now done.
+ * Better modularity. Better typing. Fixed a bunch of
+ * synchronization bugs. Made a lot of global stuff
+ * per-desc or per-array. Removed dead code.
+ *
+ * Revision 1.10 1996/06/02 17:31:48 jimz
+ * Moved a lot of global stuff into array structure, where it belongs.
+ * Fixed up paritylogging, pss modules in this manner. Some general
+ * code cleanup. Removed lots of dead code, some dead files.
+ *
+ * Revision 1.9 1996/05/31 22:26:54 jimz
+ * fix a lot of mapping problems, memory allocation problems
+ * found some weird lock issues, fixed 'em
+ * more code cleanup
+ *
+ * Revision 1.8 1996/05/30 11:29:41 jimz
+ * Numerous bug fixes. Stripe lock release code disagreed with the taking code
+ * about when stripes should be locked (I made it consistent: no parity, no lock)
+ * There was a lot of extra serialization of I/Os which I've removed- a lot of
+ * it was to calculate values for the cache code, which is no longer with us.
+ * More types, function, macro cleanup. Added code to properly quiesce the array
+ * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
+ * before. Fixed memory allocation, freeing bugs.
+ *
+ * Revision 1.7 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.6 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.5 1996/05/20 16:16:12 jimz
+ * switch to rf_{mutex,cond}_{init,destroy}
+ *
+ * Revision 1.4 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.3 1996/05/16 16:04:52 jimz
+ * allow init func to fail for FREELIST ops
+ *
+ * Revision 1.2 1996/05/16 14:54:08 jimz
+ * added _INIT and _CLEAN versions of ops for objects with
+ * internal allocations
+ *
+ * Revision 1.1 1996/05/15 23:37:53 jimz
+ * Initial revision
+ *
+ */
+/*
+ * rf_freelist.h -- code to manage counted freelists
+ *
+ * Keep an arena of fixed-size objects. When a new object is needed,
+ * allocate it as necessary. When an object is freed, either put it
+ * in the arena, or really free it, depending on the maximum arena
+ * size.
+ */
+
+#ifndef _RF__RF_FREELIST_H_
+#define _RF__RF_FREELIST_H_
+
+#include "rf_types.h"
+#include "rf_debugMem.h"
+#include "rf_general.h"
+#include "rf_threadstuff.h"
+
+#define RF_FREELIST_STATS 0
+
+#if RF_FREELIST_STATS > 0
+typedef struct RF_FreeListStats_s {
+ char *file;
+ int line;
+ int allocations;
+ int frees;
+ int max_free;
+ int grows;
+ int outstanding;
+ int max_outstanding;
+} RF_FreeListStats_t;
+
+#define RF_FREELIST_STAT_INIT(_fl_) { \
+ bzero((char *)&((_fl_)->stats), sizeof(RF_FreeListStats_t)); \
+ (_fl_)->stats.file = __FILE__; \
+ (_fl_)->stats.line = __LINE__; \
+}
+
+#define RF_FREELIST_STAT_ALLOC(_fl_) { \
+ (_fl_)->stats.allocations++; \
+ (_fl_)->stats.outstanding++; \
+ if ((_fl_)->stats.outstanding > (_fl_)->stats.max_outstanding) \
+ (_fl_)->stats.max_outstanding = (_fl_)->stats.outstanding; \
+}
+
+#define RF_FREELIST_STAT_FREE_UPDATE(_fl_) { \
+ if ((_fl_)->free_cnt > (_fl_)->stats.max_free) \
+ (_fl_)->stats.max_free = (_fl_)->free_cnt; \
+}
+
+#define RF_FREELIST_STAT_FREE(_fl_) { \
+ (_fl_)->stats.frees++; \
+ (_fl_)->stats.outstanding--; \
+ RF_FREELIST_STAT_FREE_UPDATE(_fl_); \
+}
+
+#define RF_FREELIST_STAT_GROW(_fl_) { \
+ (_fl_)->stats.grows++; \
+ RF_FREELIST_STAT_FREE_UPDATE(_fl_); \
+}
+
+#define RF_FREELIST_STAT_REPORT(_fl_) { \
+ printf("Freelist at %s %d (%s)\n", (_fl_)->stats.file, (_fl_)->stats.line, RF_STRING(_fl_)); \
+ printf(" %d allocations, %d frees\n", (_fl_)->stats.allocations, (_fl_)->stats.frees); \
+ printf(" %d grows\n", (_fl_)->stats.grows); \
+ printf(" %d outstanding\n", (_fl_)->stats.outstanding); \
+ printf(" %d free (max)\n", (_fl_)->stats.max_free); \
+ printf(" %d outstanding (max)\n", (_fl_)->stats.max_outstanding); \
+}
+
+#else /* RF_FREELIST_STATS > 0 */
+
+#define RF_FREELIST_STAT_INIT(_fl_)
+#define RF_FREELIST_STAT_ALLOC(_fl_)
+#define RF_FREELIST_STAT_FREE_UPDATE(_fl_)
+#define RF_FREELIST_STAT_FREE(_fl_)
+#define RF_FREELIST_STAT_GROW(_fl_)
+#define RF_FREELIST_STAT_REPORT(_fl_)
+
+#endif /* RF_FREELIST_STATS > 0 */
+
+struct RF_FreeList_s {
+ void *objlist; /* list of free obj */
+ int free_cnt; /* how many free obj */
+ int max_free_cnt; /* max free arena size */
+ int obj_inc; /* how many to allocate at a time */
+ int obj_size; /* size of objects */
+ RF_DECLARE_MUTEX(lock)
+#if RF_FREELIST_STATS > 0
+ RF_FreeListStats_t stats; /* statistics */
+#endif /* RF_FREELIST_STATS > 0 */
+};
+
+/*
+ * fl = freelist
+ * maxcnt = max number of items in arena
+ * inc = how many to allocate at a time
+ * size = size of object
+ */
+#define RF_FREELIST_CREATE(_fl_,_maxcnt_,_inc_,_size_) { \
+ int rc; \
+ RF_ASSERT((_inc_) > 0); \
+ RF_Malloc(_fl_, sizeof(RF_FreeList_t), (RF_FreeList_t *)); \
+ (_fl_)->objlist = NULL; \
+ (_fl_)->free_cnt = 0; \
+ (_fl_)->max_free_cnt = _maxcnt_; \
+ (_fl_)->obj_inc = _inc_; \
+ (_fl_)->obj_size = _size_; \
+ rc = rf_mutex_init(&(_fl_)->lock); \
+ if (rc) { \
+ RF_Free(_fl_, sizeof(RF_FreeList_t)); \
+ _fl_ = NULL; \
+ } \
+ RF_FREELIST_STAT_INIT(_fl_); \
+}
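+
+/*
+ * Illustrative lifecycle sketch (struct foo and its fields are
+ * hypothetical, not RAIDframe types).  An object managed by a freelist
+ * only needs a pointer member to serve as its "next" link:
+ *
+ *   struct foo { struct foo *next; int payload; };
+ *   RF_FreeList_t *foo_freelist;
+ *
+ *   RF_FREELIST_CREATE(foo_freelist, 128, 8, sizeof(struct foo));
+ *   RF_FREELIST_PRIME(foo_freelist, 16, next, (struct foo *));
+ *
+ *   struct foo *fp;
+ *   RF_FREELIST_GET(foo_freelist, fp, next, (struct foo *));
+ *   ... use fp ...
+ *   RF_FREELIST_FREE(foo_freelist, fp, next);
+ *
+ *   RF_FREELIST_DESTROY(foo_freelist, next, (struct foo *));
+ */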
+
+/*
+ * fl = freelist
+ * cnt = number to prime with
+ * nextp = name of "next" pointer in obj
+ * cast = object cast
+ */
+#define RF_FREELIST_PRIME(_fl_,_cnt_,_nextp_,_cast_) { \
+ void *_p; \
+ int _i; \
+ RF_LOCK_MUTEX((_fl_)->lock); \
+ for(_i=0;_i<(_cnt_);_i++) { \
+ RF_Calloc(_p,1,(_fl_)->obj_size,(void *)); \
+ if (_p) { \
+ (_cast_(_p))->_nextp_ = (_fl_)->objlist; \
+ (_fl_)->objlist = _p; \
+ (_fl_)->free_cnt++; \
+ } \
+ else { \
+ break; \
+ } \
+ } \
+ RF_FREELIST_STAT_FREE_UPDATE(_fl_); \
+ RF_UNLOCK_MUTEX((_fl_)->lock); \
+}
+
+#define RF_FREELIST_MUTEX_OF(_fl_) ((_fl_)->lock)
+
+#define RF_FREELIST_DO_UNLOCK(_fl_) { \
+ RF_UNLOCK_MUTEX((_fl_)->lock); \
+}
+
+#define RF_FREELIST_DO_LOCK(_fl_) { \
+ RF_LOCK_MUTEX((_fl_)->lock); \
+}
+
+/*
+ * fl = freelist
+ * cnt = number to prime with
+ * nextp = name of "next" pointer in obj
+ * cast = object cast
+ * init = func to call to init obj
+ */
+#define RF_FREELIST_PRIME_INIT(_fl_,_cnt_,_nextp_,_cast_,_init_) { \
+ void *_p; \
+ int _i; \
+ RF_LOCK_MUTEX((_fl_)->lock); \
+ for(_i=0;_i<(_cnt_);_i++) { \
+ RF_Calloc(_p,1,(_fl_)->obj_size,(void *)); \
+ if (_init_ (_cast_ _p)) { \
+ RF_Free(_p,(_fl_)->obj_size); \
+ _p = NULL; \
+ } \
+ if (_p) { \
+ (_cast_(_p))->_nextp_ = (_fl_)->objlist; \
+ (_fl_)->objlist = _p; \
+ (_fl_)->free_cnt++; \
+ } \
+ else { \
+ break; \
+ } \
+ } \
+ RF_FREELIST_STAT_FREE_UPDATE(_fl_); \
+ RF_UNLOCK_MUTEX((_fl_)->lock); \
+}
+
+/*
+ * fl = freelist
+ * cnt = number to prime with
+ * nextp = name of "next" pointer in obj
+ * cast = object cast
+ * init = func to call to init obj
+ * arg = arg to init obj func
+ */
+#define RF_FREELIST_PRIME_INIT_ARG(_fl_,_cnt_,_nextp_,_cast_,_init_,_arg_) { \
+ void *_p; \
+ int _i; \
+ RF_LOCK_MUTEX((_fl_)->lock); \
+ for(_i=0;_i<(_cnt_);_i++) { \
+ RF_Calloc(_p,1,(_fl_)->obj_size,(void *)); \
+ if (_init_ (_cast_ _p,_arg_)) { \
+ RF_Free(_p,(_fl_)->obj_size); \
+ _p = NULL; \
+ } \
+ if (_p) { \
+ (_cast_(_p))->_nextp_ = (_fl_)->objlist; \
+ (_fl_)->objlist = _p; \
+ (_fl_)->free_cnt++; \
+ } \
+ else { \
+ break; \
+ } \
+ } \
+ RF_FREELIST_STAT_FREE_UPDATE(_fl_); \
+ RF_UNLOCK_MUTEX((_fl_)->lock); \
+}
+
+/*
+ * fl = freelist
+ * obj = object to allocate
+ * nextp = name of "next" pointer in obj
+ * cast = cast of obj assignment
+ * init = init obj func
+ */
+#define RF_FREELIST_GET_INIT(_fl_,_obj_,_nextp_,_cast_,_init_) { \
+ void *_p; \
+ int _i; \
+ RF_LOCK_MUTEX((_fl_)->lock); \
+ RF_ASSERT(sizeof(*(_obj_))==((_fl_)->obj_size)); \
+ if (_fl_->objlist) { \
+ _obj_ = _cast_((_fl_)->objlist); \
+ (_fl_)->objlist = (void *)((_obj_)->_nextp_); \
+ (_fl_)->free_cnt--; \
+ } \
+ else { \
+ /* \
+ * Allocate one at a time so we can free \
+ * one at a time without cleverness when arena \
+ * is full. \
+ */ \
+ RF_Calloc(_obj_,1,(_fl_)->obj_size,_cast_); \
+ if (_obj_) { \
+ if (_init_ (_obj_)) { \
+ RF_Free(_obj_,(_fl_)->obj_size); \
+ _obj_ = NULL; \
+ } \
+ else { \
+ for(_i=1;_i<(_fl_)->obj_inc;_i++) { \
+ RF_Calloc(_p,1,(_fl_)->obj_size,(void *)); \
+ if (_p) { \
+ if (_init_ (_p)) { \
+ RF_Free(_p,(_fl_)->obj_size); \
+ _p = NULL; \
+ break; \
+ } \
+ (_cast_(_p))->_nextp_ = (_fl_)->objlist; \
+ (_fl_)->objlist = _p; \
+ } \
+ else { \
+ break; \
+ } \
+ } \
+ } \
+ } \
+ RF_FREELIST_STAT_GROW(_fl_); \
+ } \
+ RF_FREELIST_STAT_ALLOC(_fl_); \
+ RF_UNLOCK_MUTEX((_fl_)->lock); \
+}
+
+/*
+ * fl = freelist
+ * obj = object to allocate
+ * nextp = name of "next" pointer in obj
+ * cast = cast of obj assignment
+ * init = init obj func
+ * arg = arg to init obj func
+ */
+#define RF_FREELIST_GET_INIT_ARG(_fl_,_obj_,_nextp_,_cast_,_init_,_arg_) { \
+ void *_p; \
+ int _i; \
+ RF_LOCK_MUTEX((_fl_)->lock); \
+ RF_ASSERT(sizeof(*(_obj_))==((_fl_)->obj_size)); \
+ if (_fl_->objlist) { \
+ _obj_ = _cast_((_fl_)->objlist); \
+ (_fl_)->objlist = (void *)((_obj_)->_nextp_); \
+ (_fl_)->free_cnt--; \
+ } \
+ else { \
+ /* \
+ * Allocate one at a time so we can free \
+ * one at a time without cleverness when arena \
+ * is full. \
+ */ \
+ RF_Calloc(_obj_,1,(_fl_)->obj_size,_cast_); \
+ if (_obj_) { \
+ if (_init_ (_obj_,_arg_)) { \
+ RF_Free(_obj_,(_fl_)->obj_size); \
+ _obj_ = NULL; \
+ } \
+ else { \
+ for(_i=1;_i<(_fl_)->obj_inc;_i++) { \
+ RF_Calloc(_p,1,(_fl_)->obj_size,(void *)); \
+ if (_p) { \
+ if (_init_ (_p,_arg_)) { \
+ RF_Free(_p,(_fl_)->obj_size); \
+ _p = NULL; \
+ break; \
+ } \
+ (_cast_(_p))->_nextp_ = (_fl_)->objlist; \
+ (_fl_)->objlist = _p; \
+ } \
+ else { \
+ break; \
+ } \
+ } \
+ } \
+ } \
+ RF_FREELIST_STAT_GROW(_fl_); \
+ } \
+ RF_FREELIST_STAT_ALLOC(_fl_); \
+ RF_UNLOCK_MUTEX((_fl_)->lock); \
+}
+
+/*
+ * fl = freelist
+ * obj = object to allocate
+ * nextp = name of "next" pointer in obj
+ * cast = cast of obj assignment
+ * init = init obj func
+ */
+#define RF_FREELIST_GET_INIT_NOUNLOCK(_fl_,_obj_,_nextp_,_cast_,_init_) { \
+ void *_p; \
+ int _i; \
+ RF_LOCK_MUTEX((_fl_)->lock); \
+ RF_ASSERT(sizeof(*(_obj_))==((_fl_)->obj_size)); \
+ if (_fl_->objlist) { \
+ _obj_ = _cast_((_fl_)->objlist); \
+ (_fl_)->objlist = (void *)((_obj_)->_nextp_); \
+ (_fl_)->free_cnt--; \
+ } \
+ else { \
+ /* \
+ * Allocate one at a time so we can free \
+ * one at a time without cleverness when arena \
+ * is full. \
+ */ \
+ RF_Calloc(_obj_,1,(_fl_)->obj_size,_cast_); \
+ if (_obj_) { \
+ if (_init_ (_obj_)) { \
+ RF_Free(_obj_,(_fl_)->obj_size); \
+ _obj_ = NULL; \
+ } \
+ else { \
+ for(_i=1;_i<(_fl_)->obj_inc;_i++) { \
+ RF_Calloc(_p,1,(_fl_)->obj_size,(void *)); \
+ if (_p) { \
+ if (_init_ (_p)) { \
+ RF_Free(_p,(_fl_)->obj_size); \
+ _p = NULL; \
+ break; \
+ } \
+ (_cast_(_p))->_nextp_ = (_fl_)->objlist; \
+ (_fl_)->objlist = _p; \
+ } \
+ else { \
+ break; \
+ } \
+ } \
+ } \
+ } \
+ RF_FREELIST_STAT_GROW(_fl_); \
+ } \
+ RF_FREELIST_STAT_ALLOC(_fl_); \
+}
+
+/*
+ * fl = freelist
+ * obj = object to allocate
+ * nextp = name of "next" pointer in obj
+ * cast = cast of obj assignment
+ */
+#define RF_FREELIST_GET(_fl_,_obj_,_nextp_,_cast_) { \
+ void *_p; \
+ int _i; \
+ RF_LOCK_MUTEX((_fl_)->lock); \
+ RF_ASSERT(sizeof(*(_obj_))==((_fl_)->obj_size)); \
+ if (_fl_->objlist) { \
+ _obj_ = _cast_((_fl_)->objlist); \
+ (_fl_)->objlist = (void *)((_obj_)->_nextp_); \
+ (_fl_)->free_cnt--; \
+ } \
+ else { \
+ /* \
+ * Allocate one at a time so we can free \
+ * one at a time without cleverness when arena \
+ * is full. \
+ */ \
+ RF_Calloc(_obj_,1,(_fl_)->obj_size,_cast_); \
+ if (_obj_) { \
+ for(_i=1;_i<(_fl_)->obj_inc;_i++) { \
+ RF_Calloc(_p,1,(_fl_)->obj_size,(void *)); \
+ if (_p) { \
+ (_cast_(_p))->_nextp_ = (_fl_)->objlist; \
+ (_fl_)->objlist = _p; \
+ } \
+ else { \
+ break; \
+ } \
+ } \
+ } \
+ RF_FREELIST_STAT_GROW(_fl_); \
+ } \
+ RF_FREELIST_STAT_ALLOC(_fl_); \
+ RF_UNLOCK_MUTEX((_fl_)->lock); \
+}
+
+/*
+ * fl = freelist
+ * obj = object to allocate
+ * nextp = name of "next" pointer in obj
+ * cast = cast of obj assignment
+ * num = num objs to return
+ */
+#define RF_FREELIST_GET_N(_fl_,_obj_,_nextp_,_cast_,_num_) { \
+ void *_p, *_l, *_f; \
+ int _i, _n; \
+ _l = _f = NULL; \
+ _n = 0; \
+ RF_LOCK_MUTEX((_fl_)->lock); \
+ RF_ASSERT(sizeof(*(_obj_))==((_fl_)->obj_size)); \
+ for(_n=0;_n<_num_;_n++) { \
+ if (_fl_->objlist) { \
+ _obj_ = _cast_((_fl_)->objlist); \
+ (_fl_)->objlist = (void *)((_obj_)->_nextp_); \
+ (_fl_)->free_cnt--; \
+ } \
+ else { \
+ /* \
+ * Allocate one at a time so we can free \
+ * one at a time without cleverness when arena \
+ * is full. \
+ */ \
+ RF_Calloc(_obj_,1,(_fl_)->obj_size,_cast_); \
+ if (_obj_) { \
+ for(_i=1;_i<(_fl_)->obj_inc;_i++) { \
+ RF_Calloc(_p,1,(_fl_)->obj_size,(void *)); \
+ if (_p) { \
+ (_cast_(_p))->_nextp_ = (_fl_)->objlist; \
+ (_fl_)->objlist = _p; \
+ } \
+ else { \
+ break; \
+ } \
+ } \
+ } \
+ RF_FREELIST_STAT_GROW(_fl_); \
+ } \
+ if (_f == NULL) \
+ _f = _obj_; \
+ if (_obj_) { \
+ (_cast_(_obj_))->_nextp_ = _l; \
+ _l = _obj_; \
+ RF_FREELIST_STAT_ALLOC(_fl_); \
+ } \
+ else { \
+ (_cast_(_f))->_nextp_ = (_fl_)->objlist; \
+ (_fl_)->objlist = _l; \
+ _n = _num_; \
+ } \
+ } \
+ RF_UNLOCK_MUTEX((_fl_)->lock); \
+}
+
+/*
+ * fl = freelist
+ * obj = object to free
+ * nextp = name of "next" pointer in obj
+ */
+#define RF_FREELIST_FREE(_fl_,_obj_,_nextp_) { \
+ RF_LOCK_MUTEX((_fl_)->lock); \
+ if ((_fl_)->free_cnt == (_fl_)->max_free_cnt) { \
+ RF_Free(_obj_,(_fl_)->obj_size); \
+ } \
+ else { \
+ RF_ASSERT((_fl_)->free_cnt < (_fl_)->max_free_cnt); \
+ (_obj_)->_nextp_ = (_fl_)->objlist; \
+ (_fl_)->objlist = (void *)(_obj_); \
+ (_fl_)->free_cnt++; \
+ } \
+ RF_FREELIST_STAT_FREE(_fl_); \
+ RF_UNLOCK_MUTEX((_fl_)->lock); \
+}
+
+/*
+ * fl = freelist
+ * obj = object to free
+ * nextp = name of "next" pointer in obj
+ * cast = object cast
+ * num = num to free (debugging)
+ */
+#define RF_FREELIST_FREE_N(_fl_,_obj_,_nextp_,_cast_,_num_) { \
+ void *_no; \
+ int _n; \
+ _n = 0; \
+ RF_LOCK_MUTEX((_fl_)->lock); \
+ while(_obj_) { \
+ _no = (_cast_(_obj_))->_nextp_; \
+ if ((_fl_)->free_cnt == (_fl_)->max_free_cnt) { \
+ RF_Free(_obj_,(_fl_)->obj_size); \
+ } \
+ else { \
+ RF_ASSERT((_fl_)->free_cnt < (_fl_)->max_free_cnt); \
+ (_obj_)->_nextp_ = (_fl_)->objlist; \
+ (_fl_)->objlist = (void *)(_obj_); \
+ (_fl_)->free_cnt++; \
+ } \
+ _n++; \
+ _obj_ = _no; \
+ RF_FREELIST_STAT_FREE(_fl_); \
+ } \
+ RF_ASSERT(_n==(_num_)); \
+ RF_UNLOCK_MUTEX((_fl_)->lock); \
+}
+
+/*
+ * fl = freelist
+ * obj = object to free
+ * nextp = name of "next" pointer in obj
+ * clean = undo for init
+ */
+#define RF_FREELIST_FREE_CLEAN(_fl_,_obj_,_nextp_,_clean_) { \
+ RF_LOCK_MUTEX((_fl_)->lock); \
+ if ((_fl_)->free_cnt == (_fl_)->max_free_cnt) { \
+ _clean_ (_obj_); \
+ RF_Free(_obj_,(_fl_)->obj_size); \
+ } \
+ else { \
+ RF_ASSERT((_fl_)->free_cnt < (_fl_)->max_free_cnt); \
+ (_obj_)->_nextp_ = (_fl_)->objlist; \
+ (_fl_)->objlist = (void *)(_obj_); \
+ (_fl_)->free_cnt++; \
+ } \
+ RF_FREELIST_STAT_FREE(_fl_); \
+ RF_UNLOCK_MUTEX((_fl_)->lock); \
+}
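+
+/*
+ * Illustrative sketch of the _INIT/_CLEAN pairing (foo_freelist,
+ * foo_init and foo_clean are hypothetical).  The init function must
+ * return nonzero on failure, in which case the freshly allocated object
+ * is freed and the GET macro yields NULL; the clean function undoes the
+ * init before an object is really freed.  Objects parked in the arena
+ * keep their initialized state, so init/clean only run on real
+ * allocation and deallocation:
+ *
+ *   struct foo *fp;
+ *   RF_FREELIST_GET_INIT(foo_freelist, fp, next, (struct foo *), foo_init);
+ *   if (fp) {
+ *           ... use fp ...
+ *           RF_FREELIST_FREE_CLEAN(foo_freelist, fp, next, foo_clean);
+ *   }
+ */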
+
+/*
+ * fl = freelist
+ * obj = object to free
+ * nextp = name of "next" pointer in obj
+ * clean = undo for init
+ * arg = arg for undo func
+ */
+#define RF_FREELIST_FREE_CLEAN_ARG(_fl_,_obj_,_nextp_,_clean_,_arg_) { \
+ RF_LOCK_MUTEX((_fl_)->lock); \
+ if ((_fl_)->free_cnt == (_fl_)->max_free_cnt) { \
+ _clean_ (_obj_,_arg_); \
+ RF_Free(_obj_,(_fl_)->obj_size); \
+ } \
+ else { \
+ RF_ASSERT((_fl_)->free_cnt < (_fl_)->max_free_cnt); \
+ (_obj_)->_nextp_ = (_fl_)->objlist; \
+ (_fl_)->objlist = (void *)(_obj_); \
+ (_fl_)->free_cnt++; \
+ } \
+ RF_FREELIST_STAT_FREE(_fl_); \
+ RF_UNLOCK_MUTEX((_fl_)->lock); \
+}
+
+/*
+ * fl = freelist
+ * obj = object to free
+ * nextp = name of "next" pointer in obj
+ * clean = undo for init
+ */
+#define RF_FREELIST_FREE_CLEAN_NOUNLOCK(_fl_,_obj_,_nextp_,_clean_) { \
+ RF_LOCK_MUTEX((_fl_)->lock); \
+ if ((_fl_)->free_cnt == (_fl_)->max_free_cnt) { \
+ _clean_ (_obj_); \
+ RF_Free(_obj_,(_fl_)->obj_size); \
+ } \
+ else { \
+ RF_ASSERT((_fl_)->free_cnt < (_fl_)->max_free_cnt); \
+ (_obj_)->_nextp_ = (_fl_)->objlist; \
+ (_fl_)->objlist = (void *)(_obj_); \
+ (_fl_)->free_cnt++; \
+ } \
+ RF_FREELIST_STAT_FREE(_fl_); \
+}
+
+/*
+ * fl = freelist
+ * nextp = name of "next" pointer in obj
+ * cast = cast to object type
+ */
+#define RF_FREELIST_DESTROY(_fl_,_nextp_,_cast_) { \
+ void *_cur, *_next; \
+ RF_FREELIST_STAT_REPORT(_fl_); \
+ rf_mutex_destroy(&((_fl_)->lock)); \
+ for(_cur=(_fl_)->objlist;_cur;_cur=_next) { \
+ _next = (_cast_ _cur)->_nextp_; \
+ RF_Free(_cur,(_fl_)->obj_size); \
+ } \
+ RF_Free(_fl_,sizeof(RF_FreeList_t)); \
+}
+
+/*
+ * fl = freelist
+ * nextp = name of "next" pointer in obj
+ * cast = cast to object type
+ * clean = func to undo obj init
+ */
+#define RF_FREELIST_DESTROY_CLEAN(_fl_,_nextp_,_cast_,_clean_) { \
+ void *_cur, *_next; \
+ RF_FREELIST_STAT_REPORT(_fl_); \
+ rf_mutex_destroy(&((_fl_)->lock)); \
+ for(_cur=(_fl_)->objlist;_cur;_cur=_next) { \
+ _next = (_cast_ _cur)->_nextp_; \
+ _clean_ (_cur); \
+ RF_Free(_cur,(_fl_)->obj_size); \
+ } \
+ RF_Free(_fl_,sizeof(RF_FreeList_t)); \
+}
+
+/*
+ * fl = freelist
+ * nextp = name of "next" pointer in obj
+ * cast = cast to object type
+ * clean = func to undo obj init
+ * arg = arg for undo func
+ */
+#define RF_FREELIST_DESTROY_CLEAN_ARG(_fl_,_nextp_,_cast_,_clean_,_arg_) { \
+ void *_cur, *_next; \
+ RF_FREELIST_STAT_REPORT(_fl_); \
+ rf_mutex_destroy(&((_fl_)->lock)); \
+ for(_cur=(_fl_)->objlist;_cur;_cur=_next) { \
+ _next = (_cast_ _cur)->_nextp_; \
+ _clean_ (_cur,_arg_); \
+ RF_Free(_cur,(_fl_)->obj_size); \
+ } \
+ RF_Free(_fl_,sizeof(RF_FreeList_t)); \
+}
+
+#endif /* !_RF__RF_FREELIST_H_ */
diff --git a/sys/dev/raidframe/rf_general.h b/sys/dev/raidframe/rf_general.h
new file mode 100644
index 00000000000..3879520133f
--- /dev/null
+++ b/sys/dev/raidframe/rf_general.h
@@ -0,0 +1,269 @@
+/* $OpenBSD: rf_general.h,v 1.1 1999/01/11 14:29:23 niklas Exp $ */
+/* $NetBSD: rf_general.h,v 1.1 1998/11/13 04:20:30 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*
+ * rf_general.h -- some general-use definitions
+ */
+
+/*
+ * :
+ * Log: rf_general.h,v
+ * Revision 1.26 1996/08/09 16:44:57 jimz
+ * sunos port
+ *
+ * Revision 1.25 1996/08/07 21:08:57 jimz
+ * get NBPG defined for IRIX
+ *
+ * Revision 1.24 1996/08/06 22:02:06 jimz
+ * include linux/user.h for linux to get NBPG
+ *
+ * Revision 1.23 1996/07/27 23:36:08 jimz
+ * Solaris port of simulator
+ *
+ * Revision 1.22 1996/06/09 02:36:46 jimz
+ * lots of little crufty cleanup- fixup whitespace
+ * issues, comment #ifdefs, improve typing in some
+ * places (esp size-related)
+ *
+ * Revision 1.21 1996/05/30 23:22:16 jimz
+ * bugfixes of serialization, timing problems
+ * more cleanup
+ *
+ * Revision 1.20 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.19 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.18 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.17 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.16 1996/05/21 18:53:13 jimz
+ * be sure that noop macros don't confuse conditionals and loops
+ *
+ * Revision 1.15 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.14 1996/05/08 21:01:24 jimz
+ * fixed up enum type names that were conflicting with other
+ * enums and function names (ie, "panic")
+ * future naming trends will be towards RF_ and rf_ for
+ * everything raidframe-related
+ *
+ * Revision 1.13 1995/12/12 18:10:06 jimz
+ * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT
+ * fix 80-column brain damage in comments
+ *
+ * Revision 1.12 1995/12/01 18:29:08 root
+ * added copyright info
+ *
+ * Revision 1.11 1995/09/19 22:59:52 jimz
+ * Add kernel macro RF_DKU_END_IO(). When DKUSAGE is not defined,
+ * this is a no-op. When it is defined, it calls dku_end_io()
+ * correctly given a raidframe unit number and a buf pointer.
+ *
+ * Revision 1.10 1995/07/03 18:13:56 holland
+ * changed kernel defn of GETTIME
+ *
+ * Revision 1.9 1995/07/02 15:07:42 holland
+ * bug fixes related to getting distributed sparing numbers
+ *
+ * Revision 1.8 1995/06/12 15:54:40 rachad
+ * Added garbege collection for log structured storage
+ *
+ * Revision 1.7 1995/06/03 19:18:16 holland
+ * changes related to kernelization: access traces
+ * changes related to distributed sparing: some bug fixes
+ *
+ * Revision 1.6 1995/05/01 13:28:00 holland
+ * parity range locks, locking disk requests, recon+parityscan in kernel, etc.
+ *
+ * Revision 1.5 1995/04/06 14:47:56 rachad
+ * merge completed
+ *
+ * Revision 1.4 1995/03/15 20:45:23 holland
+ * distr sparing changes.
+ *
+ * Revision 1.3 1995/02/03 22:31:36 holland
+ * many changes related to kernelization
+ *
+ * Revision 1.2 1994/11/29 21:37:10 danner
+ * Added divide by zero check.
+ *
+ */
+
+/*#define NOASSERT*/
+
+#ifndef _RF__RF_GENERAL_H_
+#define _RF__RF_GENERAL_H_
+
+#ifdef _KERNEL
+#define KERNEL
+#endif
+
+#if !defined(KERNEL) && !defined(NOASSERT)
+#include <assert.h>
+#endif /* !KERNEL && !NOASSERT */
+
+/* error reporting and handling */
+
+#ifndef KERNEL
+
+#define RF_ERRORMSG(s) fprintf(stderr,(s))
+#define RF_ERRORMSG1(s,a) fprintf(stderr,(s),(a))
+#define RF_ERRORMSG2(s,a,b) fprintf(stderr,(s),(a),(b))
+#define RF_ERRORMSG3(s,a,b,c) fprintf(stderr,(s),(a),(b),(c))
+#define RF_ERRORMSG4(s,a,b,c,d) fprintf(stderr,(s),(a),(b),(c),(d))
+#define RF_ERRORMSG5(s,a,b,c,d,e) fprintf(stderr,(s),(a),(b),(c),(d),(e))
+#ifndef NOASSERT
+#define RF_ASSERT(x) {assert(x);}
+#else /* !NOASSERT */
+#define RF_ASSERT(x) {/*noop*/}
+#endif /* !NOASSERT */
+#define RF_PANIC() {printf("YIKES! Something terrible happened at line %d of file %s. Use a debugger.\n",__LINE__,__FILE__); abort();}
+
+#else /* !KERNEL */
+#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL)
+#include<sys/systm.h> /* printf, sprintf, and friends */
+#endif
+#define RF_ERRORMSG(s) printf((s))
+#define RF_ERRORMSG1(s,a) printf((s),(a))
+#define RF_ERRORMSG2(s,a,b) printf((s),(a),(b))
+#define RF_ERRORMSG3(s,a,b,c) printf((s),(a),(b),(c))
+#define RF_ERRORMSG4(s,a,b,c,d) printf((s),(a),(b),(c),(d))
+#define RF_ERRORMSG5(s,a,b,c,d,e) printf((s),(a),(b),(c),(d),(e))
+#define perror(x)
+extern char rf_panicbuf[];
+#define RF_PANIC() {sprintf(rf_panicbuf,"raidframe error at line %d file %s",__LINE__,__FILE__); panic(rf_panicbuf);}
+
+#ifdef RF_ASSERT
+#undef RF_ASSERT
+#endif /* RF_ASSERT */
+#ifndef NOASSERT
+#define RF_ASSERT(_x_) { \
+ if (!(_x_)) { \
+ sprintf(rf_panicbuf, \
+ "raidframe error at line %d file %s (failed asserting %s)\n", \
+ __LINE__, __FILE__, #_x_); \
+ panic(rf_panicbuf); \
+ } \
+}
+#else /* !NOASSERT */
+#define RF_ASSERT(x) {/*noop*/}
+#endif /* !NOASSERT */
+
+#endif /* !KERNEL */
+
+/* random stuff */
+#define RF_MAX(a,b) (((a) > (b)) ? (a) : (b))
+#define RF_MIN(a,b) (((a) < (b)) ? (a) : (b))
+
+/* divide-by-zero check */
+#define RF_DB0_CHECK(a,b) ( ((b)==0) ? 0 : (a)/(b) )
+
+/* get time of day */
+#ifdef KERNEL
+#if !defined(__NetBSD__) && !defined(__OpenBSD__)
+extern struct timeval time;
+#endif /* !__NetBSD__ && !__OpenBSD__ */
+#define RF_GETTIME(_t) microtime(&(_t))
+#else /* KERNEL */
+#define RF_GETTIME(_t) gettimeofday(&(_t), NULL);
+#endif /* KERNEL */
+
+/*
+ * zero memory- not all bzero calls go through here, only
+ * those which in the kernel may have a user address
+ */
+#ifdef KERNEL
+#if !defined(__NetBSD__) && !defined(__OpenBSD__)
+#define RF_BZERO(_bp,_b,_l) if (IS_SYS_VA(_b)) bzero(_b,_l); else rf_BzeroWithRemap(_bp,_b,_l)
+#else
+
+#define RF_BZERO(_bp,_b,_l) bzero(_b,_l) /* XXX This is likely incorrect. GO*/
+#endif /* __NetBSD__ || __OpenBSD__ */
+#else /* KERNEL */
+#define RF_BZERO(_bp,_b,_l) bzero(_b,_l)
+#endif /* KERNEL */
+
+#ifdef sun
+#include <sys/param.h>
+#ifndef NBPG
+#define NBPG PAGESIZE
+#endif /* !NBPG */
+#endif /* sun */
+
+#ifdef IRIX
+#include <sys/tfp.h>
+#define NBPG _PAGESZ
+#endif /* IRIX */
+
+#ifdef LINUX
+#include <linux/user.h>
+#endif /* LINUX */
+
+#define RF_UL(x) ((unsigned long) (x))
+#define RF_PGMASK RF_UL(NBPG-1)
+#define RF_BLIP(x) (NBPG - (RF_UL(x) & RF_PGMASK)) /* bytes left in page */
+#define RF_PAGE_ALIGNED(x) ((RF_UL(x) & RF_PGMASK) == 0)
+
+#ifdef KERNEL
+#if !defined(__NetBSD__) && !defined(__OpenBSD__)
+#include <dkusage.h>
+#endif
+#if DKUSAGE > 0
+#define RF_DKU_END_IO(_unit_,_bp_) { \
+ int s = splbio(); \
+ dku_end_io(DKU_RAIDFRAME_BUS, _unit_, 0, \
+		((((_bp_)->b_flags & (B_READ|B_WRITE)) == B_READ) ? \
+ CAM_DIR_IN : CAM_DIR_OUT), \
+ (_bp_)->b_bcount); \
+ splx(s); \
+}
+#else /* DKUSAGE > 0 */
+#define RF_DKU_END_IO(_unit_,_bp_) { /* noop */ }
+#endif /* DKUSAGE > 0 */
+#endif /* KERNEL */
+
+#ifdef __STDC__
+#define RF_STRING(_str_) #_str_
+#else /* __STDC__ */
+#define RF_STRING(_str_) "_str_"
+#endif /* __STDC__ */
+
+#endif /* !_RF__RF_GENERAL_H_ */
diff --git a/sys/dev/raidframe/rf_geniq.c b/sys/dev/raidframe/rf_geniq.c
new file mode 100644
index 00000000000..bfe55cb87d2
--- /dev/null
+++ b/sys/dev/raidframe/rf_geniq.c
@@ -0,0 +1,199 @@
+/* $NetBSD: rf_geniq.c,v 1.1 1998/11/13 04:20:30 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Daniel Stodolsky
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* rf_geniq.c
+ * code which implements Reed-Solomon encoding for RAID level 6
+ */
+
+/* :
+ * Log: rf_geniq.c,v
+ * Revision 1.12 1996/07/29 16:37:00 jimz
+ * remove archs.h include to avoid VPATH problems in kernel
+ * rf_invertq.c now must include archs.h before invertq.h
+ *
+ * Revision 1.11 1996/07/29 15:04:16 jimz
+ * correct rf_archs.h path for kernel
+ *
+ * Revision 1.10 1996/07/27 23:36:08 jimz
+ * Solaris port of simulator
+ *
+ * Revision 1.9 1996/07/18 22:57:14 jimz
+ * port simulator to AIX
+ *
+ * Revision 1.8 1996/07/15 17:22:18 jimz
+ * nit-pick code cleanup
+ * resolve stdlib problems on DEC OSF
+ *
+ * Revision 1.7 1996/06/09 02:36:46 jimz
+ * lots of little crufty cleanup- fixup whitespace
+ * issues, comment #ifdefs, improve typing in some
+ * places (esp size-related)
+ *
+ * Revision 1.6 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.5 1995/12/01 18:29:18 root
+ * added copyright info
+ *
+ */
+
+#define RF_UTILITY 1
+#include "rf_pqdeg.h"
+
+/*
+ * Shift a five-bit LFSR one step.
+ *
+ * val  - current register value (only the low five bits are used)
+ * poly - feedback connections: if bit i of poly is set, the bit
+ *        shifting into stage i is XORed with the register's high
+ *        (output) bit
+ */
+int lsfr_shift(val,poly)
+unsigned val, poly;
+{
+ unsigned new;
+ unsigned int i;
+ unsigned high = (val >> 4) & 1;
+ unsigned bit;
+
+ new = (poly & 1) ? high : 0;
+
+ for (i=1; i <=4; i++)
+ {
+ bit = (val >> (i-1)) & 1;
+ if (poly & (1<<i)) /* there is a feedback connection */
+ new = new | ((bit ^ high)<<i);
+ else
+ new = new | (bit << i);
+ }
+ return new;
+}
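+
+/*
+ * Example, computed from the routine above: with poly == 5 (feedback
+ * taps at bits 0 and 2, i.e. the primitive polynomial x^5 + x^2 + 1),
+ * repeated shifts starting from val == 1 produce 1, 2, 4, 8, 16, 5, ...
+ * and cycle through all 31 nonzero five-bit values before returning to
+ * 1.  This is the sequence used to fill rf_rn[] below.
+ */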
+
+/* generate Q matrices for the data */
+
+RF_ua32_t rf_qfor[32];
+
+void main()
+{
+ unsigned int i,j,l,a,b;
+ unsigned int val;
+ unsigned int r;
+ unsigned int m,p,q;
+
+ RF_ua32_t k;
+
+ printf("/*\n");
+ printf(" * rf_invertq.h\n");
+ printf(" */\n");
+ printf("/*\n");
+ printf(" * GENERATED FILE -- DO NOT EDIT\n");
+ printf(" */\n");
+ printf("\n");
+ printf("#ifndef _RF__RF_INVERTQ_H_\n");
+ printf("#define _RF__RF_INVERTQ_H_\n");
+ printf("\n");
+ printf("/*\n");
+ printf(" * rf_geniq.c must include rf_archs.h before including\n");
+ printf(" * this file (to get VPATH magic right with the way we\n");
+ printf(" * generate this file in kernel trees)\n");
+ printf(" */\n");
+ printf("/* #include \"rf_archs.h\" */\n");
+ printf("\n");
+ printf("#if (RF_INCLUDE_PQ > 0) || (RF_INCLUDE_RAID6 > 0)\n");
+ printf("\n");
+ printf("#define RF_Q_COLS 32\n");
+ printf("RF_ua32_t rf_rn = {\n");
+ k[0] = 1;
+ for (j=0 ; j < 31; j++)
+ k[j+1] = lsfr_shift(k[j],5);
+ for (j=0; j < 32; j++)
+ printf("%d, ",k[j]);
+ printf("};\n");
+
+ printf("RF_ua32_t rf_qfor[32] = {\n");
+ for (i=0; i < 32; i++)
+ {
+ printf("/* i = %d */ { 0, ",i);
+ rf_qfor[i][0] = 0;
+ for (j=1; j < 32; j++)
+ {
+ val = j;
+ for (l=0; l < i; l++)
+ val = lsfr_shift(val,5);
+ rf_qfor[i][j] = val;
+ printf("%d, ",val);
+ }
+ printf("},\n");
+ }
+ printf("};\n");
+ printf("#define RF_Q_DATA_COL(col_num) rf_rn[col_num],rf_qfor[28-(col_num)]\n");
+
+ /* generate the inverse tables. (i,j,p,q) */
+ /* The table just stores a. Get b back from
+ the parity */
+ printf("#ifdef KERNEL\n");
+ printf("RF_ua1024_t rf_qinv[1]; /* don't compile monster table into kernel */\n");
+ printf("#elif defined(NO_PQ)\n");
+ printf("RF_ua1024_t rf_qinv[29*29];\n");
+ printf("#else /* !KERNEL && NO_PQ */\n");
+ printf("RF_ua1024_t rf_qinv[29*29] = {\n");
+ for (i=0; i < 29; i++)
+ {
+ for (j =0; j < 29; j++)
+ {
+ printf("/* i %d, j %d */{ ",i,j);
+ if (i==j)
+ for (l=0; l < 1023; l++) printf("0, ");
+ else
+ {
+ for (p=0; p < 32; p++)
+ for (q=0; q < 32; q++)
+ {
+ /* What are a, b such that
+ a ^ b = p; and
+ qfor[(28-i)][a ^ rf_rn[i+1]] ^ qfor[(28-j)][b ^ rf_rn[j+1]] = q.
+ Solve by guessing a. Then testing.
+ */
+ for ( a =0 ; a < 32; a++ )
+ {
+ b = a ^ p;
+ if ( (rf_qfor[28-i][a^ k[i+1]] ^ rf_qfor[28-j][b ^ k[j+1]]) == q )
+ break;
+ }
+ if (a == 32) printf("unable to solve %d %d %d %d\n",i,j,p,q);
+ printf("%d,",a);
+ }
+ }
+ printf("},\n");
+ }
+ }
+ printf("};\n");
+ printf("\n#endif /* (RF_INCLUDE_PQ > 0) || (RF_INCLUDE_RAID6 > 0) */\n\n");
+ printf("#endif /* !KERNEL && NO_PQ */\n");
+ printf("#endif /* !_RF__RF_INVERTQ_H_ */\n");
+ exit(0);
+}
diff --git a/sys/dev/raidframe/rf_geometry.c b/sys/dev/raidframe/rf_geometry.c
new file mode 100644
index 00000000000..15da7cdda8e
--- /dev/null
+++ b/sys/dev/raidframe/rf_geometry.c
@@ -0,0 +1,891 @@
+/* $OpenBSD: rf_geometry.c,v 1.1 1999/01/11 14:29:24 niklas Exp $ */
+/* $NetBSD: rf_geometry.c,v 1.1 1998/11/13 04:20:30 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*
+ * Changes:
+ * 10/24/91 Changes to support disk bus contention model
+ * (MCH) 1. Added media_done_time param to Access_time()
+ *
+ * 08/18/92 Geometry routines have been modified to support zone-bit
+ * recording.
+ * (AS) 1. Each routine which originally referenced the variable
+ * 'disk->geom->sectors_per_track' has been modified,
+ * since the number of sectors per track varies on disks
+ * with zone-bit recording.
+ */
+
+/* :
+ * Log: rf_geometry.c,v
+ * Revision 1.18 1996/08/11 00:40:57 jimz
+ * fix up broken comment
+ *
+ * Revision 1.17 1996/07/28 20:31:39 jimz
+ * i386netbsd port
+ * true/false fixup
+ *
+ * Revision 1.16 1996/07/18 22:57:14 jimz
+ * port simulator to AIX
+ *
+ * Revision 1.15 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.14 1996/06/05 18:06:02 jimz
+ * Major code cleanup. The Great Renaming is now done.
+ * Better modularity. Better typing. Fixed a bunch of
+ * synchronization bugs. Made a lot of global stuff
+ * per-desc or per-array. Removed dead code.
+ *
+ * Revision 1.13 1996/05/30 23:22:16 jimz
+ * bugfixes of serialization, timing problems
+ * more cleanup
+ *
+ * Revision 1.12 1996/05/30 11:29:41 jimz
+ * Numerous bug fixes. Stripe lock release code disagreed with the taking code
+ * about when stripes should be locked (I made it consistent: no parity, no lock)
+ * There was a lot of extra serialization of I/Os which I've removed- a lot of
+ * it was to calculate values for the cache code, which is no longer with us.
+ * More types, function, macro cleanup. Added code to properly quiesce the array
+ * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
+ * before. Fixed memory allocation, freeing bugs.
+ *
+ * Revision 1.11 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.10 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.9 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.8 1995/12/12 18:10:06 jimz
+ * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT
+ * fix 80-column brain damage in comments
+ *
+ * Revision 1.7 1995/12/01 18:29:34 root
+ * added copyright info
+ *
+ */
+
+#include "rf_types.h"
+#include "rf_geometry.h"
+#include "rf_raid.h"
+#include "rf_general.h"
+#include "rf_debugMem.h"
+
+#define DISK_DB "disk_db"
+#define DISK_NAME "HP2247"
+
+#define ABS_DIFF(a,b) ( ((a)>(b)) ? ((a)-(b)) : ((b)-(a)) )
+
+static RF_GeometryList_t *geom_list = (RF_GeometryList_t *) NULL;
+
+RF_TICS_t rf_globalSpinup = 1.5;
+
+#define NM_LGTH 80
+#define NM_PATN   " %79s"	/* leave room for the NUL terminator in name[NM_LGTH] */
+
+static RF_GeometryList_t *Fetch_geometry_db(FILE *fd);
+static void Format_disk(RF_DiskState_t *disk, long sectors_per_block);
+static long Find_cyl(RF_SectorNum_t block, RF_DiskState_t *disk);
+static long Find_track(RF_SectorNum_t block, RF_DiskState_t *disk);
+static long Find_phys_sector(RF_SectorNum_t block, RF_DiskState_t *disk);
+static RF_TICS_t Delay_to(RF_TICS_t cur_time, RF_SectorNum_t block,
+ RF_DiskState_t *disk);
+static RF_TICS_t Seek_time(long to_cyl, long to_track, long from_cyl,
+ long from_track, RF_DiskState_t *disk);
+static RF_TICS_t Seek(RF_TICS_t cur_time, RF_SectorNum_t block,
+ RF_DiskState_t *disk, long update);
+static RF_TICS_t Rotate(RF_TICS_t cur_time, RF_SectorNum_t block,
+ RF_DiskState_t *disk, long update);
+static RF_TICS_t Seek_Rotate(RF_TICS_t cur_time, RF_SectorNum_t block,
+ RF_DiskState_t *disk, long update);
+static RF_TICS_t GAP(long sec_per_track, RF_DiskState_t *disk);
+static RF_TICS_t Block_access_time(RF_TICS_t cur_time, RF_SectorNum_t block,
+ RF_SectorCount_t numblocks, RF_DiskState_t *disk, long update);
+static void Zero_stats(RF_DiskState_t *disk);
+static RF_TICS_t Update_stats(RF_TICS_t cur_time, RF_TICS_t seek, RF_TICS_t rotate,
+ RF_TICS_t transfer, RF_DiskState_t *disk);
+static void rf_DiskParam(long numCyls, RF_TICS_t minSeek, RF_TICS_t avgSeek, RF_TICS_t maxSeek,
+ RF_TICS_t *a, RF_TICS_t *b, RF_TICS_t *c);
+
+static RF_GeometryList_t *Fetch_geometry_db(fd)
+ FILE *fd;
+{
+ long ret, lineno;
+ char name[NM_LGTH], title[20];
+ RF_GeometryList_t * list = (RF_GeometryList_t *) NULL,
+ ** next_ptr = & list;
+
+ if( RF_MAX_DISKNAME_LEN<NM_LGTH ) RF_PANIC();
+ lineno = 0;
+	while( (ret = fscanf( fd, " %19s", title )) != EOF ) {
+ float tmp_f1, tmp_f2, tmp_f3, tmp_f4;
+ float tmp_f5=0.0;
+ float tmp_f6=0.0;
+ RF_Geometry_t *g;
+ long i, x, y, z, num_cylinders;
+ RF_ZoneList_t ** znext_ptr;
+
+ if( ret == 1 && strncmp( "enddisk", title, 8 ) == 0 ) break;
+
+ RF_Calloc(*next_ptr, 1, sizeof(RF_GeometryList_t), (RF_GeometryList_t *));
+ (*next_ptr)->next = (RF_GeometryList_t *) NULL;
+ RF_Calloc(g, 1, sizeof(RF_Geometry_t), (RF_Geometry_t *));
+ (*next_ptr)->disk = g;
+ next_ptr = &( (*next_ptr)->next ); /*prep for next iteration */
+ lineno++;
+ if (fscanf( fd, NM_PATN, name ) != 1) {
+ fprintf(stderr,"Disk DB Error: Can't get disk name from disk db\n");
+		fprintf(stderr,"lineno=%ld\n", lineno);
+ fprintf(stderr,"name=\"%s\"\n", name);
+ exit(1);
+ }
+ lineno++;
+ if ( (fscanf(fd, " tracks per cylinder %ld", &(g->tracks_per_cyl)) != 1) || g->tracks_per_cyl <= 0) {
+ fprintf(stderr,"Disk DB Error: Missing or invalid tracks/cyl for disk %s\n", name); exit(1);
+ }
+ lineno++;
+ if ( (fscanf(fd, " number of disk zones %ld", &(g->num_zones)) != 1) || g->num_zones <= 0) {
+ fprintf(stderr,"Disk DB Error: Missing or invalid number of zones for disk %s\n", name); exit(1);
+ }
+
+
+
+ /* This section of code creates the linked list which
+ contains the disk's zone information. */
+ g->zbr_data = (RF_ZoneList_t *) NULL;
+ znext_ptr = &(g->zbr_data);
+ num_cylinders = 0;
+
+ /* This for-loop reads in the cylinder count, the sectors
+ per track, and track skew for each zone on the disk. */
+ for (i=1; i <= g->num_zones; i++) {
+ lineno++;
+ if ( (fscanf(fd, " number of cylinders in zone %ld", &x) != 1) || x < 1) {
+ fprintf(stderr,"Disk DB Error: Zone %ld: Missing or invalid cyls/zone for disk %s\n", i, name); exit(1);
+ }
+ lineno++;
+ if ( (fscanf(fd, " sectors per track in zone %ld", &y) != 1) || y < 1 ) {
+ fprintf(stderr,"Disk DB Error: Zone %ld: Missing or invalid sectors/track for disk %s\n", i, name); exit(1);
+ }
+ lineno++;
+ if ( (fscanf(fd, " track skew in zone %ld", &z) != 1) || z < 0 ) {
+ fprintf(stderr,"Disk DB Error: Zone %ld: Missing or invalid track skew for disk %s\n",i, name); exit(1);
+ }
+
+ RF_Calloc(*znext_ptr, 1, sizeof(RF_ZoneList_t), (RF_ZoneList_t *));
+ (*znext_ptr)->next = (RF_ZoneList_t *) NULL;
+ (*znext_ptr)->zone.num_cylinders = x;
+ (*znext_ptr)->zone.sec_per_track = y;
+ (*znext_ptr)->zone.track_skew = z;
+ (*znext_ptr)->zone.num_sectors =
+ (*znext_ptr)->zone.num_cylinders *
+ g->tracks_per_cyl *
+ (*znext_ptr)->zone.sec_per_track;
+ znext_ptr = &((*znext_ptr)->next);
+ num_cylinders = num_cylinders + x;
+ } /* End of for-loop */
+
+ lineno++;
+ if ( (fscanf(fd, " revolution time %f", &tmp_f1) != 1) || tmp_f1 <= 0) {
+ fprintf(stderr,"Disk DB Error: Missing or invalid revolution time for disk %s\n",name); exit(1);
+ }
+ lineno++;
+ if ( (fscanf(fd, " 1 cylinder seek time %f", &tmp_f2 ) != 1) || tmp_f2 <= 0) {
+ fprintf(stderr,"Disk DB Error: Missing or invalid 1-cyl seek time for disk %s\n",name); exit(1);
+ }
+ lineno++;
+ if ( (fscanf(fd, " max stroke seek time %f", &tmp_f3) != 1) || tmp_f3 <= 0) {
+ fprintf(stderr,"Disk DB Error: Missing or invalid max seek time for disk %s\n",name); exit(1);
+ }
+ lineno++;
+ if ( (fscanf(fd, " average seek time %f", &tmp_f4) != 1) || tmp_f4 <= 0) {
+ fprintf(stderr,"Disk DB Error: Missing or invalid avg seek time for disk %s\n",name); exit(1);
+ }
+ lineno++;
+	if ( (fscanf(fd, " time to sleep %f", &tmp_f5) != 1) || tmp_f5 <= 0) {
+ fprintf(stderr,"Disk DB Error: Missing or invalid time to sleep for disk %s\n",name); exit(1);
+ }
+ lineno++;
+	if ( (fscanf(fd, " time to spinup %f", &tmp_f6) != 1) || tmp_f6 <= 0) {
+		fprintf(stderr,"Disk DB Error: Missing or invalid time to spinup for disk %s\n",name); exit(1);
+ }
+ strcpy( g->disk_name, name );
+ g->revolution_time = tmp_f1;
+ g->seek_one_cyl = tmp_f2;
+ g->seek_max_stroke = tmp_f3;
+ g->seek_avg = tmp_f4;
+ g->time_to_sleep = tmp_f5;
+ g->time_to_spinup = tmp_f6;
+ /* convert disk specs to seek equation coeff */
+ rf_DiskParam( num_cylinders, g->seek_one_cyl,
+ g->seek_avg, g->seek_max_stroke,
+ &g->seek_sqrt_coeff, &g->seek_linear_coeff,
+ &g->seek_constant_coeff );
+ }
+ return( list );
+}
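+
+/*
+ * Illustrative sketch of one disk entry in the database file parsed
+ * above.  The numeric values and the leading keyword are hypothetical;
+ * only the literal field labels and their order are fixed by the fscanf
+ * calls.  The keyword introducing an entry is not checked, and a lone
+ * "enddisk" token terminates the whole database:
+ *
+ *   disk
+ *   HP2247
+ *   tracks per cylinder 13
+ *   number of disk zones 2
+ *   number of cylinders in zone 1000
+ *   sectors per track in zone 56
+ *   track skew in zone 2
+ *   number of cylinders in zone 981
+ *   sectors per track in zone 48
+ *   track skew in zone 2
+ *   revolution time 11.12
+ *   1 cylinder seek time 2.5
+ *   max stroke seek time 22.0
+ *   average seek time 10.0
+ *   time to sleep 1000.0
+ *   time to spinup 5000.0
+ *   enddisk
+ */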
+
+static void Format_disk(disk, sectors_per_block)
+ RF_DiskState_t *disk;
+ long sectors_per_block;
+{
+ long sector_count = 0;
+ RF_ZoneList_t *z;
+
+ if( disk == (RF_DiskState_t *) NULL ) RF_PANIC();
+ if( disk->geom == (RF_Geometry_t *) NULL ) RF_PANIC();
+ if( sectors_per_block <=0 ) RF_PANIC();
+
+ disk->sectors_per_block = sectors_per_block;
+ z = disk->geom->zbr_data;
+ /* This while-loop visits each disk zone and computes the total
+ number of sectors on the disk. */
+ while (z != (RF_ZoneList_t *) NULL) {
+ sector_count = sector_count + (z->zone.num_cylinders *
+ disk->geom->tracks_per_cyl *
+ z->zone.sec_per_track);
+ z = z->next;
+ }
+
+ disk->last_block_index = (sector_count / sectors_per_block) - 1;
+}
+
+void rf_InitDisk( disk, disk_db, disk_name, init_cyl, init_track, init_offset, row, col)
+ RF_DiskState_t *disk;
+ char *disk_db;
+ char *disk_name;
+ long init_cyl;
+ long init_track;
+ RF_TICS_t init_offset;
+ int row;
+ int col;
+{
+ RF_GeometryList_t *gp;
+ FILE *f;
+
+ RF_ASSERT( disk != (RF_DiskState_t *) NULL );
+
+ disk->cur_cyl = init_cyl;
+ disk->cur_track = init_track;
+ disk->index_offset = init_offset;
+ disk->geom = (RF_Geometry_t *) NULL;
+ disk->queueFinishTime = 0.0;
+ disk->lastBlock = 0;
+ disk->row=row;
+ disk->col=col;
+ Zero_stats(disk);
+
+ if (strncmp(disk_name,"/dev",4 )==0) strcpy(disk_name,"HP2247");
+
+ if( geom_list == (RF_GeometryList_t *) NULL ) {
+ f = fopen(disk_db,"r");
+ if (f == NULL) {
+ fprintf(stderr, "ERROR: RAIDframe could not open disk db %s\n", disk_db);
+ exit(1);
+ }
+ geom_list = Fetch_geometry_db( f );
+ fclose( f );
+ }
+ for( gp = geom_list; gp != (RF_GeometryList_t *) NULL; gp = gp->next ) {
+ RF_ASSERT( gp->disk != (RF_Geometry_t *) NULL
+ && gp->disk->disk_name != (char *) NULL );
+ if( strncmp( disk_name, gp->disk->disk_name, RF_MAX_DISKNAME_LEN )
+ == 0 ) {
+ disk->geom = gp->disk;
+ break;
+ }
+ }
+ if( disk->geom == (RF_Geometry_t *) NULL ) {
+ fprintf( stderr, "Disk %s not found in database %s\n",
+ disk_name, disk_db );
+ exit(1);
+ }
+
+ Format_disk( disk, 1 );
+}
+
+static long Find_cyl( block, disk )
+ RF_SectorNum_t block;
+ RF_DiskState_t *disk;
+{
+ RF_ZoneList_t * z;
+ long tmp;
+
+ long log_sector = block * disk->sectors_per_block;
+ long cylinder = 0;
+ z = disk->geom->zbr_data;
+ /* This while-loop finds the zone to which log_sector belongs,
+ computes the starting cylinder number of this zone, and
+ computes the sector offset into this zone. */
+ while (log_sector >= z->zone.num_sectors) {
+ log_sector = log_sector - z->zone.num_sectors;
+ cylinder = cylinder + z->zone.num_cylinders;
+ z = z->next;
+ }
+
+ /* The cylinder to which log_sector belongs equals the starting
+ cylinder number of its zone plus the cylinder offset into
+ the zone. */
+ tmp = cylinder + (log_sector / (z->zone.sec_per_track *
+ disk->geom->tracks_per_cyl));
+
+ return( tmp );
+}
+
+static long Find_track( block, disk )
+ RF_SectorNum_t block;
+ RF_DiskState_t *disk;
+{
+ RF_ZoneList_t * z;
+ long tmp;
+
+ long log_sector = block * disk->sectors_per_block;
+ long track = 0;
+ z = disk->geom->zbr_data;
+ /* This while-loop finds the zone to which log_sector belongs,
+ computes the starting track number of this zone, and computes
+ the sector offset into this zone. */
+ while (log_sector >= z->zone.num_sectors) {
+ log_sector = log_sector - z->zone.num_sectors;
+ track = track + (z->zone.num_cylinders *
+ disk->geom->tracks_per_cyl);
+ z = z->next;
+ }
+
+ /* The track to which log_sector belongs equals the starting
+ track number of its zone plus the track offset into the zone,
+ modulo the number of tracks per cylinder on the disk. */
+ tmp = (track + (log_sector / z->zone.sec_per_track)) %
+ disk->geom->tracks_per_cyl;
+
+ return( tmp );
+}
+
+/*
+ ** The position of a logical sector relative to the index mark on any track
+ ** is not simple. A simple organization would be:
+**
+** track 0 : 0, 1, 2, 3, ... N-1
+** track 1 : N,N+1,N+2,N+3, ... 2N-1
+** ^
+** Index mark just before this point
+**
+** This is not good because sequential access of sectors N-1 then N
+** will require a full revolution in between (because track switch requires
+** a couple of sectors to recalibrate from embedded servo). So frequently
+** sequentially numbered sectors are physically skewed so that the next
+** accessible sector after N-1 will be N (with a skew of 2)
+**
+** track 0 : 0, 1, 2, 3, ... N-1
+** track 1 : 2N-2,2N-1, N, N+1, ... 2N-3
+** ^
+** Index mark just before this point
+**
+** Layout gets even more complex with cylinder boundaries. Seek time
+** is A + B*M where M is the number of cylinders to seek over. On a sequential
+** access that crosses a cylinder boundary, the disk will rotate for
+** A+B seconds, then "track skew" sectors (inter-sector gaps actually)
+** before it can access another sector, so the cylinder to cylinder skew
+** is "track skew" + CEIL( sectors_per_track*(A+B)/revolution_time ).
+**
+** So if sector 0 is 0 sectors from the index mark on the first track,
+** where is sector X relative to the index mark on its track?
+**
+** ( ( X % sectors_per_track ) basic relative position **
+** + track_skew * ( X / sectors_per_track ) skewed for each track **
+** + CEIL( sectors_per_track*(A+B)/revolution_time )
+** * ( X / sectors_per_cylinder ) skewed more for each cyl **
+** ) % sectors_per_track wrapped around in the track **
+**
+**
+*/
+
+static long Find_phys_sector(block, disk)
+ RF_SectorNum_t block;
+ RF_DiskState_t *disk;
+{
+ long phys = 0;
+ RF_ZoneList_t * z;
+ long previous_spt = 1;
+ long sector = block * disk->sectors_per_block;
+
+ z = disk->geom->zbr_data;
+ /* This while-loop finds the zone to which sector belongs,
+ and computes the physical sector up to that zone. */
+ while (sector >= z->zone.num_sectors) {
+ sector = sector - z->zone.num_sectors;
+ /* By first multiplying 'phys' by the sectors per track in
+ the current zone divided by the sectors per track in the
+ previous zone, we convert a given physical sector in one
+ zone to an equivalent physical sector in another zone. */
+ phys = ((phys * z->zone.sec_per_track / previous_spt) +
+ (((z->zone.num_sectors - 1) % z->zone.sec_per_track) +
+ (z->zone.track_skew * z->zone.num_cylinders *
+ disk->geom->tracks_per_cyl) +
+ (long) ceil( (double) z->zone.sec_per_track *
+ (disk->geom->seek_constant_coeff) /
+ disk->geom->revolution_time) *
+ z->zone.num_cylinders)) %
+ z->zone.sec_per_track;
+ previous_spt = z->zone.sec_per_track;
+ z = z->next;
+ }
+
+ /* The final physical sector equals the physical sector up to
+ the particular zone, plus the physical sector caused by the
+ sector offset into this zone. */
+ phys = ((phys * z->zone.sec_per_track / previous_spt) +
+ ((sector % z->zone.sec_per_track) +
+ (z->zone.track_skew * (sector / z->zone.sec_per_track)) +
+ (long) ceil( (RF_TICS_t) z->zone.sec_per_track *
+ (disk->geom->seek_constant_coeff) /
+ disk->geom->revolution_time) *
+ (sector / (z->zone.sec_per_track *
+ disk->geom->tracks_per_cyl)))) %
+ z->zone.sec_per_track;
+
+
+ return( phys );
+}
+
+/*
+ ** When each disk starts up, its index mark is a fraction (f) of a rotation
+ ** ahead from its heads (in the direction of rotation). The sector
+ ** under its heads is at a fraction f of a rotation from the index
+ ** mark. After time T has passed, T/rotation_time revolutions have occurred, so
+ ** the sector under the heads is at a fraction FRAC(f+T/rotation_time) of a
+ ** rotation from the index mark. If the target block is at physical sector
+ ** X relative to its index mark, then it is at fraction (X/sectors_per_track),
+ ** so the rotational delay is
+ ** ((X/sectors_per_track)-FRAC(f+T/rotation_time)) * revolution_time
+ ** if this is positive, otherwise it is
+ ** (1+(X/sectors_per_track)-FRAC(f+T/rotation_time)) * revolution_time
+ */
+
+#define FRAC(a) ( (a) - (long) floor(a) )
+
+static RF_TICS_t Delay_to(cur_time, block, disk)
+ RF_TICS_t cur_time;
+ RF_SectorNum_t block;
+ RF_DiskState_t *disk;
+{
+ RF_TICS_t tmp;
+ RF_ZoneList_t *z;
+
+ long sector = block * disk->sectors_per_block;
+ z = disk->geom->zbr_data;
+ /* This while-loop finds the zone to which sector belongs. */
+ while (sector >= z->zone.num_sectors) {
+ sector = sector - z->zone.num_sectors;
+ z = z->next;
+ }
+
+ tmp = (
+ (RF_TICS_t) Find_phys_sector(block,disk)/z->zone.sec_per_track
+ - FRAC(disk->index_offset+cur_time/disk->geom->revolution_time)
+ ) * disk->geom->revolution_time;
+ if( tmp < 0 ) tmp += disk->geom->revolution_time;
+ if( tmp < 0 ) RF_PANIC();
+ return( tmp );
+}
+
+/* Hmmm...they seem to be computing the head switch time as
+ * equal to the track skew penalty. Is this an approximation?
+ * (MCH)
+ */
+static RF_TICS_t Seek_time( to_cyl, to_track, from_cyl, from_track, disk )
+ long to_cyl;
+ long to_track;
+ long from_cyl;
+ long from_track;
+ RF_DiskState_t *disk;
+{
+ long cyls = ABS_DIFF( from_cyl, to_cyl ) - 1;
+ RF_TICS_t seek = 0.0;
+ RF_ZoneList_t * z;
+
+ /* printf("Seek_time: from_cyl %ld, to_cyl %ld, from_trk %ld, to_trk %ld\n",from_cyl, to_cyl, from_track, to_track); */
+ if( from_cyl != to_cyl ) {
+ z = disk->geom->zbr_data;
+ /* This while-loop finds the zone to which to_cyl belongs. */
+ while (to_cyl >= z->zone.num_cylinders) {
+ to_cyl = to_cyl - z->zone.num_cylinders;
+ z = z->next;
+ }
+
+ seek = disk->geom->seek_constant_coeff
+ + disk->geom->seek_linear_coeff * cyls
+ + disk->geom->seek_sqrt_coeff * sqrt( (double) cyls )
+ + z->zone.track_skew * disk->geom->revolution_time /
+ z->zone.sec_per_track;
+
+ } else if( from_track != to_track ) {
+ /* from_track and to_track must lie in the same zone. */
+ z = disk->geom->zbr_data;
+ /* This while-loop finds the zone to which from_cyl belongs. */
+ while (from_cyl >= z->zone.num_cylinders) {
+ from_cyl = from_cyl - z->zone.num_cylinders;
+ z = z->next;
+ }
+
+ seek = z->zone.track_skew
+ * disk->geom->revolution_time
+ / z->zone.sec_per_track;
+ }
+ return( seek );
+}
+
+static RF_TICS_t Seek(cur_time, block, disk, update)
+ RF_TICS_t cur_time;
+ RF_SectorNum_t block;
+ RF_DiskState_t *disk;
+ long update;
+{
+ long cur_cyl, cur_track;
+ /*
+ ** current location is derived from the time,
+ ** current track and current cylinder
+ **
+ ** update current location as you go
+ */
+
+ RF_ASSERT( block <= disk->last_block_index );
+ cur_cyl = disk->cur_cyl;
+ cur_track = disk->cur_track;
+ if (update) {
+ disk->cur_cyl = Find_cyl( block, disk );
+ disk->cur_track = Find_track( block, disk );
+ }
+ return( Seek_time( disk->cur_cyl, disk->cur_track,
+ cur_cyl, cur_track, disk ) );
+}
+
+static RF_TICS_t Rotate(cur_time, block, disk, update)
+ RF_TICS_t cur_time;
+ RF_SectorNum_t block;
+ RF_DiskState_t *disk;
+ long update;
+{
+ /*
+ ** current location is derived from the time,
+ ** current track and current cylinder
+ **
+	** block (delay) the process until the head is at the target block,
+	** updating the current location as you go
+ */
+
+ RF_ASSERT( block <= disk->last_block_index );
+ return( Delay_to( cur_time, block, disk ) );
+}
+
+static RF_TICS_t Seek_Rotate(cur_time, block, disk, update)
+ RF_TICS_t cur_time;
+ RF_SectorNum_t block;
+ RF_DiskState_t *disk;
+ long update;
+{
+ RF_TICS_t seek, delay;
+
+ RF_ASSERT( block <= disk->last_block_index );
+ seek = Seek( cur_time, block, disk, update );
+ delay = seek + Rotate( cur_time+seek, block, disk, update );
+ return( delay );
+}
+
+static RF_TICS_t GAP(sec_per_track, disk)
+ long sec_per_track;
+ RF_DiskState_t *disk;
+{
+ RF_TICS_t tmp = (disk->geom->revolution_time/(100*sec_per_track));
+ return (tmp);
+}
+
+RF_TICS_t Block_access_time(cur_time, block, numblocks, disk, update)
+ RF_TICS_t cur_time;
+ RF_SectorNum_t block;
+ RF_SectorCount_t numblocks;
+ RF_DiskState_t *disk;
+ long update;
+{
+ RF_TICS_t delay = 0;
+ long cur = block, end = block + numblocks;
+ long sector, tmp;
+ RF_ZoneList_t * z;
+ /*
+	** this is the same as Seek_Rotate by virtue of the mapping
+ ** except that the access ends before the gap to the next block
+ */
+ RF_ASSERT( numblocks > 0 && end-1 <= disk->last_block_index );
+
+ while( cur < end ) {
+ sector = cur * disk->sectors_per_block;
+ z = disk->geom->zbr_data;
+ /* This while-loop finds the zone to which sector belongs. */
+ while (sector >= z->zone.num_sectors) {
+ sector = sector - z->zone.num_sectors;
+ z = z->next;
+ }
+
+ tmp = RF_MIN( end - cur, z->zone.sec_per_track
+ - cur % z->zone.sec_per_track );
+ delay += tmp * disk->geom->revolution_time /
+ z->zone.sec_per_track -
+ GAP(z->zone.sec_per_track, disk);
+ cur += tmp;
+ if( cur != end )
+ delay += Seek_Rotate( cur_time+delay, cur, disk, update );
+ }
+ return( delay );
+}
+
+static void Zero_stats(disk)
+ RF_DiskState_t *disk;
+{
+ char traceFileName[64];
+ disk->stats.num_events = 0;
+ disk->stats.seek_sum = 0;
+ disk->stats.seekSq_sum = 0;
+ disk->stats.rotate_sum = 0;
+ disk->stats.rotateSq_sum = 0;
+ disk->stats.transfer_sum = 0;
+ disk->stats.transferSq_sum = 0;
+ disk->stats.access_sum = 0;
+ disk->stats.accessSq_sum = 0;
+ disk->stats.sleep_sum=0;
+ disk->stats.idle_sum=0;
+ disk->stats.rw_sum=0;
+ disk->stats.spinup_sum=0;
+ disk->stats.last_acc=0;
+ if (rf_diskTrace){
+	sprintf (traceFileName,"rf_diskTracer%dc%d",disk->row,disk->col);
+ if ( (disk->traceFile= fopen(traceFileName, "w")) == NULL) {
+ perror(traceFileName); RF_PANIC();}
+ }
+}
+
+static RF_TICS_t Update_stats(cur_time, seek, rotate, transfer, disk)
+ RF_TICS_t cur_time;
+ RF_TICS_t seek;
+ RF_TICS_t rotate;
+ RF_TICS_t transfer;
+ RF_DiskState_t *disk;
+{
+ RF_TICS_t spinup=0;
+ RF_TICS_t sleep=0;
+ RF_TICS_t idle=0;
+
+ disk->stats.num_events++;
+ disk->stats.seek_sum += seek;
+ disk->stats.seekSq_sum += seek*seek;
+ disk->stats.rotate_sum += rotate;
+ disk->stats.rotateSq_sum += rotate*rotate;
+ disk->stats.transfer_sum += transfer;
+ disk->stats.transferSq_sum += transfer*transfer;
+ disk->stats.access_sum += seek+rotate+transfer;
+ disk->stats.accessSq_sum +=
+ (seek+rotate+transfer)*(seek+rotate+transfer);
+
+/* ASSERT (cur_time - disk->stats.last_acc >= 0); */
+
+ if (cur_time-disk->stats.last_acc>disk->geom->time_to_sleep){
+ idle=disk->geom->time_to_sleep;
+
+ sleep = cur_time - disk->stats.last_acc - idle;
+ spinup=disk->geom->time_to_spinup;
+ rf_globalSpinup = spinup;
+	} else {
+ idle=cur_time - disk->stats.last_acc;
+ }
+
+
+ disk->stats.sleep_sum+=sleep;
+ disk->stats.idle_sum+=idle;
+ disk->stats.rw_sum+=seek+rotate+transfer;
+ disk->stats.spinup_sum+=spinup;
+
+ if (rf_diskTrace){
+ fprintf(disk->traceFile,"%g %g\n",disk->stats.last_acc,2.0);
+ fprintf(disk->traceFile,"%g %g\n",(disk->stats.last_acc+idle),2.0);
+ if (sleep){
+ fprintf(disk->traceFile,"%g %g\n",(disk->stats.last_acc+idle),1.0);
+ fprintf(disk->traceFile,"%g %g\n",(disk->stats.last_acc+idle+sleep),1.0);
+ }
+
+ if (spinup){
+ fprintf(disk->traceFile,"%g %g\n",(cur_time),4.0);
+ fprintf(disk->traceFile,"%g %g\n",(cur_time+spinup),4.0);
+ }
+
+ fprintf(disk->traceFile,"%g %g\n",(cur_time+spinup),3.0);
+ fprintf(disk->traceFile,"%g %g\n",(cur_time+spinup+seek+rotate+transfer),3.0);
+
+
+ }
+
+ disk->stats.last_acc=cur_time+spinup+seek+rotate+transfer;
+
+ return(spinup);
+}
+
+
+void rf_StopStats(disk, cur_time)
+ RF_DiskState_t *disk;
+ RF_TICS_t cur_time;
+{
+
+ RF_TICS_t sleep=0;
+ RF_TICS_t idle=0;
+
+ if (cur_time - disk->stats.last_acc > disk->geom->time_to_sleep){
+
+ sleep = cur_time - disk->stats.last_acc-disk->geom->time_to_sleep;
+ idle = disk->geom->time_to_sleep;
+
+	} else {
+ idle=cur_time - disk->stats.last_acc;
+ }
+
+ disk->stats.sleep_sum+=sleep;
+ disk->stats.idle_sum+=idle;
+
+ if (rf_diskTrace){
+ fprintf(disk->traceFile,"%g %g\n",disk->stats.last_acc,2.0);
+ fprintf(disk->traceFile,"%g %g\n",(disk->stats.last_acc+idle),2.0);
+ if (sleep){
+ fprintf(disk->traceFile,"%g %g\n",(disk->stats.last_acc+idle),1.0);
+ fprintf(disk->traceFile,"%g %g\n",(disk->stats.last_acc+idle+sleep),1.0);
+ }
+ fclose(disk->traceFile);
+ }
+}
+
+/* Sometimes num_events is zero because the disk was failed at the start
+ * of the simulation and never replaced. This causes a crash on some
+ * architectures, which is why we have the conditional.
+ */
+void rf_Report_stats(
+ RF_DiskState_t *disk,
+ long *numEventsPtr,
+ RF_TICS_t *avgSeekPtr,
+ RF_TICS_t *avgRotatePtr,
+ RF_TICS_t *avgTransferPtr,
+ RF_TICS_t *avgAccessPtr,
+ RF_TICS_t *SleepPtr,
+ RF_TICS_t *IdlePtr,
+ RF_TICS_t *RwPtr,
+ RF_TICS_t *SpinupPtr)
+{
+ *numEventsPtr = disk->stats.num_events;
+ if (disk->stats.num_events) {
+ *avgSeekPtr = disk->stats.seek_sum / disk->stats.num_events;
+ *avgRotatePtr = disk->stats.rotate_sum / disk->stats.num_events;
+ *avgTransferPtr = disk->stats.transfer_sum / disk->stats.num_events;
+ *avgAccessPtr = disk->stats.access_sum / disk->stats.num_events;
+ } else {
+ *avgSeekPtr = 0;
+ *avgRotatePtr = 0;
+ *avgTransferPtr = 0;
+ *avgAccessPtr = 0;
+ }
+ *SleepPtr = disk->stats.sleep_sum;
+ *IdlePtr = disk->stats.idle_sum;
+ *RwPtr = disk->stats.rw_sum ;
+ *SpinupPtr = disk->stats.spinup_sum ;
+}
+
+int rf_Access_time( access_time, cur_time, block, numblocks, disk, media_done_time, update )
+ RF_TICS_t *access_time;
+ RF_TICS_t cur_time;
+ RF_SectorNum_t block;
+ RF_SectorCount_t numblocks;
+ RF_DiskState_t *disk;
+ RF_TICS_t *media_done_time;
+ long update; /* 1 => update disk state, 0 => don't */
+{
+ /*
+ * first move to the start of the data, then sweep to the end
+ */
+ RF_TICS_t spinup=0;
+ RF_TICS_t seek = Seek( cur_time, block, disk, update );
+ RF_TICS_t rotate = Rotate( cur_time+seek, block, disk, update );
+ RF_TICS_t transfer = Block_access_time( cur_time+seek+rotate, block,
+ numblocks, disk, update );
+
+ if (update) spinup=Update_stats(cur_time, seek, rotate, transfer, disk );
+ *media_done_time = seek+rotate+transfer;
+ *access_time =( seek+rotate+transfer+spinup);
+ return(0);
+}
+
+/* added to take into account the fact that the mapping code accounts for the disk label */
+
+void rf_GeometryDoReadCapacity(disk, numBlocks, blockSize)
+ RF_DiskState_t *disk;
+ RF_SectorCount_t *numBlocks;
+ int *blockSize;
+{
+ *numBlocks= (disk->last_block_index + 1 )-rf_protectedSectors;
+
+ *blockSize= (disk->sectors_per_block*512 );
+
+ /* in bytes */
+}
+
+
+/* END GEOMETRY ROUTINES **********************************************/
+
+
+static void rf_DiskParam(numCyls, minSeek, avgSeek, maxSeek, a, b, c)
+ long numCyls;
+ RF_TICS_t minSeek;
+ RF_TICS_t avgSeek;
+ RF_TICS_t maxSeek;
+ RF_TICS_t *a;
+ RF_TICS_t *b;
+ RF_TICS_t *c;
+{
+ if (minSeek == avgSeek && minSeek == maxSeek) {
+ *a = 0.0; *b = 0.0; *c = minSeek;
+ } else {
+ *a = ( 15 * avgSeek - 10 * minSeek - 5 * maxSeek ) / ( 3 * sqrt( (double) numCyls ));
+ *b = ( 7 * minSeek + 8 * maxSeek - 15 * avgSeek ) / ( 3 * numCyls );
+ *c = minSeek;
+ }
+}
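
rf_DiskParam() above fits the three-term seek model documented in rf_geometry.h, seek(x) = a*sqrt(x-1) + b*(x-1) + c, to a disk's single-cylinder, average, and full-stroke seek times. The standalone sketch below is not part of the RAIDframe sources; it repeats the same arithmetic for an invented disk and checks that the fitted curve returns the single-cylinder time at x = 1 and lands close to the full-stroke time at x = numCyls.

/*
 * Standalone sketch (not RAIDframe code): derive the seek-model coefficients
 * the same way rf_DiskParam() does, then evaluate
 *     seek(x) = a*sqrt(x-1) + b*(x-1) + c
 * at both ends of the stroke.  The disk numbers are invented.
 */
#include <math.h>
#include <stdio.h>

static double seek_model(double a, double b, double c, long x)
{
	return a * sqrt((double)(x - 1)) + b * (x - 1) + c;
}

int main(void)
{
	long numCyls = 2000;                                    /* hypothetical disk */
	double minSeek = 1.5, avgSeek = 10.0, maxSeek = 20.0;   /* in ms */
	double a, b, c;

	/* same arithmetic as rf_DiskParam() */
	a = (15 * avgSeek - 10 * minSeek - 5 * maxSeek) / (3 * sqrt((double)numCyls));
	b = (7 * minSeek + 8 * maxSeek - 15 * avgSeek) / (3 * numCyls);
	c = minSeek;

	printf("a = %g, b = %g, c = %g\n", a, b, c);
	printf("seek(1)    = %g ms (equals minSeek)\n", seek_model(a, b, c, 1));
	printf("seek(%ld) = %g ms (close to maxSeek)\n", numCyls,
	    seek_model(a, b, c, numCyls));
	return 0;
}
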
diff --git a/sys/dev/raidframe/rf_geometry.h b/sys/dev/raidframe/rf_geometry.h
new file mode 100644
index 00000000000..3d77b1ea402
--- /dev/null
+++ b/sys/dev/raidframe/rf_geometry.h
@@ -0,0 +1,155 @@
+/* $OpenBSD: rf_geometry.h,v 1.1 1999/01/11 14:29:24 niklas Exp $ */
+/* $NetBSD: rf_geometry.h,v 1.1 1998/11/13 04:20:30 oster Exp $ */
+/* geometry.h
+ * code from raidSim to model disk behavior
+ */
+/*
+ * Changes:
+ * 8/18/92 Additional structures have been declared and existing
+ * structures have been modified in order to support zone-
+ * bit recording.
+ * (AS) 1. The types 'Zone_data' and 'Zone_list' have been defined.
+ * (AS) 2. The type 'Geometry' has been modified.
+ */
+
+/* :
+ * Log: rf_geometry.h,v
+ * Revision 1.10 1996/08/06 22:25:08 jimz
+ * include raidframe stuff before system stuff
+ *
+ * Revision 1.9 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.8 1996/05/31 10:16:14 jimz
+ * add raidsim note
+ *
+ * Revision 1.7 1996/05/30 23:22:16 jimz
+ * bugfixes of serialization, timing problems
+ * more cleanup
+ *
+ * Revision 1.6 1996/05/30 11:29:41 jimz
+ * Numerous bug fixes. Stripe lock release code disagreed with the taking code
+ * about when stripes should be locked (I made it consistent: no parity, no lock)
+ * There was a lot of extra serialization of I/Os which I've removed- a lot of
+ * it was to calculate values for the cache code, which is no longer with us.
+ * More types, function, macro cleanup. Added code to properly quiesce the array
+ * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
+ * before. Fixed memory allocation, freeing bugs.
+ *
+ * Revision 1.5 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.4 1995/12/01 18:29:45 root
+ * added copyright info
+ *
+ */
+
+#ifndef _RF__RF_GEOMETRY_H_
+#define _RF__RF_GEOMETRY_H_
+
+#include "rf_types.h"
+#include "rf_sys.h"
+#ifndef _KERNEL
+#include <string.h>
+#include <math.h>
+#if defined(__NetBSD__) || defined(__OpenBSD__)
+#include <stdio.h>
+#endif /* __NetBSD__ || __OpenBSD__ */
+#endif
+
+#define RF_MAX_DISKNAME_LEN 80
+
+typedef struct RF_ZoneData_s {
+ long num_cylinders; /* Number of cylinders in zone */
+ long sec_per_track; /* Sectors per track in zone */
+ long track_skew; /* Skew of each track in zone */
+ long num_sectors; /* Number of sectors in zone */
+} RF_ZoneData_t;
+
+/*
+ * Linked list containing zone data
+ */
+typedef struct RF_ZoneList_s RF_ZoneList_t;
+struct RF_ZoneList_s {
+ RF_ZoneData_t zone; /* for each disk */
+ RF_ZoneList_t *next;
+};
+
+typedef struct RF_Geometry_s {
+ char disk_name[RF_MAX_DISKNAME_LEN]; /* name for a type of disk */
+ long tracks_per_cyl; /* tracks in a cylinder */
+ /* assume 1 head per track, 1 set of read/write electronics */
+ long num_zones; /* number of ZBR zones on disk */
+ RF_TICS_t revolution_time; /* milliseconds per revolution */
+ RF_TICS_t seek_one_cyl; /* adjacent cylinder seek time */
+ RF_TICS_t seek_max_stroke; /* end to end seek time */
+ RF_TICS_t seek_avg; /* random from/to average time */
+ /*
+ * seek time = a * (x-1)^0.5 + b * (x-1) + c
+ * x >= 1 is the seek distance in cylinders
+ */
+ RF_TICS_t seek_sqrt_coeff; /* a */
+ RF_TICS_t seek_linear_coeff; /* b */
+ RF_TICS_t seek_constant_coeff; /* c */
+ RF_ZoneList_t *zbr_data; /* linked list with ZBR data */
+	RF_TICS_t time_to_sleep; /* seconds of idle time before the disk goes to sleep */
+ RF_TICS_t time_to_spinup; /* seconds spin up takes */
+} RF_Geometry_t;
+
+typedef struct RF_GeometryList_s RF_GeometryList_t;
+struct RF_GeometryList_s {
+ RF_Geometry_t *disk;
+ RF_GeometryList_t *next;
+};
+
+typedef struct RF_DiskStats_s {
+ long num_events;
+ RF_TICS_t seek_sum;
+ RF_TICS_t seekSq_sum;
+ RF_TICS_t rotate_sum;
+ RF_TICS_t rotateSq_sum;
+ RF_TICS_t transfer_sum;
+ RF_TICS_t transferSq_sum;
+ RF_TICS_t access_sum;
+ RF_TICS_t accessSq_sum;
+ RF_TICS_t sleep_sum;
+ RF_TICS_t idle_sum;
+ RF_TICS_t rw_sum;
+ RF_TICS_t spinup_sum;
+	RF_TICS_t last_acc; /* time the last access finished */
+} RF_DiskStats_t;
+
+struct RF_DiskState_s {
+ int row;
+ int col;
+ RF_Geometry_t *geom;
+ long sectors_per_block; /* formatted per disk */
+ long last_block_index; /* format result for convenience */
+ RF_TICS_t index_offset; /* powerup head offset to index mark */
+ long cur_track; /* current track */
+ long cur_cyl; /* current cylinder */
+ RF_DiskStats_t stats; /* disk statistics */
+
+ RF_TICS_t queueFinishTime; /* used by shortest-seek code */
+ long lastBlock;
+ FILE *traceFile;
+};
+typedef struct RF_DiskState_s RF_DiskState_t;
+
+extern RF_TICS_t rf_globalSpinup;
+
+void rf_InitDisk(RF_DiskState_t *disk, char *disk_name, char *disk_db, long init_cyl,
+ long init_track, RF_TICS_t init_offset, int row, int col);
+void rf_StopStats(RF_DiskState_t *disk, RF_TICS_t cur_time);
+void rf_Report_stats(RF_DiskState_t *disk, long *numEventsPtr, RF_TICS_t *avgSeekPtr,
+ RF_TICS_t *avgRotatePtr, RF_TICS_t *avgTransferPtr, RF_TICS_t *avgAccessPtr,
+ RF_TICS_t *SleepPtr, RF_TICS_t *IdlePtr, RF_TICS_t *RwPtr, RF_TICS_t *SpinupPtr);
+int rf_Access_time(RF_TICS_t *access_time, RF_TICS_t cur_time,
+ RF_SectorNum_t block, RF_SectorCount_t numblocks, RF_DiskState_t *disk,
+ RF_TICS_t *media_done_time, long update);
+void rf_GeometryDoReadCapacity(RF_DiskState_t *disk, RF_SectorCount_t *numBlocks,
+ int *blockSize);
+
+#endif /* !_RF__RF_GEOMETRY_H_ */
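
The zone-walk idiom used throughout rf_geometry.c (subtract whole zones from a logical sector count until the remainder falls inside the current zone) can be tried in isolation. The sketch below is not RAIDframe code: the structs and the two-zone disk stand in for RF_ZoneData_t/RF_ZoneList_t and real ZBR data, and all geometry numbers are invented.

/*
 * Self-contained sketch (not RAIDframe code): a two-zone stand-in for the
 * RF_ZoneList_t chain, walked the same way Find_cyl() walks zbr_data.
 */
#include <stdio.h>

struct zone  { long num_cylinders, sec_per_track, track_skew, num_sectors; };
struct zlist { struct zone zone; struct zlist *next; };

int main(void)
{
	/* outer zone: 500 cyls x 4 tracks x 80 sectors; inner: 500 x 4 x 60 */
	struct zlist inner = { { 500, 60, 2, 120000L }, NULL };
	struct zlist outer = { { 500, 80, 2, 160000L }, NULL };
	struct zlist *z;
	long tracks_per_cyl = 4;
	long log_sector = 170000L;      /* some logical sector on the disk */
	long cylinder = 0;

	outer.next = &inner;            /* zones ordered outermost first */
	z = &outer;                     /* plays the role of geom->zbr_data */

	/* same walk as Find_cyl(): peel off whole zones */
	while (log_sector >= z->zone.num_sectors) {
		log_sector -= z->zone.num_sectors;
		cylinder += z->zone.num_cylinders;
		z = z->next;
	}
	cylinder += log_sector / (z->zone.sec_per_track * tracks_per_cyl);
	printf("zone with %ld sectors/track, cylinder %ld\n",
	    z->zone.sec_per_track, cylinder);
	return 0;
}
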
diff --git a/sys/dev/raidframe/rf_heap.c b/sys/dev/raidframe/rf_heap.c
new file mode 100644
index 00000000000..ecb7a14518d
--- /dev/null
+++ b/sys/dev/raidframe/rf_heap.c
@@ -0,0 +1,274 @@
+/* $OpenBSD: rf_heap.c,v 1.1 1999/01/11 14:29:25 niklas Exp $ */
+/* $NetBSD: rf_heap.c,v 1.1 1998/11/13 04:20:30 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* We manage a heap of (data, key) pairs, where the key is a simple data type
+ * and the data is any singular data type. We allow the caller to add
+ * pairs, remove pairs, peek at the top pair, and do delete/add combinations.
+ * The latter are efficient because we only reheap once.
+ *
+ * David Kotz 1990? and 1993
+ *
+ * Modify the heap to work with events, with the smallest time on the top.
+ * Song Bac Toh, 1994
+ */
+
+/* :
+ * Log: rf_heap.c,v
+ * Revision 1.8 1996/07/28 20:31:39 jimz
+ * i386netbsd port
+ * true/false fixup
+ *
+ * Revision 1.7 1996/06/09 02:36:46 jimz
+ * lots of little crufty cleanup- fixup whitespace
+ * issues, comment #ifdefs, improve typing in some
+ * places (esp size-related)
+ *
+ * Revision 1.6 1996/05/30 11:29:41 jimz
+ * Numerous bug fixes. Stripe lock release code disagreed with the taking code
+ * about when stripes should be locked (I made it consistent: no parity, no lock)
+ * There was a lot of extra serialization of I/Os which I've removed- a lot of
+ * it was to calculate values for the cache code, which is no longer with us.
+ * More types, function, macro cleanup. Added code to properly quiesce the array
+ * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
+ * before. Fixed memory allocation, freeing bugs.
+ *
+ * Revision 1.5 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.4 1995/12/01 19:03:58 root
+ * added copyright info
+ *
+ */
+
+#include "rf_types.h"
+#include "rf_heap.h"
+#include "rf_general.h"
+
+/* return RF_TRUE if the two requests in the heap match */
+#define Matching_REQUESTS(HeapData1, HeapData2) \
+((HeapData1->disk == HeapData2->disk) && \
+ (HeapData1->req_code == HeapData2->req_code))
+
+/* getting around in the heap */
+/* we don't use the 0th element of the array */
+#define ROOT 1
+#define LCHILD(p) (2 * (p))
+#define RCHILD(p) (2 * (p) + 1)
+#define PARENT(c) ((c) / 2)
+
+/* @SUBTITLE "Debugging macros" */
+/* The following are used for debugging our callers
+ * as well as internal stuff
+ */
+
+#define CHECK_INVARIANTS 1
+
+#ifdef CHECK_INVARIANTS
+#define INVARIANT2(x, y) \
+{ \
+ if (!(x)) { \
+ fprintf(stderr, "INVARIANT false: in \"%s\", line %d\n", \
+ __FILE__, __LINE__); \
+ fprintf(stderr, (y)); \
+ exit(1); \
+ } \
+}
+
+/*
+#define INVARIANT3(x, y, z) \
+ { \
+ if (!(x)) { \
+ fprintf(stderr, "INVARIANT false: in \"%s\", line %d\n", \
+ __FILE__, __LINE__); \
+ fprintf(stderr, (y), (z)); \
+ exit(1); \
+ } \
+ }
+ */
+#else /* CHECK_INVARIANTS */
+/* #define INVARIANT2(x, y) */
+/* #define INVARIANT3(x, y, z) already defined in modularize.h */
+#endif /* CHECK_INVARIANTS */
+
+/**** Rachad, must add to general debug structure */
+
+
+/* @SUBTITLE "InitHeap: Allocate a new heap" */
+/* might return NULL if no free memory */
+RF_Heap_t rf_InitHeap(int maxsize)
+{
+ RF_Heap_t hp;
+
+ RF_ASSERT(maxsize > 0);
+ RF_Malloc(hp, sizeof(struct RF_Heap_s),(RF_Heap_t));
+ if (hp == NULL) {
+ fprintf(stderr, "InitHeap: No memory for heap\n");
+ return(NULL);
+ }
+
+ RF_Malloc(hp->heap,sizeof(RF_HeapEntry_t)*(maxsize+1),(RF_HeapEntry_t *));
+ if (hp->heap == NULL) {
+ fprintf(stderr, "InitHeap: No memory for heap of %d elements\n",
+ maxsize);
+ RF_Free(hp,-1); /* -1 means don't cause an error if the size does not match */
+ return(NULL);
+ }
+
+ hp->numheap = 0;
+ hp->maxsize = maxsize;
+
+ return(hp);
+}
+
+/* @SUBTITLE "FreeHeap: delete a heap" */
+void rf_FreeHeap(RF_Heap_t hp)
+{
+ if (hp != NULL) {
+ RF_Free(hp->heap,sizeof(RF_HeapEntry_t)*(hp->maxsize+1));
+ RF_Free(hp,sizeof(struct RF_Heap_s));
+ }
+}
+
+/* @SUBTITLE "AddHeap: Add an element to the heap" */
+void rf_AddHeap(RF_Heap_t hp, RF_HeapData_t *data, RF_HeapKey_t key)
+{
+ int node;
+
+ INVARIANT2(hp != NULL, "AddHeap: NULL heap\n");
+	INVARIANT2((hp->numheap < hp->maxsize), "AddHeap: Heap overflowed\n");
+
+	/* use new space at the end of the heap */
+ node = ++(hp->numheap);
+
+ /* and reheap */
+ while (node != ROOT && hp->heap[PARENT(node)].key > key) {
+ hp->heap[node] = hp->heap[PARENT(node)];
+ node = PARENT(node);
+ }
+
+ hp->heap[node].data = data;
+ hp->heap[node].key = key;
+}
+
+/* @SUBTITLE "TopHeap: Return top element of heap" */
+int rf_TopHeap(RF_Heap_t hp, RF_HeapData_t **data, RF_HeapKey_t *key)
+{
+ INVARIANT2(hp != NULL, "TopHeap: NULL heap\n");
+
+ if (hp->numheap > 0) {
+ if (data)
+ *data = hp->heap[ROOT].data;
+ if (key)
+ *key = hp->heap[ROOT].key;
+ return(RF_HEAP_FOUND);
+ }
+ else {
+ return(RF_HEAP_NONE);
+ }
+}
+
+/* @SUBTITLE "RepHeap: Replace top of heap with given element and reheap" */
+/* note that hp->numheap does not change, and should already be > 0 */
+void rf_RepHeap(RF_Heap_t hp, RF_HeapData_t *data, RF_HeapKey_t key)
+{
+ int node; /* node in heap */
+ int lchild, rchild; /* left and right children of node */
+ int left, right; /* left and right children exist? */
+ int swapped; /* swap was made? */
+ RF_HeapEntry_t *heap; /* pointer to the base of this heap array */
+
+ INVARIANT2(hp != NULL, "RepHeap: NULL heap\n");
+
+ /* If heap is empty just add this element */
+ /* if used properly this case should never come up */
+ if (hp->numheap == 0) {
+ rf_AddHeap(hp, data, key);
+
+ return;
+ }
+
+ heap = hp->heap; /* cache the heap base pointer */
+
+ node = ROOT;
+
+ do {
+ lchild = LCHILD(node);
+ rchild = RCHILD(node);
+ left = (lchild <= hp->numheap);
+ right = (rchild <= hp->numheap);
+
+ /* Both children exist: which is smaller? */
+ if (left && right)
+ if (heap[lchild].key < heap[rchild].key)
+ right = RF_HEAP_NONE;
+ else
+ left = RF_HEAP_NONE;
+
+ /* Now only one of left and right is true. compare it with us */
+ if (left && heap[lchild].key < key) {
+ /* swap with left child */
+ heap[node] = heap[lchild];
+ node = lchild;
+ swapped = RF_HEAP_FOUND;
+ } else if (right && heap[rchild].key < key) {
+ /* swap with right child */
+ heap[node] = heap[rchild];
+ node = rchild;
+ swapped = RF_HEAP_FOUND;
+ } else
+ swapped = RF_HEAP_NONE;
+ } while (swapped);
+
+ /* final resting place for new element */
+ heap[node].key = key;
+ heap[node].data = data;
+}
+
+/* @SUBTITLE "RemHeap: Remove top element and reheap" */
+int rf_RemHeap(RF_Heap_t hp, RF_HeapData_t **data, RF_HeapKey_t *key)
+{
+ int node;
+
+ /* we don't check hp's validity because TopHeap will do it for us */
+
+ /* get the top element into data and key, if any */
+ if (rf_TopHeap(hp, data, key)) {
+ /* there was something there, so replace top with last element */
+ node = hp->numheap--;
+ if (hp->numheap > 0)
+ rf_RepHeap(hp, hp->heap[node].data, hp->heap[node].key);
+
+ return(RF_HEAP_FOUND);
+ } else{
+ return(RF_HEAP_NONE);
+ }
+}
+
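rf_heap.c keeps a 1-indexed binary min-heap of timestamped events: rf_AddHeap() sifts a new key up from the end of the array, rf_RepHeap() sifts a replacement key down from ROOT, and rf_RemHeap() combines the two so that removal plus re-insertion reheaps only once. The sketch below is not RAIDframe code; it redoes the same sift-up/sift-down idiom on bare doubles so it can be compiled and checked on its own.

/*
 * Self-contained sketch (not RAIDframe code): the same 1-indexed binary
 * min-heap idiom, with the smallest key (earliest event time) at ROOT.
 */
#include <stdio.h>

#define MAXN 16
static double heap[MAXN + 1];    /* slot 0 unused; ROOT is index 1 */
static int nheap = 0;

static void add(double key)      /* sift up, like rf_AddHeap() */
{
	int node = ++nheap;      /* no overflow check in this sketch */

	while (node != 1 && heap[node / 2] > key) {    /* PARENT(node) */
		heap[node] = heap[node / 2];
		node /= 2;
	}
	heap[node] = key;
}

static double rem(void)          /* remove top, like rf_RemHeap() */
{
	double top = heap[1];
	double key = heap[nheap--];  /* last element gets re-sifted from ROOT */
	int node = 1, child;

	while ((child = 2 * node) <= nheap) {          /* LCHILD(node) */
		if (child + 1 <= nheap && heap[child + 1] < heap[child])
			child++;                       /* pick the smaller child */
		if (heap[child] >= key)
			break;
		heap[node] = heap[child];
		node = child;
	}
	heap[node] = key;
	return top;
}

int main(void)
{
	double times[] = { 7.5, 1.25, 3.0, 0.5 };
	int i;

	for (i = 0; i < 4; i++)
		add(times[i]);
	for (i = 0; i < 4; i++)
		printf("%g ", rem());    /* prints 0.5 1.25 3 7.5 */
	printf("\n");
	return 0;
}
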
diff --git a/sys/dev/raidframe/rf_heap.h b/sys/dev/raidframe/rf_heap.h
new file mode 100644
index 00000000000..bf8f8cfdaf9
--- /dev/null
+++ b/sys/dev/raidframe/rf_heap.h
@@ -0,0 +1,128 @@
+/* $OpenBSD: rf_heap.h,v 1.1 1999/01/11 14:29:25 niklas Exp $ */
+/* $NetBSD: rf_heap.h,v 1.1 1998/11/13 04:20:30 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* @TITLE "heap.h - interface to heap management implementation" */
+/* We manage a heap of (data, key) pairs, where the key could be any
+ * simple data type and the data is any pointer data type. We allow the
+ * caller to add pairs, remove pairs, peek at the top pair, and do
+ * delete/add combinations.
+ * The latter are efficient because we only reheap once.
+ *
+ * David Kotz 1990? and 1993
+ */
+
+/* :
+ * Log: rf_heap.h,v
+ * Revision 1.8 1996/05/30 11:29:41 jimz
+ * Numerous bug fixes. Stripe lock release code disagreed with the taking code
+ * about when stripes should be locked (I made it consistent: no parity, no lock)
+ * There was a lot of extra serialization of I/Os which I've removed- a lot of
+ * it was to calculate values for the cache code, which is no longer with us.
+ * More types, function, macro cleanup. Added code to properly quiesce the array
+ * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
+ * before. Fixed memory allocation, freeing bugs.
+ *
+ * Revision 1.7 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.6 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.5 1995/12/01 19:04:07 root
+ * added copyright info
+ *
+ */
+
+#ifndef _RF__RF_HEAP_H_
+#define _RF__RF_HEAP_H_
+
+#include "rf_types.h"
+#include "rf_raid.h"
+#include "rf_dag.h"
+#include "rf_desc.h"
+
+#define RF_HEAP_MAX 10240
+
+#define RF_HEAP_FOUND 1
+#define RF_HEAP_NONE 0
+
+typedef RF_TICS_t RF_HeapKey_t;
+
+typedef struct RF_HeapData_s RF_HeapData_t;
+typedef struct RF_Heap_s *RF_Heap_t;
+typedef struct RF_HeapEntry_s RF_HeapEntry_t;
+
+/* heap data */
+struct RF_HeapData_s {
+ RF_TICS_t eventTime;
+ int disk;
+ int (*CompleteFunc)(); /* function to be called upon completion */
+ void *argument; /* argument to be passed to CompleteFunc */
+	int owner; /* which task is responsible for this request */
+ int row;
+ int col; /* coordinates of disk */
+ RF_Raid_t *raidPtr;
+ void *diskid;
+ /* Dag event */
+ RF_RaidAccessDesc_t *desc;
+};
+
+struct RF_HeapEntry_s {
+ RF_HeapData_t *data; /* the arbitrary data */
+ RF_HeapKey_t key; /* key for comparison */
+};
+
+struct RF_Heap_s {
+ RF_HeapEntry_t *heap; /* the heap in use (an array) */
+ int numheap; /* number of elements in heap */
+ int maxsize;
+};
+
+/* set up heap to hold maxsize nodes */
+RF_Heap_t rf_InitHeap(int maxsize);
+
+/* delete a heap data structure */
+void rf_FreeHeap(RF_Heap_t hp);
+
+/* add the element to the heap */
+void rf_AddHeap(RF_Heap_t hp, RF_HeapData_t *data, RF_HeapKey_t key);
+
+/* return top of the heap, without removing it from heap (FALSE if empty) */
+int rf_TopHeap(RF_Heap_t hp, RF_HeapData_t **data, RF_HeapKey_t *key);
+
+/* replace the heap's top item with a new item, and reheap */
+void rf_RepHeap(RF_Heap_t hp, RF_HeapData_t *data, RF_HeapKey_t key);
+
+/* remove the heap's top item, if any (FALSE if empty heap) */
+int rf_RemHeap(RF_Heap_t hp, RF_HeapData_t **data, RF_HeapKey_t *key);
+
+#endif /* !_RF__RF_HEAP_H_ */
diff --git a/sys/dev/raidframe/rf_hist.h b/sys/dev/raidframe/rf_hist.h
new file mode 100644
index 00000000000..371c544d316
--- /dev/null
+++ b/sys/dev/raidframe/rf_hist.h
@@ -0,0 +1,73 @@
+/* $OpenBSD: rf_hist.h,v 1.1 1999/01/11 14:29:25 niklas Exp $ */
+/* $NetBSD: rf_hist.h,v 1.1 1998/11/13 04:20:30 oster Exp $ */
+/*
+ * rf_hist.h
+ *
+ * Histogram operations for RAIDframe stats
+ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+/* :
+ * Log: rf_hist.h,v
+ * Revision 1.3 1996/06/09 02:36:46 jimz
+ * lots of little crufty cleanup- fixup whitespace
+ * issues, comment #ifdefs, improve typing in some
+ * places (esp size-related)
+ *
+ * Revision 1.2 1996/05/31 22:26:54 jimz
+ * fix a lot of mapping problems, memory allocation problems
+ * found some weird lock issues, fixed 'em
+ * more code cleanup
+ *
+ * Revision 1.1 1996/05/31 10:33:05 jimz
+ * Initial revision
+ *
+ */
+
+#ifndef _RF__RF_HIST_H_
+#define _RF__RF_HIST_H_
+
+#include "rf_types.h"
+
+#define RF_HIST_RESOLUTION 5
+#define RF_HIST_MIN_VAL 0
+#define RF_HIST_MAX_VAL 1000
+#define RF_HIST_RANGE (RF_HIST_MAX_VAL - RF_HIST_MIN_VAL)
+#define RF_HIST_NUM_BUCKETS (RF_HIST_RANGE / RF_HIST_RESOLUTION + 1)
+
+typedef RF_uint32 RF_Hist_t;
+
+#define RF_HIST_ADD(_hist_,_val_) { \
+ RF_Hist_t val; \
+ val = ((RF_Hist_t)(_val_)) / 1000; \
+ if (val >= RF_HIST_MAX_VAL) \
+ _hist_[RF_HIST_NUM_BUCKETS-1]++; \
+ else \
+ _hist_[(val - RF_HIST_MIN_VAL) / RF_HIST_RESOLUTION]++; \
+}
+
+#endif /* !_RF__RF_HIST_H_ */
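
RF_HIST_ADD() divides the raw value by 1000 and drops the result into one of RF_HIST_NUM_BUCKETS buckets of width RF_HIST_RESOLUTION, lumping everything at or above RF_HIST_MAX_VAL into the last bucket. The sketch below is not RAIDframe code; it copies the constants and the macro so the bucketing can be exercised outside the tree, and it assumes (the header does not say) that the divide-by-1000 scales fine-grained timer values down to bucket units.

/*
 * Self-contained sketch (not RAIDframe code): the RF_HIST_ADD() bucketing,
 * with unsigned int standing in for RF_uint32.  Treating the divide-by-1000
 * as a unit conversion is an assumption.
 */
#include <stdio.h>

#define RF_HIST_RESOLUTION  5
#define RF_HIST_MIN_VAL     0
#define RF_HIST_MAX_VAL     1000
#define RF_HIST_RANGE       (RF_HIST_MAX_VAL - RF_HIST_MIN_VAL)
#define RF_HIST_NUM_BUCKETS (RF_HIST_RANGE / RF_HIST_RESOLUTION + 1)

typedef unsigned int RF_Hist_t;

#define RF_HIST_ADD(_hist_,_val_) { \
	RF_Hist_t val; \
	val = ((RF_Hist_t)(_val_)) / 1000; \
	if (val >= RF_HIST_MAX_VAL) \
		_hist_[RF_HIST_NUM_BUCKETS-1]++; \
	else \
		_hist_[(val - RF_HIST_MIN_VAL) / RF_HIST_RESOLUTION]++; \
}

int main(void)
{
	RF_Hist_t hist[RF_HIST_NUM_BUCKETS] = { 0 };

	RF_HIST_ADD(hist, 4200);       /* 4200/1000 = 4    -> bucket 0 */
	RF_HIST_ADD(hist, 12000);      /* 12               -> bucket 2 */
	RF_HIST_ADD(hist, 2000000);    /* 2000 >= max      -> last bucket */
	printf("bucket0=%u bucket2=%u last=%u\n",
	    hist[0], hist[2], hist[RF_HIST_NUM_BUCKETS - 1]);
	return 0;
}
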
diff --git a/sys/dev/raidframe/rf_interdecluster.c b/sys/dev/raidframe/rf_interdecluster.c
new file mode 100644
index 00000000000..3ce97d075ee
--- /dev/null
+++ b/sys/dev/raidframe/rf_interdecluster.c
@@ -0,0 +1,361 @@
+/* $OpenBSD: rf_interdecluster.c,v 1.1 1999/01/11 14:29:26 niklas Exp $ */
+/* $NetBSD: rf_interdecluster.c,v 1.1 1998/11/13 04:20:30 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Khalil Amiri
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/************************************************************
+ *
+ * rf_interdecluster.c -- implements interleaved declustering
+ *
+ ************************************************************/
+
+/* :
+ * Log: rf_interdecluster.c,v
+ * Revision 1.24 1996/08/02 13:20:38 jimz
+ * get rid of bogus (long) casts
+ *
+ * Revision 1.23 1996/07/31 16:56:18 jimz
+ * dataBytesPerStripe, sectorsPerDisk init arch-indep.
+ *
+ * Revision 1.22 1996/07/29 14:05:12 jimz
+ * fix numPUs/numRUs confusion (everything is now numRUs)
+ * clean up some commenting, return values
+ *
+ * Revision 1.21 1996/07/22 19:52:16 jimz
+ * switched node params to RF_DagParam_t, a union of
+ * a 64-bit int and a void *, for better portability
+ * attempted hpux port, but failed partway through for
+ * lack of a single C compiler capable of compiling all
+ * source files
+ *
+ * Revision 1.20 1996/07/18 22:57:14 jimz
+ * port simulator to AIX
+ *
+ * Revision 1.19 1996/07/13 00:00:59 jimz
+ * sanitized generalized reconstruction architecture
+ * cleaned up head sep, rbuf problems
+ *
+ * Revision 1.18 1996/06/19 17:53:48 jimz
+ * move GetNumSparePUs, InstallSpareTable ops into layout switch
+ *
+ * Revision 1.17 1996/06/11 15:17:55 wvcii
+ * added include of rf_interdecluster.h
+ * fixed parameter list of rf_ConfigureInterDecluster
+ * fixed return type of rf_GetNumSparePUsInterDecluster
+ * removed include of rf_raid1.h
+ *
+ * Revision 1.16 1996/06/11 08:55:15 jimz
+ * improved error-checking at configuration time
+ *
+ * Revision 1.15 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.14 1996/06/07 22:26:27 jimz
+ * type-ify which_ru (RF_ReconUnitNum_t)
+ *
+ * Revision 1.13 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.12 1996/06/06 18:41:48 jimz
+ * add interleaved declustering dag selection
+ *
+ * Revision 1.11 1996/06/02 17:31:48 jimz
+ * Moved a lot of global stuff into array structure, where it belongs.
+ * Fixed up paritylogging, pss modules in this manner. Some general
+ * code cleanup. Removed lots of dead code, some dead files.
+ *
+ * Revision 1.10 1996/05/31 22:26:54 jimz
+ * fix a lot of mapping problems, memory allocation problems
+ * found some weird lock issues, fixed 'em
+ * more code cleanup
+ *
+ * Revision 1.9 1996/05/31 05:03:01 amiri
+ * fixed a bug related to sparing layout.
+ *
+ * Revision 1.8 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.7 1996/05/24 01:59:45 jimz
+ * another checkpoint in code cleanup for release
+ * time to sync kernel tree
+ *
+ * Revision 1.6 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.5 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.4 1996/05/03 19:50:38 wvcii
+ * removed include of rf_redstripe.h
+ * fixed change log parameters in header
+ *
+ */
+
+#include "rf_types.h"
+#include "rf_raid.h"
+#include "rf_interdecluster.h"
+#include "rf_dag.h"
+#include "rf_dagutils.h"
+#include "rf_dagfuncs.h"
+#include "rf_threadid.h"
+#include "rf_general.h"
+#include "rf_utils.h"
+#include "rf_dagffrd.h"
+#include "rf_dagdegrd.h"
+#include "rf_dagffwr.h"
+#include "rf_dagdegwr.h"
+
+typedef struct RF_InterdeclusterConfigInfo_s {
+ RF_RowCol_t **stripeIdentifier; /* filled in at config time
+ * and used by IdentifyStripe */
+ RF_StripeCount_t numSparingRegions;
+ RF_StripeCount_t stripeUnitsPerSparingRegion;
+ RF_SectorNum_t mirrorStripeOffset;
+} RF_InterdeclusterConfigInfo_t;
+
+int rf_ConfigureInterDecluster(
+ RF_ShutdownList_t **listp,
+ RF_Raid_t *raidPtr,
+ RF_Config_t *cfgPtr)
+{
+ RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
+ RF_StripeCount_t num_used_stripeUnitsPerDisk;
+ RF_InterdeclusterConfigInfo_t *info;
+ RF_RowCol_t i, tmp, SUs_per_region;
+
+ /* create an Interleaved Declustering configuration structure */
+ RF_MallocAndAdd(info, sizeof(RF_InterdeclusterConfigInfo_t), (RF_InterdeclusterConfigInfo_t *),
+ raidPtr->cleanupList);
+ if (info == NULL)
+ return(ENOMEM);
+ layoutPtr->layoutSpecificInfo = (void *) info;
+
+ /* fill in the config structure. */
+ SUs_per_region = raidPtr->numCol * (raidPtr->numCol - 1);
+ info->stripeIdentifier = rf_make_2d_array(SUs_per_region, 2 , raidPtr->cleanupList);
+ if (info->stripeIdentifier == NULL)
+ return(ENOMEM);
+ for (i=0; i< SUs_per_region; i++) {
+ info->stripeIdentifier[i][0] = i / (raidPtr->numCol-1);
+ tmp = i / raidPtr->numCol;
+ info->stripeIdentifier[i][1] = (i+1+tmp) % raidPtr->numCol;
+ }
+
+ /* no spare tables */
+ RF_ASSERT(raidPtr->numRow == 1);
+
+ /* fill in the remaining layout parameters */
+
+	/* The total number of stripes should be a multiple of 2*numCol: each sparing region
+	   consists of 2*numCol stripes: n-1 primary copies, n-1 secondary copies, and 2 spares. */
+ num_used_stripeUnitsPerDisk = layoutPtr->stripeUnitsPerDisk - (layoutPtr->stripeUnitsPerDisk %
+ (2*raidPtr->numCol) );
+ info->numSparingRegions = num_used_stripeUnitsPerDisk / (2*raidPtr->numCol);
+ /* this is in fact the number of stripe units (that are primary data copies) in the sparing region */
+ info->stripeUnitsPerSparingRegion = raidPtr->numCol * (raidPtr->numCol - 1);
+ info->mirrorStripeOffset = info->numSparingRegions * (raidPtr->numCol+1);
+ layoutPtr->numStripe = info->numSparingRegions * info->stripeUnitsPerSparingRegion;
+ layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit << raidPtr->logBytesPerSector;
+ layoutPtr->numDataCol = 1;
+ layoutPtr->dataSectorsPerStripe = layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit;
+ layoutPtr->numParityCol = 1;
+
+ layoutPtr->dataStripeUnitsPerDisk = num_used_stripeUnitsPerDisk;
+
+ raidPtr->sectorsPerDisk =
+ num_used_stripeUnitsPerDisk * layoutPtr->sectorsPerStripeUnit;
+
+ raidPtr->totalSectors =
+ (layoutPtr->numStripe) * layoutPtr->sectorsPerStripeUnit;
+
+ layoutPtr->stripeUnitsPerDisk = raidPtr->sectorsPerDisk / layoutPtr->sectorsPerStripeUnit;
+
+ return(0);
+}
+
+int rf_GetDefaultNumFloatingReconBuffersInterDecluster(RF_Raid_t *raidPtr)
+{
+ return(30);
+}
+
+RF_HeadSepLimit_t rf_GetDefaultHeadSepLimitInterDecluster(RF_Raid_t *raidPtr)
+{
+ return(raidPtr->sectorsPerDisk);
+}
+
+RF_ReconUnitCount_t rf_GetNumSpareRUsInterDecluster(
+ RF_Raid_t *raidPtr)
+{
+ RF_InterdeclusterConfigInfo_t *info = (RF_InterdeclusterConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
+
+ return ( 2 * ((RF_ReconUnitCount_t) info->numSparingRegions) );
+ /* the layout uses two stripe units per disk as spare within each sparing region */
+}
+
+/* Maps to the primary copy of the data, i.e. the first copy of the mirror pair. */
+void rf_MapSectorInterDecluster(
+ RF_Raid_t *raidPtr,
+ RF_RaidAddr_t raidSector,
+ RF_RowCol_t *row,
+ RF_RowCol_t *col,
+ RF_SectorNum_t *diskSector,
+ int remap)
+{
+ RF_InterdeclusterConfigInfo_t *info = (RF_InterdeclusterConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
+ RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit;
+ RF_StripeNum_t su_offset_into_disk, mirror_su_offset_into_disk;
+ RF_StripeNum_t sparing_region_id, index_within_region;
+ int col_before_remap;
+
+ *row = 0;
+ sparing_region_id = SUID / info->stripeUnitsPerSparingRegion;
+ index_within_region = SUID % info->stripeUnitsPerSparingRegion;
+ su_offset_into_disk = index_within_region % (raidPtr->numCol-1);
+ mirror_su_offset_into_disk = index_within_region / raidPtr->numCol;
+ col_before_remap = index_within_region / (raidPtr->numCol-1);
+
+ if (!remap) {
+		*col = col_before_remap;
+ *diskSector = ( su_offset_into_disk + ( (raidPtr->numCol-1) * sparing_region_id) ) *
+ raidPtr->Layout.sectorsPerStripeUnit;
+ *diskSector += (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
+ }
+ else {
+ /* remap sector to spare space...*/
+ *diskSector = sparing_region_id * (raidPtr->numCol+1) * raidPtr->Layout.sectorsPerStripeUnit;
+ *diskSector += (raidPtr->numCol-1) * raidPtr->Layout.sectorsPerStripeUnit;
+ *diskSector += (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
+ *col = (index_within_region + 1 + mirror_su_offset_into_disk) % raidPtr->numCol;
+ *col = (*col + 1) % raidPtr->numCol;
+ if (*col == col_before_remap) *col = (*col + 1) % raidPtr->numCol;
+ }
+}
+
+/* Maps to the second copy of the mirror pair. */
+void rf_MapParityInterDecluster(
+ RF_Raid_t *raidPtr,
+ RF_RaidAddr_t raidSector,
+ RF_RowCol_t *row,
+ RF_RowCol_t *col,
+ RF_SectorNum_t *diskSector,
+ int remap)
+{
+ RF_InterdeclusterConfigInfo_t *info = (RF_InterdeclusterConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
+ RF_StripeNum_t sparing_region_id, index_within_region, mirror_su_offset_into_disk;
+ RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit;
+ int col_before_remap;
+
+ sparing_region_id = SUID / info->stripeUnitsPerSparingRegion;
+ index_within_region = SUID % info->stripeUnitsPerSparingRegion;
+ mirror_su_offset_into_disk = index_within_region / raidPtr->numCol;
+ col_before_remap = (index_within_region + 1 + mirror_su_offset_into_disk) % raidPtr->numCol;
+
+ *row = 0;
+ if (!remap) {
+ *col = col_before_remap;
+ *diskSector = info->mirrorStripeOffset * raidPtr->Layout.sectorsPerStripeUnit;
+ *diskSector += sparing_region_id * (raidPtr->numCol-1) * raidPtr->Layout.sectorsPerStripeUnit;
+ *diskSector += mirror_su_offset_into_disk * raidPtr->Layout.sectorsPerStripeUnit;
+ *diskSector += (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
+ }
+ else {
+ /* remap parity to spare space ... */
+ *diskSector = sparing_region_id * (raidPtr->numCol+1) * raidPtr->Layout.sectorsPerStripeUnit;
+ *diskSector += (raidPtr->numCol) * raidPtr->Layout.sectorsPerStripeUnit;
+ *diskSector += (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
+ *col = index_within_region / (raidPtr->numCol-1);
+ *col = (*col + 1) % raidPtr->numCol;
+ if (*col == col_before_remap) *col = (*col + 1) % raidPtr->numCol;
+ }
+}
+
+void rf_IdentifyStripeInterDecluster(
+ RF_Raid_t *raidPtr,
+ RF_RaidAddr_t addr,
+ RF_RowCol_t **diskids,
+ RF_RowCol_t *outRow)
+{
+ RF_InterdeclusterConfigInfo_t *info = (RF_InterdeclusterConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
+ RF_StripeNum_t SUID;
+
+ SUID = addr / raidPtr->Layout.sectorsPerStripeUnit;
+ SUID = SUID % info->stripeUnitsPerSparingRegion;
+
+ *outRow = 0;
+ *diskids = info->stripeIdentifier[ SUID ];
+}
+
+void rf_MapSIDToPSIDInterDecluster(
+ RF_RaidLayout_t *layoutPtr,
+ RF_StripeNum_t stripeID,
+ RF_StripeNum_t *psID,
+ RF_ReconUnitNum_t *which_ru)
+{
+ *which_ru = 0;
+ *psID = stripeID;
+}
+
+/******************************************************************************
+ * select a graph to perform a single-stripe access
+ *
+ * Parameters: raidPtr - description of the physical array
+ * type - type of operation (read or write) requested
+ * asmap - logical & physical addresses for this access
+ * createFunc - name of function to use to create the graph
+ *****************************************************************************/
+
+void rf_RAIDIDagSelect(
+ RF_Raid_t *raidPtr,
+ RF_IoType_t type,
+ RF_AccessStripeMap_t *asmap,
+ RF_VoidFuncPtr *createFunc)
+{
+ RF_ASSERT(RF_IO_IS_R_OR_W(type));
+
+ if (asmap->numDataFailed + asmap->numParityFailed > 1) {
+ RF_ERRORMSG("Multiple disks failed in a single group! Aborting I/O operation.\n");
+ *createFunc = NULL;
+ return;
+ }
+
+ *createFunc = (type == RF_IO_TYPE_READ) ? (RF_VoidFuncPtr)rf_CreateFaultFreeReadDAG : (RF_VoidFuncPtr)rf_CreateRaidOneWriteDAG;
+ if (type == RF_IO_TYPE_READ) {
+ if (asmap->numDataFailed == 0)
+ *createFunc = (RF_VoidFuncPtr)rf_CreateMirrorPartitionReadDAG;
+ else
+ *createFunc = (RF_VoidFuncPtr)rf_CreateRaidOneDegradedReadDAG;
+ }
+ else
+ *createFunc = (RF_VoidFuncPtr)rf_CreateRaidOneWriteDAG;
+}
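
The mapping routines above place a stripe unit's primary copy and its mirror on different columns; the column and the stripe-unit offset on that disk fall out of the sparing_region_id / index_within_region arithmetic. The standalone sketch below is not RAIDframe code; it reworks that arithmetic for the non-remapped case on a hypothetical 4-column array with 10 sparing regions, keeping offsets in stripe units rather than sectors.

/*
 * Standalone sketch (not RAIDframe code): the non-remapped stripe-unit
 * arithmetic from rf_MapSectorInterDecluster()/rf_MapParityInterDecluster(),
 * worked for an invented 4-column array with 10 sparing regions.
 */
#include <stdio.h>

int main(void)
{
	long numCol = 4, numSparingRegions = 10;
	long SUsPerRegion = numCol * (numCol - 1);              /* 12 */
	long mirrorStripeOffset = numSparingRegions * (numCol + 1);
	long SUID = 14;                                         /* example unit */

	long region = SUID / SUsPerRegion;                      /* 1 */
	long index = SUID % SUsPerRegion;                       /* 2 */

	/* primary copy (rf_MapSectorInterDecluster, remap == 0) */
	long pcol = index / (numCol - 1);
	long poff = index % (numCol - 1) + (numCol - 1) * region;

	/* mirror copy (rf_MapParityInterDecluster, remap == 0) */
	long moff_in_region = index / numCol;
	long mcol = (index + 1 + moff_in_region) % numCol;
	long moff = mirrorStripeOffset + region * (numCol - 1) + moff_in_region;

	printf("SU %ld: primary col %ld offset %ld, mirror col %ld offset %ld\n",
	    SUID, pcol, poff, mcol, moff);
	return 0;
}

For stripe unit 14 this prints primary col 0, offset 5 and mirror col 3, offset 53, which matches a hand trace of the routines above.
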
diff --git a/sys/dev/raidframe/rf_interdecluster.h b/sys/dev/raidframe/rf_interdecluster.h
new file mode 100644
index 00000000000..a76ea9dcb46
--- /dev/null
+++ b/sys/dev/raidframe/rf_interdecluster.h
@@ -0,0 +1,112 @@
+/* $OpenBSD: rf_interdecluster.h,v 1.1 1999/01/11 14:29:26 niklas Exp $ */
+/* $NetBSD: rf_interdecluster.h,v 1.1 1998/11/13 04:20:30 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Khalil Amiri
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* rf_interdecluster.h
+ * header file for Interleaved Declustering
+ */
+
+/*
+ * :
+ * Log: rf_interdecluster.h,v
+ * Revision 1.13 1996/07/29 14:05:12 jimz
+ * fix numPUs/numRUs confusion (everything is now numRUs)
+ * clean up some commenting, return values
+ *
+ * Revision 1.12 1996/07/22 19:52:16 jimz
+ * switched node params to RF_DagParam_t, a union of
+ * a 64-bit int and a void *, for better portability
+ * attempted hpux port, but failed partway through for
+ * lack of a single C compiler capable of compiling all
+ * source files
+ *
+ * Revision 1.11 1996/07/13 00:00:59 jimz
+ * sanitized generalized reconstruction architecture
+ * cleaned up head sep, rbuf problems
+ *
+ * Revision 1.10 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.9 1996/06/07 22:26:27 jimz
+ * type-ify which_ru (RF_ReconUnitNum_t)
+ *
+ * Revision 1.8 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.7 1996/06/06 18:41:58 jimz
+ * add RAIDIDagSelect
+ *
+ * Revision 1.6 1996/05/31 22:26:54 jimz
+ * fix a lot of mapping problems, memory allocation problems
+ * found some weird lock issues, fixed 'em
+ * more code cleanup
+ *
+ * Revision 1.5 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.4 1996/05/24 01:59:45 jimz
+ * another checkpoint in code cleanup for release
+ * time to sync kernel tree
+ *
+ * Revision 1.3 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.2 1995/12/01 19:07:25 root
+ * added copyright info
+ *
+ * Revision 1.1 1995/11/28 21:38:27 amiri
+ * Initial revision
+ */
+
+#ifndef _RF__RF_INTERDECLUSTER_H_
+#define _RF__RF_INTERDECLUSTER_H_
+
+int rf_ConfigureInterDecluster(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr,
+ RF_Config_t *cfgPtr);
+int rf_GetDefaultNumFloatingReconBuffersInterDecluster(RF_Raid_t *raidPtr);
+RF_HeadSepLimit_t rf_GetDefaultHeadSepLimitInterDecluster(RF_Raid_t *raidPtr);
+RF_ReconUnitCount_t rf_GetNumSpareRUsInterDecluster(RF_Raid_t *raidPtr);
+void rf_MapSectorInterDecluster(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector,
+ RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap);
+void rf_MapParityInterDecluster(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector,
+ RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap);
+void rf_IdentifyStripeInterDecluster(RF_Raid_t *raidPtr, RF_RaidAddr_t addr,
+ RF_RowCol_t **diskids, RF_RowCol_t *outRow);
+void rf_MapSIDToPSIDInterDecluster(RF_RaidLayout_t *layoutPtr,
+ RF_StripeNum_t stripeID, RF_StripeNum_t *psID,
+ RF_ReconUnitNum_t *which_ru);
+void rf_RAIDIDagSelect(RF_Raid_t *raidPtr, RF_IoType_t type,
+ RF_AccessStripeMap_t *asmap, RF_VoidFuncPtr *createFunc);
+
+#endif /* !_RF__RF_INTERDECLUSTER_H_ */
diff --git a/sys/dev/raidframe/rf_invertq.c b/sys/dev/raidframe/rf_invertq.c
new file mode 100644
index 00000000000..c1e07aa257f
--- /dev/null
+++ b/sys/dev/raidframe/rf_invertq.c
@@ -0,0 +1,55 @@
+/* $OpenBSD: rf_invertq.c,v 1.1 1999/01/11 14:29:26 niklas Exp $ */
+/* $NetBSD: rf_invertq.c,v 1.1 1998/11/13 04:20:30 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Daniel Stodolsky
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* :
+ * Log: rf_invertq.c,v
+ * Revision 1.5 1996/07/29 16:36:36 jimz
+ * include rf_archs.h here, not rf_invertq.h, to avoid VPATH
+ * problems in OSF/1 kernel
+ *
+ * Revision 1.4 1995/11/30 15:57:27 wvcii
+ * added copyright info
+ *
+ */
+
+#ifdef _KERNEL
+#define KERNEL
+#endif
+
+#include "rf_archs.h"
+#include "rf_pqdeg.h"
+#ifdef KERNEL
+#if !defined(__NetBSD__) && !defined(__OpenBSD__)
+#include <raidframe/du_data/rf_invertq.h>
+#else
+#include "rf_invertq.h" /* XXX this is a hack. */
+#endif /* !__NetBSD__ && !__OpenBSD__ */
+#else /* KERNEL */
+#include "rf_invertq.h"
+#endif /* KERNEL */
diff --git a/sys/dev/raidframe/rf_invertq.h b/sys/dev/raidframe/rf_invertq.h
new file mode 100644
index 00000000000..e9c1e69d768
--- /dev/null
+++ b/sys/dev/raidframe/rf_invertq.h
@@ -0,0 +1,73 @@
+/* $OpenBSD: rf_invertq.h,v 1.1 1999/01/11 14:29:27 niklas Exp $ */
+/* $NetBSD: rf_invertq.h,v 1.1 1998/11/13 04:20:30 oster Exp $ */
+/*
+ * rf_invertq.h
+ */
+/*
+ * This is normally a generated file. Not so for Net- and OpenBSD.
+ */
+
+#ifndef _RF__RF_INVERTQ_H_
+#define _RF__RF_INVERTQ_H_
+
+#ifdef _KERNEL
+#define KERNEL
+#endif
+
+/*
+ * rf_geniq.c must include rf_archs.h before including
+ * this file (to get VPATH magic right with the way we
+ * generate this file in kernel trees)
+ */
+/* #include "rf_archs.h" */
+
+#if (RF_INCLUDE_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
+
+#define RF_Q_COLS 32
+RF_ua32_t rf_rn = {
+1, 2, 4, 8, 16, 5, 10, 20, 13, 26, 17, 7, 14, 28, 29, 31, 27, 19, 3, 6, 12, 24, 21, 15, 30, 25, 23, 11, 22, 9, 18, 1, };
+RF_ua32_t rf_qfor[32] = {
+/* i = 0 */ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, },
+/* i = 1 */ { 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 5, 7, 1, 3, 13, 15, 9, 11, 21, 23, 17, 19, 29, 31, 25, 27, },
+/* i = 2 */ { 0, 4, 8, 12, 16, 20, 24, 28, 5, 1, 13, 9, 21, 17, 29, 25, 10, 14, 2, 6, 26, 30, 18, 22, 15, 11, 7, 3, 31, 27, 23, 19, },
+/* i = 3 */ { 0, 8, 16, 24, 5, 13, 21, 29, 10, 2, 26, 18, 15, 7, 31, 23, 20, 28, 4, 12, 17, 25, 1, 9, 30, 22, 14, 6, 27, 19, 11, 3, },
+/* i = 4 */ { 0, 16, 5, 21, 10, 26, 15, 31, 20, 4, 17, 1, 30, 14, 27, 11, 13, 29, 8, 24, 7, 23, 2, 18, 25, 9, 28, 12, 19, 3, 22, 6, },
+/* i = 5 */ { 0, 5, 10, 15, 20, 17, 30, 27, 13, 8, 7, 2, 25, 28, 19, 22, 26, 31, 16, 21, 14, 11, 4, 1, 23, 18, 29, 24, 3, 6, 9, 12, },
+/* i = 6 */ { 0, 10, 20, 30, 13, 7, 25, 19, 26, 16, 14, 4, 23, 29, 3, 9, 17, 27, 5, 15, 28, 22, 8, 2, 11, 1, 31, 21, 6, 12, 18, 24, },
+/* i = 7 */ { 0, 20, 13, 25, 26, 14, 23, 3, 17, 5, 28, 8, 11, 31, 6, 18, 7, 19, 10, 30, 29, 9, 16, 4, 22, 2, 27, 15, 12, 24, 1, 21, },
+/* i = 8 */ { 0, 13, 26, 23, 17, 28, 11, 6, 7, 10, 29, 16, 22, 27, 12, 1, 14, 3, 20, 25, 31, 18, 5, 8, 9, 4, 19, 30, 24, 21, 2, 15, },
+/* i = 9 */ { 0, 26, 17, 11, 7, 29, 22, 12, 14, 20, 31, 5, 9, 19, 24, 2, 28, 6, 13, 23, 27, 1, 10, 16, 18, 8, 3, 25, 21, 15, 4, 30, },
+/* i = 10 */ { 0, 17, 7, 22, 14, 31, 9, 24, 28, 13, 27, 10, 18, 3, 21, 4, 29, 12, 26, 11, 19, 2, 20, 5, 1, 16, 6, 23, 15, 30, 8, 25, },
+/* i = 11 */ { 0, 7, 14, 9, 28, 27, 18, 21, 29, 26, 19, 20, 1, 6, 15, 8, 31, 24, 17, 22, 3, 4, 13, 10, 2, 5, 12, 11, 30, 25, 16, 23, },
+/* i = 12 */ { 0, 14, 28, 18, 29, 19, 1, 15, 31, 17, 3, 13, 2, 12, 30, 16, 27, 21, 7, 9, 6, 8, 26, 20, 4, 10, 24, 22, 25, 23, 5, 11, },
+/* i = 13 */ { 0, 28, 29, 1, 31, 3, 2, 30, 27, 7, 6, 26, 4, 24, 25, 5, 19, 15, 14, 18, 12, 16, 17, 13, 8, 20, 21, 9, 23, 11, 10, 22, },
+/* i = 14 */ { 0, 29, 31, 2, 27, 6, 4, 25, 19, 14, 12, 17, 8, 21, 23, 10, 3, 30, 28, 1, 24, 5, 7, 26, 16, 13, 15, 18, 11, 22, 20, 9, },
+/* i = 15 */ { 0, 31, 27, 4, 19, 12, 8, 23, 3, 28, 24, 7, 16, 15, 11, 20, 6, 25, 29, 2, 21, 10, 14, 17, 5, 26, 30, 1, 22, 9, 13, 18, },
+/* i = 16 */ { 0, 27, 19, 8, 3, 24, 16, 11, 6, 29, 21, 14, 5, 30, 22, 13, 12, 23, 31, 4, 15, 20, 28, 7, 10, 17, 25, 2, 9, 18, 26, 1, },
+/* i = 17 */ { 0, 19, 3, 16, 6, 21, 5, 22, 12, 31, 15, 28, 10, 25, 9, 26, 24, 11, 27, 8, 30, 13, 29, 14, 20, 7, 23, 4, 18, 1, 17, 2, },
+/* i = 18 */ { 0, 3, 6, 5, 12, 15, 10, 9, 24, 27, 30, 29, 20, 23, 18, 17, 21, 22, 19, 16, 25, 26, 31, 28, 13, 14, 11, 8, 1, 2, 7, 4, },
+/* i = 19 */ { 0, 6, 12, 10, 24, 30, 20, 18, 21, 19, 25, 31, 13, 11, 1, 7, 15, 9, 3, 5, 23, 17, 27, 29, 26, 28, 22, 16, 2, 4, 14, 8, },
+/* i = 20 */ { 0, 12, 24, 20, 21, 25, 13, 1, 15, 3, 23, 27, 26, 22, 2, 14, 30, 18, 6, 10, 11, 7, 19, 31, 17, 29, 9, 5, 4, 8, 28, 16, },
+/* i = 21 */ { 0, 24, 21, 13, 15, 23, 26, 2, 30, 6, 11, 19, 17, 9, 4, 28, 25, 1, 12, 20, 22, 14, 3, 27, 7, 31, 18, 10, 8, 16, 29, 5, },
+/* i = 22 */ { 0, 21, 15, 26, 30, 11, 17, 4, 25, 12, 22, 3, 7, 18, 8, 29, 23, 2, 24, 13, 9, 28, 6, 19, 14, 27, 1, 20, 16, 5, 31, 10, },
+/* i = 23 */ { 0, 15, 30, 17, 25, 22, 7, 8, 23, 24, 9, 6, 14, 1, 16, 31, 11, 4, 21, 26, 18, 29, 12, 3, 28, 19, 2, 13, 5, 10, 27, 20, },
+/* i = 24 */ { 0, 30, 25, 7, 23, 9, 14, 16, 11, 21, 18, 12, 28, 2, 5, 27, 22, 8, 15, 17, 1, 31, 24, 6, 29, 3, 4, 26, 10, 20, 19, 13, },
+/* i = 25 */ { 0, 25, 23, 14, 11, 18, 28, 5, 22, 15, 1, 24, 29, 4, 10, 19, 9, 16, 30, 7, 2, 27, 21, 12, 31, 6, 8, 17, 20, 13, 3, 26, },
+/* i = 26 */ { 0, 23, 11, 28, 22, 1, 29, 10, 9, 30, 2, 21, 31, 8, 20, 3, 18, 5, 25, 14, 4, 19, 15, 24, 27, 12, 16, 7, 13, 26, 6, 17, },
+/* i = 27 */ { 0, 11, 22, 29, 9, 2, 31, 20, 18, 25, 4, 15, 27, 16, 13, 6, 1, 10, 23, 28, 8, 3, 30, 21, 19, 24, 5, 14, 26, 17, 12, 7, },
+/* i = 28 */ { 0, 22, 9, 31, 18, 4, 27, 13, 1, 23, 8, 30, 19, 5, 26, 12, 2, 20, 11, 29, 16, 6, 25, 15, 3, 21, 10, 28, 17, 7, 24, 14, },
+/* i = 29 */ { 0, 9, 18, 27, 1, 8, 19, 26, 2, 11, 16, 25, 3, 10, 17, 24, 4, 13, 22, 31, 5, 12, 23, 30, 6, 15, 20, 29, 7, 14, 21, 28, },
+/* i = 30 */ { 0, 18, 1, 19, 2, 16, 3, 17, 4, 22, 5, 23, 6, 20, 7, 21, 8, 26, 9, 27, 10, 24, 11, 25, 12, 30, 13, 31, 14, 28, 15, 29, },
+/* i = 31 */ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, },
+};
+#define RF_Q_DATA_COL(col_num) rf_rn[col_num],rf_qfor[28-(col_num)]
+#ifdef KERNEL
+RF_ua1024_t rf_qinv[1]; /* don't compile monster table into kernel */
+#elif defined(NO_PQ)
+RF_ua1024_t rf_qinv[29*29];
+#else /* !KERNEL && !NO_PQ */
+
+#endif /* !KERNEL && !NO_PQ */
+
+#endif /* (RF_INCLUDE_PQ > 0) || (RF_INCLUDE_RAID6 > 0) */
+#endif /* !_RF__RF_INVERTQ_H_ */
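The rf_rn and rf_qfor tables above are consistent with arithmetic in GF(2^5): rf_rn lists successive powers of 2, and row i of rf_qfor matches multiplication of the column index by 2^i under the reducing polynomial x^5 + x^2 + 1. The stand-alone sketch below regenerates one row under that assumption so it can be compared against the table; it is an illustration only, and none of its names exist in RAIDframe.

#include <stdio.h>

/* Multiply v by 2 in GF(2^5), assuming reducing polynomial x^5 + x^2 + 1. */
static unsigned
gf32_double(unsigned v)
{
	v <<= 1;
	if (v & 0x20)
		v ^= 0x25;
	return (v & 0x1f);
}

int
main(void)
{
	unsigned i = 1, j, k, val;

	/* Should reproduce the "i = 1" row of rf_qfor above. */
	for (j = 0; j < 32; j++) {
		for (val = j, k = 0; k < i; k++)
			val = gf32_double(val);
		printf("%u, ", val);
	}
	printf("\n");
	return (0);
}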
diff --git a/sys/dev/raidframe/rf_kintf.h b/sys/dev/raidframe/rf_kintf.h
new file mode 100644
index 00000000000..e270aa0b933
--- /dev/null
+++ b/sys/dev/raidframe/rf_kintf.h
@@ -0,0 +1,71 @@
+/* $OpenBSD: rf_kintf.h,v 1.1 1999/01/11 14:29:27 niklas Exp $ */
+/* $NetBSD: rf_kintf.h,v 1.1 1998/11/13 04:20:30 oster Exp $ */
+/*
+ * rf_kintf.h
+ *
+ * RAIDframe exported kernel interface
+ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+/*
+ * :
+ * Log: rf_kintf.h,v
+ * Revision 1.2 1996/06/03 23:28:26 jimz
+ * more bugfixes
+ * check in tree to sync for IPDS runs with current bugfixes
+ * there still may be a problem with threads in the script test
+ * getting I/Os stuck- not trivially reproducible (runs ~50 times
+ * in a row without getting stuck)
+ *
+ * Revision 1.1 1996/05/31 18:59:14 jimz
+ * Initial revision
+ *
+ */
+
+#ifndef _RF__RF_KINTF_H_
+#define _RF__RF_KINTF_H_
+
+#include "rf_types.h"
+
+int rf_boot(void);
+int rf_open(dev_t dev, int flag, int fmt);
+int rf_close(dev_t dev, int flag, int fmt);
+void rf_strategy(struct buf *bp);
+void rf_minphys(struct buf *bp);
+int rf_read(dev_t dev, struct uio *uio);
+int rf_write(dev_t dev, struct uio *uio);
+int rf_size(dev_t dev);
+int rf_ioctl(dev_t dev, int cmd, caddr_t data, int flag);
+void rf_ReconKernelThread(void);
+int rf_GetSpareTableFromDaemon(RF_SparetWait_t *req);
+caddr_t rf_MapToKernelSpace(struct buf *bp, caddr_t addr);
+int rf_BzeroWithRemap(struct buf *bp, char *databuf, int len);
+int rf_DoAccessKernel(RF_Raid_t *raidPtr, struct buf *bp,
+ RF_RaidAccessFlags_t flags, void (*cbFunc)(struct buf *), void *cbArg);
+int rf_DispatchKernelIO(RF_DiskQueue_t *queue, RF_DiskQueueData_t *req);
+
+#endif /* _RF__RF_KINTF_H_ */
diff --git a/sys/dev/raidframe/rf_layout.c b/sys/dev/raidframe/rf_layout.c
new file mode 100644
index 00000000000..a8a06e044ff
--- /dev/null
+++ b/sys/dev/raidframe/rf_layout.c
@@ -0,0 +1,720 @@
+/* $OpenBSD: rf_layout.c,v 1.1 1999/01/11 14:29:27 niklas Exp $ */
+/* $NetBSD: rf_layout.c,v 1.1 1998/11/13 04:20:30 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* rf_layout.c -- driver code dealing with layout and mapping issues
+ */
+
+/*
+ * :
+ * Log: rf_layout.c,v
+ * Revision 1.71 1996/08/20 22:41:30 jimz
+ * add declustered evenodd
+ *
+ * Revision 1.70 1996/07/31 16:56:18 jimz
+ * dataBytesPerStripe, sectorsPerDisk init arch-indep.
+ *
+ * Revision 1.69 1996/07/31 15:34:46 jimz
+ * add EvenOdd
+ *
+ * Revision 1.68 1996/07/29 14:05:12 jimz
+ * fix numPUs/numRUs confusion (everything is now numRUs)
+ * clean up some commenting, return values
+ *
+ * Revision 1.67 1996/07/27 23:36:08 jimz
+ * Solaris port of simulator
+ *
+ * Revision 1.66 1996/07/27 18:40:24 jimz
+ * cleanup sweep
+ *
+ * Revision 1.65 1996/07/18 22:57:14 jimz
+ * port simulator to AIX
+ *
+ * Revision 1.64 1996/07/15 17:22:18 jimz
+ * nit-pick code cleanup
+ * resolve stdlib problems on DEC OSF
+ *
+ * Revision 1.63 1996/07/13 00:00:59 jimz
+ * sanitized generalized reconstruction architecture
+ * cleaned up head sep, rbuf problems
+ *
+ * Revision 1.62 1996/07/11 19:08:00 jimz
+ * generalize reconstruction mechanism
+ * allow raid1 reconstructs via copyback (done with array
+ * quiesced, not online, therefore not disk-directed)
+ *
+ * Revision 1.61 1996/06/19 22:23:01 jimz
+ * parity verification is now a layout-configurable thing
+ * not all layouts currently support it (correctly, anyway)
+ *
+ * Revision 1.60 1996/06/19 17:53:48 jimz
+ * move GetNumSparePUs, InstallSpareTable ops into layout switch
+ *
+ * Revision 1.59 1996/06/19 14:57:58 jimz
+ * move layout-specific config parsing hooks into RF_LayoutSW_t
+ * table in rf_layout.c
+ *
+ * Revision 1.58 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.57 1996/06/07 22:26:27 jimz
+ * type-ify which_ru (RF_ReconUnitNum_t)
+ *
+ * Revision 1.56 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.55 1996/06/06 18:41:35 jimz
+ * change interleaved declustering dag selection to an
+ * interleaved-declustering-specific routine (so we can
+ * use the partitioned mirror node)
+ *
+ * Revision 1.54 1996/06/05 18:06:02 jimz
+ * Major code cleanup. The Great Renaming is now done.
+ * Better modularity. Better typing. Fixed a bunch of
+ * synchronization bugs. Made a lot of global stuff
+ * per-desc or per-array. Removed dead code.
+ *
+ * Revision 1.53 1996/06/03 23:28:26 jimz
+ * more bugfixes
+ * check in tree to sync for IPDS runs with current bugfixes
+ * there still may be a problem with threads in the script test
+ * getting I/Os stuck- not trivially reproducible (runs ~50 times
+ * in a row without getting stuck)
+ *
+ * Revision 1.52 1996/06/02 17:31:48 jimz
+ * Moved a lot of global stuff into array structure, where it belongs.
+ * Fixed up paritylogging, pss modules in this manner. Some general
+ * code cleanup. Removed lots of dead code, some dead files.
+ *
+ * Revision 1.51 1996/05/31 22:26:54 jimz
+ * fix a lot of mapping problems, memory allocation problems
+ * found some weird lock issues, fixed 'em
+ * more code cleanup
+ *
+ * Revision 1.50 1996/05/30 23:22:16 jimz
+ * bugfixes of serialization, timing problems
+ * more cleanup
+ *
+ * Revision 1.49 1996/05/30 11:29:41 jimz
+ * Numerous bug fixes. Stripe lock release code disagreed with the taking code
+ * about when stripes should be locked (I made it consistent: no parity, no lock)
+ * There was a lot of extra serialization of I/Os which I've removed- a lot of
+ * it was to calculate values for the cache code, which is no longer with us.
+ * More types, function, macro cleanup. Added code to properly quiesce the array
+ * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
+ * before. Fixed memory allocation, freeing bugs.
+ *
+ * Revision 1.48 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.47 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.46 1996/05/24 01:59:45 jimz
+ * another checkpoint in code cleanup for release
+ * time to sync kernel tree
+ *
+ * Revision 1.45 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.44 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.43 1996/02/22 16:46:35 amiri
+ * modified chained declustering to use a separate DAG selection routine
+ *
+ * Revision 1.42 1995/12/01 19:16:11 root
+ * added copyright info
+ *
+ * Revision 1.41 1995/11/28 21:31:02 amiri
+ * added Interleaved Declustering to switch table
+ *
+ * Revision 1.40 1995/11/20 14:35:17 arw
+ * moved rf_StartThroughputStats in DefaultWrite and DefaultRead
+ *
+ * Revision 1.39 1995/11/19 16:28:46 wvcii
+ * replaced LaunchDAGState with CreateDAGState, ExecuteDAGState
+ *
+ * Revision 1.38 1995/11/17 19:00:41 wvcii
+ * added MapQ entries to switch table
+ *
+ * Revision 1.37 1995/11/17 16:58:13 amiri
+ * Added the Chained Declustering architecture ('C'),
+ * essentially a variant of mirroring.
+ *
+ * Revision 1.36 1995/11/16 16:16:10 amiri
+ * Added RAID5 with rotated sparing ('R' configuration)
+ *
+ * Revision 1.35 1995/11/07 15:41:17 wvcii
+ * modified state lists: DefaultStates, VSReadStates
+ * necessary to support new states (LaunchDAGState, ProcessDAGState)
+ *
+ * Revision 1.34 1995/10/18 01:23:20 amiri
+ * added ifndef SIMULATE wrapper around rf_StartThroughputStats()
+ *
+ * Revision 1.33 1995/10/13 15:05:46 arw
+ * added rf_StartThroughputStats to DefaultRead and DefaultWrite
+ *
+ * Revision 1.32 1995/10/12 16:04:23 jimz
+ * added config names to mapsw entries
+ *
+ * Revision 1.31 1995/10/04 03:57:48 wvcii
+ * added raid level 1 to mapsw
+ *
+ * Revision 1.30 1995/09/07 01:26:55 jimz
+ * Achieve basic compilation in kernel. Kernel functionality
+ * is not guaranteed at all, but it'll compile. Mostly. I hope.
+ *
+ * Revision 1.29 1995/07/28 21:43:42 robby
+ * checkin after leaving for Rice. Bye
+ *
+ * Revision 1.28 1995/07/26 03:26:14 robby
+ * *** empty log message ***
+ *
+ * Revision 1.27 1995/07/21 19:47:52 rachad
+ * Added raid 0 /5 with caching architectures
+ *
+ * Revision 1.26 1995/07/21 19:29:27 robby
+ * added virtual striping states
+ *
+ * Revision 1.25 1995/07/10 21:41:47 robby
+ * switched to have my own virtual striping write function from the cache
+ *
+ * Revision 1.24 1995/07/10 20:51:59 robby
+ * added virtual striping states
+ *
+ * Revision 1.23 1995/07/10 16:57:42 robby
+ * updated alloclistelem struct to the correct struct name
+ *
+ * Revision 1.22 1995/07/08 20:06:11 rachad
+ * *** empty log message ***
+ *
+ * Revision 1.21 1995/07/08 19:43:16 cfb
+ * *** empty log message ***
+ *
+ * Revision 1.20 1995/07/08 18:05:39 rachad
+ * Linked up Claudsons code with the real cache
+ *
+ * Revision 1.19 1995/07/06 14:29:36 robby
+ * added defaults states list to the layout switch
+ *
+ * Revision 1.18 1995/06/23 13:40:34 robby
+ * updated to prototypes in rf_layout.h
+ *
+ */
+
+#include "rf_types.h"
+#include "rf_archs.h"
+#include "rf_raid.h"
+#include "rf_configure.h"
+#include "rf_dag.h"
+#include "rf_desc.h"
+#include "rf_decluster.h"
+#include "rf_pq.h"
+#include "rf_declusterPQ.h"
+#include "rf_raid0.h"
+#include "rf_raid1.h"
+#include "rf_raid4.h"
+#include "rf_raid5.h"
+#include "rf_states.h"
+#if RF_INCLUDE_RAID5_RS > 0
+#include "rf_raid5_rotatedspare.h"
+#endif /* RF_INCLUDE_RAID5_RS > 0 */
+#if RF_INCLUDE_CHAINDECLUSTER > 0
+#include "rf_chaindecluster.h"
+#endif /* RF_INCLUDE_CHAINDECLUSTER > 0 */
+#if RF_INCLUDE_INTERDECLUSTER > 0
+#include "rf_interdecluster.h"
+#endif /* RF_INCLUDE_INTERDECLUSTER > 0 */
+#if RF_INCLUDE_PARITYLOGGING > 0
+#include "rf_paritylogging.h"
+#endif /* RF_INCLUDE_PARITYLOGGING > 0 */
+#if RF_INCLUDE_EVENODD > 0
+#include "rf_evenodd.h"
+#endif /* RF_INCLUDE_EVENODD > 0 */
+#include "rf_general.h"
+#include "rf_driver.h"
+#include "rf_parityscan.h"
+#include "rf_reconbuffer.h"
+#include "rf_reconutil.h"
+
+/***********************************************************************
+ *
+ * the layout switch defines all the layouts that are supported.
+ * fields are: layout ID, init routine, shutdown routine, map
+ * sector, map parity, identify stripe, dag selection, map stripeid
+ * to parity stripe id (optional), num faults tolerated, special
+ * flags.
+ *
+ ***********************************************************************/
+
+static RF_AccessState_t DefaultStates[] = {rf_QuiesceState,
+ rf_IncrAccessesCountState, rf_MapState, rf_LockState, rf_CreateDAGState,
+ rf_ExecuteDAGState, rf_ProcessDAGState, rf_DecrAccessesCountState,
+ rf_CleanupState, rf_LastState};
+
+#if (defined(__NetBSD__) || defined(__OpenBSD__)) && !defined(_KERNEL)
+/* XXX Gross hack to shut up gcc -- it complains that DefaultStates is not
+used when compiling this in userland. I hate to burst its bubble, but
+DefaultStates is used all over the place here in the initialization of
+lots of data structures. GO */
+RF_AccessState_t *NothingAtAll = DefaultStates;
+#endif
+
+#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL)
+/* XXX Remove static so GCC doesn't complain about these being unused! */
+int distSpareYes = 1;
+int distSpareNo = 0;
+#else
+static int distSpareYes = 1;
+static int distSpareNo = 0;
+#endif
+#ifdef KERNEL
+#define RF_NK2(a,b)
+#else /* KERNEL */
+#define RF_NK2(a,b) a,b,
+#endif /* KERNEL */
+
+#if RF_UTILITY > 0
+#define RF_NU(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p)
+#else /* RF_UTILITY > 0 */
+#define RF_NU(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p
+#endif /* RF_UTILITY > 0 */
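For readability of the table that follows: these two macros trim each mapsw[] entry to match the conditionally compiled fields of RF_LayoutSW_t (declared in rf_layout.h further below). Spelled out as an illustration only (this comment block is not part of the source):

/*
 * Kernel build (KERNEL defined, RF_UTILITY == 0):
 *     RF_NK2(parse_fn, parse_arg)  -> (nothing; no config-file parsing hooks)
 *     RF_NU(f1, ..., f16)          -> f1, ..., f16
 * User-land build (KERNEL not defined, RF_UTILITY == 0):
 *     RF_NK2(parse_fn, parse_arg)  -> parse_fn, parse_arg,
 *     RF_NU(f1, ..., f16)          -> f1, ..., f16
 * Utility build (RF_UTILITY > 0):
 *     RF_NU(f1, ..., f16)          -> (nothing)
 */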
+
+static RF_LayoutSW_t mapsw[] = {
+ /* parity declustering */
+ {'T', "Parity declustering",
+ RF_NK2(rf_MakeLayoutSpecificDeclustered, &distSpareNo)
+ RF_NU(
+ rf_ConfigureDeclustered,
+ rf_MapSectorDeclustered, rf_MapParityDeclustered, NULL,
+ rf_IdentifyStripeDeclustered,
+ rf_RaidFiveDagSelect,
+ rf_MapSIDToPSIDDeclustered,
+ rf_GetDefaultHeadSepLimitDeclustered,
+ rf_GetDefaultNumFloatingReconBuffersDeclustered,
+ NULL, NULL,
+ rf_SubmitReconBufferBasic,
+ rf_VerifyParityBasic,
+ 1,
+ DefaultStates,
+ 0)
+ },
+
+ /* parity declustering with distributed sparing */
+ {'D', "Distributed sparing parity declustering",
+ RF_NK2(rf_MakeLayoutSpecificDeclustered, &distSpareYes)
+ RF_NU(
+ rf_ConfigureDeclusteredDS,
+ rf_MapSectorDeclustered, rf_MapParityDeclustered, NULL,
+ rf_IdentifyStripeDeclustered,
+ rf_RaidFiveDagSelect,
+ rf_MapSIDToPSIDDeclustered,
+ rf_GetDefaultHeadSepLimitDeclustered,
+ rf_GetDefaultNumFloatingReconBuffersDeclustered,
+ rf_GetNumSpareRUsDeclustered, rf_InstallSpareTable,
+ rf_SubmitReconBufferBasic,
+ rf_VerifyParityBasic,
+ 1,
+ DefaultStates,
+ RF_DISTRIBUTE_SPARE|RF_BD_DECLUSTERED)
+ },
+
+#if RF_INCLUDE_DECL_PQ > 0
+ /* declustered P+Q */
+ {'Q', "Declustered P+Q",
+ RF_NK2(rf_MakeLayoutSpecificDeclustered, &distSpareNo)
+ RF_NU(
+ rf_ConfigureDeclusteredPQ,
+ rf_MapSectorDeclusteredPQ, rf_MapParityDeclusteredPQ, rf_MapQDeclusteredPQ,
+ rf_IdentifyStripeDeclusteredPQ,
+ rf_PQDagSelect,
+ rf_MapSIDToPSIDDeclustered,
+ rf_GetDefaultHeadSepLimitDeclustered,
+ rf_GetDefaultNumFloatingReconBuffersPQ,
+ NULL, NULL,
+ NULL,
+ rf_VerifyParityBasic,
+ 2,
+ DefaultStates,
+ 0)
+ },
+#endif /* RF_INCLUDE_DECL_PQ > 0 */
+
+#if RF_INCLUDE_RAID5_RS > 0
+ /* RAID 5 with rotated sparing */
+ {'R', "RAID Level 5 rotated sparing",
+ RF_NK2(rf_MakeLayoutSpecificNULL, NULL)
+ RF_NU(
+ rf_ConfigureRAID5_RS,
+ rf_MapSectorRAID5_RS, rf_MapParityRAID5_RS, NULL,
+ rf_IdentifyStripeRAID5_RS,
+ rf_RaidFiveDagSelect,
+ rf_MapSIDToPSIDRAID5_RS,
+ rf_GetDefaultHeadSepLimitRAID5,
+ rf_GetDefaultNumFloatingReconBuffersRAID5,
+ rf_GetNumSpareRUsRAID5_RS, NULL,
+ rf_SubmitReconBufferBasic,
+ rf_VerifyParityBasic,
+ 1,
+ DefaultStates,
+ RF_DISTRIBUTE_SPARE)
+ },
+#endif /* RF_INCLUDE_RAID5_RS > 0 */
+
+#if RF_INCLUDE_CHAINDECLUSTER > 0
+ /* Chained Declustering */
+ {'C', "Chained Declustering",
+ RF_NK2(rf_MakeLayoutSpecificNULL, NULL)
+ RF_NU(
+ rf_ConfigureChainDecluster,
+ rf_MapSectorChainDecluster, rf_MapParityChainDecluster, NULL,
+ rf_IdentifyStripeChainDecluster,
+ rf_RAIDCDagSelect,
+ rf_MapSIDToPSIDChainDecluster,
+ NULL,
+ NULL,
+ rf_GetNumSpareRUsChainDecluster, NULL,
+ rf_SubmitReconBufferBasic,
+ rf_VerifyParityBasic,
+ 1,
+ DefaultStates,
+ 0)
+ },
+#endif /* RF_INCLUDE_CHAINDECLUSTER > 0 */
+
+#if RF_INCLUDE_INTERDECLUSTER > 0
+ /* Interleaved Declustering */
+ {'I', "Interleaved Declustering",
+ RF_NK2(rf_MakeLayoutSpecificNULL, NULL)
+ RF_NU(
+ rf_ConfigureInterDecluster,
+ rf_MapSectorInterDecluster, rf_MapParityInterDecluster, NULL,
+ rf_IdentifyStripeInterDecluster,
+ rf_RAIDIDagSelect,
+ rf_MapSIDToPSIDInterDecluster,
+ rf_GetDefaultHeadSepLimitInterDecluster,
+ rf_GetDefaultNumFloatingReconBuffersInterDecluster,
+ rf_GetNumSpareRUsInterDecluster, NULL,
+ rf_SubmitReconBufferBasic,
+ rf_VerifyParityBasic,
+ 1,
+ DefaultStates,
+ RF_DISTRIBUTE_SPARE)
+ },
+#endif /* RF_INCLUDE_INTERDECLUSTER > 0 */
+
+#if RF_INCLUDE_RAID0 > 0
+ /* RAID level 0 */
+ {'0', "RAID Level 0",
+ RF_NK2(rf_MakeLayoutSpecificNULL, NULL)
+ RF_NU(
+ rf_ConfigureRAID0,
+ rf_MapSectorRAID0, rf_MapParityRAID0, NULL,
+ rf_IdentifyStripeRAID0,
+ rf_RAID0DagSelect,
+ rf_MapSIDToPSIDRAID0,
+ NULL,
+ NULL,
+ NULL, NULL,
+ NULL,
+ rf_VerifyParityRAID0,
+ 0,
+ DefaultStates,
+ 0)
+ },
+#endif /* RF_INCLUDE_RAID0 > 0 */
+
+#if RF_INCLUDE_RAID1 > 0
+ /* RAID level 1 */
+ {'1', "RAID Level 1",
+ RF_NK2(rf_MakeLayoutSpecificNULL, NULL)
+ RF_NU(
+ rf_ConfigureRAID1,
+ rf_MapSectorRAID1, rf_MapParityRAID1, NULL,
+ rf_IdentifyStripeRAID1,
+ rf_RAID1DagSelect,
+ rf_MapSIDToPSIDRAID1,
+ NULL,
+ NULL,
+ NULL, NULL,
+ rf_SubmitReconBufferRAID1,
+ rf_VerifyParityRAID1,
+ 1,
+ DefaultStates,
+ 0)
+ },
+#endif /* RF_INCLUDE_RAID1 > 0 */
+
+#if RF_INCLUDE_RAID4 > 0
+ /* RAID level 4 */
+ {'4', "RAID Level 4",
+ RF_NK2(rf_MakeLayoutSpecificNULL, NULL)
+ RF_NU(
+ rf_ConfigureRAID4,
+ rf_MapSectorRAID4, rf_MapParityRAID4, NULL,
+ rf_IdentifyStripeRAID4,
+ rf_RaidFiveDagSelect,
+ rf_MapSIDToPSIDRAID4,
+ rf_GetDefaultHeadSepLimitRAID4,
+ rf_GetDefaultNumFloatingReconBuffersRAID4,
+ NULL, NULL,
+ rf_SubmitReconBufferBasic,
+ rf_VerifyParityBasic,
+ 1,
+ DefaultStates,
+ 0)
+ },
+#endif /* RF_INCLUDE_RAID4 > 0 */
+
+#if RF_INCLUDE_RAID5 > 0
+ /* RAID level 5 */
+ {'5', "RAID Level 5",
+ RF_NK2(rf_MakeLayoutSpecificNULL, NULL)
+ RF_NU(
+ rf_ConfigureRAID5,
+ rf_MapSectorRAID5, rf_MapParityRAID5, NULL,
+ rf_IdentifyStripeRAID5,
+ rf_RaidFiveDagSelect,
+ rf_MapSIDToPSIDRAID5,
+ rf_GetDefaultHeadSepLimitRAID5,
+ rf_GetDefaultNumFloatingReconBuffersRAID5,
+ NULL, NULL,
+ rf_SubmitReconBufferBasic,
+ rf_VerifyParityBasic,
+ 1,
+ DefaultStates,
+ 0)
+ },
+#endif /* RF_INCLUDE_RAID5 > 0 */
+
+#if RF_INCLUDE_EVENODD > 0
+ /* Evenodd */
+ {'E', "EvenOdd",
+ RF_NK2(rf_MakeLayoutSpecificNULL, NULL)
+ RF_NU(
+ rf_ConfigureEvenOdd,
+ rf_MapSectorRAID5, rf_MapParityEvenOdd, rf_MapEEvenOdd,
+ rf_IdentifyStripeEvenOdd,
+ rf_EODagSelect,
+ rf_MapSIDToPSIDRAID5,
+ NULL,
+ NULL,
+ NULL, NULL,
+ NULL, /* no reconstruction, yet */
+ rf_VerifyParityEvenOdd,
+ 2,
+ DefaultStates,
+ 0)
+ },
+#endif /* RF_INCLUDE_EVENODD > 0 */
+
+#if RF_INCLUDE_EVENODD > 0
+ /* Declustered Evenodd */
+ {'e', "Declustered EvenOdd",
+ RF_NK2(rf_MakeLayoutSpecificDeclustered, &distSpareNo)
+ RF_NU(
+ rf_ConfigureDeclusteredPQ,
+ rf_MapSectorDeclusteredPQ, rf_MapParityDeclusteredPQ, rf_MapQDeclusteredPQ,
+ rf_IdentifyStripeDeclusteredPQ,
+ rf_EODagSelect,
+ rf_MapSIDToPSIDRAID5,
+ rf_GetDefaultHeadSepLimitDeclustered,
+ rf_GetDefaultNumFloatingReconBuffersPQ,
+ NULL, NULL,
+ NULL, /* no reconstruction, yet */
+ rf_VerifyParityEvenOdd,
+ 2,
+ DefaultStates,
+ 0)
+ },
+#endif /* RF_INCLUDE_EVENODD > 0 */
+
+#if RF_INCLUDE_PARITYLOGGING > 0
+ /* parity logging */
+ {'L', "Parity logging",
+ RF_NK2(rf_MakeLayoutSpecificNULL, NULL)
+ RF_NU(
+ rf_ConfigureParityLogging,
+ rf_MapSectorParityLogging, rf_MapParityParityLogging, NULL,
+ rf_IdentifyStripeParityLogging,
+ rf_ParityLoggingDagSelect,
+ rf_MapSIDToPSIDParityLogging,
+ rf_GetDefaultHeadSepLimitParityLogging,
+ rf_GetDefaultNumFloatingReconBuffersParityLogging,
+ NULL, NULL,
+ rf_SubmitReconBufferBasic,
+ NULL,
+ 1,
+ DefaultStates,
+ 0)
+ },
+#endif /* RF_INCLUDE_PARITYLOGGING > 0 */
+
+ /* end-of-list marker */
+ { '\0', NULL,
+ RF_NK2(NULL, NULL)
+ RF_NU(
+ NULL,
+ NULL, NULL, NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL, NULL,
+ NULL,
+ NULL,
+ 0,
+ NULL,
+ 0)
+ }
+};
+
+RF_LayoutSW_t *rf_GetLayout(RF_ParityConfig_t parityConfig)
+{
+ RF_LayoutSW_t *p;
+
+ /* look up the specific layout */
+ for (p=&mapsw[0]; p->parityConfig; p++)
+ if (p->parityConfig == parityConfig)
+ break;
+ if (!p->parityConfig)
+ return(NULL);
+ RF_ASSERT(p->parityConfig == parityConfig);
+ return(p);
+}
+
+#if RF_UTILITY == 0
+/*****************************************************************************************
+ *
+ * ConfigureLayout --
+ *
+ * read the configuration file and set up the RAID layout parameters. After reading
+ * common params, invokes the layout-specific configuration routine to finish
+ * the configuration.
+ *
+ ****************************************************************************************/
+int rf_ConfigureLayout(
+ RF_ShutdownList_t **listp,
+ RF_Raid_t *raidPtr,
+ RF_Config_t *cfgPtr)
+{
+ RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
+ RF_ParityConfig_t parityConfig;
+ RF_LayoutSW_t *p;
+ int retval;
+
+ layoutPtr->sectorsPerStripeUnit = cfgPtr->sectPerSU;
+ layoutPtr->SUsPerPU = cfgPtr->SUsPerPU;
+ layoutPtr->SUsPerRU = cfgPtr->SUsPerRU;
+ parityConfig = cfgPtr->parityConfig;
+
+ layoutPtr->stripeUnitsPerDisk = raidPtr->sectorsPerDisk / layoutPtr->sectorsPerStripeUnit;
+
+ p = rf_GetLayout(parityConfig);
+ if (p == NULL) {
+ RF_ERRORMSG1("Unknown parity configuration '%c'", parityConfig);
+ return(EINVAL);
+ }
+ RF_ASSERT(p->parityConfig == parityConfig);
+ layoutPtr->map = p;
+
+ /* initialize the specific layout */
+
+ retval = (p->Configure)(listp, raidPtr, cfgPtr);
+
+ if (retval)
+ return(retval);
+
+ layoutPtr->dataBytesPerStripe = layoutPtr->dataSectorsPerStripe << raidPtr->logBytesPerSector;
+ raidPtr->sectorsPerDisk = layoutPtr->stripeUnitsPerDisk * layoutPtr->sectorsPerStripeUnit;
+
+ if (rf_forceNumFloatingReconBufs >= 0) {
+ raidPtr->numFloatingReconBufs = rf_forceNumFloatingReconBufs;
+ }
+ else {
+ raidPtr->numFloatingReconBufs = rf_GetDefaultNumFloatingReconBuffers(raidPtr);
+ }
+
+ if (rf_forceHeadSepLimit >= 0) {
+ raidPtr->headSepLimit = rf_forceHeadSepLimit;
+ }
+ else {
+ raidPtr->headSepLimit = rf_GetDefaultHeadSepLimit(raidPtr);
+ }
+
+ printf("RAIDFRAME: Configure (%s): total number of sectors is %lu (%lu MB)\n",
+ layoutPtr->map->configName,
+ (unsigned long)raidPtr->totalSectors,
+ (unsigned long)(raidPtr->totalSectors / 1024 * (1<<raidPtr->logBytesPerSector) / 1024));
+ if (raidPtr->headSepLimit >= 0) {
+ printf("RAIDFRAME(%s): Using %ld floating recon bufs with head sep limit %ld\n",
+ layoutPtr->map->configName, (long)raidPtr->numFloatingReconBufs, (long)raidPtr->headSepLimit);
+ }
+ else {
+ printf("RAIDFRAME(%s): Using %ld floating recon bufs with no head sep limit\n",
+ layoutPtr->map->configName, (long)raidPtr->numFloatingReconBufs);
+ }
+
+ return(0);
+}
+
+/* typically there is a 1-1 mapping between stripes and parity stripes.
+ * however, the declustering code supports packing multiple stripes into
+ * a single parity stripe, so as to increase the size of the reconstruction
+ * unit without affecting the size of the stripe unit. This routine finds
+ * the parity stripe identifier associated with a stripe ID. There is also
+ * a RaidAddressToParityStripeID macro in layout.h
+ */
+RF_StripeNum_t rf_MapStripeIDToParityStripeID(layoutPtr, stripeID, which_ru)
+ RF_RaidLayout_t *layoutPtr;
+ RF_StripeNum_t stripeID;
+ RF_ReconUnitNum_t *which_ru;
+{
+ RF_StripeNum_t parityStripeID;
+
+ /* quick exit in the common case of SUsPerPU==1 */
+ if ((layoutPtr->SUsPerPU == 1) || !layoutPtr->map->MapSIDToPSID) {
+ *which_ru = 0;
+ return(stripeID);
+ }
+ else {
+ (layoutPtr->map->MapSIDToPSID)(layoutPtr, stripeID, &parityStripeID, which_ru);
+ }
+ return(parityStripeID);
+}
+#endif /* RF_UTILITY == 0 */
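To make the SUsPerPU packing above concrete, here is a hypothetical MapSIDToPSID-style routine that packs SUsPerPU stripes into each parity stripe. This is an illustration only; the real per-layout routines (e.g. rf_MapSIDToPSIDDeclustered) live in their own modules and may compute the mapping differently. With SUsPerPU = 4, stripes 0..3 would map to parity stripe 0 with reconstruction units 0..3, stripes 4..7 to parity stripe 1, and so on.

/* Hypothetical example, not RAIDframe code. */
static void
example_MapSIDToPSID(RF_RaidLayout_t *layoutPtr, RF_StripeNum_t stripeID,
    RF_StripeNum_t *psID, RF_ReconUnitNum_t *which_ru)
{
	*psID = stripeID / layoutPtr->SUsPerPU;
	*which_ru = stripeID % layoutPtr->SUsPerPU;
}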
diff --git a/sys/dev/raidframe/rf_layout.h b/sys/dev/raidframe/rf_layout.h
new file mode 100644
index 00000000000..4259947f67f
--- /dev/null
+++ b/sys/dev/raidframe/rf_layout.h
@@ -0,0 +1,493 @@
+/* $OpenBSD: rf_layout.h,v 1.1 1999/01/11 14:29:28 niklas Exp $ */
+/* $NetBSD: rf_layout.h,v 1.1 1998/11/13 04:20:30 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* rf_layout.h -- header file defining layout data structures
+ */
+
+/*
+ * :
+ * Log: rf_layout.h,v
+ * Revision 1.50 1996/11/05 21:10:40 jimz
+ * failed pda generalization
+ *
+ * Revision 1.49 1996/07/29 14:05:12 jimz
+ * fix numPUs/numRUs confusion (everything is now numRUs)
+ * clean up some commenting, return values
+ *
+ * Revision 1.48 1996/07/22 19:52:16 jimz
+ * switched node params to RF_DagParam_t, a union of
+ * a 64-bit int and a void *, for better portability
+ * attempted hpux port, but failed partway through for
+ * lack of a single C compiler capable of compiling all
+ * source files
+ *
+ * Revision 1.47 1996/07/18 22:57:14 jimz
+ * port simulator to AIX
+ *
+ * Revision 1.46 1996/07/13 00:00:59 jimz
+ * sanitized generalized reconstruction architecture
+ * cleaned up head sep, rbuf problems
+ *
+ * Revision 1.45 1996/07/11 19:08:00 jimz
+ * generalize reconstruction mechanism
+ * allow raid1 reconstructs via copyback (done with array
+ * quiesced, not online, therefore not disk-directed)
+ *
+ * Revision 1.44 1996/06/19 22:23:01 jimz
+ * parity verification is now a layout-configurable thing
+ * not all layouts currently support it (correctly, anyway)
+ *
+ * Revision 1.43 1996/06/19 17:53:48 jimz
+ * move GetNumSparePUs, InstallSpareTable ops into layout switch
+ *
+ * Revision 1.42 1996/06/19 14:56:48 jimz
+ * move layout-specific config parsing hooks into RF_LayoutSW_t
+ * table in rf_layout.c
+ *
+ * Revision 1.41 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.40 1996/06/07 22:26:27 jimz
+ * type-ify which_ru (RF_ReconUnitNum_t)
+ *
+ * Revision 1.39 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.38 1996/06/03 23:28:26 jimz
+ * more bugfixes
+ * check in tree to sync for IPDS runs with current bugfixes
+ * there still may be a problem with threads in the script test
+ * getting I/Os stuck- not trivially reproducible (runs ~50 times
+ * in a row without getting stuck)
+ *
+ * Revision 1.37 1996/05/31 22:26:54 jimz
+ * fix a lot of mapping problems, memory allocation problems
+ * found some weird lock issues, fixed 'em
+ * more code cleanup
+ *
+ * Revision 1.36 1996/05/30 11:29:41 jimz
+ * Numerous bug fixes. Stripe lock release code disagreed with the taking code
+ * about when stripes should be locked (I made it consistent: no parity, no lock)
+ * There was a lot of extra serialization of I/Os which I've removed- a lot of
+ * it was to calculate values for the cache code, which is no longer with us.
+ * More types, function, macro cleanup. Added code to properly quiesce the array
+ * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
+ * before. Fixed memory allocation, freeing bugs.
+ *
+ * Revision 1.35 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.34 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.33 1996/05/24 04:28:55 jimz
+ * release cleanup ckpt
+ *
+ * Revision 1.32 1996/05/24 01:59:45 jimz
+ * another checkpoint in code cleanup for release
+ * time to sync kernel tree
+ *
+ * Revision 1.31 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.30 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.29 1995/12/01 19:16:19 root
+ * added copyright info
+ *
+ * Revision 1.28 1995/11/28 21:26:49 amiri
+ * defined a declustering flag RF_BD_DECLUSTERED
+ *
+ * Revision 1.27 1995/11/17 19:00:59 wvcii
+ * created MapQ entry in switch table
+ * added prototyping to MapParity
+ *
+ * Revision 1.26 1995/11/07 15:40:27 wvcii
+ * changed prototype of SelectionFunc in mapsw
+ * function no longer returns numHdrSucc, numTermAnt
+ *
+ * Revision 1.25 1995/10/12 20:57:08 arw
+ * added lots of comments
+ *
+ * Revision 1.24 1995/10/12 16:04:08 jimz
+ * added config name to mapsw
+ *
+ * Revision 1.23 1995/07/26 03:28:31 robby
+ * intermediary checkin
+ *
+ * Revision 1.22 1995/07/10 20:51:08 robby
+ * added to the asm info for the virtual striping locks
+ *
+ * Revision 1.21 1995/07/10 16:57:47 robby
+ * updated alloclistelem struct to the correct struct name
+ *
+ * Revision 1.20 1995/07/08 20:06:11 rachad
+ * *** empty log message ***
+ *
+ * Revision 1.19 1995/07/08 18:05:39 rachad
+ * Linked up Claudsons code with the real cache
+ *
+ * Revision 1.18 1995/07/06 14:29:36 robby
+ * added defaults states list to the layout switch
+ *
+ * Revision 1.17 1995/06/23 13:40:14 robby
+ * updated to prototypes in rf_layout.h
+ *
+ * Revision 1.16 1995/06/08 22:11:03 holland
+ * bug fixes related to multiple-row arrays
+ *
+ * Revision 1.15 1995/05/24 21:43:23 wvcii
+ * added field numParityLogCol to RaidLayout
+ *
+ * Revision 1.14 95/05/02 22:46:53 holland
+ * minor code cleanups.
+ *
+ * Revision 1.13 1995/05/02 12:48:01 holland
+ * eliminated some unused code.
+ *
+ * Revision 1.12 1995/05/01 13:28:00 holland
+ * parity range locks, locking disk requests, recon+parityscan in kernel, etc.
+ *
+ * Revision 1.11 1995/03/15 20:01:17 holland
+ * added REMAP and DONT_REMAP
+ *
+ * Revision 1.10 1995/03/09 19:54:11 rachad
+ * Added support for threadless simulator
+ *
+ * Revision 1.9 1995/03/03 21:48:58 holland
+ * minor changes.
+ *
+ * Revision 1.8 1995/03/01 20:25:48 holland
+ * kernelization changes
+ *
+ * Revision 1.7 1995/02/03 22:31:36 holland
+ * many changes related to kernelization
+ *
+ * Revision 1.6 1995/01/30 14:53:46 holland
+ * extensive changes related to making DoIO non-blocking
+ *
+ * Revision 1.5 1995/01/24 23:58:46 holland
+ * multi-way recon XOR, plus various small changes
+ *
+ * Revision 1.4 1995/01/04 19:28:35 holland
+ * corrected comments around mapsw
+ *
+ * Revision 1.3 1994/11/28 22:15:45 danner
+ * Added type field to the physdiskaddr struct.
+ *
+ */
+
+#ifndef _RF__RF_LAYOUT_H_
+#define _RF__RF_LAYOUT_H_
+
+#include "rf_types.h"
+#include "rf_archs.h"
+#include "rf_alloclist.h"
+
+/*****************************************************************************************
+ *
+ * This structure identifies all layout-specific operations and parameters.
+ *
+ ****************************************************************************************/
+
+typedef struct RF_LayoutSW_s {
+ RF_ParityConfig_t parityConfig;
+ char *configName;
+
+#ifndef KERNEL
+ /* layout-specific parsing */
+ int (*MakeLayoutSpecific)(FILE *fp, RF_Config_t *cfgPtr, void *arg);
+ void *makeLayoutSpecificArg;
+#endif /* !KERNEL */
+
+#if RF_UTILITY == 0
+ /* initialization routine */
+ int (*Configure)(RF_ShutdownList_t **shutdownListp, RF_Raid_t *raidPtr, RF_Config_t *cfgPtr);
+
+ /* routine to map RAID sector address -> physical (row, col, offset) */
+ void (*MapSector)(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector,
+ RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap);
+
+ /* routine to map RAID sector address -> physical (r,c,o) of parity unit */
+ void (*MapParity)(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector,
+ RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap);
+
+ /* routine to map RAID sector address -> physical (r,c,o) of Q unit */
+ void (*MapQ)(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector, RF_RowCol_t *row,
+ RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap);
+
+ /* routine to identify the disks comprising a stripe */
+ void (*IdentifyStripe)(RF_Raid_t *raidPtr, RF_RaidAddr_t addr,
+ RF_RowCol_t **diskids, RF_RowCol_t *outRow);
+
+ /* routine to select a dag */
+ void (*SelectionFunc)(RF_Raid_t *raidPtr, RF_IoType_t type,
+ RF_AccessStripeMap_t *asmap,
+ RF_VoidFuncPtr *);
+#if 0
+ void (**createFunc)(RF_Raid_t *,
+ RF_AccessStripeMap_t *,
+ RF_DagHeader_t *, void *,
+ RF_RaidAccessFlags_t,
+ RF_AllocListElem_t *);
+
+#endif
+
+ /* map a stripe ID to a parity stripe ID. This is typically the identity mapping */
+ void (*MapSIDToPSID)(RF_RaidLayout_t *layoutPtr, RF_StripeNum_t stripeID,
+ RF_StripeNum_t *psID, RF_ReconUnitNum_t *which_ru);
+
+ /* get default head separation limit (may be NULL) */
+ RF_HeadSepLimit_t (*GetDefaultHeadSepLimit)(RF_Raid_t *raidPtr);
+
+ /* get default num recon buffers (may be NULL) */
+ int (*GetDefaultNumFloatingReconBuffers)(RF_Raid_t *raidPtr);
+
+ /* get number of spare recon units (may be NULL) */
+ RF_ReconUnitCount_t (*GetNumSpareRUs)(RF_Raid_t *raidPtr);
+
+ /* spare table installation (may be NULL) */
+ int (*InstallSpareTable)(RF_Raid_t *raidPtr, RF_RowCol_t frow, RF_RowCol_t fcol);
+
+ /* recon buffer submission function */
+ int (*SubmitReconBuffer)(RF_ReconBuffer_t *rbuf, int keep_it,
+ int use_committed);
+
+ /*
+ * verify that parity information for a stripe is correct
+ * see rf_parityscan.h for return vals
+ */
+ int (*VerifyParity)(RF_Raid_t *raidPtr, RF_RaidAddr_t raidAddr,
+ RF_PhysDiskAddr_t *parityPDA, int correct_it, RF_RaidAccessFlags_t flags);
+
+ /* number of faults tolerated by this mapping */
+ int faultsTolerated;
+
+ /* states to step through in an access. Must end with "LastState".
+ * The default is DefaultStates in rf_layout.c */
+ RF_AccessState_t *states;
+
+ RF_AccessStripeMapFlags_t flags;
+#endif /* RF_UTILITY == 0 */
+} RF_LayoutSW_t;
+
+/* enables remapping to spare location under dist sparing */
+#define RF_REMAP 1
+#define RF_DONT_REMAP 0
+
+/*
+ * Flags values for RF_AccessStripeMapFlags_t
+ */
+#define RF_NO_STRIPE_LOCKS 0x0001 /* suppress stripe locks */
+#define RF_DISTRIBUTE_SPARE 0x0002 /* distribute spare space in archs that support it */
+#define RF_BD_DECLUSTERED 0x0004 /* declustering uses block designs */
+
+/*************************************************************************
+ *
+ * this structure forms the layout component of the main Raid
+ * structure. It describes everything needed to define and perform
+ * the mapping of logical RAID addresses <-> physical disk addresses.
+ *
+ *************************************************************************/
+struct RF_RaidLayout_s {
+ /* configuration parameters */
+ RF_SectorCount_t sectorsPerStripeUnit; /* number of sectors in one stripe unit */
+ RF_StripeCount_t SUsPerPU; /* stripe units per parity unit */
+ RF_StripeCount_t SUsPerRU; /* stripe units per reconstruction unit */
+
+ /* redundant-but-useful info computed from the above, used in all layouts */
+ RF_StripeCount_t numStripe; /* total number of stripes in the array */
+ RF_SectorCount_t dataSectorsPerStripe;
+ RF_StripeCount_t dataStripeUnitsPerDisk;
+ u_int bytesPerStripeUnit;
+ u_int dataBytesPerStripe;
+ RF_StripeCount_t numDataCol; /* number of SUs of data per stripe (name here is a la RAID4) */
+ RF_StripeCount_t numParityCol; /* number of SUs of parity per stripe. Always 1 for now */
+ RF_StripeCount_t numParityLogCol; /* number of SUs of parity log per stripe. Always 1 for now */
+ RF_StripeCount_t stripeUnitsPerDisk;
+
+ RF_LayoutSW_t *map; /* ptr to struct holding mapping fns and information */
+ void *layoutSpecificInfo; /* ptr to a structure holding layout-specific params */
+};
+
+/*****************************************************************************************
+ *
+ * The mapping code returns a pointer to a list of AccessStripeMap structures, which
+ * describes all the mapping information about an access. The list contains one
+ * AccessStripeMap structure per stripe touched by the access. Each element in the list
+ * contains a stripe identifier and a pointer to a list of PhysDiskAddr structures. Each
+ * element in this latter list describes the physical location of a stripe unit accessed
+ * within the corresponding stripe.
+ *
+ ****************************************************************************************/
+
+#define RF_PDA_TYPE_DATA 0
+#define RF_PDA_TYPE_PARITY 1
+#define RF_PDA_TYPE_Q 2
+
+struct RF_PhysDiskAddr_s {
+ RF_RowCol_t row,col; /* disk identifier */
+ RF_SectorNum_t startSector; /* sector offset into the disk */
+ RF_SectorCount_t numSector; /* number of sectors accessed */
+ int type; /* used by higher levels: currently, data, parity, or q */
+ caddr_t bufPtr; /* pointer to buffer supplying/receiving data */
+ RF_RaidAddr_t raidAddress; /* raid address corresponding to this physical disk address */
+ RF_PhysDiskAddr_t *next;
+};
+
+#define RF_MAX_FAILED_PDA RF_MAXCOL
+
+struct RF_AccessStripeMap_s {
+ RF_StripeNum_t stripeID; /* the stripe index */
+ RF_RaidAddr_t raidAddress; /* the starting raid address within this stripe */
+ RF_RaidAddr_t endRaidAddress; /* raid address one sector past the end of the access */
+ RF_SectorCount_t totalSectorsAccessed; /* total num sectors identified in physInfo list */
+ RF_StripeCount_t numStripeUnitsAccessed; /* total num elements in physInfo list */
+ int numDataFailed; /* number of failed data disks accessed */
+ int numParityFailed; /* number of failed parity disks accessed (0 or 1) */
+ int numQFailed; /* number of failed Q units accessed (0 or 1) */
+ RF_AccessStripeMapFlags_t flags; /* various flags */
+#if 0
+ RF_PhysDiskAddr_t *failedPDA; /* points to the PDA that has failed */
+ RF_PhysDiskAddr_t *failedPDAtwo; /* points to the second PDA that has failed, if any */
+#else
+ int numFailedPDAs; /* number of failed phys addrs */
+ RF_PhysDiskAddr_t *failedPDAs[RF_MAX_FAILED_PDA]; /* array of failed phys addrs */
+#endif
+ RF_PhysDiskAddr_t *physInfo; /* a list of PhysDiskAddr structs */
+ RF_PhysDiskAddr_t *parityInfo; /* list of physical addrs for the parity (P of P + Q ) */
+ RF_PhysDiskAddr_t *qInfo; /* list of physical addrs for the Q of P + Q */
+ RF_LockReqDesc_t lockReqDesc; /* used for stripe locking */
+ RF_RowCol_t origRow; /* the original row: we may redirect the acc to a different row */
+ RF_AccessStripeMap_t *next;
+};
+
+/* flag values */
+#define RF_ASM_REDIR_LARGE_WRITE 0x00000001 /* allows large-write creation code to redirect failed accs */
+#define RF_ASM_BAILOUT_DAG_USED 0x00000002 /* allows us to detect recursive calls to the bailout write dag */
+#define RF_ASM_FLAGS_LOCK_TRIED 0x00000004 /* we've acquired the lock on the first parity range in this parity stripe */
+#define RF_ASM_FLAGS_LOCK_TRIED2 0x00000008 /* we've acquired the lock on the 2nd parity range in this parity stripe */
+#define RF_ASM_FLAGS_FORCE_TRIED 0x00000010 /* we've done the force-recon call on this parity stripe */
+#define RF_ASM_FLAGS_RECON_BLOCKED 0x00000020 /* we blocked recon => we must unblock it later */
+
+struct RF_AccessStripeMapHeader_s {
+ RF_StripeCount_t numStripes; /* total number of stripes touched by this acc */
+ RF_AccessStripeMap_t *stripeMap; /* pointer to the actual map. Also used for making lists */
+ RF_AccessStripeMapHeader_t *next;
+};
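As a reading aid for the three structures above, a minimal sketch (not from the source) of walking a complete mapping result; it touches only fields declared here and assumes a user-land context where printf is available.

/* Illustration only: print every physical disk address in a mapping. */
static void
example_walk_asm(RF_AccessStripeMapHeader_t *hdr)
{
	RF_AccessStripeMap_t *asmp;
	RF_PhysDiskAddr_t *pda;

	for (asmp = hdr->stripeMap; asmp; asmp = asmp->next)
		for (pda = asmp->physInfo; pda; pda = pda->next)
			printf("stripe %ld: row %d col %d, sectors %ld + %ld\n",
			    (long)asmp->stripeID, (int)pda->row, (int)pda->col,
			    (long)pda->startSector, (long)pda->numSector);
}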
+
+/*****************************************************************************************
+ *
+ * various routines mapping addresses in the RAID address space. These work across
+ * all layouts. DON'T PUT ANY LAYOUT-SPECIFIC CODE HERE.
+ *
+ ****************************************************************************************/
+
+/* return the identifier of the stripe containing the given address */
+#define rf_RaidAddressToStripeID(_layoutPtr_, _addr_) \
+ ( ((_addr_) / (_layoutPtr_)->sectorsPerStripeUnit) / (_layoutPtr_)->numDataCol )
+
+/* return the raid address of the start of the indicated stripe ID */
+#define rf_StripeIDToRaidAddress(_layoutPtr_, _sid_) \
+ ( ((_sid_) * (_layoutPtr_)->sectorsPerStripeUnit) * (_layoutPtr_)->numDataCol )
+
+/* return the identifier of the stripe containing the given stripe unit id */
+#define rf_StripeUnitIDToStripeID(_layoutPtr_, _addr_) \
+ ( (_addr_) / (_layoutPtr_)->numDataCol )
+
+/* return the identifier of the stripe unit containing the given address */
+#define rf_RaidAddressToStripeUnitID(_layoutPtr_, _addr_) \
+ ( ((_addr_) / (_layoutPtr_)->sectorsPerStripeUnit) )
+
+/* return the RAID address of next stripe boundary beyond the given address */
+#define rf_RaidAddressOfNextStripeBoundary(_layoutPtr_, _addr_) \
+ ( (((_addr_)/(_layoutPtr_)->dataSectorsPerStripe)+1) * (_layoutPtr_)->dataSectorsPerStripe )
+
+/* return the RAID address of the start of the stripe containing the given address */
+#define rf_RaidAddressOfPrevStripeBoundary(_layoutPtr_, _addr_) \
+ ( (((_addr_)/(_layoutPtr_)->dataSectorsPerStripe)+0) * (_layoutPtr_)->dataSectorsPerStripe )
+
+/* return the RAID address of next stripe unit boundary beyond the given address */
+#define rf_RaidAddressOfNextStripeUnitBoundary(_layoutPtr_, _addr_) \
+ ( (((_addr_)/(_layoutPtr_)->sectorsPerStripeUnit)+1L)*(_layoutPtr_)->sectorsPerStripeUnit )
+
+/* return the RAID address of the start of the stripe unit containing RAID address _addr_ */
+#define rf_RaidAddressOfPrevStripeUnitBoundary(_layoutPtr_, _addr_) \
+ ( (((_addr_)/(_layoutPtr_)->sectorsPerStripeUnit)+0)*(_layoutPtr_)->sectorsPerStripeUnit )
+
+/* returns the offset into the stripe. used by RaidAddressStripeAligned */
+#define rf_RaidAddressStripeOffset(_layoutPtr_, _addr_) \
+ ( (_addr_) % ((_layoutPtr_)->dataSectorsPerStripe) )
+
+/* returns the offset into the stripe unit. */
+#define rf_StripeUnitOffset(_layoutPtr_, _addr_) \
+ ( (_addr_) % ((_layoutPtr_)->sectorsPerStripeUnit) )
+
+/* returns nonzero if the given RAID address is stripe-aligned */
+#define rf_RaidAddressStripeAligned( __layoutPtr__, __addr__ ) \
+ ( rf_RaidAddressStripeOffset(__layoutPtr__, __addr__) == 0 )
+
+/* returns nonzero if the given address is stripe-unit aligned */
+#define rf_StripeUnitAligned( __layoutPtr__, __addr__ ) \
+ ( rf_StripeUnitOffset(__layoutPtr__, __addr__) == 0 )
+
+/* convert an address expressed in RAID blocks to/from an addr expressed in bytes */
+#define rf_RaidAddressToByte(_raidPtr_, _addr_) \
+ ( (_addr_) << ( (_raidPtr_)->logBytesPerSector ) )
+
+#define rf_ByteToRaidAddress(_raidPtr_, _addr_) \
+ ( (_addr_) >> ( (_raidPtr_)->logBytesPerSector ) )
+
+/* convert a raid address to/from a parity stripe ID. Conversion to raid address is easy,
+ * since we're asking for the address of the first sector in the parity stripe. Conversion to a
+ * parity stripe ID is more complex, since stripes are not contiguously allocated in
+ * parity stripes.
+ */
+#define rf_RaidAddressToParityStripeID(_layoutPtr_, _addr_, _ru_num_) \
+ rf_MapStripeIDToParityStripeID( (_layoutPtr_), rf_RaidAddressToStripeID( (_layoutPtr_), (_addr_) ), (_ru_num_) )
+
+#define rf_ParityStripeIDToRaidAddress(_layoutPtr_, _psid_) \
+ ( (_psid_) * (_layoutPtr_)->SUsPerPU * (_layoutPtr_)->numDataCol * (_layoutPtr_)->sectorsPerStripeUnit )
+
+RF_LayoutSW_t *rf_GetLayout(RF_ParityConfig_t parityConfig);
+int rf_ConfigureLayout(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr,
+ RF_Config_t *cfgPtr);
+RF_StripeNum_t rf_MapStripeIDToParityStripeID(RF_RaidLayout_t *layoutPtr,
+ RF_StripeNum_t stripeID, RF_ReconUnitNum_t *which_ru);
+
+#endif /* !_RF__RF_LAYOUT_H_ */
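A quick worked example of the address-mapping macros above, mirrored as plain arithmetic. The parameter values are hypothetical, and the sketch assumes dataSectorsPerStripe = sectorsPerStripeUnit * numDataCol, as the macros imply: with sectorsPerStripeUnit = 32 and numDataCol = 4, RAID address 200 falls in stripe unit 6, stripe 1, at offset 72 into the stripe, and the next stripe boundary is at 256.

#include <stdio.h>

/* Illustration only: hypothetical layout parameters, not from a real config. */
int
main(void)
{
	long sectorsPerStripeUnit = 32, numDataCol = 4;
	long dataSectorsPerStripe = sectorsPerStripeUnit * numDataCol;	/* 128 */
	long addr = 200;

	/* mirrors rf_RaidAddressToStripeUnitID, rf_RaidAddressToStripeID */
	printf("SU %ld, stripe %ld\n", addr / sectorsPerStripeUnit,
	    (addr / sectorsPerStripeUnit) / numDataCol);		/* 6, 1 */
	/* mirrors rf_RaidAddressStripeOffset, rf_RaidAddressOfNextStripeBoundary */
	printf("offset %ld, next boundary %ld\n", addr % dataSectorsPerStripe,
	    ((addr / dataSectorsPerStripe) + 1) * dataSectorsPerStripe);	/* 72, 256 */
	return (0);
}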
diff --git a/sys/dev/raidframe/rf_map.c b/sys/dev/raidframe/rf_map.c
new file mode 100644
index 00000000000..11a3262a3a8
--- /dev/null
+++ b/sys/dev/raidframe/rf_map.c
@@ -0,0 +1,976 @@
+/* $OpenBSD: rf_map.c,v 1.1 1999/01/11 14:29:28 niklas Exp $ */
+/* $NetBSD: rf_map.c,v 1.1 1998/11/13 04:20:31 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/**************************************************************************
+ *
+ * map.c -- main code for mapping RAID addresses to physical disk addresses
+ *
+ **************************************************************************/
+
+/*
+ * :
+ * Log: rf_map.c,v
+ * Revision 1.53 1996/11/05 21:10:40 jimz
+ * failed pda generalization
+ *
+ * Revision 1.52 1996/08/20 19:58:39 jimz
+ * initialize numParityFailed and numQFailed to 0 in MarkFailuresInASMList
+ *
+ * Revision 1.51 1996/08/19 22:26:31 jimz
+ * add Chang's bugfixes for double-disk failures in MarkFailuresInASMList
+ *
+ * Revision 1.50 1996/08/19 21:38:06 jimz
+ * stripeOffset was uninitialized in CheckStripeForFailures
+ *
+ * Revision 1.49 1996/07/31 15:34:56 jimz
+ * evenodd changes; bugfixes for double-degraded archs, generalize
+ * some formerly PQ-only functions
+ *
+ * Revision 1.48 1996/07/27 23:36:08 jimz
+ * Solaris port of simulator
+ *
+ * Revision 1.47 1996/07/22 19:52:16 jimz
+ * switched node params to RF_DagParam_t, a union of
+ * a 64-bit int and a void *, for better portability
+ * attempted hpux port, but failed partway through for
+ * lack of a single C compiler capable of compiling all
+ * source files
+ *
+ * Revision 1.46 1996/06/10 12:50:57 jimz
+ * Add counters to freelists to track number of allocations, frees,
+ * grows, max size, etc. Adjust a couple sets of PRIME params based
+ * on the results.
+ *
+ * Revision 1.45 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.44 1996/06/09 02:36:46 jimz
+ * lots of little crufty cleanup- fixup whitespace
+ * issues, comment #ifdefs, improve typing in some
+ * places (esp size-related)
+ *
+ * Revision 1.43 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.42 1996/06/05 18:06:02 jimz
+ * Major code cleanup. The Great Renaming is now done.
+ * Better modularity. Better typing. Fixed a bunch of
+ * synchronization bugs. Made a lot of global stuff
+ * per-desc or per-array. Removed dead code.
+ *
+ * Revision 1.41 1996/06/03 23:28:26 jimz
+ * more bugfixes
+ * check in tree to sync for IPDS runs with current bugfixes
+ * there still may be a problem with threads in the script test
+ * getting I/Os stuck- not trivially reproducible (runs ~50 times
+ * in a row without getting stuck)
+ *
+ * Revision 1.40 1996/05/31 22:26:54 jimz
+ * fix a lot of mapping problems, memory allocation problems
+ * found some weird lock issues, fixed 'em
+ * more code cleanup
+ *
+ * Revision 1.39 1996/05/30 23:22:16 jimz
+ * bugfixes of serialization, timing problems
+ * more cleanup
+ *
+ * Revision 1.38 1996/05/30 11:29:41 jimz
+ * Numerous bug fixes. Stripe lock release code disagreed with the taking code
+ * about when stripes should be locked (I made it consistent: no parity, no lock)
+ * There was a lot of extra serialization of I/Os which I've removed- a lot of
+ * it was to calculate values for the cache code, which is no longer with us.
+ * More types, function, macro cleanup. Added code to properly quiesce the array
+ * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
+ * before. Fixed memory allocation, freeing bugs.
+ *
+ * Revision 1.37 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.36 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.35 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.34 1996/05/20 16:14:45 jimz
+ * switch to rf_{mutex,cond}_{init,destroy}
+ *
+ * Revision 1.33 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.32 1996/05/17 00:51:47 jimz
+ * reformat for readability
+ *
+ * Revision 1.31 1996/05/16 23:06:26 jimz
+ * convert asmhdr to use RF_FREELIST stuff
+ *
+ * Revision 1.30 1996/05/16 19:09:42 jimz
+ * grow init asm freelist to 32
+ *
+ * Revision 1.29 1996/05/16 15:27:55 jimz
+ * prime freelist pumps for asm and pda lists
+ *
+ * Revision 1.28 1996/05/02 14:58:35 jimz
+ * legibility cleanup
+ *
+ * Revision 1.27 1995/12/12 18:10:06 jimz
+ * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT
+ * fix 80-column brain damage in comments
+ *
+ * Revision 1.26 1995/12/01 19:25:06 root
+ * added copyright info
+ *
+ * Revision 1.25 1995/11/17 19:01:57 wvcii
+ * added call to MapQ in two fault tolerant case
+ *
+ * Revision 1.24 1995/11/17 15:10:53 wvcii
+ * fixed bug in ASMCheckStatus - ASSERT was using disk sector addresses
+ * rather than raidAddress
+ *
+ * Revision 1.23 1995/07/26 03:26:51 robby
+ * map the allocation and freeing routines for some stuff non-static
+ *
+ * Revision 1.22 1995/06/28 09:33:45 holland
+ * bug fixes related to dist sparing and multiple-row arrays
+ *
+ * Revision 1.21 1995/06/28 04:51:08 holland
+ * added some asserts against zero-length accesses
+ *
+ * Revision 1.20 1995/06/23 13:40:06 robby
+ * updated to prototypes in rf_layout.h
+ *
+ */
+
+#include "rf_types.h"
+#include "rf_threadstuff.h"
+#include "rf_raid.h"
+#include "rf_general.h"
+#include "rf_map.h"
+#include "rf_freelist.h"
+#include "rf_shutdown.h"
+#include "rf_sys.h"
+
+static void rf_FreePDAList(RF_PhysDiskAddr_t *start, RF_PhysDiskAddr_t *end, int count);
+static void rf_FreeASMList(RF_AccessStripeMap_t *start, RF_AccessStripeMap_t *end,
+ int count);
+
+/*****************************************************************************************
+ *
+ * MapAccess -- main 1st order mapping routine.
+ *
+ * Maps an access in the RAID address space to the corresponding set of physical disk
+ * addresses. The result is returned as a list of AccessStripeMap structures, one per
+ * stripe accessed. Each ASM structure contains a pointer to a list of PhysDiskAddr
+ * structures, which describe the physical locations touched by the user access. Note
+ * that this routine returns only static mapping information, i.e. the list of physical
+ * addresses returned does not necessarily identify the set of physical locations that
+ * will actually be read or written.
+ *
+ * The routine also maps the parity. The physical disk location returned always
+ * indicates the entire parity unit, even when only a subset of it is being accessed.
+ * This is because an access that is not stripe unit aligned but that spans a stripe
+ * unit boundary may require access to two distinct portions of the parity unit, and we
+ * can't yet tell which portion(s) we'll actually need. We leave it up to the algorithm
+ * selection code to decide what subset of the parity unit to access.
+ *
+ * Note that addresses in the RAID address space must always be maintained as
+ * longs, instead of ints.
+ *
+ * This routine returns NULL if numBlocks is 0
+ *
+ ****************************************************************************************/
+
+RF_AccessStripeMapHeader_t *rf_MapAccess(raidPtr, raidAddress, numBlocks, buffer, remap)
+ RF_Raid_t *raidPtr;
+ RF_RaidAddr_t raidAddress; /* starting address in RAID address space */
+ RF_SectorCount_t numBlocks; /* number of blocks in RAID address space to access */
+ caddr_t buffer; /* buffer to supply/receive data */
+ int remap; /* 1 => remap addresses to spare space */
+{
+ RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
+ RF_AccessStripeMapHeader_t *asm_hdr = NULL;
+ RF_AccessStripeMap_t *asm_list = NULL, *asm_p = NULL;
+ int faultsTolerated = layoutPtr->map->faultsTolerated;
+ RF_RaidAddr_t startAddress = raidAddress; /* we'll change raidAddress along the way */
+ RF_RaidAddr_t endAddress = raidAddress + numBlocks;
+ RF_RaidDisk_t **disks = raidPtr->Disks;
+
+ RF_PhysDiskAddr_t *pda_p, *pda_q;
+ RF_StripeCount_t numStripes = 0;
+ RF_RaidAddr_t stripeRealEndAddress, stripeEndAddress, nextStripeUnitAddress;
+ RF_RaidAddr_t startAddrWithinStripe, lastRaidAddr;
+ RF_StripeCount_t totStripes;
+ RF_StripeNum_t stripeID, lastSID, SUID, lastSUID;
+ RF_AccessStripeMap_t *asmList, *t_asm;
+ RF_PhysDiskAddr_t *pdaList, *t_pda;
+
+ /* sanity-check the access range before allocating any ASMs or PDAs */
+ if (raidAddress + numBlocks > raidPtr->totalSectors) {
+ RF_ERRORMSG1("Unable to map access because offset (%d) was invalid\n",
+ (int)raidAddress);
+ return(NULL);
+ }
+
+ /* allocate all the ASMs and PDAs up front */
+ lastRaidAddr = raidAddress + numBlocks - 1;
+ stripeID = rf_RaidAddressToStripeID(layoutPtr, raidAddress);
+ lastSID = rf_RaidAddressToStripeID(layoutPtr, lastRaidAddr);
+ totStripes = lastSID - stripeID + 1;
+ SUID = rf_RaidAddressToStripeUnitID(layoutPtr, raidAddress);
+ lastSUID = rf_RaidAddressToStripeUnitID(layoutPtr, lastRaidAddr);
+
+ asmList = rf_AllocASMList(totStripes);
+ pdaList = rf_AllocPDAList(lastSUID - SUID + 1 + faultsTolerated * totStripes); /* may also need pda(s) per stripe for parity */
+
+ if (rf_mapDebug)
+ rf_PrintRaidAddressInfo(raidPtr, raidAddress, numBlocks);
+ for (; raidAddress < endAddress; ) {
+ /* make the next stripe structure */
+ RF_ASSERT(asmList);
+ t_asm = asmList;
+ asmList = asmList->next;
+ bzero((char *)t_asm, sizeof(RF_AccessStripeMap_t));
+ if (!asm_p)
+ asm_list = asm_p = t_asm;
+ else {
+ asm_p->next = t_asm;
+ asm_p = asm_p->next;
+ }
+ numStripes++;
+
+ /* map SUs from current location to the end of the stripe */
+ asm_p->stripeID = /*rf_RaidAddressToStripeID(layoutPtr, raidAddress)*/ stripeID++;
+ stripeRealEndAddress = rf_RaidAddressOfNextStripeBoundary(layoutPtr, raidAddress);
+ stripeEndAddress = RF_MIN(endAddress,stripeRealEndAddress );
+ asm_p->raidAddress = raidAddress;
+ asm_p->endRaidAddress = stripeEndAddress;
+
+ /* map each stripe unit in the stripe */
+ pda_p = NULL;
+ startAddrWithinStripe = raidAddress; /* Raid addr of start of portion of access that is within this stripe */
+ for (; raidAddress < stripeEndAddress; ) {
+ RF_ASSERT(pdaList);
+ t_pda = pdaList;
+ pdaList = pdaList->next;
+ bzero((char *)t_pda, sizeof(RF_PhysDiskAddr_t));
+ if (!pda_p)
+ asm_p->physInfo = pda_p = t_pda;
+ else {
+ pda_p->next = t_pda;
+ pda_p = pda_p->next;
+ }
+
+ pda_p->type = RF_PDA_TYPE_DATA;
+ (layoutPtr->map->MapSector)(raidPtr, raidAddress, &(pda_p->row), &(pda_p->col), &(pda_p->startSector), remap);
+
+ /* mark any failures we find. failedPDA is don't-care if there is more than one failure */
+ pda_p->raidAddress = raidAddress; /* the RAID address corresponding to this physical disk address */
+ nextStripeUnitAddress = rf_RaidAddressOfNextStripeUnitBoundary(layoutPtr, raidAddress);
+ pda_p->numSector = RF_MIN(endAddress, nextStripeUnitAddress) - raidAddress;
+ RF_ASSERT(pda_p->numSector != 0);
+ rf_ASMCheckStatus(raidPtr,pda_p,asm_p,disks,0);
+ pda_p->bufPtr = buffer + rf_RaidAddressToByte(raidPtr, (raidAddress - startAddress));
+ asm_p->totalSectorsAccessed += pda_p->numSector;
+ asm_p->numStripeUnitsAccessed++;
+ asm_p->origRow = pda_p->row; /* redundant but harmless to do this in every loop iteration */
+
+ raidAddress = RF_MIN(endAddress, nextStripeUnitAddress);
+ }
+
+ /* Map the parity. At this stage, the startSector and numSector fields
+ * for the parity unit are always set to indicate the entire parity unit.
+ * We may modify this after mapping the data portion.
+ */
+ switch (faultsTolerated)
+ {
+ case 0:
+ break;
+ case 1: /* single fault tolerant */
+ RF_ASSERT(pdaList);
+ t_pda = pdaList;
+ pdaList = pdaList->next;
+ bzero((char *)t_pda, sizeof(RF_PhysDiskAddr_t));
+ pda_p = asm_p->parityInfo = t_pda;
+ pda_p->type = RF_PDA_TYPE_PARITY;
+ (layoutPtr->map->MapParity)(raidPtr, rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, startAddrWithinStripe),
+ &(pda_p->row), &(pda_p->col), &(pda_p->startSector), remap);
+ pda_p->numSector = layoutPtr->sectorsPerStripeUnit;
+ /* raidAddr may be needed to find unit to redirect to */
+ pda_p->raidAddress = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, startAddrWithinStripe);
+ rf_ASMCheckStatus(raidPtr,pda_p,asm_p,disks,1);
+ rf_ASMParityAdjust(asm_p->parityInfo,startAddrWithinStripe,endAddress,layoutPtr,asm_p);
+
+ break;
+ case 2: /* two fault tolerant */
+ RF_ASSERT(pdaList && pdaList->next);
+ t_pda = pdaList;
+ pdaList = pdaList->next;
+ bzero((char *)t_pda, sizeof(RF_PhysDiskAddr_t));
+ pda_p = asm_p->parityInfo = t_pda;
+ pda_p->type = RF_PDA_TYPE_PARITY;
+ t_pda = pdaList;
+ pdaList = pdaList->next;
+ bzero((char *)t_pda, sizeof(RF_PhysDiskAddr_t));
+ pda_q = asm_p->qInfo = t_pda;
+ pda_q->type = RF_PDA_TYPE_Q;
+ (layoutPtr->map->MapParity)(raidPtr, rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, startAddrWithinStripe),
+ &(pda_p->row), &(pda_p->col), &(pda_p->startSector), remap);
+ (layoutPtr->map->MapQ)(raidPtr, rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, startAddrWithinStripe),
+ &(pda_q->row), &(pda_q->col), &(pda_q->startSector), remap);
+ pda_q->numSector = pda_p->numSector = layoutPtr->sectorsPerStripeUnit;
+ /* raidAddr may be needed to find unit to redirect to */
+ pda_p->raidAddress = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, startAddrWithinStripe);
+ pda_q->raidAddress = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, startAddrWithinStripe);
+ /* failure mode stuff */
+ rf_ASMCheckStatus(raidPtr,pda_p,asm_p,disks,1);
+ rf_ASMCheckStatus(raidPtr,pda_q,asm_p,disks,1);
+ rf_ASMParityAdjust(asm_p->parityInfo,startAddrWithinStripe,endAddress,layoutPtr,asm_p);
+ rf_ASMParityAdjust(asm_p->qInfo,startAddrWithinStripe,endAddress,layoutPtr,asm_p);
+ break;
+ }
+ }
+ RF_ASSERT(asmList == NULL && pdaList == NULL);
+ /* make the header structure */
+ asm_hdr = rf_AllocAccessStripeMapHeader();
+ RF_ASSERT(numStripes == totStripes);
+ asm_hdr->numStripes = numStripes;
+ asm_hdr->stripeMap = asm_list;
+
+ if (rf_mapDebug)
+ rf_PrintAccessStripeMap(asm_hdr);
+ return(asm_hdr);
+}
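+
+/* Illustrative sketch (not part of the original RAIDframe sources): a
+ * hypothetical caller that maps an access and walks the resulting ASM list,
+ * showing how the header, the per-stripe ASMs, and the per-stripe-unit PDAs
+ * produced by rf_MapAccess above hang together. It uses only routines and
+ * fields that appear in this file; the function name and the remap value of
+ * 0 (no redirection to spare space) are assumptions made for the example.
+ */
+#if 0
+static void rf_ExampleWalkMapping(RF_Raid_t *raidPtr, RF_RaidAddr_t addr,
+ RF_SectorCount_t nblk, caddr_t buf)
+{
+ RF_AccessStripeMapHeader_t *asm_h;
+ RF_AccessStripeMap_t *asmap;
+ RF_PhysDiskAddr_t *pda;
+
+ asm_h = rf_MapAccess(raidPtr, addr, nblk, buf, 0);
+ if (asm_h == NULL)
+ return;
+ /* one ASM per stripe touched, one data PDA per stripe unit touched */
+ for (asmap = asm_h->stripeMap; asmap; asmap = asmap->next)
+ for (pda = asmap->physInfo; pda; pda = pda->next)
+ printf("stripe %d: r%d c%d s%d-%d\n", (int)asmap->stripeID,
+ pda->row, pda->col, (int)pda->startSector,
+ (int)(pda->startSector + pda->numSector - 1));
+ rf_FreeAccessStripeMap(asm_h); /* frees the ASMs, the PDAs, and the header */
+}
+#endif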
+
+/*****************************************************************************************
+ * This routine walks through an ASM list and marks the PDAs that have failed.
+ * It's called only when a disk failure causes an in-flight DAG to fail.
+ * The parity may consist of two components, but we want to use only one failedPDA
+ * pointer. Thus we set failedPDA to point to the first parity component, and rely
+ * on the rest of the code to do the right thing with this.
+ ****************************************************************************************/
+
+void rf_MarkFailuresInASMList(raidPtr, asm_h)
+ RF_Raid_t *raidPtr;
+ RF_AccessStripeMapHeader_t *asm_h;
+{
+ RF_RaidDisk_t **disks = raidPtr->Disks;
+ RF_AccessStripeMap_t *asmap;
+ RF_PhysDiskAddr_t *pda;
+
+ for (asmap = asm_h->stripeMap; asmap; asmap = asmap->next) {
+ asmap->numDataFailed = asmap->numParityFailed = asmap->numQFailed = 0;
+ asmap->numFailedPDAs = 0;
+ bzero((char *)asmap->failedPDAs,
+ RF_MAX_FAILED_PDA*sizeof(RF_PhysDiskAddr_t *));
+ for (pda = asmap->physInfo; pda; pda=pda->next) {
+ if (RF_DEAD_DISK(disks[pda->row][pda->col].status)) {
+ printf("DEAD DISK BOGUSLY DETECTED!!\n");
+ asmap->numDataFailed++;
+ asmap->failedPDAs[asmap->numFailedPDAs] = pda;
+ asmap->numFailedPDAs++;
+ }
+ }
+ pda = asmap->parityInfo;
+ if (pda && RF_DEAD_DISK(disks[pda->row][pda->col].status)) {
+ asmap->numParityFailed++;
+ asmap->failedPDAs[asmap->numFailedPDAs] = pda;
+ asmap->numFailedPDAs++;
+ }
+ pda = asmap->qInfo;
+ if (pda && RF_DEAD_DISK(disks[pda->row][pda->col].status)) {
+ asmap->numQFailed++;
+ asmap->failedPDAs[asmap->numFailedPDAs] = pda;
+ asmap->numFailedPDAs++;
+ }
+ }
+}
+
+/*****************************************************************************************
+ *
+ * DuplicateASM -- duplicates an ASM and returns the new one
+ *
+ ****************************************************************************************/
+RF_AccessStripeMap_t *rf_DuplicateASM(asmap)
+ RF_AccessStripeMap_t *asmap;
+{
+ RF_AccessStripeMap_t *new_asm;
+ RF_PhysDiskAddr_t *pda, *new_pda, *t_pda;
+
+ new_pda = NULL;
+ new_asm = rf_AllocAccessStripeMapComponent();
+ bcopy((char *)asmap, (char *)new_asm, sizeof(RF_AccessStripeMap_t));
+ new_asm->numFailedPDAs = 0; /* ??? */
+ new_asm->failedPDAs[0] = NULL;
+ new_asm->physInfo = NULL;
+ new_asm->parityInfo = NULL;
+ new_asm->next = NULL;
+
+ for (pda = asmap->physInfo; pda; pda=pda->next) { /* copy the physInfo list */
+ t_pda = rf_AllocPhysDiskAddr();
+ bcopy((char *)pda, (char *)t_pda, sizeof(RF_PhysDiskAddr_t));
+ t_pda->next = NULL;
+ if (!new_asm->physInfo) {new_asm->physInfo = t_pda; new_pda = t_pda;}
+ else {new_pda->next = t_pda; new_pda = new_pda->next;}
+ if (pda == asmap->failedPDAs[0])
+ new_asm->failedPDAs[0] = t_pda;
+ }
+ for (pda = asmap->parityInfo; pda; pda=pda->next) { /* copy the parityInfo list */
+ t_pda = rf_AllocPhysDiskAddr();
+ bcopy((char *)pda, (char *)t_pda, sizeof(RF_PhysDiskAddr_t));
+ t_pda->next = NULL;
+ if (!new_asm->parityInfo) {new_asm->parityInfo = t_pda; new_pda = t_pda;}
+ else {new_pda->next = t_pda; new_pda = new_pda->next;}
+ if (pda == asmap->failedPDAs[0])
+ new_asm->failedPDAs[0] = t_pda;
+ }
+ return(new_asm);
+}
+
+/*****************************************************************************************
+ *
+ * DuplicatePDA -- duplicates a PDA and returns the new one
+ *
+ ****************************************************************************************/
+RF_PhysDiskAddr_t *rf_DuplicatePDA(pda)
+ RF_PhysDiskAddr_t *pda;
+{
+ RF_PhysDiskAddr_t *new;
+
+ new = rf_AllocPhysDiskAddr();
+ bcopy((char *)pda, (char *)new, sizeof(RF_PhysDiskAddr_t));
+ return(new);
+}
+
+/*****************************************************************************************
+ *
+ * routines to allocate and free list elements. All allocation routines zero the
+ * structure before returning it.
+ *
+ * rf_FreePhysDiskAddr is exported, but it should not normally be called
+ * directly, because rf_FreeAccessStripeMap takes care of freeing the PhysDiskAddr lists.
+ *
+ ****************************************************************************************/
+
+static RF_FreeList_t *rf_asmhdr_freelist;
+#define RF_MAX_FREE_ASMHDR 128
+#define RF_ASMHDR_INC 16
+#define RF_ASMHDR_INITIAL 32
+
+static RF_FreeList_t *rf_asm_freelist;
+#define RF_MAX_FREE_ASM 192
+#define RF_ASM_INC 24
+#define RF_ASM_INITIAL 64
+
+static RF_FreeList_t *rf_pda_freelist;
+#define RF_MAX_FREE_PDA 192
+#define RF_PDA_INC 24
+#define RF_PDA_INITIAL 64
+
+/* called at shutdown time. So far, all that is necessary is to release all the free lists */
+static void rf_ShutdownMapModule(void *);
+static void rf_ShutdownMapModule(ignored)
+ void *ignored;
+{
+ RF_FREELIST_DESTROY(rf_asmhdr_freelist,next,(RF_AccessStripeMapHeader_t *));
+ RF_FREELIST_DESTROY(rf_pda_freelist,next,(RF_PhysDiskAddr_t *));
+ RF_FREELIST_DESTROY(rf_asm_freelist,next,(RF_AccessStripeMap_t *));
+}
+
+int rf_ConfigureMapModule(listp)
+ RF_ShutdownList_t **listp;
+{
+ int rc;
+
+ RF_FREELIST_CREATE(rf_asmhdr_freelist, RF_MAX_FREE_ASMHDR,
+ RF_ASMHDR_INC, sizeof(RF_AccessStripeMapHeader_t));
+ if (rf_asmhdr_freelist == NULL) {
+ return(ENOMEM);
+ }
+ RF_FREELIST_CREATE(rf_asm_freelist, RF_MAX_FREE_ASM,
+ RF_ASM_INC, sizeof(RF_AccessStripeMap_t));
+ if (rf_asm_freelist == NULL) {
+ RF_FREELIST_DESTROY(rf_asmhdr_freelist,next,(RF_AccessStripeMapHeader_t *));
+ return(ENOMEM);
+ }
+ RF_FREELIST_CREATE(rf_pda_freelist, RF_MAX_FREE_PDA,
+ RF_PDA_INC, sizeof(RF_PhysDiskAddr_t));
+ if (rf_pda_freelist == NULL) {
+ RF_FREELIST_DESTROY(rf_asmhdr_freelist,next,(RF_AccessStripeMapHeader_t *));
+ RF_FREELIST_DESTROY(rf_asm_freelist,next,(RF_AccessStripeMap_t *));
+ return(ENOMEM);
+ }
+
+ rc = rf_ShutdownCreate(listp, rf_ShutdownMapModule, NULL);
+ if (rc) {
+ RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ rf_ShutdownMapModule(NULL);
+ return(rc);
+ }
+
+ RF_FREELIST_PRIME(rf_asmhdr_freelist, RF_ASMHDR_INITIAL,next,
+ (RF_AccessStripeMapHeader_t *));
+ RF_FREELIST_PRIME(rf_asm_freelist, RF_ASM_INITIAL,next,
+ (RF_AccessStripeMap_t *));
+ RF_FREELIST_PRIME(rf_pda_freelist, RF_PDA_INITIAL,next,
+ (RF_PhysDiskAddr_t *));
+
+ return(0);
+}
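+
+/* Illustrative sketch (not part of the original sources): the intended
+ * lifecycle of this module's freelists. rf_ConfigureMapModule is called once
+ * at configuration time with a shutdown list; the per-element allocators
+ * defined below then hand out zeroed structures from the primed freelists,
+ * and the shutdown entry registered above releases everything at teardown.
+ * The function name and the minimal error handling are assumptions made for
+ * the example.
+ */
+#if 0
+static int rf_ExampleMapModuleLifecycle(RF_ShutdownList_t **listp)
+{
+ RF_PhysDiskAddr_t *pda;
+ int rc;
+
+ rc = rf_ConfigureMapModule(listp); /* creates and primes the freelists */
+ if (rc)
+ return(rc);
+ pda = rf_AllocPhysDiskAddr(); /* zeroed element from rf_pda_freelist */
+ /* ... fill in row/col/startSector/numSector and use it ... */
+ rf_FreePhysDiskAddr(pda); /* back onto the freelist, not to the system */
+ /* rf_ShutdownMapModule runs later via the shutdown list */
+ return(0);
+}
+#endif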
+
+RF_AccessStripeMapHeader_t *rf_AllocAccessStripeMapHeader()
+{
+ RF_AccessStripeMapHeader_t *p;
+
+ RF_FREELIST_GET(rf_asmhdr_freelist,p,next,(RF_AccessStripeMapHeader_t *));
+ bzero((char *)p, sizeof(RF_AccessStripeMapHeader_t));
+
+ return(p);
+}
+
+
+void rf_FreeAccessStripeMapHeader(p)
+ RF_AccessStripeMapHeader_t *p;
+{
+ RF_FREELIST_FREE(rf_asmhdr_freelist,p,next);
+}
+
+RF_PhysDiskAddr_t *rf_AllocPhysDiskAddr()
+{
+ RF_PhysDiskAddr_t *p;
+
+ RF_FREELIST_GET(rf_pda_freelist,p,next,(RF_PhysDiskAddr_t *));
+ bzero((char *)p, sizeof(RF_PhysDiskAddr_t));
+
+ return(p);
+}
+
+/* allocates a list of PDAs, locking the free list only once.
+ * When we have to call calloc, we do it one component at a time to simplify
+ * the process of freeing the list at program shutdown. This should not be
+ * much of a performance hit, because it should be very infrequently executed.
+ */
+RF_PhysDiskAddr_t *rf_AllocPDAList(count)
+ int count;
+{
+ RF_PhysDiskAddr_t *p = NULL;
+
+ RF_FREELIST_GET_N(rf_pda_freelist,p,next,(RF_PhysDiskAddr_t *),count);
+ return(p);
+}
+
+void rf_FreePhysDiskAddr(p)
+ RF_PhysDiskAddr_t *p;
+{
+ RF_FREELIST_FREE(rf_pda_freelist,p,next);
+}
+
+static void rf_FreePDAList(l_start, l_end, count)
+ RF_PhysDiskAddr_t *l_start, *l_end; /* pointers to start and end of list */
+ int count; /* number of elements in list */
+{
+ RF_FREELIST_FREE_N(rf_pda_freelist,l_start,next,(RF_PhysDiskAddr_t *),count);
+}
+
+RF_AccessStripeMap_t *rf_AllocAccessStripeMapComponent()
+{
+ RF_AccessStripeMap_t *p;
+
+ RF_FREELIST_GET(rf_asm_freelist,p,next,(RF_AccessStripeMap_t *));
+ bzero((char *)p, sizeof(RF_AccessStripeMap_t));
+
+ return(p);
+}
+
+/* this is essentially identical to AllocPDAList. I should combine the two.
+ * when we have to call calloc, we do it one component at a time to simplify
+ * the process of freeing the list at program shutdown. This should not be
+ * much of a performance hit, because it should be very infrequently executed.
+ */
+RF_AccessStripeMap_t *rf_AllocASMList(count)
+ int count;
+{
+ RF_AccessStripeMap_t *p = NULL;
+
+ RF_FREELIST_GET_N(rf_asm_freelist,p,next,(RF_AccessStripeMap_t *),count);
+ return(p);
+}
+
+void rf_FreeAccessStripeMapComponent(p)
+ RF_AccessStripeMap_t *p;
+{
+ RF_FREELIST_FREE(rf_asm_freelist,p,next);
+}
+
+static void rf_FreeASMList(l_start, l_end, count)
+ RF_AccessStripeMap_t *l_start, *l_end;
+ int count;
+{
+ RF_FREELIST_FREE_N(rf_asm_freelist,l_start,next,(RF_AccessStripeMap_t *),count);
+}
+
+void rf_FreeAccessStripeMap(hdr)
+ RF_AccessStripeMapHeader_t *hdr;
+{
+ RF_AccessStripeMap_t *p, *pt = NULL;
+ RF_PhysDiskAddr_t *pdp, *trailer, *pdaList = NULL, *pdaEnd = NULL;
+ int count = 0, t, asm_count = 0;
+
+ for (p = hdr->stripeMap; p; p=p->next) {
+
+ /* link the 3 pda lists into the accumulating pda list */
+
+ if (!pdaList) pdaList = p->qInfo; else pdaEnd->next = p->qInfo;
+ for (trailer=NULL,pdp=p->qInfo; pdp; ) {trailer = pdp; pdp=pdp->next; count++;}
+ if (trailer) pdaEnd = trailer;
+
+ if (!pdaList) pdaList = p->parityInfo; else pdaEnd->next = p->parityInfo;
+ for (trailer=NULL,pdp=p->parityInfo; pdp; ) {trailer = pdp; pdp=pdp->next; count++;}
+ if (trailer) pdaEnd = trailer;
+
+ if (!pdaList) pdaList = p->physInfo; else pdaEnd->next = p->physInfo;
+ for (trailer=NULL,pdp=p->physInfo; pdp; ) {trailer = pdp; pdp=pdp->next; count++;}
+ if (trailer) pdaEnd = trailer;
+
+ pt = p;
+ asm_count++;
+ }
+
+ /* debug only */
+ for (t=0,pdp=pdaList; pdp; pdp=pdp->next)
+ t++;
+ RF_ASSERT(t == count);
+
+ if (pdaList)
+ rf_FreePDAList(pdaList, pdaEnd, count);
+ rf_FreeASMList(hdr->stripeMap, pt, asm_count);
+ rf_FreeAccessStripeMapHeader(hdr);
+}
+
+/* We can't use the large write optimization if there are any failures in the stripe.
+ * In the declustered layout, there is no way to immediately determine what disks
+ * constitute a stripe, so we actually have to hunt through the stripe looking for failures.
+ * The reason we map the parity instead of just using asm->parityInfo->col is because
+ * the latter may have been already redirected to a spare drive, which would
+ * mess up the computation of the stripe offset.
+ *
+ * ASSUMES AT MOST ONE FAILURE IN THE STRIPE.
+ */
+int rf_CheckStripeForFailures(raidPtr, asmap)
+ RF_Raid_t *raidPtr;
+ RF_AccessStripeMap_t *asmap;
+{
+ RF_RowCol_t trow, tcol, prow, pcol, *diskids, row, i;
+ RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
+ RF_StripeCount_t stripeOffset;
+ int numFailures;
+ RF_RaidAddr_t sosAddr;
+ RF_SectorNum_t diskOffset, poffset;
+ RF_RowCol_t testrow;
+
+ /* quick out in the fault-free case. */
+ RF_LOCK_MUTEX(raidPtr->mutex);
+ numFailures = raidPtr->numFailures;
+ RF_UNLOCK_MUTEX(raidPtr->mutex);
+ if (numFailures == 0) return(0);
+
+ sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
+ row = asmap->physInfo->row;
+ (layoutPtr->map->IdentifyStripe)(raidPtr, asmap->raidAddress, &diskids, &testrow);
+ (layoutPtr->map->MapParity)(raidPtr, asmap->raidAddress, &prow, &pcol, &poffset, 0); /* get pcol */
+
+ /* this need not be true if we've redirected the access to a spare in another row
+ RF_ASSERT(row == testrow);
+ */
+ stripeOffset = 0;
+ for (i=0; i<layoutPtr->numDataCol+layoutPtr->numParityCol; i++) {
+ if (diskids[i] != pcol) {
+ if (RF_DEAD_DISK(raidPtr->Disks[testrow][diskids[i]].status)) {
+ if (raidPtr->status[testrow] != rf_rs_reconstructing)
+ return(1);
+ RF_ASSERT(raidPtr->reconControl[testrow]->fcol == diskids[i]);
+ layoutPtr->map->MapSector(raidPtr,
+ sosAddr + stripeOffset * layoutPtr->sectorsPerStripeUnit,
+ &trow, &tcol, &diskOffset, 0);
+ RF_ASSERT( (trow == testrow) && (tcol == diskids[i]) );
+ if (!rf_CheckRUReconstructed(raidPtr->reconControl[testrow]->reconMap, diskOffset))
+ return(1);
+ asmap->flags |= RF_ASM_REDIR_LARGE_WRITE;
+ return(0);
+ }
+ stripeOffset++;
+ }
+ }
+ return(0);
+}
+
+/*
+ return the number of failed data units in the stripe.
+*/
+
+int rf_NumFailedDataUnitsInStripe(raidPtr, asmap)
+ RF_Raid_t *raidPtr;
+ RF_AccessStripeMap_t *asmap;
+{
+ RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
+ RF_RowCol_t trow, tcol, row, i;
+ RF_SectorNum_t diskOffset;
+ RF_RaidAddr_t sosAddr;
+ int numFailures;
+
+ /* quick out in the fault-free case. */
+ RF_LOCK_MUTEX(raidPtr->mutex);
+ numFailures = raidPtr->numFailures;
+ RF_UNLOCK_MUTEX(raidPtr->mutex);
+ if (numFailures == 0) return(0);
+ numFailures = 0;
+
+ sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
+ row = asmap->physInfo->row;
+ for (i=0; i<layoutPtr->numDataCol; i++)
+ {
+ (layoutPtr->map->MapSector)(raidPtr, sosAddr + i * layoutPtr->sectorsPerStripeUnit,
+ &trow, &tcol, &diskOffset, 0);
+ if (RF_DEAD_DISK(raidPtr->Disks[trow][tcol].status))
+ numFailures++;
+ }
+
+ return numFailures;
+}
+
+
+/*****************************************************************************************
+ *
+ * debug routines
+ *
+ ****************************************************************************************/
+
+void rf_PrintAccessStripeMap(asm_h)
+ RF_AccessStripeMapHeader_t *asm_h;
+{
+ rf_PrintFullAccessStripeMap(asm_h, 0);
+}
+
+void rf_PrintFullAccessStripeMap(asm_h, prbuf)
+ RF_AccessStripeMapHeader_t *asm_h;
+ int prbuf; /* flag to print buffer pointers */
+{
+ int i;
+ RF_AccessStripeMap_t *asmap = asm_h->stripeMap;
+ RF_PhysDiskAddr_t *p;
+ printf("%d stripes total\n", (int)asm_h->numStripes);
+ for (; asmap; asmap = asmap->next) {
+ /* printf("Num failures: %d\n",asmap->numDataFailed); */
+ /* printf("Num sectors: %d\n",(int)asmap->totalSectorsAccessed); */
+ printf("Stripe %d (%d sectors), failures: %d data, %d parity: ",
+ (int) asmap->stripeID,
+ (int) asmap->totalSectorsAccessed,
+ (int) asmap->numDataFailed,
+ (int) asmap->numParityFailed);
+ if (asmap->parityInfo) {
+ printf("Parity [r%d c%d s%d-%d", asmap->parityInfo->row, asmap->parityInfo->col,
+ (int)asmap->parityInfo->startSector,
+ (int)(asmap->parityInfo->startSector +
+ asmap->parityInfo->numSector - 1));
+ if (prbuf) printf(" b0x%lx",(unsigned long) asmap->parityInfo->bufPtr);
+ if (asmap->parityInfo->next) {
+ printf(", r%d c%d s%d-%d", asmap->parityInfo->next->row,
+ asmap->parityInfo->next->col,
+ (int) asmap->parityInfo->next->startSector,
+ (int)(asmap->parityInfo->next->startSector +
+ asmap->parityInfo->next->numSector - 1));
+ if (prbuf) printf(" b0x%lx",(unsigned long) asmap->parityInfo->next->bufPtr);
+ RF_ASSERT(asmap->parityInfo->next->next == NULL);
+ }
+ printf("]\n\t");
+ }
+ for (i=0,p=asmap->physInfo; p; p=p->next,i++) {
+ printf("SU r%d c%d s%d-%d ", p->row, p->col, (int)p->startSector,
+ (int)(p->startSector + p->numSector - 1));
+ if (prbuf) printf("b0x%lx ", (unsigned long) p->bufPtr);
+ if (i && !(i&1)) printf("\n\t");
+ }
+ printf("\n");
+ p = asmap->failedPDAs[0];
+ if (asmap->numDataFailed + asmap->numParityFailed > 1) printf("[multiple failures]\n");
+ else if (asmap->numDataFailed + asmap->numParityFailed > 0)
+ printf("\t[Failed PDA: r%d c%d s%d-%d]\n",p->row, p->col,
+ (int)p->startSector, (int)(p->startSector + p->numSector-1));
+ }
+}
+
+void rf_PrintRaidAddressInfo(raidPtr, raidAddr, numBlocks)
+ RF_Raid_t *raidPtr;
+ RF_RaidAddr_t raidAddr;
+ RF_SectorCount_t numBlocks;
+{
+ RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
+ RF_RaidAddr_t ra, sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, raidAddr);
+
+ printf("Raid addrs of SU boundaries from start of stripe to end of access:\n\t");
+ for (ra = sosAddr; ra <= raidAddr + numBlocks; ra += layoutPtr->sectorsPerStripeUnit) {
+ printf("%d (0x%x), ",(int)ra, (int)ra);
+ }
+ printf("\n");
+ printf("Offset into stripe unit: %d (0x%x)\n",
+ (int)(raidAddr % layoutPtr->sectorsPerStripeUnit),
+ (int)(raidAddr % layoutPtr->sectorsPerStripeUnit));
+}
+
+/*
+ given a parity descriptor and the starting address within a stripe,
+ range-restrict the parity descriptor so that it covers only the parity
+ that corresponds to the portion of the stripe actually accessed.
+*/
+void rf_ASMParityAdjust(
+ RF_PhysDiskAddr_t *toAdjust,
+ RF_StripeNum_t startAddrWithinStripe,
+ RF_SectorNum_t endAddress,
+ RF_RaidLayout_t *layoutPtr,
+ RF_AccessStripeMap_t *asm_p)
+{
+ RF_PhysDiskAddr_t *new_pda;
+
+ /* when we're accessing only a portion of one stripe unit, we want the parity descriptor
+ * to identify only the chunk of parity associated with the data. When the access spans
+ * exactly one stripe unit boundary and is less than a stripe unit in size, it uses two disjoint
+ * regions of the parity unit. When an access spans more than one stripe unit boundary, it
+ * uses all of the parity unit.
+ *
+ * To better handle the case where stripe units are small, we may eventually want to change
+ * the 2nd case so that if the SU size is below some threshold, we just read/write the whole
+ * thing instead of breaking it up into two accesses.
+ */
+ if (asm_p->numStripeUnitsAccessed == 1)
+ {
+ int x = (startAddrWithinStripe % layoutPtr->sectorsPerStripeUnit);
+ toAdjust->startSector += x;
+ toAdjust->raidAddress += x;
+ toAdjust->numSector = asm_p->physInfo->numSector;
+ RF_ASSERT(toAdjust->numSector != 0);
+ }
+ else
+ if (asm_p->numStripeUnitsAccessed == 2 && asm_p->totalSectorsAccessed < layoutPtr->sectorsPerStripeUnit)
+ {
+ int x = (startAddrWithinStripe % layoutPtr->sectorsPerStripeUnit);
+
+ /* create a second pda and copy the parity map info into it */
+ RF_ASSERT(toAdjust->next == NULL);
+ new_pda = toAdjust->next = rf_AllocPhysDiskAddr();
+ *new_pda = *toAdjust; /* structure assignment */
+ new_pda->next = NULL;
+
+ /* adjust the start sector & number of blocks for the first parity pda */
+ toAdjust->startSector += x;
+ toAdjust->raidAddress += x;
+ toAdjust->numSector = rf_RaidAddressOfNextStripeUnitBoundary(layoutPtr, startAddrWithinStripe) - startAddrWithinStripe;
+ RF_ASSERT(toAdjust->numSector != 0);
+
+ /* adjust the second pda */
+ new_pda->numSector = endAddress - rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, endAddress);
+ /*new_pda->raidAddress = rf_RaidAddressOfNextStripeUnitBoundary(layoutPtr, toAdjust->raidAddress);*/
+ RF_ASSERT(new_pda->numSector != 0);
+ }
+}
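+
+/* Worked example for the adjustment above (illustrative; assumes a layout
+ * with sectorsPerStripeUnit == 16). An access that starts at offset 12
+ * within its stripe unit and covers 8 sectors touches two stripe units but
+ * less than a full one in total, so the second case applies: the first
+ * parity pda is advanced by 12 and trimmed to 4 sectors (parity-unit
+ * sectors 12-15), while the cloned second pda keeps the original start and
+ * covers the remaining 4 sectors at the front of the parity unit
+ * (sectors 0-3).
+ */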
+
+/*
+ Check whether a disk has been spared or has failed. If it has been spared,
+ redirect the I/O to the spare.
+ If it has failed, record the failure in the access stripe map.
+ The last argument indicates whether the pda describes data (0) or parity (1).
+*/
+void rf_ASMCheckStatus(
+ RF_Raid_t *raidPtr,
+ RF_PhysDiskAddr_t *pda_p,
+ RF_AccessStripeMap_t *asm_p,
+ RF_RaidDisk_t **disks,
+ int parity)
+{
+ RF_DiskStatus_t dstatus;
+ RF_RowCol_t frow, fcol;
+
+ dstatus = disks[pda_p->row][pda_p->col].status;
+
+ if (dstatus == rf_ds_spared) {
+ /* if the disk has been spared, redirect access to the spare */
+ frow = pda_p->row; fcol = pda_p->col;
+ pda_p->row = disks[frow][fcol].spareRow;
+ pda_p->col = disks[frow][fcol].spareCol;
+ }
+ else if (dstatus == rf_ds_dist_spared) {
+ /* ditto if disk has been spared to dist spare space */
+ RF_RowCol_t or = pda_p->row, oc=pda_p->col;
+ RF_SectorNum_t oo = pda_p->startSector;
+
+ if (pda_p->type == RF_PDA_TYPE_DATA)
+ raidPtr->Layout.map->MapSector(raidPtr, pda_p->raidAddress, &pda_p->row, &pda_p->col, &pda_p->startSector, RF_REMAP);
+ else
+ raidPtr->Layout.map->MapParity(raidPtr, pda_p->raidAddress, &pda_p->row, &pda_p->col, &pda_p->startSector, RF_REMAP);
+
+ if (rf_mapDebug) {
+ printf("Redirected r %d c %d o %d -> r%d c %d o %d\n",or,oc,(int)oo,
+ pda_p->row,pda_p->col,(int)pda_p->startSector);
+ }
+ } else if (RF_DEAD_DISK(dstatus)) {
+ /* if the disk is inaccessible, mark the failure */
+ if (parity)
+ asm_p->numParityFailed++;
+ else {
+ asm_p->numDataFailed++;
+#if 0
+ /* XXX Do we really want this spewing out on the console? GO */
+ printf("DATA_FAILED!\n");
+#endif
+ }
+ asm_p->failedPDAs[asm_p->numFailedPDAs] = pda_p;
+ asm_p->numFailedPDAs++;
+#if 0
+ switch (asm_p->numParityFailed + asm_p->numDataFailed)
+ {
+ case 1:
+ asm_p->failedPDAs[0] = pda_p;
+ break;
+ case 2:
+ asm_p->failedPDAs[1] = pda_p;
+ default:
+ break;
+ }
+#endif
+ }
+ /* the redirected access should never span a stripe unit boundary */
+ RF_ASSERT(rf_RaidAddressToStripeUnitID(&raidPtr->Layout,pda_p->raidAddress) ==
+ rf_RaidAddressToStripeUnitID(&raidPtr->Layout,pda_p->raidAddress + pda_p->numSector -1));
+ RF_ASSERT(pda_p->col != -1);
+}
diff --git a/sys/dev/raidframe/rf_map.h b/sys/dev/raidframe/rf_map.h
new file mode 100644
index 00000000000..827de180b51
--- /dev/null
+++ b/sys/dev/raidframe/rf_map.h
@@ -0,0 +1,134 @@
+/* $OpenBSD: rf_map.h,v 1.1 1999/01/11 14:29:29 niklas Exp $ */
+/* $NetBSD: rf_map.h,v 1.1 1998/11/13 04:20:31 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* rf_map.h */
+
+/* :
+ * Log: rf_map.h,v
+ * Revision 1.9 1996/07/22 19:52:16 jimz
+ * switched node params to RF_DagParam_t, a union of
+ * a 64-bit int and a void *, for better portability
+ * attempted hpux port, but failed partway through for
+ * lack of a single C compiler capable of compiling all
+ * source files
+ *
+ * Revision 1.8 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.7 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.6 1996/05/31 22:26:54 jimz
+ * fix a lot of mapping problems, memory allocation problems
+ * found some weird lock issues, fixed 'em
+ * more code cleanup
+ *
+ * Revision 1.5 1996/05/30 11:29:41 jimz
+ * Numerous bug fixes. Stripe lock release code disagreed with the taking code
+ * about when stripes should be locked (I made it consistent: no parity, no lock)
+ * There was a lot of extra serialization of I/Os which I've removed- a lot of
+ * it was to calculate values for the cache code, which is no longer with us.
+ * More types, function, macro cleanup. Added code to properly quiesce the array
+ * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
+ * before. Fixed memory allocation, freeing bugs.
+ *
+ * Revision 1.4 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.3 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.2 1995/12/01 19:25:14 root
+ * added copyright info
+ *
+ */
+
+#ifndef _RF__RF_MAP_H_
+#define _RF__RF_MAP_H_
+
+#include "rf_types.h"
+#include "rf_alloclist.h"
+#include "rf_raid.h"
+
+/* mapping structure allocation and free routines */
+RF_AccessStripeMapHeader_t *rf_MapAccess(RF_Raid_t *raidPtr,
+ RF_RaidAddr_t raidAddress, RF_SectorCount_t numBlocks,
+ caddr_t buffer, int remap);
+
+void rf_MarkFailuresInASMList(RF_Raid_t *raidPtr,
+ RF_AccessStripeMapHeader_t *asm_h);
+
+RF_AccessStripeMap_t *rf_DuplicateASM(RF_AccessStripeMap_t *asmap);
+
+RF_PhysDiskAddr_t *rf_DuplicatePDA(RF_PhysDiskAddr_t *pda);
+
+int rf_ConfigureMapModule(RF_ShutdownList_t **listp);
+
+RF_AccessStripeMapHeader_t *rf_AllocAccessStripeMapHeader(void);
+
+void rf_FreeAccessStripeMapHeader(RF_AccessStripeMapHeader_t *p);
+
+RF_PhysDiskAddr_t *rf_AllocPhysDiskAddr(void);
+
+RF_PhysDiskAddr_t *rf_AllocPDAList(int count);
+
+void rf_FreePhysDiskAddr(RF_PhysDiskAddr_t *p);
+
+RF_AccessStripeMap_t *rf_AllocAccessStripeMapComponent(void);
+
+RF_AccessStripeMap_t *rf_AllocASMList(int count);
+
+void rf_FreeAccessStripeMapComponent(RF_AccessStripeMap_t *p);
+
+void rf_FreeAccessStripeMap(RF_AccessStripeMapHeader_t *hdr);
+
+int rf_CheckStripeForFailures(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap);
+
+int rf_NumFailedDataUnitsInStripe(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap);
+
+void rf_PrintAccessStripeMap(RF_AccessStripeMapHeader_t *asm_h);
+
+void rf_PrintFullAccessStripeMap(RF_AccessStripeMapHeader_t *asm_h, int prbuf);
+
+void rf_PrintRaidAddressInfo(RF_Raid_t *raidPtr, RF_RaidAddr_t raidAddr,
+ RF_SectorCount_t numBlocks);
+
+void rf_ASMParityAdjust(RF_PhysDiskAddr_t *toAdjust,
+ RF_StripeNum_t startAddrWithinStripe, RF_SectorNum_t endAddress,
+ RF_RaidLayout_t *layoutPtr, RF_AccessStripeMap_t *asm_p);
+
+void rf_ASMCheckStatus(RF_Raid_t *raidPtr, RF_PhysDiskAddr_t *pda_p,
+ RF_AccessStripeMap_t *asm_p, RF_RaidDisk_t **disks, int parity);
+
+#endif /* !_RF__RF_MAP_H_ */
diff --git a/sys/dev/raidframe/rf_mcpair.c b/sys/dev/raidframe/rf_mcpair.c
new file mode 100644
index 00000000000..4ed3a187b1c
--- /dev/null
+++ b/sys/dev/raidframe/rf_mcpair.c
@@ -0,0 +1,200 @@
+/* $OpenBSD: rf_mcpair.c,v 1.1 1999/01/11 14:29:29 niklas Exp $ */
+/* $NetBSD: rf_mcpair.c,v 1.1 1998/11/13 04:20:31 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* rf_mcpair.c
+ * an mcpair is a structure containing a mutex and a condition variable.
+ * it's used to block the current thread until some event occurs.
+ */
+
+/* :
+ * Log: rf_mcpair.c,v
+ * Revision 1.16 1996/06/19 22:23:01 jimz
+ * parity verification is now a layout-configurable thing
+ * not all layouts currently support it (correctly, anyway)
+ *
+ * Revision 1.15 1996/06/17 03:18:04 jimz
+ * include shutdown.h for macroized ShutdownCreate
+ *
+ * Revision 1.14 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.13 1996/06/05 18:06:02 jimz
+ * Major code cleanup. The Great Renaming is now done.
+ * Better modularity. Better typing. Fixed a bunch of
+ * synchronization bugs. Made a lot of global stuff
+ * per-desc or per-array. Removed dead code.
+ *
+ * Revision 1.12 1996/06/02 17:31:48 jimz
+ * Moved a lot of global stuff into array structure, where it belongs.
+ * Fixed up paritylogging, pss modules in this manner. Some general
+ * code cleanup. Removed lots of dead code, some dead files.
+ *
+ * Revision 1.11 1996/05/30 23:22:16 jimz
+ * bugfixes of serialization, timing problems
+ * more cleanup
+ *
+ * Revision 1.10 1996/05/20 16:15:22 jimz
+ * switch to rf_{mutex,cond}_{init,destroy}
+ *
+ * Revision 1.9 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.8 1996/05/16 16:04:42 jimz
+ * convert to return-val on FREELIST init
+ *
+ * Revision 1.7 1996/05/16 14:47:21 jimz
+ * rewrote to use RF_FREELIST
+ *
+ * Revision 1.6 1995/12/01 19:25:43 root
+ * added copyright info
+ *
+ */
+
+#include "rf_types.h"
+#include "rf_threadstuff.h"
+#include "rf_mcpair.h"
+#include "rf_debugMem.h"
+#include "rf_freelist.h"
+#include "rf_shutdown.h"
+
+#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL)
+#include <sys/proc.h>
+#endif
+
+static RF_FreeList_t *rf_mcpair_freelist;
+
+#define RF_MAX_FREE_MCPAIR 128
+#define RF_MCPAIR_INC 16
+#define RF_MCPAIR_INITIAL 24
+
+static int init_mcpair(RF_MCPair_t *);
+static void clean_mcpair(RF_MCPair_t *);
+static void rf_ShutdownMCPair(void *);
+
+
+
+static int init_mcpair(t)
+ RF_MCPair_t *t;
+{
+ int rc;
+
+ rc = rf_mutex_init(&t->mutex);
+ if (rc) {
+ RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ return(rc);
+ }
+ rc = rf_cond_init(&t->cond);
+ if (rc) {
+ RF_ERRORMSG3("Unable to init cond file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ rf_mutex_destroy(&t->mutex);
+ return(rc);
+ }
+ return(0);
+}
+
+static void clean_mcpair(t)
+ RF_MCPair_t *t;
+{
+ rf_mutex_destroy(&t->mutex);
+ rf_cond_destroy(&t->cond);
+}
+
+static void rf_ShutdownMCPair(ignored)
+ void *ignored;
+{
+ RF_FREELIST_DESTROY_CLEAN(rf_mcpair_freelist,next,(RF_MCPair_t *),clean_mcpair);
+}
+
+int rf_ConfigureMCPair(listp)
+ RF_ShutdownList_t **listp;
+{
+ int rc;
+
+ RF_FREELIST_CREATE(rf_mcpair_freelist, RF_MAX_FREE_MCPAIR,
+ RF_MCPAIR_INC, sizeof(RF_MCPair_t));
+ if (rf_mcpair_freelist == NULL) {
+ return(ENOMEM);
+ }
+ rc = rf_ShutdownCreate(listp, rf_ShutdownMCPair, NULL);
+ if (rc) {
+ RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n",
+ __FILE__, __LINE__, rc);
+ rf_ShutdownMCPair(NULL);
+ return(rc);
+ }
+ RF_FREELIST_PRIME_INIT(rf_mcpair_freelist, RF_MCPAIR_INITIAL,next,
+ (RF_MCPair_t *),init_mcpair);
+ return(0);
+}
+
+RF_MCPair_t *rf_AllocMCPair()
+{
+ RF_MCPair_t *t;
+
+ RF_FREELIST_GET_INIT(rf_mcpair_freelist,t,next,(RF_MCPair_t *),init_mcpair);
+ if (t) {
+ t->flag = 0;
+ t->next = NULL;
+ }
+ return(t);
+}
+
+void rf_FreeMCPair(t)
+ RF_MCPair_t *t;
+{
+ RF_FREELIST_FREE_CLEAN(rf_mcpair_freelist,t,next,clean_mcpair);
+}
+
+/* the callback function used to wake you up when you use an mcpair to wait for something */
+void rf_MCPairWakeupFunc(mcpair)
+ RF_MCPair_t *mcpair;
+{
+ RF_LOCK_MUTEX(mcpair->mutex);
+ mcpair->flag = 1;
+#if 0
+printf("MCPairWakeupFunc called!\n");
+#endif
+#ifdef KERNEL
+ wakeup(&(mcpair->flag)); /* XXX Does this do anything useful!! GO */
+ /*
+ * XXX Looks like the following is needed to truly get the
+ * functionality they were looking for here... This could be a
+ * side-effect of my using a tsleep in the Net- and OpenBSD port
+ * though... XXX
+ */
+#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL)
+ wakeup(&(mcpair->cond)); /* XXX XXX XXX GO */
+#endif
+#else /* KERNEL */
+ RF_SIGNAL_COND(mcpair->cond);
+#endif /* KERNEL */
+ RF_UNLOCK_MUTEX(mcpair->mutex);
+}
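+
+/* Illustrative sketch (not part of the original sources): the intended use
+ * of an mcpair. A caller allocates one with the flag cleared, hands
+ * rf_MCPairWakeupFunc to an asynchronous operation as its completion
+ * callback, and then waits on the flag via RF_WAIT_MCPAIR (defined in
+ * rf_mcpair.h). The function name, the way the asynchronous operation is
+ * started, and the exact locking around the wait loop (which depends on how
+ * RF_WAIT_MCPAIR is implemented on a given platform) are assumptions made
+ * for the example.
+ */
+#if 0
+static void rf_ExampleWaitOnMCPair(void)
+{
+ RF_MCPair_t *mcpair;
+
+ mcpair = rf_AllocMCPair(); /* returns with flag == 0 */
+ /* ... start an asynchronous operation whose completion callback is
+ * rf_MCPairWakeupFunc(mcpair) ... */
+ while (!mcpair->flag)
+ RF_WAIT_MCPAIR(mcpair);
+ rf_FreeMCPair(mcpair);
+}
+#endif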
diff --git a/sys/dev/raidframe/rf_mcpair.h b/sys/dev/raidframe/rf_mcpair.h
new file mode 100644
index 00000000000..852b85ad041
--- /dev/null
+++ b/sys/dev/raidframe/rf_mcpair.h
@@ -0,0 +1,62 @@
+/* $OpenBSD: rf_mcpair.h,v 1.1 1999/01/11 14:29:29 niklas Exp $ */
+/* $NetBSD: rf_mcpair.h,v 1.1 1998/11/13 04:20:31 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* rf_mcpair.h
+ * see comments in rf_mcpair.c
+ */
+
+#ifndef _RF__RF_MCPAIR_H_
+#define _RF__RF_MCPAIR_H_
+
+#include "rf_types.h"
+#include "rf_threadstuff.h"
+
+struct RF_MCPair_s {
+ RF_DECLARE_MUTEX(mutex)
+ RF_DECLARE_COND(cond)
+ int flag;
+ RF_MCPair_t *next;
+};
+
+#ifdef KERNEL
+#if !defined(__NetBSD__) && !defined(__OpenBSD__)
+#define RF_WAIT_MCPAIR(_mcp) mpsleep(&((_mcp)->flag), PZERO, "mcpair", 0, (void *) simple_lock_addr((_mcp)->mutex), MS_LOCK_SIMPLE)
+#else
+#define RF_WAIT_MCPAIR(_mcp) tsleep(&((_mcp)->flag), PRIBIO | PCATCH, "mcpair", 0)
+#endif
+#else /* KERNEL */
+#define RF_WAIT_MCPAIR(_mcp) RF_WAIT_COND((_mcp)->cond, (_mcp)->mutex)
+#endif /* KERNEL */
+
+int rf_ConfigureMCPair(RF_ShutdownList_t **listp);
+RF_MCPair_t *rf_AllocMCPair(void);
+void rf_FreeMCPair(RF_MCPair_t *t);
+void rf_MCPairWakeupFunc(RF_MCPair_t *t);
+
+#endif /* !_RF__RF_MCPAIR_H_ */
diff --git a/sys/dev/raidframe/rf_memchunk.c b/sys/dev/raidframe/rf_memchunk.c
new file mode 100644
index 00000000000..568eb90e12d
--- /dev/null
+++ b/sys/dev/raidframe/rf_memchunk.c
@@ -0,0 +1,256 @@
+/* $OpenBSD: rf_memchunk.c,v 1.1 1999/01/11 14:29:30 niklas Exp $ */
+/* $NetBSD: rf_memchunk.c,v 1.1 1998/11/13 04:20:31 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*********************************************************************************
+ * rf_memchunk.c
+ *
+ * experimental code. I've found that the malloc and free calls in the DAG
+ * creation code are very expensive. Since for any given workload the DAGs
+ * created for different accesses are likely to be similar to each other, the
+ * amount of memory used for any given DAG data structure is likely to be one
+ * of a small number of values. For example, in UNIX, all reads and writes will
+ * be less than 8k and will not span stripe unit boundaries. Thus in the absence
+ * of failure, the only DAGs that will ever get created are single-node reads
+ * and single-stripe-unit atomic read-modify-writes. So, I'm very likely to
+ * be continually asking for chunks of memory equal to the sizes of these two
+ * DAGs.
+ *
+ * This leads to the idea of holding on to these chunks of memory when the DAG is
+ * freed and then, when a new DAG is created, trying to find such a chunk before
+ * calling malloc.
+ *
+ * the "chunk list" is a list of lists. Each header node contains a size value
+ * and a pointer to a list of chunk descriptors, each of which holds a pointer
+ * to a chunk of memory of the indicated size.
+ *
+ * There is currently no way to purge memory out of the chunk list. My
+ * initial thought on this is to have a low-priority thread that wakes up every
+ * 1 or 2 seconds, purges all the chunks with low reuse counts, and sets all
+ * the reuse counts to zero.
+ *
+ * This whole idea may be bad, since malloc may be able to do this more efficiently.
+ * It's worth a try, though, and it can be turned off by setting useMemChunks to 0.
+ *
+ ********************************************************************************/
+
+/* :
+ * Log: rf_memchunk.c,v
+ * Revision 1.17 1996/07/27 23:36:08 jimz
+ * Solaris port of simulator
+ *
+ * Revision 1.16 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.15 1996/06/09 02:36:46 jimz
+ * lots of little crufty cleanup- fixup whitespace
+ * issues, comment #ifdefs, improve typing in some
+ * places (esp size-related)
+ *
+ * Revision 1.14 1996/06/05 18:06:02 jimz
+ * Major code cleanup. The Great Renaming is now done.
+ * Better modularity. Better typing. Fixed a bunch of
+ * synchronization bugs. Made a lot of global stuff
+ * per-desc or per-array. Removed dead code.
+ *
+ * Revision 1.13 1996/06/02 17:31:48 jimz
+ * Moved a lot of global stuff into array structure, where it belongs.
+ * Fixed up paritylogging, pss modules in this manner. Some general
+ * code cleanup. Removed lots of dead code, some dead files.
+ *
+ * Revision 1.12 1996/05/30 23:22:16 jimz
+ * bugfixes of serialization, timing problems
+ * more cleanup
+ *
+ * Revision 1.11 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.10 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.9 1996/05/20 16:15:45 jimz
+ * switch to rf_{mutex,cond}_{init,destroy}
+ *
+ * Revision 1.8 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.7 1995/12/01 19:26:07 root
+ * added copyright info
+ *
+ */
+
+#include "rf_types.h"
+#include "rf_threadstuff.h"
+#include "rf_debugMem.h"
+#include "rf_memchunk.h"
+#include "rf_general.h"
+#include "rf_options.h"
+#include "rf_shutdown.h"
+#include "rf_sys.h"
+
+typedef struct RF_ChunkHdr_s RF_ChunkHdr_t;
+struct RF_ChunkHdr_s {
+ int size;
+ RF_ChunkDesc_t *list;
+ RF_ChunkHdr_t *next;
+};
+
+static RF_ChunkHdr_t *chunklist, *chunk_hdr_free_list;
+static RF_ChunkDesc_t *chunk_desc_free_list;
+RF_DECLARE_STATIC_MUTEX(chunkmutex)
+
+static void rf_ShutdownMemChunk(void *);
+static RF_ChunkDesc_t *NewMemChunk(int, char *);
+
+
+static void rf_ShutdownMemChunk(ignored)
+ void *ignored;
+{
+ RF_ChunkDesc_t *pt, *p;
+ RF_ChunkHdr_t *hdr, *ht;
+
+ if (rf_memChunkDebug)
+ printf("Chunklist:\n");
+ for (hdr = chunklist; hdr;) {
+ for (p = hdr->list; p; ) {
+ if (rf_memChunkDebug)
+ printf("Size %d reuse count %d\n",p->size, p->reuse_count);
+ pt = p; p=p->next;
+ RF_Free(pt->buf, pt->size);
+ RF_Free(pt, sizeof(*pt));
+ }
+ ht = hdr; hdr=hdr->next;
+ RF_Free(ht, sizeof(*ht));
+ }
+
+ rf_mutex_destroy(&chunkmutex);
+}
+
+int rf_ConfigureMemChunk(listp)
+ RF_ShutdownList_t **listp;
+{
+ int rc;
+
+ chunklist = NULL;
+ chunk_hdr_free_list = NULL;
+ chunk_desc_free_list = NULL;
+ rc = rf_mutex_init(&chunkmutex);
+ if (rc) {
+ RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ return(rc);
+ }
+ rc = rf_ShutdownCreate(listp, rf_ShutdownMemChunk, NULL);
+ if (rc) {
+ RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ rf_mutex_destroy(&chunkmutex);
+ }
+ return(rc);
+}
+
+/* called to get a chunk descriptor for a newly-allocated chunk of memory
+ * MUTEX MUST BE LOCKED
+ *
+ * free list is not currently used
+ */
+static RF_ChunkDesc_t *NewMemChunk(size, buf)
+ int size;
+ char *buf;
+{
+ RF_ChunkDesc_t *p;
+
+ if (chunk_desc_free_list) {p = chunk_desc_free_list; chunk_desc_free_list = p->next;}
+ else RF_Malloc(p, sizeof(RF_ChunkDesc_t), (RF_ChunkDesc_t *));
+ p->size = size;
+ p->buf = buf;
+ p->next = NULL;
+ p->reuse_count = 0;
+ return(p);
+}
+
+/* looks for a chunk of memory of acceptable size. If none, allocates one and returns
+ * a chunk descriptor for it, but does not install anything in the list. This is done
+ * when the chunk is released.
+ */
+RF_ChunkDesc_t *rf_GetMemChunk(size)
+ int size;
+{
+ RF_ChunkHdr_t *hdr = chunklist;
+ RF_ChunkDesc_t *p = NULL;
+ char *buf;
+
+ RF_LOCK_MUTEX(chunkmutex);
+ for (hdr = chunklist; hdr; hdr = hdr->next) if (hdr->size >= size) {
+ p = hdr->list;
+ if (p) {
+ hdr->list = p->next;
+ p->next = NULL;
+ p->reuse_count++;
+ }
+ break;
+ }
+ if (!p) {
+ RF_Malloc(buf, size, (char *));
+ p = NewMemChunk(size, buf);
+ }
+ RF_UNLOCK_MUTEX(chunkmutex);
+ (void) bzero(p->buf, size);
+ return(p);
+}
+
+void rf_ReleaseMemChunk(chunk)
+ RF_ChunkDesc_t *chunk;
+{
+ RF_ChunkHdr_t *hdr, *ht = NULL, *new;
+
+ RF_LOCK_MUTEX(chunkmutex);
+ for (hdr = chunklist; hdr && hdr->size < chunk->size; ht=hdr,hdr=hdr->next);
+ if (hdr && hdr->size == chunk->size) {
+ chunk->next = hdr->list;
+ hdr->list = chunk;
+ }
+ else {
+ RF_Malloc(new, sizeof(RF_ChunkHdr_t), (RF_ChunkHdr_t *));
+ new->size = chunk->size; new->list = chunk; chunk->next = NULL;
+ if (ht) {
+ new->next = ht->next;
+ ht->next = new;
+ }
+ else {
+ new->next = hdr;
+ chunklist = new;
+ }
+ }
+ RF_UNLOCK_MUTEX(chunkmutex);
+}
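+
+/* Illustrative sketch (not part of the original sources): how the chunk
+ * cache described at the top of this file is meant to be used. A caller
+ * asks for a buffer of a given size, works in chunk->buf, and then returns
+ * the whole descriptor so the memory can be reused by the next request of a
+ * similar size. The function name and the 1024-byte size are assumptions
+ * made for the example.
+ */
+#if 0
+static void rf_ExampleUseMemChunk(void)
+{
+ RF_ChunkDesc_t *chunk;
+
+ chunk = rf_GetMemChunk(1024); /* reuses a cached chunk of >= 1024 bytes if one exists */
+ /* ... build a DAG (or whatever) in the zeroed chunk->buf ... */
+ rf_ReleaseMemChunk(chunk); /* cached for reuse; only freed to the system at shutdown */
+}
+#endif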
diff --git a/sys/dev/raidframe/rf_memchunk.h b/sys/dev/raidframe/rf_memchunk.h
new file mode 100644
index 00000000000..7d41f57eae5
--- /dev/null
+++ b/sys/dev/raidframe/rf_memchunk.h
@@ -0,0 +1,80 @@
+/* $OpenBSD: rf_memchunk.h,v 1.1 1999/01/11 14:29:30 niklas Exp $ */
+/* $NetBSD: rf_memchunk.h,v 1.1 1998/11/13 04:20:31 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* header file for rf_memchunk.c. See comments there */
+
+/* :
+ * Log: rf_memchunk.h,v
+ * Revision 1.8 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.7 1996/06/02 17:31:48 jimz
+ * Moved a lot of global stuff into array structure, where it belongs.
+ * Fixed up paritylogging, pss modules in this manner. Some general
+ * code cleanup. Removed lots of dead code, some dead files.
+ *
+ * Revision 1.6 1996/05/24 04:28:55 jimz
+ * release cleanup ckpt
+ *
+ * Revision 1.5 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.4 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.3 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.2 1995/12/01 19:25:56 root
+ * added copyright info
+ *
+ */
+
+#ifndef _RF__RF_MEMCHUNK_H_
+#define _RF__RF_MEMCHUNK_H_
+
+#include "rf_types.h"
+
+struct RF_ChunkDesc_s {
+ int size;
+ int reuse_count;
+ char *buf;
+ RF_ChunkDesc_t *next;
+};
+
+int rf_ConfigureMemChunk(RF_ShutdownList_t **listp);
+RF_ChunkDesc_t *rf_GetMemChunk(int size);
+void rf_ReleaseMemChunk(RF_ChunkDesc_t *chunk);
+
+#endif /* !_RF__RF_MEMCHUNK_H_ */
diff --git a/sys/dev/raidframe/rf_netbsd.h b/sys/dev/raidframe/rf_netbsd.h
new file mode 100644
index 00000000000..6d66769112b
--- /dev/null
+++ b/sys/dev/raidframe/rf_netbsd.h
@@ -0,0 +1,98 @@
+/* $OpenBSD: rf_netbsd.h,v 1.1 1999/01/11 14:29:30 niklas Exp $ */
+/* $NetBSD: rf_netbsd.h,v 1.1 1998/11/13 04:20:31 oster Exp $ */
+/*-
+ * Copyright (c) 1996, 1997, 1998 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Greg Oster
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the NetBSD
+ * Foundation, Inc. and its contributors.
+ * 4. Neither the name of The NetBSD Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*-
+ * Copyright (c) 1996, 1997, 1998 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Jason R. Thorpe.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the NetBSD
+ * Foundation, Inc. and its contributors.
+ * 4. Neither the name of The NetBSD Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef _RF__RF_NETBSDSTUFF_H_
+#define _RF__RF_NETBSDSTUFF_H_
+
+#include <sys/fcntl.h>
+#include <sys/systm.h>
+#include <sys/namei.h>
+#include <sys/vnode.h>
+
+
+
+#if defined(__NetBSD__) && defined(_KERNEL)
+struct raidcinfo {
+ struct vnode *ci_vp; /* device's vnode */
+ dev_t ci_dev; /* XXX: device's dev_t */
+#if 0
+ size_t ci_size; /* size */
+ char *ci_path; /* path to component */
+ size_t ci_pathlen; /* length of component path */
+#endif
+};
+#endif
+
+#endif /* _RF__RF_NETBSDSTUFF_H_ */
diff --git a/sys/dev/raidframe/rf_netbsdkintf.c b/sys/dev/raidframe/rf_netbsdkintf.c
new file mode 100644
index 00000000000..ad6673541cc
--- /dev/null
+++ b/sys/dev/raidframe/rf_netbsdkintf.c
@@ -0,0 +1,2048 @@
+/* $OpenBSD: rf_netbsdkintf.c,v 1.1 1999/01/11 14:29:30 niklas Exp $ */
+/* $NetBSD: rf_netbsdkintf.c,v 1.5 1998/12/22 20:03:14 oster Exp $ */
+/*-
+ * Copyright (c) 1996, 1997, 1998 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Greg Oster; Jason R. Thorpe.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the NetBSD
+ * Foundation, Inc. and its contributors.
+ * 4. Neither the name of The NetBSD Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 1988 University of Utah.
+ * Copyright (c) 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * the Systems Programming Group of the University of Utah Computer
+ * Science Department.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: Utah $Hdr: cd.c 1.6 90/11/28$
+ *
+ * @(#)cd.c 8.2 (Berkeley) 11/16/93
+ */
+
+
+
+
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Authors: Mark Holland, Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/***********************************************************
+ *
+ * rf_kintf.c -- the kernel interface routines for RAIDframe
+ *
+ ***********************************************************/
+/*
+ * :
+ * Log: rf_kintf.c,v
+ * Revision 1.57 1996/07/19 16:12:20 jimz
+ * remove addition of protectedSectors in InitBP- it's already
+ * done in the diskqueue code
+ *
+ * Revision 1.56 1996/07/17 21:00:58 jimz
+ * clean up timer interface, tracing
+ *
+ * Revision 1.55 1996/06/17 03:00:54 jimz
+ * Change RAIDFRAME_GET_INFO interface to do its own copyout()
+ * (because size of device config structure now exceeds 8k)
+ *
+ * Revision 1.54 1996/06/09 02:36:46 jimz
+ * lots of little crufty cleanup- fixup whitespace
+ * issues, comment #ifdefs, improve typing in some
+ * places (esp size-related)
+ *
+ * Revision 1.53 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.52 1996/06/06 17:28:08 jimz
+ * track sector number of last I/O dequeued
+ *
+ * Revision 1.51 1996/06/05 18:06:02 jimz
+ * Major code cleanup. The Great Renaming is now done.
+ * Better modularity. Better typing. Fixed a bunch of
+ * synchronization bugs. Made a lot of global stuff
+ * per-desc or per-array. Removed dead code.
+ *
+ * Revision 1.50 1996/06/03 23:28:26 jimz
+ * more bugfixes
+ * check in tree to sync for IPDS runs with current bugfixes
+ * there still may be a problem with threads in the script test
+ * getting I/Os stuck- not trivially reproducible (runs ~50 times
+ * in a row without getting stuck)
+ *
+ * Revision 1.49 1996/06/02 17:31:48 jimz
+ * Moved a lot of global stuff into array structure, where it belongs.
+ * Fixed up paritylogging, pss modules in this manner. Some general
+ * code cleanup. Removed lots of dead code, some dead files.
+ *
+ * Revision 1.48 1996/05/31 22:26:54 jimz
+ * fix a lot of mapping problems, memory allocation problems
+ * found some weird lock issues, fixed 'em
+ * more code cleanup
+ *
+ * Revision 1.47 1996/05/30 12:59:18 jimz
+ * make etimer happier, more portable
+ *
+ * Revision 1.46 1996/05/30 11:29:41 jimz
+ * Numerous bug fixes. Stripe lock release code disagreed with the taking code
+ * about when stripes should be locked (I made it consistent: no parity, no lock)
+ * There was a lot of extra serialization of I/Os which I've removed- a lot of
+ * it was to calculate values for the cache code, which is no longer with us.
+ * More types, function, macro cleanup. Added code to properly quiesce the array
+ * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
+ * before. Fixed memory allocation, freeing bugs.
+ *
+ * Revision 1.45 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.44 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.43 1996/05/24 01:59:45 jimz
+ * another checkpoint in code cleanup for release
+ * time to sync kernel tree
+ *
+ * Revision 1.42 1996/05/23 22:17:54 jimz
+ * fix sector size hardcoding problems
+ *
+ * Revision 1.41 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.40 1996/05/23 13:18:07 jimz
+ * tracing_mutex -> rf_tracing_mutex
+ *
+ * Revision 1.39 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.38 1996/05/20 16:15:32 jimz
+ * switch to rf_{mutex,cond}_{init,destroy}
+ *
+ * Revision 1.37 1996/05/10 16:23:47 jimz
+ * RF_offset -> RF_Offset
+ *
+ * Revision 1.36 1996/05/08 21:01:24 jimz
+ * fixed up enum type names that were conflicting with other
+ * enums and function names (ie, "panic")
+ * future naming trends will be towards RF_ and rf_ for
+ * everything raidframe-related
+ *
+ * Revision 1.35 1996/05/03 19:10:48 jimz
+ * change sanity checking for bogus I/Os to return more appropriate
+ * values (to make some user-level utilities happier with RAIDframe)
+ *
+ * Revision 1.34 1996/05/02 22:17:00 jimz
+ * When using DKUSAGE, send a bogus IO after configuring to let DKUSAGE know
+ * that we exist. This will let user-level programs doing group stats on the
+ * RF device function without error before RF gets its first IO
+ *
+ * Changed rf_device_config devs and spares fields to RF_RaidDisk_t
+ *
+ * Inc numOutstanding for the disk queue in rf_DispatchKernelIO if
+ * type is IO_TYPE_NOP. I'm not sure this is right, but it seems to be,
+ * because the disk IO completion routine wants to dec it, and doesn't
+ * care if there was no such IO.
+ *
+ * Revision 1.33 1996/05/02 15:05:44 jimz
+ * for now, rf_DoAccessKernel will reject non-sector-sized I/Os
+ * eventually, it should do something more clever...
+ * (and do it in DoAccess(), not just DoAccessKernel())
+ *
+ * Revision 1.32 1996/05/01 16:28:39 jimz
+ * get rid of uses of ccmn_ functions
+ *
+ * Revision 1.31 1996/05/01 15:42:17 jimz
+ * ccmn_* memory management is on the way out. This is an archival checkpoint-
+ * both the old and new code are in place (all the ccmn_ calls are #if 0). After
+ * this, the ccmn_ code will no longer appear.
+ *
+ * Revision 1.30 1996/04/22 15:53:13 jimz
+ * MAX_RAIDS -> NRAIDFRAME
+ *
+ * Revision 1.29 1995/12/12 18:10:06 jimz
+ * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT
+ * fix 80-column brain damage in comments
+ *
+ * Revision 1.28 1995/12/01 19:11:01 root
+ * added copyright info
+ *
+ * Revision 1.27 1995/11/28 18:56:40 wvcii
+ * disabled buffer copy in rf_write
+ *
+ * Revision 1.26 1995/10/06 16:37:08 jimz
+ * get struct bufs from ubc, not cam
+ * copy all write data, and operate on copy
+ * (temporary hack to get around dags in PQ that want
+ * to Xor into user write buffers)
+ *
+ * Revision 1.25 1995/09/30 22:23:08 jimz
+ * do not require raid to be active to perform ACCTOTAL ioctl
+ *
+ * Revision 1.24 1995/09/30 20:39:08 jimz
+ * added new ioctls:
+ * RAIDFRAME_RESET_ACCTOTALS
+ * RAIDFRAME_GET_ACCTOTALS
+ * RAIDFRAME_KEEP_ACCTOTALS
+ *
+ * Revision 1.23 1995/09/20 21:11:59 jimz
+ * include dfstrace.h in KERNEL block
+ * (even though it's a kernel-only file, this makes the depend process
+ * at user-level happy. Why the user-level Makefile wants to depend
+ * kintf.c is less clear, but this is a workaround).
+ *
+ * Revision 1.22 1995/09/19 23:19:03 jimz
+ * added DKUSAGE support
+ *
+ */
+
+
+
+
+#ifdef _KERNEL
+#define KERNEL
+#endif
+
+
+
+#ifdef KERNEL
+
+#include <sys/errno.h>
+
+#ifdef __NetBSD__
+#include "raid.h"
+#include <sys/param.h>
+#include <sys/pool.h>
+#include <sys/queue.h>
+#include <sys/disk.h>
+#include <sys/device.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <sys/fcntl.h>
+#include <sys/systm.h>
+#include <sys/namei.h>
+#include <sys/vnode.h>
+#endif
+
+#include <sys/param.h>
+#include <sys/types.h>
+
+#include <machine/types.h>
+
+#include <sys/disklabel.h>
+
+#include <sys/conf.h>
+
+
+#ifdef __NetBSD__
+#include <sys/lock.h>
+#endif /* __NetBSD__ */
+
+
+#include <sys/buf.h>
+#include <sys/user.h>
+#include "rf_raid.h"
+#include "rf_raidframe.h"
+#include "rf_dag.h"
+#include "rf_dagflags.h"
+#include "rf_diskqueue.h"
+#include "rf_acctrace.h"
+#include "rf_etimer.h"
+#include "rf_general.h"
+#include "rf_debugMem.h"
+#include "rf_kintf.h"
+#include "rf_options.h"
+#include "rf_driver.h"
+#include "rf_parityscan.h"
+#include "rf_debugprint.h"
+#include "rf_threadstuff.h"
+
+int rf_kdebug_level = 0;
+
+#define RFK_BOOT_NONE 0
+#define RFK_BOOT_GOOD 1
+#define RFK_BOOT_BAD 2
+static int rf_kbooted = RFK_BOOT_NONE;
+
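+/*
+ * db0_printf() always prints; for N >= 1, dbN_printf() prints only when
+ * rf_kdebug_level is at least N (and only in DEBUG kernels).
+ */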
+#ifdef DEBUG
+#define db0_printf(a) printf a
+#define db_printf(a) if (rf_kdebug_level > 0) printf a
+#define db1_printf(a) if (rf_kdebug_level > 0) printf a
+#define db2_printf(a) if (rf_kdebug_level > 1) printf a
+#define db3_printf(a) if (rf_kdebug_level > 2) printf a
+#define db4_printf(a) if (rf_kdebug_level > 3) printf a
+#define db5_printf(a) if (rf_kdebug_level > 4) printf a
+#else /* DEBUG */
+#define db0_printf(a) printf a
+#define db1_printf(a) { }
+#define db2_printf(a) { }
+#define db3_printf(a) { }
+#define db4_printf(a) { }
+#define db5_printf(a) { }
+#endif /* DEBUG */
+
+static RF_Raid_t **raidPtrs; /* global raid device descriptors */
+
+static int rf_pending_testaccs;
+
+RF_DECLARE_STATIC_MUTEX(rf_sparet_wait_mutex)
+RF_DECLARE_STATIC_MUTEX(rf_async_done_q_mutex)
+static RF_SparetWait_t *rf_sparet_wait_queue; /* requests to install a spare table */
+static RF_SparetWait_t *rf_sparet_resp_queue; /* responses from installation process */
+static struct rf_test_acc *rf_async_done_qh, *rf_async_done_qt;
+
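+/*
+ * Reconstruction requests are queued here by raidioctl (RAIDFRAME_FAIL_DISK)
+ * and serviced by rf_ReconKernelThread, which sleeps on &recon_queue.
+ */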
+static struct rf_recon_req *recon_queue = NULL; /* used to communicate reconstruction requests */
+
+
+decl_simple_lock_data(,recon_queue_mutex)
+
+
+#define LOCK_RECON_Q_MUTEX() simple_lock(&recon_queue_mutex)
+#define UNLOCK_RECON_Q_MUTEX() simple_unlock(&recon_queue_mutex)
+
+/* prototypes */
+static void KernelWakeupFunc(struct buf *bp);
+static void InitBP(struct buf *bp, struct vnode *, unsigned rw_flag, dev_t dev,
+ RF_SectorNum_t startSect, RF_SectorCount_t numSect, caddr_t buf,
+ void (*cbFunc)(struct buf *), void *cbArg, int logBytesPerSector,
+ struct proc *b_proc);
+
+#define Dprintf0(s) if (rf_queueDebug) rf_debug_printf(s,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL)
+#define Dprintf1(s,a) if (rf_queueDebug) rf_debug_printf(s,a,NULL,NULL,NULL,NULL,NULL,NULL,NULL)
+#define Dprintf2(s,a,b) if (rf_queueDebug) rf_debug_printf(s,a,b,NULL,NULL,NULL,NULL,NULL,NULL)
+#define Dprintf3(s,a,b,c) if (rf_queueDebug) rf_debug_printf(s,a,b,c,NULL,NULL,NULL,NULL,NULL)
+
+
+/* this is so that we can compile under 2.0 as well as 3.2 */
+#ifndef proc_to_task
+#define proc_to_task(x) ((x)->task)
+#endif /* !proc_to_task */
+
+void raidattach __P((int));
+int raidsize __P((dev_t));
+
+void rf_DiskIOComplete(RF_DiskQueue_t *, RF_DiskQueueData_t *, int);
+void rf_CopybackReconstructedData(RF_Raid_t *raidPtr);
+static int raidinit __P((dev_t,RF_Raid_t *,int));
+
+int raidopen __P((dev_t, int, int, struct proc *));
+int raidclose __P((dev_t, int, int, struct proc *));
+int raidioctl __P((dev_t, u_long, caddr_t, int, struct proc *));
+int raidwrite __P((dev_t, struct uio *, int));
+int raidread __P((dev_t, struct uio *, int));
+void raidstrategy __P((struct buf *));
+int raiddump __P((dev_t, daddr_t, caddr_t, size_t));
+
+/*
+ * Pilfered from ccd.c
+ */
+
+struct raidbuf {
+ struct buf rf_buf; /* new I/O buf. MUST BE FIRST!!! */
+ struct buf *rf_obp; /* ptr. to original I/O buf */
+ int rf_flags; /* misc. flags */
+ RF_DiskQueueData_t *req; /* the request that this was part of.. */
+};
+
+
+#define RAIDGETBUF(rs) pool_get(&(rs)->sc_cbufpool, PR_NOWAIT)
+#define RAIDPUTBUF(rs, cbp) pool_put(&(rs)->sc_cbufpool, cbp)
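+
+/*
+ * Each component I/O dispatched by rf_DispatchKernelIO wraps the RAIDframe
+ * request in a raidbuf from the per-unit pool; rf_buf goes down to the
+ * component driver, while rf_obp and req let KernelWakeupFunc find its way
+ * back to the original buffer and request.
+ */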
+
+/* XXX Not sure if the following should be replacing the raidPtrs above,
+or if it should be used in conjunction with that... */
+
+struct raid_softc {
+ int sc_unit; /* logical unit number */
+ int sc_flags; /* flags */
+ int sc_cflags; /* configuration flags */
+ size_t sc_size; /* size of the raid device */
+ dev_t sc_dev; /* our device..*/
+ char sc_xname[20]; /* XXX external name */
+ struct disk sc_dkdev; /* generic disk device info */
+ struct pool sc_cbufpool; /* component buffer pool */
+};
+
+/* sc_flags */
+#define RAIDF_INITED 0x01 /* unit has been initialized */
+#define RAIDF_WLABEL 0x02 /* label area is writable */
+#define RAIDF_LABELLING 0x04 /* unit is currently being labelled */
+#define RAIDF_WANTED 0x40 /* someone is waiting to obtain a lock */
+#define RAIDF_LOCKED 0x80 /* unit is locked */
+
+#define raidunit(x) DISKUNIT(x)
+static int numraid=0;
+
+#define RAIDLABELDEV(dev) \
+ (MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
+
+/* declared here, and made public, for the benefit of KVM stuff.. */
+struct raid_softc *raid_softc;
+
+static void raidgetdefaultlabel __P((RF_Raid_t *, struct raid_softc *, struct disklabel *));
+static void raidgetdisklabel __P((dev_t));
+static void raidmakedisklabel __P((struct raid_softc *));
+
+static int raidlock __P((struct raid_softc *));
+static void raidunlock __P((struct raid_softc *));
+int raidlookup __P((char *, struct proc *p, struct vnode **));
+
+
+void
+raidattach(num)
+ int num;
+{
+ int raidID;
+
+#ifdef DEBUG
+ printf("raidattach: Asked for %d units\n",num);
+#endif
+
+ if (num <= 0) {
+#ifdef DIAGNOSTIC
+ panic("raidattach: count <= 0");
+#endif
+ return;
+ }
+ /*
+ This is where all the initialization stuff gets done.
+ */
+
+ /* Make some space for requested number of units... */
+
+ RF_Calloc(raidPtrs, num, sizeof(RF_Raid_t *), (RF_Raid_t **));
+ if (raidPtrs == NULL) {
+ panic("raidPtrs is NULL!!\n");
+ }
+
+
+
+ rf_kbooted = rf_boot();
+ if (rf_kbooted) {
+ panic("Serious error booting RAID!!\n");
+ }
+
+ rf_kbooted = RFK_BOOT_GOOD;
+
+ /*
+ put together some datastructures like the CCD device does..
+ This lets us lock the device and what-not when it gets opened.
+ */
+
+ raid_softc = (struct raid_softc *)
+ malloc(num * sizeof(struct raid_softc),
+ M_DEVBUF, M_NOWAIT);
+ if (raid_softc == NULL) {
+ printf("WARNING: no memory for RAIDframe driver\n");
+ return;
+ }
+ numraid = num;
+ bzero(raid_softc, num * sizeof(struct raid_softc));
+
+ for(raidID=0;raidID < num;raidID++) {
+ RF_Calloc(raidPtrs[raidID], 1, sizeof(RF_Raid_t),
+ (RF_Raid_t *));
+ if (raidPtrs[raidID]==NULL) {
+ printf("raidPtrs[%d] is NULL\n",raidID);
+ }
+ }
+}
+
+
+int
+raidsize(dev)
+ dev_t dev;
+{
+ struct raid_softc *rs;
+ struct disklabel *lp;
+ int part, unit, omask, size;
+
+ unit = raidunit(dev);
+ if (unit >= numraid)
+ return (-1);
+ rs = &raid_softc[unit];
+
+ if ((rs->sc_flags & RAIDF_INITED) == 0)
+ return (-1);
+
+ part = DISKPART(dev);
+ omask = rs->sc_dkdev.dk_openmask & (1 << part);
+ lp = rs->sc_dkdev.dk_label;
+
+ if (omask == 0 && raidopen(dev, 0, S_IFBLK, curproc))
+ return (-1);
+
+ if (lp->d_partitions[part].p_fstype != FS_SWAP)
+ size = -1;
+ else
+ size = lp->d_partitions[part].p_size *
+ (lp->d_secsize / DEV_BSIZE);
+
+ if (omask == 0 && raidclose(dev, 0, S_IFBLK, curproc))
+ return (-1);
+
+ return (size);
+
+}
+
+int
+raiddump(dev, blkno, va, size)
+ dev_t dev;
+ daddr_t blkno;
+ caddr_t va;
+ size_t size;
+{
+ /* Not implemented. */
+ return ENXIO;
+}
+
+/* ARGSUSED */
+int
+raidopen(dev, flags, fmt, p)
+ dev_t dev;
+ int flags, fmt;
+ struct proc *p;
+{
+ int unit = raidunit(dev);
+ struct raid_softc *rs;
+ struct disklabel *lp;
+ int part,pmask;
+ unsigned int raidID;
+ int rc;
+ int error = 0;
+
+ /* This whole next chunk of code is somewhat suspect... Not sure
+ it's needed here at all... XXX */
+
+ if (rf_kbooted == RFK_BOOT_NONE) {
+ printf("Doing restart on raidopen.\n");
+ rf_kbooted = RFK_BOOT_GOOD;
+ rc = rf_boot();
+ if (rc) {
+ rf_kbooted = RFK_BOOT_BAD;
+ printf("Someone is unhappy...\n");
+ return(rc);
+ }
+ }
+
+ if (unit >= numraid)
+ return (ENXIO);
+ rs = &raid_softc[unit];
+
+ if ((error = raidlock(rs)) != 0)
+ return(error);
+ lp = rs->sc_dkdev.dk_label;
+
+ raidID = raidunit(dev);
+
+ part = DISKPART(dev);
+ pmask = (1 << part);
+
+ db1_printf(("Opening raid device number: %d partition: %d\n",
+ raidID,part));
+
+
+ if ((rs->sc_flags & RAIDF_INITED) &&
+ (rs->sc_dkdev.dk_openmask == 0))
+ raidgetdisklabel(dev);
+
+ /* make sure that this partition exists */
+
+ if (part != RAW_PART) {
+ db1_printf(("Not a raw partition..\n"));
+ if (((rs->sc_flags & RAIDF_INITED) == 0) ||
+ ((part >= lp->d_npartitions) ||
+ (lp->d_partitions[part].p_fstype == FS_UNUSED))) {
+ error = ENXIO;
+ raidunlock(rs);
+ db1_printf(("Bailing out...\n"));
+ return(error);
+ }
+ }
+
+ /* Prevent this unit from being unconfigured while open. */
+ switch (fmt) {
+ case S_IFCHR:
+ rs->sc_dkdev.dk_copenmask |= pmask;
+ break;
+
+ case S_IFBLK:
+ rs->sc_dkdev.dk_bopenmask |= pmask;
+ break;
+ }
+ rs->sc_dkdev.dk_openmask =
+ rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
+
+ raidunlock(rs);
+
+ return(error);
+
+
+}
+
+/* ARGSUSED */
+int
+raidclose(dev, flags, fmt, p)
+ dev_t dev;
+ int flags, fmt;
+ struct proc *p;
+{
+ int unit = raidunit(dev);
+ struct raid_softc *rs;
+ int error = 0;
+ int part;
+
+ if (unit >= numraid)
+ return (ENXIO);
+ rs = &raid_softc[unit];
+
+ if ((error = raidlock(rs)) != 0)
+ return (error);
+
+ part = DISKPART(dev);
+
+ /* ...that much closer to allowing unconfiguration... */
+ switch (fmt) {
+ case S_IFCHR:
+ rs->sc_dkdev.dk_copenmask &= ~(1 << part);
+ break;
+
+ case S_IFBLK:
+ rs->sc_dkdev.dk_bopenmask &= ~(1 << part);
+ break;
+ }
+ rs->sc_dkdev.dk_openmask =
+ rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
+
+ raidunlock(rs);
+ return (0);
+
+}
+
+void
+raidstrategy(bp)
+ register struct buf *bp;
+{
+ register int s;
+
+ unsigned int raidID = raidunit(bp->b_dev);
+ RF_Raid_t *raidPtr;
+ struct raid_softc *rs = &raid_softc[raidID];
+ struct disklabel *lp;
+ int wlabel;
+
+#if 0
+ db1_printf(("Strategy: 0x%x 0x%x\n",bp,bp->b_data));
+ db1_printf(("Strategy(2): bp->b_bufsize%d\n", (int)bp->b_bufsize));
+ db1_printf(("bp->b_count=%d\n",(int)bp->b_bcount));
+ db1_printf(("bp->b_resid=%d\n",(int)bp->b_resid));
+ db1_printf(("bp->b_blkno=%d\n",(int)bp->b_blkno));
+
+ if (bp->b_flags&B_READ)
+ db1_printf(("READ\n"));
+ else
+ db1_printf(("WRITE\n"));
+#endif
+ if (rf_kbooted != RFK_BOOT_GOOD)
+ return;
+ if (raidID >= numraid || !raidPtrs[raidID]) {
+ bp->b_error = ENODEV;
+ bp->b_flags |= B_ERROR;
+ bp->b_resid = bp->b_bcount;
+ biodone(bp);
+ return;
+ }
+ raidPtr = raidPtrs[raidID];
+ if (!raidPtr->valid) {
+ bp->b_error = ENODEV;
+ bp->b_flags |= B_ERROR;
+ bp->b_resid = bp->b_bcount;
+ biodone(bp);
+ return;
+ }
+ if (bp->b_bcount == 0) {
+ db1_printf(("b_bcount is zero..\n"));
+ biodone(bp);
+ return;
+ }
+ lp = rs->sc_dkdev.dk_label;
+
+ /*
+ * Do bounds checking and adjust transfer. If there's an
+ * error, the bounds check will flag that for us.
+ */
+
+ wlabel = rs->sc_flags & (RAIDF_WLABEL|RAIDF_LABELLING);
+ if (DISKPART(bp->b_dev) != RAW_PART)
+ if (bounds_check_with_label(bp, lp, wlabel) <= 0) {
+ db1_printf(("Bounds check failed!!:%d %d\n",
+ (int)bp->b_blkno,(int)wlabel));
+ biodone(bp);
+ return;
+ }
+
+ s = splbio(); /* XXX Needed? */
+ db1_printf(("Beginning strategy...\n"));
+
+ bp->b_resid = 0;
+ bp->b_error = rf_DoAccessKernel(raidPtrs[raidID], bp,
+ NULL, NULL, NULL);
+ if (bp->b_error) {
+ bp->b_flags |= B_ERROR;
+ db1_printf(("bp->b_flags HAS B_ERROR SET!!!: %d\n",
+ bp->b_error));
+ }
+ splx(s);
+#if 0
+ db1_printf(("Strategy exiting: 0x%x 0x%x %d %d\n",
+ bp,bp->b_data,
+ (int)bp->b_bcount,(int)bp->b_resid));
+#endif
+}
+
+/* ARGSUSED */
+int
+raidread(dev, uio, flags)
+ dev_t dev;
+ struct uio *uio;
+ int flags;
+{
+ int unit = raidunit(dev);
+ struct raid_softc *rs;
+ int result;
+ int part;
+
+ if (unit >= numraid)
+ return (ENXIO);
+ rs = &raid_softc[unit];
+
+ if ((rs->sc_flags & RAIDF_INITED) == 0)
+ return (ENXIO);
+ part = DISKPART(dev);
+
+ db1_printf(("raidread: unit: %d partition: %d\n",unit,part));
+
+#if 0
+ return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
+#endif
+ result=physio(raidstrategy, NULL, dev, B_READ, minphys, uio);
+ db1_printf(("raidread done. Result is %d %d\n",
+ result,uio->uio_resid));
+ return(result);
+
+}
+
+/* ARGSUSED */
+int
+raidwrite(dev, uio, flags)
+ dev_t dev;
+ struct uio *uio;
+ int flags;
+{
+ int unit = raidunit(dev);
+ struct raid_softc *rs;
+
+ if (unit >= numraid)
+ return (ENXIO);
+ rs = &raid_softc[unit];
+
+ if ((rs->sc_flags & RAIDF_INITED) == 0)
+ return (ENXIO);
+ db1_printf(("raidwrite\n"));
+ return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
+
+
+}
+
+int
+raidioctl(dev, cmd, data, flag, p)
+ dev_t dev;
+ u_long cmd;
+ caddr_t data;
+ int flag;
+ struct proc *p;
+{
+ int unit = raidunit(dev);
+ int error = 0;
+ int part, pmask;
+ struct raid_softc *rs;
+#if 0
+ int r,c;
+#endif
+ /* struct raid_ioctl *ccio = (struct ccd_ioctl *)data; */
+
+ /* struct ccdbuf *cbp; */
+ /* struct raidbuf *raidbp; */
+ RF_Config_t *k_cfg, *u_cfg;
+ u_char *specific_buf;
+ int retcode = 0;
+
+ int row;
+ struct rf_recon_req *rrcopy, *rr;
+#if 0
+ int nbytes, spl, rw, row;
+ struct rf_test_acc *ta;
+ struct buf *bp;
+ RF_SparetWait_t *waitreq;
+ struct rf_test_acc *ta_p, *ta_copy;
+#endif
+
+ if (unit >= numraid)
+ return (ENXIO);
+ rs = &raid_softc[unit];
+
+ db1_printf(("raidioctl: %d %d %d %d\n",(int)dev,
+ (int)DISKPART(dev),(int)unit,(int)cmd));
+
+ /* Must be open for writes for these commands... */
+ switch (cmd) {
+ case DIOCSDINFO:
+ case DIOCWDINFO:
+ case DIOCWLABEL:
+ if ((flag & FWRITE) == 0)
+ return (EBADF);
+ }
+
+ /* Must be initialized for these... */
+ switch (cmd) {
+ case DIOCGDINFO:
+ case DIOCSDINFO:
+ case DIOCWDINFO:
+ case DIOCGPART:
+ case DIOCWLABEL:
+ case DIOCGDEFLABEL:
+ case RAIDFRAME_SHUTDOWN:
+ case RAIDFRAME_REWRITEPARITY:
+ case RAIDFRAME_GET_INFO:
+ case RAIDFRAME_RESET_ACCTOTALS:
+ case RAIDFRAME_GET_ACCTOTALS:
+ case RAIDFRAME_KEEP_ACCTOTALS:
+ case RAIDFRAME_GET_SIZE:
+ case RAIDFRAME_FAIL_DISK:
+ case RAIDFRAME_COPYBACK:
+ case RAIDFRAME_CHECKRECON:
+ if ((rs->sc_flags & RAIDF_INITED) == 0)
+ return (ENXIO);
+ }
+
+ switch (cmd) {
+
+
+ /* configure the system */
+ case RAIDFRAME_CONFIGURE:
+
+ db3_printf(("rf_ioctl: RAIDFRAME_CONFIGURE\n"));
+ /* copy-in the configuration information */
+ /* data points to a pointer to the configuration structure */
+ u_cfg = *((RF_Config_t **) data);
+ RF_Malloc(k_cfg,sizeof(RF_Config_t),(RF_Config_t *));
+ if (k_cfg == NULL) {
+ db3_printf(("rf_ioctl: ENOMEM for config. Code is %d\n", retcode));
+ return(ENOMEM);
+ }
+ retcode = copyin((caddr_t) u_cfg, (caddr_t) k_cfg,
+ sizeof(RF_Config_t));
+ if (retcode) {
+ db3_printf(("rf_ioctl: retcode=%d copyin.1\n",
+ retcode));
+ return(retcode);
+ }
+
+ /* allocate a buffer for the layout-specific data,
+ and copy it in */
+ if (k_cfg->layoutSpecificSize) {
+ if (k_cfg->layoutSpecificSize > 10000) {
+ /* sanity check */
+ db3_printf(("rf_ioctl: EINVAL %d\n", retcode));
+ return(EINVAL);
+ }
+ RF_Malloc(specific_buf,k_cfg->layoutSpecificSize,
+ (u_char *));
+ if (specific_buf == NULL) {
+ RF_Free(k_cfg,sizeof(RF_Config_t));
+ db3_printf(("rf_ioctl: ENOMEM %d\n", retcode));
+ return(ENOMEM);
+ }
+ retcode = copyin(k_cfg->layoutSpecific,
+ (caddr_t) specific_buf,
+ k_cfg->layoutSpecificSize);
+ if (retcode) {
+ db3_printf(("rf_ioctl: retcode=%d copyin.2\n",
+ retcode));
+ return(retcode);
+ }
+ } else specific_buf = NULL;
+ k_cfg->layoutSpecific = specific_buf;
+
+ /* should do some kind of sanity check on the configuration.
+ Store the sum of all the bytes in the last byte?
+ */
+
+#if 0
+ db1_printf(("Considering configuring the system.:%d 0x%x\n",
+ unit,p));
+#endif
+
+ /* We need the pointer to this a little deeper, so
+ stash it here... */
+
+ raidPtrs[unit]->proc = p;
+
+ /* configure the system */
+ rf_pending_testaccs = 0;
+
+
+ raidPtrs[unit]->raidid = unit;
+ retcode = rf_Configure(raidPtrs[unit], k_cfg);
+
+
+ if (retcode == 0) {
+ retcode = raidinit(dev, raidPtrs[unit],unit);
+ }
+
+ /* free the buffers. No return code here. */
+ if (k_cfg->layoutSpecificSize) {
+ RF_Free(specific_buf,k_cfg->layoutSpecificSize);
+ }
+ RF_Free(k_cfg,sizeof(RF_Config_t));
+
+ db3_printf(("rf_ioctl: retcode=%d RAIDFRAME_CONFIGURE\n",
+ retcode));
+ return(retcode);
+
+ /* shutdown the system */
+ case RAIDFRAME_SHUTDOWN:
+
+ if ((error = raidlock(rs)) != 0)
+ return(error);
+
+ /*
+ * If somebody has a partition mounted, we shouldn't
+ * shutdown.
+ */
+
+ part = DISKPART(dev);
+ pmask = (1 << part);
+ if ((rs->sc_dkdev.dk_openmask & ~pmask) ||
+ ((rs->sc_dkdev.dk_bopenmask & pmask) &&
+ (rs->sc_dkdev.dk_copenmask & pmask))) {
+ raidunlock(rs);
+ return (EBUSY);
+ }
+
+ /* the intention here was to disallow shutdowns while
+ raidframe is mounted, but it doesn't work because the
+ shutdown ioctl calls rf_open
+ */
+	if (rf_pending_testaccs > 0) {
+		printf("RAIDFRAME: Can't shutdown because there are %d pending test accs\n",
+		       rf_pending_testaccs);
+		raidunlock(rs);
+		return(EINVAL);
+	}
+ if (rf_debugKernelAccess) {
+ printf("call shutdown\n");
+ }
+ raidPtrs[unit]->proc = p; /* XXX necessary evil */
+ retcode = rf_Shutdown(raidPtrs[unit]);
+
+ db1_printf(("Done main shutdown\n"));
+
+ pool_destroy(&rs->sc_cbufpool);
+ db1_printf(("Done freeing component buffer freelist\n"));
+
+ /* It's no longer initialized... */
+ rs->sc_flags &= ~RAIDF_INITED;
+
+ /* Detach the disk. */
+ disk_detach(&rs->sc_dkdev);
+
+ raidunlock(rs);
+
+ return(retcode);
+
+ /* initialize all parity */
+ case RAIDFRAME_REWRITEPARITY:
+
+ if (raidPtrs[unit]->Layout.map->faultsTolerated == 0)
+ return(EINVAL);
+ /* borrow the thread of the requesting process */
+ raidPtrs[unit]->proc = p; /* Blah... :-p GO */
+ retcode = rf_RewriteParity(raidPtrs[unit]);
+ /* return I/O Error if the parity rewrite fails */
+
+ if (retcode)
+ retcode = EIO;
+ return(retcode);
+
+ /* issue a test-unit-ready through raidframe to the
+ indicated device */
+#if 0 /* XXX not supported yet (ever?) */
+ case RAIDFRAME_TUR:
+ /* debug only */
+ retcode = rf_SCSI_DoTUR(0, 0, 0, 0, *(dev_t *) data);
+ return(retcode);
+#endif
+ case RAIDFRAME_GET_INFO:
+ {
+ RF_Raid_t *raid = raidPtrs[unit];
+ RF_DeviceConfig_t *cfg, **ucfgp;
+ int i, j, d;
+
+ if (!raid->valid)
+ return(ENODEV);
+ ucfgp = (RF_DeviceConfig_t **)data;
+ RF_Malloc(cfg,sizeof(RF_DeviceConfig_t),
+ (RF_DeviceConfig_t *));
+ if (cfg == NULL)
+ return(ENOMEM);
+ bzero((char *)cfg, sizeof(RF_DeviceConfig_t));
+ cfg->rows = raid->numRow;
+ cfg->cols = raid->numCol;
+ cfg->ndevs = raid->numRow * raid->numCol;
+ if (cfg->ndevs >= RF_MAX_DISKS) {
+ cfg->ndevs = 0;
+ return(ENOMEM);
+ }
+ cfg->nspares = raid->numSpare;
+ if (cfg->nspares >= RF_MAX_DISKS) {
+ cfg->nspares = 0;
+ return(ENOMEM);
+ }
+ cfg->maxqdepth = raid->maxQueueDepth;
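+		/*
+		 * Flatten the row x column disk array into devs[]; the
+		 * spares live past the end of row 0, so copy them from
+		 * Disks[0][cols..cols+nspares-1].
+		 */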
+ d = 0;
+ for(i=0;i<cfg->rows;i++) {
+ for(j=0;j<cfg->cols;j++) {
+ cfg->devs[d] = raid->Disks[i][j];
+ d++;
+ }
+ }
+ for(j=cfg->cols,i=0;i<cfg->nspares;i++,j++) {
+ cfg->spares[i] = raid->Disks[0][j];
+ }
+ retcode = copyout((caddr_t)cfg, (caddr_t)*ucfgp,
+ sizeof(RF_DeviceConfig_t));
+ RF_Free(cfg,sizeof(RF_DeviceConfig_t));
+
+ return(retcode);
+ }
+ break;
+
+ case RAIDFRAME_RESET_ACCTOTALS:
+ {
+ RF_Raid_t *raid = raidPtrs[unit];
+
+ bzero(&raid->acc_totals, sizeof(raid->acc_totals));
+ return(0);
+ }
+ break;
+
+ case RAIDFRAME_GET_ACCTOTALS:
+ {
+ RF_AccTotals_t *totals = (RF_AccTotals_t *)data;
+ RF_Raid_t *raid = raidPtrs[unit];
+
+ *totals = raid->acc_totals;
+ return(0);
+ }
+ break;
+
+ case RAIDFRAME_KEEP_ACCTOTALS:
+ {
+ RF_Raid_t *raid = raidPtrs[unit];
+ int *keep = (int *)data;
+
+ raid->keep_acc_totals = *keep;
+ return(0);
+ }
+ break;
+
+ case RAIDFRAME_GET_SIZE:
+ *(int *) data = raidPtrs[unit]->totalSectors;
+ return(0);
+
+#define RAIDFRAME_RECON 1
+ /* XXX The above should probably be set somewhere else!! GO */
+#if RAIDFRAME_RECON > 0
+
+ /* fail a disk & optionally start reconstruction */
+ case RAIDFRAME_FAIL_DISK:
+ rr = (struct rf_recon_req *) data;
+
+ if (rr->row < 0 || rr->row >= raidPtrs[unit]->numRow
+ || rr->col < 0 || rr->col >= raidPtrs[unit]->numCol)
+ return(EINVAL);
+
+ printf("Failing the disk: row: %d col: %d\n",rr->row,rr->col);
+
+ /* make a copy of the recon request so that we don't
+ rely on the user's buffer */
+ RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
+ bcopy(rr, rrcopy, sizeof(*rr));
+ rrcopy->raidPtr = (void *) raidPtrs[unit];
+
+ LOCK_RECON_Q_MUTEX();
+ rrcopy->next = recon_queue;
+ recon_queue = rrcopy;
+ wakeup(&recon_queue);
+ UNLOCK_RECON_Q_MUTEX();
+
+ return(0);
+
+ /* invoke a copyback operation after recon on whatever
+ disk needs it, if any */
+ case RAIDFRAME_COPYBACK:
+ /* borrow the current thread to get this done */
+ raidPtrs[unit]->proc = p; /* ICK.. but needed :-p GO */
+ rf_CopybackReconstructedData(raidPtrs[unit]);
+ return(0);
+
+ /* return the percentage completion of reconstruction */
+ case RAIDFRAME_CHECKRECON:
+ row = *(int *) data;
+ if (row < 0 || row >= raidPtrs[unit]->numRow)
+ return(EINVAL);
+ if (raidPtrs[unit]->status[row] != rf_rs_reconstructing)
+ *(int *) data = 100;
+ else
+ *(int *) data = raidPtrs[unit]->reconControl[row]->percentComplete;
+ return(0);
+
+ /* the sparetable daemon calls this to wait for the
+ kernel to need a spare table.
+ * this ioctl does not return until a spare table is needed.
+ * XXX -- calling mpsleep here in the ioctl code is almost
+ certainly wrong and evil. -- XXX
+ * XXX -- I should either compute the spare table in the
+ kernel, or have a different -- XXX
+ * XXX -- interface (a different character device) for
+ delivering the table -- XXX
+ */
+#if 0
+ case RAIDFRAME_SPARET_WAIT:
+ RF_LOCK_MUTEX(rf_sparet_wait_mutex);
+ while (!rf_sparet_wait_queue) mpsleep(&rf_sparet_wait_queue, (PZERO+1)|PCATCH, "sparet wait", 0, (void *) simple_lock_addr(rf_sparet_wait_mutex), MS_LOCK_SIMPLE);
+ waitreq = rf_sparet_wait_queue;
+ rf_sparet_wait_queue = rf_sparet_wait_queue->next;
+ RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
+
+ *((RF_SparetWait_t *) data) = *waitreq; /* structure assignment */
+
+ RF_Free(waitreq, sizeof(*waitreq));
+ return(0);
+
+
+ /* wakes up a process waiting on SPARET_WAIT and puts an
+	   error code in it that will cause the daemon to exit */
+ case RAIDFRAME_ABORT_SPARET_WAIT:
+ RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
+ waitreq->fcol = -1;
+ RF_LOCK_MUTEX(rf_sparet_wait_mutex);
+ waitreq->next = rf_sparet_wait_queue;
+ rf_sparet_wait_queue = waitreq;
+ RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
+ wakeup(&rf_sparet_wait_queue);
+ return(0);
+
+ /* used by the spare table daemon to deliver a spare table
+ into the kernel */
+ case RAIDFRAME_SEND_SPARET:
+
+ /* install the spare table */
+ retcode = rf_SetSpareTable(raidPtrs[unit],*(void **) data);
+
+ /* respond to the requestor. the return status of the
+ spare table installation is passed in the "fcol" field */
+ RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
+ waitreq->fcol = retcode;
+ RF_LOCK_MUTEX(rf_sparet_wait_mutex);
+ waitreq->next = rf_sparet_resp_queue;
+ rf_sparet_resp_queue = waitreq;
+ wakeup(&rf_sparet_resp_queue);
+ RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
+
+ return(retcode);
+#endif
+
+
+#endif /* RAIDFRAME_RECON > 0 */
+
+ default: break; /* fall through to the os-specific code below */
+
+ }
+
+ if (!raidPtrs[unit]->valid)
+ return(EINVAL);
+
+ /*
+ * Add support for "regular" device ioctls here.
+ */
+
+ switch (cmd) {
+ case DIOCGDINFO:
+ db1_printf(("DIOCGDINFO %d %d\n",(int)dev,(int)DISKPART(dev)));
+ *(struct disklabel *)data = *(rs->sc_dkdev.dk_label);
+ break;
+
+ case DIOCGPART:
+ db1_printf(("DIOCGPART: %d %d\n",(int)dev,(int)DISKPART(dev)));
+ ((struct partinfo *)data)->disklab = rs->sc_dkdev.dk_label;
+ ((struct partinfo *)data)->part =
+ &rs->sc_dkdev.dk_label->d_partitions[DISKPART(dev)];
+ break;
+
+ case DIOCWDINFO:
+ db1_printf(("DIOCWDINFO\n"));
+ case DIOCSDINFO:
+ db1_printf(("DIOCSDINFO\n"));
+ if ((error = raidlock(rs)) != 0)
+ return (error);
+
+ rs->sc_flags |= RAIDF_LABELLING;
+
+ error = setdisklabel(rs->sc_dkdev.dk_label,
+ (struct disklabel *)data, 0, rs->sc_dkdev.dk_cpulabel);
+ if (error == 0) {
+ if (cmd == DIOCWDINFO)
+ error = writedisklabel(RAIDLABELDEV(dev),
+ raidstrategy, rs->sc_dkdev.dk_label,
+ rs->sc_dkdev.dk_cpulabel);
+ }
+
+ rs->sc_flags &= ~RAIDF_LABELLING;
+
+ raidunlock(rs);
+
+ if (error)
+ return (error);
+ break;
+
+ case DIOCWLABEL:
+ db1_printf(("DIOCWLABEL\n"));
+ if (*(int *)data != 0)
+ rs->sc_flags |= RAIDF_WLABEL;
+ else
+ rs->sc_flags &= ~RAIDF_WLABEL;
+ break;
+
+ case DIOCGDEFLABEL:
+ db1_printf(("DIOCGDEFLABEL\n"));
+ raidgetdefaultlabel(raidPtrs[unit], rs,
+ (struct disklabel *)data);
+ break;
+
+ default:
+ retcode = ENOTTY; /* XXXX ?? OR EINVAL ? */
+ }
+ return(retcode);
+
+}
+
+
+/* raidinit -- complete the rest of the initialization for the
+ RAIDframe device. */
+
+
+static int
+raidinit(dev, raidPtr,unit)
+ dev_t dev;
+ RF_Raid_t *raidPtr;
+ int unit;
+{
+ int retcode;
+ /* int ix; */
+ /* struct raidbuf *raidbp; */
+ struct raid_softc *rs;
+
+ retcode = 0;
+
+ rs = &raid_softc[unit];
+ pool_init(&rs->sc_cbufpool, sizeof(struct raidbuf), 0,
+ 0, 0, "raidpl", 0, NULL, NULL, M_DEVBUF);
+
+
+ /* XXX should check return code first... */
+ rs->sc_flags |= RAIDF_INITED;
+
+ sprintf(rs->sc_xname, "raid%d", unit); /* XXX doesn't check bounds.*/
+
+ rs->sc_dkdev.dk_name = rs->sc_xname;
+ /* disk_attach actually creates space for the CPU disklabel, among
+ other things, so it's critical to call this *BEFORE* we
+ try putzing with disklabels. */
+ disk_attach(&rs->sc_dkdev);
+
+ /* XXX There may be a weird interaction here between this, and
+ protectedSectors, as used in RAIDframe. */
+ rs->sc_size = raidPtr->totalSectors;
+ rs->sc_dev = dev;
+ return(retcode);
+}
+
+
+/*********************************************************
+ *
+ * initialization code called at boot time (startup.c)
+ *
+ ********************************************************/
+int rf_boot()
+{
+ int i, rc;
+
+ rc = rf_mutex_init(&rf_sparet_wait_mutex);
+ if (rc) {
+ RF_PANIC();
+ }
+ rc = rf_mutex_init(&rf_async_done_q_mutex);
+ if (rc) {
+ RF_PANIC();
+ }
+ rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
+ recon_queue = NULL;
+ rf_async_done_qh = rf_async_done_qt = NULL;
+ for (i=0; i<numraid; i++)
+ raidPtrs[i] = NULL;
+ rc = rf_BootRaidframe();
+ if (rc == 0)
+ printf("Kernelized RAIDframe activated\n");
+ else
+ rf_kbooted = RFK_BOOT_BAD;
+ return(rc);
+}
+
+/*
+ * This kernel thread never exits. It is created once, and persists
+ * until the system reboots.
+ */
+void rf_ReconKernelThread()
+{
+ struct rf_recon_req *req;
+ int s;
+
+ /* XXX not sure what spl() level we should be at here... probably splbio() */
+ s=splbio();
+
+ while (1) {
+ /* grab the next reconstruction request from the queue */
+ LOCK_RECON_Q_MUTEX();
+ while (!recon_queue) {
+ UNLOCK_RECON_Q_MUTEX();
+ tsleep(&recon_queue, PRIBIO | PCATCH, "raidframe recon", 0);
+ LOCK_RECON_Q_MUTEX();
+ }
+ req = recon_queue;
+ recon_queue = recon_queue->next;
+ UNLOCK_RECON_Q_MUTEX();
+
+ /*
+ * If flags specifies that we should start recon, this call
+ * will not return until reconstruction completes, fails, or is aborted.
+ */
+ rf_FailDisk((RF_Raid_t *) req->raidPtr, req->row, req->col,
+ ((req->flags&RF_FDFLAGS_RECON) ? 1 : 0));
+
+ RF_Free(req, sizeof(*req));
+ }
+}
+/* wake up the daemon & tell it to get us a spare table
+ * XXX
+ * the entries in the queues should be tagged with the raidPtr
+ * so that in the extremely rare case that two recons happen at once, we know for
+ * which device we're requesting a spare table
+ * XXX
+ */
+int rf_GetSpareTableFromDaemon(req)
+ RF_SparetWait_t *req;
+{
+ int retcode;
+
+ RF_LOCK_MUTEX(rf_sparet_wait_mutex);
+ req->next = rf_sparet_wait_queue;
+ rf_sparet_wait_queue = req;
+ wakeup(&rf_sparet_wait_queue);
+
+	/* XXX the mpsleep in the #if 0 block below unlocks the mutex across the sleep; tsleep does not */
+ while (!rf_sparet_resp_queue) {
+ tsleep(&rf_sparet_resp_queue, PRIBIO | PCATCH,
+ "raidframe getsparetable", 0);
+#if 0
+ mpsleep(&rf_sparet_resp_queue, PZERO, "sparet resp", 0, (void *) simple_lock_addr(rf_sparet_wait_mutex), MS_LOCK_SIMPLE);
+#endif
+ }
+ req = rf_sparet_resp_queue;
+ rf_sparet_resp_queue = req->next;
+ RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
+
+ retcode = req->fcol;
+ RF_Free(req, sizeof(*req)); /* this is not the same req as we alloc'd */
+ return(retcode);
+}
+
+/* a wrapper around rf_DoAccess that extracts appropriate info from the bp & passes it down.
+ * any calls originating in the kernel must use non-blocking I/O
+ * do some extra sanity checking to return "appropriate" error values for
+ * certain conditions (to make some standard utilities work)
+ */
+int rf_DoAccessKernel(raidPtr, bp, flags, cbFunc, cbArg)
+ RF_Raid_t *raidPtr;
+ struct buf *bp;
+ RF_RaidAccessFlags_t flags;
+ void (*cbFunc)(struct buf *);
+ void *cbArg;
+{
+ RF_SectorCount_t num_blocks, pb, sum;
+ RF_RaidAddr_t raid_addr;
+ int retcode;
+ struct partition *pp;
+ daddr_t blocknum;
+ int unit;
+ struct raid_softc *rs;
+
+ /* XXX The dev_t used here should be for /dev/[r]raid* !!! */
+
+ unit = raidPtr->raidid;
+ rs = &raid_softc[unit];
+
+ /* Ok, for the bp we have here, bp->b_blkno is relative to the
+ partition.. Need to make it absolute to the underlying
+ device.. */
+
+ blocknum = bp->b_blkno;
+ if (DISKPART(bp->b_dev) != RAW_PART) {
+ pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)];
+ blocknum += pp->p_offset;
+ db1_printf(("updated: %d %d\n",DISKPART(bp->b_dev),
+ pp->p_offset));
+ } else {
+ db1_printf(("Is raw..\n"));
+ }
+ db1_printf(("Blocks: %d, %d\n", (int) bp->b_blkno, (int) blocknum));
+
+ db1_printf(("bp->b_bcount = %d\n",(int)bp->b_bcount));
+ db1_printf(("bp->b_resid = %d\n",(int)bp->b_resid));
+
+ /* *THIS* is where we adjust what block we're going to... but
+ DO NOT TOUCH bp->b_blkno!!! */
+ raid_addr = blocknum;
+
+ num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
+ pb = (bp->b_bcount&raidPtr->sectorMask) ? 1 : 0;
+ sum = raid_addr + num_blocks + pb;
+ if (1 || rf_debugKernelAccess) {
+ db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
+ (int)raid_addr, (int)sum,(int)num_blocks,
+ (int)pb,(int)bp->b_resid));
+ }
+
+
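+	/*
+	 * Reject accesses that run past the end of the array.  The
+	 * (sum < ...) comparisons also catch wrap-around in the sector
+	 * arithmetic above.
+	 */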
+ if ((sum > raidPtr->totalSectors) || (sum < raid_addr)
+ || (sum < num_blocks) || (sum < pb))
+ {
+ bp->b_error = ENOSPC;
+ bp->b_flags |= B_ERROR;
+ bp->b_resid = bp->b_bcount;
+ biodone(bp);
+ return(bp->b_error);
+ }
+
+ /*
+ * XXX rf_DoAccess() should do this, not just DoAccessKernel()
+ */
+
+ if (bp->b_bcount & raidPtr->sectorMask) {
+ bp->b_error = EINVAL;
+ bp->b_flags |= B_ERROR;
+ bp->b_resid = bp->b_bcount;
+ biodone(bp);
+ return(bp->b_error);
+ }
+ db1_printf(("Calling DoAccess..\n"));
+
+ /* don't ever condition on bp->b_flags & B_WRITE.
+ always condition on B_READ instead */
+ retcode = rf_DoAccess(raidPtr, (bp->b_flags & B_READ) ?
+ RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
+ 0, raid_addr, num_blocks, bp->b_un.b_addr,
+ bp, NULL, NULL, RF_DAG_NONBLOCKING_IO|flags,
+ NULL, cbFunc, cbArg);
+#if 0
+ db1_printf(("After call to DoAccess: 0x%x 0x%x %d\n",bp,
+ bp->b_data,(int)bp->b_resid));
+#endif
+ return(retcode);
+}
+
+/* invoke an I/O from kernel mode. Disk queue should be locked upon entry */
+
+int rf_DispatchKernelIO(queue, req)
+ RF_DiskQueue_t *queue;
+ RF_DiskQueueData_t *req;
+{
+ int op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
+ struct buf *bp;
+ struct raidbuf *raidbp=NULL;
+ struct raid_softc *rs;
+ int unit;
+
+ /* XXX along with the vnode, we also need the softc associated with
+ this device.. */
+
+ req->queue = queue;
+
+ unit = queue->raidPtr->raidid;
+
+ db1_printf(("DispatchKernelIO unit: %d\n",unit));
+
+ if (unit >= numraid) {
+ printf("Invalid unit number: %d %d\n",unit,numraid);
+ panic("Invalid Unit number in rf_DispatchKernelIO\n");
+ }
+
+ rs = &raid_softc[unit];
+
+ /* XXX is this the right place? */
+ disk_busy(&rs->sc_dkdev);
+
+ bp = req->bp;
+
+ /*
+ XXX when there is a physical disk failure, someone is passing
+ us a buffer that contains old stuff!! Attempt to deal with
+ this problem without taking a performance hit...
+ (not sure where the real bug is. It's buried in RAIDframe
+ somewhere) :-( GO )
+ */
+
+ if (bp->b_flags & B_ERROR) {
+ bp->b_flags &= ~B_ERROR;
+ }
+ if (bp->b_error!=0) {
+ bp->b_error = 0;
+ }
+
+ raidbp = RAIDGETBUF(rs);
+
+ raidbp->rf_flags = 0; /* XXX not really used anywhere... */
+
+ /*
+ * context for raidiodone
+ */
+ raidbp->rf_obp = bp;
+ raidbp->req = req;
+
+ switch (req->type) {
+ case RF_IO_TYPE_NOP: /* used primarily to unlock a locked queue */
+ /*
+ Dprintf2("rf_DispatchKernelIO: NOP to r %d c %d\n",
+ queue->row, queue->col);
+ */
+ /* XXX need to do something extra here.. */
+ /* I'm leaving this in, as I've never actually seen it
+ used, and I'd like folks to report it... GO */
+		printf("WAKEUP CALLED\n");
+ queue->numOutstanding++;
+
+ /* XXX need to glue the original buffer into this?? */
+
+ KernelWakeupFunc(&raidbp->rf_buf);
+ break;
+
+ case RF_IO_TYPE_READ:
+ case RF_IO_TYPE_WRITE:
+
+ if (req->tracerec) {
+ RF_ETIMER_START(req->tracerec->timer);
+ }
+
+
+ InitBP(&raidbp->rf_buf, queue->rf_cinfo->ci_vp,
+ op | bp->b_flags, queue->rf_cinfo->ci_dev,
+ req->sectorOffset, req->numSector,
+ req->buf, KernelWakeupFunc, (void *) req,
+ queue->raidPtr->logBytesPerSector, req->b_proc);
+
+ if (rf_debugKernelAccess) {
+ db1_printf(("dispatch: bp->b_blkno = %ld\n",
+ (long) bp->b_blkno));
+ }
+ queue->numOutstanding++;
+ queue->last_deq_sector = req->sectorOffset;
+ /* acc wouldn't have been let in if there were any
+ pending reqs at any other priority */
+ queue->curPriority = req->priority;
+ /*
+ Dprintf3("rf_DispatchKernelIO: %c to row %d col %d\n",
+ req->type, queue->row, queue->col);
+ */
+
+ db1_printf(("Going for %c to unit %d row %d col %d\n",
+ req->type, unit, queue->row, queue->col));
+ db1_printf(("sector %d count %d (%d bytes) %d\n",
+ (int) req->sectorOffset, (int) req->numSector,
+ (int) (req->numSector <<
+ queue->raidPtr->logBytesPerSector),
+ (int) queue->raidPtr->logBytesPerSector));
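+		/*
+		 * Writes are charged against the component vnode's
+		 * pending-output count; biodone() on the component buf
+		 * will decrement it again.
+		 */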
+ if ((raidbp->rf_buf.b_flags & B_READ) == 0) {
+ raidbp->rf_buf.b_vp->v_numoutput++;
+ }
+
+ VOP_STRATEGY(&raidbp->rf_buf);
+
+ break;
+
+ default:
+ panic("bad req->type in rf_DispatchKernelIO");
+ }
+ db1_printf(("Exiting from DispatchKernelIO\n"));
+ return(0);
+}
+
+/* this is the callback function associated with a I/O invoked from
+ kernel code.
+ */
+static void KernelWakeupFunc(vbp)
+ struct buf *vbp;
+{
+ RF_DiskQueueData_t *req = NULL;
+ RF_DiskQueue_t *queue;
+ struct raidbuf *raidbp = (struct raidbuf *)vbp;
+ struct buf *bp;
+ struct raid_softc *rs;
+ int unit;
+ register int s;
+
+ s=splbio(); /* XXX */
+ db1_printf(("recovering the request queue:\n"));
+ req = raidbp->req;
+
+ bp = raidbp->rf_obp;
+#if 0
+ db1_printf(("bp=0x%x\n",bp));
+#endif
+
+ queue = (RF_DiskQueue_t *) req->queue;
+
+ if (raidbp->rf_buf.b_flags & B_ERROR) {
+#if 0
+ printf("Setting bp->b_flags!!! %d\n",raidbp->rf_buf.b_error);
+#endif
+ bp->b_flags |= B_ERROR;
+ bp->b_error = raidbp->rf_buf.b_error ?
+ raidbp->rf_buf.b_error : EIO;
+ }
+
+#if 0
+ db1_printf(("raidbp->rf_buf.b_bcount=%d\n",(int)raidbp->rf_buf.b_bcount));
+ db1_printf(("raidbp->rf_buf.b_bufsize=%d\n",(int)raidbp->rf_buf.b_bufsize));
+ db1_printf(("raidbp->rf_buf.b_resid=%d\n",(int)raidbp->rf_buf.b_resid));
+ db1_printf(("raidbp->rf_buf.b_data=0x%x\n",raidbp->rf_buf.b_data));
+#endif
+
+ /* XXX methinks this could be wrong... */
+#if 1
+ bp->b_resid = raidbp->rf_buf.b_resid;
+#endif
+
+ if (req->tracerec) {
+ RF_ETIMER_STOP(req->tracerec->timer);
+ RF_ETIMER_EVAL(req->tracerec->timer);
+ RF_LOCK_MUTEX(rf_tracing_mutex);
+ req->tracerec->diskwait_us += RF_ETIMER_VAL_US(req->tracerec->timer);
+ req->tracerec->phys_io_us += RF_ETIMER_VAL_US(req->tracerec->timer);
+ req->tracerec->num_phys_ios++;
+ RF_UNLOCK_MUTEX(rf_tracing_mutex);
+ }
+
+ bp->b_bcount = raidbp->rf_buf.b_bcount;/* XXXX ?? */
+
+ unit = queue->raidPtr->raidid; /* *Much* simpler :-> */
+
+
+ /* XXX Ok, let's get aggressive... If B_ERROR is set, let's go ballistic,
+ and mark the component as hosed... */
+#if 1
+ if (bp->b_flags&B_ERROR) {
+ /* Mark the disk as dead */
+ /* but only mark it once... */
+ if (queue->raidPtr->Disks[queue->row][queue->col].status ==
+ rf_ds_optimal) {
+ printf("raid%d: IO Error. Marking %s as failed.\n",
+ unit, queue->raidPtr->Disks[queue->row][queue->col].devname );
+ queue->raidPtr->Disks[queue->row][queue->col].status =
+ rf_ds_failed;
+ queue->raidPtr->status[queue->row] = rf_rs_degraded;
+ queue->raidPtr->numFailures++;
+ } else { /* Disk is already dead... */
+ /* printf("Disk already marked as dead!\n"); */
+ }
+
+ }
+#endif
+
+ rs = &raid_softc[unit];
+ RAIDPUTBUF(rs,raidbp);
+
+
+ if (bp->b_resid==0) {
+ db1_printf(("Disk is no longer busy for this buffer... %d %ld %ld\n",
+ unit, bp->b_resid, bp->b_bcount));
+ /* XXX is this the right place for a disk_unbusy()??!??!?!? */
+ disk_unbusy(&rs->sc_dkdev, (bp->b_bcount - bp->b_resid));
+ } else {
+ db1_printf(("b_resid is still %ld\n",bp->b_resid));
+ }
+
+ rf_DiskIOComplete(queue, req, (bp->b_flags & B_ERROR) ? 1 : 0);
+ (req->CompleteFunc)(req->argument, (bp->b_flags & B_ERROR) ? 1 : 0);
+ /* printf("Exiting KernelWakeupFunc\n"); */
+
+ splx(s); /* XXX */
+}
+
+
+
+/*
+ * initialize a buf structure for doing an I/O in the kernel.
+ */
+static void InitBP(
+ struct buf *bp,
+ struct vnode *b_vp,
+ unsigned rw_flag,
+ dev_t dev,
+ RF_SectorNum_t startSect,
+ RF_SectorCount_t numSect,
+ caddr_t buf,
+ void (*cbFunc)(struct buf *),
+ void *cbArg,
+ int logBytesPerSector,
+ struct proc *b_proc)
+{
+ /* bp->b_flags = B_PHYS | rw_flag; */
+ bp->b_flags = B_CALL | rw_flag; /* XXX need B_PHYS here too??? */
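+	/* B_CALL makes biodone() invoke b_iodone (our callback) on completion */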
+ bp->b_bcount = numSect << logBytesPerSector;
+ bp->b_bufsize = bp->b_bcount;
+ bp->b_error = 0;
+ bp->b_dev = dev;
+ db1_printf(("bp->b_dev is %d\n", dev));
+ bp->b_un.b_addr = buf;
+#if 0
+ db1_printf(("bp->b_data=0x%x\n",bp->b_data));
+#endif
+
+ bp->b_blkno = startSect;
+ bp->b_resid = bp->b_bcount; /* XXX is this right!??!?!! */
+ db1_printf(("b_bcount is: %d\n",(int)bp->b_bcount));
+ if (bp->b_bcount == 0) {
+ panic("bp->b_bcount is zero in InitBP!!\n");
+ }
+ bp->b_proc = b_proc;
+ bp->b_iodone = cbFunc;
+ bp->b_vp = b_vp;
+
+}
+#endif /* KERNEL */
+
+/* Extras... */
+
+unsigned int rpcc()
+{
+ /* XXX no clue what this is supposed to do.. my guess is
+ that it's supposed to read the CPU cycle counter... */
+ /* db1_printf("this is supposed to do something useful too!??\n"); */
+ return(0);
+}
+
+#if 0
+int rf_GetSpareTableFromDaemon(req)
+ RF_SparetWait_t *req;
+{
+ int retcode=1;
+ printf("This is supposed to do something useful!!\n"); /* XXX */
+
+ return(retcode);
+
+}
+#endif
+
+static void
+raidgetdefaultlabel(raidPtr, rs, lp)
+ RF_Raid_t *raidPtr;
+ struct raid_softc *rs;
+ struct disklabel *lp;
+{
+ db1_printf(("Building a default label...\n"));
+ bzero(lp, sizeof(*lp));
+
+ /* fabricate a label... */
+ lp->d_secperunit = raidPtr->totalSectors;
+ lp->d_secsize = raidPtr->bytesPerSector;
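+	/*
+	 * Fake geometry: one track per cylinder, sized so that each
+	 * cylinder works out to 1MB.
+	 */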
+ lp->d_nsectors = 1024 * (1024 / raidPtr->bytesPerSector);
+ lp->d_ntracks = 1;
+ lp->d_ncylinders = raidPtr->totalSectors / lp->d_nsectors;
+ lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
+
+ strncpy(lp->d_typename, "raid", sizeof(lp->d_typename));
+ lp->d_type = DTYPE_RAID;
+ strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
+ lp->d_rpm = 3600;
+ lp->d_interleave = 1;
+ lp->d_flags = 0;
+
+ lp->d_partitions[RAW_PART].p_offset = 0;
+ lp->d_partitions[RAW_PART].p_size = raidPtr->totalSectors;
+ lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
+ lp->d_npartitions = RAW_PART + 1;
+
+ lp->d_magic = DISKMAGIC;
+ lp->d_magic2 = DISKMAGIC;
+ lp->d_checksum = dkcksum(rs->sc_dkdev.dk_label);
+
+}
+
+/*
+ * Read the disklabel from the raid device. If one is not present, fake one
+ * up.
+ */
+static void
+raidgetdisklabel(dev)
+ dev_t dev;
+{
+ int unit = raidunit(dev);
+ struct raid_softc *rs = &raid_softc[unit];
+ char *errstring;
+ struct disklabel *lp = rs->sc_dkdev.dk_label;
+ struct cpu_disklabel *clp = rs->sc_dkdev.dk_cpulabel;
+ RF_Raid_t *raidPtr;
+
+ db1_printf(("Getting the disklabel...\n"));
+
+ bzero(clp, sizeof(*clp));
+
+ raidPtr = raidPtrs[unit];
+
+ raidgetdefaultlabel(raidPtr, rs, lp);
+
+ /*
+ * Call the generic disklabel extraction routine.
+ */
+ errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy,
+ rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel);
+ if (errstring)
+ raidmakedisklabel(rs);
+ else {
+ int i;
+ struct partition *pp;
+
+ /*
+ * Sanity check whether the found disklabel is valid.
+ *
+		 * This is necessary since the total size of the raid device
+		 * may vary when the interleave is changed even though exactly
+		 * the same components are used, and an old disklabel may be
+		 * used if one is found.
+ */
+ if (lp->d_secperunit != rs->sc_size)
+ printf("WARNING: %s: "
+ "total sector size in disklabel (%d) != "
+ "the size of raid (%d)\n", rs->sc_xname,
+ lp->d_secperunit, rs->sc_size);
+ for (i = 0; i < lp->d_npartitions; i++) {
+ pp = &lp->d_partitions[i];
+ if (pp->p_offset + pp->p_size > rs->sc_size)
+ printf("WARNING: %s: end of partition `%c' "
+ "exceeds the size of raid (%d)\n",
+ rs->sc_xname, 'a' + i, rs->sc_size);
+ }
+ }
+
+}
+
+/*
+ * Take care of things one might want to take care of in the event
+ * that a disklabel isn't present.
+ */
+static void
+raidmakedisklabel(rs)
+ struct raid_softc *rs;
+{
+ struct disklabel *lp = rs->sc_dkdev.dk_label;
+ db1_printf(("Making a label..\n"));
+
+ /*
+ * For historical reasons, if there's no disklabel present
+ * the raw partition must be marked FS_BSDFFS.
+ */
+
+ lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
+
+ strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
+
+ lp->d_checksum = dkcksum(lp);
+}
+
+/*
+ * Lookup the provided name in the filesystem. If the file exists,
+ * is a valid block device, and isn't being used by anyone else,
+ * set *vpp to the file's vnode.
+ * You'll find the original of this in ccd.c
+ */
+int
+raidlookup(path, p, vpp)
+ char *path;
+ struct proc *p;
+ struct vnode **vpp; /* result */
+{
+ struct nameidata nd;
+ struct vnode *vp;
+ struct vattr va;
+ int error;
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, path, p);
+ if ((error = vn_open(&nd, FREAD|FWRITE, 0)) != 0) {
+#ifdef DEBUG
+ printf("RAIDframe: vn_open returned %d\n",error);
+#endif
+ return (error);
+ }
+ vp = nd.ni_vp;
+ if (vp->v_usecount > 1) {
+ VOP_UNLOCK(vp, 0);
+ (void)vn_close(vp, FREAD|FWRITE, p->p_ucred, p);
+ return (EBUSY);
+ }
+ if ((error = VOP_GETATTR(vp, &va, p->p_ucred, p)) != 0) {
+ VOP_UNLOCK(vp, 0);
+ (void)vn_close(vp, FREAD|FWRITE, p->p_ucred, p);
+ return (error);
+ }
+ /* XXX: eventually we should handle VREG, too. */
+ if (va.va_type != VBLK) {
+ VOP_UNLOCK(vp, 0);
+ (void)vn_close(vp, FREAD|FWRITE, p->p_ucred, p);
+ return (ENOTBLK);
+ }
+ VOP_UNLOCK(vp, 0);
+ *vpp = vp;
+ return (0);
+}
+
+/*
+ * Wait interruptibly for an exclusive lock.
+ *
+ * XXX
+ * Several drivers do this; it should be abstracted and made MP-safe.
+ * (Hmm... where have we seen this warning before :-> GO )
+ */
+static int
+raidlock(rs)
+ struct raid_softc *rs;
+{
+ int error;
+
+ while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
+ rs->sc_flags |= RAIDF_WANTED;
+ if ((error =
+ tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0)
+ return (error);
+ }
+ rs->sc_flags |= RAIDF_LOCKED;
+ return (0);
+}
+
+/*
+ * Unlock and wake up any waiters.
+ */
+static void
+raidunlock(rs)
+ struct raid_softc *rs;
+{
+
+ rs->sc_flags &= ~RAIDF_LOCKED;
+ if ((rs->sc_flags & RAIDF_WANTED) != 0) {
+ rs->sc_flags &= ~RAIDF_WANTED;
+ wakeup(rs);
+ }
+}
diff --git a/sys/dev/raidframe/rf_nwayxor.c b/sys/dev/raidframe/rf_nwayxor.c
new file mode 100644
index 00000000000..c319aa04c52
--- /dev/null
+++ b/sys/dev/raidframe/rf_nwayxor.c
@@ -0,0 +1,454 @@
+/* $OpenBSD: rf_nwayxor.c,v 1.1 1999/01/11 14:29:31 niklas Exp $ */
+/* $NetBSD: rf_nwayxor.c,v 1.1 1998/11/13 04:20:31 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland, Daniel Stodolsky
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/************************************************************
+ *
+ * nwayxor.c -- code to do N-way xors for reconstruction
+ *
+ * nWayXorN xors N input buffers into the destination buffer.
+ * adapted from danner's longword_bxor code.
+ *
+ ************************************************************/
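
Every rf_nWayXorN() below computes the same word-wise result as the plain loop sketched here; the hand-unrolled versions differ only in load scheduling and cache-line alignment. The helper name is hypothetical and for illustration only:

    /*
     * Reference semantics: XOR 'nsrc' source buffers into the
     * destination, 'len' machine words long.
     */
    static void
    nway_xor_reference(srcs, nsrc, dest, len)
            unsigned long **srcs;
            int nsrc;
            unsigned long *dest;
            int len;
    {
            int w, s;

            for (w = 0; w < len; w++)
                    for (s = 0; s < nsrc; s++)
                            dest[w] ^= srcs[s][w];
    }
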
+
+/* :
+ * Log: rf_nwayxor.c,v
+ * Revision 1.6 1996/06/12 03:31:18 jimz
+ * only print call counts if rf_showXorCallCounts != 0
+ *
+ * Revision 1.5 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.4 1996/06/02 17:31:48 jimz
+ * Moved a lot of global stuff into array structure, where it belongs.
+ * Fixed up paritylogging, pss modules in this manner. Some general
+ * code cleanup. Removed lots of dead code, some dead files.
+ *
+ * Revision 1.3 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.2 1995/12/01 19:29:14 root
+ * added copyright info
+ *
+ */
+
+#include "rf_nwayxor.h"
+#include "rf_shutdown.h"
+
+static int callcount[10];
+static void rf_ShutdownNWayXor(void *);
+
+static void rf_ShutdownNWayXor(ignored)
+ void *ignored;
+{
+ int i;
+
+ if (rf_showXorCallCounts == 0)
+ return;
+ printf("Call counts for n-way xor routines: ");
+ for (i=0; i<10; i++)
+ printf("%d ",callcount[i]);
+ printf("\n");
+}
+
+int rf_ConfigureNWayXor(listp)
+ RF_ShutdownList_t **listp;
+{
+ int i, rc;
+
+ for (i=0; i<10; i++)
+ callcount[i] = 0;
+ rc = rf_ShutdownCreate(listp, rf_ShutdownNWayXor, NULL);
+ return(rc);
+}
+
+void rf_nWayXor1(src_rbs, dest_rb, len)
+ RF_ReconBuffer_t **src_rbs;
+ RF_ReconBuffer_t *dest_rb;
+ int len;
+{
+ register unsigned long *src = (unsigned long *) src_rbs[0]->buffer;
+ register unsigned long *dest= (unsigned long *) dest_rb->buffer;
+ register unsigned long *end = src+len;
+ register unsigned long d0, d1, d2, d3, s0, s1, s2, s3;
+
+ callcount[1]++;
+ while (len >= 4 )
+ {
+ d0 = dest[0];
+ d1 = dest[1];
+ d2 = dest[2];
+ d3 = dest[3];
+ s0 = src[0];
+ s1 = src[1];
+ s2 = src[2];
+ s3 = src[3];
+ dest[0] = d0 ^ s0;
+ dest[1] = d1 ^ s1;
+ dest[2] = d2 ^ s2;
+ dest[3] = d3 ^ s3;
+ src += 4;
+ dest += 4;
+ len -= 4;
+ }
+ while (src < end) {*dest++ ^= *src++;}
+}
+
+void rf_nWayXor2(src_rbs, dest_rb, len)
+ RF_ReconBuffer_t **src_rbs;
+ RF_ReconBuffer_t *dest_rb;
+ int len;
+{
+ register unsigned long *dst = (unsigned long *) dest_rb->buffer;
+ register unsigned long *a = dst;
+ register unsigned long *b = (unsigned long *) src_rbs[0]->buffer;
+ register unsigned long *c = (unsigned long *) src_rbs[1]->buffer;
+ unsigned long a0,a1,a2,a3, b0,b1,b2,b3;
+
+ callcount[2]++;
+ /* align dest to cache line */
+ while ((((unsigned long) dst) & 0x1f))
+ {
+ *dst++ = *a++ ^ *b++ ^ *c++;
+ len--;
+ }
+ while (len > 4 )
+ {
+ a0 = a[0]; len -= 4;
+
+ a1 = a[1];
+ a2 = a[2];
+
+ a3 = a[3]; a += 4;
+
+ b0 = b[0];
+ b1 = b[1];
+
+ b2 = b[2];
+ b3 = b[3];
+ /* start dual issue */
+ a0 ^= b0; b0 = c[0];
+
+ b += 4; a1 ^= b1;
+
+ a2 ^= b2; a3 ^= b3;
+
+ b1 = c[1]; a0 ^= b0;
+
+ b2 = c[2]; a1 ^= b1;
+
+ b3 = c[3]; a2 ^= b2;
+
+ dst[0] = a0; a3 ^= b3;
+ dst[1] = a1; c += 4;
+ dst[2] = a2;
+ dst[3] = a3; dst += 4;
+ }
+ while (len)
+ {
+ *dst++ = *a++ ^ *b++ ^ *c++;
+ len--;
+ }
+}
+
+/* note that first arg is not incremented but 2nd arg is */
+#define LOAD_FIRST(_dst,_b) \
+ a0 = _dst[0]; len -= 4; \
+ a1 = _dst[1]; \
+ a2 = _dst[2]; \
+ a3 = _dst[3]; \
+ b0 = _b[0]; \
+ b1 = _b[1]; \
+ b2 = _b[2]; \
+ b3 = _b[3]; _b += 4;
+
+/* note: arg is incremented */
+#define XOR_AND_LOAD_NEXT(_n) \
+ a0 ^= b0; b0 = _n[0]; \
+ a1 ^= b1; b1 = _n[1]; \
+ a2 ^= b2; b2 = _n[2]; \
+ a3 ^= b3; b3 = _n[3]; \
+ _n += 4;
+
+/* arg is incremented */
+#define XOR_AND_STORE(_dst) \
+ a0 ^= b0; _dst[0] = a0; \
+ a1 ^= b1; _dst[1] = a1; \
+ a2 ^= b2; _dst[2] = a2; \
+ a3 ^= b3; _dst[3] = a3; \
+ _dst += 4;
+
+
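
One iteration of the unrolled loops below chains LOAD_FIRST, one XOR_AND_LOAD_NEXT per additional source, and XOR_AND_STORE. Expanded by hand for two source buffers b and c, the net effect is dst[i] ^= b[i] ^ c[i] for four consecutive words; rf_nWayXor3() through rf_nWayXor9() simply insert more XOR_AND_LOAD_NEXT steps. A self-contained sketch of that expansion (the real functions also advance the pointers and decrement len, which is omitted here):

    /*
     * Illustration only: one 4-word step, i.e. the expansion of
     *     LOAD_FIRST(dst, b); XOR_AND_LOAD_NEXT(c); XOR_AND_STORE(dst);
     */
    static void
    xor_step_two_sources(dst, b, c)
            unsigned long *dst, *b, *c;
    {
            unsigned long a0, a1, a2, a3, b0, b1, b2, b3;

            a0 = dst[0]; a1 = dst[1]; a2 = dst[2]; a3 = dst[3]; /* LOAD_FIRST */
            b0 = b[0]; b1 = b[1]; b2 = b[2]; b3 = b[3];

            a0 ^= b0; b0 = c[0];            /* XOR_AND_LOAD_NEXT(c) */
            a1 ^= b1; b1 = c[1];
            a2 ^= b2; b2 = c[2];
            a3 ^= b3; b3 = c[3];

            a0 ^= b0; dst[0] = a0;          /* XOR_AND_STORE(dst) */
            a1 ^= b1; dst[1] = a1;
            a2 ^= b2; dst[2] = a2;
            a3 ^= b3; dst[3] = a3;
    }
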
+void rf_nWayXor3(src_rbs, dest_rb, len)
+ RF_ReconBuffer_t **src_rbs;
+ RF_ReconBuffer_t *dest_rb;
+ int len;
+{
+ register unsigned long *dst = (unsigned long *) dest_rb->buffer;
+ register unsigned long *b = (unsigned long *) src_rbs[0]->buffer;
+ register unsigned long *c = (unsigned long *) src_rbs[1]->buffer;
+ register unsigned long *d = (unsigned long *) src_rbs[2]->buffer;
+ unsigned long a0,a1,a2,a3, b0,b1,b2,b3;
+
+ callcount[3]++;
+ /* align dest to cache line */
+ while ((((unsigned long) dst) & 0x1f)) {
+ *dst++ ^= *b++ ^ *c++ ^ *d++;
+ len--;
+ }
+ while (len > 4 ) {
+ LOAD_FIRST(dst,b);
+ XOR_AND_LOAD_NEXT(c);
+ XOR_AND_LOAD_NEXT(d);
+ XOR_AND_STORE(dst);
+ }
+ while (len) {
+ *dst++ ^= *b++ ^ *c++ ^ *d++;
+ len--;
+ }
+}
+
+void rf_nWayXor4(src_rbs, dest_rb, len)
+ RF_ReconBuffer_t **src_rbs;
+ RF_ReconBuffer_t *dest_rb;
+ int len;
+{
+ register unsigned long *dst = (unsigned long *) dest_rb->buffer;
+ register unsigned long *b = (unsigned long *) src_rbs[0]->buffer;
+ register unsigned long *c = (unsigned long *) src_rbs[1]->buffer;
+ register unsigned long *d = (unsigned long *) src_rbs[2]->buffer;
+ register unsigned long *e = (unsigned long *) src_rbs[3]->buffer;
+ unsigned long a0,a1,a2,a3, b0,b1,b2,b3;
+
+ callcount[4]++;
+ /* align dest to cache line */
+ while ((((unsigned long) dst) & 0x1f)) {
+ *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++;
+ len--;
+ }
+ while (len > 4 ) {
+ LOAD_FIRST(dst,b);
+ XOR_AND_LOAD_NEXT(c);
+ XOR_AND_LOAD_NEXT(d);
+ XOR_AND_LOAD_NEXT(e);
+ XOR_AND_STORE(dst);
+ }
+ while (len) {
+ *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++;
+ len--;
+ }
+}
+
+void rf_nWayXor5(src_rbs, dest_rb, len)
+ RF_ReconBuffer_t **src_rbs;
+ RF_ReconBuffer_t *dest_rb;
+ int len;
+{
+ register unsigned long *dst = (unsigned long *) dest_rb->buffer;
+ register unsigned long *b = (unsigned long *) src_rbs[0]->buffer;
+ register unsigned long *c = (unsigned long *) src_rbs[1]->buffer;
+ register unsigned long *d = (unsigned long *) src_rbs[2]->buffer;
+ register unsigned long *e = (unsigned long *) src_rbs[3]->buffer;
+ register unsigned long *f = (unsigned long *) src_rbs[4]->buffer;
+ unsigned long a0,a1,a2,a3, b0,b1,b2,b3;
+
+ callcount[5]++;
+ /* align dest to cache line */
+ while ((((unsigned long) dst) & 0x1f)) {
+ *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++;
+ len--;
+ }
+ while (len > 4 ) {
+ LOAD_FIRST(dst,b);
+ XOR_AND_LOAD_NEXT(c);
+ XOR_AND_LOAD_NEXT(d);
+ XOR_AND_LOAD_NEXT(e);
+ XOR_AND_LOAD_NEXT(f);
+ XOR_AND_STORE(dst);
+ }
+ while (len) {
+ *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++;
+ len--;
+ }
+}
+
+void rf_nWayXor6(src_rbs, dest_rb, len)
+ RF_ReconBuffer_t **src_rbs;
+ RF_ReconBuffer_t *dest_rb;
+ int len;
+{
+ register unsigned long *dst = (unsigned long *) dest_rb->buffer;
+ register unsigned long *b = (unsigned long *) src_rbs[0]->buffer;
+ register unsigned long *c = (unsigned long *) src_rbs[1]->buffer;
+ register unsigned long *d = (unsigned long *) src_rbs[2]->buffer;
+ register unsigned long *e = (unsigned long *) src_rbs[3]->buffer;
+ register unsigned long *f = (unsigned long *) src_rbs[4]->buffer;
+ register unsigned long *g = (unsigned long *) src_rbs[5]->buffer;
+ unsigned long a0,a1,a2,a3, b0,b1,b2,b3;
+
+ callcount[6]++;
+ /* align dest to cache line */
+ while ((((unsigned long) dst) & 0x1f)) {
+ *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++ ^ *g++;
+ len--;
+ }
+ while (len > 4 ) {
+ LOAD_FIRST(dst,b);
+ XOR_AND_LOAD_NEXT(c);
+ XOR_AND_LOAD_NEXT(d);
+ XOR_AND_LOAD_NEXT(e);
+ XOR_AND_LOAD_NEXT(f);
+ XOR_AND_LOAD_NEXT(g);
+ XOR_AND_STORE(dst);
+ }
+ while (len) {
+ *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++ ^ *g++;
+ len--;
+ }
+}
+
+void rf_nWayXor7(src_rbs, dest_rb, len)
+ RF_ReconBuffer_t **src_rbs;
+ RF_ReconBuffer_t *dest_rb;
+ int len;
+{
+ register unsigned long *dst = (unsigned long *) dest_rb->buffer;
+ register unsigned long *b = (unsigned long *) src_rbs[0]->buffer;
+ register unsigned long *c = (unsigned long *) src_rbs[1]->buffer;
+ register unsigned long *d = (unsigned long *) src_rbs[2]->buffer;
+ register unsigned long *e = (unsigned long *) src_rbs[3]->buffer;
+ register unsigned long *f = (unsigned long *) src_rbs[4]->buffer;
+ register unsigned long *g = (unsigned long *) src_rbs[5]->buffer;
+ register unsigned long *h = (unsigned long *) src_rbs[6]->buffer;
+ unsigned long a0,a1,a2,a3, b0,b1,b2,b3;
+
+ callcount[7]++;
+ /* align dest to cache line */
+ while ((((unsigned long) dst) & 0x1f)) {
+ *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++ ^ *g++ ^ *h++;
+ len--;
+ }
+ while (len > 4 ) {
+ LOAD_FIRST(dst,b);
+ XOR_AND_LOAD_NEXT(c);
+ XOR_AND_LOAD_NEXT(d);
+ XOR_AND_LOAD_NEXT(e);
+ XOR_AND_LOAD_NEXT(f);
+ XOR_AND_LOAD_NEXT(g);
+ XOR_AND_LOAD_NEXT(h);
+ XOR_AND_STORE(dst);
+ }
+ while (len) {
+ *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++ ^ *g++ ^ *h++;
+ len--;
+ }
+}
+
+void rf_nWayXor8(src_rbs, dest_rb, len)
+ RF_ReconBuffer_t **src_rbs;
+ RF_ReconBuffer_t *dest_rb;
+ int len;
+{
+ register unsigned long *dst = (unsigned long *) dest_rb->buffer;
+ register unsigned long *b = (unsigned long *) src_rbs[0]->buffer;
+ register unsigned long *c = (unsigned long *) src_rbs[1]->buffer;
+ register unsigned long *d = (unsigned long *) src_rbs[2]->buffer;
+ register unsigned long *e = (unsigned long *) src_rbs[3]->buffer;
+ register unsigned long *f = (unsigned long *) src_rbs[4]->buffer;
+ register unsigned long *g = (unsigned long *) src_rbs[5]->buffer;
+ register unsigned long *h = (unsigned long *) src_rbs[6]->buffer;
+ register unsigned long *i = (unsigned long *) src_rbs[7]->buffer;
+ unsigned long a0,a1,a2,a3, b0,b1,b2,b3;
+
+ callcount[8]++;
+ /* align dest to cache line */
+ while ((((unsigned long) dst) & 0x1f)) {
+ *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++ ^ *g++ ^ *h++ ^ *i++;
+ len--;
+ }
+ while (len > 4 ) {
+ LOAD_FIRST(dst,b);
+ XOR_AND_LOAD_NEXT(c);
+ XOR_AND_LOAD_NEXT(d);
+ XOR_AND_LOAD_NEXT(e);
+ XOR_AND_LOAD_NEXT(f);
+ XOR_AND_LOAD_NEXT(g);
+ XOR_AND_LOAD_NEXT(h);
+ XOR_AND_LOAD_NEXT(i);
+ XOR_AND_STORE(dst);
+ }
+ while (len) {
+ *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++ ^ *g++ ^ *h++ ^ *i++;
+ len--;
+ }
+}
+
+
+void rf_nWayXor9(src_rbs, dest_rb, len)
+ RF_ReconBuffer_t **src_rbs;
+ RF_ReconBuffer_t *dest_rb;
+ int len;
+{
+ register unsigned long *dst = (unsigned long *) dest_rb->buffer;
+ register unsigned long *b = (unsigned long *) src_rbs[0]->buffer;
+ register unsigned long *c = (unsigned long *) src_rbs[1]->buffer;
+ register unsigned long *d = (unsigned long *) src_rbs[2]->buffer;
+ register unsigned long *e = (unsigned long *) src_rbs[3]->buffer;
+ register unsigned long *f = (unsigned long *) src_rbs[4]->buffer;
+ register unsigned long *g = (unsigned long *) src_rbs[5]->buffer;
+ register unsigned long *h = (unsigned long *) src_rbs[6]->buffer;
+ register unsigned long *i = (unsigned long *) src_rbs[7]->buffer;
+ register unsigned long *j = (unsigned long *) src_rbs[8]->buffer;
+ unsigned long a0,a1,a2,a3, b0,b1,b2,b3;
+
+ callcount[9]++;
+ /* align dest to cache line */
+ while ((((unsigned long) dst) & 0x1f)) {
+ *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++ ^ *g++ ^ *h++ ^ *i++ ^ *j++;
+ len--;
+ }
+ while (len > 4 ) {
+ LOAD_FIRST(dst,b);
+ XOR_AND_LOAD_NEXT(c);
+ XOR_AND_LOAD_NEXT(d);
+ XOR_AND_LOAD_NEXT(e);
+ XOR_AND_LOAD_NEXT(f);
+ XOR_AND_LOAD_NEXT(g);
+ XOR_AND_LOAD_NEXT(h);
+ XOR_AND_LOAD_NEXT(i);
+ XOR_AND_LOAD_NEXT(j);
+ XOR_AND_STORE(dst);
+ }
+ while (len) {
+ *dst++ ^= *b++ ^ *c++ ^ *d++ ^ *e++ ^ *f++ ^ *g++ ^ *h++ ^ *i++ ^ *j++;
+ len--;
+ }
+}
diff --git a/sys/dev/raidframe/rf_nwayxor.h b/sys/dev/raidframe/rf_nwayxor.h
new file mode 100644
index 00000000000..f474dff9908
--- /dev/null
+++ b/sys/dev/raidframe/rf_nwayxor.h
@@ -0,0 +1,75 @@
+/* $OpenBSD: rf_nwayxor.h,v 1.1 1999/01/11 14:29:31 niklas Exp $ */
+/* $NetBSD: rf_nwayxor.h,v 1.1 1998/11/13 04:20:31 oster Exp $ */
+/*
+ * rf_nwayxor.h
+ */
+/*
+ * Copyright (c) 1996 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+/*
+ * rf_nwayxor.h -- types and prototypes for nwayxor module
+ */
+/*
+ * :
+ * Log: rf_nwayxor.h,v
+ * Revision 1.4 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.3 1996/06/02 17:31:48 jimz
+ * Moved a lot of global stuff into array structure, where it belongs.
+ * Fixed up paritylogging, pss modules in this manner. Some general
+ * code cleanup. Removed lots of dead code, some dead files.
+ *
+ * Revision 1.2 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.1 1996/05/18 19:56:47 jimz
+ * Initial revision
+ *
+ */
+
+#ifndef _RF__RF_NWAYXOR_H_
+#define _RF__RF_NWAYXOR_H_
+
+#include "rf_types.h"
+#include "rf_raid.h"
+#include "rf_reconstruct.h"
+
+int rf_ConfigureNWayXor(RF_ShutdownList_t **listp);
+void rf_nWayXor1(RF_ReconBuffer_t **src_rbs, RF_ReconBuffer_t *dest_rb, int len);
+void rf_nWayXor2(RF_ReconBuffer_t **src_rbs, RF_ReconBuffer_t *dest_rb, int len);
+void rf_nWayXor3(RF_ReconBuffer_t **src_rbs, RF_ReconBuffer_t *dest_rb, int len);
+void rf_nWayXor4(RF_ReconBuffer_t **src_rbs, RF_ReconBuffer_t *dest_rb, int len);
+void rf_nWayXor5(RF_ReconBuffer_t **src_rbs, RF_ReconBuffer_t *dest_rb, int len);
+void rf_nWayXor6(RF_ReconBuffer_t **src_rbs, RF_ReconBuffer_t *dest_rb, int len);
+void rf_nWayXor7(RF_ReconBuffer_t **src_rbs, RF_ReconBuffer_t *dest_rb, int len);
+void rf_nWayXor8(RF_ReconBuffer_t **src_rbs, RF_ReconBuffer_t *dest_rb, int len);
+void rf_nWayXor9(RF_ReconBuffer_t **src_rbs, RF_ReconBuffer_t *dest_rb, int len);
+
+#endif /* !_RF__RF_NWAYXOR_H_ */
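
A caller with a variable number of source reconstruction buffers would presumably select among these fixed-arity entry points with a switch; the following sketch is an assumption about such a caller, not the reconstruction code's actual call site:

    /*
     * Hypothetical dispatch over the XOR routines declared above.
     * 'nsrc' is the number of source buffers, 'len' the length in
     * machine words.
     */
    static void
    nway_xor_dispatch(src_rbs, nsrc, dest_rb, len)
            RF_ReconBuffer_t **src_rbs;
            int nsrc;
            RF_ReconBuffer_t *dest_rb;
            int len;
    {
            switch (nsrc) {
            case 1: rf_nWayXor1(src_rbs, dest_rb, len); break;
            case 2: rf_nWayXor2(src_rbs, dest_rb, len); break;
            case 3: rf_nWayXor3(src_rbs, dest_rb, len); break;
            case 4: rf_nWayXor4(src_rbs, dest_rb, len); break;
            case 5: rf_nWayXor5(src_rbs, dest_rb, len); break;
            case 6: rf_nWayXor6(src_rbs, dest_rb, len); break;
            case 7: rf_nWayXor7(src_rbs, dest_rb, len); break;
            case 8: rf_nWayXor8(src_rbs, dest_rb, len); break;
            case 9: rf_nWayXor9(src_rbs, dest_rb, len); break;
            default:
                    /* wider fan-ins would be handled in multiple passes */
                    break;
            }
    }
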
diff --git a/sys/dev/raidframe/rf_openbsd.h b/sys/dev/raidframe/rf_openbsd.h
new file mode 100644
index 00000000000..5e34e977c91
--- /dev/null
+++ b/sys/dev/raidframe/rf_openbsd.h
@@ -0,0 +1,94 @@
+/* $OpenBSD: rf_openbsd.h,v 1.1 1999/01/11 14:29:32 niklas Exp $ */
+
+/*-
+ * Copyright (c) 1996, 1997, 1998 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Greg Oster
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the NetBSD
+ * Foundation, Inc. and its contributors.
+ * 4. Neither the name of The NetBSD Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*-
+ * Copyright (c) 1996, 1997, 1998 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Jason R. Thorpe.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the NetBSD
+ * Foundation, Inc. and its contributors.
+ * 4. Neither the name of The NetBSD Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef _RF__RF_OPENBSD_H_
+#define _RF__RF_OPENBSD_H_
+
+#include <sys/fcntl.h>
+#include <sys/systm.h>
+#include <sys/namei.h>
+#include <sys/vnode.h>
+
+struct raidcinfo {
+ struct vnode *ci_vp; /* device's vnode */
+ dev_t ci_dev; /* XXX: device's dev_t */
+#if 0
+ size_t ci_size; /* size */
+ char *ci_path; /* path to component */
+ size_t ci_pathlen; /* length of component path */
+#endif
+};
+
+#endif /* _RF__RF_OPENBSD_H_ */
diff --git a/sys/dev/raidframe/rf_openbsdkintf.c b/sys/dev/raidframe/rf_openbsdkintf.c
new file mode 100644
index 00000000000..55b7cfbcca4
--- /dev/null
+++ b/sys/dev/raidframe/rf_openbsdkintf.c
@@ -0,0 +1,2033 @@
+/* $OpenBSD: rf_openbsdkintf.c,v 1.1 1999/01/11 14:29:32 niklas Exp $ */
+
+/*-
+ * Copyright (c) 1996, 1997, 1998 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Greg Oster; Jason R. Thorpe.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the NetBSD
+ * Foundation, Inc. and its contributors.
+ * 4. Neither the name of The NetBSD Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 1988 University of Utah.
+ * Copyright (c) 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * the Systems Programming Group of the University of Utah Computer
+ * Science Department.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: Utah $Hdr: cd.c 1.6 90/11/28$
+ *
+ * @(#)cd.c 8.2 (Berkeley) 11/16/93
+ */
+
+
+
+
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Authors: Mark Holland, Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/***********************************************************
+ *
+ * rf_kintf.c -- the kernel interface routines for RAIDframe
+ *
+ ***********************************************************/
+/*
+ * :
+ * Log: rf_kintf.c,v
+ * Revision 1.57 1996/07/19 16:12:20 jimz
+ * remove addition of protectedSectors in InitBP- it's already
+ * done in the diskqueue code
+ *
+ * Revision 1.56 1996/07/17 21:00:58 jimz
+ * clean up timer interface, tracing
+ *
+ * Revision 1.55 1996/06/17 03:00:54 jimz
+ * Change RAIDFRAME_GET_INFO interface to do its own copyout()
+ * (because size of device config structure now exceeds 8k)
+ *
+ * Revision 1.54 1996/06/09 02:36:46 jimz
+ * lots of little crufty cleanup- fixup whitespace
+ * issues, comment #ifdefs, improve typing in some
+ * places (esp size-related)
+ *
+ * Revision 1.53 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.52 1996/06/06 17:28:08 jimz
+ * track sector number of last I/O dequeued
+ *
+ * Revision 1.51 1996/06/05 18:06:02 jimz
+ * Major code cleanup. The Great Renaming is now done.
+ * Better modularity. Better typing. Fixed a bunch of
+ * synchronization bugs. Made a lot of global stuff
+ * per-desc or per-array. Removed dead code.
+ *
+ * Revision 1.50 1996/06/03 23:28:26 jimz
+ * more bugfixes
+ * check in tree to sync for IPDS runs with current bugfixes
+ * there still may be a problem with threads in the script test
+ * getting I/Os stuck- not trivially reproducible (runs ~50 times
+ * in a row without getting stuck)
+ *
+ * Revision 1.49 1996/06/02 17:31:48 jimz
+ * Moved a lot of global stuff into array structure, where it belongs.
+ * Fixed up paritylogging, pss modules in this manner. Some general
+ * code cleanup. Removed lots of dead code, some dead files.
+ *
+ * Revision 1.48 1996/05/31 22:26:54 jimz
+ * fix a lot of mapping problems, memory allocation problems
+ * found some weird lock issues, fixed 'em
+ * more code cleanup
+ *
+ * Revision 1.47 1996/05/30 12:59:18 jimz
+ * make etimer happier, more portable
+ *
+ * Revision 1.46 1996/05/30 11:29:41 jimz
+ * Numerous bug fixes. Stripe lock release code disagreed with the taking code
+ * about when stripes should be locked (I made it consistent: no parity, no lock)
+ * There was a lot of extra serialization of I/Os which I've removed- a lot of
+ * it was to calculate values for the cache code, which is no longer with us.
+ * More types, function, macro cleanup. Added code to properly quiesce the array
+ * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
+ * before. Fixed memory allocation, freeing bugs.
+ *
+ * Revision 1.45 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.44 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.43 1996/05/24 01:59:45 jimz
+ * another checkpoint in code cleanup for release
+ * time to sync kernel tree
+ *
+ * Revision 1.42 1996/05/23 22:17:54 jimz
+ * fix sector size hardcoding problems
+ *
+ * Revision 1.41 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.40 1996/05/23 13:18:07 jimz
+ * tracing_mutex -> rf_tracing_mutex
+ *
+ * Revision 1.39 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.38 1996/05/20 16:15:32 jimz
+ * switch to rf_{mutex,cond}_{init,destroy}
+ *
+ * Revision 1.37 1996/05/10 16:23:47 jimz
+ * RF_offset -> RF_Offset
+ *
+ * Revision 1.36 1996/05/08 21:01:24 jimz
+ * fixed up enum type names that were conflicting with other
+ * enums and function names (ie, "panic")
+ * future naming trends will be towards RF_ and rf_ for
+ * everything raidframe-related
+ *
+ * Revision 1.35 1996/05/03 19:10:48 jimz
+ * change sanity checking for bogus I/Os to return more appropriate
+ * values (to make some user-level utilities happier with RAIDframe)
+ *
+ * Revision 1.34 1996/05/02 22:17:00 jimz
+ * When using DKUSAGE, send a bogus IO after configuring to let DKUSAGE know
+ * that we exist. This will let user-level programs doing group stats on the
+ * RF device function without error before RF gets its first IO
+ *
+ * Changed rf_device_config devs and spares fields to RF_RaidDisk_t
+ *
+ * Inc numOutstanding for the disk queue in rf_DispatchKernelIO if
+ * type is IO_TYPE_NOP. I'm not sure this is right, but it seems to be,
+ * because the disk IO completion routine wants to dec it, and doesn't
+ * care if there was no such IO.
+ *
+ * Revision 1.33 1996/05/02 15:05:44 jimz
+ * for now, rf_DoAccessKernel will reject non-sector-sized I/Os
+ * eventually, it should do something more clever...
+ * (and do it in DoAccess(), not just DoAccessKernel())
+ *
+ * Revision 1.32 1996/05/01 16:28:39 jimz
+ * get rid of uses of ccmn_ functions
+ *
+ * Revision 1.31 1996/05/01 15:42:17 jimz
+ * ccmn_* memory management is on the way out. This is an archival checkpoint-
+ * both the old and new code are in place (all the ccmn_ calls are #if 0). After
+ * this, the ccmn_ code will no longer appear.
+ *
+ * Revision 1.30 1996/04/22 15:53:13 jimz
+ * MAX_RAIDS -> NRAIDFRAME
+ *
+ * Revision 1.29 1995/12/12 18:10:06 jimz
+ * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT
+ * fix 80-column brain damage in comments
+ *
+ * Revision 1.28 1995/12/01 19:11:01 root
+ * added copyright info
+ *
+ * Revision 1.27 1995/11/28 18:56:40 wvcii
+ * disabled buffer copy in rf_write
+ *
+ * Revision 1.26 1995/10/06 16:37:08 jimz
+ * get struct bufs from ubc, not cam
+ * copy all write data, and operate on copy
+ * (temporary hack to get around dags in PQ that want
+ * to Xor into user write buffers)
+ *
+ * Revision 1.25 1995/09/30 22:23:08 jimz
+ * do not require raid to be active to perform ACCTOTAL ioctl
+ *
+ * Revision 1.24 1995/09/30 20:39:08 jimz
+ * added new ioctls:
+ * RAIDFRAME_RESET_ACCTOTALS
+ * RAIDFRAME_GET_ACCTOTALS
+ * RAIDFRAME_KEEP_ACCTOTALS
+ *
+ * Revision 1.23 1995/09/20 21:11:59 jimz
+ * include dfstrace.h in KERNEL block
+ * (even though it's a kernel-only file, this makes the depend process
+ * at user-level happy. Why the user-level Makefile wants to depend
+ * kintf.c is less clear, but this is a workaround).
+ *
+ * Revision 1.22 1995/09/19 23:19:03 jimz
+ * added DKUSAGE support
+ *
+ */
+
+#ifdef _KERNEL
+#define KERNEL
+#endif
+
+#ifdef KERNEL
+
+#include <sys/errno.h>
+
+#include "raid.h"
+#include <sys/param.h>
+#include <sys/malloc.h>
+#include <sys/queue.h>
+#include <sys/disk.h>
+#include <sys/device.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <sys/fcntl.h>
+#include <sys/systm.h>
+#include <sys/namei.h>
+#include <sys/conf.h>
+#include <sys/lock.h>
+#include <sys/buf.h>
+#include <sys/user.h>
+
+#include "rf_raid.h"
+#include "rf_raidframe.h"
+#include "rf_dag.h"
+#include "rf_dagflags.h"
+#include "rf_diskqueue.h"
+#include "rf_acctrace.h"
+#include "rf_etimer.h"
+#include "rf_general.h"
+#include "rf_debugMem.h"
+#include "rf_kintf.h"
+#include "rf_options.h"
+#include "rf_driver.h"
+#include "rf_parityscan.h"
+#include "rf_debugprint.h"
+#include "rf_threadstuff.h"
+
+int rf_kdebug_level = 0;
+
+#define RFK_BOOT_NONE 0
+#define RFK_BOOT_GOOD 1
+#define RFK_BOOT_BAD 2
+static int rf_kbooted = RFK_BOOT_NONE;
+
+#ifdef RAIDDEBUG
+#define db0_printf(a) printf a
+#define db_printf(a) do if (rf_kdebug_level > 0) printf a; while(0)
+#define db1_printf(a) do if (rf_kdebug_level > 0) printf a; while(0)
+#define db2_printf(a) do if (rf_kdebug_level > 1) printf a; while(0)
+#define db3_printf(a) do if (rf_kdebug_level > 2) printf a; while(0)
+#define db4_printf(a) do if (rf_kdebug_level > 3) printf a; while(0)
+#define db5_printf(a) do if (rf_kdebug_level > 4) printf a; while(0)
+#else /* RAIDDEBUG */
+#define db0_printf(a) printf a
+#define db1_printf(a) (void)0
+#define db2_printf(a) (void)0
+#define db3_printf(a) (void)0
+#define db4_printf(a) (void)0
+#define db5_printf(a) (void)0
+#endif /* RAIDDEBUG */
+
+static RF_Raid_t **raidPtrs; /* global raid device descriptors */
+
+static int rf_pending_testaccs;
+
+RF_DECLARE_STATIC_MUTEX(rf_sparet_wait_mutex)
+RF_DECLARE_STATIC_MUTEX(rf_async_done_q_mutex)
+
+/* requests to install a spare table */
+static RF_SparetWait_t *rf_sparet_wait_queue;
+
+/* responses from installation process */
+static RF_SparetWait_t *rf_sparet_resp_queue;
+static struct rf_test_acc *rf_async_done_qh, *rf_async_done_qt;
+
+/* used to communicate reconstruction requests */
+static struct rf_recon_req *recon_queue = NULL;
+
+decl_simple_lock_data(,recon_queue_mutex)
+
+#define LOCK_RECON_Q_MUTEX() simple_lock(&recon_queue_mutex)
+#define UNLOCK_RECON_Q_MUTEX() simple_unlock(&recon_queue_mutex)
+
+/* prototypes */
+void rf_KernelWakeupFunc __P((struct buf *));
+void rf_InitBP __P((struct buf *, struct vnode *, unsigned, dev_t,
+ RF_SectorNum_t, RF_SectorCount_t, caddr_t, void (*)(struct buf *),
+ void *, int, struct proc *));
+
+/* this is so that we can compile under 2.0 as well as 3.2 */
+#ifndef proc_to_task
+#define proc_to_task(x) ((x)->task)
+#endif /* !proc_to_task */
+
+void raidattach __P((int));
+int raidsize __P((dev_t));
+
+void rf_DiskIOComplete(RF_DiskQueue_t *, RF_DiskQueueData_t *, int);
+void rf_CopybackReconstructedData(RF_Raid_t *raidPtr);
+int raidinit __P((dev_t,RF_Raid_t *,int));
+
+int raidopen __P((dev_t, int, int, struct proc *));
+int raidclose __P((dev_t, int, int, struct proc *));
+int raidioctl __P((dev_t, u_long, caddr_t, int, struct proc *));
+int raidwrite __P((dev_t, struct uio *, int));
+int raidread __P((dev_t, struct uio *, int));
+void raidstrategy __P((struct buf *));
+int raiddump __P((dev_t, daddr_t, caddr_t, size_t));
+
+/*
+ * Pilfered from ccd.c
+ */
+struct raidbuf {
+ struct buf rf_buf; /* new I/O buf. MUST BE FIRST!!! */
+ struct buf *rf_obp; /* ptr. to original I/O buf */
+ int rf_flags; /* misc. flags */
+ RF_DiskQueueData_t *req; /* the request that this was part of.. */
+};
+
+#define RAIDGETBUF() malloc(sizeof (struct raidbuf), M_DEVBUF, M_NOWAIT)
+#define RAIDPUTBUF(buf) free(buf, M_DEVBUF)
+
+/*
+ * XXX Not sure if the following should be replacing the raidPtrs above,
+ * or if it should be used in conjunction with that...
+ */
+struct raid_softc {
+ int sc_unit; /* logical unit number */
+ int sc_flags; /* flags */
+ int sc_cflags; /* configuration flags */
+ size_t sc_size; /* size of the raid device */
+ dev_t sc_dev; /* our device..*/
+ char sc_xname[20]; /* XXX external name */
+ struct disk sc_dkdev; /* generic disk device info */
+};
+
+/* sc_flags */
+#define RAIDF_INITED 0x01 /* unit has been initialized */
+#define RAIDF_WLABEL 0x02 /* label area is writable */
+#define RAIDF_LABELLING 0x04 /* unit is currently being labelled */
+#define RAIDF_WANTED 0x40 /* someone is waiting to obtain a lock */
+#define RAIDF_LOCKED 0x80 /* unit is locked */
+
+#define raidunit(x) DISKUNIT(x)
+static int numraid=0;
+
+#define RAIDLABELDEV(dev) \
+ (MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART))
+
+/* declared here, and made public, for the benefit of KVM stuff.. */
+struct raid_softc *raid_softc;
+
+void raidgetdefaultlabel
+ __P((RF_Raid_t *, struct raid_softc *, struct disklabel *));
+void raidgetdisklabel __P((dev_t));
+void raidmakedisklabel __P((struct raid_softc *));
+
+int raidlock __P((struct raid_softc *));
+void raidunlock __P((struct raid_softc *));
+int raidlookup __P((char *, struct proc *p, struct vnode **));
+
+
+void
+raidattach(num)
+ int num;
+{
+ int raidID;
+
+ db1_printf(("raidattach: Asked for %d units\n", num));
+
+ if (num <= 0) {
+#ifdef DIAGNOSTIC
+ panic("raidattach: count <= 0");
+#endif
+ return;
+ }
+
+ /*
+ * This is where all the initialization stuff gets done.
+ */
+
+ /* Make some space for requested number of units... */
+ RF_Calloc(raidPtrs, num, sizeof(RF_Raid_t *), (RF_Raid_t **));
+ if (raidPtrs == NULL) {
+ panic("raidPtrs is NULL!!\n");
+ }
+
+ rf_kbooted = rf_boot();
+ if (rf_kbooted) {
+ panic("Serious error booting RAID!!\n");
+ }
+
+ rf_kbooted = RFK_BOOT_GOOD;
+
+ /*
+ Put together some data structures, like the CCD device does.
+ This lets us lock the device and what-not when it gets opened.
+ */
+
+ raid_softc = (struct raid_softc *)
+ malloc(num * sizeof(struct raid_softc),
+ M_DEVBUF, M_NOWAIT);
+ if (raid_softc == NULL) {
+ printf("WARNING: no memory for RAIDframe driver\n");
+ return;
+ }
+ numraid = num;
+ bzero(raid_softc, num * sizeof(struct raid_softc));
+
+ for(raidID=0;raidID < num;raidID++) {
+ RF_Calloc(raidPtrs[raidID], 1, sizeof(RF_Raid_t),
+ (RF_Raid_t *));
+ if (raidPtrs[raidID]==NULL) {
+ printf("raidPtrs[%d] is NULL\n",raidID);
+ }
+ }
+}
+
+int
+raidsize(dev)
+ dev_t dev;
+{
+ struct raid_softc *rs;
+ struct disklabel *lp;
+ int part, unit, omask, size;
+
+ unit = raidunit(dev);
+ if (unit >= numraid)
+ return (-1);
+ rs = &raid_softc[unit];
+
+ if ((rs->sc_flags & RAIDF_INITED) == 0)
+ return (-1);
+
+ part = DISKPART(dev);
+ omask = rs->sc_dkdev.dk_openmask & (1 << part);
+ lp = rs->sc_dkdev.dk_label;
+
+ if (omask == 0 && raidopen(dev, 0, S_IFBLK, curproc))
+ return (-1);
+
+ if (lp->d_partitions[part].p_fstype != FS_SWAP)
+ size = -1;
+ else
+ size = lp->d_partitions[part].p_size *
+ (lp->d_secsize / DEV_BSIZE);
+
+ if (omask == 0 && raidclose(dev, 0, S_IFBLK, curproc))
+ return (-1);
+
+ return (size);
+
+}
+
+int
+raiddump(dev, blkno, va, size)
+ dev_t dev;
+ daddr_t blkno;
+ caddr_t va;
+ size_t size;
+{
+ /* Not implemented. */
+ return (ENXIO);
+}
+
+/* ARGSUSED */
+int
+raidopen(dev, flags, fmt, p)
+ dev_t dev;
+ int flags, fmt;
+ struct proc *p;
+{
+ int unit = raidunit(dev);
+ struct raid_softc *rs;
+ struct disklabel *lp;
+ int part,pmask;
+ unsigned int raidID;
+ int rc;
+ int error = 0;
+
+ /*
+ * XXX This whole next chunk of code is somewhat suspect... Not sure
+ * it's needed here at all.
+ */
+ if (rf_kbooted == RFK_BOOT_NONE) {
+ printf("Doing restart on raidopen.\n");
+ rf_kbooted = RFK_BOOT_GOOD;
+ rc = rf_boot();
+ if (rc) {
+ rf_kbooted = RFK_BOOT_BAD;
+ printf("Someone is unhappy...\n");
+ return (rc);
+ }
+ }
+
+ if (unit >= numraid)
+ return (ENXIO);
+ rs = &raid_softc[unit];
+
+ if ((error = raidlock(rs)) != 0)
+ return (error);
+ lp = rs->sc_dkdev.dk_label;
+
+ raidID = raidunit(dev);
+
+ part = DISKPART(dev);
+ pmask = (1 << part);
+
+ db1_printf(
+ ("Opening raid device number: %d partition: %d\n", raidID, part));
+
+
+ if ((rs->sc_flags & RAIDF_INITED) && (rs->sc_dkdev.dk_openmask == 0))
+ raidgetdisklabel(dev);
+
+ /* make sure that this partition exists */
+
+ if (part != RAW_PART) {
+ db1_printf(("Not a raw partition..\n"));
+ if (((rs->sc_flags & RAIDF_INITED) == 0) ||
+ ((part >= lp->d_npartitions) ||
+ (lp->d_partitions[part].p_fstype == FS_UNUSED))) {
+ error = ENXIO;
+ raidunlock(rs);
+ db1_printf(("Bailing out...\n"));
+ return (error);
+ }
+ }
+
+ /* Prevent this unit from being unconfigured while open. */
+ switch (fmt) {
+ case S_IFCHR:
+ rs->sc_dkdev.dk_copenmask |= pmask;
+ break;
+
+ case S_IFBLK:
+ rs->sc_dkdev.dk_bopenmask |= pmask;
+ break;
+ }
+ rs->sc_dkdev.dk_openmask =
+ rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
+
+ raidunlock(rs);
+
+ return (error);
+}
+
+/* ARGSUSED */
+int
+raidclose(dev, flags, fmt, p)
+ dev_t dev;
+ int flags, fmt;
+ struct proc *p;
+{
+ int unit = raidunit(dev);
+ struct raid_softc *rs;
+ int error = 0;
+ int part;
+
+ if (unit >= numraid)
+ return (ENXIO);
+ rs = &raid_softc[unit];
+
+ if ((error = raidlock(rs)) != 0)
+ return (error);
+
+ part = DISKPART(dev);
+
+ /* ...that much closer to allowing unconfiguration... */
+ switch (fmt) {
+ case S_IFCHR:
+ rs->sc_dkdev.dk_copenmask &= ~(1 << part);
+ break;
+
+ case S_IFBLK:
+ rs->sc_dkdev.dk_bopenmask &= ~(1 << part);
+ break;
+ }
+ rs->sc_dkdev.dk_openmask =
+ rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask;
+
+ raidunlock(rs);
+ return (0);
+}
+
+void
+raidstrategy(bp)
+ struct buf *bp;
+{
+ int s;
+
+ unsigned int raidID = raidunit(bp->b_dev);
+ RF_Raid_t *raidPtr;
+ struct raid_softc *rs = &raid_softc[raidID];
+ struct disklabel *lp;
+ int wlabel;
+
+ db1_printf(("Strategy: 0x%x 0x%x\n", bp, bp->b_data));
+ db1_printf(("Strategy(2): bp->b_bufsize %d\n", (int)bp->b_bufsize));
+ db1_printf(("bp->b_count=%d\n", (int)bp->b_bcount));
+ db1_printf(("bp->b_resid=%d\n", (int)bp->b_resid));
+ db1_printf(("bp->b_blkno=%d\n", (int)bp->b_blkno));
+
+ if (bp->b_flags & B_READ)
+ db1_printf(("READ\n"));
+ else
+ db1_printf(("WRITE\n"));
+
+ if (rf_kbooted != RFK_BOOT_GOOD)
+ return;
+ if (raidID >= numraid || !raidPtrs[raidID]) {
+ bp->b_error = ENODEV;
+ bp->b_flags |= B_ERROR;
+ bp->b_resid = bp->b_bcount;
+ biodone(bp);
+ return;
+ }
+ raidPtr = raidPtrs[raidID];
+ if (!raidPtr->valid) {
+ bp->b_error = ENODEV;
+ bp->b_flags |= B_ERROR;
+ bp->b_resid = bp->b_bcount;
+ biodone(bp);
+ return;
+ }
+ if (bp->b_bcount == 0) {
+ db1_printf(("b_bcount is zero..\n"));
+ biodone(bp);
+ return;
+ }
+ lp = rs->sc_dkdev.dk_label;
+
+ /*
+ * Do bounds checking and adjust transfer. If there's an
+ * error, the bounds check will flag that for us.
+ */
+ wlabel = rs->sc_flags & (RAIDF_WLABEL|RAIDF_LABELLING);
+ if (DISKPART(bp->b_dev) != RAW_PART)
+ if (bounds_check_with_label(bp, lp, rs->sc_dkdev.dk_cpulabel,
+ wlabel) <= 0) {
+ db1_printf(("Bounds check failed!!:%d %d\n",
+ (int)bp->b_blkno, (int)wlabel));
+ biodone(bp);
+ return;
+ }
+
+ /* XXX splbio() needed? */
+ s = splbio();
+ db1_printf(("Beginning strategy...\n"));
+
+ bp->b_resid = 0;
+ bp->b_error =
+ rf_DoAccessKernel(raidPtrs[raidID], bp, NULL, NULL, NULL);
+ if (bp->b_error) {
+ bp->b_flags |= B_ERROR;
+ db1_printf(
+ ("bp->b_flags HAS B_ERROR SET!!!: %d\n", bp->b_error));
+ }
+ splx(s);
+ db1_printf(("Strategy exiting: 0x%x 0x%x %d %d\n", bp, bp->b_data,
+ (int)bp->b_bcount, (int)bp->b_resid));
+}
+
+/* ARGSUSED */
+int
+raidread(dev, uio, flags)
+ dev_t dev;
+ struct uio *uio;
+ int flags;
+{
+ int unit = raidunit(dev);
+ struct raid_softc *rs;
+ int result;
+ int part;
+
+ if (unit >= numraid)
+ return (ENXIO);
+ rs = &raid_softc[unit];
+
+ if ((rs->sc_flags & RAIDF_INITED) == 0)
+ return (ENXIO);
+ part = DISKPART(dev);
+
+ db1_printf(("raidread: unit: %d partition: %d\n", unit, part));
+
+#if 0
+ return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio));
+#endif
+ result=physio(raidstrategy, NULL, dev, B_READ, minphys, uio);
+ db1_printf(("raidread done. Result is %d %d\n", result,
+ uio->uio_resid));
+ return (result);
+}
+
+/* ARGSUSED */
+int
+raidwrite(dev, uio, flags)
+ dev_t dev;
+ struct uio *uio;
+ int flags;
+{
+ int unit = raidunit(dev);
+ struct raid_softc *rs;
+
+ if (unit >= numraid)
+ return (ENXIO);
+ rs = &raid_softc[unit];
+
+ if ((rs->sc_flags & RAIDF_INITED) == 0)
+ return (ENXIO);
+ db1_printf(("raidwrite\n"));
+ return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio));
+}
+
+int
+raidioctl(dev, cmd, data, flag, p)
+ dev_t dev;
+ u_long cmd;
+ caddr_t data;
+ int flag;
+ struct proc *p;
+{
+ int unit = raidunit(dev);
+ int error = 0;
+ int part, pmask;
+ struct raid_softc *rs;
+#if 0
+ int r, c;
+#endif
+ /* struct raid_ioctl *ccio = (struct ccd_ioctl *)data; */
+
+ /* struct ccdbuf *cbp; */
+ /* struct raidbuf *raidbp; */
+ RF_Config_t *k_cfg, *u_cfg;
+ u_char *specific_buf;
+ int retcode = 0;
+
+ int row;
+ struct rf_recon_req *rrcopy, *rr;
+#if 0
+ int nbytes, spl, rw, row;
+ struct rf_test_acc *ta;
+ struct buf *bp;
+ RF_SparetWait_t *waitreq;
+ struct rf_test_acc *ta_p, *ta_copy;
+#endif
+
+ if (unit >= numraid)
+ return (ENXIO);
+ rs = &raid_softc[unit];
+
+ db1_printf(("raidioctl: %d %d %d %d\n", (int)dev, (int)DISKPART(dev),
+ (int)unit, (int)cmd));
+
+ /* Must be open for writes for these commands... */
+ switch (cmd) {
+ case DIOCSDINFO:
+ case DIOCWDINFO:
+ case DIOCWLABEL:
+ if ((flag & FWRITE) == 0)
+ return (EBADF);
+ }
+
+ /* Must be initialized for these... */
+ switch (cmd) {
+ case DIOCGDINFO:
+ case DIOCSDINFO:
+ case DIOCWDINFO:
+ case DIOCGPART:
+ case DIOCWLABEL:
+ case RAIDFRAME_SHUTDOWN:
+ case RAIDFRAME_REWRITEPARITY:
+ case RAIDFRAME_GET_INFO:
+ case RAIDFRAME_RESET_ACCTOTALS:
+ case RAIDFRAME_GET_ACCTOTALS:
+ case RAIDFRAME_KEEP_ACCTOTALS:
+ case RAIDFRAME_GET_SIZE:
+ case RAIDFRAME_FAIL_DISK:
+ case RAIDFRAME_COPYBACK:
+ case RAIDFRAME_CHECKRECON:
+ if ((rs->sc_flags & RAIDF_INITED) == 0)
+ return (ENXIO);
+ }
+
+ switch (cmd) {
+ case RAIDFRAME_CONFIGURE:
+ /* Configure the system */
+
+ db3_printf(("rf_ioctl: RAIDFRAME_CONFIGURE\n"));
+
+ /*
+ * Copy-in the configuration information
+ * data points to a pointer to the configuration structure.
+ */
+ u_cfg = *((RF_Config_t **)data);
+ RF_Malloc(k_cfg, sizeof (RF_Config_t), (RF_Config_t *));
+ if (k_cfg == NULL) {
+ db3_printf((
+ "rf_ioctl: ENOMEM for config. Code is %d\n",
+ retcode));
+ return (ENOMEM);
+ }
+ retcode = copyin((caddr_t)u_cfg, (caddr_t)k_cfg,
+ sizeof (RF_Config_t));
+ if (retcode) {
+ db3_printf(("rf_ioctl: retcode=%d copyin.1\n",
+ retcode));
+ return (retcode);
+ }
+
+ /*
+ * Allocate a buffer for the layout-specific data,
+ * and copy it in.
+ */
+ if (k_cfg->layoutSpecificSize) {
+ if (k_cfg->layoutSpecificSize > 10000) {
+ /* sanity check */
+ db3_printf(("rf_ioctl: EINVAL %d\n", retcode));
+ return (EINVAL);
+ }
+ RF_Malloc(specific_buf, k_cfg->layoutSpecificSize,
+ (u_char *));
+ if (specific_buf == NULL) {
+ RF_Free(k_cfg, sizeof (RF_Config_t));
+ db3_printf(("rf_ioctl: ENOMEM %d\n", retcode));
+ return (ENOMEM);
+ }
+ retcode = copyin(k_cfg->layoutSpecific,
+ (caddr_t)specific_buf, k_cfg->layoutSpecificSize);
+ if (retcode) {
+ db3_printf(("rf_ioctl: retcode=%d copyin.2\n",
+ retcode));
+ return (retcode);
+ }
+ } else
+ specific_buf = NULL;
+ k_cfg->layoutSpecific = specific_buf;
+
+ /*
+ * We should do some kind of sanity check on the
+ * configuration.
+ * Store the sum of all the bytes in the last byte?
+ */
+
+ db1_printf(("Considering configuring the system.:%d 0x%x\n",
+ unit, p));
+
+ /*
+ * We need the pointer to this a little deeper,
+ * so stash it here...
+ */
+ raidPtrs[unit]->proc = p;
+
+ /* configure the system */
+ rf_pending_testaccs = 0;
+
+ raidPtrs[unit]->raidid = unit;
+ retcode = rf_Configure(raidPtrs[unit], k_cfg);
+
+ if (retcode == 0) {
+ retcode = raidinit(dev, raidPtrs[unit],unit);
+ }
+
+ /* Free the buffers. No return code here. */
+ if (k_cfg->layoutSpecificSize) {
+ RF_Free(specific_buf, k_cfg->layoutSpecificSize);
+ }
+ RF_Free(k_cfg, sizeof (RF_Config_t));
+
+ db3_printf(("rf_ioctl: retcode=%d RAIDFRAME_CONFIGURE\n",
+ retcode));
+ return (retcode);
+
+ case RAIDFRAME_SHUTDOWN:
+ /* Shutdown the system */
+
+ if ((error = raidlock(rs)) != 0)
+ return (error);
+
+ /*
+ * If somebody has a partition mounted, we shouldn't
+ * shutdown.
+ */
+
+ part = DISKPART(dev);
+ pmask = (1 << part);
+ if ((rs->sc_dkdev.dk_openmask & ~pmask) ||
+ ((rs->sc_dkdev.dk_bopenmask & pmask) &&
+ (rs->sc_dkdev.dk_copenmask & pmask))) {
+ raidunlock(rs);
+ return (EBUSY);
+ }
+
+ /*
+ * The intention here was to disallow shutdowns while
+ * raidframe is mounted, but it doesn't work because the
+ * shutdown ioctl calls rf_open.
+ */
+ if (rf_pending_testaccs > 0) {
+ printf("RAIDFRAME: Can't shutdown because there are "
+ "%d pending test accs\n",
+ rf_pending_testaccs);
+ return (EINVAL);
+ }
+ if (rf_debugKernelAccess) {
+ printf("call shutdown\n");
+ }
+ raidPtrs[unit]->proc = p; /* XXX Necessary evil */
+ retcode = rf_Shutdown(raidPtrs[unit]);
+
+ db1_printf(("Done main shutdown\n"));
+
+ /* It's no longer initialized... */
+ rs->sc_flags &= ~RAIDF_INITED;
+
+ /* Detach the disk. */
+ disk_detach(&rs->sc_dkdev);
+
+ raidunlock(rs);
+
+ return (retcode);
+
+ case RAIDFRAME_REWRITEPARITY:
+ /* initialize all parity */
+
+ if (raidPtrs[unit]->Layout.map->faultsTolerated == 0)
+ return (EINVAL);
+ /* borrow the thread of the requesting process */
+ raidPtrs[unit]->proc = p; /* Blah... :-p GO */
+ retcode = rf_RewriteParity(raidPtrs[unit]);
+ /* return I/O Error if the parity rewrite fails */
+
+ if (retcode)
+ retcode = EIO;
+ return (retcode);
+
+#if 0 /* XXX not supported yet (ever?) */
+ case RAIDFRAME_TUR:
+ /*
+ * Issue a test-unit-ready through raidframe to the
+ * indicated device.
+ */
+
+ /* debug only */
+ retcode = rf_SCSI_DoTUR(0, 0, 0, 0, *(dev_t *)data);
+ return (retcode);
+#endif
+
+ case RAIDFRAME_GET_INFO:
+ {
+ RF_Raid_t *raid = raidPtrs[unit];
+ RF_DeviceConfig_t *cfg, **ucfgp;
+ int i, j, d;
+
+ if (!raid->valid)
+ return (ENODEV);
+ ucfgp = (RF_DeviceConfig_t **)data;
+ RF_Malloc(cfg,sizeof(RF_DeviceConfig_t),
+ (RF_DeviceConfig_t *));
+ if (cfg == NULL)
+ return (ENOMEM);
+ bzero((char *)cfg, sizeof(RF_DeviceConfig_t));
+ cfg->rows = raid->numRow;
+ cfg->cols = raid->numCol;
+ cfg->ndevs = raid->numRow * raid->numCol;
+ if (cfg->ndevs >= RF_MAX_DISKS) {
+ cfg->ndevs = 0;
+ return (ENOMEM);
+ }
+ cfg->nspares = raid->numSpare;
+ if (cfg->nspares >= RF_MAX_DISKS) {
+ cfg->nspares = 0;
+ return (ENOMEM);
+ }
+ cfg->maxqdepth = raid->maxQueueDepth;
+ d = 0;
+ for(i=0;i<cfg->rows;i++) {
+ for(j=0;j<cfg->cols;j++) {
+ cfg->devs[d] = raid->Disks[i][j];
+ d++;
+ }
+ }
+ for(j=cfg->cols,i=0;i<cfg->nspares;i++,j++) {
+ cfg->spares[i] = raid->Disks[0][j];
+ }
+ retcode = copyout((caddr_t)cfg, (caddr_t)*ucfgp,
+ sizeof(RF_DeviceConfig_t));
+ RF_Free(cfg,sizeof(RF_DeviceConfig_t));
+
+ return (retcode);
+ }
+ break;
+
+ case RAIDFRAME_RESET_ACCTOTALS:
+ {
+ RF_Raid_t *raid = raidPtrs[unit];
+
+ bzero(&raid->acc_totals, sizeof(raid->acc_totals));
+ return (0);
+ }
+ break;
+
+ case RAIDFRAME_GET_ACCTOTALS:
+ {
+ RF_AccTotals_t *totals = (RF_AccTotals_t *)data;
+ RF_Raid_t *raid = raidPtrs[unit];
+
+ *totals = raid->acc_totals;
+ return (0);
+ }
+ break;
+
+ case RAIDFRAME_KEEP_ACCTOTALS:
+ {
+ RF_Raid_t *raid = raidPtrs[unit];
+ int *keep = (int *)data;
+
+ raid->keep_acc_totals = *keep;
+ return (0);
+ }
+ break;
+
+ case RAIDFRAME_GET_SIZE:
+ *(int *) data = raidPtrs[unit]->totalSectors;
+ return (0);
+
+#define RAIDFRAME_RECON 1
+ /* XXX The above should probably be set somewhere else!! GO */
+#if RAIDFRAME_RECON > 0
+
+ /* fail a disk & optionally start reconstruction */
+ case RAIDFRAME_FAIL_DISK:
+ rr = (struct rf_recon_req *) data;
+
+ if (rr->row < 0 || rr->row >= raidPtrs[unit]->numRow
+ || rr->col < 0 || rr->col >= raidPtrs[unit]->numCol)
+ return (EINVAL);
+
+ printf("Failing the disk: row: %d col: %d\n",rr->row,rr->col);
+
+ /*
+ * Make a copy of the recon request so that we don't
+ * rely on the user's buffer
+ */
+ RF_Malloc(rrcopy, sizeof(*rrcopy), (struct rf_recon_req *));
+ bcopy(rr, rrcopy, sizeof(*rr));
+ rrcopy->raidPtr = (void *) raidPtrs[unit];
+
+ LOCK_RECON_Q_MUTEX();
+ rrcopy->next = recon_queue;
+ recon_queue = rrcopy;
+ wakeup(&recon_queue);
+ UNLOCK_RECON_Q_MUTEX();
+
+ return (0);
+
+ /*
+ * Invoke a copyback operation after recon on whatever
+ * disk needs it, if any.
+ */
+ case RAIDFRAME_COPYBACK:
+ /* Borrow the current thread to get this done */
+ raidPtrs[unit]->proc = p; /* ICK.. but needed :-p GO */
+ rf_CopybackReconstructedData(raidPtrs[unit]);
+ return (0);
+
+ /* Return the percentage completion of reconstruction */
+ case RAIDFRAME_CHECKRECON:
+ row = *(int *)data;
+ if (row < 0 || row >= raidPtrs[unit]->numRow)
+ return (EINVAL);
+ if (raidPtrs[unit]->status[row] != rf_rs_reconstructing)
+ *(int *)data = 100;
+ else
+ *(int *)data =
+ raidPtrs[unit]->reconControl[row]->percentComplete;
+ return (0);
+
+#if 0
+ case RAIDFRAME_SPARET_WAIT:
+ /*
+ * The sparetable daemon calls this to wait for the
+ * kernel to need a spare table.
+ * This ioctl does not return until a spare table is needed.
+ * XXX -- Calling mpsleep here in the ioctl code is almost
+ * certainly wrong and evil. -- XXX
+ * XXX -- I should either compute the spare table in the
+ * kernel, or have a different interface (a different
+ * character device) for delivering the table. -- XXX
+ */
+ RF_LOCK_MUTEX(rf_sparet_wait_mutex);
+ while (!rf_sparet_wait_queue)
+ mpsleep(&rf_sparet_wait_queue, (PZERO+1)|PCATCH,
+ "sparet wait", 0,
+ (void *)simple_lock_addr(rf_sparet_wait_mutex),
+ MS_LOCK_SIMPLE);
+ waitreq = rf_sparet_wait_queue;
+ rf_sparet_wait_queue = rf_sparet_wait_queue->next;
+ RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
+
+ *((RF_SparetWait_t *)data) = *waitreq;
+
+ RF_Free(waitreq, sizeof *waitreq);
+ return (0);
+
+ case RAIDFRAME_ABORT_SPARET_WAIT:
+ /*
+ * Wakes up a process waiting on SPARET_WAIT and puts an
+ * error code in it that will cause the daemon to exit.
+ */
+ RF_Malloc(waitreq, sizeof(*waitreq), (RF_SparetWait_t *));
+ waitreq->fcol = -1;
+ RF_LOCK_MUTEX(rf_sparet_wait_mutex);
+ waitreq->next = rf_sparet_wait_queue;
+ rf_sparet_wait_queue = waitreq;
+ RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
+ wakeup(&rf_sparet_wait_queue);
+ return (0);
+
+ case RAIDFRAME_SEND_SPARET:
+ /*
+ * Used by the spare table daemon to deliver a spare table
+ * into the kernel
+ */
+
+ /* Install the spare table */
+ retcode = rf_SetSpareTable(raidPtrs[unit],*(void **) data);
+
+ /*
+ * Respond to the requestor. The return status of the
+ * spare table installation is passed in the "fcol" field.
+ */
+ RF_Malloc(waitreq, sizeof *waitreq, (RF_SparetWait_t *));
+ waitreq->fcol = retcode;
+ RF_LOCK_MUTEX(rf_sparet_wait_mutex);
+ waitreq->next = rf_sparet_resp_queue;
+ rf_sparet_resp_queue = waitreq;
+ wakeup(&rf_sparet_resp_queue);
+ RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
+
+ return (retcode);
+#endif
+#endif /* RAIDFRAME_RECON > 0 */
+
+ default:
+ /* fall through to the os-specific code below */
+ break;
+ }
+
+ if (!raidPtrs[unit]->valid)
+ return (EINVAL);
+
+ /*
+ * Add support for "regular" device ioctls here.
+ */
+ switch (cmd) {
+ case DIOCGDINFO:
+ db1_printf(
+ ("DIOCGDINFO %d %d\n", (int)dev, (int)DISKPART(dev)));
+ *(struct disklabel *)data = *(rs->sc_dkdev.dk_label);
+ break;
+
+ case DIOCGPART:
+ db1_printf(
+ ("DIOCGPART: %d %d\n", (int)dev, (int)DISKPART(dev)));
+ ((struct partinfo *)data)->disklab = rs->sc_dkdev.dk_label;
+ ((struct partinfo *)data)->part =
+ &rs->sc_dkdev.dk_label->d_partitions[DISKPART(dev)];
+ break;
+
+ case DIOCWDINFO:
+ db1_printf(("DIOCWDINFO\n"));
+ case DIOCSDINFO:
+ db1_printf(("DIOCSDINFO\n"));
+ if ((error = raidlock(rs)) != 0)
+ return (error);
+
+ rs->sc_flags |= RAIDF_LABELLING;
+
+ error = setdisklabel(rs->sc_dkdev.dk_label,
+ (struct disklabel *)data, 0, rs->sc_dkdev.dk_cpulabel);
+ if (error == 0) {
+ if (cmd == DIOCWDINFO)
+ error = writedisklabel(RAIDLABELDEV(dev),
+ raidstrategy, rs->sc_dkdev.dk_label,
+ rs->sc_dkdev.dk_cpulabel);
+ }
+
+ rs->sc_flags &= ~RAIDF_LABELLING;
+
+ raidunlock(rs);
+
+ if (error)
+ return (error);
+ break;
+
+ case DIOCWLABEL:
+ db1_printf(("DIOCWLABEL\n"));
+ if (*(int *)data != 0)
+ rs->sc_flags |= RAIDF_WLABEL;
+ else
+ rs->sc_flags &= ~RAIDF_WLABEL;
+ break;
+
+ default:
+ retcode = ENOTTY; /* XXXX ?? OR EINVAL ? */
+ }
+ return (retcode);
+}
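
From user level the commands handled above are plain ioctls on the raid device node. A hedged sketch of a caller querying the array size and the reconstruction progress for row 0; the device path and the overall shape are assumptions rather than an existing utility:

    /* Hypothetical user-level sketch, not part of the driver. */
    #include <sys/types.h>
    #include <sys/ioctl.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    #include "rf_raidframe.h"   /* RAIDFRAME_* ioctls, as included by the driver */

    int
    main(void)
    {
            int fd, size, percent;

            if ((fd = open("/dev/rraid0c", O_RDWR)) < 0)    /* assumed path */
                    return (1);

            if (ioctl(fd, RAIDFRAME_GET_SIZE, &size) == 0)
                    printf("total sectors: %d\n", size);

            percent = 0;            /* row number in, percentage out */
            if (ioctl(fd, RAIDFRAME_CHECKRECON, &percent) == 0)
                    printf("reconstruction: %d%% complete\n", percent);

            close(fd);
            return (0);
    }
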
+
+/*
+ * raidinit -- complete the rest of the initialization for the
+ * RAIDframe device.
+ */
+int
+raidinit(dev, raidPtr, unit)
+ dev_t dev;
+ RF_Raid_t *raidPtr;
+ int unit;
+{
+ int retcode;
+ /* int ix; */
+ /* struct raidbuf *raidbp; */
+ struct raid_softc *rs;
+
+ retcode = 0;
+
+ rs = &raid_softc[unit];
+
+ /* XXX should check return code first... */
+ rs->sc_flags |= RAIDF_INITED;
+
+ /* XXX doesn't check bounds.*/
+ sprintf(rs->sc_xname, "raid%d", unit);
+
+ rs->sc_dkdev.dk_name = rs->sc_xname;
+
+ /*
+ * disk_attach actually creates space for the CPU disklabel, among
+ * other things, so it's critical to call this *BEFORE* we
+ * try putzing with disklabels.
+ */
+ disk_attach(&rs->sc_dkdev);
+
+ /*
+ * XXX There may be a weird interaction here between this, and
+ * protectedSectors, as used in RAIDframe.
+ */
+ rs->sc_size = raidPtr->totalSectors;
+ rs->sc_dev = dev;
+ return (retcode);
+}
+
+/*********************************************************
+ *
+ * initialization code called at boot time (startup.c)
+ *
+ ********************************************************/
+int
+rf_boot()
+{
+ int i, rc;
+
+ rc = rf_mutex_init(&rf_sparet_wait_mutex);
+ if (rc) {
+ RF_PANIC();
+ }
+ rc = rf_mutex_init(&rf_async_done_q_mutex);
+ if (rc) {
+ RF_PANIC();
+ }
+ rf_sparet_wait_queue = rf_sparet_resp_queue = NULL;
+ recon_queue = NULL;
+ rf_async_done_qh = rf_async_done_qt = NULL;
+ for (i=0; i<numraid; i++)
+ raidPtrs[i] = NULL;
+ rc = rf_BootRaidframe();
+ if (rc == 0)
+ printf("Kernelized RAIDframe activated\n");
+ else
+ rf_kbooted = RFK_BOOT_BAD;
+ return (rc);
+}
+
+/*
+ * This kernel thread never exits. It is created once, and persists
+ * until the system reboots.
+ */
+void
+rf_ReconKernelThread()
+{
+ struct rf_recon_req *req;
+ int s;
+
+ /*
+ * XXX not sure what spl() level we should be at here...
+ * probably splbio()
+ */
+ s = splbio();
+
+ while (1) {
+ /* grab the next reconstruction request from the queue */
+ LOCK_RECON_Q_MUTEX();
+ while (!recon_queue) {
+ UNLOCK_RECON_Q_MUTEX();
+ tsleep(&recon_queue, PRIBIO | PCATCH, "raidframe recon", 0);
+ LOCK_RECON_Q_MUTEX();
+ }
+ req = recon_queue;
+ recon_queue = recon_queue->next;
+ UNLOCK_RECON_Q_MUTEX();
+
+ /*
+ * If flags specifies that we should start recon, this call
+ * will not return until reconstruction completes, fails, or
+ * is aborted.
+ */
+ rf_FailDisk((RF_Raid_t *) req->raidPtr, req->row, req->col,
+ ((req->flags&RF_FDFLAGS_RECON) ? 1 : 0));
+
+ RF_Free(req, sizeof(*req));
+ }
+}
+
+/*
+ * Wake up the daemon & tell it to get us a spare table
+ * XXX
+ * The entries in the queues should be tagged with the raidPtr so that in the
+ * extremely rare case that two recons happen at once, we know for
+ * which device we're requesting a spare table.
+ * XXX
+ */
+int
+rf_GetSpareTableFromDaemon(req)
+ RF_SparetWait_t *req;
+{
+ int retcode;
+
+ RF_LOCK_MUTEX(rf_sparet_wait_mutex);
+ req->next = rf_sparet_wait_queue;
+ rf_sparet_wait_queue = req;
+ wakeup(&rf_sparet_wait_queue);
+
+	/*
+	 * mpsleep (disabled below) would unlock the mutex; the tsleep
+	 * used here does not.
+	 */
+ while (!rf_sparet_resp_queue) {
+ tsleep(&rf_sparet_resp_queue, PRIBIO | PCATCH,
+ "raidframe getsparetable", 0);
+#if 0
+ mpsleep(&rf_sparet_resp_queue, PZERO, "sparet resp", 0,
+ (void *) simple_lock_addr(rf_sparet_wait_mutex),
+ MS_LOCK_SIMPLE);
+#endif
+ }
+ req = rf_sparet_resp_queue;
+ rf_sparet_resp_queue = req->next;
+ RF_UNLOCK_MUTEX(rf_sparet_wait_mutex);
+
+ retcode = req->fcol;
+ /* this is not the same req as we alloc'd */
+ RF_Free(req, sizeof(*req));
+ return (retcode);
+}
+
+/*
+ * A wrapper around rf_DoAccess that extracts appropriate info from the
+ * bp & passes it down.
+ * Any calls originating in the kernel must use non-blocking I/O; we also
+ * do some extra sanity checking to return "appropriate" error values for
+ * certain conditions (to make some standard utilities work).
+ */
+int
+rf_DoAccessKernel(raidPtr, bp, flags, cbFunc, cbArg)
+ RF_Raid_t *raidPtr;
+ struct buf *bp;
+ RF_RaidAccessFlags_t flags;
+ void (*cbFunc)(struct buf *);
+ void *cbArg;
+{
+ RF_SectorCount_t num_blocks, pb, sum;
+ RF_RaidAddr_t raid_addr;
+ int retcode;
+ struct partition *pp;
+ daddr_t blocknum;
+ int unit;
+ struct raid_softc *rs;
+
+ /* XXX The dev_t used here should be for /dev/[r]raid* !!! */
+
+ unit = raidPtr->raidid;
+ rs = &raid_softc[unit];
+
+ /*
+ * Ok, for the bp we have here, bp->b_blkno is relative to the
+ * partition.. Need to make it absolute to the underlying device..
+ */
+ blocknum = bp->b_blkno;
+ if (DISKPART(bp->b_dev) != RAW_PART) {
+ pp = &rs->sc_dkdev.dk_label->d_partitions[DISKPART(bp->b_dev)];
+ blocknum += pp->p_offset;
+ db1_printf(
+ ("updated: %d %d\n", DISKPART(bp->b_dev), pp->p_offset));
+ } else {
+ db1_printf(("Is raw..\n"));
+ }
+ db1_printf(("Blocks: %d, %d\n", (int)bp->b_blkno, (int)blocknum));
+ db1_printf(("bp->b_bcount = %d\n", (int)bp->b_bcount));
+ db1_printf(("bp->b_resid = %d\n", (int)bp->b_resid));
+
+ /*
+ * *THIS* is where we adjust what block we're going to... but
+ * DO NOT TOUCH bp->b_blkno!!!
+ */
+ raid_addr = blocknum;
+
+ num_blocks = bp->b_bcount >> raidPtr->logBytesPerSector;
+ pb = (bp->b_bcount&raidPtr->sectorMask) ? 1 : 0;
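+	/* pb is 1 if the transfer ends partway through a sector. */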
+ sum = raid_addr + num_blocks + pb;
+ if (1 || rf_debugKernelAccess) {
+ db1_printf(("raid_addr=%d sum=%d num_blocks=%d(+%d) (%d)\n",
+ (int)raid_addr, (int)sum, (int)num_blocks, (int)pb,
+ (int)bp->b_resid));
+ }
+
+
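+	/*
+	 * Reject accesses that run past the end of the array; the extra
+	 * comparisons catch arithmetic wraparound in the sum computed above.
+	 */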
+ if ((sum > raidPtr->totalSectors) || (sum < raid_addr) ||
+ (sum < num_blocks) || (sum < pb)) {
+ bp->b_error = ENOSPC;
+ bp->b_flags |= B_ERROR;
+ bp->b_resid = bp->b_bcount;
+ biodone(bp);
+ return (bp->b_error);
+ }
+
+ /*
+ * XXX rf_DoAccess() should do this, not just DoAccessKernel()
+ */
+ if (bp->b_bcount & raidPtr->sectorMask) {
+ bp->b_error = EINVAL;
+ bp->b_flags |= B_ERROR;
+ bp->b_resid = bp->b_bcount;
+ biodone(bp);
+ return (bp->b_error);
+ }
+ db1_printf(("Calling DoAccess..\n"));
+
+ /*
+ * Don't ever condition on bp->b_flags & B_WRITE.
+ * always condition on B_READ instead.
+ */
+ retcode = rf_DoAccess(raidPtr,
+ (bp->b_flags & B_READ) ? RF_IO_TYPE_READ : RF_IO_TYPE_WRITE,
+ 0, raid_addr, num_blocks, bp->b_un.b_addr, bp, NULL, NULL,
+ RF_DAG_NONBLOCKING_IO|flags, NULL, cbFunc, cbArg);
+ db1_printf(("After call to DoAccess: 0x%x 0x%x %d\n", bp, bp->b_data,
+ (int)bp->b_resid));
+ return (retcode);
+}
+
+/* Invoke an I/O from kernel mode. Disk queue should be locked upon entry */
+
+int
+rf_DispatchKernelIO(queue, req)
+ RF_DiskQueue_t *queue;
+ RF_DiskQueueData_t *req;
+{
+ int op = (req->type == RF_IO_TYPE_READ) ? B_READ : B_WRITE;
+ struct buf *bp;
+ struct raidbuf *raidbp = NULL;
+ struct raid_softc *rs;
+ int unit;
+
+ /*
+ * XXX along with the vnode, we also need the softc associated with
+ * this device..
+ */
+ req->queue = queue;
+
+ unit = queue->raidPtr->raidid;
+
+ db1_printf(("DispatchKernelIO unit: %d\n", unit));
+
+ if (unit >= numraid) {
+ printf("Invalid unit number: %d %d\n", unit, numraid);
+ panic("Invalid Unit number in rf_DispatchKernelIO\n");
+ }
+
+ rs = &raid_softc[unit];
+
+ /* XXX is this the right place? */
+ disk_busy(&rs->sc_dkdev);
+
+ bp = req->bp;
+
+ /*
+ * XXX When there is a physical disk failure, someone is passing
+ * us a buffer that contains old stuff!! Attempt to deal with
+ * this problem without taking a performance hit...
+ * (not sure where the real bug is. It's buried in RAIDframe
+ * somewhere) :-( GO )
+ */
+ if (bp->b_flags & B_ERROR) {
+ bp->b_flags &= ~B_ERROR;
+ }
+ if (bp->b_error!=0) {
+ bp->b_error = 0;
+ }
+
+ raidbp = RAIDGETBUF();
+
+ raidbp->rf_flags = 0; /* XXX not really used anywhere... */
+
+ /*
+ * context for raidiodone
+ */
+ raidbp->rf_obp = bp;
+ raidbp->req = req;
+
+ switch (req->type) {
+ case RF_IO_TYPE_NOP:
+ /* Used primarily to unlock a locked queue. */
+
+ db1_printf(("rf_DispatchKernelIO: NOP to r %d c %d\n",
+ queue->row, queue->col));
+
+ /* XXX need to do something extra here.. */
+
+ /*
+ * I'm leaving this in, as I've never actually seen it
+ * used, and I'd like folks to report it... GO
+ */
+		printf("WAKEUP CALLED\n");
+ queue->numOutstanding++;
+
+ /* XXX need to glue the original buffer into this?? */
+
+ rf_KernelWakeupFunc(&raidbp->rf_buf);
+ break;
+
+ case RF_IO_TYPE_READ:
+ case RF_IO_TYPE_WRITE:
+ if (req->tracerec) {
+ RF_ETIMER_START(req->tracerec->timer);
+ }
+
+ rf_InitBP(&raidbp->rf_buf, queue->rf_cinfo->ci_vp,
+ op | bp->b_flags, queue->rf_cinfo->ci_dev,
+ req->sectorOffset, req->numSector,
+ req->buf, rf_KernelWakeupFunc, (void *)req,
+ queue->raidPtr->logBytesPerSector, req->b_proc);
+
+ if (rf_debugKernelAccess) {
+ db1_printf(("dispatch: bp->b_blkno = %ld\n",
+ (long)bp->b_blkno));
+ }
+ queue->numOutstanding++;
+ queue->last_deq_sector = req->sectorOffset;
+
+ /*
+ * Acc wouldn't have been let in if there were any
+ * pending reqs at any other priority.
+ */
+ queue->curPriority = req->priority;
+
+ db1_printf(("Going for %c to unit %d row %d col %d\n",
+ req->type, unit, queue->row, queue->col));
+ db1_printf(("sector %d count %d (%d bytes) %d\n",
+ (int)req->sectorOffset, (int)req->numSector,
+ (int)(req->numSector << queue->raidPtr->logBytesPerSector),
+ (int)queue->raidPtr->logBytesPerSector));
+ if ((raidbp->rf_buf.b_flags & B_READ) == 0) {
+ raidbp->rf_buf.b_vp->v_numoutput++;
+ }
+
+ VOP_STRATEGY(&raidbp->rf_buf);
+ break;
+
+ default:
+ panic("bad req->type in rf_DispatchKernelIO");
+ }
+ db1_printf(("Exiting from DispatchKernelIO\n"));
+ return (0);
+}
+
+/*
+ * This is the callback function associated with an I/O invoked from
+ * kernel code.
+ */
+void
+rf_KernelWakeupFunc(vbp)
+ struct buf *vbp;
+{
+ RF_DiskQueueData_t *req = NULL;
+ RF_DiskQueue_t *queue;
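+	/*
+	 * Note: the cast below assumes rf_buf is the first member of
+	 * struct raidbuf.
+	 */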
+ struct raidbuf *raidbp = (struct raidbuf *)vbp;
+ struct buf *bp;
+ struct raid_softc *rs;
+ int unit;
+ int s;
+
+ s = splbio(); /* XXX */
+ db1_printf(("recovering the request queue:\n"));
+ req = raidbp->req;
+
+ bp = raidbp->rf_obp;
+ db1_printf(("bp=0x%x\n", bp));
+
+ queue = (RF_DiskQueue_t *)req->queue;
+
+ if (raidbp->rf_buf.b_flags & B_ERROR) {
+ db1_printf(
+ ("Setting bp->b_flags!!! %d\n", raidbp->rf_buf.b_error));
+ bp->b_flags |= B_ERROR;
+ bp->b_error =
+ raidbp->rf_buf.b_error ? raidbp->rf_buf.b_error : EIO;
+ }
+
+ db1_printf(("raidbp->rf_buf.b_bcount=%d\n",
+ (int)raidbp->rf_buf.b_bcount));
+ db1_printf(("raidbp->rf_buf.b_bufsize=%d\n",
+ (int)raidbp->rf_buf.b_bufsize));
+ db1_printf(("raidbp->rf_buf.b_resid=%d\n",
+ (int)raidbp->rf_buf.b_resid));
+ db1_printf(("raidbp->rf_buf.b_data=0x%x\n", raidbp->rf_buf.b_data));
+
+#if 1
+ /* XXX Methinks this could be wrong... */
+ bp->b_resid = raidbp->rf_buf.b_resid;
+#endif
+
+ if (req->tracerec) {
+ RF_ETIMER_STOP(req->tracerec->timer);
+ RF_ETIMER_EVAL(req->tracerec->timer);
+ RF_LOCK_MUTEX(rf_tracing_mutex);
+ req->tracerec->diskwait_us +=
+ RF_ETIMER_VAL_US(req->tracerec->timer);
+ req->tracerec->phys_io_us +=
+ RF_ETIMER_VAL_US(req->tracerec->timer);
+ req->tracerec->num_phys_ios++;
+ RF_UNLOCK_MUTEX(rf_tracing_mutex);
+ }
+
+ bp->b_bcount = raidbp->rf_buf.b_bcount;/* XXXX ?? */
+
+ unit = queue->raidPtr->raidid; /* *Much* simpler :-> */
+
+#if 1
+ /*
+ * XXX Ok, let's get aggressive... If B_ERROR is set, let's go
+ * ballistic, and mark the component as hosed...
+ */
+ if (bp->b_flags & B_ERROR) {
+ /* Mark the disk as dead but only mark it once... */
+ if (queue->raidPtr->Disks[queue->row][queue->col].status ==
+ rf_ds_optimal) {
+ printf("raid%d: IO Error. Marking %s as failed.\n",
+ unit,
+ queue->raidPtr->
+ Disks[queue->row][queue->col].devname);
+ queue->raidPtr->Disks[queue->row][queue->col].status =
+ rf_ds_failed;
+ queue->raidPtr->status[queue->row] = rf_rs_degraded;
+ queue->raidPtr->numFailures++;
+ } else {
+ /* Disk is already dead... */
+ /* printf("Disk already marked as dead!\n"); */
+ }
+ }
+#endif
+
+ rs = &raid_softc[unit];
+ RAIDPUTBUF(raidbp);
+
+ if (bp->b_resid==0) {
+ db1_printf((
+ "Disk is no longer busy for this buffer... %d %ld %ld\n",
+ unit, bp->b_resid, bp->b_bcount));
+ /* XXX is this the right place for a disk_unbusy()??!??!?!? */
+ disk_unbusy(&rs->sc_dkdev, (bp->b_bcount - bp->b_resid));
+ } else {
+ db1_printf(("b_resid is still %ld\n", bp->b_resid));
+ }
+
+ rf_DiskIOComplete(queue, req, (bp->b_flags & B_ERROR) ? 1 : 0);
+ (req->CompleteFunc)(req->argument, (bp->b_flags & B_ERROR) ? 1 : 0);
+ /* printf("Exiting rf_KernelWakeupFunc\n"); */
+
+ splx(s); /* XXX */
+}
+
+/*
+ * Initialize a buf structure for doing an I/O in the kernel.
+ */
+void
+rf_InitBP(bp, b_vp, rw_flag, dev, startSect, numSect, buf, cbFunc, cbArg,
+ logBytesPerSector, b_proc)
+ struct buf *bp;
+ struct vnode *b_vp;
+ unsigned rw_flag;
+ dev_t dev;
+ RF_SectorNum_t startSect;
+ RF_SectorCount_t numSect;
+ caddr_t buf;
+ void (*cbFunc)(struct buf *);
+ void *cbArg;
+ int logBytesPerSector;
+ struct proc *b_proc;
+{
+ /* bp->b_flags = B_PHYS | rw_flag; */
+ bp->b_flags = B_CALL | rw_flag; /* XXX need B_PHYS here too??? */
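+	/*
+	 * B_CALL makes biodone() invoke b_iodone (cbFunc, set below) when
+	 * the I/O completes.
+	 */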
+ bp->b_bcount = numSect << logBytesPerSector;
+ bp->b_bufsize = bp->b_bcount;
+ bp->b_error = 0;
+ bp->b_dev = dev;
+ db1_printf(("bp->b_dev is %d\n", dev));
+ bp->b_un.b_addr = buf;
+ db1_printf(("bp->b_data=0x%x\n", bp->b_data));
+
+ bp->b_blkno = startSect;
+ bp->b_resid = bp->b_bcount; /* XXX is this right!??!?!! */
+ db1_printf(("b_bcount is: %d\n", (int)bp->b_bcount));
+ if (bp->b_bcount == 0) {
+ panic("bp->b_bcount is zero in rf_InitBP!!\n");
+ }
+ bp->b_proc = b_proc;
+ bp->b_iodone = cbFunc;
+ bp->b_vp = b_vp;
+}
+#endif /* KERNEL */
+
+/* Extras... */
+
+unsigned int
+rpcc()
+{
+ /* XXX no clue what this is supposed to do.. my guess is
+ that it's supposed to read the CPU cycle counter... */
+ /* db1_printf("this is supposed to do something useful too!??\n"); */
+ return (0);
+}
+
+#if 0
+int
+rf_GetSpareTableFromDaemon(req)
+ RF_SparetWait_t *req;
+{
+ int retcode=1;
+ printf("This is supposed to do something useful!!\n"); /* XXX */
+
+ return (retcode);
+}
+#endif
+
+void
+raidgetdefaultlabel(raidPtr, rs, lp)
+ RF_Raid_t *raidPtr;
+ struct raid_softc *rs;
+ struct disklabel *lp;
+{
+ db1_printf(("Building a default label...\n"));
+ bzero(lp, sizeof(*lp));
+
+ /* fabricate a label... */
+ lp->d_secperunit = raidPtr->totalSectors;
+ lp->d_secsize = raidPtr->bytesPerSector;
+ lp->d_nsectors = 1024 * (1024 / raidPtr->bytesPerSector);
+ lp->d_ntracks = 1;
+ lp->d_ncylinders = raidPtr->totalSectors / lp->d_nsectors;
+ lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;
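+	/*
+	 * The geometry is fictitious: one track per cylinder,
+	 * 1MB (1024 * 1024 bytes) per track.
+	 */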
+
+ strncpy(lp->d_typename, "raid", sizeof(lp->d_typename));
+ lp->d_type = DTYPE_RAID;
+ strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
+ lp->d_rpm = 3600;
+ lp->d_interleave = 1;
+ lp->d_flags = 0;
+
+ lp->d_partitions[RAW_PART].p_offset = 0;
+ lp->d_partitions[RAW_PART].p_size = raidPtr->totalSectors;
+ lp->d_partitions[RAW_PART].p_fstype = FS_UNUSED;
+ lp->d_npartitions = RAW_PART + 1;
+
+ lp->d_magic = DISKMAGIC;
+ lp->d_magic2 = DISKMAGIC;
+	lp->d_checksum = dkcksum(lp);
+
+}
+
+/*
+ * Read the disklabel from the raid device. If one is not present, fake one
+ * up.
+ */
+void
+raidgetdisklabel(dev)
+ dev_t dev;
+{
+ int unit = raidunit(dev);
+ struct raid_softc *rs = &raid_softc[unit];
+ char *errstring;
+ struct disklabel *lp = rs->sc_dkdev.dk_label;
+ struct cpu_disklabel *clp = rs->sc_dkdev.dk_cpulabel;
+ RF_Raid_t *raidPtr;
+
+ db1_printf(("Getting the disklabel...\n"));
+
+ bzero(clp, sizeof(*clp));
+
+ raidPtr = raidPtrs[unit];
+
+ raidgetdefaultlabel(raidPtr, rs, lp);
+
+ /*
+ * Call the generic disklabel extraction routine.
+ */
+ errstring = readdisklabel(RAIDLABELDEV(dev), raidstrategy,
+ rs->sc_dkdev.dk_label, rs->sc_dkdev.dk_cpulabel, 0);
+ if (errstring)
+ raidmakedisklabel(rs);
+ else {
+ int i;
+ struct partition *pp;
+
+ /*
+ * Sanity check whether the found disklabel is valid.
+ *
+		 * This is necessary since the total size of the raid device
+		 * may vary when the interleave is changed even though exactly
+		 * the same components are used, and an old disklabel may be
+		 * used if one is found.
+ */
+ if (lp->d_secperunit != rs->sc_size)
+ printf("WARNING: %s: "
+ "total sector size in disklabel (%d) != "
+ "the size of raid (%d)\n", rs->sc_xname,
+ lp->d_secperunit, rs->sc_size);
+ for (i = 0; i < lp->d_npartitions; i++) {
+ pp = &lp->d_partitions[i];
+ if (pp->p_offset + pp->p_size > rs->sc_size)
+ printf("WARNING: %s: end of partition `%c' "
+ "exceeds the size of raid (%d)\n",
+ rs->sc_xname, 'a' + i, rs->sc_size);
+ }
+ }
+}
+
+/*
+ * Take care of things one might want to take care of in the event
+ * that a disklabel isn't present.
+ */
+void
+raidmakedisklabel(rs)
+ struct raid_softc *rs;
+{
+ struct disklabel *lp = rs->sc_dkdev.dk_label;
+ db1_printf(("Making a label..\n"));
+
+ /*
+ * For historical reasons, if there's no disklabel present
+ * the raw partition must be marked FS_BSDFFS.
+ */
+
+ lp->d_partitions[RAW_PART].p_fstype = FS_BSDFFS;
+
+ strncpy(lp->d_packname, "default label", sizeof(lp->d_packname));
+
+ lp->d_checksum = dkcksum(lp);
+}
+
+/*
+ * Lookup the provided name in the filesystem. If the file exists,
+ * is a valid block device, and isn't being used by anyone else,
+ * set *vpp to the file's vnode.
+ * You'll find the original of this in ccd.c
+ */
+int
+raidlookup(path, p, vpp)
+ char *path;
+ struct proc *p;
+ struct vnode **vpp; /* result */
+{
+ struct nameidata nd;
+ struct vnode *vp;
+ struct vattr va;
+ int error;
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, path, p);
+ if ((error = vn_open(&nd, FREAD|FWRITE, 0)) != 0) {
+ db1_printf(("RAIDframe: vn_open returned %d\n", error));
+ return (error);
+ }
+ vp = nd.ni_vp;
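+	/* Refuse a component that is already in use elsewhere. */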
+ if (vp->v_usecount > 1) {
+ VOP_UNLOCK(vp, 0, p);
+ (void)vn_close(vp, FREAD|FWRITE, p->p_ucred, p);
+ return (EBUSY);
+ }
+ if ((error = VOP_GETATTR(vp, &va, p->p_ucred, p)) != 0) {
+ VOP_UNLOCK(vp, 0, p);
+ (void)vn_close(vp, FREAD|FWRITE, p->p_ucred, p);
+ return (error);
+ }
+ /* XXX: eventually we should handle VREG, too. */
+ if (va.va_type != VBLK) {
+ VOP_UNLOCK(vp, 0, p);
+ (void)vn_close(vp, FREAD|FWRITE, p->p_ucred, p);
+ return (ENOTBLK);
+ }
+ VOP_UNLOCK(vp, 0, p);
+ *vpp = vp;
+ return (0);
+}
+
+/*
+ * Wait interruptibly for an exclusive lock.
+ *
+ * XXX
+ * Several drivers do this; it should be abstracted and made MP-safe.
+ * (Hmm... where have we seen this warning before :-> GO )
+ */
+int
+raidlock(rs)
+ struct raid_softc *rs;
+{
+ int error;
+
+ while ((rs->sc_flags & RAIDF_LOCKED) != 0) {
+ rs->sc_flags |= RAIDF_WANTED;
+ if ((error = tsleep(rs, PRIBIO | PCATCH, "raidlck", 0)) != 0)
+ return (error);
+ }
+ rs->sc_flags |= RAIDF_LOCKED;
+ return (0);
+}
+
+/*
+ * Unlock and wake up any waiters.
+ */
+void
+raidunlock(rs)
+ struct raid_softc *rs;
+{
+ rs->sc_flags &= ~RAIDF_LOCKED;
+ if ((rs->sc_flags & RAIDF_WANTED) != 0) {
+ rs->sc_flags &= ~RAIDF_WANTED;
+ wakeup(rs);
+ }
+}
diff --git a/sys/dev/raidframe/rf_options.c b/sys/dev/raidframe/rf_options.c
new file mode 100644
index 00000000000..c9af8105ba7
--- /dev/null
+++ b/sys/dev/raidframe/rf_options.c
@@ -0,0 +1,85 @@
+/* $OpenBSD: rf_options.c,v 1.1 1999/01/11 14:29:33 niklas Exp $ */
+/* $NetBSD: rf_options.c,v 1.1 1998/11/13 04:20:31 oster Exp $ */
+/*
+ * rf_options.c
+ */
+/*
+ * Copyright (c) 1996 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+#ifdef _KERNEL
+#define KERNEL
+#endif
+
+#ifdef KERNEL
+#if !defined(__NetBSD__) && !defined(__OpenBSD__)
+#include <dfstrace.h>
+#endif /* !__NetBSD__ && !__OpenBSD__ */
+#endif /* KERNEL */
+
+#include "rf_threadstuff.h"
+#include "rf_types.h"
+#include "rf_archs.h"
+#include "rf_general.h"
+#include "rf_options.h"
+
+#ifdef RF_DBG_OPTION
+#undef RF_DBG_OPTION
+#endif /* RF_DBG_OPTION */
+
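+/*
+ * rf_optnames.h is included three times below with different expansions
+ * of RF_DBG_OPTION(): first to define the rf_<option> variables, then to
+ * build the rf_debugNames[] table, and finally (inside
+ * rf_ResetDebugOptions()) to reset every option to its default value.
+ */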
+#ifdef __STDC__
+#define RF_DBG_OPTION(_option_,_defval_) long rf_##_option_ = _defval_;
+#else /* __STDC__ */
+#define RF_DBG_OPTION(_option_,_defval_) long rf_/**/_option_ = _defval_;
+#endif /* __STDC__ */
+
+#include "rf_optnames.h"
+
+#undef RF_DBG_OPTION
+
+#ifdef __STDC__
+#define RF_DBG_OPTION(_option_,_defval_) { RF_STRING(_option_), &rf_##_option_ },
+#else /* __STDC__ */
+#define RF_DBG_OPTION(_option_,_defval_) { RF_STRING(_option_), &rf_/**/_option_ },
+#endif /* __STDC__ */
+
+RF_DebugName_t rf_debugNames[] = {
+#include "rf_optnames.h"
+ {NULL, NULL}
+};
+
+#undef RF_DBG_OPTION
+
+#ifdef __STDC__
+#define RF_DBG_OPTION(_option_,_defval_) rf_##_option_ = _defval_ ;
+#else /* __STDC__ */
+#define RF_DBG_OPTION(_option_,_defval_) rf_/**/_option_ = _defval_ ;
+#endif /* __STDC__ */
+
+void rf_ResetDebugOptions()
+{
+#include "rf_optnames.h"
+}
diff --git a/sys/dev/raidframe/rf_options.h b/sys/dev/raidframe/rf_options.h
new file mode 100644
index 00000000000..2b5499cb672
--- /dev/null
+++ b/sys/dev/raidframe/rf_options.h
@@ -0,0 +1,68 @@
+/* $OpenBSD: rf_options.h,v 1.1 1999/01/11 14:29:33 niklas Exp $ */
+/* $NetBSD: rf_options.h,v 1.1 1998/11/13 04:20:31 oster Exp $ */
+/*
+ * rf_options.h
+ */
+/*
+ * Copyright (c) 1996 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+#ifndef _RF__RF_OPTIONS_H_
+#define _RF__RF_OPTIONS_H_
+
+#ifdef _KERNEL
+#define KERNEL
+#endif
+
+#ifdef KERNEL
+#if !defined(__NetBSD__) && !defined(__OpenBSD__)
+#include <dfstrace.h>
+#endif /* !__NetBSD__ && !__OpenBSD__ */
+#endif /* KERNEL */
+
+#define RF_DEFAULT_LOCK_TABLE_SIZE 256
+
+typedef struct RF_DebugNames_s {
+ char *name;
+ long *ptr;
+} RF_DebugName_t;
+
+extern RF_DebugName_t rf_debugNames[];
+
+#ifdef RF_DBG_OPTION
+#undef RF_DBG_OPTION
+#endif /* RF_DBG_OPTION */
+
+#ifdef __STDC__
+#define RF_DBG_OPTION(_option_,_defval_) extern long rf_##_option_;
+#else /* __STDC__ */
+#define RF_DBG_OPTION(_option_,_defval_) extern long rf_/**/_option_;
+#endif /* __STDC__ */
+#include "rf_optnames.h"
+
+void rf_ResetDebugOptions(void);
+
+#endif /* !_RF__RF_OPTIONS_H_ */
diff --git a/sys/dev/raidframe/rf_optnames.h b/sys/dev/raidframe/rf_optnames.h
new file mode 100644
index 00000000000..064b2da76f2
--- /dev/null
+++ b/sys/dev/raidframe/rf_optnames.h
@@ -0,0 +1,144 @@
+/* $OpenBSD: rf_optnames.h,v 1.1 1999/01/11 14:29:33 niklas Exp $ */
+/* $NetBSD: rf_optnames.h,v 1.1 1998/11/13 04:20:31 oster Exp $ */
+/*
+ * rf_optnames.h
+ */
+/*
+ * Copyright (c) 1996 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*
+ * Don't protect against multiple inclusion here; we actually want this.
+ */
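+
+/*
+ * This file is simply a list of RF_DBG_OPTION() invocations; each
+ * including file defines RF_DBG_OPTION() appropriately and may include
+ * this list several times (see rf_options.c and rf_options.h).
+ */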
+
+#ifdef _KERNEL
+#define KERNEL
+#endif
+
+RF_DBG_OPTION(accSizeKB,0) /* if nonzero, the fixed access size to run */
+RF_DBG_OPTION(accessDebug,0)
+RF_DBG_OPTION(accessTraceBufSize,0)
+RF_DBG_OPTION(alignAccesses,0) /* whether accs should be aligned to their size */
+RF_DBG_OPTION(camlayerIOs,0)
+RF_DBG_OPTION(camlayerDebug,0) /* debug CAM activity */
+RF_DBG_OPTION(cscanDebug,0) /* debug CSCAN sorting */
+RF_DBG_OPTION(dagDebug,0)
+RF_DBG_OPTION(debugPrintUseBuffer,0)
+RF_DBG_OPTION(degDagDebug,0)
+RF_DBG_OPTION(disableAsyncAccs,0)
+RF_DBG_OPTION(diskDebug,0)
+RF_DBG_OPTION(doDebug,0)
+RF_DBG_OPTION(dtDebug,0)
+RF_DBG_OPTION(enableAtomicRMW,0) /* this debug var enables locking of the disk
+ * arm during small-write operations. Setting
+ * this variable to anything other than 0 will
+ * result in deadlock. (wvcii)
+ */
+RF_DBG_OPTION(engineDebug,0)
+RF_DBG_OPTION(fifoDebug,0) /* debug fifo queueing */
+RF_DBG_OPTION(floatingRbufDebug,0)
+RF_DBG_OPTION(forceHeadSepLimit,-1)
+RF_DBG_OPTION(forceNumFloatingReconBufs,-1) /* wire down number of extra recon buffers to use */
+RF_DBG_OPTION(keepAccTotals,0) /* turn on keep_acc_totals */
+RF_DBG_OPTION(lockTableSize,RF_DEFAULT_LOCK_TABLE_SIZE)
+RF_DBG_OPTION(mapDebug,0)
+RF_DBG_OPTION(maxNumTraces,-1)
+RF_DBG_OPTION(maxRandomSizeKB,128) /* if rf_accSizeKB==0, acc sizes are uniform in [ (1/2)..maxRandomSizeKB ] */
+RF_DBG_OPTION(maxTraceRunTimeSec,0)
+RF_DBG_OPTION(memAmtDebug,0) /* trace amount of memory allocated */
+RF_DBG_OPTION(memChunkDebug,0)
+RF_DBG_OPTION(memDebug,0)
+RF_DBG_OPTION(memDebugAddress,0)
+RF_DBG_OPTION(numBufsToAccumulate,1) /* number of buffers to accumulate before doing XOR */
+RF_DBG_OPTION(prReconSched,0)
+RF_DBG_OPTION(printDAGsDebug,0)
+RF_DBG_OPTION(printStatesDebug,0)
+RF_DBG_OPTION(protectedSectors,64L) /* # of sectors at start of disk to
+ exclude from RAID address space */
+RF_DBG_OPTION(pssDebug,0)
+RF_DBG_OPTION(queueDebug,0)
+RF_DBG_OPTION(quiesceDebug,0)
+RF_DBG_OPTION(raidSectorOffset,0) /* added to all incoming sectors to
+ debug alignment problems */
+RF_DBG_OPTION(reconDebug,0)
+RF_DBG_OPTION(reconbufferDebug,0)
+RF_DBG_OPTION(rewriteParityStripes,0) /* debug flag that causes parity rewrite at startup */
+RF_DBG_OPTION(scanDebug,0) /* debug SCAN sorting */
+RF_DBG_OPTION(showXorCallCounts,0) /* show n-way Xor call counts */
+RF_DBG_OPTION(shutdownDebug,0) /* show shutdown calls */
+RF_DBG_OPTION(sizePercentage,100)
+RF_DBG_OPTION(sstfDebug,0) /* turn on debugging info for sstf queueing */
+RF_DBG_OPTION(stripeLockDebug,0)
+RF_DBG_OPTION(suppressLocksAndLargeWrites,0)
+RF_DBG_OPTION(suppressTraceDelays,0)
+RF_DBG_OPTION(testDebug,0)
+RF_DBG_OPTION(useMemChunks,1)
+RF_DBG_OPTION(validateDAGDebug,0)
+RF_DBG_OPTION(validateVisitedDebug,1) /* XXX turn to zero by default? */
+RF_DBG_OPTION(verifyParityDebug,0)
+RF_DBG_OPTION(warnLongIOs,0)
+
+#ifdef KERNEL
+RF_DBG_OPTION(debugKernelAccess,0) /* DoAccessKernel debugging */
+#endif /* KERNEL */
+
+#ifndef KERNEL
+RF_DBG_OPTION(disableParityVerify,0) /* suppress verification of parity */
+RF_DBG_OPTION(interactiveScript,0) /* set as a debug option for now */
+RF_DBG_OPTION(looptestShowWrites,0) /* user-level loop test write debugging */
+RF_DBG_OPTION(traceDebug,0)
+#endif /* !KERNEL */
+
+#ifdef SIMULATE
+RF_DBG_OPTION(addrSizePercentage,100)
+RF_DBG_OPTION(diskTrace,0) /* used to turn the timing traces on and off */
+RF_DBG_OPTION(eventDebug,0)
+RF_DBG_OPTION(mWactive,1500)
+RF_DBG_OPTION(mWidle,625)
+RF_DBG_OPTION(mWsleep,15)
+RF_DBG_OPTION(mWspinup,3500)
+#endif /* SIMULATE */
+
+#if RF_INCLUDE_PARITYLOGGING > 0
+RF_DBG_OPTION(forceParityLogReint,0)
+RF_DBG_OPTION(numParityRegions,0) /* number of regions in the array */
+RF_DBG_OPTION(numReintegrationThreads,1)
+RF_DBG_OPTION(parityLogDebug,0) /* if nonzero, enables debugging of parity logging */
+RF_DBG_OPTION(totalInCoreLogCapacity,1024*1024) /* target bytes available for in-core logs */
+#endif /* RF_INCLUDE_PARITYLOGGING > 0 */
+
+#if DFSTRACE > 0
+RF_DBG_OPTION(DFSTraceAccesses,0)
+#endif /* DFSTRACE > 0 */
+
+#if RF_DEMO > 0
+RF_DBG_OPTION(demoMeterHpos,0) /* horizontal position of meters for demo mode */
+RF_DBG_OPTION(demoMeterTag,0)
+RF_DBG_OPTION(demoMeterVpos,0) /* vertical position of meters for demo mode */
+RF_DBG_OPTION(demoMode,0)
+RF_DBG_OPTION(demoSMM,0)
+RF_DBG_OPTION(demoSuppressReconInitVerify,0) /* suppress initialization & verify for recon */
+#endif /* RF_DEMO > 0 */
diff --git a/sys/dev/raidframe/rf_owner.h b/sys/dev/raidframe/rf_owner.h
new file mode 100644
index 00000000000..5b741bf3a5d
--- /dev/null
+++ b/sys/dev/raidframe/rf_owner.h
@@ -0,0 +1,75 @@
+/* $OpenBSD: rf_owner.h,v 1.1 1999/01/11 14:29:33 niklas Exp $ */
+/* $NetBSD: rf_owner.h,v 1.1 1998/11/13 04:20:31 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* :
+ * Log: rf_owner.h,v
+ * Revision 1.8 1996/08/20 14:36:51 jimz
+ * add bufLen to RF_EventCreate_t to be able to include buffer length
+ * when freeing buffer
+ *
+ * Revision 1.7 1996/06/02 17:31:48 jimz
+ * Moved a lot of global stuff into array structure, where it belongs.
+ * Fixed up paritylogging, pss modules in this manner. Some general
+ * code cleanup. Removed lots of dead code, some dead files.
+ *
+ * Revision 1.6 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.5 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.4 1995/12/01 19:44:30 root
+ * added copyright info
+ *
+ */
+
+#ifndef _RF__RF_OWNER_H_
+#define _RF__RF_OWNER_H_
+
+#include "rf_types.h"
+
+struct RF_OwnerInfo_s {
+ RF_RaidAccessDesc_t *desc;
+ int owner;
+ double last_start;
+ int done;
+ int notFirst;
+};
+
+struct RF_EventCreate_s {
+ RF_Raid_t *raidPtr;
+ RF_Script_t *script;
+ RF_OwnerInfo_t *ownerInfo;
+ char *bufPtr;
+ int bufLen;
+};
+
+#endif /* !_RF__RF_OWNER_H_ */
diff --git a/sys/dev/raidframe/rf_paritylog.c b/sys/dev/raidframe/rf_paritylog.c
new file mode 100644
index 00000000000..84bf2107d99
--- /dev/null
+++ b/sys/dev/raidframe/rf_paritylog.c
@@ -0,0 +1,1022 @@
+/* $OpenBSD: rf_paritylog.c,v 1.1 1999/01/11 14:29:34 niklas Exp $ */
+/* $NetBSD: rf_paritylog.c,v 1.1 1998/11/13 04:20:31 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: William V. Courtright II
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* Code for manipulating in-core parity logs
+ *
+ * :
+ * Log: rf_paritylog.c,v
+ * Revision 1.27 1996/07/28 20:31:39 jimz
+ * i386netbsd port
+ * true/false fixup
+ *
+ * Revision 1.26 1996/07/27 23:36:08 jimz
+ * Solaris port of simulator
+ *
+ * Revision 1.25 1996/07/17 21:00:58 jimz
+ * clean up timer interface, tracing
+ *
+ * Revision 1.24 1996/06/11 10:18:59 jimz
+ * AllocParityLogCommonData() was freeing the common pointer immediately
+ * after allocating this. It appeared that this free really belonged
+ * inside one of the failure cases (for backing out), so I moved it
+ * in there.
+ *
+ * Revision 1.23 1996/06/05 18:06:02 jimz
+ * Major code cleanup. The Great Renaming is now done.
+ * Better modularity. Better typing. Fixed a bunch of
+ * synchronization bugs. Made a lot of global stuff
+ * per-desc or per-array. Removed dead code.
+ *
+ * Revision 1.22 1996/06/02 17:31:48 jimz
+ * Moved a lot of global stuff into array structure, where it belongs.
+ * Fixed up paritylogging, pss modules in this manner. Some general
+ * code cleanup. Removed lots of dead code, some dead files.
+ *
+ * Revision 1.21 1996/05/31 22:26:54 jimz
+ * fix a lot of mapping problems, memory allocation problems
+ * found some weird lock issues, fixed 'em
+ * more code cleanup
+ *
+ * Revision 1.20 1996/05/30 23:22:16 jimz
+ * bugfixes of serialization, timing problems
+ * more cleanup
+ *
+ * Revision 1.19 1996/05/30 12:59:18 jimz
+ * make etimer happier, more portable
+ *
+ * Revision 1.18 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.17 1996/05/24 04:28:55 jimz
+ * release cleanup ckpt
+ *
+ * Revision 1.16 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.15 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.14 1996/05/20 16:16:59 jimz
+ * switch to rf_{mutex,cond}_{init,destroy}
+ *
+ * Revision 1.13 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.12 1995/12/12 18:10:06 jimz
+ * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT
+ * fix 80-column brain damage in comments
+ *
+ * Revision 1.11 1995/12/06 20:54:44 wvcii
+ * added prototyping
+ *
+ * Revision 1.10 1995/11/30 16:05:37 wvcii
+ * added copyright info
+ *
+ * Revision 1.9 1995/10/08 20:41:28 wvcii
+ * fixed bug in allocation of CommonLogData (was allocating incorrect size)
+ *
+ * Revision 1.8 1995/09/07 15:52:12 jimz
+ * noop compile when INCLUDE_PARITYLOGGING not defined
+ *
+ * Revision 1.7 1995/09/06 19:17:36 wvcii
+ * moved code for reintegration to rf_paritylogDiskMgr.c
+ *
+ * Revision 1.6 95/07/07 00:16:06 wvcii
+ * this version free from deadlock, fails parity verification
+ *
+ * Revision 1.5 1995/06/09 13:14:24 wvcii
+ * code is now nonblocking
+ *
+ * Revision 1.4 95/06/01 17:01:59 wvcii
+ * code debug
+ *
+ * Revision 1.3 95/05/31 13:08:23 wvcii
+ * code debug
+ *
+ * Revision 1.2 95/05/21 15:42:15 wvcii
+ * code debug
+ *
+ * Revision 1.1 95/05/18 10:43:54 wvcii
+ * Initial revision
+ *
+ */
+
+#include "rf_archs.h"
+
+#if RF_INCLUDE_PARITYLOGGING > 0
+
+/*
+ * Append-only log for recording parity "update" and "overwrite" records
+ */
+
+#include "rf_types.h"
+#include "rf_threadstuff.h"
+#include "rf_mcpair.h"
+#include "rf_raid.h"
+#include "rf_dag.h"
+#include "rf_dagfuncs.h"
+#include "rf_desc.h"
+#include "rf_layout.h"
+#include "rf_diskqueue.h"
+#include "rf_etimer.h"
+#include "rf_paritylog.h"
+#include "rf_general.h"
+#include "rf_threadid.h"
+#include "rf_map.h"
+#include "rf_paritylogging.h"
+#include "rf_paritylogDiskMgr.h"
+#include "rf_sys.h"
+
+static RF_CommonLogData_t *AllocParityLogCommonData(RF_Raid_t *raidPtr)
+{
+ RF_CommonLogData_t *common = NULL;
+ int rc;
+
+ /* Return a struct for holding common parity log information from the free
+ list (rf_parityLogDiskQueue.freeCommonList). If the free list is empty, call
+ RF_Malloc to create a new structure.
+ NON-BLOCKING */
+
+ RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ if (raidPtr->parityLogDiskQueue.freeCommonList)
+ {
+ common = raidPtr->parityLogDiskQueue.freeCommonList;
+ raidPtr->parityLogDiskQueue.freeCommonList = raidPtr->parityLogDiskQueue.freeCommonList->next;
+ RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ }
+ else
+ {
+ RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ RF_Malloc(common, sizeof(RF_CommonLogData_t), (RF_CommonLogData_t *));
+ rc = rf_mutex_init(&common->mutex);
+ if (rc) {
+ RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ RF_Free(common, sizeof(RF_CommonLogData_t));
+ common = NULL;
+ }
+ }
+  if (common == NULL)
+    return(NULL);
+  common->next = NULL;
+ return(common);
+}
+
+static void FreeParityLogCommonData(RF_CommonLogData_t *common)
+{
+ RF_Raid_t *raidPtr;
+
+ /* Insert a single struct for holding parity log information
+ (data) into the free list (rf_parityLogDiskQueue.freeCommonList).
+ NON-BLOCKING */
+
+ raidPtr = common->raidPtr;
+ RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ common->next = raidPtr->parityLogDiskQueue.freeCommonList;
+ raidPtr->parityLogDiskQueue.freeCommonList = common;
+ RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+}
+
+static RF_ParityLogData_t *AllocParityLogData(RF_Raid_t *raidPtr)
+{
+ RF_ParityLogData_t *data = NULL;
+
+ /* Return a struct for holding parity log information from the free
+     list (parityLogDiskQueue.freeDataList). If the free list is empty, call
+ RF_Malloc to create a new structure.
+ NON-BLOCKING */
+
+ RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ if (raidPtr->parityLogDiskQueue.freeDataList)
+ {
+ data = raidPtr->parityLogDiskQueue.freeDataList;
+ raidPtr->parityLogDiskQueue.freeDataList = raidPtr->parityLogDiskQueue.freeDataList->next;
+ RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ }
+ else
+ {
+ RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ RF_Malloc(data, sizeof(RF_ParityLogData_t), (RF_ParityLogData_t *));
+ }
+ data->next = NULL;
+ data->prev = NULL;
+ return(data);
+}
+
+
+static void FreeParityLogData(RF_ParityLogData_t *data)
+{
+ RF_ParityLogData_t *nextItem;
+ RF_Raid_t *raidPtr;
+
+ /* Insert a linked list of structs for holding parity log
+     information (data) into the free list (parityLogDiskQueue.freeDataList).
+ NON-BLOCKING */
+
+ raidPtr = data->common->raidPtr;
+ RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ while (data)
+ {
+ nextItem = data->next;
+ data->next = raidPtr->parityLogDiskQueue.freeDataList;
+ raidPtr->parityLogDiskQueue.freeDataList = data;
+ data = nextItem;
+ }
+ RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+}
+
+
+static void EnqueueParityLogData(
+ RF_ParityLogData_t *data,
+ RF_ParityLogData_t **head,
+ RF_ParityLogData_t **tail)
+{
+ RF_Raid_t *raidPtr;
+
+ /* Insert an in-core parity log (*data) into the head of
+ a disk queue (*head, *tail).
+ NON-BLOCKING */
+
+ raidPtr = data->common->raidPtr;
+ if (rf_parityLogDebug)
+ printf("[enqueueing parity log data, region %d, raidAddress %d, numSector %d]\n",data->regionID,(int)data->diskAddress.raidAddress, (int)data->diskAddress.numSector);
+ RF_ASSERT(data->prev == NULL);
+ RF_ASSERT(data->next == NULL);
+ RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ if (*head)
+ {
+ /* insert into head of queue */
+ RF_ASSERT((*head)->prev == NULL);
+ RF_ASSERT((*tail)->next == NULL);
+ data->next = *head;
+ (*head)->prev = data;
+ *head = data;
+ }
+ else
+ {
+ /* insert into empty list */
+ RF_ASSERT(*head == NULL);
+ RF_ASSERT(*tail == NULL);
+ *head = data;
+ *tail = data;
+ }
+ RF_ASSERT((*head)->prev == NULL);
+ RF_ASSERT((*tail)->next == NULL);
+ RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+}
+
+static RF_ParityLogData_t *DequeueParityLogData(
+ RF_Raid_t *raidPtr,
+ RF_ParityLogData_t **head,
+ RF_ParityLogData_t **tail,
+ int ignoreLocks)
+{
+ RF_ParityLogData_t *data;
+
+ /* Remove and return an in-core parity log from the tail of
+ a disk queue (*head, *tail).
+ NON-BLOCKING */
+
+ /* remove from tail, preserving FIFO order */
+ if (!ignoreLocks)
+ RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ data = *tail;
+ if (data)
+ {
+ if (*head == *tail)
+ {
+ /* removing last item from queue */
+ *head = NULL;
+ *tail = NULL;
+ }
+ else
+ {
+ *tail = (*tail)->prev;
+ (*tail)->next = NULL;
+ RF_ASSERT((*head)->prev == NULL);
+ RF_ASSERT((*tail)->next == NULL);
+ }
+ data->next = NULL;
+ data->prev = NULL;
+ if (rf_parityLogDebug)
+ printf("[dequeueing parity log data, region %d, raidAddress %d, numSector %d]\n",data->regionID,(int)data->diskAddress.raidAddress, (int)data->diskAddress.numSector);
+ }
+ if (*head)
+ {
+ RF_ASSERT((*head)->prev == NULL);
+ RF_ASSERT((*tail)->next == NULL);
+ }
+ if (!ignoreLocks)
+ RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ return(data);
+}
+
+
+static void RequeueParityLogData(
+ RF_ParityLogData_t *data,
+ RF_ParityLogData_t **head,
+ RF_ParityLogData_t **tail)
+{
+ RF_Raid_t *raidPtr;
+
+ /* Insert an in-core parity log (*data) into the tail of
+ a disk queue (*head, *tail).
+ NON-BLOCKING */
+
+ raidPtr = data->common->raidPtr;
+ RF_ASSERT(data);
+ if (rf_parityLogDebug)
+ printf("[requeueing parity log data, region %d, raidAddress %d, numSector %d]\n",data->regionID,(int)data->diskAddress.raidAddress, (int) data->diskAddress.numSector);
+ RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ if (*tail)
+ {
+ /* append to tail of list */
+ data->prev = *tail;
+ data->next = NULL;
+ (*tail)->next = data;
+ *tail = data;
+ }
+ else
+ {
+ /* inserting into an empty list */
+ *head = data;
+ *tail = data;
+ (*head)->prev = NULL;
+ (*tail)->next = NULL;
+ }
+ RF_ASSERT((*head)->prev == NULL);
+ RF_ASSERT((*tail)->next == NULL);
+ RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+}
+
+RF_ParityLogData_t *rf_CreateParityLogData(
+ RF_ParityRecordType_t operation,
+ RF_PhysDiskAddr_t *pda,
+ caddr_t bufPtr,
+ RF_Raid_t *raidPtr,
+ int (*wakeFunc)(RF_DagNode_t *node, int status),
+ void *wakeArg,
+ RF_AccTraceEntry_t *tracerec,
+ RF_Etimer_t startTime)
+{
+ RF_ParityLogData_t *data, *resultHead = NULL, *resultTail = NULL;
+ RF_CommonLogData_t *common;
+ RF_PhysDiskAddr_t *diskAddress;
+ int boundary, offset = 0;
+
+ /* Return an initialized struct of info to be logged.
+ Build one item per physical disk address, one item per region.
+
+ NON-BLOCKING */
+
+ diskAddress = pda;
+ common = AllocParityLogCommonData(raidPtr);
+ RF_ASSERT(common);
+
+ common->operation = operation;
+ common->bufPtr = bufPtr;
+ common->raidPtr = raidPtr;
+ common->wakeFunc = wakeFunc;
+ common->wakeArg = wakeArg;
+ common->tracerec = tracerec;
+ common->startTime = startTime;
+ common->cnt = 0;
+
+ if (rf_parityLogDebug)
+ printf("[entering CreateParityLogData]\n");
+ while (diskAddress)
+ {
+ common->cnt++;
+ data = AllocParityLogData(raidPtr);
+ RF_ASSERT(data);
+ data->common = common;
+ data->next = NULL;
+ data->prev = NULL;
+ data->regionID = rf_MapRegionIDParityLogging(raidPtr, diskAddress->startSector);
+ if (data->regionID == rf_MapRegionIDParityLogging(raidPtr, diskAddress->startSector + diskAddress->numSector - 1))
+ {
+ /* disk address does not cross a region boundary */
+ data->diskAddress = *diskAddress;
+ data->bufOffset = offset;
+ offset = offset + diskAddress->numSector;
+ EnqueueParityLogData(data, &resultHead, &resultTail);
+ /* adjust disk address */
+ diskAddress = diskAddress->next;
+ }
+ else
+ {
+ /* disk address crosses a region boundary */
+ /* find address where region is crossed */
+ boundary = 0;
+ while (data->regionID == rf_MapRegionIDParityLogging(raidPtr, diskAddress->startSector + boundary))
+ boundary++;
+
+ /* enter data before the boundary */
+ data->diskAddress = *diskAddress;
+ data->diskAddress.numSector = boundary;
+ data->bufOffset = offset;
+ offset += boundary;
+ EnqueueParityLogData(data, &resultHead, &resultTail);
+ /* adjust disk address */
+ diskAddress->startSector += boundary;
+ diskAddress->numSector -= boundary;
+ }
+ }
+ if (rf_parityLogDebug)
+ printf("[leaving CreateParityLogData]\n");
+ return(resultHead);
+}
+
+
+RF_ParityLogData_t *rf_SearchAndDequeueParityLogData(
+ RF_Raid_t *raidPtr,
+ int regionID,
+ RF_ParityLogData_t **head,
+ RF_ParityLogData_t **tail,
+ int ignoreLocks)
+{
+ RF_ParityLogData_t *w;
+
+ /* Remove and return an in-core parity log from a specified region (regionID).
+ If a matching log is not found, return NULL.
+
+ NON-BLOCKING.
+ */
+
+ /* walk backward through a list, looking for an entry with a matching region ID */
+ if (!ignoreLocks)
+ RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ w = (*tail);
+ while (w)
+ {
+ if (w->regionID == regionID)
+ {
+ /* remove an element from the list */
+ if (w == *tail)
+ {
+ if (*head == *tail)
+ {
+ /* removing only element in the list */
+ *head = NULL;
+ *tail = NULL;
+ }
+ else
+ {
+ /* removing last item in the list */
+ *tail = (*tail)->prev;
+ (*tail)->next = NULL;
+ RF_ASSERT((*head)->prev == NULL);
+ RF_ASSERT((*tail)->next == NULL);
+ }
+ }
+ else
+ {
+ if (w == *head)
+ {
+ /* removing first item in the list */
+ *head = (*head)->next;
+ (*head)->prev = NULL;
+ RF_ASSERT((*head)->prev == NULL);
+ RF_ASSERT((*tail)->next == NULL);
+ }
+ else
+ {
+ /* removing an item from the middle of the list */
+ w->prev->next = w->next;
+ w->next->prev = w->prev;
+ RF_ASSERT((*head)->prev == NULL);
+ RF_ASSERT((*tail)->next == NULL);
+ }
+ }
+ w->prev = NULL;
+ w->next = NULL;
+ if (rf_parityLogDebug)
+ printf("[dequeueing parity log data, region %d, raidAddress %d, numSector %d]\n",w->regionID,(int)w->diskAddress.raidAddress,(int) w->diskAddress.numSector);
+ return(w);
+ }
+ else
+ w = w->prev;
+ }
+ if (!ignoreLocks)
+ RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ return(NULL);
+}
+
+static RF_ParityLogData_t *DequeueMatchingLogData(
+ RF_Raid_t *raidPtr,
+ RF_ParityLogData_t **head,
+ RF_ParityLogData_t **tail)
+{
+ RF_ParityLogData_t *logDataList, *logData;
+ int regionID;
+
+ /* Remove and return an in-core parity log from the tail of
+ a disk queue (*head, *tail). Then remove all matching
+ (identical regionIDs) logData and return as a linked list.
+
+ NON-BLOCKING
+ */
+
+ logDataList = DequeueParityLogData(raidPtr, head, tail, RF_TRUE);
+ if (logDataList)
+ {
+ regionID = logDataList->regionID;
+ logData = logDataList;
+ logData->next = rf_SearchAndDequeueParityLogData(raidPtr, regionID, head, tail, RF_TRUE);
+ while (logData->next)
+ {
+ logData = logData->next;
+ logData->next = rf_SearchAndDequeueParityLogData(raidPtr, regionID, head, tail, RF_TRUE);
+ }
+ }
+ return(logDataList);
+}
+
+
+static RF_ParityLog_t *AcquireParityLog(
+ RF_ParityLogData_t *logData,
+ int finish)
+{
+ RF_ParityLog_t *log = NULL;
+ RF_Raid_t *raidPtr;
+
+ /* Grab a log buffer from the pool and return it.
+ If no buffers are available, return NULL.
+ NON-BLOCKING
+ */
+ raidPtr = logData->common->raidPtr;
+ RF_LOCK_MUTEX(raidPtr->parityLogPool.mutex);
+ if (raidPtr->parityLogPool.parityLogs)
+ {
+ log = raidPtr->parityLogPool.parityLogs;
+ raidPtr->parityLogPool.parityLogs = raidPtr->parityLogPool.parityLogs->next;
+ log->regionID = logData->regionID;
+ log->numRecords = 0;
+ log->next = NULL;
+ raidPtr->logsInUse++;
+ RF_ASSERT(raidPtr->logsInUse >= 0 && raidPtr->logsInUse <= raidPtr->numParityLogs);
+ }
+ else
+ {
+ /* no logs available, so place ourselves on the queue of work waiting on log buffers
+ this is done while parityLogPool.mutex is held, to ensure synchronization
+ with ReleaseParityLogs.
+ */
+ if (rf_parityLogDebug)
+ printf("[blocked on log, region %d, finish %d]\n", logData->regionID, finish);
+ if (finish)
+ RequeueParityLogData(logData, &raidPtr->parityLogDiskQueue.logBlockHead, &raidPtr->parityLogDiskQueue.logBlockTail);
+ else
+ EnqueueParityLogData(logData, &raidPtr->parityLogDiskQueue.logBlockHead, &raidPtr->parityLogDiskQueue.logBlockTail);
+ }
+ RF_UNLOCK_MUTEX(raidPtr->parityLogPool.mutex);
+ return(log);
+}
+
+void rf_ReleaseParityLogs(
+ RF_Raid_t *raidPtr,
+ RF_ParityLog_t *firstLog)
+{
+ RF_ParityLogData_t *logDataList;
+ RF_ParityLog_t *log, *lastLog;
+ int cnt;
+
+  /* Insert a linked list of parity logs (firstLog) into
+     the free list (parityLogPool.parityLogs).
+
+ NON-BLOCKING.
+ */
+
+ RF_ASSERT(firstLog);
+
+ /* Before returning logs to global free list, service all
+ requests which are blocked on logs. Holding mutexes for parityLogPool and parityLogDiskQueue
+ forces synchronization with AcquireParityLog().
+ */
+ RF_LOCK_MUTEX(raidPtr->parityLogPool.mutex);
+ RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ logDataList = DequeueMatchingLogData(raidPtr, &raidPtr->parityLogDiskQueue.logBlockHead, &raidPtr->parityLogDiskQueue.logBlockTail);
+ log = firstLog;
+ if (firstLog)
+ firstLog = firstLog->next;
+ log->numRecords = 0;
+ log->next = NULL;
+ while (logDataList && log)
+ {
+ RF_UNLOCK_MUTEX(raidPtr->parityLogPool.mutex);
+ RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ rf_ParityLogAppend(logDataList, RF_TRUE, &log, RF_FALSE);
+ if (rf_parityLogDebug)
+ printf("[finishing up buf-blocked log data, region %d]\n", logDataList->regionID);
+ if (log == NULL)
+ {
+ log = firstLog;
+ if (firstLog)
+ {
+ firstLog = firstLog->next;
+ log->numRecords = 0;
+ log->next = NULL;
+ }
+ }
+ RF_LOCK_MUTEX(raidPtr->parityLogPool.mutex);
+ RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ if (log)
+ logDataList = DequeueMatchingLogData(raidPtr, &raidPtr->parityLogDiskQueue.logBlockHead, &raidPtr->parityLogDiskQueue.logBlockTail);
+ }
+ /* return remaining logs to pool */
+ if (log)
+ {
+ log->next = firstLog;
+ firstLog = log;
+ }
+ if (firstLog)
+ {
+ lastLog = firstLog;
+ raidPtr->logsInUse--;
+ RF_ASSERT(raidPtr->logsInUse >= 0 && raidPtr->logsInUse <= raidPtr->numParityLogs);
+ while (lastLog->next)
+ {
+ lastLog = lastLog->next;
+ raidPtr->logsInUse--;
+ RF_ASSERT(raidPtr->logsInUse >= 0 && raidPtr->logsInUse <= raidPtr->numParityLogs);
+ }
+ lastLog->next = raidPtr->parityLogPool.parityLogs;
+ raidPtr->parityLogPool.parityLogs = firstLog;
+ cnt = 0;
+ log = raidPtr->parityLogPool.parityLogs;
+ while (log)
+ {
+ cnt++;
+ log = log->next;
+ }
+ RF_ASSERT(cnt + raidPtr->logsInUse == raidPtr->numParityLogs);
+ }
+ RF_UNLOCK_MUTEX(raidPtr->parityLogPool.mutex);
+ RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+}
+
+static void ReintLog(
+ RF_Raid_t *raidPtr,
+ int regionID,
+ RF_ParityLog_t *log)
+{
+ RF_ASSERT(log);
+
+ /* Insert an in-core parity log (log) into the disk queue of reintegration
+ work. Set the flag (reintInProgress) for the specified region (regionID)
+ to indicate that reintegration is in progress for this region.
+ NON-BLOCKING
+ */
+
+ RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex);
+ raidPtr->regionInfo[regionID].reintInProgress = RF_TRUE; /* cleared when reint complete */
+
+ if (rf_parityLogDebug)
+ printf("[requesting reintegration of region %d]\n", log->regionID);
+ /* move record to reintegration queue */
+ RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ log->next = raidPtr->parityLogDiskQueue.reintQueue;
+ raidPtr->parityLogDiskQueue.reintQueue = log;
+ RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex);
+ RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ RF_SIGNAL_COND(raidPtr->parityLogDiskQueue.cond);
+}
+
+static void FlushLog(
+ RF_Raid_t *raidPtr,
+ RF_ParityLog_t *log)
+{
+ /* insert a core log (log) into a list of logs (parityLogDiskQueue.flushQueue)
+ waiting to be written to disk.
+ NON-BLOCKING
+ */
+
+ RF_ASSERT(log);
+ RF_ASSERT(log->numRecords == raidPtr->numSectorsPerLog);
+ RF_ASSERT(log->next == NULL);
+ /* move log to flush queue */
+ RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ log->next = raidPtr->parityLogDiskQueue.flushQueue;
+ raidPtr->parityLogDiskQueue.flushQueue = log;
+ RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ RF_SIGNAL_COND(raidPtr->parityLogDiskQueue.cond);
+}
+
+static int DumpParityLogToDisk(
+ int finish,
+ RF_ParityLogData_t *logData)
+{
+ int i, diskCount, regionID = logData->regionID;
+ RF_ParityLog_t *log;
+ RF_Raid_t *raidPtr;
+
+ raidPtr = logData->common->raidPtr;
+
+ /* Move a core log to disk. If the log disk is full, initiate
+ reintegration.
+
+ Return (0) if we can enqueue the dump immediately, otherwise
+ return (1) to indicate we are blocked on reintegration and
+ control of the thread should be relinquished.
+
+ Caller must hold regionInfo[regionID].mutex
+
+ NON-BLOCKING
+ */
+
+ if (rf_parityLogDebug)
+ printf("[dumping parity log to disk, region %d]\n", regionID);
+ log = raidPtr->regionInfo[regionID].coreLog;
+ RF_ASSERT(log->numRecords == raidPtr->numSectorsPerLog);
+ RF_ASSERT(log->next == NULL);
+
+ /* if reintegration is in progress, must queue work */
+ RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex);
+ if (raidPtr->regionInfo[regionID].reintInProgress)
+ {
+      /* Cannot proceed since this region is currently being reintegrated.
+         We cannot block, so queue remaining work and return. */
+ if (rf_parityLogDebug)
+ printf("[region %d waiting on reintegration]\n",regionID);
+ /* XXX not sure about the use of finish - shouldn't this always be "Enqueue"? */
+ if (finish)
+ RequeueParityLogData(logData, &raidPtr->parityLogDiskQueue.reintBlockHead, &raidPtr->parityLogDiskQueue.reintBlockTail);
+ else
+ EnqueueParityLogData(logData, &raidPtr->parityLogDiskQueue.reintBlockHead, &raidPtr->parityLogDiskQueue.reintBlockTail);
+ RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex);
+      return(1); /* relinquish control of this thread */
+ }
+ RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex);
+ raidPtr->regionInfo[regionID].coreLog = NULL;
+ if ((raidPtr->regionInfo[regionID].diskCount) < raidPtr->regionInfo[regionID].capacity)
+ /* IMPORTANT!! this loop bound assumes region disk holds an integral number of core logs */
+ {
+ /* update disk map for this region */
+ diskCount = raidPtr->regionInfo[regionID].diskCount;
+ for (i = 0; i < raidPtr->numSectorsPerLog; i++)
+ {
+ raidPtr->regionInfo[regionID].diskMap[i + diskCount].operation = log->records[i].operation;
+ raidPtr->regionInfo[regionID].diskMap[i + diskCount].parityAddr = log->records[i].parityAddr;
+ }
+ log->diskOffset = diskCount;
+ raidPtr->regionInfo[regionID].diskCount += raidPtr->numSectorsPerLog;
+ FlushLog(raidPtr, log);
+ }
+ else
+ {
+ /* no room for log on disk, send it to disk manager and request reintegration */
+ RF_ASSERT(raidPtr->regionInfo[regionID].diskCount == raidPtr->regionInfo[regionID].capacity);
+ ReintLog(raidPtr, regionID, log);
+ }
+ if (rf_parityLogDebug)
+ printf("[finished dumping parity log to disk, region %d]\n", regionID);
+ return(0);
+}
+
+int rf_ParityLogAppend(
+ RF_ParityLogData_t *logData,
+ int finish,
+ RF_ParityLog_t **incomingLog,
+ int clearReintFlag)
+{
+ int regionID, logItem, itemDone;
+ RF_ParityLogData_t *item;
+ int punt, done = RF_FALSE;
+ RF_ParityLog_t *log;
+ RF_Raid_t *raidPtr;
+ RF_Etimer_t timer;
+ int (*wakeFunc)(RF_DagNode_t *node, int status);
+ void *wakeArg;
+
+ /* Add parity to the appropriate log, one sector at a time.
+     This routine is called by the DAG functions ParityLogUpdateFunc
+ and ParityLogOverwriteFunc and therefore MUST BE NONBLOCKING.
+
+ Parity to be logged is contained in a linked-list (logData). When
+ this routine returns, every sector in the list will be in one of
+ three places:
+ 1) entered into the parity log
+ 2) queued, waiting on reintegration
+ 3) queued, waiting on a core log
+
+ Blocked work is passed to the ParityLoggingDiskManager for completion.
+ Later, as conditions which required the block are removed, the work
+ reenters this routine with the "finish" parameter set to "RF_TRUE."
+
+ NON-BLOCKING
+ */
+
+ raidPtr = logData->common->raidPtr;
+ /* lock the region for the first item in logData */
+ RF_ASSERT(logData != NULL);
+ regionID = logData->regionID;
+ RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
+ RF_ASSERT(raidPtr->regionInfo[regionID].loggingEnabled);
+
+ if (clearReintFlag)
+ {
+ /* Enable flushing for this region. Holding both locks provides
+ a synchronization barrier with DumpParityLogToDisk
+ */
+ RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex);
+ RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ RF_ASSERT(raidPtr->regionInfo[regionID].reintInProgress == RF_TRUE);
+ raidPtr->regionInfo[regionID].diskCount = 0;
+ raidPtr->regionInfo[regionID].reintInProgress = RF_FALSE;
+ RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex); /* flushing is now enabled */
+ RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ }
+
+ /* process each item in logData */
+ while (logData)
+ {
+ /* remove an item from logData */
+ item = logData;
+ logData = logData->next;
+ item->next = NULL;
+ item->prev = NULL;
+
+ if (rf_parityLogDebug)
+ printf("[appending parity log data, region %d, raidAddress %d, numSector %d]\n",item->regionID,(int)item->diskAddress.raidAddress, (int)item->diskAddress.numSector);
+
+ /* see if we moved to a new region */
+ if (regionID != item->regionID)
+ {
+ RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
+ regionID = item->regionID;
+ RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
+ RF_ASSERT(raidPtr->regionInfo[regionID].loggingEnabled);
+ }
+
+ punt = RF_FALSE; /* Set to RF_TRUE if work is blocked. This can happen in one of two ways:
+ 1) no core log (AcquireParityLog)
+ 2) waiting on reintegration (DumpParityLogToDisk)
+ If punt is RF_TRUE, the dataItem was queued, so skip to next item.
+ */
+
+ /* process item, one sector at a time, until all sectors processed or we punt */
+ if (item->diskAddress.numSector > 0)
+ done = RF_FALSE;
+ else
+ RF_ASSERT(0);
+ while (!punt && !done)
+ {
+ /* verify that a core log exists for this region */
+ if (!raidPtr->regionInfo[regionID].coreLog)
+ {
+ /* Attempt to acquire a parity log.
+ If acquisition fails, queue remaining work in data item and move to nextItem.
+ */
+ if (incomingLog) {
+ if (*incomingLog)
+ {
+ RF_ASSERT((*incomingLog)->next == NULL);
+ raidPtr->regionInfo[regionID].coreLog = *incomingLog;
+ raidPtr->regionInfo[regionID].coreLog->regionID = regionID;
+ *incomingLog = NULL;
+ }
+ else
+ raidPtr->regionInfo[regionID].coreLog = AcquireParityLog(item, finish);
+ } else
+ raidPtr->regionInfo[regionID].coreLog = AcquireParityLog(item, finish);
+ /* Note: AcquireParityLog either returns a log or enqueues currentItem */
+ }
+ if (!raidPtr->regionInfo[regionID].coreLog)
+ punt = RF_TRUE; /* failed to find a core log */
+ else
+ {
+ RF_ASSERT(raidPtr->regionInfo[regionID].coreLog->next == NULL);
+ /* verify that the log has room for new entries */
+ /* if log is full, dump it to disk and grab a new log */
+ if (raidPtr->regionInfo[regionID].coreLog->numRecords == raidPtr->numSectorsPerLog)
+ {
+ /* log is full, dump it to disk */
+ if (DumpParityLogToDisk(finish, item))
+ punt = RF_TRUE; /* dump unsuccessful, blocked on reintegration */
+ else
+ {
+ /* dump was successful */
+ if (incomingLog) {
+ if (*incomingLog)
+ {
+ RF_ASSERT((*incomingLog)->next == NULL);
+ raidPtr->regionInfo[regionID].coreLog = *incomingLog;
+ raidPtr->regionInfo[regionID].coreLog->regionID = regionID;
+ *incomingLog = NULL;
+ }
+ else
+ raidPtr->regionInfo[regionID].coreLog = AcquireParityLog(item, finish);
+ } else
+ raidPtr->regionInfo[regionID].coreLog = AcquireParityLog(item, finish);
+ /* if a core log is not available, must queue work and return */
+ if (!raidPtr->regionInfo[regionID].coreLog)
+ punt = RF_TRUE; /* blocked on log availability */
+ }
+ }
+ }
+ /* if we didn't punt on this item, attempt to add a sector to the core log */
+ if (!punt)
+ {
+ RF_ASSERT(raidPtr->regionInfo[regionID].coreLog->next == NULL);
+ /* at this point, we have a core log with enough room for a sector */
+ /* copy a sector into the log */
+ log = raidPtr->regionInfo[regionID].coreLog;
+ RF_ASSERT(log->numRecords < raidPtr->numSectorsPerLog);
+ logItem = log->numRecords++;
+ log->records[logItem].parityAddr = item->diskAddress;
+ RF_ASSERT(log->records[logItem].parityAddr.startSector >= raidPtr->regionInfo[regionID].parityStartAddr);
+ RF_ASSERT(log->records[logItem].parityAddr.startSector < raidPtr->regionInfo[regionID].parityStartAddr + raidPtr->regionInfo[regionID].numSectorsParity);
+ log->records[logItem].parityAddr.numSector = 1;
+ log->records[logItem].operation = item->common->operation;
+ bcopy((item->common->bufPtr + (item->bufOffset++ * (1<<item->common->raidPtr->logBytesPerSector))), log->bufPtr + (logItem * (1<<item->common->raidPtr->logBytesPerSector)), (1<<item->common->raidPtr->logBytesPerSector));
+ item->diskAddress.numSector--;
+ item->diskAddress.startSector++;
+ if (item->diskAddress.numSector == 0)
+ done = RF_TRUE;
+ }
+ }
+
+ if (!punt)
+ {
+ /* Processed this item completely, decrement count of items
+ to be processed.
+ */
+ RF_ASSERT(item->diskAddress.numSector == 0);
+ RF_LOCK_MUTEX(item->common->mutex);
+ item->common->cnt--;
+ if (item->common->cnt == 0)
+ itemDone = RF_TRUE;
+ else
+ itemDone = RF_FALSE;
+ RF_UNLOCK_MUTEX(item->common->mutex);
+ if (itemDone)
+ {
+ /* Finished processing all log data for this I/O.
+ Return structs to the free list and invoke the wakeup function.
+ */
+ timer = item->common->startTime; /* grab initial value of timer */
+ RF_ETIMER_STOP(timer);
+ RF_ETIMER_EVAL(timer);
+ item->common->tracerec->plog_us += RF_ETIMER_VAL_US(timer);
+ if (rf_parityLogDebug)
+ printf("[waking process for region %d]\n", item->regionID);
+ wakeFunc = item->common->wakeFunc;
+ wakeArg = item->common->wakeArg;
+ FreeParityLogCommonData(item->common);
+ FreeParityLogData(item);
+ (wakeFunc)(wakeArg, 0);
+ }
+ else
+ FreeParityLogData(item);
+ }
+ }
+ RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
+ if (rf_parityLogDebug)
+ printf("[exiting ParityLogAppend]\n");
+ return(0);
+}
+
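The tail of rf_ParityLogAppend above relies on a count-to-zero completion idiom: each common log data structure carries a count of outstanding items, and whichever item drops the count to zero invokes the caller's wake function. The following standalone sketch shows the same idiom in plain C with POSIX mutexes; all names in it (common_state, item_done, wake) are hypothetical and not part of RAIDframe.

#include <pthread.h>
#include <stdio.h>

/* Hypothetical stand-in for the completion state in RF_CommonLogData_t. */
struct common_state {
	pthread_mutex_t mutex;               /* protects cnt, like common->mutex */
	int cnt;                             /* outstanding items for this I/O */
	void (*wake_func)(void *arg, int status);
	void *wake_arg;
};

/* Called once per finished item; the item that drops cnt to zero fires
 * the wakeup callback, mirroring the end of rf_ParityLogAppend. */
static void item_done(struct common_state *c)
{
	int last;

	pthread_mutex_lock(&c->mutex);
	c->cnt--;
	last = (c->cnt == 0);
	pthread_mutex_unlock(&c->mutex);
	if (last)
		c->wake_func(c->wake_arg, 0);    /* called after the unlock */
}

static void wake(void *arg, int status)
{
	printf("I/O %s complete, status %d\n", (char *)arg, status);
}

int main(void)
{
	struct common_state c = { PTHREAD_MUTEX_INITIALIZER, 3, wake, "demo" };

	item_done(&c);    /* two items still outstanding */
	item_done(&c);    /* one item still outstanding */
	item_done(&c);    /* last item: the callback fires here */
	return 0;
}

As in the driver code, the callback is invoked only after the per-I/O mutex has been released, so the wake function never runs with that lock held.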
+
+void rf_EnableParityLogging(RF_Raid_t *raidPtr)
+{
+ int regionID;
+
+ for (regionID = 0; regionID < rf_numParityRegions; regionID++) {
+ RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
+ raidPtr->regionInfo[regionID].loggingEnabled = RF_TRUE;
+ RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
+ }
+ if (rf_parityLogDebug)
+ printf("[parity logging enabled]\n");
+}
+
+#endif /* RF_INCLUDE_PARITYLOGGING > 0 */
diff --git a/sys/dev/raidframe/rf_paritylog.h b/sys/dev/raidframe/rf_paritylog.h
new file mode 100644
index 00000000000..fd6128174e1
--- /dev/null
+++ b/sys/dev/raidframe/rf_paritylog.h
@@ -0,0 +1,225 @@
+/* $OpenBSD: rf_paritylog.h,v 1.1 1999/01/11 14:29:34 niklas Exp $ */
+/* $NetBSD: rf_paritylog.h,v 1.1 1998/11/13 04:20:31 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: William V. Courtright II
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* header file for parity log
+ *
+ * :
+ * Log: rf_paritylog.h,v
+ * Revision 1.21 1996/07/17 21:00:58 jimz
+ * clean up timer interface, tracing
+ *
+ * Revision 1.20 1996/07/15 17:22:18 jimz
+ * nit-pick code cleanup
+ * resolve stdlib problems on DEC OSF
+ *
+ * Revision 1.19 1996/06/11 10:17:57 jimz
+ * definitions and run state for parity logging thread
+ *
+ * Revision 1.18 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.17 1996/06/05 18:06:02 jimz
+ * Major code cleanup. The Great Renaming is now done.
+ * Better modularity. Better typing. Fixed a bunch of
+ * synchronization bugs. Made a lot of global stuff
+ * per-desc or per-array. Removed dead code.
+ *
+ * Revision 1.16 1996/06/02 17:31:48 jimz
+ * Moved a lot of global stuff into array structure, where it belongs.
+ * Fixed up paritylogging, pss modules in this manner. Some general
+ * code cleanup. Removed lots of dead code, some dead files.
+ *
+ * Revision 1.15 1996/05/31 22:26:54 jimz
+ * fix a lot of mapping problems, memory allocation problems
+ * found some weird lock issues, fixed 'em
+ * more code cleanup
+ *
+ * Revision 1.14 1996/05/30 11:29:41 jimz
+ * Numerous bug fixes. Stripe lock release code disagreed with the taking code
+ * about when stripes should be locked (I made it consistent: no parity, no lock)
+ * There was a lot of extra serialization of I/Os which I've removed- a lot of
+ * it was to calculate values for the cache code, which is no longer with us.
+ * More types, function, macro cleanup. Added code to properly quiesce the array
+ * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
+ * before. Fixed memory allocation, freeing bugs.
+ *
+ * Revision 1.13 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.12 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.11 1995/12/06 20:54:58 wvcii
+ * added prototyping
+ *
+ * Revision 1.10 1995/11/30 16:05:50 wvcii
+ * added copyright info
+ *
+ * Revision 1.9 1995/10/07 05:09:27 wvcii
+ * removed #define BYTESPERSECTOR 512
+ *
+ * Revision 1.8 1995/09/06 19:27:52 wvcii
+ * added startTime to commonLogData
+ *
+ * Revision 1.7 1995/07/07 00:13:42 wvcii
+ * this version free from deadlock, fails parity verification
+ *
+ */
+
+#ifndef _RF__RF_PARITYLOG_H_
+#define _RF__RF_PARITYLOG_H_
+
+#include "rf_types.h"
+
+#define RF_DEFAULT_NUM_SECTORS_PER_LOG 64
+
+typedef int RF_RegionId_t;
+
+typedef enum RF_ParityRecordType_e {
+ RF_STOP,
+ RF_UPDATE,
+ RF_OVERWRITE
+} RF_ParityRecordType_t;
+
+struct RF_CommonLogData_s {
+ RF_DECLARE_MUTEX(mutex) /* protects cnt */
+ int cnt; /* when 0, time to call wakeFunc */
+ RF_Raid_t *raidPtr;
+/* int (*wakeFunc)(struct buf *); */
+ int (*wakeFunc)(RF_DagNode_t *node, int status);
+ void *wakeArg;
+ RF_AccTraceEntry_t *tracerec;
+ RF_Etimer_t startTime;
+ caddr_t bufPtr;
+ RF_ParityRecordType_t operation;
+ RF_CommonLogData_t *next;
+};
+
+struct RF_ParityLogData_s {
+ RF_RegionId_t regionID; /* this struct guaranteed to span a single region */
+ int bufOffset; /* offset from common->bufPtr */
+ RF_PhysDiskAddr_t diskAddress;
+ RF_CommonLogData_t *common; /* info shared by one or more parityLogData structs */
+ RF_ParityLogData_t *next;
+ RF_ParityLogData_t *prev;
+};
+
+struct RF_ParityLogAppendQueue_s {
+ RF_DECLARE_MUTEX(mutex)
+};
+
+struct RF_ParityLogRecord_s {
+ RF_PhysDiskAddr_t parityAddr;
+ RF_ParityRecordType_t operation;
+};
+
+struct RF_ParityLog_s {
+ RF_RegionId_t regionID;
+ int numRecords;
+ int diskOffset;
+ RF_ParityLogRecord_t *records;
+ caddr_t bufPtr;
+ RF_ParityLog_t *next;
+};
+
+struct RF_ParityLogQueue_s {
+ RF_DECLARE_MUTEX(mutex)
+ RF_ParityLog_t *parityLogs;
+};
+
+struct RF_RegionBufferQueue_s {
+ RF_DECLARE_MUTEX(mutex)
+ RF_DECLARE_COND(cond)
+ int bufferSize;
+ int totalBuffers; /* size of array 'buffers' */
+ int availableBuffers; /* num available 'buffers' */
+ int emptyBuffersIndex; /* stick next freed buffer here */
+ int availBuffersIndex; /* grab next buffer from here */
+ caddr_t *buffers; /* array of buffers used to hold parity */
+};
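RF_RegionBufferQueue_s describes a small circular pool: buffers are taken at availBuffersIndex, returned at emptyBuffersIndex, and both indices wrap at totalBuffers; AcquireReintBuffer and ReleaseReintBuffer in rf_paritylogDiskMgr.c below use exactly this arithmetic. Here is a minimal single-threaded sketch of the index handling, with hypothetical names and the mutex/condition variable omitted.

#include <assert.h>
#include <stddef.h>

#define POOL_SIZE 4

/* Hypothetical, simplified pool: the same ring indices as
 * RF_RegionBufferQueue_s, minus the mutex and condition variable. */
struct buf_pool {
	int	 total;          /* totalBuffers */
	int	 available;      /* availableBuffers */
	int	 avail_idx;      /* availBuffersIndex: take buffers from here */
	int	 empty_idx;      /* emptyBuffersIndex: return buffers here */
	char	*buffers[POOL_SIZE];
};

static char *pool_get(struct buf_pool *p)
{
	char *buf;

	if (p->available == 0)
		return NULL;                 /* the real code sleeps on the cond */
	buf = p->buffers[p->avail_idx];
	p->available--;
	p->avail_idx = (p->avail_idx + 1) % p->total;
	return buf;
}

static void pool_put(struct buf_pool *p, char *buf)
{
	p->buffers[p->empty_idx] = buf;
	p->empty_idx = (p->empty_idx + 1) % p->total;
	p->available++;
	assert(p->available <= p->total);
}

int main(void)
{
	static char storage[POOL_SIZE][16];
	struct buf_pool p = { POOL_SIZE, POOL_SIZE, 0, 0,
	    { storage[0], storage[1], storage[2], storage[3] } };
	char *b = pool_get(&p);              /* take one buffer... */

	pool_put(&p, b);                     /* ...and give it back */
	return 0;
}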
+
+#define RF_PLOG_CREATED (1<<0) /* thread is created */
+#define RF_PLOG_RUNNING (1<<1) /* thread is running */
+#define RF_PLOG_TERMINATE (1<<2) /* thread is terminated (should exit) */
+#define RF_PLOG_SHUTDOWN (1<<3) /* thread is aware and exiting/exited */
+
+struct RF_ParityLogDiskQueue_s {
+ RF_DECLARE_MUTEX(mutex) /* protects all vars in this struct */
+ RF_DECLARE_COND(cond)
+ int threadState; /* is thread running, should it shutdown (see above) */
+ RF_ParityLog_t *flushQueue; /* list of parity logs to be flushed to log disk */
+ RF_ParityLog_t *reintQueue; /* list of parity logs waiting to be reintegrated */
+ RF_ParityLogData_t *bufHead; /* head of FIFO list of log data, waiting on a buffer */
+ RF_ParityLogData_t *bufTail; /* tail of FIFO list of log data, waiting on a buffer */
+ RF_ParityLogData_t *reintHead; /* head of FIFO list of log data, waiting on reintegration */
+ RF_ParityLogData_t *reintTail; /* tail of FIFO list of log data, waiting on reintegration */
+ RF_ParityLogData_t *logBlockHead; /* queue of work, blocked until a log is available */
+ RF_ParityLogData_t *logBlockTail;
+ RF_ParityLogData_t *reintBlockHead; /* queue of work, blocked until reintegration is complete */
+ RF_ParityLogData_t *reintBlockTail;
+ RF_CommonLogData_t *freeCommonList; /* list of unused common data structs */
+ RF_ParityLogData_t *freeDataList; /* list of unused log data structs */
+};
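Each head/tail pointer pair in RF_ParityLogDiskQueue_s is a FIFO of RF_ParityLogData_t: new work is appended at the tail and drained from the head while the queue mutex is held. A generic sketch of that head/tail discipline, using a hypothetical node type and no locking:

#include <stddef.h>

/* Hypothetical node standing in for RF_ParityLogData_t (next/prev links). */
struct node {
	struct node *next;
	struct node *prev;
	int          payload;
};

/* Append at the tail: preserves the FIFO order of incoming work. */
static void fifo_enqueue(struct node **head, struct node **tail, struct node *n)
{
	n->next = NULL;
	n->prev = *tail;
	if (*tail)
		(*tail)->next = n;
	else
		*head = n;                   /* queue was empty */
	*tail = n;
}

/* Remove from the head: the oldest work is processed first. */
static struct node *fifo_dequeue(struct node **head, struct node **tail)
{
	struct node *n = *head;

	if (n == NULL)
		return NULL;
	*head = n->next;
	if (*head)
		(*head)->prev = NULL;
	else
		*tail = NULL;                /* queue is now empty */
	n->next = n->prev = NULL;
	return n;
}

int main(void)
{
	struct node a = { NULL, NULL, 1 }, b = { NULL, NULL, 2 };
	struct node *head = NULL, *tail = NULL;

	fifo_enqueue(&head, &tail, &a);
	fifo_enqueue(&head, &tail, &b);
	return fifo_dequeue(&head, &tail)->payload;   /* 1: FIFO order */
}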
+
+struct RF_DiskMap_s {
+ RF_PhysDiskAddr_t parityAddr;
+ RF_ParityRecordType_t operation;
+};
+
+struct RF_RegionInfo_s {
+ RF_DECLARE_MUTEX(mutex) /* protects: diskCount, diskMap, loggingEnabled, coreLog */
+ RF_DECLARE_MUTEX(reintMutex) /* protects: reintInProgress */
+ int reintInProgress; /* flag used to suspend flushing operations */
+ RF_SectorCount_t capacity; /* capacity of this region in sectors */
+ RF_SectorNum_t regionStartAddr; /* starting disk address for this region */
+ RF_SectorNum_t parityStartAddr; /* starting disk address of the parity protected by this region */
+ RF_SectorCount_t numSectorsParity; /* number of parity sectors protected by this region */
+ RF_SectorCount_t diskCount; /* num of sectors written to this region's disk log */
+ RF_DiskMap_t *diskMap; /* in-core map of what's in this region's disk log */
+ int loggingEnabled; /* logging enabled for this region */
+ RF_ParityLog_t *coreLog; /* in-core log for this region */
+};
+
+RF_ParityLogData_t *rf_CreateParityLogData(RF_ParityRecordType_t operation,
+ RF_PhysDiskAddr_t *pda, caddr_t bufPtr, RF_Raid_t *raidPtr,
+ int (*wakeFunc)(RF_DagNode_t *node, int status),
+ void *wakeArg, RF_AccTraceEntry_t *tracerec,
+ RF_Etimer_t startTime);
+RF_ParityLogData_t *rf_SearchAndDequeueParityLogData(RF_Raid_t *raidPtr,
+ RF_RegionId_t regionID, RF_ParityLogData_t **head,
+ RF_ParityLogData_t **tail, int ignoreLocks);
+void rf_ReleaseParityLogs(RF_Raid_t *raidPtr, RF_ParityLog_t *firstLog);
+int rf_ParityLogAppend(RF_ParityLogData_t *logData, int finish,
+ RF_ParityLog_t **incomingLog, int clearReintFlag);
+void rf_EnableParityLogging(RF_Raid_t *raidPtr);
+
+#endif /* !_RF__RF_PARITYLOG_H_ */
diff --git a/sys/dev/raidframe/rf_paritylogDiskMgr.c b/sys/dev/raidframe/rf_paritylogDiskMgr.c
new file mode 100644
index 00000000000..92079d5ec26
--- /dev/null
+++ b/sys/dev/raidframe/rf_paritylogDiskMgr.c
@@ -0,0 +1,790 @@
+/* $OpenBSD: rf_paritylogDiskMgr.c,v 1.1 1999/01/11 14:29:34 niklas Exp $ */
+/* $NetBSD: rf_paritylogDiskMgr.c,v 1.1 1998/11/13 04:20:31 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: William V. Courtright II
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+/* Code for flushing and reintegration operations related to parity logging.
+ *
+ * :
+ * Log: rf_paritylogDiskMgr.c,v
+ * Revision 1.25 1996/07/28 20:31:39 jimz
+ * i386netbsd port
+ * true/false fixup
+ *
+ * Revision 1.24 1996/07/27 23:36:08 jimz
+ * Solaris port of simulator
+ *
+ * Revision 1.23 1996/07/22 19:52:16 jimz
+ * switched node params to RF_DagParam_t, a union of
+ * a 64-bit int and a void *, for better portability
+ * attempted hpux port, but failed partway through for
+ * lack of a single C compiler capable of compiling all
+ * source files
+ *
+ * Revision 1.22 1996/06/11 10:17:33 jimz
+ * Put in thread startup/shutdown mechanism for proper synchronization
+ * with start and end of day routines.
+ *
+ * Revision 1.21 1996/06/09 02:36:46 jimz
+ * lots of little crufty cleanup- fixup whitespace
+ * issues, comment #ifdefs, improve typing in some
+ * places (esp size-related)
+ *
+ * Revision 1.20 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.19 1996/06/05 18:06:02 jimz
+ * Major code cleanup. The Great Renaming is now done.
+ * Better modularity. Better typing. Fixed a bunch of
+ * synchronization bugs. Made a lot of global stuff
+ * per-desc or per-array. Removed dead code.
+ *
+ * Revision 1.18 1996/06/02 17:31:48 jimz
+ * Moved a lot of global stuff into array structure, where it belongs.
+ * Fixed up paritylogging, pss modules in this manner. Some general
+ * code cleanup. Removed lots of dead code, some dead files.
+ *
+ * Revision 1.17 1996/05/31 22:26:54 jimz
+ * fix a lot of mapping problems, memory allocation problems
+ * found some weird lock issues, fixed 'em
+ * more code cleanup
+ *
+ * Revision 1.16 1996/05/30 23:22:16 jimz
+ * bugfixes of serialization, timing problems
+ * more cleanup
+ *
+ * Revision 1.15 1996/05/30 12:59:18 jimz
+ * make etimer happier, more portable
+ *
+ * Revision 1.14 1996/05/30 11:29:41 jimz
+ * Numerous bug fixes. Stripe lock release code disagreed with the taking code
+ * about when stripes should be locked (I made it consistent: no parity, no lock)
+ * There was a lot of extra serialization of I/Os which I've removed- a lot of
+ * it was to calculate values for the cache code, which is no longer with us.
+ * More types, function, macro cleanup. Added code to properly quiesce the array
+ * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
+ * before. Fixed memory allocation, freeing bugs.
+ *
+ * Revision 1.13 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.12 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.11 1996/05/24 04:28:55 jimz
+ * release cleanup ckpt
+ *
+ * Revision 1.10 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.9 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.8 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.7 1995/12/12 18:10:06 jimz
+ * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT
+ * fix 80-column brain damage in comments
+ *
+ * Revision 1.6 1995/12/06 20:58:27 wvcii
+ * added prototypes
+ *
+ * Revision 1.5 1995/11/30 16:06:05 wvcii
+ * added copyright info
+ *
+ * Revision 1.4 1995/10/09 22:41:10 wvcii
+ * minor bug fix
+ *
+ * Revision 1.3 1995/10/08 20:43:47 wvcii
+ * lots of random debugging - debugging still incomplete
+ *
+ * Revision 1.2 1995/09/07 15:52:19 jimz
+ * noop compile when INCLUDE_PARITYLOGGING not defined
+ *
+ * Revision 1.1 1995/09/06 19:24:44 wvcii
+ * Initial revision
+ *
+ */
+
+#include "rf_archs.h"
+
+#if RF_INCLUDE_PARITYLOGGING > 0
+
+#include "rf_types.h"
+#include "rf_threadstuff.h"
+#include "rf_mcpair.h"
+#include "rf_raid.h"
+#include "rf_dag.h"
+#include "rf_dagfuncs.h"
+#include "rf_desc.h"
+#include "rf_layout.h"
+#include "rf_diskqueue.h"
+#include "rf_paritylog.h"
+#include "rf_general.h"
+#include "rf_threadid.h"
+#include "rf_etimer.h"
+#include "rf_paritylogging.h"
+#include "rf_engine.h"
+#include "rf_dagutils.h"
+#include "rf_map.h"
+#include "rf_parityscan.h"
+#include "rf_sys.h"
+
+#include "rf_paritylogDiskMgr.h"
+
+static caddr_t AcquireReintBuffer(RF_RegionBufferQueue_t *);
+
+static caddr_t AcquireReintBuffer(pool)
+ RF_RegionBufferQueue_t *pool;
+{
+ caddr_t bufPtr = NULL;
+
+ /* Return a region buffer from the free list (pool).
+ If the free list is empty, WAIT.
+ BLOCKING */
+
+ RF_LOCK_MUTEX(pool->mutex);
+ if (pool->availableBuffers > 0) {
+ bufPtr = pool->buffers[pool->availBuffersIndex];
+ pool->availableBuffers--;
+ pool->availBuffersIndex++;
+ if (pool->availBuffersIndex == pool->totalBuffers)
+ pool->availBuffersIndex = 0;
+ RF_UNLOCK_MUTEX(pool->mutex);
+ }
+ else {
+ RF_PANIC(); /* should never happen in the current config (single reintegration) */
+ RF_WAIT_COND(pool->cond, pool->mutex);
+ }
+ return(bufPtr);
+}
+
+static void ReleaseReintBuffer(
+ RF_RegionBufferQueue_t *pool,
+ caddr_t bufPtr)
+{
+ /* Insert a region buffer (bufPtr) into the free list (pool).
+ NON-BLOCKING */
+
+ RF_LOCK_MUTEX(pool->mutex);
+ pool->availableBuffers++;
+ pool->buffers[pool->emptyBuffersIndex] = bufPtr;
+ pool->emptyBuffersIndex++;
+ if (pool->emptyBuffersIndex == pool->totalBuffers)
+ pool->emptyBuffersIndex = 0;
+ RF_ASSERT(pool->availableBuffers <= pool->totalBuffers);
+ RF_UNLOCK_MUTEX(pool->mutex);
+ RF_SIGNAL_COND(pool->cond);
+}
+
+
+
+static void ReadRegionLog(
+ RF_RegionId_t regionID,
+ RF_MCPair_t *rrd_mcpair,
+ caddr_t regionBuffer,
+ RF_Raid_t *raidPtr,
+ RF_DagHeader_t **rrd_dag_h,
+ RF_AllocListElem_t **rrd_alloclist,
+ RF_PhysDiskAddr_t **rrd_pda)
+{
+ /* Initiate the read of a region log from disk. Once initiated, return
+ to the calling routine.
+
+ NON-BLOCKING
+ */
+
+ RF_AccTraceEntry_t tracerec;
+ RF_DagNode_t *rrd_rdNode;
+
+ /* create DAG to read region log from disk */
+ rf_MakeAllocList(*rrd_alloclist);
+ *rrd_dag_h = rf_MakeSimpleDAG(raidPtr, 1, 0, regionBuffer, rf_DiskReadFunc, rf_DiskReadUndoFunc,
+ "Rrl", *rrd_alloclist, RF_DAG_FLAGS_NONE, RF_IO_NORMAL_PRIORITY);
+
+ /* create and initialize PDA for the core log */
+ /* RF_Malloc(*rrd_pda, sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *)); */
+ *rrd_pda = rf_AllocPDAList(1);
+ rf_MapLogParityLogging(raidPtr, regionID, 0, &((*rrd_pda)->row), &((*rrd_pda)->col), &((*rrd_pda)->startSector));
+ (*rrd_pda)->numSector = raidPtr->regionInfo[regionID].capacity;
+
+ if ((*rrd_pda)->next) {
+ (*rrd_pda)->next = NULL;
+ printf("set rrd_pda->next to NULL\n");
+ }
+
+ /* initialize DAG parameters */
+ bzero((char *)&tracerec,sizeof(tracerec));
+ (*rrd_dag_h)->tracerec = &tracerec;
+ rrd_rdNode = (*rrd_dag_h)->succedents[0]->succedents[0];
+ rrd_rdNode->params[0].p = *rrd_pda;
+/* rrd_rdNode->params[1] = regionBuffer; */
+ rrd_rdNode->params[2].v = 0;
+ rrd_rdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, 0);
+
+ /* launch region log read dag */
+ rf_DispatchDAG(*rrd_dag_h, (void (*)(void *))rf_MCPairWakeupFunc,
+ (void *) rrd_mcpair);
+}
+
+
+
+static void WriteCoreLog(
+ RF_ParityLog_t *log,
+ RF_MCPair_t *fwr_mcpair,
+ RF_Raid_t *raidPtr,
+ RF_DagHeader_t **fwr_dag_h,
+ RF_AllocListElem_t **fwr_alloclist,
+ RF_PhysDiskAddr_t **fwr_pda)
+{
+ RF_RegionId_t regionID = log->regionID;
+ RF_AccTraceEntry_t tracerec;
+ RF_SectorNum_t regionOffset;
+ RF_DagNode_t *fwr_wrNode;
+
+ /* Initiate the write of a core log to a region log disk.
+ Once initiated, return to the calling routine.
+
+ NON-BLOCKING
+ */
+
+ /* create DAG to write a core log to a region log disk */
+ rf_MakeAllocList(*fwr_alloclist);
+ *fwr_dag_h = rf_MakeSimpleDAG(raidPtr, 1, 0, log->bufPtr, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
+ "Wcl", *fwr_alloclist, RF_DAG_FLAGS_NONE, RF_IO_NORMAL_PRIORITY);
+
+ /* create and initialize PDA for the region log */
+ /* RF_Malloc(*fwr_pda, sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *)); */
+ *fwr_pda = rf_AllocPDAList(1);
+ regionOffset = log->diskOffset;
+ rf_MapLogParityLogging(raidPtr, regionID, regionOffset, &((*fwr_pda)->row), &((*fwr_pda)->col), &((*fwr_pda)->startSector));
+ (*fwr_pda)->numSector = raidPtr->numSectorsPerLog;
+
+ /* initialize DAG parameters */
+ bzero((char *)&tracerec,sizeof(tracerec));
+ (*fwr_dag_h)->tracerec = &tracerec;
+ fwr_wrNode = (*fwr_dag_h)->succedents[0]->succedents[0];
+ fwr_wrNode->params[0].p = *fwr_pda;
+/* fwr_wrNode->params[1] = log->bufPtr; */
+ fwr_wrNode->params[2].v = 0;
+ fwr_wrNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, 0);
+
+ /* launch the dag to write the core log to disk */
+ rf_DispatchDAG(*fwr_dag_h, (void (*)(void *)) rf_MCPairWakeupFunc,
+ (void *) fwr_mcpair);
+}
+
+
+static void ReadRegionParity(
+ RF_RegionId_t regionID,
+ RF_MCPair_t *prd_mcpair,
+ caddr_t parityBuffer,
+ RF_Raid_t *raidPtr,
+ RF_DagHeader_t **prd_dag_h,
+ RF_AllocListElem_t **prd_alloclist,
+ RF_PhysDiskAddr_t **prd_pda)
+{
+ /* Initiate the read of region parity from disk.
+ Once initiated, return to the calling routine.
+
+ NON-BLOCKING
+ */
+
+ RF_AccTraceEntry_t tracerec;
+ RF_DagNode_t *prd_rdNode;
+
+ /* create DAG to read region parity from disk */
+ rf_MakeAllocList(*prd_alloclist);
+ *prd_dag_h = rf_MakeSimpleDAG(raidPtr, 1, 0, NULL, rf_DiskReadFunc, rf_DiskReadUndoFunc,
+ "Rrp", *prd_alloclist, RF_DAG_FLAGS_NONE, RF_IO_NORMAL_PRIORITY);
+
+ /* create and initialize PDA for region parity */
+ /* RF_Malloc(*prd_pda, sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *)); */
+ *prd_pda = rf_AllocPDAList(1);
+ rf_MapRegionParity(raidPtr, regionID, &((*prd_pda)->row), &((*prd_pda)->col), &((*prd_pda)->startSector), &((*prd_pda)->numSector));
+ if (rf_parityLogDebug)
+ printf("[reading %d sectors of parity from region %d]\n",
+ (int)(*prd_pda)->numSector, regionID);
+ if ((*prd_pda)->next) {
+ (*prd_pda)->next = NULL;
+ printf("set prd_pda->next to NULL\n");
+ }
+
+ /* initialize DAG parameters */
+ bzero((char *)&tracerec,sizeof(tracerec));
+ (*prd_dag_h)->tracerec = &tracerec;
+ prd_rdNode = (*prd_dag_h)->succedents[0]->succedents[0];
+ prd_rdNode->params[0].p = *prd_pda;
+ prd_rdNode->params[1].p = parityBuffer;
+ prd_rdNode->params[2].v = 0;
+ prd_rdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, 0);
+ if (rf_validateDAGDebug)
+ rf_ValidateDAG(*prd_dag_h);
+ /* launch region parity read dag */
+ rf_DispatchDAG(*prd_dag_h, (void (*)(void *)) rf_MCPairWakeupFunc,
+ (void *) prd_mcpair);
+}
+
+static void WriteRegionParity(
+ RF_RegionId_t regionID,
+ RF_MCPair_t *pwr_mcpair,
+ caddr_t parityBuffer,
+ RF_Raid_t *raidPtr,
+ RF_DagHeader_t **pwr_dag_h,
+ RF_AllocListElem_t **pwr_alloclist,
+ RF_PhysDiskAddr_t **pwr_pda)
+{
+ /* Initiate the write of region parity to disk.
+ Once initiated, return to the calling routine.
+
+ NON-BLOCKING
+ */
+
+ RF_AccTraceEntry_t tracerec;
+ RF_DagNode_t *pwr_wrNode;
+
+ /* create DAG to write region parity to disk */
+ rf_MakeAllocList(*pwr_alloclist);
+ *pwr_dag_h = rf_MakeSimpleDAG(raidPtr, 1, 0, parityBuffer, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
+ "Wrp", *pwr_alloclist, RF_DAG_FLAGS_NONE, RF_IO_NORMAL_PRIORITY);
+
+ /* create and initialize PDA for region parity */
+ /* RF_Malloc(*pwr_pda, sizeof(RF_PhysDiskAddr_t), (RF_PhysDiskAddr_t *)); */
+ *pwr_pda = rf_AllocPDAList(1);
+ rf_MapRegionParity(raidPtr, regionID, &((*pwr_pda)->row), &((*pwr_pda)->col), &((*pwr_pda)->startSector), &((*pwr_pda)->numSector));
+
+ /* initialize DAG parameters */
+ bzero((char *)&tracerec,sizeof(tracerec));
+ (*pwr_dag_h)->tracerec = &tracerec;
+ pwr_wrNode = (*pwr_dag_h)->succedents[0]->succedents[0];
+ pwr_wrNode->params[0].p = *pwr_pda;
+/* pwr_wrNode->params[1] = parityBuffer; */
+ pwr_wrNode->params[2].v = 0;
+ pwr_wrNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, 0);
+
+ /* launch the dag to write region parity to disk */
+ rf_DispatchDAG(*pwr_dag_h, (void (*)(void *))rf_MCPairWakeupFunc,
+ (void *) pwr_mcpair);
+}
+
+static void FlushLogsToDisk(
+ RF_Raid_t *raidPtr,
+ RF_ParityLog_t *logList)
+{
+ /* Flush a linked list of core logs to the log disk.
+ Logs contain the disk location where they should be
+ written. Logs were written in FIFO order and that
+ order must be preserved.
+
+ Recommended optimizations:
+ 1) allow multiple flushes to occur simultaneously
+ 2) coalesce contiguous flush operations
+
+ BLOCKING
+ */
+
+ RF_ParityLog_t *log;
+ RF_RegionId_t regionID;
+ RF_MCPair_t *fwr_mcpair;
+ RF_DagHeader_t *fwr_dag_h;
+ RF_AllocListElem_t *fwr_alloclist;
+ RF_PhysDiskAddr_t *fwr_pda;
+
+ fwr_mcpair = rf_AllocMCPair();
+ RF_LOCK_MUTEX(fwr_mcpair->mutex);
+
+ RF_ASSERT(logList);
+ log = logList;
+ while (log)
+ {
+ regionID = log->regionID;
+
+ /* create and launch a DAG to write the core log */
+ if (rf_parityLogDebug)
+ printf("[initiating write of core log for region %d]\n", regionID);
+ fwr_mcpair->flag = RF_FALSE;
+ WriteCoreLog(log, fwr_mcpair, raidPtr, &fwr_dag_h, &fwr_alloclist, &fwr_pda);
+
+ /* wait for the DAG to complete */
+#ifndef SIMULATE
+ while (!fwr_mcpair->flag)
+ RF_WAIT_COND(fwr_mcpair->cond, fwr_mcpair->mutex);
+#endif /* !SIMULATE */
+ if (fwr_dag_h->status != rf_enable)
+ {
+ RF_ERRORMSG1("Unable to write core log to disk (region %d)\n", regionID);
+ RF_ASSERT(0);
+ }
+
+ /* RF_Free(fwr_pda, sizeof(RF_PhysDiskAddr_t)); */
+ rf_FreePhysDiskAddr(fwr_pda);
+ rf_FreeDAG(fwr_dag_h);
+ rf_FreeAllocList(fwr_alloclist);
+
+ log = log->next;
+ }
+ RF_UNLOCK_MUTEX(fwr_mcpair->mutex);
+ rf_FreeMCPair(fwr_mcpair);
+ rf_ReleaseParityLogs(raidPtr, logList);
+}
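FlushLogsToDisk above shows the dispatch-and-wait pattern used throughout this file: an MCPair's flag is cleared, a DAG is launched with rf_MCPairWakeupFunc as its completion callback, and the caller sleeps on the pair's condition variable until the flag is set. Below is a standalone sketch of the same pattern; it assumes POSIX threads (RAIDframe's RF_WAIT_COND and related macros wrap the platform primitives), and every name in it is hypothetical.

#include <pthread.h>

/* Hypothetical stand-in for RF_MCPair_t: a mutex/cond pair plus a flag. */
struct mcpair {
	pthread_mutex_t mutex;
	pthread_cond_t  cond;
	int             flag;        /* set when the async operation completes */
};

/* Completion side, in the spirit of rf_MCPairWakeupFunc: set the flag
 * under the mutex and wake the waiter. */
static void mcpair_wakeup(struct mcpair *mc)
{
	pthread_mutex_lock(&mc->mutex);
	mc->flag = 1;
	pthread_cond_signal(&mc->cond);
	pthread_mutex_unlock(&mc->mutex);
}

/* Caller side, as in FlushLogsToDisk: clear the flag, start the
 * asynchronous work, then sleep until the completion callback runs. */
static void dispatch_and_wait(struct mcpair *mc,
    void (*start_fn)(struct mcpair *))
{
	pthread_mutex_lock(&mc->mutex);
	mc->flag = 0;
	start_fn(mc);                /* must eventually call mcpair_wakeup(mc) */
	while (!mc->flag)
		pthread_cond_wait(&mc->cond, &mc->mutex);
	pthread_mutex_unlock(&mc->mutex);
}

/* A fake asynchronous operation: complete immediately from another thread. */
static void *async_op(void *arg)
{
	mcpair_wakeup(arg);
	return NULL;
}

static void start_async(struct mcpair *mc)
{
	pthread_t t;

	pthread_create(&t, NULL, async_op, mc);
	pthread_detach(t);
}

int main(void)
{
	struct mcpair mc = { PTHREAD_MUTEX_INITIALIZER,
	    PTHREAD_COND_INITIALIZER, 0 };

	dispatch_and_wait(&mc, start_async);
	return 0;
}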
+
+static void ReintegrateRegion(
+ RF_Raid_t *raidPtr,
+ RF_RegionId_t regionID,
+ RF_ParityLog_t *coreLog)
+{
+ RF_MCPair_t *rrd_mcpair=NULL, *prd_mcpair, *pwr_mcpair;
+ RF_DagHeader_t *rrd_dag_h, *prd_dag_h, *pwr_dag_h;
+ RF_AllocListElem_t *rrd_alloclist, *prd_alloclist, *pwr_alloclist;
+ RF_PhysDiskAddr_t *rrd_pda, *prd_pda, *pwr_pda;
+ caddr_t parityBuffer, regionBuffer=NULL;
+
+ /* Reintegrate a region (regionID).
+ 1. acquire region and parity buffers
+ 2. read log from disk
+ 3. read parity from disk
+ 4. apply log to parity
+ 5. apply core log to parity
+ 6. write new parity to disk
+
+ BLOCKING
+ */
+
+ if (rf_parityLogDebug)
+ printf("[reintegrating region %d]\n", regionID);
+
+ /* initiate read of region parity */
+ if (rf_parityLogDebug)
+ printf("[initiating read of parity for region %d]\n", regionID);
+ parityBuffer = AcquireReintBuffer(&raidPtr->parityBufferPool);
+ prd_mcpair = rf_AllocMCPair();
+ RF_LOCK_MUTEX(prd_mcpair->mutex);
+ prd_mcpair->flag = RF_FALSE;
+ ReadRegionParity(regionID, prd_mcpair, parityBuffer, raidPtr, &prd_dag_h, &prd_alloclist, &prd_pda);
+
+ /* if region log nonempty, initiate read */
+ if (raidPtr->regionInfo[regionID].diskCount > 0)
+ {
+ if (rf_parityLogDebug)
+ printf("[initiating read of disk log for region %d]\n", regionID);
+ regionBuffer = AcquireReintBuffer(&raidPtr->regionBufferPool);
+ rrd_mcpair = rf_AllocMCPair();
+ RF_LOCK_MUTEX(rrd_mcpair->mutex);
+ rrd_mcpair->flag = RF_FALSE;
+ ReadRegionLog(regionID, rrd_mcpair, regionBuffer, raidPtr, &rrd_dag_h, &rrd_alloclist, &rrd_pda);
+ }
+
+ /* wait on read of region parity to complete */
+#ifndef SIMULATE
+ while (!prd_mcpair->flag) {
+ RF_WAIT_COND(prd_mcpair->cond, prd_mcpair->mutex);
+ }
+#endif /* !SIMULATE */
+ RF_UNLOCK_MUTEX(prd_mcpair->mutex);
+ if (prd_dag_h->status != rf_enable)
+ {
+ RF_ERRORMSG("Unable to read parity from disk\n");
+ /* add code to fail the parity disk */
+ RF_ASSERT(0);
+ }
+
+ /* apply core log to parity */
+ /* if (coreLog)
+ ApplyLogsToParity(coreLog, parityBuffer); */
+
+ if (raidPtr->regionInfo[regionID].diskCount > 0)
+ {
+ /* wait on read of region log to complete */
+#ifndef SIMULATE
+ while (!rrd_mcpair->flag)
+ RF_WAIT_COND(rrd_mcpair->cond, rrd_mcpair->mutex);
+#endif /* !SIMULATE */
+ RF_UNLOCK_MUTEX(rrd_mcpair->mutex);
+ if (rrd_dag_h->status != rf_enable)
+ {
+ RF_ERRORMSG("Unable to read region log from disk\n");
+ /* add code to fail the log disk */
+ RF_ASSERT(0);
+ }
+ /* apply region log to parity */
+ /* ApplyRegionToParity(regionID, regionBuffer, parityBuffer); */
+ /* release resources associated with region log */
+ /* RF_Free(rrd_pda, sizeof(RF_PhysDiskAddr_t)); */
+ rf_FreePhysDiskAddr(rrd_pda);
+ rf_FreeDAG(rrd_dag_h);
+ rf_FreeAllocList(rrd_alloclist);
+ rf_FreeMCPair(rrd_mcpair);
+ ReleaseReintBuffer(&raidPtr->regionBufferPool, regionBuffer);
+ }
+
+ /* write reintegrated parity to disk */
+ if (rf_parityLogDebug)
+ printf("[initiating write of parity for region %d]\n", regionID);
+ pwr_mcpair = rf_AllocMCPair();
+ RF_LOCK_MUTEX(pwr_mcpair->mutex);
+ pwr_mcpair->flag = RF_FALSE;
+ WriteRegionParity(regionID, pwr_mcpair, parityBuffer, raidPtr, &pwr_dag_h, &pwr_alloclist, &pwr_pda);
+#ifndef SIMULATE
+ while (!pwr_mcpair->flag)
+ RF_WAIT_COND(pwr_mcpair->cond, pwr_mcpair->mutex);
+#endif /* !SIMULATE */
+ RF_UNLOCK_MUTEX(pwr_mcpair->mutex);
+ if (pwr_dag_h->status != rf_enable)
+ {
+ RF_ERRORMSG("Unable to write parity to disk\n");
+ /* add code to fail the parity disk */
+ RF_ASSERT(0);
+ }
+
+ /* release resources associated with read of old parity */
+ /* RF_Free(prd_pda, sizeof(RF_PhysDiskAddr_t)); */
+ rf_FreePhysDiskAddr(prd_pda);
+ rf_FreeDAG(prd_dag_h);
+ rf_FreeAllocList(prd_alloclist);
+ rf_FreeMCPair(prd_mcpair);
+
+ /* release resources associated with write of new parity */
+ ReleaseReintBuffer(&raidPtr->parityBufferPool, parityBuffer);
+ /* RF_Free(pwr_pda, sizeof(RF_PhysDiskAddr_t)); */
+ rf_FreePhysDiskAddr(pwr_pda);
+ rf_FreeDAG(pwr_dag_h);
+ rf_FreeAllocList(pwr_alloclist);
+ rf_FreeMCPair(pwr_mcpair);
+
+ if (rf_parityLogDebug)
+ printf("[finished reintegrating region %d]\n", regionID);
+}
+
+
+
+static void ReintegrateLogs(
+ RF_Raid_t *raidPtr,
+ RF_ParityLog_t *logList)
+{
+ RF_ParityLog_t *log, *freeLogList = NULL;
+ RF_ParityLogData_t *logData, *logDataList;
+ RF_RegionId_t regionID;
+
+ RF_ASSERT(logList);
+ while (logList)
+ {
+ log = logList;
+ logList = logList->next;
+ log->next = NULL;
+ regionID = log->regionID;
+ ReintegrateRegion(raidPtr, regionID, log);
+ log->numRecords = 0;
+
+ /* remove all items which are blocked on reintegration of this region */
+ RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ logData = rf_SearchAndDequeueParityLogData(raidPtr, regionID, &raidPtr->parityLogDiskQueue.reintBlockHead, &raidPtr->parityLogDiskQueue.reintBlockTail, RF_TRUE);
+ logDataList = logData;
+ while (logData)
+ {
+ logData->next = rf_SearchAndDequeueParityLogData(raidPtr, regionID, &raidPtr->parityLogDiskQueue.reintBlockHead, &raidPtr->parityLogDiskQueue.reintBlockTail, RF_TRUE);
+ logData = logData->next;
+ }
+ RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+
+ /* process blocked log data and clear reintInProgress flag for this region */
+ if (logDataList)
+ rf_ParityLogAppend(logDataList, RF_TRUE, &log, RF_TRUE);
+ else
+ {
+ /* Enable flushing for this region. Holding both locks provides
+ a synchronization barrier with DumpParityLogToDisk
+ */
+ RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
+ RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex);
+ RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ raidPtr->regionInfo[regionID].diskCount = 0;
+ raidPtr->regionInfo[regionID].reintInProgress = RF_FALSE;
+ RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
+ RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex); /* flushing is now enabled */
+ RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ }
+ /* if log wasn't used, attach it to the list of logs to be returned */
+ if (log)
+ {
+ log->next = freeLogList;
+ freeLogList = log;
+ }
+ }
+ if (freeLogList)
+ rf_ReleaseParityLogs(raidPtr, freeLogList);
+}
+
+int rf_ShutdownLogging(RF_Raid_t *raidPtr)
+{
+ /* shutdown parity logging
+ 1) disable parity logging in all regions
+ 2) reintegrate all regions
+ */
+
+ RF_SectorCount_t diskCount;
+ RF_RegionId_t regionID;
+ RF_ParityLog_t *log;
+
+ if (rf_parityLogDebug)
+ printf("[shutting down parity logging]\n");
+ /* Since parity log maps are volatile, we must reintegrate all regions. */
+ if (rf_forceParityLogReint) {
+ for (regionID = 0; regionID < rf_numParityRegions; regionID++)
+ {
+ RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
+ raidPtr->regionInfo[regionID].loggingEnabled = RF_FALSE;
+ log = raidPtr->regionInfo[regionID].coreLog;
+ raidPtr->regionInfo[regionID].coreLog = NULL;
+ diskCount = raidPtr->regionInfo[regionID].diskCount;
+ RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
+ if (diskCount > 0 || log != NULL)
+ ReintegrateRegion(raidPtr, regionID, log);
+ if (log != NULL)
+ rf_ReleaseParityLogs(raidPtr, log);
+ }
+ }
+ if (rf_parityLogDebug)
+ {
+ printf("[parity logging disabled]\n");
+ printf("[should be done!]\n");
+ }
+ return(0);
+}
+
+int rf_ParityLoggingDiskManager(RF_Raid_t *raidPtr)
+{
+ RF_ParityLog_t *reintQueue, *flushQueue;
+ int workNeeded, done = RF_FALSE;
+
+ rf_assign_threadid(); /* don't remove this line */
+
+ /* Main program for parity logging disk thread. This routine waits
+ for work to appear in either the flush or reintegration queues
+ and is responsible for flushing core logs to the log disk as
+ well as reintegrating parity regions.
+
+ BLOCKING
+ */
+
+ RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+
+ /*
+ * Inform our creator that we're running. Don't bother with a
+ * separate mutex lock/unlock here; we took the lock above and
+ * will release it below, where there is nothing to do yet.
+ */
+ raidPtr->parityLogDiskQueue.threadState |= RF_PLOG_RUNNING;
+ RF_SIGNAL_COND(raidPtr->parityLogDiskQueue.cond);
+
+ /* empty the work queues */
+ flushQueue = raidPtr->parityLogDiskQueue.flushQueue; raidPtr->parityLogDiskQueue.flushQueue = NULL;
+ reintQueue = raidPtr->parityLogDiskQueue.reintQueue; raidPtr->parityLogDiskQueue.reintQueue = NULL;
+ workNeeded = (flushQueue || reintQueue);
+
+ while (!done)
+ {
+ while (workNeeded)
+ {
+ /* First, flush all logs in the flush queue, freeing their buffers.
+ Second, reintegrate all regions which are reported as full.
+ Third, append queued log data until blocked.
+
+ Note: Incoming appends (ParityLogAppend) can block on either
+ 1. empty buffer pool
+ 2. region under reintegration
+ To preserve a global FIFO ordering of appends, buffers are not
+ released to the world until those appends blocked on buffers are
+ removed from the append queue. Similarly, regions which are
+ reintegrated are not opened for general use until the append
+ queue has been emptied.
+ */
+
+ RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+
+ /* empty flushQueue, using freed log buffers to process bufTail */
+ if (flushQueue)
+ FlushLogsToDisk(raidPtr, flushQueue);
+
+ /* empty reintQueue, flushing from reintTail as we go */
+ if (reintQueue)
+ ReintegrateLogs(raidPtr, reintQueue);
+
+ RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ flushQueue = raidPtr->parityLogDiskQueue.flushQueue; raidPtr->parityLogDiskQueue.flushQueue = NULL;
+ reintQueue = raidPtr->parityLogDiskQueue.reintQueue; raidPtr->parityLogDiskQueue.reintQueue = NULL;
+ workNeeded = (flushQueue || reintQueue);
+ }
+ /* no work is needed at this point */
+ if (raidPtr->parityLogDiskQueue.threadState&RF_PLOG_TERMINATE)
+ {
+ /* shutdown parity logging
+ 1. disable parity logging in all regions
+ 2. reintegrate all regions
+ */
+ done = RF_TRUE; /* thread disabled, no work needed */
+ RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ rf_ShutdownLogging(raidPtr);
+ }
+ if (!done)
+ {
+ /* thread enabled, no work needed, so sleep */
+ if (rf_parityLogDebug)
+ printf("[parity logging disk manager sleeping]\n");
+ RF_WAIT_COND(raidPtr->parityLogDiskQueue.cond, raidPtr->parityLogDiskQueue.mutex);
+ if (rf_parityLogDebug)
+ printf("[parity logging disk manager just woke up]\n");
+ flushQueue = raidPtr->parityLogDiskQueue.flushQueue; raidPtr->parityLogDiskQueue.flushQueue = NULL;
+ reintQueue = raidPtr->parityLogDiskQueue.reintQueue; raidPtr->parityLogDiskQueue.reintQueue = NULL;
+ workNeeded = (flushQueue || reintQueue);
+ }
+ }
+ /*
+ * Announce that we're done.
+ */
+ RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ raidPtr->parityLogDiskQueue.threadState |= RF_PLOG_SHUTDOWN;
+ RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ RF_SIGNAL_COND(raidPtr->parityLogDiskQueue.cond);
+#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL)
+ /*
+ * In the Net- and OpenBSD kernels, the thread must exit; returning would
+ * cause the proc trampoline to attempt to return to userspace.
+ */
+ kthread_exit(0); /* does not return */
+#else
+ return(0);
+#endif
+}
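rf_ParityLoggingDiskManager follows a common worker-thread shape: take the queue lock, steal any queued work, drop the lock while performing it, and sleep on the condition variable when both queues are empty, until a terminate flag is set. Here is a condensed pthread-based skeleton of that loop, with hypothetical names standing in for the RAIDframe types and routines.

#include <pthread.h>
#include <stddef.h>

struct work;                             /* opaque queued item */

struct workq {
	pthread_mutex_t mutex;
	pthread_cond_t  cond;
	struct work    *flush_q;         /* like flushQueue */
	struct work    *reint_q;         /* like reintQueue */
	int             terminate;       /* like RF_PLOG_TERMINATE */
};

/* Stand-ins for FlushLogsToDisk and ReintegrateLogs. */
static void do_flush(struct work *w) { (void)w; }
static void do_reint(struct work *w) { (void)w; }

static void *worker(void *arg)
{
	struct workq *q = arg;
	struct work *flush, *reint;

	pthread_mutex_lock(&q->mutex);
	for (;;) {
		/* steal both queues, then work on them with the lock dropped */
		flush = q->flush_q;  q->flush_q = NULL;
		reint = q->reint_q;  q->reint_q = NULL;
		if (flush || reint) {
			pthread_mutex_unlock(&q->mutex);
			if (flush)
				do_flush(flush);
			if (reint)
				do_reint(reint);
			pthread_mutex_lock(&q->mutex);
			continue;        /* re-check for newly queued work */
		}
		if (q->terminate)
			break;
		/* idle: sleep until new work or termination is signalled */
		pthread_cond_wait(&q->cond, &q->mutex);
	}
	pthread_mutex_unlock(&q->mutex);
	return NULL;
}

int main(void)
{
	struct workq q = { PTHREAD_MUTEX_INITIALIZER,
	    PTHREAD_COND_INITIALIZER, NULL, NULL, 0 };
	pthread_t t;

	pthread_create(&t, NULL, worker, &q);
	pthread_mutex_lock(&q.mutex);
	q.terminate = 1;                 /* like setting RF_PLOG_TERMINATE */
	pthread_cond_signal(&q.cond);
	pthread_mutex_unlock(&q.mutex);
	pthread_join(t, NULL);
	return 0;
}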
+
+#endif /* RF_INCLUDE_PARITYLOGGING > 0 */
diff --git a/sys/dev/raidframe/rf_paritylogDiskMgr.h b/sys/dev/raidframe/rf_paritylogDiskMgr.h
new file mode 100644
index 00000000000..c20558d9897
--- /dev/null
+++ b/sys/dev/raidframe/rf_paritylogDiskMgr.h
@@ -0,0 +1,63 @@
+/* $OpenBSD: rf_paritylogDiskMgr.h,v 1.1 1999/01/11 14:29:35 niklas Exp $ */
+/* $NetBSD: rf_paritylogDiskMgr.h,v 1.1 1998/11/13 04:20:32 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: William V. Courtright II
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* header file for parity log disk mgr code
+ *
+ * :
+ * Log: rf_paritylogDiskMgr.h,v
+ * Revision 1.5 1996/06/02 17:31:48 jimz
+ * Moved a lot of global stuff into array structure, where it belongs.
+ * Fixed up paritylogging, pss modules in this manner. Some general
+ * code cleanup. Removed lots of dead code, some dead files.
+ *
+ * Revision 1.4 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.3 1995/12/06 20:56:39 wvcii
+ * added prototypes
+ *
+ * Revision 1.2 1995/11/30 16:06:21 wvcii
+ * added copyright info
+ *
+ * Revision 1.1 1995/09/06 19:25:29 wvcii
+ * Initial revision
+ *
+ *
+ */
+
+#ifndef _RF__RF_PARITYLOGDISKMGR_H_
+#define _RF__RF_PARITYLOGDISKMGR_H_
+
+#include "rf_types.h"
+
+int rf_ShutdownLogging(RF_Raid_t *raidPtr);
+int rf_ParityLoggingDiskManager(RF_Raid_t *raidPtr);
+
+#endif /* !_RF__RF_PARITYLOGDISKMGR_H_ */
diff --git a/sys/dev/raidframe/rf_paritylogging.c b/sys/dev/raidframe/rf_paritylogging.c
new file mode 100644
index 00000000000..595612b3718
--- /dev/null
+++ b/sys/dev/raidframe/rf_paritylogging.c
@@ -0,0 +1,1088 @@
+/* $OpenBSD: rf_paritylogging.c,v 1.1 1999/01/11 14:29:35 niklas Exp $ */
+/* $NetBSD: rf_paritylogging.c,v 1.1 1998/11/13 04:20:32 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: William V. Courtright II
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* :
+ * Log: rf_paritylogging.c,v
+ * Revision 1.42 1996/11/05 21:10:40 jimz
+ * failed pda generalization
+ *
+ * Revision 1.41 1996/07/31 16:56:18 jimz
+ * dataBytesPerStripe, sectorsPerDisk init arch-indep.
+ *
+ * Revision 1.40 1996/07/28 20:31:39 jimz
+ * i386netbsd port
+ * true/false fixup
+ *
+ * Revision 1.39 1996/07/18 22:57:14 jimz
+ * port simulator to AIX
+ *
+ * Revision 1.38 1996/07/13 00:00:59 jimz
+ * sanitized generalized reconstruction architecture
+ * cleaned up head sep, rbuf problems
+ *
+ * Revision 1.37 1996/06/17 03:24:14 jimz
+ * switch to new shutdown function typing
+ *
+ * Revision 1.36 1996/06/14 23:15:38 jimz
+ * attempt to deal with thread GC problem
+ *
+ * Revision 1.35 1996/06/11 13:48:30 jimz
+ * get it to compile in-kernel
+ *
+ * Revision 1.34 1996/06/11 10:16:35 jimz
+ * Check return values on array configuration- back out if failed.
+ * Reorder shutdown to avoid using deallocated resources.
+ * Get rid of bogus join op in shutdown.
+ *
+ * Revision 1.33 1996/06/10 18:29:17 wvcii
+ * fixed bug in rf_IdentifyStripeParityLogging
+ * - added array initialization
+ *
+ * Revision 1.32 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.31 1996/06/07 22:26:27 jimz
+ * type-ify which_ru (RF_ReconUnitNum_t)
+ *
+ * Revision 1.30 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.29 1996/06/05 18:06:02 jimz
+ * Major code cleanup. The Great Renaming is now done.
+ * Better modularity. Better typing. Fixed a bunch of
+ * synchronization bugs. Made a lot of global stuff
+ * per-desc or per-array. Removed dead code.
+ *
+ * Revision 1.28 1996/06/03 23:28:26 jimz
+ * more bugfixes
+ * check in tree to sync for IPDS runs with current bugfixes
+ * there still may be a problem with threads in the script test
+ * getting I/Os stuck- not trivially reproducible (runs ~50 times
+ * in a row without getting stuck)
+ *
+ * Revision 1.27 1996/06/02 17:31:48 jimz
+ * Moved a lot of global stuff into array structure, where it belongs.
+ * Fixed up paritylogging, pss modules in this manner. Some general
+ * code cleanup. Removed lots of dead code, some dead files.
+ *
+ * Revision 1.26 1996/05/31 22:26:54 jimz
+ * fix a lot of mapping problems, memory allocation problems
+ * found some weird lock issues, fixed 'em
+ * more code cleanup
+ *
+ * Revision 1.25 1996/05/30 23:22:16 jimz
+ * bugfixes of serialization, timing problems
+ * more cleanup
+ *
+ * Revision 1.24 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.23 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.22 1996/05/24 01:59:45 jimz
+ * another checkpoint in code cleanup for release
+ * time to sync kernel tree
+ *
+ * Revision 1.21 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.20 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.19 1996/05/20 16:16:30 jimz
+ * switch to rf_{mutex,cond}_{init,destroy}
+ *
+ * Revision 1.18 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.17 1996/05/03 19:47:11 wvcii
+ * added includes of new dag library
+ *
+ * Revision 1.16 1995/12/12 18:10:06 jimz
+ * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT
+ * fix 80-column brain damage in comments
+ *
+ * Revision 1.15 1995/12/06 20:57:43 wvcii
+ * added prototypes
+ * reintegration of logs on shutdown now conditional on forceParityLogReint
+ *
+ * Revision 1.14 1995/11/30 16:06:42 wvcii
+ * added copyright info
+ *
+ * Revision 1.13 1995/11/17 19:01:29 wvcii
+ * added prototyping to MapParity
+ *
+ * Revision 1.12 1995/11/07 15:36:03 wvcii
+ * changed ParityLoggingDagSelect prototype
+ * function no longer returns numHdrSucc, numTermAnt
+ *
+ * Revision 1.11 1995/10/08 20:42:54 wvcii
+ * lots of random debugging - debugging incomplete
+ *
+ * Revision 1.10 1995/09/07 01:26:55 jimz
+ * Achive basic compilation in kernel. Kernel functionality
+ * is not guaranteed at all, but it'll compile. Mostly. I hope.
+ *
+ * Revision 1.9 1995/09/06 19:21:17 wvcii
+ * explicit shutdown (forced reintegration) for simulator version
+ *
+ * Revision 1.8 1995/07/08 18:19:16 rachad
+ * Parity verifies can not be done in the simulator.
+ *
+ * Revision 1.7 1995/07/07 00:17:20 wvcii
+ * this version free from deadlock, fails parity verification
+ *
+ * Revision 1.6 1995/06/23 13:39:59 robby
+ * updeated to prototypes in rf_layout.h
+ *
+ * Revision 1.5 1995/06/09 13:14:56 wvcii
+ * code is now nonblocking
+ *
+ * Revision 1.4 95/06/01 17:02:23 wvcii
+ * code debug
+ *
+ * Revision 1.3 95/05/31 13:08:57 wvcii
+ * code debug
+ *
+ * Revision 1.2 95/05/21 15:35:00 wvcii
+ * code debug
+ *
+ *
+ *
+ */
+
+/*
+ parity logging configuration, dag selection, and mapping is implemented here
+ */
+
+#include "rf_archs.h"
+
+#if RF_INCLUDE_PARITYLOGGING > 0
+
+#include "rf_types.h"
+#include "rf_raid.h"
+#include "rf_dag.h"
+#include "rf_dagutils.h"
+#include "rf_dagfuncs.h"
+#include "rf_dagffrd.h"
+#include "rf_dagffwr.h"
+#include "rf_dagdegrd.h"
+#include "rf_dagdegwr.h"
+#include "rf_threadid.h"
+#include "rf_paritylog.h"
+#include "rf_paritylogDiskMgr.h"
+#include "rf_paritylogging.h"
+#include "rf_parityloggingdags.h"
+#include "rf_general.h"
+#include "rf_map.h"
+#include "rf_utils.h"
+#include "rf_shutdown.h"
+
+typedef struct RF_ParityLoggingConfigInfo_s {
+ RF_RowCol_t **stripeIdentifier; /* filled in at config time & used by IdentifyStripe */
+} RF_ParityLoggingConfigInfo_t;
+
+static void FreeRegionInfo(RF_Raid_t *raidPtr, RF_RegionId_t regionID);
+static void rf_ShutdownParityLogging(RF_ThreadArg_t arg);
+static void rf_ShutdownParityLoggingRegionInfo(RF_ThreadArg_t arg);
+static void rf_ShutdownParityLoggingPool(RF_ThreadArg_t arg);
+static void rf_ShutdownParityLoggingRegionBufferPool(RF_ThreadArg_t arg);
+static void rf_ShutdownParityLoggingParityBufferPool(RF_ThreadArg_t arg);
+static void rf_ShutdownParityLoggingDiskQueue(RF_ThreadArg_t arg);
+
+int rf_ConfigureParityLogging(
+ RF_ShutdownList_t **listp,
+ RF_Raid_t *raidPtr,
+ RF_Config_t *cfgPtr)
+{
+ int i, j, startdisk, rc;
+ RF_SectorCount_t totalLogCapacity, fragmentation, lastRegionCapacity;
+ RF_SectorCount_t parityBufferCapacity, maxRegionParityRange;
+ RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
+ RF_ParityLoggingConfigInfo_t *info;
+ RF_ParityLog_t *l=NULL, *next;
+ caddr_t lHeapPtr;
+
+ /*
+ * We create multiple entries on the shutdown list here, since
+ * this configuration routine is fairly complicated in and of
+ * itself, and this makes backing out of a failed configuration
+ * much simpler.
+ */
+
+ raidPtr->numSectorsPerLog = RF_DEFAULT_NUM_SECTORS_PER_LOG;
+
+ /* create a parity logging configuration structure */
+ RF_MallocAndAdd(info, sizeof(RF_ParityLoggingConfigInfo_t), (RF_ParityLoggingConfigInfo_t *), raidPtr->cleanupList);
+ if (info == NULL)
+ return(ENOMEM);
+ layoutPtr->layoutSpecificInfo = (void *) info;
+
+ RF_ASSERT(raidPtr->numRow == 1);
+
+ /* the stripe identifier must identify the disks in each stripe,
+ * IN THE ORDER THAT THEY APPEAR IN THE STRIPE.
+ */
+ info->stripeIdentifier = rf_make_2d_array((raidPtr->numCol), (raidPtr->numCol), raidPtr->cleanupList);
+ if (info->stripeIdentifier == NULL)
+ return(ENOMEM);
+
+ startdisk = 0;
+ for (i=0; i<(raidPtr->numCol); i++)
+ {
+ for (j=0; j<(raidPtr->numCol); j++)
+ {
+ info->stripeIdentifier[i][j] = (startdisk + j) % (raidPtr->numCol - 1);
+ }
+ if ((--startdisk) < 0)
+ startdisk = raidPtr->numCol-1-1;
+ }
+
+ /* fill in the remaining layout parameters */
+ layoutPtr->numStripe = layoutPtr->stripeUnitsPerDisk;
+ layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit << raidPtr->logBytesPerSector;
+ layoutPtr->numParityCol = 1;
+ layoutPtr->numParityLogCol = 1;
+ layoutPtr->numDataCol = raidPtr->numCol - layoutPtr->numParityCol - layoutPtr->numParityLogCol;
+ layoutPtr->dataSectorsPerStripe = layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit;
+ layoutPtr->dataStripeUnitsPerDisk = layoutPtr->stripeUnitsPerDisk;
+ raidPtr->sectorsPerDisk = layoutPtr->stripeUnitsPerDisk * layoutPtr->sectorsPerStripeUnit;
+
+ raidPtr->totalSectors = layoutPtr->stripeUnitsPerDisk * layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit;
+
+ /* configure parity log parameters
+
+ parameter comment/constraints
+ ---------------- -------------------
+ * numParityRegions all regions (except possibly last) of equal size
+ * totalInCoreLogCapacity amount of memory in bytes available for in-core logs (default 1 MB)
+ # numSectorsPerLog capacity of an in-core log in sectors (1 disk track)
+ numParityLogs total number of in-core logs, should be at least numParityRegions
+ regionLogCapacity size of a region log (except possibly last one) in sectors
+ totalLogCapacity total amount of log space in sectors
+
+ * denotes a user settable parameter.
+ # logs are fixed to be the size of a disk track, value #defined in rf_paritylog.h
+
+ */
+
+ totalLogCapacity = layoutPtr->stripeUnitsPerDisk * layoutPtr->sectorsPerStripeUnit * layoutPtr->numParityLogCol;
+ raidPtr->regionLogCapacity = totalLogCapacity / rf_numParityRegions;
+ if (rf_parityLogDebug)
+ printf("bytes per sector %d\n", raidPtr->bytesPerSector);
+
+ /* reduce fragmentation within a disk region by adjusting the number of regions
+ in an attempt to allow an integral number of logs to fit into a disk region */
+ fragmentation = raidPtr->regionLogCapacity % raidPtr->numSectorsPerLog;
+ if (fragmentation > 0)
+ for (i = 1; i < (raidPtr->numSectorsPerLog / 2); i++)
+ {
+ if (((totalLogCapacity / (rf_numParityRegions + i)) % raidPtr->numSectorsPerLog) < fragmentation)
+ {
+ rf_numParityRegions++;
+ raidPtr->regionLogCapacity = totalLogCapacity / rf_numParityRegions;
+ fragmentation = raidPtr->regionLogCapacity % raidPtr->numSectorsPerLog;
+ }
+ if (((totalLogCapacity / (rf_numParityRegions - i)) % raidPtr->numSectorsPerLog) < fragmentation)
+ {
+ rf_numParityRegions--;
+ raidPtr->regionLogCapacity = totalLogCapacity / rf_numParityRegions;
+ fragmentation = raidPtr->regionLogCapacity % raidPtr->numSectorsPerLog;
+ }
+ }
+ /* ensure an integral number of logs fits in each region */
+ raidPtr->regionLogCapacity = (raidPtr->regionLogCapacity / raidPtr->numSectorsPerLog) * raidPtr->numSectorsPerLog;
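/*
 * Worked example with hypothetical numbers (not taken from any real
 * configuration): if regionLogCapacity came out of the loop above as
 * 156 sectors and numSectorsPerLog is 64, the assignment above rounds
 * it down to (156 / 64) * 64 = 128 sectors, i.e. exactly two in-core
 * logs per region; the remaining 28 sectors of each region's log area
 * simply go unused.
 */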
+
+ raidPtr->numParityLogs = rf_totalInCoreLogCapacity / (raidPtr->bytesPerSector * raidPtr->numSectorsPerLog);
+ /* to avoid deadlock, must ensure that enough logs exist for each region to have one simultaneously */
+ if (raidPtr->numParityLogs < rf_numParityRegions)
+ raidPtr->numParityLogs = rf_numParityRegions;
+
+ /* create region information structs */
+ RF_Malloc(raidPtr->regionInfo, (rf_numParityRegions * sizeof(RF_RegionInfo_t)), (RF_RegionInfo_t *));
+ if (raidPtr->regionInfo == NULL)
+ return(ENOMEM);
+
+ /* last region may not be full capacity */
+ lastRegionCapacity = raidPtr->regionLogCapacity;
+ while ((rf_numParityRegions - 1) * raidPtr->regionLogCapacity + lastRegionCapacity > totalLogCapacity)
+ lastRegionCapacity = lastRegionCapacity - raidPtr->numSectorsPerLog;
+
+ raidPtr->regionParityRange = raidPtr->sectorsPerDisk / rf_numParityRegions;
+ maxRegionParityRange = raidPtr->regionParityRange;
+
+/* i can't remember why this line is in the code -wvcii 6/30/95 */
+/* if (raidPtr->sectorsPerDisk % rf_numParityRegions > 0)
+ regionParityRange++; */
+
+ /* build pool of unused parity logs */
+ RF_Malloc(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs * raidPtr->numSectorsPerLog * raidPtr->bytesPerSector, (caddr_t));
+ if (raidPtr->parityLogBufferHeap == NULL)
+ return(ENOMEM);
+ lHeapPtr = raidPtr->parityLogBufferHeap;
+ rc = rf_mutex_init(&raidPtr->parityLogPool.mutex);
+ if (rc) {
+ RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ RF_Free(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs * raidPtr->numSectorsPerLog * raidPtr->bytesPerSector);
+ return(ENOMEM);
+ }
+ for (i = 0; i < raidPtr->numParityLogs; i++)
+ {
+ if (i == 0)
+ {
+ RF_Calloc(raidPtr->parityLogPool.parityLogs, 1, sizeof(RF_ParityLog_t), (RF_ParityLog_t *));
+ if (raidPtr->parityLogPool.parityLogs == NULL) {
+ RF_Free(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs * raidPtr->numSectorsPerLog * raidPtr->bytesPerSector);
+ return(ENOMEM);
+ }
+ l = raidPtr->parityLogPool.parityLogs;
+ }
+ else
+ {
+ RF_Calloc(l->next, 1, sizeof(RF_ParityLog_t), (RF_ParityLog_t *));
+ if (l->next == NULL) {
+ RF_Free(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs * raidPtr->numSectorsPerLog * raidPtr->bytesPerSector);
+ for(l=raidPtr->parityLogPool.parityLogs;l;l=next) {
+ next = l->next;
+ if (l->records)
+ RF_Free(l->records, (raidPtr->numSectorsPerLog * sizeof(RF_ParityLogRecord_t)));
+ RF_Free(l, sizeof(RF_ParityLog_t));
+ }
+ return(ENOMEM);
+ }
+ l = l->next;
+ }
+ l->bufPtr = lHeapPtr;
+ lHeapPtr += raidPtr->numSectorsPerLog * raidPtr->bytesPerSector;
+ RF_Malloc(l->records, (raidPtr->numSectorsPerLog * sizeof(RF_ParityLogRecord_t)), (RF_ParityLogRecord_t *));
+ if (l->records == NULL) {
+ RF_Free(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs * raidPtr->numSectorsPerLog * raidPtr->bytesPerSector);
+ for(l=raidPtr->parityLogPool.parityLogs;l;l=next) {
+ next = l->next;
+ if (l->records)
+ RF_Free(l->records, (raidPtr->numSectorsPerLog * sizeof(RF_ParityLogRecord_t)));
+ RF_Free(l, sizeof(RF_ParityLog_t));
+ }
+ return(ENOMEM);
+ }
+ }
+ rc = rf_ShutdownCreate(listp, rf_ShutdownParityLoggingPool, raidPtr);
+ if (rc) {
+ RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ rf_ShutdownParityLoggingPool(raidPtr);
+ return(rc);
+ }
+
+ /* build pool of region buffers */
+ rc = rf_mutex_init(&raidPtr->regionBufferPool.mutex);
+ if (rc) {
+ RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ return(ENOMEM);
+ }
+ rc = rf_cond_init(&raidPtr->regionBufferPool.cond);
+ if (rc) {
+ RF_ERRORMSG3("Unable to init cond file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ rf_mutex_destroy(&raidPtr->regionBufferPool.mutex);
+ return(ENOMEM);
+ }
+ raidPtr->regionBufferPool.bufferSize = raidPtr->regionLogCapacity * raidPtr->bytesPerSector;
+  if (rf_parityLogDebug)
+    printf("regionBufferPool.bufferSize %d\n", raidPtr->regionBufferPool.bufferSize);
+ raidPtr->regionBufferPool.totalBuffers = 1; /* for now, only one region at a time may be reintegrated */
+ raidPtr->regionBufferPool.availableBuffers = raidPtr->regionBufferPool.totalBuffers;
+ raidPtr->regionBufferPool.availBuffersIndex = 0;
+ raidPtr->regionBufferPool.emptyBuffersIndex = 0;
+ RF_Malloc(raidPtr->regionBufferPool.buffers, raidPtr->regionBufferPool.totalBuffers * sizeof(caddr_t), (caddr_t *));
+ if (raidPtr->regionBufferPool.buffers == NULL) {
+ rf_mutex_destroy(&raidPtr->regionBufferPool.mutex);
+ rf_cond_destroy(&raidPtr->regionBufferPool.cond);
+ return(ENOMEM);
+ }
+ for (i = 0; i < raidPtr->regionBufferPool.totalBuffers; i++) {
+ RF_Malloc(raidPtr->regionBufferPool.buffers[i], raidPtr->regionBufferPool.bufferSize * sizeof(char), (caddr_t));
+    if (raidPtr->regionBufferPool.buffers[i] == NULL) {
+ rf_mutex_destroy(&raidPtr->regionBufferPool.mutex);
+ rf_cond_destroy(&raidPtr->regionBufferPool.cond);
+ for(j=0;j<i;j++) {
+	RF_Free(raidPtr->regionBufferPool.buffers[j], raidPtr->regionBufferPool.bufferSize * sizeof(char));
+ }
+ RF_Free(raidPtr->regionBufferPool.buffers, raidPtr->regionBufferPool.totalBuffers * sizeof(caddr_t));
+ return(ENOMEM);
+ }
+    if (rf_parityLogDebug)
+      printf("raidPtr->regionBufferPool.buffers[%d] = %lx\n", i,
+	     (long)raidPtr->regionBufferPool.buffers[i]);
+ }
+ rc = rf_ShutdownCreate(listp, rf_ShutdownParityLoggingRegionBufferPool, raidPtr);
+ if (rc) {
+ RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ rf_ShutdownParityLoggingRegionBufferPool(raidPtr);
+ return(rc);
+ }
+
+ /* build pool of parity buffers */
+ parityBufferCapacity = maxRegionParityRange;
+ rc = rf_mutex_init(&raidPtr->parityBufferPool.mutex);
+ if (rc) {
+ RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ return(rc);
+ }
+ rc = rf_cond_init(&raidPtr->parityBufferPool.cond);
+ if (rc) {
+ RF_ERRORMSG3("Unable to init cond file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ rf_mutex_destroy(&raidPtr->parityBufferPool.mutex);
+ return(ENOMEM);
+ }
+ raidPtr->parityBufferPool.bufferSize = parityBufferCapacity * raidPtr->bytesPerSector;
+  if (rf_parityLogDebug)
+    printf("parityBufferPool.bufferSize %d\n", raidPtr->parityBufferPool.bufferSize);
+ raidPtr->parityBufferPool.totalBuffers = 1; /* for now, only one region at a time may be reintegrated */
+ raidPtr->parityBufferPool.availableBuffers = raidPtr->parityBufferPool.totalBuffers;
+ raidPtr->parityBufferPool.availBuffersIndex = 0;
+ raidPtr->parityBufferPool.emptyBuffersIndex = 0;
+ RF_Malloc(raidPtr->parityBufferPool.buffers, raidPtr->parityBufferPool.totalBuffers * sizeof(caddr_t), (caddr_t *));
+ if (raidPtr->parityBufferPool.buffers == NULL) {
+ rf_mutex_destroy(&raidPtr->parityBufferPool.mutex);
+ rf_cond_destroy(&raidPtr->parityBufferPool.cond);
+ return(ENOMEM);
+ }
+ for (i = 0; i < raidPtr->parityBufferPool.totalBuffers; i++) {
+ RF_Malloc(raidPtr->parityBufferPool.buffers[i], raidPtr->parityBufferPool.bufferSize * sizeof(char), (caddr_t));
+    if (raidPtr->parityBufferPool.buffers[i] == NULL) {
+ rf_mutex_destroy(&raidPtr->parityBufferPool.mutex);
+ rf_cond_destroy(&raidPtr->parityBufferPool.cond);
+ for(j=0;j<i;j++) {
+	RF_Free(raidPtr->parityBufferPool.buffers[j], raidPtr->parityBufferPool.bufferSize * sizeof(char));
+ }
+      RF_Free(raidPtr->parityBufferPool.buffers, raidPtr->parityBufferPool.totalBuffers * sizeof(caddr_t));
+ return(ENOMEM);
+ }
+    if (rf_parityLogDebug)
+      printf("parityBufferPool.buffers[%d] = %lx\n", i,
+	     (long)raidPtr->parityBufferPool.buffers[i]);
+ }
+ rc = rf_ShutdownCreate(listp, rf_ShutdownParityLoggingParityBufferPool, raidPtr);
+ if (rc) {
+ RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ rf_ShutdownParityLoggingParityBufferPool(raidPtr);
+ return(rc);
+ }
+
+ /* initialize parityLogDiskQueue */
+ rc = rf_create_managed_mutex(listp, &raidPtr->parityLogDiskQueue.mutex);
+ if (rc) {
+ RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ return(rc);
+ }
+ rc = rf_create_managed_cond(listp, &raidPtr->parityLogDiskQueue.cond);
+ if (rc) {
+ RF_ERRORMSG3("Unable to init cond file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ return(rc);
+ }
+ raidPtr->parityLogDiskQueue.flushQueue = NULL;
+ raidPtr->parityLogDiskQueue.reintQueue = NULL;
+ raidPtr->parityLogDiskQueue.bufHead = NULL;
+ raidPtr->parityLogDiskQueue.bufTail = NULL;
+ raidPtr->parityLogDiskQueue.reintHead = NULL;
+ raidPtr->parityLogDiskQueue.reintTail = NULL;
+ raidPtr->parityLogDiskQueue.logBlockHead = NULL;
+ raidPtr->parityLogDiskQueue.logBlockTail = NULL;
+ raidPtr->parityLogDiskQueue.reintBlockHead = NULL;
+ raidPtr->parityLogDiskQueue.reintBlockTail = NULL;
+ raidPtr->parityLogDiskQueue.freeDataList = NULL;
+ raidPtr->parityLogDiskQueue.freeCommonList = NULL;
+
+ rc = rf_ShutdownCreate(listp, rf_ShutdownParityLoggingDiskQueue, raidPtr);
+ if (rc) {
+ RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ return(rc);
+ }
+
+ for (i = 0; i < rf_numParityRegions; i++)
+ {
+ rc = rf_mutex_init(&raidPtr->regionInfo[i].mutex);
+ if (rc) {
+ RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ for(j=0;j<i;j++)
+ FreeRegionInfo(raidPtr, j);
+ RF_Free(raidPtr->regionInfo, (rf_numParityRegions * sizeof(RF_RegionInfo_t)));
+ return(ENOMEM);
+ }
+ rc = rf_mutex_init(&raidPtr->regionInfo[i].reintMutex);
+ if (rc) {
+ RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ rf_mutex_destroy(&raidPtr->regionInfo[i].mutex);
+ for(j=0;j<i;j++)
+ FreeRegionInfo(raidPtr, j);
+ RF_Free(raidPtr->regionInfo, (rf_numParityRegions * sizeof(RF_RegionInfo_t)));
+ return(ENOMEM);
+ }
+ raidPtr->regionInfo[i].reintInProgress = RF_FALSE;
+ raidPtr->regionInfo[i].regionStartAddr = raidPtr->regionLogCapacity * i;
+ raidPtr->regionInfo[i].parityStartAddr = raidPtr->regionParityRange * i;
+ if (i < rf_numParityRegions - 1)
+ {
+ raidPtr->regionInfo[i].capacity = raidPtr->regionLogCapacity;
+ raidPtr->regionInfo[i].numSectorsParity = raidPtr->regionParityRange;
+ }
+ else
+ {
+ raidPtr->regionInfo[i].capacity = lastRegionCapacity;
+ raidPtr->regionInfo[i].numSectorsParity = raidPtr->sectorsPerDisk - raidPtr->regionParityRange * i;
+ if (raidPtr->regionInfo[i].numSectorsParity > maxRegionParityRange)
+ maxRegionParityRange = raidPtr->regionInfo[i].numSectorsParity;
+ }
+ raidPtr->regionInfo[i].diskCount = 0;
+ RF_ASSERT(raidPtr->regionInfo[i].capacity + raidPtr->regionInfo[i].regionStartAddr <= totalLogCapacity);
+ RF_ASSERT(raidPtr->regionInfo[i].parityStartAddr + raidPtr->regionInfo[i].numSectorsParity <= raidPtr->sectorsPerDisk);
+ RF_Malloc(raidPtr->regionInfo[i].diskMap, (raidPtr->regionInfo[i].capacity * sizeof(RF_DiskMap_t)), (RF_DiskMap_t *));
+ if (raidPtr->regionInfo[i].diskMap == NULL) {
+ rf_mutex_destroy(&raidPtr->regionInfo[i].mutex);
+ rf_mutex_destroy(&raidPtr->regionInfo[i].reintMutex);
+ for(j=0;j<i;j++)
+ FreeRegionInfo(raidPtr, j);
+ RF_Free(raidPtr->regionInfo, (rf_numParityRegions * sizeof(RF_RegionInfo_t)));
+ return(ENOMEM);
+ }
+ raidPtr->regionInfo[i].loggingEnabled = RF_FALSE;
+ raidPtr->regionInfo[i].coreLog = NULL;
+ }
+ rc = rf_ShutdownCreate(listp, rf_ShutdownParityLoggingRegionInfo, raidPtr);
+ if (rc) {
+ RF_ERRORMSG3("Unable to create shutdown entry file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ rf_ShutdownParityLoggingRegionInfo(raidPtr);
+ return(rc);
+ }
+
+ RF_ASSERT(raidPtr->parityLogDiskQueue.threadState == 0);
+ raidPtr->parityLogDiskQueue.threadState = RF_PLOG_CREATED;
+ rc = RF_CREATE_THREAD(raidPtr->pLogDiskThreadHandle, rf_ParityLoggingDiskManager, raidPtr);
+ if (rc) {
+ raidPtr->parityLogDiskQueue.threadState = 0;
+ RF_ERRORMSG3("Unable to create parity logging disk thread file %s line %d rc=%d\n",
+ __FILE__, __LINE__, rc);
+ return(ENOMEM);
+ }
+ /* wait for thread to start */
+ RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ while(!(raidPtr->parityLogDiskQueue.threadState&RF_PLOG_RUNNING)) {
+ RF_WAIT_COND(raidPtr->parityLogDiskQueue.cond, raidPtr->parityLogDiskQueue.mutex);
+ }
+ RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+
+ rc = rf_ShutdownCreate(listp, rf_ShutdownParityLogging, raidPtr);
+ if (rc) {
+ RF_ERRORMSG1("Got rc=%d adding parity logging shutdown event\n", rc);
+ rf_ShutdownParityLogging(raidPtr);
+ return(rc);
+ }
+
+ if (rf_parityLogDebug)
+ {
+ printf(" size of disk log in sectors: %d\n",
+ (int)totalLogCapacity);
+ printf(" total number of parity regions is %d\n", (int)rf_numParityRegions);
+ printf(" nominal sectors of log per parity region is %d\n", (int)raidPtr->regionLogCapacity);
+ printf(" nominal region fragmentation is %d sectors\n",(int)fragmentation);
+ printf(" total number of parity logs is %d\n", raidPtr->numParityLogs);
+ printf(" parity log size is %d sectors\n", raidPtr->numSectorsPerLog);
+ printf(" total in-core log space is %d bytes\n", (int) rf_totalInCoreLogCapacity);
+ }
+
+ rf_EnableParityLogging(raidPtr);
+
+ return(0);
+}
+
+static void FreeRegionInfo(
+ RF_Raid_t *raidPtr,
+ RF_RegionId_t regionID)
+{
+ RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
+ RF_Free(raidPtr->regionInfo[regionID].diskMap, (raidPtr->regionInfo[regionID].capacity * sizeof(RF_DiskMap_t)));
+ if (!rf_forceParityLogReint && raidPtr->regionInfo[regionID].coreLog) {
+ rf_ReleaseParityLogs(raidPtr, raidPtr->regionInfo[regionID].coreLog);
+ raidPtr->regionInfo[regionID].coreLog = NULL;
+ }
+ else {
+ RF_ASSERT(raidPtr->regionInfo[regionID].coreLog == NULL);
+ RF_ASSERT(raidPtr->regionInfo[regionID].diskCount == 0);
+ }
+ RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
+ rf_mutex_destroy(&raidPtr->regionInfo[regionID].mutex);
+ rf_mutex_destroy(&raidPtr->regionInfo[regionID].reintMutex);
+}
+
+
+static void FreeParityLogQueue(
+ RF_Raid_t *raidPtr,
+ RF_ParityLogQueue_t *queue)
+{
+ RF_ParityLog_t *l1, *l2;
+
+ RF_LOCK_MUTEX(queue->mutex);
+ l1 = queue->parityLogs;
+ while (l1)
+ {
+ l2 = l1;
+ l1 = l2->next;
+ RF_Free(l2->records, (raidPtr->numSectorsPerLog * sizeof(RF_ParityLogRecord_t)));
+ RF_Free(l2, sizeof(RF_ParityLog_t));
+ }
+ RF_UNLOCK_MUTEX(queue->mutex);
+ rf_mutex_destroy(&queue->mutex);
+}
+
+
+static void FreeRegionBufferQueue(RF_RegionBufferQueue_t *queue)
+{
+ int i;
+
+ RF_LOCK_MUTEX(queue->mutex);
+ if (queue->availableBuffers != queue->totalBuffers)
+ {
+ printf("Attempt to free region queue which is still in use!\n");
+ RF_ASSERT(0);
+ }
+ for (i = 0; i < queue->totalBuffers; i++)
+ RF_Free(queue->buffers[i], queue->bufferSize);
+ RF_Free(queue->buffers, queue->totalBuffers * sizeof(caddr_t));
+ RF_UNLOCK_MUTEX(queue->mutex);
+ rf_mutex_destroy(&queue->mutex);
+}
+
+static void rf_ShutdownParityLoggingRegionInfo(RF_ThreadArg_t arg)
+{
+ RF_Raid_t *raidPtr;
+ RF_RegionId_t i;
+
+ raidPtr = (RF_Raid_t *)arg;
+ if (rf_parityLogDebug) {
+ int tid;
+ rf_get_threadid(tid);
+ printf("[%d] ShutdownParityLoggingRegionInfo\n", tid);
+ }
+ /* free region information structs */
+ for (i = 0; i < rf_numParityRegions; i++)
+ FreeRegionInfo(raidPtr, i);
+  RF_Free(raidPtr->regionInfo, (rf_numParityRegions * sizeof(RF_RegionInfo_t)));
+ raidPtr->regionInfo = NULL;
+}
+
+static void rf_ShutdownParityLoggingPool(RF_ThreadArg_t arg)
+{
+ RF_Raid_t *raidPtr;
+
+ raidPtr = (RF_Raid_t *)arg;
+ if (rf_parityLogDebug) {
+ int tid;
+ rf_get_threadid(tid);
+ printf("[%d] ShutdownParityLoggingPool\n", tid);
+ }
+ /* free contents of parityLogPool */
+ FreeParityLogQueue(raidPtr, &raidPtr->parityLogPool);
+ RF_Free(raidPtr->parityLogBufferHeap, raidPtr->numParityLogs * raidPtr->numSectorsPerLog * raidPtr->bytesPerSector);
+}
+
+static void rf_ShutdownParityLoggingRegionBufferPool(RF_ThreadArg_t arg)
+{
+ RF_Raid_t *raidPtr;
+
+ raidPtr = (RF_Raid_t *)arg;
+ if (rf_parityLogDebug) {
+ int tid;
+ rf_get_threadid(tid);
+ printf("[%d] ShutdownParityLoggingRegionBufferPool\n", tid);
+ }
+ FreeRegionBufferQueue(&raidPtr->regionBufferPool);
+}
+
+static void rf_ShutdownParityLoggingParityBufferPool(RF_ThreadArg_t arg)
+{
+ RF_Raid_t *raidPtr;
+
+ raidPtr = (RF_Raid_t *)arg;
+ if (rf_parityLogDebug) {
+ int tid;
+ rf_get_threadid(tid);
+ printf("[%d] ShutdownParityLoggingParityBufferPool\n", tid);
+ }
+ FreeRegionBufferQueue(&raidPtr->parityBufferPool);
+}
+
+static void rf_ShutdownParityLoggingDiskQueue(RF_ThreadArg_t arg)
+{
+ RF_ParityLogData_t *d;
+ RF_CommonLogData_t *c;
+ RF_Raid_t *raidPtr;
+
+ raidPtr = (RF_Raid_t *)arg;
+ if (rf_parityLogDebug) {
+ int tid;
+ rf_get_threadid(tid);
+ printf("[%d] ShutdownParityLoggingDiskQueue\n", tid);
+ }
+ /* free disk manager stuff */
+ RF_ASSERT(raidPtr->parityLogDiskQueue.bufHead == NULL);
+ RF_ASSERT(raidPtr->parityLogDiskQueue.bufTail == NULL);
+ RF_ASSERT(raidPtr->parityLogDiskQueue.reintHead == NULL);
+ RF_ASSERT(raidPtr->parityLogDiskQueue.reintTail == NULL);
+ while (raidPtr->parityLogDiskQueue.freeDataList)
+ {
+ d = raidPtr->parityLogDiskQueue.freeDataList;
+ raidPtr->parityLogDiskQueue.freeDataList = raidPtr->parityLogDiskQueue.freeDataList->next;
+ RF_Free(d, sizeof(RF_ParityLogData_t));
+ }
+ while (raidPtr->parityLogDiskQueue.freeCommonList)
+ {
+ c = raidPtr->parityLogDiskQueue.freeCommonList;
+ rf_mutex_destroy(&c->mutex);
+ raidPtr->parityLogDiskQueue.freeCommonList = raidPtr->parityLogDiskQueue.freeCommonList->next;
+ RF_Free(c, sizeof(RF_CommonLogData_t));
+ }
+}
+
+static void rf_ShutdownParityLogging(RF_ThreadArg_t arg)
+{
+ RF_Raid_t *raidPtr;
+
+ raidPtr = (RF_Raid_t *)arg;
+ if (rf_parityLogDebug) {
+ int tid;
+ rf_get_threadid(tid);
+ printf("[%d] ShutdownParityLogging\n", tid);
+ }
+#ifndef SIMULATE
+ /* shutdown disk thread */
+ /* This has the desirable side-effect of forcing all regions to be
+ reintegrated. This is necessary since all parity log maps are
+ currently held in volatile memory. */
+
+ RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ raidPtr->parityLogDiskQueue.threadState |= RF_PLOG_TERMINATE;
+ RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ RF_SIGNAL_COND(raidPtr->parityLogDiskQueue.cond);
+ /*
+ * pLogDiskThread will now terminate when queues are cleared
+ * now wait for it to be done
+ */
+ RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+ while(!(raidPtr->parityLogDiskQueue.threadState&RF_PLOG_SHUTDOWN)) {
+ RF_WAIT_COND(raidPtr->parityLogDiskQueue.cond, raidPtr->parityLogDiskQueue.mutex);
+ }
+ RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
+#else /* !SIMULATE */
+ /* explicitly call shutdown routines which force reintegration */
+ rf_ShutdownLogging(raidPtr);
+#endif /* !SIMULATE */
+ if (rf_parityLogDebug) {
+ int tid;
+ rf_get_threadid(tid);
+ printf("[%d] ShutdownParityLogging done (thread completed)\n", tid);
+ }
+}
+
+int rf_GetDefaultNumFloatingReconBuffersParityLogging(RF_Raid_t *raidPtr)
+{
+ return(20);
+}
+
+RF_HeadSepLimit_t rf_GetDefaultHeadSepLimitParityLogging(RF_Raid_t *raidPtr)
+{
+ return(10);
+}
+
+/* return the region ID for a given RAID address */
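+/* for illustration only (hypothetical numbers): with regionParityRange = 1000
+   sectors and rf_numParityRegions = 16, parity sector 12345 falls in
+   regionID = 12345 / 1000 = 12; an address that computes to regionID 16 is
+   clamped back to 15, since the last region absorbs any remainder */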
+RF_RegionId_t rf_MapRegionIDParityLogging(
+ RF_Raid_t *raidPtr,
+ RF_SectorNum_t address)
+{
+ RF_RegionId_t regionID;
+
+/* regionID = address / (raidPtr->regionParityRange * raidPtr->Layout.numDataCol); */
+ regionID = address / raidPtr->regionParityRange;
+ if (regionID == rf_numParityRegions)
+ {
+ /* last region may be larger than other regions */
+ regionID--;
+ }
+ RF_ASSERT(address >= raidPtr->regionInfo[regionID].parityStartAddr);
+ RF_ASSERT(address < raidPtr->regionInfo[regionID].parityStartAddr + raidPtr->regionInfo[regionID].numSectorsParity);
+ RF_ASSERT(regionID < rf_numParityRegions);
+ return(regionID);
+}
+
+
+/* given a logical RAID sector, determine physical disk address of data */
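+/* worked example (hypothetical geometry): with numDataCol = 4 and
+   sectorsPerStripeUnit = 32, raidSector 300 gives SUID = 300 / 32 = 9, so
+   col = 9 % 4 = 1 and diskSector = (9 / 4) * 32 + (300 % 32) = 64 + 12 = 76 */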
+void rf_MapSectorParityLogging(
+ RF_Raid_t *raidPtr,
+ RF_RaidAddr_t raidSector,
+ RF_RowCol_t *row,
+ RF_RowCol_t *col,
+ RF_SectorNum_t *diskSector,
+ int remap)
+{
+ RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit;
+ *row = 0;
+ /* *col = (SUID % (raidPtr->numCol - raidPtr->Layout.numParityLogCol)); */
+ *col = SUID % raidPtr->Layout.numDataCol;
+ *diskSector = (SUID / (raidPtr->Layout.numDataCol)) * raidPtr->Layout.sectorsPerStripeUnit +
+ (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
+}
+
+
+/* given a logical RAID sector, determine physical disk address of parity */
+void rf_MapParityParityLogging(
+ RF_Raid_t *raidPtr,
+ RF_RaidAddr_t raidSector,
+ RF_RowCol_t *row,
+ RF_RowCol_t *col,
+ RF_SectorNum_t *diskSector,
+ int remap)
+{
+ RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit;
+
+ *row = 0;
+ /* *col = raidPtr->Layout.numDataCol-(SUID/raidPtr->Layout.numDataCol)%(raidPtr->numCol - raidPtr->Layout.numParityLogCol); */
+ *col = raidPtr->Layout.numDataCol;
+ *diskSector =(SUID / (raidPtr->Layout.numDataCol)) * raidPtr->Layout.sectorsPerStripeUnit +
+ (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
+}
+
+
+/* given a regionID and sector offset, determine the physical disk address of the parity log */
+void rf_MapLogParityLogging(
+ RF_Raid_t *raidPtr,
+ RF_RegionId_t regionID,
+ RF_SectorNum_t regionOffset,
+ RF_RowCol_t *row,
+ RF_RowCol_t *col,
+ RF_SectorNum_t *startSector)
+{
+ *row = 0;
+ *col = raidPtr->numCol - 1;
+ *startSector = raidPtr->regionInfo[regionID].regionStartAddr + regionOffset;
+}
+
+
+/* given a regionID, determine the physical disk address of the logged parity for that region */
+void rf_MapRegionParity(
+ RF_Raid_t *raidPtr,
+ RF_RegionId_t regionID,
+ RF_RowCol_t *row,
+ RF_RowCol_t *col,
+ RF_SectorNum_t *startSector,
+ RF_SectorCount_t *numSector)
+{
+ *row = 0;
+ *col = raidPtr->numCol - 2;
+ *startSector = raidPtr->regionInfo[regionID].parityStartAddr;
+ *numSector = raidPtr->regionInfo[regionID].numSectorsParity;
+}
+
+
+/* given a logical RAID address, determine the participating disks in the stripe */
+void rf_IdentifyStripeParityLogging(
+ RF_Raid_t *raidPtr,
+ RF_RaidAddr_t addr,
+ RF_RowCol_t **diskids,
+ RF_RowCol_t *outRow)
+{
+ RF_StripeNum_t stripeID = rf_RaidAddressToStripeID(&raidPtr->Layout, addr);
+ RF_ParityLoggingConfigInfo_t *info = (RF_ParityLoggingConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
+ *outRow = 0;
+ *diskids = info->stripeIdentifier[ stripeID % raidPtr->numCol ];
+}
+
+
+void rf_MapSIDToPSIDParityLogging(
+ RF_RaidLayout_t *layoutPtr,
+ RF_StripeNum_t stripeID,
+ RF_StripeNum_t *psID,
+ RF_ReconUnitNum_t *which_ru)
+{
+ *which_ru = 0;
+ *psID = stripeID;
+}
+
+
+/* select an algorithm for performing an access.  Returns a pointer to a
+ * function that will create the DAG for the access.
+ */
+void rf_ParityLoggingDagSelect(
+ RF_Raid_t *raidPtr,
+ RF_IoType_t type,
+ RF_AccessStripeMap_t *asmp,
+ RF_VoidFuncPtr *createFunc)
+{
+ RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
+ RF_PhysDiskAddr_t *failedPDA=NULL;
+ RF_RowCol_t frow, fcol;
+ RF_RowStatus_t rstat;
+ int prior_recon;
+ int tid;
+
+ RF_ASSERT(RF_IO_IS_R_OR_W(type));
+
+ if (asmp->numDataFailed + asmp->numParityFailed > 1) {
+ RF_ERRORMSG("Multiple disks failed in a single group! Aborting I/O operation.\n");
+ /* *infoFunc = */ *createFunc = NULL;
+ return;
+ } else if (asmp->numDataFailed + asmp->numParityFailed == 1) {
+
+ /* if under recon & already reconstructed, redirect the access to the spare drive
+ * and eliminate the failure indication
+ */
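+		/* in other words: if the failed unit has already been rebuilt onto a
+		 * distributed or dedicated spare, the failed PDA's row/col/startSector
+		 * are rewritten below to point at the spare and the failure counts are
+		 * cleared, so the access proceeds as if fault-free */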
+ failedPDA = asmp->failedPDAs[0];
+ frow = failedPDA->row; fcol = failedPDA->col;
+ rstat = raidPtr->status[failedPDA->row];
+ prior_recon = (rstat == rf_rs_reconfigured) || (
+ (rstat == rf_rs_reconstructing) ?
+ rf_CheckRUReconstructed(raidPtr->reconControl[frow]->reconMap, failedPDA->startSector) : 0
+ );
+ if (prior_recon) {
+ RF_RowCol_t or = failedPDA->row,oc=failedPDA->col;
+ RF_SectorNum_t oo=failedPDA->startSector;
+ if (layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) { /* redirect to dist spare space */
+
+ if (failedPDA == asmp->parityInfo) {
+
+ /* parity has failed */
+ (layoutPtr->map->MapParity)(raidPtr, failedPDA->raidAddress, &failedPDA->row,
+ &failedPDA->col, &failedPDA->startSector, RF_REMAP);
+
+ if (asmp->parityInfo->next) { /* redir 2nd component, if any */
+ RF_PhysDiskAddr_t *p = asmp->parityInfo->next;
+ RF_SectorNum_t SUoffs = p->startSector % layoutPtr->sectorsPerStripeUnit;
+ p->row = failedPDA->row;
+ p->col = failedPDA->col;
+ p->startSector = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, failedPDA->startSector) +
+ SUoffs; /* cheating: startSector is not really a RAID address */
+ }
+
+ } else if (asmp->parityInfo->next && failedPDA == asmp->parityInfo->next) {
+ RF_ASSERT(0); /* should not ever happen */
+ } else {
+
+ /* data has failed */
+ (layoutPtr->map->MapSector)(raidPtr, failedPDA->raidAddress, &failedPDA->row,
+ &failedPDA->col, &failedPDA->startSector, RF_REMAP);
+
+ }
+
+ } else { /* redirect to dedicated spare space */
+
+ failedPDA->row = raidPtr->Disks[frow][fcol].spareRow;
+ failedPDA->col = raidPtr->Disks[frow][fcol].spareCol;
+
+ /* the parity may have two distinct components, both of which may need to be redirected */
+ if (asmp->parityInfo->next) {
+ if (failedPDA == asmp->parityInfo) {
+ failedPDA->next->row = failedPDA->row;
+ failedPDA->next->col = failedPDA->col;
+ } else if (failedPDA == asmp->parityInfo->next) { /* paranoid: should never occur */
+ asmp->parityInfo->row = failedPDA->row;
+ asmp->parityInfo->col = failedPDA->col;
+ }
+ }
+ }
+
+ RF_ASSERT(failedPDA->col != -1);
+
+ if (rf_dagDebug || rf_mapDebug) {
+ rf_get_threadid(tid);
+ printf("[%d] Redirected type '%c' r %d c %d o %ld -> r %d c %d o %ld\n",
+ tid,type,or,oc,(long)oo,failedPDA->row,failedPDA->col,(long)failedPDA->startSector);
+ }
+
+ asmp->numDataFailed = asmp->numParityFailed = 0;
+ }
+
+ }
+
+
+ if (type == RF_IO_TYPE_READ) {
+
+ if (asmp->numDataFailed == 0)
+ *createFunc = (RF_VoidFuncPtr)rf_CreateFaultFreeReadDAG;
+ else
+ *createFunc = (RF_VoidFuncPtr)rf_CreateRaidFiveDegradedReadDAG;
+
+ }
+ else {
+
+
+ /* if mirroring, always use large writes. If the access requires two distinct parity updates,
+ * always do a small write. If the stripe contains a failure but the access does not, do a
+ * small write.
+ * The first conditional (numStripeUnitsAccessed <= numDataCol/2) uses a less-than-or-equal
+ * rather than just a less-than because when G is 3 or 4, numDataCol/2 is 1, and I want
+ * single-stripe-unit updates to use just one disk.
+ */
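+		/* for example (hypothetical): with numDataCol = 8, a fault-free access
+		 * touching 4 or fewer stripe units, or one whose parity update has two
+		 * components, or one in a stripe containing an unrelated failure, takes
+		 * the parity-logging small-write DAG; a wider fault-free access takes
+		 * the large-write DAG */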
+ if ( (asmp->numDataFailed + asmp->numParityFailed) == 0) {
+ if (((asmp->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) && (layoutPtr->numDataCol!=1)) ||
+ (asmp->parityInfo->next!=NULL) || rf_CheckStripeForFailures(raidPtr, asmp)) {
+ *createFunc = (RF_VoidFuncPtr)rf_CreateParityLoggingSmallWriteDAG;
+ }
+ else
+ *createFunc = (RF_VoidFuncPtr)rf_CreateParityLoggingLargeWriteDAG;
+ }
+ else
+ if (asmp->numParityFailed == 1)
+ *createFunc = (RF_VoidFuncPtr)rf_CreateNonRedundantWriteDAG;
+ else
+ if (asmp->numStripeUnitsAccessed != 1 && failedPDA->numSector != layoutPtr->sectorsPerStripeUnit)
+ *createFunc = NULL;
+ else
+ *createFunc = (RF_VoidFuncPtr)rf_CreateDegradedWriteDAG;
+ }
+}
+
+#endif /* RF_INCLUDE_PARITYLOGGING > 0 */
diff --git a/sys/dev/raidframe/rf_paritylogging.h b/sys/dev/raidframe/rf_paritylogging.h
new file mode 100644
index 00000000000..3a2db063c28
--- /dev/null
+++ b/sys/dev/raidframe/rf_paritylogging.h
@@ -0,0 +1,137 @@
+/* $OpenBSD: rf_paritylogging.h,v 1.1 1999/01/11 14:29:36 niklas Exp $ */
+/* $NetBSD: rf_paritylogging.h,v 1.1 1998/11/13 04:20:32 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: William V. Courtright II
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* header file for Parity Logging */
+
+/*
+ * :
+ * Log: rf_paritylogging.h,v
+ * Revision 1.22 1996/07/27 23:36:08 jimz
+ * Solaris port of simulator
+ *
+ * Revision 1.21 1996/07/13 00:00:59 jimz
+ * sanitized generalized reconstruction architecture
+ * cleaned up head sep, rbuf problems
+ *
+ * Revision 1.20 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.19 1996/06/07 22:26:27 jimz
+ * type-ify which_ru (RF_ReconUnitNum_t)
+ *
+ * Revision 1.18 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.17 1996/06/03 23:28:26 jimz
+ * more bugfixes
+ * check in tree to sync for IPDS runs with current bugfixes
+ * there still may be a problem with threads in the script test
+ * getting I/Os stuck- not trivially reproducible (runs ~50 times
+ * in a row without getting stuck)
+ *
+ * Revision 1.16 1996/06/02 17:31:48 jimz
+ * Moved a lot of global stuff into array structure, where it belongs.
+ * Fixed up paritylogging, pss modules in this manner. Some general
+ * code cleanup. Removed lots of dead code, some dead files.
+ *
+ * Revision 1.15 1996/05/31 22:26:54 jimz
+ * fix a lot of mapping problems, memory allocation problems
+ * found some weird lock issues, fixed 'em
+ * more code cleanup
+ *
+ * Revision 1.14 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.13 1996/05/24 01:59:45 jimz
+ * another checkpoint in code cleanup for release
+ * time to sync kernel tree
+ *
+ * Revision 1.12 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.11 1995/12/06 20:56:25 wvcii
+ * added prototypes
+ *
+ * Revision 1.10 1995/11/30 16:06:58 wvcii
+ * added copyright info
+ *
+ * Revision 1.9 1995/11/17 19:53:08 wvcii
+ * fixed bug in MapParityRegion prototype
+ *
+ * Revision 1.8 1995/11/17 19:09:24 wvcii
+ * added prototyping to MapParity
+ *
+ * Revision 1.7 1995/11/07 15:28:17 wvcii
+ * changed ParityLoggingDagSelect prototype
+ * function no longer generates numHdrSucc, numTermAnt
+ *
+ * Revision 1.6 1995/07/07 00:16:50 wvcii
+ * this version free from deadlock, fails parity verification
+ *
+ * Revision 1.5 1995/06/23 13:39:44 robby
+ * updated to prototypes in rf_layout.h
+ *
+ */
+
+#ifndef _RF__RF_PARITYLOGGING_H_
+#define _RF__RF_PARITYLOGGING_H_
+
+int rf_ConfigureParityLogging(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr,
+ RF_Config_t *cfgPtr);
+int rf_GetDefaultNumFloatingReconBuffersParityLogging(RF_Raid_t *raidPtr);
+RF_HeadSepLimit_t rf_GetDefaultHeadSepLimitParityLogging(RF_Raid_t *raidPtr);
+RF_RegionId_t rf_MapRegionIDParityLogging(RF_Raid_t *raidPtr,
+ RF_SectorNum_t address);
+void rf_MapSectorParityLogging(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector,
+ RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector,
+ int remap);
+void rf_MapParityParityLogging(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector,
+ RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector,
+ int remap);
+void rf_MapLogParityLogging(RF_Raid_t *raidPtr, RF_RegionId_t regionID,
+ RF_SectorNum_t regionOffset, RF_RowCol_t *row, RF_RowCol_t *col,
+ RF_SectorNum_t *startSector);
+void rf_MapRegionParity(RF_Raid_t *raidPtr, RF_RegionId_t regionID,
+ RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *startSector,
+ RF_SectorCount_t *numSector);
+void rf_IdentifyStripeParityLogging(RF_Raid_t *raidPtr, RF_RaidAddr_t addr,
+ RF_RowCol_t **diskids, RF_RowCol_t *outRow);
+void rf_MapSIDToPSIDParityLogging(RF_RaidLayout_t *layoutPtr,
+ RF_StripeNum_t stripeID, RF_StripeNum_t *psID,
+ RF_ReconUnitNum_t *which_ru);
+void rf_ParityLoggingDagSelect(RF_Raid_t *raidPtr, RF_IoType_t type,
+ RF_AccessStripeMap_t *asmap, RF_VoidFuncPtr *createFunc);
+
+#endif /* !_RF__RF_PARITYLOGGING_H_ */
diff --git a/sys/dev/raidframe/rf_parityloggingdags.c b/sys/dev/raidframe/rf_parityloggingdags.c
new file mode 100644
index 00000000000..1cc51d0a7e3
--- /dev/null
+++ b/sys/dev/raidframe/rf_parityloggingdags.c
@@ -0,0 +1,752 @@
+/* $OpenBSD: rf_parityloggingdags.c,v 1.1 1999/01/11 14:29:37 niklas Exp $ */
+/* $NetBSD: rf_parityloggingdags.c,v 1.1 1998/11/13 04:20:32 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: William V. Courtright II
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*
+ * Log: rf_parityloggingdags.c,v
+ * Revision 1.27 1996/07/28 20:31:39 jimz
+ * i386netbsd port
+ * true/false fixup
+ *
+ * Revision 1.26 1996/07/27 23:36:08 jimz
+ * Solaris port of simulator
+ *
+ * Revision 1.25 1996/07/22 19:52:16 jimz
+ * switched node params to RF_DagParam_t, a union of
+ * a 64-bit int and a void *, for better portability
+ * attempted hpux port, but failed partway through for
+ * lack of a single C compiler capable of compiling all
+ * source files
+ *
+ * Revision 1.24 1996/06/11 13:47:21 jimz
+ * fix up for in-kernel compilation
+ *
+ * Revision 1.23 1996/06/07 22:26:27 jimz
+ * type-ify which_ru (RF_ReconUnitNum_t)
+ *
+ * Revision 1.22 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.21 1996/06/02 17:31:48 jimz
+ * Moved a lot of global stuff into array structure, where it belongs.
+ * Fixed up paritylogging, pss modules in this manner. Some general
+ * code cleanup. Removed lots of dead code, some dead files.
+ *
+ * Revision 1.20 1996/05/31 22:26:54 jimz
+ * fix a lot of mapping problems, memory allocation problems
+ * found some weird lock issues, fixed 'em
+ * more code cleanup
+ *
+ * Revision 1.19 1996/05/30 11:29:41 jimz
+ * Numerous bug fixes. Stripe lock release code disagreed with the taking code
+ * about when stripes should be locked (I made it consistent: no parity, no lock)
+ * There was a lot of extra serialization of I/Os which I've removed- a lot of
+ * it was to calculate values for the cache code, which is no longer with us.
+ * More types, function, macro cleanup. Added code to properly quiesce the array
+ * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
+ * before. Fixed memory allocation, freeing bugs.
+ *
+ * Revision 1.18 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.17 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.16 1996/05/24 04:28:55 jimz
+ * release cleanup ckpt
+ *
+ * Revision 1.15 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.14 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.13 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.12 1996/05/08 21:01:24 jimz
+ * fixed up enum type names that were conflicting with other
+ * enums and function names (ie, "panic")
+ * future naming trends will be towards RF_ and rf_ for
+ * everything raidframe-related
+ *
+ * Revision 1.11 1996/05/03 19:42:02 wvcii
+ * added includes for dag library
+ *
+ * Revision 1.10 1995/12/12 18:10:06 jimz
+ * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT
+ * fix 80-column brain damage in comments
+ *
+ * Revision 1.9 1995/12/06 20:55:24 wvcii
+ * added prototyping
+ * fixed bug in dag header numSuccedents count for both small and large dags
+ *
+ * Revision 1.8 1995/11/30 16:08:01 wvcii
+ * added copyright info
+ *
+ * Revision 1.7 1995/11/07 15:29:05 wvcii
+ * reorganized code, adding comments and asserts
+ * dag creation routines now generate term node
+ * encoded commit point, barrier, and antecedence types into dags
+ *
+ * Revision 1.6 1995/09/07 15:52:06 jimz
+ * noop compile when INCLUDE_PARITYLOGGING not defined
+ *
+ * Revision 1.5 1995/06/15 13:51:53 robby
+ * updated some wrong prototypes (after prototyping rf_dagutils.h)
+ *
+ * Revision 1.4 1995/06/09 13:15:05 wvcii
+ * code is now nonblocking
+ *
+ * Revision 1.3 95/05/31 13:09:14 wvcii
+ * code debug
+ *
+ * Revision 1.2 1995/05/21 15:34:14 wvcii
+ * code debug
+ *
+ * Revision 1.1 95/05/16 14:36:53 wvcii
+ * Initial revision
+ *
+ *
+ */
+
+#include "rf_archs.h"
+
+#if RF_INCLUDE_PARITYLOGGING > 0
+
+/*
+ DAGs specific to parity logging are created here
+ */
+
+#include "rf_types.h"
+#include "rf_raid.h"
+#include "rf_dag.h"
+#include "rf_dagutils.h"
+#include "rf_dagfuncs.h"
+#include "rf_threadid.h"
+#include "rf_debugMem.h"
+#include "rf_paritylog.h"
+#include "rf_memchunk.h"
+#include "rf_general.h"
+
+#include "rf_parityloggingdags.h"
+
+/******************************************************************************
+ *
+ * creates a DAG to perform a large-write operation:
+ *
+ * / Rod \ / Wnd \
+ * H -- NIL- Rod - NIL - Wnd ------ NIL - T
+ * \ Rod / \ Xor - Lpo /
+ *
+ * The writes are not done until the reads complete because if they were done in
+ * parallel, a failure on one of the reads could leave the parity in an inconsistent
+ * state, so that the retry with a new DAG would produce erroneous parity.
+ *
+ * Note: this DAG has the nasty property that none of the buffers allocated for reading
+ * old data can be freed until the XOR node fires. Need to fix this.
+ *
+ * The last two arguments are the number of faults tolerated, and function for the
+ * redundancy calculation. The undo for the redundancy calc is assumed to be null
+ *
+ *****************************************************************************/
+
+void rf_CommonCreateParityLoggingLargeWriteDAG(
+ RF_Raid_t *raidPtr,
+ RF_AccessStripeMap_t *asmap,
+ RF_DagHeader_t *dag_h,
+ void *bp,
+ RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t *allocList,
+ int nfaults,
+ int (*redFunc)(RF_DagNode_t *))
+{
+ RF_DagNode_t *nodes, *wndNodes, *rodNodes=NULL, *syncNode, *xorNode, *lpoNode, *blockNode, *unblockNode, *termNode;
+ int nWndNodes, nRodNodes, i;
+ RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
+ RF_AccessStripeMapHeader_t *new_asm_h[2];
+ int nodeNum, asmNum;
+ RF_ReconUnitNum_t which_ru;
+ char *sosBuffer, *eosBuffer;
+ RF_PhysDiskAddr_t *pda;
+ RF_StripeNum_t parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout), asmap->raidAddress, &which_ru);
+
+ if (rf_dagDebug)
+ printf("[Creating parity-logging large-write DAG]\n");
+ RF_ASSERT(nfaults == 1); /* this arch only single fault tolerant */
+ dag_h->creator = "ParityLoggingLargeWriteDAG";
+
+ /* alloc the Wnd nodes, the xor node, and the Lpo node */
+ nWndNodes = asmap->numStripeUnitsAccessed;
+ RF_CallocAndAdd(nodes, nWndNodes + 6, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList);
+ i = 0;
+ wndNodes = &nodes[i]; i += nWndNodes;
+ xorNode = &nodes[i]; i += 1;
+ lpoNode = &nodes[i]; i += 1;
+ blockNode = &nodes[i]; i += 1;
+ syncNode = &nodes[i]; i += 1;
+ unblockNode = &nodes[i]; i += 1;
+ termNode = &nodes[i]; i += 1;
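+  /* e.g. (illustrative only): an access touching 3 stripe units allocates
+     nWndNodes + 6 = 9 nodes, carved up above as Wnd x3, Xor, Lpo, block,
+     sync, unblock and Trm; any Rod nodes come from a separate allocation
+     below, once nRodNodes is known */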
+
+ dag_h->numCommitNodes = nWndNodes + 1;
+ dag_h->numCommits = 0;
+ dag_h->numSuccedents = 1;
+
+ rf_MapUnaccessedPortionOfStripe(raidPtr, layoutPtr, asmap, dag_h, new_asm_h, &nRodNodes, &sosBuffer, &eosBuffer, allocList);
+ if (nRodNodes > 0)
+ RF_CallocAndAdd(rodNodes, nRodNodes, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList);
+
+ /* begin node initialization */
+ rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nRodNodes + 1, 0, 0, 0, dag_h, "Nil", allocList);
+ rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, 1, nWndNodes + 1, 0, 0, dag_h, "Nil", allocList);
+ rf_InitNode(syncNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nWndNodes + 1, nRodNodes + 1, 0, 0, dag_h, "Nil", allocList);
+ rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
+
+ /* initialize the Rod nodes */
+ for (nodeNum = asmNum = 0; asmNum < 2; asmNum++) {
+ if (new_asm_h[asmNum]) {
+ pda = new_asm_h[asmNum]->stripeMap->physInfo;
+ while (pda) {
+ rf_InitNode(&rodNodes[nodeNum], rf_wait, RF_FALSE, rf_DiskReadFunc,rf_DiskReadUndoFunc,rf_GenericWakeupFunc,1,1,4,0, dag_h, "Rod", allocList);
+ rodNodes[nodeNum].params[0].p = pda;
+ rodNodes[nodeNum].params[1].p = pda->bufPtr;
+ rodNodes[nodeNum].params[2].v = parityStripeID;
+ rodNodes[nodeNum].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ nodeNum++;
+ pda=pda->next;
+ }
+ }
+ }
+ RF_ASSERT(nodeNum == nRodNodes);
+
+ /* initialize the wnd nodes */
+ pda = asmap->physInfo;
+ for (i=0; i < nWndNodes; i++) {
+ rf_InitNode(&wndNodes[i], rf_wait, RF_TRUE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, "Wnd", allocList);
+ RF_ASSERT(pda != NULL);
+ wndNodes[i].params[0].p = pda;
+ wndNodes[i].params[1].p = pda->bufPtr;
+ wndNodes[i].params[2].v = parityStripeID;
+ wndNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ pda = pda->next;
+ }
+
+ /* initialize the redundancy node */
+ rf_InitNode(xorNode, rf_wait, RF_TRUE, redFunc, rf_NullNodeUndoFunc, NULL, 1, 1, 2*(nWndNodes+nRodNodes)+1, 1, dag_h, "Xr ", allocList);
+ xorNode->flags |= RF_DAGNODE_FLAG_YIELD;
+ for (i=0; i < nWndNodes; i++) {
+ xorNode->params[2*i+0] = wndNodes[i].params[0]; /* pda */
+ xorNode->params[2*i+1] = wndNodes[i].params[1]; /* buf ptr */
+ }
+ for (i=0; i < nRodNodes; i++) {
+ xorNode->params[2*(nWndNodes+i)+0] = rodNodes[i].params[0]; /* pda */
+ xorNode->params[2*(nWndNodes+i)+1] = rodNodes[i].params[1]; /* buf ptr */
+ }
+ xorNode->params[2*(nWndNodes+nRodNodes)].p = raidPtr; /* xor node needs to get at RAID information */
+
+ /* look for an Rod node that reads a complete SU. If none, alloc a buffer to receive the parity info.
+ * Note that we can't use a new data buffer because it will not have gotten written when the xor occurs.
+ */
+ for (i = 0; i < nRodNodes; i++)
+ if (((RF_PhysDiskAddr_t *) rodNodes[i].params[0].p)->numSector == raidPtr->Layout.sectorsPerStripeUnit)
+ break;
+ if (i == nRodNodes) {
+ RF_CallocAndAdd(xorNode->results[0], 1, rf_RaidAddressToByte(raidPtr, raidPtr->Layout.sectorsPerStripeUnit), (void *), allocList);
+ }
+ else {
+ xorNode->results[0] = rodNodes[i].params[1].p;
+ }
+
+ /* initialize the Lpo node */
+ rf_InitNode(lpoNode, rf_wait, RF_FALSE, rf_ParityLogOverwriteFunc, rf_ParityLogOverwriteUndoFunc, rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h, "Lpo", allocList);
+
+ lpoNode->params[0].p = asmap->parityInfo;
+ lpoNode->params[1].p = xorNode->results[0];
+ RF_ASSERT(asmap->parityInfo->next == NULL); /* parityInfo must describe entire parity unit */
+
+ /* connect nodes to form graph */
+
+ /* connect dag header to block node */
+ RF_ASSERT(dag_h->numSuccedents == 1);
+ RF_ASSERT(blockNode->numAntecedents == 0);
+ dag_h->succedents[0] = blockNode;
+
+ /* connect the block node to the Rod nodes */
+ RF_ASSERT(blockNode->numSuccedents == nRodNodes + 1);
+ for (i = 0; i < nRodNodes; i++) {
+ RF_ASSERT(rodNodes[i].numAntecedents == 1);
+ blockNode->succedents[i] = &rodNodes[i];
+ rodNodes[i].antecedents[0] = blockNode;
+ rodNodes[i].antType[0] = rf_control;
+ }
+
+ /* connect the block node to the sync node */
+ /* necessary if nRodNodes == 0 */
+ RF_ASSERT(syncNode->numAntecedents == nRodNodes + 1);
+ blockNode->succedents[nRodNodes] = syncNode;
+ syncNode->antecedents[0] = blockNode;
+ syncNode->antType[0] = rf_control;
+
+ /* connect the Rod nodes to the syncNode */
+ for (i = 0; i < nRodNodes; i++) {
+ rodNodes[i].succedents[0] = syncNode;
+ syncNode->antecedents[1 + i] = &rodNodes[i];
+ syncNode->antType[1 + i] = rf_control;
+ }
+
+ /* connect the sync node to the xor node */
+ RF_ASSERT(syncNode->numSuccedents == nWndNodes + 1);
+ RF_ASSERT(xorNode->numAntecedents == 1);
+ syncNode->succedents[0] = xorNode;
+ xorNode->antecedents[0] = syncNode;
+ xorNode->antType[0] = rf_trueData; /* carry forward from sync */
+
+ /* connect the sync node to the Wnd nodes */
+ for (i = 0; i < nWndNodes; i++) {
+    RF_ASSERT(wndNodes[i].numAntecedents == 1);
+ syncNode->succedents[1 + i] = &wndNodes[i];
+ wndNodes[i].antecedents[0] = syncNode;
+ wndNodes[i].antType[0] = rf_control;
+ }
+
+ /* connect the xor node to the Lpo node */
+ RF_ASSERT(xorNode->numSuccedents == 1);
+ RF_ASSERT(lpoNode->numAntecedents == 1);
+ xorNode->succedents[0] = lpoNode;
+ lpoNode->antecedents[0]= xorNode;
+ lpoNode->antType[0] = rf_trueData;
+
+ /* connect the Wnd nodes to the unblock node */
+ RF_ASSERT(unblockNode->numAntecedents == nWndNodes + 1);
+ for (i = 0; i < nWndNodes; i++) {
+    RF_ASSERT(wndNodes[i].numSuccedents == 1);
+ wndNodes[i].succedents[0] = unblockNode;
+ unblockNode->antecedents[i] = &wndNodes[i];
+ unblockNode->antType[i] = rf_control;
+ }
+
+ /* connect the Lpo node to the unblock node */
+ RF_ASSERT(lpoNode->numSuccedents == 1);
+ lpoNode->succedents[0] = unblockNode;
+ unblockNode->antecedents[nWndNodes] = lpoNode;
+ unblockNode->antType[nWndNodes] = rf_control;
+
+ /* connect unblock node to terminator */
+ RF_ASSERT(unblockNode->numSuccedents == 1);
+ RF_ASSERT(termNode->numAntecedents == 1);
+ RF_ASSERT(termNode->numSuccedents == 0);
+ unblockNode->succedents[0] = termNode;
+ termNode->antecedents[0] = unblockNode;
+ termNode->antType[0] = rf_control;
+}
+
+
+
+
+/******************************************************************************
+ *
+ * creates a DAG to perform a parity-logging small-write operation, which is as follows:
+ *
+ * Header
+ * |
+ * Block
+ * / | ... \ \
+ * / | \ \
+ * Rod Rod Rod Rop
+ * | \ /| \ / | \/ |
+ * | | | /\ |
+ * Wnd Wnd Wnd X
+ * | \ / |
+ * | \ / |
+ * \ \ / Lpo
+ * \ \ / /
+ * +-> Unblock <-+
+ * |
+ * T
+ *
+ *
+ * R = Read, W = Write, X = Xor, o = old, n = new, d = data, p = parity.
+ * When the access spans a stripe unit boundary and is less than one SU in size, there will
+ * be two Rop -- X -- Lpu branches.  I call this the "double-XOR" case.
+ * The second output from each Rod node goes to the X node. In the double-XOR
+ * case, there are exactly 2 Rod nodes, and each sends one output to one X node.
+ * There is one Rod -- Wnd -- T branch for each stripe unit being updated.
+ *
+ * The block and unblock nodes are unused. See comment above CreateFaultFreeReadDAG.
+ *
+ * Note: this DAG ignores all the optimizations related to making the RMWs atomic.
+ * it also has the nasty property that none of the buffers allocated for reading
+ * old data & parity can be freed until the XOR node fires. Need to fix this.
+ *
+ * A null qfuncs indicates single fault tolerant
+ *****************************************************************************/
+
+void rf_CommonCreateParityLoggingSmallWriteDAG(
+ RF_Raid_t *raidPtr,
+ RF_AccessStripeMap_t *asmap,
+ RF_DagHeader_t *dag_h,
+ void *bp,
+ RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t *allocList,
+ RF_RedFuncs_t *pfuncs,
+ RF_RedFuncs_t *qfuncs)
+{
+ RF_DagNode_t *xorNodes, *blockNode, *unblockNode, *nodes;
+ RF_DagNode_t *readDataNodes, *readParityNodes;
+ RF_DagNode_t *writeDataNodes, *lpuNodes;
+ RF_DagNode_t *unlockDataNodes=NULL, *termNode;
+ RF_PhysDiskAddr_t *pda = asmap->physInfo;
+ int numDataNodes = asmap->numStripeUnitsAccessed;
+ int numParityNodes = (asmap->parityInfo->next) ? 2 : 1;
+ int i, j, nNodes, totalNumNodes;
+ RF_ReconUnitNum_t which_ru;
+ int (*func)(RF_DagNode_t *node), (*undoFunc)(RF_DagNode_t *node);
+ int (*qfunc)(RF_DagNode_t *node);
+ char *name, *qname;
+ RF_StripeNum_t parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout), asmap->raidAddress, &which_ru);
+ long nfaults = qfuncs ? 2 : 1;
+ int lu_flag = (rf_enableAtomicRMW) ? 1 : 0; /* lock/unlock flag */
+
+ if (rf_dagDebug) printf("[Creating parity-logging small-write DAG]\n");
+ RF_ASSERT(numDataNodes > 0);
+ RF_ASSERT(nfaults == 1);
+ dag_h->creator = "ParityLoggingSmallWriteDAG";
+
+  /* DAG creation occurs in four steps:
+ 1. count the number of nodes in the DAG
+ 2. create the nodes
+ 3. initialize the nodes
+ 4. connect the nodes
+ */
+
+ /* Step 1. compute number of nodes in the graph */
+
+ /* number of nodes:
+ a read and write for each data unit
+ a redundancy computation node for each parity node
+ a read and Lpu for each parity unit
+ a block and unblock node (2)
+ a terminator node
+ if atomic RMW
+ an unlock node for each data unit, redundancy unit
+ */
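+	/* worked example (hypothetical access): numDataNodes = 2 and
+	   numParityNodes = 1 give totalNumNodes = (2*2) + 1 + (2*1) + 3 = 10,
+	   plus 2 unlock nodes when atomic RMW locking is enabled */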
+ totalNumNodes = (2 * numDataNodes) + numParityNodes + (2 * numParityNodes) + 3;
+ if (lu_flag)
+ totalNumNodes += numDataNodes;
+
+ nNodes = numDataNodes + numParityNodes;
+
+ dag_h->numCommitNodes = numDataNodes + numParityNodes;
+ dag_h->numCommits = 0;
+ dag_h->numSuccedents = 1;
+
+ /* Step 2. create the nodes */
+ RF_CallocAndAdd(nodes, totalNumNodes, sizeof(RF_DagNode_t), (RF_DagNode_t *), allocList);
+ i = 0;
+ blockNode = &nodes[i]; i += 1;
+ unblockNode = &nodes[i]; i += 1;
+ readDataNodes = &nodes[i]; i += numDataNodes;
+ readParityNodes = &nodes[i]; i += numParityNodes;
+ writeDataNodes = &nodes[i]; i += numDataNodes;
+ lpuNodes = &nodes[i]; i += numParityNodes;
+ xorNodes = &nodes[i]; i += numParityNodes;
+ termNode = &nodes[i]; i += 1;
+ if (lu_flag) {
+ unlockDataNodes = &nodes[i]; i += numDataNodes;
+ }
+ RF_ASSERT(i == totalNumNodes);
+
+ /* Step 3. initialize the nodes */
+ /* initialize block node (Nil) */
+ rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nNodes, 0, 0, 0, dag_h, "Nil", allocList);
+
+ /* initialize unblock node (Nil) */
+ rf_InitNode(unblockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, 1, nNodes, 0, 0, dag_h, "Nil", allocList);
+
+  /* initialize terminator node (Trm) */
+ rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
+
+ /* initialize nodes which read old data (Rod) */
+ for (i = 0; i < numDataNodes; i++) {
+ rf_InitNode(&readDataNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, nNodes, 1, 4, 0, dag_h, "Rod", allocList);
+ RF_ASSERT(pda != NULL);
+ readDataNodes[i].params[0].p = pda; /* physical disk addr desc */
+ readDataNodes[i].params[1].p = rf_AllocBuffer(raidPtr, dag_h, pda, allocList); /* buffer to hold old data */
+ readDataNodes[i].params[2].v = parityStripeID;
+ readDataNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, lu_flag, 0, which_ru);
+ pda=pda->next;
+ readDataNodes[i].propList[0] = NULL;
+ readDataNodes[i].propList[1] = NULL;
+ }
+
+ /* initialize nodes which read old parity (Rop) */
+  pda = asmap->parityInfo;
+ for (i = 0; i < numParityNodes; i++) {
+ RF_ASSERT(pda != NULL);
+ rf_InitNode(&readParityNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, nNodes, 1, 4, 0, dag_h, "Rop", allocList);
+ readParityNodes[i].params[0].p = pda;
+ readParityNodes[i].params[1].p = rf_AllocBuffer(raidPtr, dag_h, pda, allocList); /* buffer to hold old parity */
+ readParityNodes[i].params[2].v = parityStripeID;
+ readParityNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ readParityNodes[i].propList[0] = NULL;
+ pda=pda->next;
+ }
+
+ /* initialize nodes which write new data (Wnd) */
+ pda = asmap->physInfo;
+ for (i=0; i < numDataNodes; i++) {
+ RF_ASSERT(pda != NULL);
+ rf_InitNode(&writeDataNodes[i], rf_wait, RF_TRUE, rf_DiskWriteFunc, rf_DiskWriteUndoFunc, rf_GenericWakeupFunc, 1, nNodes, 4, 0, dag_h, "Wnd", allocList);
+ writeDataNodes[i].params[0].p = pda; /* physical disk addr desc */
+ writeDataNodes[i].params[1].p = pda->bufPtr; /* buffer holding new data to be written */
+ writeDataNodes[i].params[2].v = parityStripeID;
+ writeDataNodes[i].params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+
+ if (lu_flag) {
+ /* initialize node to unlock the disk queue */
+ rf_InitNode(&unlockDataNodes[i], rf_wait, RF_FALSE, rf_DiskUnlockFunc, rf_DiskUnlockUndoFunc, rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h, "Und", allocList);
+ unlockDataNodes[i].params[0].p = pda; /* physical disk addr desc */
+ unlockDataNodes[i].params[1].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, lu_flag, which_ru);
+ }
+ pda = pda->next;
+ }
+
+
+ /* initialize nodes which compute new parity */
+ /* we use the simple XOR func in the double-XOR case, and when we're accessing only a portion of one stripe unit.
+ * the distinction between the two is that the regular XOR func assumes that the targbuf is a full SU in size,
+ * and examines the pda associated with the buffer to decide where within the buffer to XOR the data, whereas
+ * the simple XOR func just XORs the data into the start of the buffer.
+ */
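+	/* e.g. (illustrative): a write covering half of one stripe unit has
+	   numDataNodes == 1 and totalSectorsAccessed < sectorsPerStripeUnit, so the
+	   simple XOR func is used; a full-stripe-unit read-modify-write with a
+	   single parity component falls through to the regular XOR func */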
+ if ((numParityNodes==2) || ((numDataNodes == 1) && (asmap->totalSectorsAccessed < raidPtr->Layout.sectorsPerStripeUnit))) {
+ func = pfuncs->simple; undoFunc = rf_NullNodeUndoFunc; name = pfuncs->SimpleName;
+ if (qfuncs)
+ { qfunc = qfuncs->simple; qname = qfuncs->SimpleName;}
+ } else {
+ func = pfuncs->regular; undoFunc = rf_NullNodeUndoFunc; name = pfuncs->RegularName;
+ if (qfuncs) { qfunc = qfuncs->regular; qname = qfuncs->RegularName;}
+ }
+ /* initialize the xor nodes: params are {pda,buf} from {Rod,Wnd,Rop} nodes, and raidPtr */
+ if (numParityNodes==2) { /* double-xor case */
+ for (i=0; i < numParityNodes; i++) {
+ rf_InitNode(&xorNodes[i], rf_wait, RF_TRUE, func, undoFunc, NULL, 1, nNodes, 7, 1, dag_h, name, allocList); /* no wakeup func for xor */
+ xorNodes[i].flags |= RF_DAGNODE_FLAG_YIELD;
+ xorNodes[i].params[0] = readDataNodes[i].params[0];
+ xorNodes[i].params[1] = readDataNodes[i].params[1];
+ xorNodes[i].params[2] = readParityNodes[i].params[0];
+ xorNodes[i].params[3] = readParityNodes[i].params[1];
+ xorNodes[i].params[4] = writeDataNodes[i].params[0];
+ xorNodes[i].params[5] = writeDataNodes[i].params[1];
+ xorNodes[i].params[6].p = raidPtr;
+ xorNodes[i].results[0] = readParityNodes[i].params[1].p; /* use old parity buf as target buf */
+ }
+ }
+ else {
+ /* there is only one xor node in this case */
+ rf_InitNode(&xorNodes[0], rf_wait, RF_TRUE, func, undoFunc, NULL, 1, nNodes, (2 * (numDataNodes + numDataNodes + 1) + 1), 1, dag_h, name, allocList);
+ xorNodes[0].flags |= RF_DAGNODE_FLAG_YIELD;
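+    /* note: the loop below deliberately runs one element past the Rod array;
+     * because readParityNodes immediately follows readDataNodes in the single
+     * nodes[] allocation above, readDataNodes[numDataNodes] is readParityNodes[0],
+     * which supplies the Rop pda/buffer pair */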
+ for (i=0; i < numDataNodes + 1; i++) {
+ /* set up params related to Rod and Rop nodes */
+ xorNodes[0].params[2*i+0] = readDataNodes[i].params[0]; /* pda */
+ xorNodes[0].params[2*i+1] = readDataNodes[i].params[1]; /* buffer pointer */
+ }
+ for (i=0; i < numDataNodes; i++) {
+ /* set up params related to Wnd and Wnp nodes */
+ xorNodes[0].params[2*(numDataNodes+1+i)+0] = writeDataNodes[i].params[0]; /* pda */
+ xorNodes[0].params[2*(numDataNodes+1+i)+1] = writeDataNodes[i].params[1]; /* buffer pointer */
+ }
+ xorNodes[0].params[2*(numDataNodes+numDataNodes+1)].p = raidPtr; /* xor node needs to get at RAID information */
+ xorNodes[0].results[0] = readParityNodes[0].params[1].p;
+ }
+
+ /* initialize the log node(s) */
+ pda = asmap->parityInfo;
+ for (i = 0; i < numParityNodes; i++) {
+ RF_ASSERT(pda);
+ rf_InitNode(&lpuNodes[i], rf_wait, RF_FALSE, rf_ParityLogUpdateFunc, rf_ParityLogUpdateUndoFunc, rf_GenericWakeupFunc, 1, 1, 2, 0, dag_h, "Lpu", allocList);
+ lpuNodes[i].params[0].p = pda; /* PhysDiskAddr of parity */
+ lpuNodes[i].params[1].p = xorNodes[i].results[0]; /* buffer pointer to parity */
+ pda = pda->next;
+ }
+
+
+ /* Step 4. connect the nodes */
+
+ /* connect header to block node */
+ RF_ASSERT(dag_h->numSuccedents == 1);
+ RF_ASSERT(blockNode->numAntecedents == 0);
+ dag_h->succedents[0] = blockNode;
+
+ /* connect block node to read old data nodes */
+ RF_ASSERT(blockNode->numSuccedents == (numDataNodes + numParityNodes));
+ for (i = 0; i < numDataNodes; i++) {
+ blockNode->succedents[i] = &readDataNodes[i];
+ RF_ASSERT(readDataNodes[i].numAntecedents == 1);
+ readDataNodes[i].antecedents[0]= blockNode;
+ readDataNodes[i].antType[0] = rf_control;
+ }
+
+ /* connect block node to read old parity nodes */
+ for (i = 0; i < numParityNodes; i++) {
+ blockNode->succedents[numDataNodes + i] = &readParityNodes[i];
+ RF_ASSERT(readParityNodes[i].numAntecedents == 1);
+ readParityNodes[i].antecedents[0] = blockNode;
+ readParityNodes[i].antType[0] = rf_control;
+ }
+
+ /* connect read old data nodes to write new data nodes */
+ for (i = 0; i < numDataNodes; i++) {
+ RF_ASSERT(readDataNodes[i].numSuccedents == numDataNodes + numParityNodes);
+ for (j = 0; j < numDataNodes; j++) {
+ RF_ASSERT(writeDataNodes[j].numAntecedents == numDataNodes + numParityNodes);
+ readDataNodes[i].succedents[j] = &writeDataNodes[j];
+ writeDataNodes[j].antecedents[i] = &readDataNodes[i];
+ if (i == j)
+ writeDataNodes[j].antType[i] = rf_antiData;
+ else
+ writeDataNodes[j].antType[i] = rf_control;
+ }
+ }
+
+ /* connect read old data nodes to xor nodes */
+ for (i = 0; i < numDataNodes; i++)
+ for (j = 0; j < numParityNodes; j++){
+ RF_ASSERT(xorNodes[j].numAntecedents == numDataNodes + numParityNodes);
+ readDataNodes[i].succedents[numDataNodes + j] = &xorNodes[j];
+ xorNodes[j].antecedents[i] = &readDataNodes[i];
+ xorNodes[j].antType[i] = rf_trueData;
+ }
+
+ /* connect read old parity nodes to write new data nodes */
+ for (i = 0; i < numParityNodes; i++) {
+ RF_ASSERT(readParityNodes[i].numSuccedents == numDataNodes + numParityNodes);
+ for (j = 0; j < numDataNodes; j++) {
+ readParityNodes[i].succedents[j] = &writeDataNodes[j];
+ writeDataNodes[j].antecedents[numDataNodes + i] = &readParityNodes[i];
+ writeDataNodes[j].antType[numDataNodes + i] = rf_control;
+ }
+ }
+
+ /* connect read old parity nodes to xor nodes */
+ for (i = 0; i < numParityNodes; i++)
+ for (j = 0; j < numParityNodes; j++) {
+ readParityNodes[i].succedents[numDataNodes + j] = &xorNodes[j];
+ xorNodes[j].antecedents[numDataNodes + i] = &readParityNodes[i];
+ xorNodes[j].antType[numDataNodes + i] = rf_trueData;
+ }
+
+ /* connect xor nodes to write new parity nodes */
+ for (i = 0; i < numParityNodes; i++) {
+ RF_ASSERT(xorNodes[i].numSuccedents == 1);
+ RF_ASSERT(lpuNodes[i].numAntecedents == 1);
+ xorNodes[i].succedents[0] = &lpuNodes[i];
+ lpuNodes[i].antecedents[0] = &xorNodes[i];
+ lpuNodes[i].antType[0] = rf_trueData;
+ }
+
+ for (i = 0; i < numDataNodes; i++) {
+ if (lu_flag) {
+ /* connect write new data nodes to unlock nodes */
+ RF_ASSERT(writeDataNodes[i].numSuccedents == 1);
+ RF_ASSERT(unlockDataNodes[i].numAntecedents == 1);
+ writeDataNodes[i].succedents[0] = &unlockDataNodes[i];
+ unlockDataNodes[i].antecedents[0] = &writeDataNodes[i];
+ unlockDataNodes[i].antType[0] = rf_control;
+
+ /* connect unlock nodes to unblock node */
+ RF_ASSERT(unlockDataNodes[i].numSuccedents == 1);
+ RF_ASSERT(unblockNode->numAntecedents == (numDataNodes + (nfaults * numParityNodes)));
+ unlockDataNodes[i].succedents[0] = unblockNode;
+ unblockNode->antecedents[i] = &unlockDataNodes[i];
+ unblockNode->antType[i] = rf_control;
+ }
+ else {
+ /* connect write new data nodes to unblock node */
+ RF_ASSERT(writeDataNodes[i].numSuccedents == 1);
+ RF_ASSERT(unblockNode->numAntecedents == (numDataNodes + (nfaults * numParityNodes)));
+ writeDataNodes[i].succedents[0] = unblockNode;
+ unblockNode->antecedents[i] = &writeDataNodes[i];
+ unblockNode->antType[i] = rf_control;
+ }
+ }
+
+ /* connect write new parity nodes to unblock node */
+ for (i = 0; i < numParityNodes; i++) {
+ RF_ASSERT(lpuNodes[i].numSuccedents == 1);
+ lpuNodes[i].succedents[0] = unblockNode;
+ unblockNode->antecedents[numDataNodes + i] = &lpuNodes[i];
+ unblockNode->antType[numDataNodes + i] = rf_control;
+ }
+
+ /* connect unblock node to terminator */
+ RF_ASSERT(unblockNode->numSuccedents == 1);
+ RF_ASSERT(termNode->numAntecedents == 1);
+ RF_ASSERT(termNode->numSuccedents == 0);
+ unblockNode->succedents[0] = termNode;
+ termNode->antecedents[0] = unblockNode;
+ termNode->antType[0] = rf_control;
+}
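
As a quick orientation aid, here is a standalone sketch (not RAIDframe code) that only prints the dependency edges wired up by the connection code above, for the simplest case of one data node, one parity node, and lu_flag off. Node names (Rod, Rop, Wnd, Xor, Lpu, Nil, Trm) follow the comments above; "Hdr" stands for the DAG header.

/*
 * Standalone sketch, not RAIDframe code: edge list of the small-write
 * parity-logging DAG built above for numDataNodes == 1, numParityNodes == 1,
 * lu_flag == 0.
 */
#include <stdio.h>

int main(void)
{
        printf("Hdr -> Nil (block)\n");
        printf("Nil (block) -> Rod, Rop\n");              /* read old data / old parity */
        printf("Rod -> Wnd (antiData), Rod -> Xor (trueData)\n");
        printf("Rop -> Wnd (control), Rop -> Xor (trueData)\n");
        printf("Xor -> Lpu (trueData)\n");                /* parity log update */
        printf("Wnd -> Nil (unblock), Lpu -> Nil (unblock)\n");
        printf("Nil (unblock) -> Trm\n");
        return 0;
}
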
+
+
+void rf_CreateParityLoggingSmallWriteDAG(
+ RF_Raid_t *raidPtr,
+ RF_AccessStripeMap_t *asmap,
+ RF_DagHeader_t *dag_h,
+ void *bp,
+ RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t *allocList,
+ RF_RedFuncs_t *pfuncs,
+ RF_RedFuncs_t *qfuncs)
+{
+ dag_h->creator = "ParityLoggingSmallWriteDAG";
+ rf_CommonCreateParityLoggingSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_xorFuncs, NULL);
+}
+
+
+void rf_CreateParityLoggingLargeWriteDAG(
+ RF_Raid_t *raidPtr,
+ RF_AccessStripeMap_t *asmap,
+ RF_DagHeader_t *dag_h,
+ void *bp,
+ RF_RaidAccessFlags_t flags,
+ RF_AllocListElem_t *allocList,
+ int nfaults,
+ int (*redFunc)(RF_DagNode_t *))
+{
+  dag_h->creator = "ParityLoggingLargeWriteDAG";
+ rf_CommonCreateParityLoggingLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 1, rf_RegularXorFunc);
+}
+
+#endif /* RF_INCLUDE_PARITYLOGGING > 0 */
diff --git a/sys/dev/raidframe/rf_parityloggingdags.h b/sys/dev/raidframe/rf_parityloggingdags.h
new file mode 100644
index 00000000000..1eecfc7fe08
--- /dev/null
+++ b/sys/dev/raidframe/rf_parityloggingdags.h
@@ -0,0 +1,94 @@
+/* $OpenBSD: rf_parityloggingdags.h,v 1.1 1999/01/11 14:29:37 niklas Exp $ */
+/* $NetBSD: rf_parityloggingdags.h,v 1.1 1998/11/13 04:20:32 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: William V. Courtright II
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/****************************************************************************
+ * *
+ * rf_parityloggingdags.h -- header file for parity logging dags *
+ * *
+ ****************************************************************************/
+
+/* :
+ * Log: rf_parityloggingdags.h,v
+ * Revision 1.10 1996/07/27 23:36:08 jimz
+ * Solaris port of simulator
+ *
+ * Revision 1.9 1996/06/02 17:31:48 jimz
+ * Moved a lot of global stuff into array structure, where it belongs.
+ * Fixed up paritylogging, pss modules in this manner. Some general
+ * code cleanup. Removed lots of dead code, some dead files.
+ *
+ * Revision 1.8 1996/05/31 22:26:54 jimz
+ * fix a lot of mapping problems, memory allocation problems
+ * found some weird lock issues, fixed 'em
+ * more code cleanup
+ *
+ * Revision 1.7 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.6 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.5 1996/05/24 04:28:55 jimz
+ * release cleanup ckpt
+ *
+ * Revision 1.4 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.3 1995/12/06 20:55:08 wvcii
+ * added prototyping
+ *
+ */
+
+#ifndef _RF__RF_PARITYLOGGINGDAGS_H_
+#define _RF__RF_PARITYLOGGINGDAGS_H_
+
+/* routines that create DAGs */
+void rf_CommonCreateParityLoggingLargeWriteDAG(RF_Raid_t *raidPtr,
+ RF_AccessStripeMap_t *asmap, RF_DagHeader_t *dag_h,
+ void *bp, RF_RaidAccessFlags_t flags, RF_AllocListElem_t *allocList,
+ int nfaults, int (*redFunc)(RF_DagNode_t *));
+void rf_CommonCreateParityLoggingSmallWriteDAG(RF_Raid_t *raidPtr,
+ RF_AccessStripeMap_t *asmap, RF_DagHeader_t *dag_h,
+ void *bp, RF_RaidAccessFlags_t flags, RF_AllocListElem_t *allocList,
+ RF_RedFuncs_t *pfuncs, RF_RedFuncs_t *qfuncs);
+
+void rf_CreateParityLoggingLargeWriteDAG(RF_Raid_t *raidPtr,
+ RF_AccessStripeMap_t *asmap, RF_DagHeader_t *dag_h,
+ void *bp, RF_RaidAccessFlags_t flags, RF_AllocListElem_t *allocList,
+ int nfaults, int (*redFunc)(RF_DagNode_t *));
+void rf_CreateParityLoggingSmallWriteDAG(RF_Raid_t *raidPtr,
+ RF_AccessStripeMap_t *asmap, RF_DagHeader_t *dag_h,
+ void *bp, RF_RaidAccessFlags_t flags, RF_AllocListElem_t *allocList,
+ RF_RedFuncs_t *pfuncs, RF_RedFuncs_t *qfuncs);
+
+#endif /* !_RF__RF_PARITYLOGGINGDAGS_H_ */
diff --git a/sys/dev/raidframe/rf_parityscan.c b/sys/dev/raidframe/rf_parityscan.c
new file mode 100644
index 00000000000..3e6086873be
--- /dev/null
+++ b/sys/dev/raidframe/rf_parityscan.c
@@ -0,0 +1,553 @@
+/* $OpenBSD: rf_parityscan.c,v 1.1 1999/01/11 14:29:37 niklas Exp $ */
+/* $NetBSD: rf_parityscan.c,v 1.1 1998/11/13 04:20:32 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*****************************************************************************
+ *
+ * rf_parityscan.c -- misc utilities related to parity verification
+ *
+ *****************************************************************************/
+
+/*
+ * :
+ * Log: rf_parityscan.c,v
+ * Revision 1.47 1996/08/20 20:35:01 jimz
+ * change diagnostic string in rewrite
+ *
+ * Revision 1.46 1996/08/20 20:03:19 jimz
+ * fixed parity rewrite to actually use arch-specific parity stuff
+ * (this ever worked... how?)
+ *
+ * Revision 1.45 1996/08/16 17:41:25 jimz
+ * allow rewrite parity on any fault-tolerant arch
+ *
+ * Revision 1.44 1996/07/28 20:31:39 jimz
+ * i386netbsd port
+ * true/false fixup
+ *
+ * Revision 1.43 1996/07/27 23:36:08 jimz
+ * Solaris port of simulator
+ *
+ * Revision 1.42 1996/07/22 21:12:01 jimz
+ * clean up parity scan status printing
+ *
+ * Revision 1.41 1996/07/22 19:52:16 jimz
+ * switched node params to RF_DagParam_t, a union of
+ * a 64-bit int and a void *, for better portability
+ * attempted hpux port, but failed partway through for
+ * lack of a single C compiler capable of compiling all
+ * source files
+ *
+ * Revision 1.40 1996/07/13 00:00:59 jimz
+ * sanitized generalized reconstruction architecture
+ * cleaned up head sep, rbuf problems
+ *
+ * Revision 1.39 1996/07/09 21:44:26 jimz
+ * fix bogus return code in VerifyParityBasic when a stripe can't be corrected
+ *
+ * Revision 1.38 1996/06/20 17:56:57 jimz
+ * update VerifyParity to check complete AccessStripeMaps
+ *
+ * Revision 1.37 1996/06/19 22:23:01 jimz
+ * parity verification is now a layout-configurable thing
+ * not all layouts currently support it (correctly, anyway)
+ *
+ * Revision 1.36 1996/06/09 02:36:46 jimz
+ * lots of little crufty cleanup- fixup whitespace
+ * issues, comment #ifdefs, improve typing in some
+ * places (esp size-related)
+ *
+ * Revision 1.35 1996/06/07 22:26:27 jimz
+ * type-ify which_ru (RF_ReconUnitNum_t)
+ *
+ * Revision 1.34 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.33 1996/06/05 18:06:02 jimz
+ * Major code cleanup. The Great Renaming is now done.
+ * Better modularity. Better typing. Fixed a bunch of
+ * synchronization bugs. Made a lot of global stuff
+ * per-desc or per-array. Removed dead code.
+ *
+ * Revision 1.32 1996/06/02 17:31:48 jimz
+ * Moved a lot of global stuff into array structure, where it belongs.
+ * Fixed up paritylogging, pss modules in this manner. Some general
+ * code cleanup. Removed lots of dead code, some dead files.
+ *
+ * Revision 1.31 1996/05/31 22:26:54 jimz
+ * fix a lot of mapping problems, memory allocation problems
+ * found some weird lock issues, fixed 'em
+ * more code cleanup
+ *
+ * Revision 1.30 1996/05/30 23:22:16 jimz
+ * bugfixes of serialization, timing problems
+ * more cleanup
+ *
+ * Revision 1.29 1996/05/30 12:59:18 jimz
+ * make etimer happier, more portable
+ *
+ * Revision 1.28 1996/05/30 11:29:41 jimz
+ * Numerous bug fixes. Stripe lock release code disagreed with the taking code
+ * about when stripes should be locked (I made it consistent: no parity, no lock)
+ * There was a lot of extra serialization of I/Os which I've removed- a lot of
+ * it was to calculate values for the cache code, which is no longer with us.
+ * More types, function, macro cleanup. Added code to properly quiesce the array
+ * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
+ * before. Fixed memory allocation, freeing bugs.
+ *
+ * Revision 1.27 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.26 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.25 1996/05/24 04:28:55 jimz
+ * release cleanup ckpt
+ *
+ * Revision 1.24 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.23 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.22 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.21 1996/05/08 21:01:24 jimz
+ * fixed up enum type names that were conflicting with other
+ * enums and function names (ie, "panic")
+ * future naming trends will be towards RF_ and rf_ for
+ * everything raidframe-related
+ *
+ * Revision 1.20 1995/12/12 18:10:06 jimz
+ * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT
+ * fix 80-column brain damage in comments
+ *
+ * Revision 1.19 1995/11/30 16:16:49 wvcii
+ * added copyright info
+ *
+ * Revision 1.18 1995/11/19 16:32:19 wvcii
+ * eliminated initialization of dag header fields which no longer exist
+ * (numDags, numDagsDone, firstHdr)
+ *
+ * Revision 1.17 1995/11/07 16:23:36 wvcii
+ * added comments, asserts, and prototypes
+ * encoded commit point nodes, barrier, and antecedents types into dags
+ *
+ */
+
+#include "rf_types.h"
+#include "rf_raid.h"
+#include "rf_dag.h"
+#include "rf_dagfuncs.h"
+#include "rf_dagutils.h"
+#include "rf_mcpair.h"
+#include "rf_general.h"
+#include "rf_engine.h"
+#include "rf_parityscan.h"
+#include "rf_map.h"
+#include "rf_sys.h"
+
+/*****************************************************************************************
+ *
+ * walk through the entire array and write new parity.
+ * This works by creating two DAGs, one to read a stripe of data and one to
+ * write new parity. The first is executed, the data is xored together, and
+ * then the second is executed. To avoid constantly building and tearing down
+ * the DAGs, we create them a priori and fill them in with the mapping
+ * information as we go along.
+ *
+ * there should never be more than one thread running this.
+ *
+ ****************************************************************************************/
+
+int rf_RewriteParity(raidPtr)
+ RF_Raid_t *raidPtr;
+{
+ RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
+ RF_AccessStripeMapHeader_t *asm_h;
+ int old_pctg, new_pctg, rc;
+ RF_PhysDiskAddr_t pda;
+ RF_SectorNum_t i;
+
+ pda.startSector = 0;
+ pda.numSector = raidPtr->Layout.sectorsPerStripeUnit;
+ old_pctg = -1;
+
+/* rf_verifyParityDebug=1; */
+ for (i=0; i<raidPtr->totalSectors; i+=layoutPtr->dataSectorsPerStripe) {
+ asm_h = rf_MapAccess(raidPtr, i, layoutPtr->dataSectorsPerStripe, NULL, RF_DONT_REMAP);
+ rc = rf_VerifyParity(raidPtr, asm_h->stripeMap, 1, 0);
+ /* printf("Parity verified: rc=%d\n",rc); */
+ switch (rc) {
+ case RF_PARITY_OKAY:
+ case RF_PARITY_CORRECTED:
+ break;
+ case RF_PARITY_BAD:
+ printf("Parity bad during correction\n");
+ RF_PANIC();
+ break;
+ case RF_PARITY_COULD_NOT_CORRECT:
+ printf("Could not correct bad parity\n");
+ RF_PANIC();
+ break;
+ case RF_PARITY_COULD_NOT_VERIFY:
+ printf("Could not verify parity\n");
+ RF_PANIC();
+ break;
+ default:
+ printf("Bad rc=%d from VerifyParity in RewriteParity\n", rc);
+ RF_PANIC();
+ }
+ rf_FreeAccessStripeMap(asm_h);
+ new_pctg = i*1000/raidPtr->totalSectors;
+ if (new_pctg != old_pctg) {
+#ifndef KERNEL
+ fprintf(stderr,"\rParity rewrite: %d.%d%% complete",
+ new_pctg/10, new_pctg%10);
+ fflush(stderr);
+#endif /* !KERNEL */
+ }
+ old_pctg = new_pctg;
+ }
+#ifndef KERNEL
+ fprintf(stderr,"\rParity rewrite: 100.0%% complete\n");
+#endif /* !KERNEL */
+#if 1
+ return(0); /* XXX nothing was here.. GO */
+#endif
+}
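
A small standalone sketch of just the progress-reporting arithmetic used in the loop above: percentages are carried in tenths of a percent (0..1000) so that integer math is enough. The array and stripe sizes below are hypothetical.

/* Sketch only: the tenths-of-a-percent progress math from rf_RewriteParity,
 * with hypothetical array/stripe sizes. */
#include <stdio.h>

int main(void)
{
        long long totalSectors = 17772160;      /* hypothetical */
        long long dataSectorsPerStripe = 256;   /* hypothetical */
        long long i;
        int old_pctg = -1, new_pctg;

        for (i = 0; i < totalSectors; i += dataSectorsPerStripe) {
                new_pctg = (int)(i * 1000 / totalSectors);
                if (new_pctg != old_pctg)
                        fprintf(stderr, "\rParity rewrite: %d.%d%% complete",
                            new_pctg / 10, new_pctg % 10);
                old_pctg = new_pctg;
        }
        fprintf(stderr, "\rParity rewrite: 100.0%% complete\n");
        return 0;
}
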
+
+/*****************************************************************************************
+ *
+ * verify that the parity in a particular stripe is correct.
+ * we validate only the range of parity defined by parityPDA, since
+ * this is all we have locked. The way we do this is to create an asm
+ * that maps the whole stripe and then range-restrict it to the parity
+ * region defined by the parityPDA.
+ *
+ ****************************************************************************************/
+int rf_VerifyParity(raidPtr, aasm, correct_it, flags)
+ RF_Raid_t *raidPtr;
+ RF_AccessStripeMap_t *aasm;
+ int correct_it;
+ RF_RaidAccessFlags_t flags;
+{
+ RF_PhysDiskAddr_t *parityPDA;
+ RF_AccessStripeMap_t *doasm;
+ RF_LayoutSW_t *lp;
+ int lrc, rc;
+
+ lp = raidPtr->Layout.map;
+ if (lp->faultsTolerated == 0) {
+ /*
+ * There isn't any parity. Call it "okay."
+ */
+ return(RF_PARITY_OKAY);
+ }
+ rc = RF_PARITY_OKAY;
+ if (lp->VerifyParity) {
+ for(doasm=aasm;doasm;doasm=doasm->next) {
+ for(parityPDA=doasm->parityInfo;parityPDA;parityPDA=parityPDA->next) {
+ lrc = lp->VerifyParity(raidPtr, doasm->raidAddress, parityPDA,
+ correct_it, flags);
+ if (lrc > rc) {
+ /* see rf_parityscan.h for why this works */
+ rc = lrc;
+ }
+ }
+ }
+ }
+ else {
+ rc = RF_PARITY_COULD_NOT_VERIFY;
+ }
+ return(rc);
+}
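
The "lrc > rc" comparison above works because the RF_PARITY_* return codes are ordered by severity (see the defines in rf_parityscan.h later in this patch), so keeping the maximum yields the worst result seen across the whole access. A minimal sketch of that convention:

/* Sketch: ordered severity codes, worst result wins.  The values mirror the
 * RF_PARITY_* defines in rf_parityscan.h. */
#include <stdio.h>

#define PARITY_OKAY              0
#define PARITY_CORRECTED         1
#define PARITY_BAD               2
#define PARITY_COULD_NOT_CORRECT 3
#define PARITY_COULD_NOT_VERIFY  4

int main(void)
{
        int results[] = { PARITY_OKAY, PARITY_CORRECTED, PARITY_OKAY };
        int rc = PARITY_OKAY;
        int i;

        for (i = 0; i < (int)(sizeof(results) / sizeof(results[0])); i++)
                if (results[i] > rc)
                        rc = results[i];        /* keep the most severe code */
        printf("overall rc = %d\n", rc);        /* 1 == corrected */
        return 0;
}
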
+
+int rf_VerifyParityBasic(raidPtr, raidAddr, parityPDA, correct_it, flags)
+ RF_Raid_t *raidPtr;
+ RF_RaidAddr_t raidAddr;
+ RF_PhysDiskAddr_t *parityPDA;
+ int correct_it;
+ RF_RaidAccessFlags_t flags;
+{
+ RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
+ RF_RaidAddr_t startAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, raidAddr);
+ RF_SectorCount_t numsector = parityPDA->numSector;
+ int numbytes = rf_RaidAddressToByte(raidPtr, numsector);
+ int bytesPerStripe = numbytes * layoutPtr->numDataCol;
+ RF_DagHeader_t *rd_dag_h, *wr_dag_h; /* read, write dag */
+ RF_DagNode_t *blockNode, *unblockNode, *wrBlock, *wrUnblock;
+ RF_AccessStripeMapHeader_t *asm_h;
+ RF_AccessStripeMap_t *asmap;
+ RF_AllocListElem_t *alloclist;
+ RF_PhysDiskAddr_t *pda;
+ char *pbuf, *buf, *end_p, *p;
+ int i, retcode;
+ RF_ReconUnitNum_t which_ru;
+ RF_StripeNum_t psID = rf_RaidAddressToParityStripeID(layoutPtr, raidAddr, &which_ru);
+ int stripeWidth = layoutPtr->numDataCol + layoutPtr->numParityCol;
+ RF_AccTraceEntry_t tracerec;
+ RF_MCPair_t *mcpair;
+
+ retcode = RF_PARITY_OKAY;
+
+ mcpair = rf_AllocMCPair();
+ rf_MakeAllocList(alloclist);
+ RF_MallocAndAdd(buf, numbytes * (layoutPtr->numDataCol + layoutPtr->numParityCol), (char *), alloclist);
+ RF_CallocAndAdd(pbuf, 1, numbytes, (char *), alloclist); /* use calloc to make sure buffer is zeroed */
+ end_p = buf + bytesPerStripe;
+
+ rd_dag_h = rf_MakeSimpleDAG(raidPtr, stripeWidth, numbytes, buf, rf_DiskReadFunc, rf_DiskReadUndoFunc,
+ "Rod", alloclist, flags, RF_IO_NORMAL_PRIORITY);
+ blockNode = rd_dag_h->succedents[0];
+ unblockNode = blockNode->succedents[0]->succedents[0];
+
+ /* map the stripe and fill in the PDAs in the dag */
+ asm_h = rf_MapAccess(raidPtr, startAddr, layoutPtr->dataSectorsPerStripe, buf, RF_DONT_REMAP);
+ asmap = asm_h->stripeMap;
+
+ for (pda=asmap->physInfo,i=0; i<layoutPtr->numDataCol; i++,pda=pda->next) {
+ RF_ASSERT(pda);
+ rf_RangeRestrictPDA(raidPtr, parityPDA, pda, 0, 1);
+ RF_ASSERT(pda->numSector != 0);
+ if (rf_TryToRedirectPDA(raidPtr, pda, 0)) goto out; /* no way to verify parity if disk is dead. return w/ good status */
+ blockNode->succedents[i]->params[0].p = pda;
+ blockNode->succedents[i]->params[2].v = psID;
+ blockNode->succedents[i]->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ }
+
+ RF_ASSERT(!asmap->parityInfo->next);
+ rf_RangeRestrictPDA(raidPtr, parityPDA, asmap->parityInfo, 0, 1);
+ RF_ASSERT(asmap->parityInfo->numSector != 0);
+ if (rf_TryToRedirectPDA(raidPtr, asmap->parityInfo, 1))
+ goto out;
+ blockNode->succedents[layoutPtr->numDataCol]->params[0].p = asmap->parityInfo;
+
+ /* fire off the DAG */
+ bzero((char *)&tracerec,sizeof(tracerec));
+ rd_dag_h->tracerec = &tracerec;
+
+ if (rf_verifyParityDebug) {
+ printf("Parity verify read dag:\n");
+ rf_PrintDAGList(rd_dag_h);
+ }
+
+ RF_LOCK_MUTEX(mcpair->mutex);
+ mcpair->flag = 0;
+ rf_DispatchDAG(rd_dag_h, (void (*)(void *))rf_MCPairWakeupFunc,
+ (void *) mcpair);
+ while (!mcpair->flag)
+ RF_WAIT_COND(mcpair->cond, mcpair->mutex);
+ RF_UNLOCK_MUTEX(mcpair->mutex);
+ if (rd_dag_h->status != rf_enable) {
+ RF_ERRORMSG("Unable to verify parity: can't read the stripe\n");
+ retcode = RF_PARITY_COULD_NOT_VERIFY;
+ goto out;
+ }
+
+ for (p=buf; p<end_p; p+=numbytes) {
+ rf_bxor(p, pbuf, numbytes, NULL);
+ }
+ for (i=0; i<numbytes; i++) {
+#if 0
+ if (pbuf[i]!=0 || buf[bytesPerStripe+i]!=0) {
+ printf("Bytes: %d %d %d\n",i,pbuf[i],buf[bytesPerStripe+i]);
+ }
+#endif
+ if (pbuf[i] != buf[bytesPerStripe+i]) {
+ if (!correct_it)
+ RF_ERRORMSG3("Parity verify error: byte %d of parity is 0x%x should be 0x%x\n",
+ i,(u_char) buf[bytesPerStripe+i],(u_char) pbuf[i]);
+ retcode = RF_PARITY_BAD;
+ break;
+ }
+ }
+
+ if (retcode && correct_it) {
+ wr_dag_h = rf_MakeSimpleDAG(raidPtr, 1, numbytes, pbuf, rf_DiskWriteFunc, rf_DiskWriteUndoFunc,
+ "Wnp", alloclist, flags, RF_IO_NORMAL_PRIORITY);
+ wrBlock = wr_dag_h->succedents[0]; wrUnblock = wrBlock->succedents[0]->succedents[0];
+ wrBlock->succedents[0]->params[0].p = asmap->parityInfo;
+ wrBlock->succedents[0]->params[2].v = psID;
+ wrBlock->succedents[0]->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ bzero((char *)&tracerec,sizeof(tracerec));
+ wr_dag_h->tracerec = &tracerec;
+ if (rf_verifyParityDebug) {
+ printf("Parity verify write dag:\n");
+ rf_PrintDAGList(wr_dag_h);
+ }
+ RF_LOCK_MUTEX(mcpair->mutex);
+ mcpair->flag = 0;
+ rf_DispatchDAG(wr_dag_h, (void (*)(void *))rf_MCPairWakeupFunc,
+ (void *) mcpair);
+ while (!mcpair->flag)
+ RF_WAIT_COND(mcpair->cond, mcpair->mutex);
+ RF_UNLOCK_MUTEX(mcpair->mutex);
+ if (wr_dag_h->status != rf_enable) {
+ RF_ERRORMSG("Unable to correct parity in VerifyParity: can't write the stripe\n");
+ retcode = RF_PARITY_COULD_NOT_CORRECT;
+ }
+ rf_FreeDAG(wr_dag_h);
+ if (retcode == RF_PARITY_BAD)
+ retcode = RF_PARITY_CORRECTED;
+ }
+
+out:
+ rf_FreeAccessStripeMap(asm_h);
+ rf_FreeAllocList(alloclist);
+ rf_FreeDAG(rd_dag_h);
+ rf_FreeMCPair(mcpair);
+ return(retcode);
+}
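
The heart of the verification above is a plain XOR check: XOR the data columns together and compare the result byte for byte with the stored parity column. A minimal standalone sketch of that check, assuming three data columns and an 8-byte stripe unit:

/* Sketch only: XOR-parity verification over toy buffers. */
#include <stdio.h>
#include <string.h>

int main(void)
{
        unsigned char d0[8] = {1, 2, 3, 4, 5, 6, 7, 8};
        unsigned char d1[8] = {8, 7, 6, 5, 4, 3, 2, 1};
        unsigned char d2[8] = {0, 0, 0xff, 0, 0, 0xff, 0, 0};
        unsigned char parity[8], check[8];
        int i, bad = 0;

        for (i = 0; i < 8; i++)                 /* what a correct write stored */
                parity[i] = d0[i] ^ d1[i] ^ d2[i];
        parity[3] ^= 0x40;                      /* simulate one corrupt parity byte */

        memset(check, 0, sizeof(check));
        for (i = 0; i < 8; i++)                 /* recompute from the data columns */
                check[i] = d0[i] ^ d1[i] ^ d2[i];
        for (i = 0; i < 8; i++)
                if (check[i] != parity[i]) {
                        printf("byte %d: parity 0x%02x should be 0x%02x\n",
                            i, parity[i], check[i]);
                        bad = 1;
                }
        printf(bad ? "parity BAD\n" : "parity OKAY\n");
        return 0;
}
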
+
+int rf_TryToRedirectPDA(raidPtr, pda, parity)
+ RF_Raid_t *raidPtr;
+ RF_PhysDiskAddr_t *pda;
+ int parity;
+{
+ if (raidPtr->Disks[pda->row][pda->col].status == rf_ds_reconstructing) {
+ if (rf_CheckRUReconstructed(raidPtr->reconControl[pda->row]->reconMap, pda->startSector)) {
+ if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
+ RF_RowCol_t or = pda->row, oc = pda->col;
+ RF_SectorNum_t os = pda->startSector;
+ if (parity) {
+ (raidPtr->Layout.map->MapParity)(raidPtr, pda->raidAddress, &pda->row, &pda->col, &pda->startSector, RF_REMAP);
+ if (rf_verifyParityDebug) printf("VerifyParity: Redir P r %d c %d sect %ld -> r %d c %d sect %ld\n",
+ or,oc,(long)os,pda->row,pda->col,(long)pda->startSector);
+ } else {
+ (raidPtr->Layout.map->MapSector)(raidPtr, pda->raidAddress, &pda->row, &pda->col, &pda->startSector, RF_REMAP);
+ if (rf_verifyParityDebug) printf("VerifyParity: Redir D r %d c %d sect %ld -> r %d c %d sect %ld\n",
+ or,oc,(long)os,pda->row,pda->col,(long)pda->startSector);
+ }
+ } else {
+ RF_RowCol_t spRow = raidPtr->Disks[pda->row][pda->col].spareRow;
+ RF_RowCol_t spCol = raidPtr->Disks[pda->row][pda->col].spareCol;
+ pda->row = spRow;
+ pda->col = spCol;
+ }
+ }
+ }
+ if (RF_DEAD_DISK(raidPtr->Disks[pda->row][pda->col].status)) return(1);
+ return(0);
+}
+
+/*****************************************************************************************
+ *
+ * currently a stub.
+ *
+ * takes as input an ASM describing a write operation and containing one failure, and
+ * verifies that the parity was correctly updated to reflect the write.
+ *
+ * if it's a data unit that's failed, we read the other data units in the stripe and
+ * the parity unit, XOR them together, and verify that we get the data intended for
+ * the failed disk. Since it's easy, we also validate that the right data got written
+ * to the surviving data disks.
+ *
+ * If it's the parity that failed, there's really no validation we can do except the
+ * above verification that the right data got written to all disks. This is because
+ * the new data intended for the failed disk is supplied in the ASM, but this is of
+ * course not the case for the new parity.
+ *
+ ****************************************************************************************/
+int rf_VerifyDegrModeWrite(raidPtr, asmh)
+ RF_Raid_t *raidPtr;
+ RF_AccessStripeMapHeader_t *asmh;
+{
+ return(0);
+}
+
+/* creates a simple DAG with a header, a block-recon node at level 1,
+ * nNodes nodes at level 2, an unblock-recon node at level 3, and
+ * a terminator node at level 4. The stripe address field in
+ * the block and unblock nodes are not touched, nor are the pda
+ * fields in the second-level nodes, so they must be filled in later.
+ *
+ * commit point is established at unblock node - this means that any
+ * failure during dag execution causes the dag to fail
+ */
+RF_DagHeader_t *rf_MakeSimpleDAG(raidPtr, nNodes, bytesPerSU, databuf, doFunc, undoFunc, name, alloclist, flags, priority)
+ RF_Raid_t *raidPtr;
+ int nNodes;
+ int bytesPerSU;
+ char *databuf;
+ int (*doFunc)(RF_DagNode_t *node);
+ int (*undoFunc)(RF_DagNode_t *node);
+ char *name; /* node names at the second level */
+ RF_AllocListElem_t *alloclist;
+ RF_RaidAccessFlags_t flags;
+ int priority;
+{
+ RF_DagHeader_t *dag_h;
+ RF_DagNode_t *nodes, *termNode, *blockNode, *unblockNode;
+ int i;
+
+ /* create the nodes, the block & unblock nodes, and the terminator node */
+ RF_CallocAndAdd(nodes, nNodes+3, sizeof(RF_DagNode_t), (RF_DagNode_t *), alloclist);
+ blockNode = &nodes[nNodes];
+ unblockNode = blockNode+1;
+ termNode = unblockNode+1;
+
+ dag_h = rf_AllocDAGHeader();
+ dag_h->raidPtr = (void *) raidPtr;
+ dag_h->allocList = NULL; /* we won't use this alloc list */
+ dag_h->status = rf_enable;
+ dag_h->numSuccedents = 1;
+ dag_h->creator = "SimpleDAG";
+
+ /* this dag can not commit until the unblock node is reached
+ * errors prior to the commit point imply the dag has failed
+ */
+ dag_h->numCommitNodes = 1;
+ dag_h->numCommits = 0;
+
+ dag_h->succedents[0] = blockNode;
+ rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, nNodes, 0, 0, 0, dag_h, "Nil", alloclist);
+ rf_InitNode(unblockNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc, NULL, 1, nNodes, 0, 0, dag_h, "Nil", alloclist);
+ unblockNode->succedents[0] = termNode;
+ for (i=0; i<nNodes; i++) {
+ blockNode->succedents[i] = unblockNode->antecedents[i] = &nodes[i];
+ unblockNode->antType[i] = rf_control;
+ rf_InitNode(&nodes[i], rf_wait, RF_FALSE, doFunc, undoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h, name, alloclist);
+ nodes[i].succedents[0] = unblockNode;
+ nodes[i].antecedents[0] = blockNode;
+ nodes[i].antType[0] = rf_control;
+ nodes[i].params[1].p = (databuf + (i*bytesPerSU));
+ }
+ rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm", alloclist);
+ termNode->antecedents[0] = unblockNode;
+ termNode->antType[0] = rf_control;
+ return(dag_h);
+}
diff --git a/sys/dev/raidframe/rf_parityscan.h b/sys/dev/raidframe/rf_parityscan.h
new file mode 100644
index 00000000000..44aec7e2ca6
--- /dev/null
+++ b/sys/dev/raidframe/rf_parityscan.h
@@ -0,0 +1,118 @@
+/* $OpenBSD: rf_parityscan.h,v 1.1 1999/01/11 14:29:38 niklas Exp $ */
+/* $NetBSD: rf_parityscan.h,v 1.1 1998/11/13 04:20:32 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* :
+ * Log: rf_parityscan.h,v
+ * Revision 1.14 1996/07/05 18:01:12 jimz
+ * don't make parity protos ndef KERNEL
+ *
+ * Revision 1.13 1996/06/20 17:41:43 jimz
+ * change decl for VerifyParity
+ *
+ * Revision 1.12 1996/06/20 15:38:39 jimz
+ * renumber parityscan return codes
+ *
+ * Revision 1.11 1996/06/19 22:23:01 jimz
+ * parity verification is now a layout-configurable thing
+ * not all layouts currently support it (correctly, anyway)
+ *
+ * Revision 1.10 1996/06/09 02:36:46 jimz
+ * lots of little crufty cleanup- fixup whitespace
+ * issues, comment #ifdefs, improve typing in some
+ * places (esp size-related)
+ *
+ * Revision 1.9 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.8 1996/06/02 17:31:48 jimz
+ * Moved a lot of global stuff into array structure, where it belongs.
+ * Fixed up paritylogging, pss modules in this manner. Some general
+ * code cleanup. Removed lots of dead code, some dead files.
+ *
+ * Revision 1.7 1996/05/31 22:26:54 jimz
+ * fix a lot of mapping problems, memory allocation problems
+ * found some weird lock issues, fixed 'em
+ * more code cleanup
+ *
+ * Revision 1.6 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.5 1996/05/24 04:28:55 jimz
+ * release cleanup ckpt
+ *
+ * Revision 1.4 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.3 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.2 1995/11/30 16:20:46 wvcii
+ * added copyright info
+ *
+ */
+
+#ifndef _RF__RF_PARITYSCAN_H_
+#define _RF__RF_PARITYSCAN_H_
+
+#include "rf_types.h"
+#include "rf_alloclist.h"
+
+int rf_RewriteParity(RF_Raid_t *raidPtr);
+int rf_VerifyParityBasic(RF_Raid_t *raidPtr, RF_RaidAddr_t raidAddr,
+ RF_PhysDiskAddr_t *parityPDA, int correct_it, RF_RaidAccessFlags_t flags);
+int rf_VerifyParity(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *stripeMap,
+ int correct_it, RF_RaidAccessFlags_t flags);
+int rf_TryToRedirectPDA(RF_Raid_t *raidPtr, RF_PhysDiskAddr_t *pda, int parity);
+int rf_VerifyDegrModeWrite(RF_Raid_t *raidPtr, RF_AccessStripeMapHeader_t *asmh);
+RF_DagHeader_t *rf_MakeSimpleDAG(RF_Raid_t *raidPtr, int nNodes,
+ int bytesPerSU, char *databuf,
+ int (*doFunc)(RF_DagNode_t *),
+ int (*undoFunc)(RF_DagNode_t *),
+ char *name, RF_AllocListElem_t *alloclist,
+ RF_RaidAccessFlags_t flags, int priority);
+
+#define RF_DO_CORRECT_PARITY 1
+#define RF_DONT_CORRECT_PARITY 0
+
+/*
+ * Return vals for VerifyParity operation
+ *
+ * Ordering is important here.
+ */
+#define RF_PARITY_OKAY 0 /* or no parity information */
+#define RF_PARITY_CORRECTED 1
+#define RF_PARITY_BAD 2
+#define RF_PARITY_COULD_NOT_CORRECT 3
+#define RF_PARITY_COULD_NOT_VERIFY 4
+
+#endif /* !_RF__RF_PARITYSCAN_H_ */
diff --git a/sys/dev/raidframe/rf_pq.c b/sys/dev/raidframe/rf_pq.c
new file mode 100644
index 00000000000..ebbc7917b26
--- /dev/null
+++ b/sys/dev/raidframe/rf_pq.c
@@ -0,0 +1,1026 @@
+/* $OpenBSD: rf_pq.c,v 1.1 1999/01/11 14:29:38 niklas Exp $ */
+/* $NetBSD: rf_pq.c,v 1.1 1998/11/13 04:20:32 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Daniel Stodolsky
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*
+ * Code for RAID level 6 (P + Q) disk array architecture.
+ *
+ * :
+ * Log: rf_pq.c,v
+ * Revision 1.33 1996/11/05 21:10:40 jimz
+ * failed pda generalization
+ *
+ * Revision 1.32 1996/07/31 16:29:50 jimz
+ * "fix" math on 32-bit machines using RF_LONGSHIFT
+ * (may be incorrect)
+ *
+ * Revision 1.31 1996/07/31 15:35:01 jimz
+ * evenodd changes; bugfixes for double-degraded archs, generalize
+ * some formerly PQ-only functions
+ *
+ * Revision 1.30 1996/07/27 23:36:08 jimz
+ * Solaris port of simulator
+ *
+ * Revision 1.29 1996/07/22 19:52:16 jimz
+ * switched node params to RF_DagParam_t, a union of
+ * a 64-bit int and a void *, for better portability
+ * attempted hpux port, but failed partway through for
+ * lack of a single C compiler capable of compiling all
+ * source files
+ *
+ * Revision 1.28 1996/06/09 02:36:46 jimz
+ * lots of little crufty cleanup- fixup whitespace
+ * issues, comment #ifdefs, improve typing in some
+ * places (esp size-related)
+ *
+ * Revision 1.27 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.26 1996/06/02 17:31:48 jimz
+ * Moved a lot of global stuff into array structure, where it belongs.
+ * Fixed up paritylogging, pss modules in this manner. Some general
+ * code cleanup. Removed lots of dead code, some dead files.
+ *
+ * Revision 1.25 1996/05/31 22:26:54 jimz
+ * fix a lot of mapping problems, memory allocation problems
+ * found some weird lock issues, fixed 'em
+ * more code cleanup
+ *
+ * Revision 1.24 1996/05/30 23:22:16 jimz
+ * bugfixes of serialization, timing problems
+ * more cleanup
+ *
+ * Revision 1.23 1996/05/30 12:59:18 jimz
+ * make etimer happier, more portable
+ *
+ * Revision 1.22 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.21 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.20 1996/05/24 04:28:55 jimz
+ * release cleanup ckpt
+ *
+ * Revision 1.19 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.18 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.17 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.16 1996/05/17 14:52:04 wvcii
+ * added prototyping to QDelta()
+ * - changed buf params from volatile unsigned long * to char *
+ * changed QDelta for kernel
+ * - just bzero the buf since kernel doesn't include pq decode table
+ *
+ * Revision 1.15 1996/05/03 19:40:20 wvcii
+ * added includes for dag library
+ *
+ * Revision 1.14 1995/12/12 18:10:06 jimz
+ * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT
+ * fix 80-column brain damage in comments
+ *
+ * Revision 1.13 1995/11/30 16:19:55 wvcii
+ * added copyright info
+ *
+ * Revision 1.12 1995/11/07 16:13:47 wvcii
+ * changed PQDagSelect prototype
+ * function no longer returns numHdrSucc, numTermAnt
+ * note: this file contains node functions which should be
+ * moved to rf_dagfuncs.c so that all node funcs are bundled together
+ *
+ * Revision 1.11 1995/10/04 03:50:33 wvcii
+ * removed panics, minor code cleanup in dag selection
+ *
+ *
+ */
+
+#include "rf_archs.h"
+#include "rf_types.h"
+#include "rf_raid.h"
+#include "rf_dag.h"
+#include "rf_dagffrd.h"
+#include "rf_dagffwr.h"
+#include "rf_dagdegrd.h"
+#include "rf_dagdegwr.h"
+#include "rf_dagutils.h"
+#include "rf_dagfuncs.h"
+#include "rf_threadid.h"
+#include "rf_etimer.h"
+#include "rf_pqdeg.h"
+#include "rf_general.h"
+#include "rf_map.h"
+#include "rf_pq.h"
+#include "rf_sys.h"
+
+RF_RedFuncs_t rf_pFuncs = { rf_RegularONPFunc, "Regular Old-New P", rf_SimpleONPFunc, "Simple Old-New P" };
+RF_RedFuncs_t rf_pRecoveryFuncs = { rf_RecoveryPFunc, "Recovery P Func", rf_RecoveryPFunc, "Recovery P Func" };
+
+int rf_RegularONPFunc(node)
+ RF_DagNode_t *node;
+{
+ return(rf_RegularXorFunc(node));
+}
+
+/*
+ same as simpleONQ func, but the coefficient is always 1
+*/
+
+int rf_SimpleONPFunc(node)
+ RF_DagNode_t *node;
+{
+ return(rf_SimpleXorFunc(node));
+}
+
+int rf_RecoveryPFunc(node)
+RF_DagNode_t *node;
+{
+ return(rf_RecoveryXorFunc(node));
+}
+
+int rf_RegularPFunc(node)
+ RF_DagNode_t *node;
+{
+ return(rf_RegularXorFunc(node));
+}
+
+#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
+
+static void QDelta(char *dest, char *obuf, char *nbuf, unsigned length,
+ unsigned char coeff);
+static void rf_InvertQ(unsigned long *qbuf, unsigned long *abuf,
+ unsigned length, unsigned coeff);
+
+RF_RedFuncs_t rf_qFuncs = { rf_RegularONQFunc, "Regular Old-New Q", rf_SimpleONQFunc, "Simple Old-New Q" };
+RF_RedFuncs_t rf_qRecoveryFuncs = { rf_RecoveryQFunc, "Recovery Q Func", rf_RecoveryQFunc, "Recovery Q Func" };
+RF_RedFuncs_t rf_pqRecoveryFuncs = { rf_RecoveryPQFunc, "Recovery PQ Func", rf_RecoveryPQFunc, "Recovery PQ Func" };
+
+void rf_PQDagSelect(
+ RF_Raid_t *raidPtr,
+ RF_IoType_t type,
+ RF_AccessStripeMap_t *asmap,
+ RF_VoidFuncPtr *createFunc)
+{
+ RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
+ unsigned ndfail = asmap->numDataFailed;
+ unsigned npfail = asmap->numParityFailed;
+ unsigned ntfail = npfail + ndfail;
+
+ RF_ASSERT(RF_IO_IS_R_OR_W(type));
+ if (ntfail > 2)
+ {
+ RF_ERRORMSG("more than two disks failed in a single group! Aborting I/O operation.\n");
+ /* *infoFunc = */ *createFunc = NULL;
+ return;
+ }
+
+ /* ok, we can do this I/O */
+ if (type == RF_IO_TYPE_READ)
+ {
+ switch (ndfail)
+ {
+ case 0:
+ /* fault free read */
+ *createFunc = rf_CreateFaultFreeReadDAG; /* same as raid 5 */
+ break;
+ case 1:
+ /* lost a single data unit */
+ /* two cases:
+ (1) parity is not lost.
+ do a normal raid 5 reconstruct read.
+ (2) parity is lost.
+ do a reconstruct read using "q".
+ */
+ if (ntfail == 2) /* also lost redundancy */
+ {
+ if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_PARITY)
+ *createFunc = rf_PQ_110_CreateReadDAG;
+ else
+ *createFunc = rf_PQ_101_CreateReadDAG;
+ }
+ else
+ {
+ /* P and Q are ok. But is there a failure
+ in some unaccessed data unit?
+ */
+ if (rf_NumFailedDataUnitsInStripe(raidPtr,asmap)==2)
+ *createFunc = rf_PQ_200_CreateReadDAG;
+ else
+ *createFunc = rf_PQ_100_CreateReadDAG;
+ }
+ break;
+ case 2:
+ /* lost two data units */
+ /* *infoFunc = PQOneTwo; */
+ *createFunc = rf_PQ_200_CreateReadDAG;
+ break;
+ }
+ return;
+ }
+
+ /* a write */
+ switch (ntfail)
+ {
+ case 0: /* fault free */
+ if (rf_suppressLocksAndLargeWrites ||
+ (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) && (layoutPtr->numDataCol != 1)) ||
+ (asmap->parityInfo->next != NULL) || (asmap->qInfo->next != NULL) || rf_CheckStripeForFailures(raidPtr, asmap))) {
+
+ *createFunc = rf_PQCreateSmallWriteDAG;
+ }
+ else {
+ *createFunc = rf_PQCreateLargeWriteDAG;
+ }
+ break;
+
+ case 1: /* single disk fault */
+ if (npfail==1)
+ {
+ RF_ASSERT ((asmap->failedPDAs[0]->type == RF_PDA_TYPE_PARITY) || (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q));
+ if (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q)
+ { /* q died, treat like normal mode raid5 write.*/
+ if (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) || (asmap->numStripeUnitsAccessed == 1))
+ || rf_NumFailedDataUnitsInStripe(raidPtr,asmap))
+ *createFunc = rf_PQ_001_CreateSmallWriteDAG;
+ else
+ *createFunc = rf_PQ_001_CreateLargeWriteDAG;
+ }
+ else
+ { /* parity died, small write only updating Q */
+ if (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) || (asmap->numStripeUnitsAccessed == 1))
+ || rf_NumFailedDataUnitsInStripe(raidPtr,asmap))
+ *createFunc = rf_PQ_010_CreateSmallWriteDAG;
+ else
+ *createFunc = rf_PQ_010_CreateLargeWriteDAG;
+ }
+ }
+ else
+ { /* data missing.
+ Do a P reconstruct write if only a single data unit
+ is lost in the stripe, otherwise a PQ reconstruct
+ write. */
+ if (rf_NumFailedDataUnitsInStripe(raidPtr,asmap)==2)
+ *createFunc = rf_PQ_200_CreateWriteDAG;
+ else
+ *createFunc = rf_PQ_100_CreateWriteDAG;
+ }
+ break;
+
+ case 2: /* two disk faults */
+ switch (npfail)
+ {
+ case 2: /* both p and q dead */
+ *createFunc = rf_PQ_011_CreateWriteDAG;
+ break;
+    case 1: /* either p or q dead, plus dead data */
+ RF_ASSERT(asmap->failedPDAs[0]->type == RF_PDA_TYPE_DATA);
+ RF_ASSERT ((asmap->failedPDAs[1]->type == RF_PDA_TYPE_PARITY) || (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q));
+ if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q)
+ *createFunc = rf_PQ_101_CreateWriteDAG;
+ else
+ *createFunc = rf_PQ_110_CreateWriteDAG;
+ break;
+ case 0: /* double data loss */
+ *createFunc = rf_PQ_200_CreateWriteDAG;
+ break;
+ }
+ break;
+
+ default: /* more than 2 disk faults */
+ *createFunc = NULL;
+ RF_PANIC();
+ }
+ return;
+}
+
+/*
+ Used as a stop gap info function
+*/
+static void PQOne(raidPtr, nSucc, nAnte, asmap)
+ RF_Raid_t *raidPtr;
+ int *nSucc;
+ int *nAnte;
+ RF_AccessStripeMap_t *asmap;
+{
+ *nSucc = *nAnte = 1;
+}
+
+static void PQOneTwo(raidPtr, nSucc, nAnte, asmap)
+ RF_Raid_t *raidPtr;
+ int *nSucc;
+ int *nAnte;
+ RF_AccessStripeMap_t *asmap;
+{
+ *nSucc = 1;
+ *nAnte = 2;
+}
+
+RF_CREATE_DAG_FUNC_DECL(rf_PQCreateLargeWriteDAG)
+{
+ rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 2,
+ rf_RegularPQFunc, RF_FALSE);
+}
+
+int rf_RegularONQFunc(node)
+ RF_DagNode_t *node;
+{
+ int np = node->numParams;
+ int d;
+ RF_Raid_t *raidPtr = (RF_Raid_t *)node->params[np-1].p;
+ int i;
+ RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
+ RF_Etimer_t timer;
+ char *qbuf, *qpbuf;
+ char *obuf, *nbuf;
+ RF_PhysDiskAddr_t *old, *new;
+ unsigned long coeff;
+ unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
+
+ RF_ETIMER_START(timer);
+
+ d = (np-3)/4;
+ RF_ASSERT (4*d+3 == np);
+ qbuf = (char *) node->params[2*d+1].p; /* q buffer*/
+ for (i=0; i < d; i++)
+ {
+ old = (RF_PhysDiskAddr_t *) node->params[2*i].p;
+ obuf = (char *) node->params[2*i+1].p;
+ new = (RF_PhysDiskAddr_t *) node->params[2*(d+1+i)].p;
+ nbuf = (char *) node->params[2*(d+1+i)+1].p;
+ RF_ASSERT (new->numSector == old->numSector);
+ RF_ASSERT (new->raidAddress == old->raidAddress);
+ /* the stripe unit within the stripe tells us the coefficient to use
+ for the multiply. */
+ coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),new->raidAddress);
+ /* compute the data unit offset within the column, then add one */
+ coeff = (coeff % raidPtr->Layout.numDataCol);
+ qpbuf = qbuf + rf_RaidAddressToByte(raidPtr,old->startSector % secPerSU);
+ QDelta(qpbuf,obuf,nbuf, rf_RaidAddressToByte(raidPtr, old->numSector),coeff);
+ }
+
+ RF_ETIMER_STOP(timer);
+ RF_ETIMER_EVAL(timer);
+ tracerec->q_us += RF_ETIMER_VAL_US(timer);
+ rf_GenericWakeupFunc(node, 0); /* call wake func explicitly since no I/O in this node */
+ return(0);
+}
+
+/*
+ See the SimpleXORFunc for the difference between a simple and regular func.
+ These Q functions should be used for
+
+ new q = Q(data,old data,old q)
+
+ style updates and not for
+
+ q = ( new data, new data, .... )
+
+ computations.
+
+ The simple q takes 2(2d+1)+1 params, where d is the number
+ of stripes written. The order of params is
+ old data pda_0, old data buffer_0, old data pda_1, old data buffer_1, ... old data pda_d, old data buffer_d
+ [2d] old q pda_0, old q buffer
+  [2d+2] new data pda_0, new data buffer_0, ... new data pda_d, new data buffer_d
+ raidPtr
+*/
+
+int rf_SimpleONQFunc(node)
+ RF_DagNode_t *node;
+{
+ int np = node->numParams;
+ int d;
+ RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np-1].p;
+ int i;
+ RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
+ RF_Etimer_t timer;
+ char *qbuf;
+ char *obuf, *nbuf;
+ RF_PhysDiskAddr_t *old, *new;
+ unsigned long coeff;
+
+ RF_ETIMER_START(timer);
+
+ d = (np-3)/4;
+ RF_ASSERT (4*d+3 == np);
+ qbuf = (char *) node->params[2*d+1].p; /* q buffer*/
+ for (i=0; i < d; i++)
+ {
+ old = (RF_PhysDiskAddr_t *) node->params[2*i].p;
+ obuf = (char *) node->params[2*i+1].p;
+ new = (RF_PhysDiskAddr_t *) node->params[2*(d+1+i)].p;
+ nbuf = (char *) node->params[2*(d+1+i)+1].p;
+ RF_ASSERT (new->numSector == old->numSector);
+ RF_ASSERT (new->raidAddress == old->raidAddress);
+ /* the stripe unit within the stripe tells us the coefficient to use
+ for the multiply. */
+ coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),new->raidAddress);
+ /* compute the data unit offset within the column, then add one */
+ coeff = (coeff % raidPtr->Layout.numDataCol);
+ QDelta(qbuf,obuf,nbuf, rf_RaidAddressToByte(raidPtr, old->numSector),coeff);
+ }
+
+ RF_ETIMER_STOP(timer);
+ RF_ETIMER_EVAL(timer);
+ tracerec->q_us += RF_ETIMER_VAL_US(timer);
+ rf_GenericWakeupFunc(node, 0); /* call wake func explicitly since no I/O in this node */
+ return(0);
+}
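
Both Q functions above unpack their parameters with the same index arithmetic: old data at 2i, old Q at 2d, new data at 2(d+1+i), and raidPtr last. A throwaway sketch that just prints that layout for d == 2:

/* Sketch: the parameter layout assumed by rf_RegularONQFunc/rf_SimpleONQFunc,
 * printed for d == 2 stripe units (np == 4*d + 3 == 11). */
#include <stdio.h>

int main(void)
{
        const int d = 2;
        int i;

        for (i = 0; i < d; i++)
                printf("params[%d], params[%d]: old data pda/buffer %d\n",
                    2 * i, 2 * i + 1, i);
        printf("params[%d], params[%d]: old Q pda/buffer\n", 2 * d, 2 * d + 1);
        for (i = 0; i < d; i++)
                printf("params[%d], params[%d]: new data pda/buffer %d\n",
                    2 * (d + 1 + i), 2 * (d + 1 + i) + 1, i);
        printf("params[%d]: raidPtr (np == %d)\n", 4 * d + 2, 4 * d + 3);
        return 0;
}
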
+
+RF_CREATE_DAG_FUNC_DECL(rf_PQCreateSmallWriteDAG)
+{
+ rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_pFuncs, &rf_qFuncs);
+}
+
+static void RegularQSubr(node,qbuf)
+ RF_DagNode_t *node;
+ char *qbuf;
+{
+ int np = node->numParams;
+ int d;
+ RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np-1].p;
+ unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
+ int i;
+ RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
+ RF_Etimer_t timer;
+ char *obuf, *qpbuf;
+ RF_PhysDiskAddr_t *old;
+ unsigned long coeff;
+
+ RF_ETIMER_START(timer);
+
+ d = (np-1)/2;
+ RF_ASSERT (2*d+1 == np);
+ for (i=0; i < d; i++)
+ {
+ old = (RF_PhysDiskAddr_t *) node->params[2*i].p;
+ obuf = (char *) node->params[2*i+1].p;
+ coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),old->raidAddress);
+ /* compute the data unit offset within the column, then add one */
+ coeff = (coeff % raidPtr->Layout.numDataCol);
+ /* the input buffers may not all be aligned with the start of the
+ stripe. so shift by their sector offset within the stripe unit */
+ qpbuf = qbuf + rf_RaidAddressToByte(raidPtr,old->startSector % secPerSU);
+ rf_IncQ((unsigned long *)qpbuf,(unsigned long *)obuf,rf_RaidAddressToByte(raidPtr, old->numSector),coeff);
+ }
+
+ RF_ETIMER_STOP(timer);
+ RF_ETIMER_EVAL(timer);
+ tracerec->q_us += RF_ETIMER_VAL_US(timer);
+}
+
+/*
+ used in degraded writes.
+*/
+
+static void DegrQSubr(node)
+ RF_DagNode_t *node;
+{
+ int np = node->numParams;
+ int d;
+ RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np-1].p;
+ unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
+ int i;
+ RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
+ RF_Etimer_t timer;
+ char *qbuf = node->results[1];
+ char *obuf, *qpbuf;
+ RF_PhysDiskAddr_t *old;
+ unsigned long coeff;
+ unsigned fail_start;
+ int j;
+
+ old = (RF_PhysDiskAddr_t *)node->params[np-2].p;
+ fail_start = old->startSector % secPerSU;
+
+ RF_ETIMER_START(timer);
+
+ d = (np-2)/2;
+ RF_ASSERT (2*d+2 == np);
+ for (i=0; i < d; i++)
+ {
+ old = (RF_PhysDiskAddr_t *) node->params[2*i].p;
+ obuf = (char *) node->params[2*i+1].p;
+ coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),old->raidAddress);
+ /* compute the data unit offset within the column, then add one */
+ coeff = (coeff % raidPtr->Layout.numDataCol);
+ /* the input buffers may not all be aligned with the start of the
+ stripe. so shift by their sector offset within the stripe unit */
+ j = old->startSector % secPerSU;
+ RF_ASSERT(j >= fail_start);
+ qpbuf = qbuf + rf_RaidAddressToByte(raidPtr,j - fail_start);
+ rf_IncQ((unsigned long *)qpbuf,(unsigned long *)obuf,rf_RaidAddressToByte(raidPtr, old->numSector),coeff);
+ }
+
+ RF_ETIMER_STOP(timer);
+ RF_ETIMER_EVAL(timer);
+ tracerec->q_us += RF_ETIMER_VAL_US(timer);
+}
+
+/*
+ Called by large write code to compute the new parity and the new q.
+
+ structure of the params:
+
+  pda_0, buffer_0, pda_1, buffer_1, ... , pda_d, buffer_d  (d = numDataCol)
+ raidPtr
+
+ for a total of 2d+1 arguments.
+ The result buffers results[0], results[1] are the buffers for the p and q,
+ respectively.
+
+ We compute Q first, then compute P. The P calculation may try to reuse
+ one of the input buffers for its output, so if we computed P first, we would
+ corrupt the input for the q calculation.
+*/
+
+int rf_RegularPQFunc(node)
+ RF_DagNode_t *node;
+{
+ RegularQSubr(node,node->results[1]);
+ return(rf_RegularXorFunc(node)); /* does the wakeup */
+}
+
+int rf_RegularQFunc(node)
+ RF_DagNode_t *node;
+{
+ /* Almost ... adjust Qsubr args */
+ RegularQSubr(node, node->results[0]);
+ rf_GenericWakeupFunc(node, 0); /* call wake func explicitly since no I/O in this node */
+ return(0);
+}
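
The ordering argument in the comment above (Q before P) is an aliasing issue: the P pass is allowed to reuse one of its input buffers as its output, so running it first would destroy data the Q pass still needs. A toy illustration, using simple byte arithmetic as a stand-in for the real P/Q functions:

/* Sketch: why Q is computed before P when the P output may alias an input.
 * The "+ 2*" Q here is a stand-in, not the real GF computation. */
#include <stdio.h>

int main(void)
{
        unsigned char in0[4] = {1, 2, 3, 4};
        unsigned char in1[4] = {4, 3, 2, 1};
        unsigned char *p = in0;                 /* P result reuses in0 */
        unsigned char q[4];
        int i;

        for (i = 0; i < 4; i++)                 /* Q first, while in0 is intact */
                q[i] = (unsigned char)(in0[i] + 2 * in1[i]);
        for (i = 0; i < 4; i++)                 /* P second; overwrites in0 */
                p[i] = in0[i] ^ in1[i];

        for (i = 0; i < 4; i++)
                printf("P[%d]=%u Q[%d]=%u\n", i, p[i], i, q[i]);
        return 0;
}
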
+
+/*
+ Called by singly degraded write code to compute the new parity and the new q.
+
+ structure of the params:
+
+ pda_0, buffer_0, pda_1 , buffer_1, ... , pda_d, buffer_d
+ failedPDA raidPtr
+
+ for a total of 2d+2 arguments.
+ The result buffers results[0], results[1] are the buffers for the parity and q,
+ respectively.
+
+ We compute Q first, then compute parity. The parity calculation may try to reuse
+ one of the input buffers for its output, so if we computed parity first, we would
+ corrupt the input for the q calculation.
+
+ We treat this identically to the regularPQ case, ignoring the failedPDA extra argument.
+*/
+
+void rf_Degraded_100_PQFunc(node)
+ RF_DagNode_t *node;
+{
+ int np = node->numParams;
+
+ RF_ASSERT (np >= 2);
+ DegrQSubr(node);
+ rf_RecoveryXorFunc(node);
+}
+
+
+/*
+ The two below are used when reading a stripe with a single lost data unit.
+ The parameters are
+
+ pda_0, buffer_0, .... pda_n, buffer_n, P pda, P buffer, failedPDA, raidPtr
+
+  and results[0] contains the data buffer, which is originally zero-filled.
+
+*/
+
+/* this Q func is used by the degraded-mode dag functions to recover lost data.
+ * the second-to-last parameter is the PDA for the failed portion of the access.
+ * the code here looks at this PDA and assumes that the xor target buffer is
+ * equal in size to the number of sectors in the failed PDA. It then uses
+ * the other PDAs in the parameter list to determine where within the target
+ * buffer the corresponding data should be xored.
+ *
+ * Recall the basic equation is
+ *
+ * Q = ( data_1 + 2 * data_2 ... + k * data_k ) mod 256
+ *
+ * so to recover data_j we need
+ *
+ * J * data_j = (Q - data_1 - 2*data_2 - ... - k*data_k) mod 256
+ *
+ * So the coefficient for each buffer is (255 - data_col), and j should be initialized by
+ * copying Q into it. Then we need to do a table lookup to convert to solve
+ * data_j /= J
+ *
+ *
+ */
+int rf_RecoveryQFunc(node)
+ RF_DagNode_t *node;
+{
+ RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams-1].p;
+ RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &raidPtr->Layout;
+ RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams-2].p;
+ int i;
+ RF_PhysDiskAddr_t *pda;
+ RF_RaidAddr_t suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr,failedPDA->startSector);
+ char *srcbuf, *destbuf;
+ RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
+ RF_Etimer_t timer;
+ unsigned long coeff;
+
+ RF_ETIMER_START(timer);
+ /* start by copying Q into the buffer */
+ bcopy(node->params[node->numParams-3].p,node->results[0],
+ rf_RaidAddressToByte(raidPtr, failedPDA->numSector));
+ for (i=0; i<node->numParams-4; i+=2)
+ {
+ RF_ASSERT (node->params[i+1].p != node->results[0]);
+ pda = (RF_PhysDiskAddr_t *) node->params[i].p;
+ srcbuf = (char *) node->params[i+1].p;
+ suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
+ destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr,suoffset-failedSUOffset);
+ coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),pda->raidAddress);
+ /* compute the data unit offset within the column */
+ coeff = (coeff % raidPtr->Layout.numDataCol);
+ rf_IncQ((unsigned long *)destbuf, (unsigned long *)srcbuf, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff);
+ }
+ /* Do the nasty inversion now */
+ coeff = (rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),failedPDA->startSector) % raidPtr->Layout.numDataCol);
+ rf_InvertQ(node->results[0],node->results[0],rf_RaidAddressToByte(raidPtr,pda->numSector),coeff);
+ RF_ETIMER_STOP(timer);
+ RF_ETIMER_EVAL(timer);
+ tracerec->q_us += RF_ETIMER_VAL_US(timer);
+ rf_GenericWakeupFunc(node, 0);
+ return(0);
+}
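
The recovery equation in the comment above is easier to see with ordinary modular arithmetic than with the GF tables the real code uses. The sketch below works in integers mod 257 purely as an analogy: form Q from the data words, subtract the surviving terms, then divide by the lost word's coefficient. It is not the arithmetic rf_qfor/rf_qinv implement, only the same shape of computation.

/* Analogy only: recovering one lost data word from Q, using integers mod a
 * small prime instead of the real GF tables. */
#include <stdio.h>

#define P 257

static int inv_mod(int a)               /* brute-force inverse mod P */
{
        int x;

        for (x = 1; x < P; x++)
                if ((a * x) % P == 1)
                        return x;
        return 0;
}

int main(void)
{
        int data[3] = {10, 20, 30};     /* data_1..data_3, coefficient j+1 */
        int lost = 1;                   /* pretend data[1] was lost */
        int q = 0, acc, j;

        for (j = 0; j < 3; j++)         /* Q = sum of (j+1)*data_j */
                q = (q + (j + 1) * data[j]) % P;

        acc = q;                        /* start from Q ... */
        for (j = 0; j < 3; j++)         /* ... subtract the surviving terms */
                if (j != lost)
                        acc = (acc - ((j + 1) * data[j]) % P + P) % P;
        acc = (acc * inv_mod(lost + 1)) % P;    /* divide by lost coefficient */

        printf("recovered data[%d] = %d (expected %d)\n", lost, acc, data[lost]);
        return 0;
}
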
+
+int rf_RecoveryPQFunc(node)
+ RF_DagNode_t *node;
+{
+ RF_PANIC();
+ return(1);
+}
+
+/*
+ Degraded write Q subroutine.
+ Used when P is dead.
+ Large-write style Q computation.
+ Parameters
+
+ (pda,buf),(pda,buf),.....,(failedPDA,bufPtr),failedPDA,raidPtr.
+
+ We ignore failedPDA.
+
+ This is a "simple style" recovery func.
+*/
+
+void rf_PQ_DegradedWriteQFunc(node)
+ RF_DagNode_t *node;
+{
+ int np = node->numParams;
+ int d;
+ RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np-1].p;
+ unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
+ int i;
+ RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
+ RF_Etimer_t timer;
+ char *qbuf = node->results[0];
+ char *obuf, *qpbuf;
+ RF_PhysDiskAddr_t *old;
+ unsigned long coeff;
+ int fail_start,j;
+
+ old = (RF_PhysDiskAddr_t *) node->params[np-2].p;
+ fail_start = old->startSector % secPerSU;
+
+ RF_ETIMER_START(timer);
+
+ d = (np-2)/2;
+ RF_ASSERT (2*d+2 == np);
+
+ for (i=0; i < d; i++)
+ {
+ old = (RF_PhysDiskAddr_t *) node->params[2*i].p;
+ obuf = (char *) node->params[2*i+1].p;
+ coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),old->raidAddress);
+ /* compute the data unit offset within the column, then add one */
+ coeff = (coeff % raidPtr->Layout.numDataCol);
+ j = old->startSector % secPerSU;
+ RF_ASSERT(j >= fail_start);
+ qpbuf = qbuf + rf_RaidAddressToByte(raidPtr,j - fail_start);
+ rf_IncQ((unsigned long *)qpbuf,(unsigned long *)obuf,rf_RaidAddressToByte(raidPtr, old->numSector),coeff);
+ }
+
+ RF_ETIMER_STOP(timer);
+ RF_ETIMER_EVAL(timer);
+ tracerec->q_us += RF_ETIMER_VAL_US(timer);
+ rf_GenericWakeupFunc(node, 0);
+}
+
+
+
+
+/* Q computations */
+
+/*
+  coeff - column;
+
+ compute dest ^= qfor[28-coeff][rn[coeff+1] a]
+
+ on 5-bit basis;
+ length in bytes;
+*/
+
+void rf_IncQ(dest,buf,length,coeff)
+ unsigned long *dest;
+ unsigned long *buf;
+ unsigned length;
+ unsigned coeff;
+{
+ unsigned long a, d, new;
+ unsigned long a1, a2;
+ unsigned int *q = &(rf_qfor[28-coeff][0]);
+ unsigned r = rf_rn[coeff+1];
+
+#define EXTRACT(a,i) ((a >> (5L*i)) & 0x1f)
+#define INSERT(a,i) (a << (5L*i))
+
+ length /= 8;
+ /* 13 5 bit quants in a 64 bit word */
+ while (length)
+ {
+ a = *buf++;
+ d = *dest;
+ a1 = EXTRACT(a,0) ^ r;
+ a2 = EXTRACT(a,1) ^ r;
+ new = INSERT(a2,1) | a1 ;
+ a1 = EXTRACT(a,2) ^ r;
+ a2 = EXTRACT(a,3) ^ r;
+ a1 = q[a1];
+ a2 = q[a2];
+ new = new | INSERT(a1,2) | INSERT (a2,3);
+ a1 = EXTRACT(a,4) ^ r;
+ a2 = EXTRACT(a,5) ^ r;
+ a1 = q[a1];
+ a2 = q[a2];
+ new = new | INSERT(a1,4) | INSERT (a2,5);
+ a1 = EXTRACT(a,5) ^ r;
+ a2 = EXTRACT(a,6) ^ r;
+ a1 = q[a1];
+ a2 = q[a2];
+ new = new | INSERT(a1,5) | INSERT (a2,6);
+#if RF_LONGSHIFT > 2
+ a1 = EXTRACT(a,7) ^ r;
+ a2 = EXTRACT(a,8) ^ r;
+ a1 = q[a1];
+ a2 = q[a2];
+ new = new | INSERT(a1,7) | INSERT (a2,8);
+ a1 = EXTRACT(a,9) ^ r;
+ a2 = EXTRACT(a,10) ^ r;
+ a1 = q[a1];
+ a2 = q[a2];
+ new = new | INSERT(a1,9) | INSERT (a2,10);
+ a1 = EXTRACT(a,11) ^ r;
+ a2 = EXTRACT(a,12) ^ r;
+ a1 = q[a1];
+ a2 = q[a2];
+ new = new | INSERT(a1,11) | INSERT (a2,12);
+#endif /* RF_LONGSHIFT > 2 */
+ d ^= new;
+ *dest++ = d;
+ length--;
+ }
+}
+
+/*
+ compute
+
+   dest ^= rf_qfor[28-coeff][rf_rn[coeff+1] ^ (old^new)]
+
+ on a five bit basis.
+ optimization: compute old ^ new on 64 bit basis.
+
+ length in bytes.
+*/
+
+static void QDelta(
+ char *dest,
+ char *obuf,
+ char *nbuf,
+ unsigned length,
+ unsigned char coeff)
+{
+ unsigned long a, d, new;
+ unsigned long a1, a2;
+ unsigned int *q = &(rf_qfor[28-coeff][0]);
+ unsigned r = rf_rn[coeff+1];
+
+#ifdef KERNEL
+ /* PQ in kernel currently not supported because the encoding/decoding table is not present */
+ bzero(dest, length);
+#else /* KERNEL */
+ /* this code probably doesn't work and should be rewritten -wvcii */
+ /* 13 5 bit quants in a 64 bit word */
+ length /= 8;
+ while (length)
+ {
+ a = *obuf++; /* XXX need to reorg to avoid cache conflicts */
+ a ^= *nbuf++;
+ d = *dest;
+ a1 = EXTRACT(a,0) ^ r;
+ a2 = EXTRACT(a,1) ^ r;
+ a1 = q[a1];
+ a2 = q[a2];
+ new = INSERT(a2,1) | a1 ;
+ a1 = EXTRACT(a,2) ^ r;
+ a2 = EXTRACT(a,3) ^ r;
+ a1 = q[a1];
+ a2 = q[a2];
+ new = new | INSERT(a1,2) | INSERT (a2,3);
+ a1 = EXTRACT(a,4) ^ r;
+ a2 = EXTRACT(a,5) ^ r;
+ a1 = q[a1];
+ a2 = q[a2];
+ new = new | INSERT(a1,4) | INSERT (a2,5);
+ a1 = EXTRACT(a,5) ^ r;
+ a2 = EXTRACT(a,6) ^ r;
+ a1 = q[a1];
+ a2 = q[a2];
+ new = new | INSERT(a1,5) | INSERT (a2,6);
+#if RF_LONGSHIFT > 2
+ a1 = EXTRACT(a,7) ^ r;
+ a2 = EXTRACT(a,8) ^ r;
+ a1 = q[a1];
+ a2 = q[a2];
+ new = new | INSERT(a1,7) | INSERT (a2,8);
+ a1 = EXTRACT(a,9) ^ r;
+ a2 = EXTRACT(a,10) ^ r;
+ a1 = q[a1];
+ a2 = q[a2];
+ new = new | INSERT(a1,9) | INSERT (a2,10);
+ a1 = EXTRACT(a,11) ^ r;
+ a2 = EXTRACT(a,12) ^ r;
+ a1 = q[a1];
+ a2 = q[a2];
+ new = new | INSERT(a1,11) | INSERT (a2,12);
+#endif /* RF_LONGSHIFT > 2 */
+ d ^= new;
+ *dest++ = d;
+ length--;
+ }
+#endif /* KERNEL */
+}
+
+/*
+ recover columns a and b from the given p and q into
+ bufs abuf and bbuf. All bufs are word aligned.
+ Length is in bytes.
+*/
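+
+/*
+  Illustrative sketch (a hypothetical helper, not part of RAIDframe): per
+  5-bit symbol, rf_PQ_recover() below performs one lookup in the precomputed
+  rf_qinv table, indexed by the surviving P and Q symbols, to obtain column a;
+  column b then follows from the parity relation b = a ^ p.
+*/
+#if 0
+static void
+pq_recover_symbol(unsigned p_sym, unsigned q_sym, unsigned coeff_a,
+    unsigned coeff_b, unsigned *a_sym, unsigned *b_sym)
+{
+	const unsigned char *q0 = &rf_qinv[(29 * coeff_a) + coeff_b][0];
+
+	*a_sym = q0[(p_sym << 5) | q_sym];	/* decode table gives column a */
+	*b_sym = *a_sym ^ p_sym;		/* P parity gives column b */
+}
+#endif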
+
+
+/*
+ * XXX
+ *
+ * Everything about this seems wrong.
+ */
+void rf_PQ_recover(pbuf,qbuf,abuf,bbuf,length,coeff_a,coeff_b)
+ unsigned long *pbuf;
+ unsigned long *qbuf;
+ unsigned long *abuf;
+ unsigned long *bbuf;
+ unsigned length;
+ unsigned coeff_a;
+ unsigned coeff_b;
+{
+ unsigned long p, q, a, a0, a1;
+ int col = (29 * coeff_a) + coeff_b;
+ unsigned char *q0 = & (rf_qinv[col][0]);
+
+ length /= 8;
+ while (length)
+ {
+ p = *pbuf++;
+ q = *qbuf++;
+ a0 = EXTRACT(p,0);
+ a1 = EXTRACT(q,0);
+ a = q0[a0<<5 | a1];
+#define MF(i) \
+ a0 = EXTRACT(p,i); \
+ a1 = EXTRACT(q,i); \
+ a = a | INSERT(q0[a0<<5 | a1],i)
+
+ MF(1);
+ MF(2);
+ MF(3);
+ MF(4);
+ MF(5);
+ MF(6);
+#if 0
+ MF(7);
+ MF(8);
+ MF(9);
+ MF(10);
+ MF(11);
+ MF(12);
+#endif /* 0 */
+ *abuf++ = a;
+ *bbuf++ = a ^ p;
+ length--;
+ }
+}
+
+/*
+  Lost parity and a data column. Recover that data column.
+  Assume col coeff is lost. Let q be the contents of Q after
+  all surviving data columns have been q-xored out of it.
+  Then we have the equation
+
+  qfor[28-coeff][a_i ^ r_{i+1}] = q
+
+  but qfor is cyclic with period 31.
+  So qfor[3+coeff][qfor[28-coeff][a_i ^ r_{i+1}]] =
+     qfor[31][a_i ^ r_{i+1}] = a_i ^ r_{i+1} .
+
+  so a_i = r_{coeff+1} ^ qfor[3+coeff][q]
+
+  The routine is passed the q buffer and the buffer
+  the data is to be recovered into. They can be the same.
+*/
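+
+/*
+  Illustrative sketch (a hypothetical helper, not part of RAIDframe): per
+  5-bit symbol, the inversion described above and implemented by rf_InvertQ()
+  below is one lookup in the rf_qfor[3+coeff] row followed by an XOR with the
+  row constant.
+*/
+#if 0
+static unsigned
+pq_invert_symbol(unsigned q_sym, unsigned coeff)
+{
+	const unsigned int *qrow = &rf_qfor[3 + coeff][0];
+
+	return (rf_rn[coeff + 1] ^ qrow[q_sym & 0x1f]);
+}
+#endif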
+
+
+
+static void rf_InvertQ(
+ unsigned long *qbuf,
+ unsigned long *abuf,
+ unsigned length,
+ unsigned coeff)
+{
+ unsigned long a, new;
+ unsigned long a1, a2;
+ unsigned int *q = &(rf_qfor[3+coeff][0]);
+ unsigned r = rf_rn[coeff+1];
+
+ /* 13 5 bit quants in a 64 bit word */
+ length /= 8;
+ while (length)
+ {
+ a = *qbuf++;
+ a1 = EXTRACT(a,0);
+ a2 = EXTRACT(a,1);
+ a1 = r ^ q[a1];
+ a2 = r ^ q[a2];
+ new = INSERT(a2,1) | a1;
+#define M(i,j) \
+ a1 = EXTRACT(a,i); \
+ a2 = EXTRACT(a,j); \
+ a1 = r ^ q[a1]; \
+ a2 = r ^ q[a2]; \
+ new = new | INSERT(a1,i) | INSERT(a2,j)
+
+ M(2,3);
+ M(4,5);
+ M(5,6);
+#if RF_LONGSHIFT > 2
+ M(7,8);
+ M(9,10);
+ M(11,12);
+#endif /* RF_LONGSHIFT > 2 */
+ *abuf++ = new;
+ length--;
+ }
+}
+
+#endif /* (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) */
diff --git a/sys/dev/raidframe/rf_pq.h b/sys/dev/raidframe/rf_pq.h
new file mode 100644
index 00000000000..52f816354fa
--- /dev/null
+++ b/sys/dev/raidframe/rf_pq.h
@@ -0,0 +1,115 @@
+/* $OpenBSD: rf_pq.h,v 1.1 1999/01/11 14:29:39 niklas Exp $ */
+/* $NetBSD: rf_pq.h,v 1.1 1998/11/13 04:20:32 oster Exp $ */
+/*
+ * rf_pq.h
+ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Daniel Stodolsky
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+/*
+ * :
+ * Log: rf_pq.h,v
+ * Revision 1.9 1996/07/31 15:35:05 jimz
+ * evenodd changes; bugfixes for double-degraded archs, generalize
+ * some formerly PQ-only functions
+ *
+ * Revision 1.8 1996/07/27 23:36:08 jimz
+ * Solaris port of simulator
+ *
+ * Revision 1.7 1996/06/03 23:28:26 jimz
+ * more bugfixes
+ * check in tree to sync for IPDS runs with current bugfixes
+ * there still may be a problem with threads in the script test
+ * getting I/Os stuck- not trivially reproducible (runs ~50 times
+ * in a row without getting stuck)
+ *
+ * Revision 1.6 1996/06/02 17:31:48 jimz
+ * Moved a lot of global stuff into array structure, where it belongs.
+ * Fixed up paritylogging, pss modules in this manner. Some general
+ * code cleanup. Removed lots of dead code, some dead files.
+ *
+ * Revision 1.5 1996/05/31 22:26:54 jimz
+ * fix a lot of mapping problems, memory allocation problems
+ * found some weird lock issues, fixed 'em
+ * more code cleanup
+ *
+ * Revision 1.4 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.3 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.2 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.1 1996/05/18 19:56:21 jimz
+ * Initial revision
+ *
+ */
+
+#ifndef _RF__RF_PQ_H_
+#define _RF__RF_PQ_H_
+
+#include "rf_archs.h"
+
+extern RF_RedFuncs_t rf_pFuncs;
+extern RF_RedFuncs_t rf_pRecoveryFuncs;
+
+int rf_RegularONPFunc(RF_DagNode_t *node);
+int rf_SimpleONPFunc(RF_DagNode_t *node);
+int rf_RecoveryPFunc(RF_DagNode_t *node);
+int rf_RegularPFunc(RF_DagNode_t *node);
+
+#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
+
+extern RF_RedFuncs_t rf_qFuncs;
+extern RF_RedFuncs_t rf_qRecoveryFuncs;
+extern RF_RedFuncs_t rf_pqRecoveryFuncs;
+
+void rf_PQDagSelect(RF_Raid_t *raidPtr, RF_IoType_t type,
+ RF_AccessStripeMap_t *asmap, RF_VoidFuncPtr *createFunc);
+RF_CREATE_DAG_FUNC_DECL(rf_PQCreateLargeWriteDAG);
+int rf_RegularONQFunc(RF_DagNode_t *node);
+int rf_SimpleONQFunc(RF_DagNode_t *node);
+RF_CREATE_DAG_FUNC_DECL(rf_PQCreateSmallWriteDAG);
+int rf_RegularPQFunc(RF_DagNode_t *node);
+int rf_RegularQFunc(RF_DagNode_t *node);
+void rf_Degraded_100_PQFunc(RF_DagNode_t *node);
+int rf_RecoveryQFunc(RF_DagNode_t *node);
+int rf_RecoveryPQFunc(RF_DagNode_t *node);
+void rf_PQ_DegradedWriteQFunc(RF_DagNode_t *node);
+void rf_IncQ(unsigned long *dest, unsigned long *buf, unsigned length,
+ unsigned coeff);
+void rf_PQ_recover(unsigned long *pbuf, unsigned long *qbuf, unsigned long *abuf,
+ unsigned long *bbuf, unsigned length, unsigned coeff_a, unsigned coeff_b);
+
+#endif /* (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) */
+
+#endif /* !_RF__RF_PQ_H_ */
diff --git a/sys/dev/raidframe/rf_pqdeg.c b/sys/dev/raidframe/rf_pqdeg.c
new file mode 100644
index 00000000000..6376201b6c3
--- /dev/null
+++ b/sys/dev/raidframe/rf_pqdeg.c
@@ -0,0 +1,286 @@
+/* $OpenBSD: rf_pqdeg.c,v 1.1 1999/01/11 14:29:39 niklas Exp $ */
+/* $NetBSD: rf_pqdeg.c,v 1.1 1998/11/13 04:20:32 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Daniel Stodolsky
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*
+ * Log: rf_pqdeg.c,v
+ * Revision 1.19 1996/11/05 21:10:40 jimz
+ * failed pda generalization
+ *
+ * Revision 1.18 1996/07/31 16:30:01 jimz
+ * asm/asmap fix
+ *
+ * Revision 1.17 1996/07/31 15:35:09 jimz
+ * evenodd changes; bugfixes for double-degraded archs, generalize
+ * some formerly PQ-only functions
+ *
+ * Revision 1.16 1996/07/27 23:36:08 jimz
+ * Solaris port of simulator
+ *
+ * Revision 1.15 1996/07/22 19:52:16 jimz
+ * switched node params to RF_DagParam_t, a union of
+ * a 64-bit int and a void *, for better portability
+ * attempted hpux port, but failed partway through for
+ * lack of a single C compiler capable of compiling all
+ * source files
+ *
+ * Revision 1.14 1996/06/02 17:31:48 jimz
+ * Moved a lot of global stuff into array structure, where it belongs.
+ * Fixed up paritylogging, pss modules in this manner. Some general
+ * code cleanup. Removed lots of dead code, some dead files.
+ *
+ * Revision 1.13 1996/05/31 22:26:54 jimz
+ * fix a lot of mapping problems, memory allocation problems
+ * found some weird lock issues, fixed 'em
+ * more code cleanup
+ *
+ * Revision 1.12 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.11 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.10 1996/05/24 04:28:55 jimz
+ * release cleanup ckpt
+ *
+ * Revision 1.9 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.8 1996/05/03 19:41:07 wvcii
+ * added includes for dag library
+ *
+ * Revision 1.7 1995/11/30 16:19:36 wvcii
+ * added copyright info
+ *
+ * Revision 1.6 1995/11/07 16:15:08 wvcii
+ * updated/added prototyping for dag creation
+ *
+ * Revision 1.5 1995/03/01 20:25:48 holland
+ * kernelization changes
+ *
+ * Revision 1.4 1995/02/03 22:31:36 holland
+ * many changes related to kernelization
+ *
+ * Revision 1.3 1995/02/01 15:13:05 holland
+ * moved #include of general.h out of raid.h and into each file
+ *
+ * Revision 1.2 1994/12/05 04:50:26 danner
+ * additional pq support
+ *
+ * Revision 1.1 1994/11/29 20:36:02 danner
+ * Initial revision
+ *
+ */
+
+#include "rf_archs.h"
+
+#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
+
+#include "rf_types.h"
+#include "rf_raid.h"
+#include "rf_dag.h"
+#include "rf_dagutils.h"
+#include "rf_dagfuncs.h"
+#include "rf_dagffrd.h"
+#include "rf_dagffwr.h"
+#include "rf_dagdegrd.h"
+#include "rf_dagdegwr.h"
+#include "rf_threadid.h"
+#include "rf_etimer.h"
+#include "rf_pqdeg.h"
+#include "rf_general.h"
+#include "rf_pqdegdags.h"
+#include "rf_pq.h"
+
+/*
+ Degraded mode dag functions for P+Q calculations.
+
+ The following nomenclature is used.
+
+ PQ_<D><P><Q>_Create{Large,Small}<Write|Read>DAG
+
+ where <D><P><Q> are single digits representing the number of failed
+  data units <D> (0,1,2), parity units <P> (0,1), and Q units <Q> (0,1)
+  affected by the I/O. The reads have only PQ_<D><P><Q>_CreateReadDAG variants, while
+  the single fault writes have both large and small write versions. (Single fault
+  PQ is equivalent to normal mode raid 5 in many respects.)
+
+ Some versions degenerate into the same case, and are grouped together below.
+*/
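+
+/*
+  For example, under this nomenclature rf_PQ_110_CreateWriteDAG handles a
+  write with one failed data unit and a failed P unit (Q intact), while
+  rf_PQ_011_CreateWriteDAG handles the case where both P and Q are dead but
+  all data units survive.
+*/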
+
+/* Reads, single failure
+
+ we have parity, so we can do a raid 5
+ reconstruct read.
+*/
+
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_100_CreateReadDAG)
+{
+ rf_CreateDegradedReadDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_pRecoveryFuncs);
+}
+
+/* Reads double failure */
+
+/*
+ Q is lost, but not parity
+  so we can do a raid 5 reconstruct read.
+*/
+
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_101_CreateReadDAG)
+{
+ rf_CreateDegradedReadDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_pRecoveryFuncs);
+}
+
+/*
+ parity is lost, so we need to
+ do a reconstruct read and recompute
+ the data with Q.
+*/
+
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_110_CreateReadDAG)
+{
+ RF_PhysDiskAddr_t *temp;
+ /* swap P and Q pointers to fake out the DegradedReadDAG code */
+ temp = asmap->parityInfo; asmap->parityInfo = asmap->qInfo; asmap->qInfo = temp;
+ rf_CreateDegradedReadDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_qRecoveryFuncs);
+}
+
+/*
+  Two data units are dead in this stripe, so we will need to read
+  both P and Q to reconstruct the data. Note that only
+  one of the data units we are reading may actually be missing.
+*/
+RF_CREATE_DAG_FUNC_DECL(rf_CreateDoubleDegradedReadDAG)
+{
+ rf_PQ_DoubleDegRead(raidPtr, asmap, dag_h, bp, flags, allocList);
+}
+
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_200_CreateReadDAG)
+{
+ rf_CreateDoubleDegradedReadDAG(raidPtr, asmap, dag_h, bp, flags, allocList);
+}
+
+/* Writes, single failure */
+
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_100_CreateWriteDAG)
+{
+ if (asmap->numStripeUnitsAccessed != 1 &&
+ asmap->failedPDAs[0]->numSector != raidPtr->Layout.sectorsPerStripeUnit)
+ RF_PANIC();
+ rf_CommonCreateSimpleDegradedWriteDAG(raidPtr, asmap, dag_h, bp, flags,
+ allocList, 2, (int (*)())rf_Degraded_100_PQFunc, RF_FALSE);
+}
+
+/* Dead P - act like a RAID 5 small write with parity = Q */
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_010_CreateSmallWriteDAG)
+{
+ RF_PhysDiskAddr_t *temp;
+ /* swap P and Q pointers to fake out the DegradedReadDAG code */
+ temp = asmap->parityInfo; asmap->parityInfo = asmap->qInfo; asmap->qInfo = temp;
+ rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_qFuncs, NULL);
+}
+
+/* Dead Q - act like a RAID 5 small write */
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_001_CreateSmallWriteDAG)
+{
+ rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_pFuncs, NULL);
+}
+
+/* Dead P - act like a RAID 5 large write but for Q */
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_010_CreateLargeWriteDAG)
+{
+ RF_PhysDiskAddr_t *temp;
+ /* swap P and Q pointers to fake out the code */
+ temp = asmap->parityInfo; asmap->parityInfo = asmap->qInfo; asmap->qInfo = temp;
+ rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 1, rf_RegularQFunc, RF_FALSE);
+}
+
+/* Dead Q - act like a RAID 5 large write */
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_001_CreateLargeWriteDAG)
+{
+ rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 1, rf_RegularPFunc, RF_FALSE);
+}
+
+
+/*
+ * writes, double failure
+ */
+
+/*
+ * Lost P & Q - do a nonredundant write
+ */
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_011_CreateWriteDAG)
+{
+ rf_CreateNonRedundantWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList,
+ RF_IO_TYPE_WRITE);
+}
+
+/*
+  In the two cases below,
+  a nasty case arises when the write covers a (strict) portion of a failed stripe unit
+  and parts of another SU. For now, we do not support this.
+*/
+
+/*
+ Lost Data and P - do a Q write.
+*/
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_110_CreateWriteDAG)
+{
+ RF_PhysDiskAddr_t *temp;
+
+ if (asmap->numStripeUnitsAccessed != 1 &&
+ asmap->failedPDAs[0]->numSector != raidPtr->Layout.sectorsPerStripeUnit)
+ {
+ RF_PANIC();
+ }
+ /* swap P and Q to fake out parity code */
+ temp = asmap->parityInfo;
+ asmap->parityInfo = asmap->qInfo;
+ asmap->qInfo = temp;
+ rf_CommonCreateSimpleDegradedWriteDAG(raidPtr, asmap, dag_h, bp, flags,
+ allocList,1, (int (*)())rf_PQ_DegradedWriteQFunc, RF_FALSE);
+ /* is the regular Q func the right one to call? */
+}
+
+/*
+ Lost Data and Q - do degraded mode P write
+*/
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_101_CreateWriteDAG)
+{
+ if (asmap->numStripeUnitsAccessed != 1 &&
+ asmap->failedPDAs[0]->numSector != raidPtr->Layout.sectorsPerStripeUnit)
+ RF_PANIC();
+ rf_CommonCreateSimpleDegradedWriteDAG(raidPtr, asmap, dag_h, bp, flags,
+ allocList,1, rf_RecoveryXorFunc, RF_FALSE);
+}
+
+#endif /* (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) */
diff --git a/sys/dev/raidframe/rf_pqdeg.h b/sys/dev/raidframe/rf_pqdeg.h
new file mode 100644
index 00000000000..dc34a7970f7
--- /dev/null
+++ b/sys/dev/raidframe/rf_pqdeg.h
@@ -0,0 +1,93 @@
+/* $OpenBSD: rf_pqdeg.h,v 1.1 1999/01/11 14:29:39 niklas Exp $ */
+/* $NetBSD: rf_pqdeg.h,v 1.1 1998/11/13 04:20:32 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Daniel Stodolsky
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* :
+ * Log: rf_pqdeg.h,v
+ * Revision 1.7 1996/07/18 22:57:14 jimz
+ * port simulator to AIX
+ *
+ * Revision 1.6 1996/06/02 17:31:48 jimz
+ * Moved a lot of global stuff into array structure, where it belongs.
+ * Fixed up paritylogging, pss modules in this manner. Some general
+ * code cleanup. Removed lots of dead code, some dead files.
+ *
+ * Revision 1.5 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.4 1995/11/30 16:19:11 wvcii
+ * added copyright info
+ *
+ */
+
+#ifndef _RF__RF_PQDEG_H_
+#define _RF__RF_PQDEG_H_
+
+#include "rf_types.h"
+
+#if RF_UTILITY == 0
+#include "rf_dag.h"
+
+/* extern decl's of the failure mode PQ functions.
+ * See pddeg.c for nomenclature discussion.
+ */
+
+/* reads, single failure */
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_100_CreateReadDAG);
+/* reads, two failure */
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_110_CreateReadDAG);
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_101_CreateReadDAG);
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_200_CreateReadDAG);
+
+/* writes, single failure */
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_100_CreateWriteDAG);
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_010_CreateSmallWriteDAG);
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_010_CreateLargeWriteDAG);
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_001_CreateSmallWriteDAG);
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_001_CreateLargeWriteDAG);
+
+/* writes, double failure */
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_011_CreateWriteDAG);
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_110_CreateWriteDAG);
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_101_CreateWriteDAG);
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_200_CreateWriteDAG);
+#endif /* RF_UTILITY == 0 */
+
+typedef RF_uint32 RF_ua32_t[32];
+typedef RF_uint8 RF_ua1024_t[1024];
+
+extern RF_ua32_t rf_rn;
+extern RF_ua32_t rf_qfor[32];
+#ifndef KERNEL /* we don't support PQ in the kernel yet, so don't link in this monster table */
+extern RF_ua1024_t rf_qinv[29*29];
+#else /* !KERNEL */
+extern RF_ua1024_t rf_qinv[1];
+#endif /* !KERNEL */
+
+#endif /* !_RF__RF_PQDEG_H_ */
diff --git a/sys/dev/raidframe/rf_pqdegdags.c b/sys/dev/raidframe/rf_pqdegdags.c
new file mode 100644
index 00000000000..e8346b4f941
--- /dev/null
+++ b/sys/dev/raidframe/rf_pqdegdags.c
@@ -0,0 +1,554 @@
+/* $OpenBSD: rf_pqdegdags.c,v 1.1 1999/01/11 14:29:40 niklas Exp $ */
+/* $NetBSD: rf_pqdegdags.c,v 1.1 1998/11/13 04:20:32 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Daniel Stodolsky
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*
+ * rf_pqdegdags.c
+ * Degraded mode dags for double fault cases.
+*/
+
+/*
+ * :
+ * Log: rf_pqdegdags.c,v
+ * Revision 1.31 1996/11/05 21:10:40 jimz
+ * failed pda generalization
+ *
+ * Revision 1.30 1996/07/31 16:30:05 jimz
+ * asm/asmap fix
+ *
+ * Revision 1.29 1996/07/31 15:35:15 jimz
+ * evenodd changes; bugfixes for double-degraded archs, generalize
+ * some formerly PQ-only functions
+ *
+ * Revision 1.28 1996/07/28 20:31:39 jimz
+ * i386netbsd port
+ * true/false fixup
+ *
+ * Revision 1.27 1996/07/27 23:36:08 jimz
+ * Solaris port of simulator
+ *
+ * Revision 1.26 1996/07/22 19:52:16 jimz
+ * switched node params to RF_DagParam_t, a union of
+ * a 64-bit int and a void *, for better portability
+ * attempted hpux port, but failed partway through for
+ * lack of a single C compiler capable of compiling all
+ * source files
+ *
+ * Revision 1.25 1996/06/09 02:36:46 jimz
+ * lots of little crufty cleanup- fixup whitespace
+ * issues, comment #ifdefs, improve typing in some
+ * places (esp size-related)
+ *
+ * Revision 1.24 1996/06/07 22:26:27 jimz
+ * type-ify which_ru (RF_ReconUnitNum_t)
+ *
+ * Revision 1.23 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.22 1996/06/02 17:31:48 jimz
+ * Moved a lot of global stuff into array structure, where it belongs.
+ * Fixed up paritylogging, pss modules in this manner. Some general
+ * code cleanup. Removed lots of dead code, some dead files.
+ *
+ * Revision 1.21 1996/05/31 22:26:54 jimz
+ * fix a lot of mapping problems, memory allocation problems
+ * found some weird lock issues, fixed 'em
+ * more code cleanup
+ *
+ * Revision 1.20 1996/05/30 12:59:18 jimz
+ * make etimer happier, more portable
+ *
+ * Revision 1.19 1996/05/30 11:29:41 jimz
+ * Numerous bug fixes. Stripe lock release code disagreed with the taking code
+ * about when stripes should be locked (I made it consistent: no parity, no lock)
+ * There was a lot of extra serialization of I/Os which I've removed- a lot of
+ * it was to calculate values for the cache code, which is no longer with us.
+ * More types, function, macro cleanup. Added code to properly quiesce the array
+ * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
+ * before. Fixed memory allocation, freeing bugs.
+ *
+ * Revision 1.18 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.17 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.16 1996/05/24 04:28:55 jimz
+ * release cleanup ckpt
+ *
+ * Revision 1.15 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.14 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.13 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.12 1996/05/08 21:01:24 jimz
+ * fixed up enum type names that were conflicting with other
+ * enums and function names (ie, "panic")
+ * future naming trends will be towards RF_ and rf_ for
+ * everything raidframe-related
+ *
+ * Revision 1.11 1996/05/03 19:47:50 wvcii
+ * removed include of rf_redstripe.h
+ *
+ * Revision 1.10 1995/12/12 18:10:06 jimz
+ * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT
+ * fix 80-column brain damage in comments
+ *
+ * Revision 1.9 1995/11/30 16:17:57 wvcii
+ * added copyright info
+ *
+ * Revision 1.8 1995/11/07 15:33:25 wvcii
+ * dag creation routines now generate term node
+ * added asserts
+ * encoded commit point nodes, antecedence types into dags
+ * didn't add commit barrier - the code is a mess and needs to
+ * be cleand up first
+ *
+ */
+
+#include "rf_archs.h"
+
+#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
+
+#include "rf_types.h"
+#include "rf_raid.h"
+#include "rf_dag.h"
+#include "rf_dagfuncs.h"
+#include "rf_dagutils.h"
+#include "rf_etimer.h"
+#include "rf_acctrace.h"
+#include "rf_general.h"
+#include "rf_pqdegdags.h"
+#include "rf_pq.h"
+#include "rf_sys.h"
+
+static void applyPDA(RF_Raid_t *raidPtr, RF_PhysDiskAddr_t *pda, RF_PhysDiskAddr_t *ppda,
+ RF_PhysDiskAddr_t *qpda, void *bp);
+
+/*
+ Two data drives have failed, and we are doing a read that covers one of them.
+ We may also be reading some of the surviving drives.
+
+
+ *****************************************************************************************
+ *
+ * creates a DAG to perform a degraded-mode read of data within one stripe.
+ * This DAG is as follows:
+ *
+ * Hdr
+ * |
+ * Block
+ * / / \ \ \ \
+ * Rud ... Rud Rrd ... Rrd Rp Rq
+ * | \ | \ | \ | \ | \ | \
+ *
+ * | |
+ * Unblock X
+ * \ /
+ * ------ T ------
+ *
+ * Each R node is a successor of the L node
+ * One successor arc from each R node goes to U, and the other to X
+ * There is one Rud for each chunk of surviving user data requested by the user,
+ * and one Rrd for each chunk of surviving user data _not_ being read by the user
+ * R = read, ud = user data, rd = recovery (surviving) data, p = P data, q = Qdata
+ * X = pq recovery node, T = terminate
+ *
+ * The block & unblock nodes are leftovers from a previous version. They
+ * do nothing, but I haven't deleted them because it would be a tremendous
+ * effort to put them back in.
+ *
+ * Note: The target buffer for the XOR node is set to the actual user buffer where the
+ * failed data is supposed to end up. This buffer is zeroed by the code here. Thus,
+ * if you create a degraded read dag, use it, and then re-use it, you have to be sure to
+ * zero the target buffer prior to the re-use.
+ *
+ * Every buffer read is passed to the pq recovery node, whose job it is to sort out what's
+ * needed and what's not.
+ ****************************************************************************************/
+/* init a disk node with 2 successors and one predecessor */
+#define INIT_DISK_NODE(node,name) \
+rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskReadFunc, rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 2,1,4,0, dag_h, name, allocList); \
+(node)->succedents[0] = unblockNode; \
+(node)->succedents[1] = recoveryNode; \
+(node)->antecedents[0] = blockNode; \
+(node)->antType[0] = rf_control
+
+#define DISK_NODE_PARAMS(_node_,_p_) \
+ (_node_).params[0].p = _p_ ; \
+ (_node_).params[1].p = (_p_)->bufPtr; \
+ (_node_).params[2].v = parityStripeID; \
+ (_node_).params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru)
+
+#define DISK_NODE_PDA(node) ((node)->params[0].p)
+
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_DoubleDegRead)
+{
+ rf_DoubleDegRead(raidPtr, asmap, dag_h, bp, flags, allocList,
+ "Rq", "PQ Recovery", rf_PQDoubleRecoveryFunc);
+}
+
+static void applyPDA(raidPtr,pda,ppda,qpda, bp)
+ RF_Raid_t *raidPtr;
+ RF_PhysDiskAddr_t *pda;
+ RF_PhysDiskAddr_t *ppda;
+ RF_PhysDiskAddr_t *qpda;
+ void *bp;
+{
+ RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
+ RF_RaidAddr_t s0off = rf_StripeUnitOffset(layoutPtr, ppda->startSector);
+ RF_SectorCount_t s0len = ppda->numSector, len;
+ RF_SectorNum_t suoffset;
+ unsigned coeff;
+ char *pbuf = ppda->bufPtr;
+ char *qbuf = qpda->bufPtr;
+ char *buf;
+ int delta;
+
+ suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
+ len = pda->numSector;
+ /* see if pda intersects a recovery pda */
+ if ((suoffset < s0off+s0len) && ( suoffset+len > s0off))
+ {
+ buf = pda->bufPtr;
+ coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),pda->raidAddress);
+ coeff = (coeff % raidPtr->Layout.numDataCol);
+
+ if (suoffset < s0off)
+ {
+ delta = s0off - suoffset;
+ buf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),delta);
+ suoffset = s0off;
+ len -= delta;
+ }
+ if (suoffset > s0off)
+ {
+ delta = suoffset - s0off;
+ pbuf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),delta);
+ qbuf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),delta);
+ }
+ if ((suoffset + len) > (s0len + s0off))
+ len = s0len + s0off - suoffset;
+
+ /* src, dest, len */
+ rf_bxor(buf,pbuf,rf_RaidAddressToByte(raidPtr,len), bp);
+
+ /* dest, src, len, coeff */
+ rf_IncQ((unsigned long *)qbuf,(unsigned long *)buf,rf_RaidAddressToByte(raidPtr,len),coeff);
+ }
+}
+/*
+ Recover data in the case of a double failure. There can be two
+ result buffers, one for each chunk of data trying to be recovered.
+  The params are pdas that have not been range restricted or otherwise
+  politely massaged - this should be done here. The last params are the
+  pdas of P and Q, followed by the raidPtr and the asmap. The list can look like
+
+ pda, pda, ... , p pda, q pda, raidptr, asm
+
+ or
+
+ pda, pda, ... , p_1 pda, p_2 pda, q_1 pda, q_2 pda, raidptr, asm
+
+  depending on whether two chunks of recovery data were required.
+
+  The second condition only arises if there are two failed buffers
+  whose lengths do not add up to a stripe unit.
+*/
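+
+/*
+  Concretely, in the single-recovery-chunk case params[np-4] is the P pda,
+  params[np-3] the Q pda, params[np-2] the RF_Raid_t pointer, and
+  params[np-1] the access stripe map; in the two-chunk case the two P pdas
+  and the two Q pdas occupy params[np-6] through params[np-3].
+*/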
+
+
+int rf_PQDoubleRecoveryFunc(node)
+ RF_DagNode_t *node;
+{
+ int np = node->numParams;
+ RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np-1].p;
+ RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np-2].p;
+ RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &(raidPtr->Layout);
+ int d, i;
+ unsigned coeff;
+ RF_RaidAddr_t sosAddr, suoffset;
+ RF_SectorCount_t len, secPerSU = layoutPtr->sectorsPerStripeUnit;
+ int two = 0;
+ RF_PhysDiskAddr_t *ppda,*ppda2,*qpda,*qpda2,*pda,npda;
+ char *buf;
+ int numDataCol = layoutPtr->numDataCol;
+ RF_Etimer_t timer;
+ RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
+
+ RF_ETIMER_START(timer);
+
+ if (asmap->failedPDAs[1] &&
+ (asmap->failedPDAs[1]->numSector + asmap->failedPDAs[0]->numSector < secPerSU))
+ {
+ RF_ASSERT(0);
+ ppda = node->params[np-6].p;
+ ppda2 = node->params[np-5].p;
+ qpda = node->params[np-4].p;
+ qpda2 = node->params[np-3].p;
+ d = (np-6);
+ two = 1;
+ }
+ else
+ {
+ ppda = node->params[np-4].p;
+ qpda = node->params[np-3].p;
+ d = (np-4);
+ }
+
+ for (i=0; i < d; i++)
+ {
+ pda = node->params[i].p;
+ buf = pda->bufPtr;
+ suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
+ len = pda->numSector;
+ coeff = rf_RaidAddressToStripeUnitID(layoutPtr,pda->raidAddress);
+ /* compute the data unit offset within the column */
+ coeff = (coeff % raidPtr->Layout.numDataCol);
+ /* see if pda intersects a recovery pda */
+ applyPDA(raidPtr,pda,ppda,qpda,node->dagHdr->bp);
+ if (two)
+ applyPDA(raidPtr,pda,ppda,qpda,node->dagHdr->bp);
+ }
+
+  /* ok, we got the parity back to the point where we can recover.
+     We now need to determine the coeff of the columns that need to be
+     recovered. Also, we only handle recovering a single stripe unit here.
+ */
+
+ if (asmap->failedPDAs[1] == NULL)
+ { /* only a single stripe unit to recover. */
+ pda = asmap->failedPDAs[0];
+ sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
+ /* need to determine the column of the other failed disk */
+ coeff = rf_RaidAddressToStripeUnitID(layoutPtr,pda->raidAddress);
+ /* compute the data unit offset within the column */
+ coeff = (coeff % raidPtr->Layout.numDataCol);
+ for (i=0; i < numDataCol; i++)
+ {
+ npda.raidAddress = sosAddr + (i * secPerSU);
+ (raidPtr->Layout.map->MapSector)(raidPtr,npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0);
+ /* skip over dead disks */
+ if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
+ if (i != coeff) break;
+ }
+ RF_ASSERT (i < numDataCol);
+ RF_ASSERT (two==0);
+	  /* recover the data. Since we only want to recover one column, we overwrite the
+	     parity with the other one. */
+ if (coeff < i) /* recovering 'a' */
+ rf_PQ_recover((unsigned long *)ppda->bufPtr,(unsigned long *)qpda->bufPtr,(unsigned long *)pda->bufPtr,(unsigned long *)ppda->bufPtr,rf_RaidAddressToByte(raidPtr,pda->numSector), coeff, i);
+ else /* recovering 'b' */
+ rf_PQ_recover((unsigned long *)ppda->bufPtr,(unsigned long *)qpda->bufPtr,(unsigned long *)ppda->bufPtr,(unsigned long *)pda->bufPtr,rf_RaidAddressToByte(raidPtr,pda->numSector), i, coeff);
+ }
+ else
+ RF_PANIC();
+
+ RF_ETIMER_STOP(timer);
+ RF_ETIMER_EVAL(timer);
+ if (tracerec)
+ tracerec->q_us += RF_ETIMER_VAL_US(timer);
+ rf_GenericWakeupFunc(node,0);
+ return(0);
+}
+
+int rf_PQWriteDoubleRecoveryFunc(node)
+ RF_DagNode_t *node;
+{
+ /* The situation:
+
+ We are doing a write that hits only one
+ failed data unit.
+ The other failed data unit is not being overwritten, so
+ we need to generate it.
+
+ For the moment, we assume all the nonfailed data being
+ written is in the shadow of the failed data unit.
+     (i.e., either a single data unit write or the entire
+     failed stripe unit is being overwritten.)
+
+ Recovery strategy:
+ apply the recovery data to the parity and q.
+ Use P & Q to recover the second failed data unit in P.
+     Zero fill Q, then apply the recovered data to Q.
+ Then apply the data being written to the failed drive.
+ Then walk through the surviving drives, applying new data
+     when it exists, otherwise the recovery data. Quite a mess.
+
+
+ The params
+
+ read pda0, read pda1, ... read pda (numDataCol-3),
+ write pda0, ... , write pda (numStripeUnitAccess - numDataFailed),
+ failed pda, raidPtr, asmap
+ */
+
+ int np = node->numParams;
+ RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np-1].p;
+ RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np-2].p;
+ RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) &(raidPtr->Layout);
+ int i;
+ RF_RaidAddr_t sosAddr;
+ unsigned coeff;
+ RF_StripeCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
+ RF_PhysDiskAddr_t *ppda,*qpda,*pda,npda;
+ int numDataCol = layoutPtr->numDataCol;
+ RF_Etimer_t timer;
+ RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
+
+ RF_ASSERT(node->numResults == 2);
+ RF_ASSERT(asmap->failedPDAs[1] == NULL);
+ RF_ETIMER_START(timer);
+ ppda = node->results[0];
+ qpda = node->results[1];
+ /* apply the recovery data */
+ for (i=0; i < numDataCol-2; i++)
+ applyPDA(raidPtr,node->params[i].p,ppda,qpda, node->dagHdr->bp);
+
+ /* determine the other failed data unit */
+ pda = asmap->failedPDAs[0];
+ sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress);
+ /* need to determine the column of the other failed disk */
+ coeff = rf_RaidAddressToStripeUnitID(layoutPtr,pda->raidAddress);
+ /* compute the data unit offset within the column */
+ coeff = (coeff % raidPtr->Layout.numDataCol);
+ for (i=0; i < numDataCol; i++)
+ {
+ npda.raidAddress = sosAddr + (i * secPerSU);
+ (raidPtr->Layout.map->MapSector)(raidPtr,npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0);
+ /* skip over dead disks */
+ if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
+ if (i != coeff) break;
+ }
+ RF_ASSERT (i < numDataCol);
+ /* recover the data. The column we want to recover we write over the parity.
+ The column we don't care about we dump in q. */
+ if (coeff < i) /* recovering 'a' */
+ rf_PQ_recover((unsigned long *)ppda->bufPtr,(unsigned long *)qpda->bufPtr,(unsigned long *)ppda->bufPtr,(unsigned long *)qpda->bufPtr,rf_RaidAddressToByte(raidPtr,pda->numSector), coeff, i);
+ else /* recovering 'b' */
+ rf_PQ_recover((unsigned long *)ppda->bufPtr,(unsigned long *)qpda->bufPtr,(unsigned long *)qpda->bufPtr,(unsigned long *)ppda->bufPtr,rf_RaidAddressToByte(raidPtr,pda->numSector), i, coeff);
+
+	/* OK. The valid data is in P. Zero fill Q, then inc the recovered data (now in P) into Q. */
+ bzero(qpda->bufPtr,rf_RaidAddressToByte(raidPtr,qpda->numSector));
+ rf_IncQ((unsigned long *)qpda->bufPtr,(unsigned long *)ppda->bufPtr,rf_RaidAddressToByte(raidPtr,qpda->numSector),i);
+
+ /* now apply all the write data to the buffer */
+	/* single stripe unit write case: the failed data is the only thing we are writing. */
+ RF_ASSERT(asmap->numStripeUnitsAccessed == 1);
+ /* dest, src, len, coeff */
+ rf_IncQ((unsigned long *)qpda->bufPtr,(unsigned long *)asmap->failedPDAs[0]->bufPtr,rf_RaidAddressToByte(raidPtr,qpda->numSector),coeff);
+ rf_bxor(asmap->failedPDAs[0]->bufPtr,ppda->bufPtr,rf_RaidAddressToByte(raidPtr,ppda->numSector),node->dagHdr->bp);
+
+ /* now apply all the recovery data */
+ for (i=0; i < numDataCol-2; i++)
+ applyPDA(raidPtr,node->params[i].p,ppda,qpda, node->dagHdr->bp);
+
+ RF_ETIMER_STOP(timer);
+ RF_ETIMER_EVAL(timer);
+ if (tracerec)
+ tracerec->q_us += RF_ETIMER_VAL_US(timer);
+
+ rf_GenericWakeupFunc(node,0);
+ return(0);
+}
+
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDLargeWrite)
+{
+ RF_PANIC();
+}
+
+/*
+ Two lost data unit write case.
+
+ There are really two cases here:
+
+ (1) The write completely covers the two lost data units.
+ In that case, a reconstruct write that doesn't write the
+ failed data units will do the correct thing. So in this case,
+ the dag looks like
+
+      full stripe read of surviving data units (not being overwritten)
+ write new data (ignoring failed units) compute P&Q
+ write P&Q
+
+
+ (2) The write does not completely cover both failed data units
+ (but touches at least one of them). Then we need to do the
+ equivalent of a reconstruct read to recover the missing data
+ unit from the other stripe.
+
+ For any data we are writing that is not in the "shadow"
+ of the failed units, we need to do a four cycle update.
+  PANIC on this case for now.
+
+*/
+
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_200_CreateWriteDAG)
+{
+ RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
+ RF_SectorCount_t sectorsPerSU = layoutPtr->sectorsPerStripeUnit;
+ int sum;
+ int nf = asmap->numDataFailed;
+
+ sum = asmap->failedPDAs[0]->numSector;
+ if (nf == 2)
+ sum += asmap->failedPDAs[1]->numSector;
+
+ if ((nf == 2) && ( sum == (2*sectorsPerSU)))
+ {
+ /* large write case */
+ rf_PQ_DDLargeWrite(raidPtr, asmap, dag_h, bp, flags, allocList);
+ return;
+ }
+
+
+ if ((nf == asmap->numStripeUnitsAccessed) || (sum >= sectorsPerSU))
+ {
+ /* small write case, no user data not in shadow */
+ rf_PQ_DDSimpleSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList);
+ return;
+ }
+ RF_PANIC();
+}
+
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDSimpleSmallWrite)
+{
+ rf_DoubleDegSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList, "Rq", "Wq", "PQ Recovery", rf_PQWriteDoubleRecoveryFunc);
+}
+
+#endif /* (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) */
diff --git a/sys/dev/raidframe/rf_pqdegdags.h b/sys/dev/raidframe/rf_pqdegdags.h
new file mode 100644
index 00000000000..e860ffe0183
--- /dev/null
+++ b/sys/dev/raidframe/rf_pqdegdags.h
@@ -0,0 +1,77 @@
+/* $OpenBSD: rf_pqdegdags.h,v 1.1 1999/01/11 14:29:40 niklas Exp $ */
+/* $NetBSD: rf_pqdegdags.h,v 1.1 1998/11/13 04:20:32 oster Exp $ */
+/*
+ * rf_pqdegdags.h
+ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Daniel Stodolsky
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+/*
+ * rf_pqdegdags.c
+ * Degraded mode dags for double fault cases.
+ */
+/*
+ * :
+ * Log: rf_pqdegdags.h,v
+ * Revision 1.6 1996/07/31 15:35:20 jimz
+ * evenodd changes; bugfixes for double-degraded archs, generalize
+ * some formerly PQ-only functions
+ *
+ * Revision 1.5 1996/06/02 17:31:48 jimz
+ * Moved a lot of global stuff into array structure, where it belongs.
+ * Fixed up paritylogging, pss modules in this manner. Some general
+ * code cleanup. Removed lots of dead code, some dead files.
+ *
+ * Revision 1.4 1996/05/31 22:26:54 jimz
+ * fix a lot of mapping problems, memory allocation problems
+ * found some weird lock issues, fixed 'em
+ * more code cleanup
+ *
+ * Revision 1.3 1996/05/24 04:28:55 jimz
+ * release cleanup ckpt
+ *
+ * Revision 1.2 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.1 1996/05/18 19:56:30 jimz
+ * Initial revision
+ *
+ */
+
+#ifndef _RF__RF_PQDEGDAGS_H_
+#define _RF__RF_PQDEGDAGS_H_
+
+#include "rf_dag.h"
+
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_DoubleDegRead);
+int rf_PQDoubleRecoveryFunc(RF_DagNode_t *node);
+int rf_PQWriteDoubleRecoveryFunc(RF_DagNode_t *node);
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDLargeWrite);
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDSimpleSmallWrite);
+RF_CREATE_DAG_FUNC_DECL(rf_PQ_200_CreateWriteDAG);
+
+#endif /* !_RF__RF_PQDEGDAGS_H_ */
diff --git a/sys/dev/raidframe/rf_psstatus.c b/sys/dev/raidframe/rf_psstatus.c
new file mode 100644
index 00000000000..a1396d150bd
--- /dev/null
+++ b/sys/dev/raidframe/rf_psstatus.c
@@ -0,0 +1,417 @@
+/* $OpenBSD: rf_psstatus.c,v 1.1 1999/01/11 14:29:40 niklas Exp $ */
+/* $NetBSD: rf_psstatus.c,v 1.1 1998/11/13 04:20:32 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*****************************************************************************
+ *
+ * psstatus.c
+ *
+ * The reconstruction code maintains a bunch of status related to the parity
+ * stripes that are currently under reconstruction. This header file defines
+ * the status structures.
+ *
+ *****************************************************************************/
+
+/* :
+ * Log: rf_psstatus.c,v
+ * Revision 1.29 1996/07/27 23:36:08 jimz
+ * Solaris port of simulator
+ *
+ * Revision 1.28 1996/07/15 17:22:18 jimz
+ * nit-pick code cleanup
+ * resolve stdlib problems on DEC OSF
+ *
+ * Revision 1.27 1996/07/13 00:00:59 jimz
+ * sanitized generalized reconstruction architecture
+ * cleaned up head sep, rbuf problems
+ *
+ * Revision 1.26 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.25 1996/06/07 22:26:27 jimz
+ * type-ify which_ru (RF_ReconUnitNum_t)
+ *
+ * Revision 1.24 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.23 1996/06/05 18:06:02 jimz
+ * Major code cleanup. The Great Renaming is now done.
+ * Better modularity. Better typing. Fixed a bunch of
+ * synchronization bugs. Made a lot of global stuff
+ * per-desc or per-array. Removed dead code.
+ *
+ * Revision 1.22 1996/06/02 17:31:48 jimz
+ * Moved a lot of global stuff into array structure, where it belongs.
+ * Fixed up paritylogging, pss modules in this manner. Some general
+ * code cleanup. Removed lots of dead code, some dead files.
+ *
+ * Revision 1.21 1996/05/30 23:22:16 jimz
+ * bugfixes of serialization, timing problems
+ * more cleanup
+ *
+ * Revision 1.20 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.19 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.18 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.17 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.16 1996/05/20 16:15:27 jimz
+ * switch to rf_{mutex,cond}_{init,destroy}
+ *
+ * Revision 1.15 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.14 1995/12/12 18:10:06 jimz
+ * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT
+ * fix 80-column brain damage in comments
+ *
+ * Revision 1.13 1995/11/30 16:17:18 wvcii
+ * added copyright info
+ *
+ */
+
+#include "rf_types.h"
+#include "rf_raid.h"
+#include "rf_threadid.h"
+#include "rf_general.h"
+#include "rf_debugprint.h"
+#include "rf_freelist.h"
+#include "rf_psstatus.h"
+#include "rf_shutdown.h"
+#include "rf_sys.h"
+
+#define Dprintf1(s,a) if (rf_pssDebug) rf_debug_printf(s,(void *)((unsigned long)a),NULL,NULL,NULL,NULL,NULL,NULL,NULL)
+#define Dprintf2(s,a,b) if (rf_pssDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),NULL,NULL,NULL,NULL,NULL,NULL)
+#define Dprintf3(s,a,b,c) if (rf_pssDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),NULL,NULL,NULL,NULL,NULL)
+
+static void RealPrintPSStatusTable(RF_Raid_t *raidPtr,
+ RF_PSStatusHeader_t *pssTable);
+
+#define RF_MAX_FREE_PSS 32
+#define RF_PSS_INC 8
+#define RF_PSS_INITIAL 4
+
+static int init_pss( RF_ReconParityStripeStatus_t *, RF_Raid_t *);
+static void clean_pss(RF_ReconParityStripeStatus_t *, RF_Raid_t *);
+static void rf_ShutdownPSStatus(void *);
+
+static int init_pss(p, raidPtr)
+ RF_ReconParityStripeStatus_t *p;
+ RF_Raid_t *raidPtr;
+{
+ RF_Calloc(p->issued, raidPtr->numCol, sizeof(char), (char *));
+ if (p->issued == NULL)
+ return(ENOMEM);
+ return(0);
+}
+
+static void clean_pss(p, raidPtr)
+ RF_ReconParityStripeStatus_t *p;
+ RF_Raid_t *raidPtr;
+{
+ RF_Free(p->issued, raidPtr->numCol*sizeof(char));
+}
+
+static void rf_ShutdownPSStatus(arg)
+ void *arg;
+{
+ RF_Raid_t *raidPtr = (RF_Raid_t *)arg;
+
+ RF_FREELIST_DESTROY_CLEAN_ARG(raidPtr->pss_freelist,next,(RF_ReconParityStripeStatus_t *),clean_pss,raidPtr);
+}
+
+int rf_ConfigurePSStatus(
+ RF_ShutdownList_t **listp,
+ RF_Raid_t *raidPtr,
+ RF_Config_t *cfgPtr)
+{
+ int rc;
+
+ raidPtr->pssTableSize = RF_PSS_DEFAULT_TABLESIZE;
+ RF_FREELIST_CREATE(raidPtr->pss_freelist, RF_MAX_FREE_PSS,
+ RF_PSS_INC, sizeof(RF_ReconParityStripeStatus_t));
+ if (raidPtr->pss_freelist == NULL)
+ return(ENOMEM);
+ rc = rf_ShutdownCreate(listp, rf_ShutdownPSStatus, raidPtr);
+ if (rc) {
+ RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n",
+ __FILE__, __LINE__, rc);
+ rf_ShutdownPSStatus(raidPtr);
+ return(rc);
+ }
+ RF_FREELIST_PRIME_INIT_ARG(raidPtr->pss_freelist, RF_PSS_INITIAL,next,
+ (RF_ReconParityStripeStatus_t *),init_pss,raidPtr);
+ return(0);
+}
+
+/*****************************************************************************************
+ * sets up the pss table
+ * We pre-allocate a bunch of entries to avoid as much as possible having to
+ * malloc up hash chain entries.
+ ****************************************************************************************/
+RF_PSStatusHeader_t *rf_MakeParityStripeStatusTable(raidPtr)
+ RF_Raid_t *raidPtr;
+{
+ RF_PSStatusHeader_t *pssTable;
+ int i, j, rc;
+
+ RF_Calloc(pssTable, raidPtr->pssTableSize, sizeof(RF_PSStatusHeader_t), (RF_PSStatusHeader_t *));
+ for (i=0; i<raidPtr->pssTableSize; i++) {
+ rc = rf_mutex_init(&pssTable[i].mutex);
+ if (rc) {
+ RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ /* fail and deallocate */
+ for(j=0;j<i;j++) {
+				rf_mutex_destroy(&pssTable[j].mutex);
+ }
+ RF_Free(pssTable, raidPtr->pssTableSize*sizeof(RF_PSStatusHeader_t));
+ return(NULL);
+ }
+ }
+ return(pssTable);
+}
+
+void rf_FreeParityStripeStatusTable(raidPtr, pssTable)
+ RF_Raid_t *raidPtr;
+ RF_PSStatusHeader_t *pssTable;
+{
+ int i;
+
+ if (rf_pssDebug)
+ RealPrintPSStatusTable(raidPtr, pssTable);
+ for (i=0; i<raidPtr->pssTableSize; i++) {
+ if (pssTable[i].chain) {
+ printf("ERROR: pss hash chain not null at recon shutdown\n");
+ }
+ rf_mutex_destroy(&pssTable[i].mutex);
+ }
+ RF_Free(pssTable, raidPtr->pssTableSize * sizeof(RF_PSStatusHeader_t));
+}
+
+
+/* looks up the status structure for a parity stripe.
+ * if the create_flag is on, creates and returns the status structure if it doesn't exist;
+ * otherwise returns NULL if the status structure does not exist
+ *
+ * ASSUMES THE PSS DESCRIPTOR IS LOCKED UPON ENTRY
+ */
+RF_ReconParityStripeStatus_t *rf_LookupRUStatus(
+ RF_Raid_t *raidPtr,
+ RF_PSStatusHeader_t *pssTable,
+ RF_StripeNum_t psID,
+ RF_ReconUnitNum_t which_ru,
+ RF_PSSFlags_t flags, /* whether or not to create it if it doesn't exist + what flags to set initially */
+ int *created)
+{
+ RF_PSStatusHeader_t *hdr = &pssTable[ RF_HASH_PSID(raidPtr,psID) ];
+ RF_ReconParityStripeStatus_t *p, *pssPtr = hdr->chain;
+
+ *created = 0;
+ for (p = pssPtr; p; p=p->next) {
+ if (p->parityStripeID == psID && p->which_ru == which_ru)
+ break;
+ }
+
+ if (!p && (flags&RF_PSS_CREATE)) {
+ Dprintf2("PSS: creating pss for psid %ld ru %d\n",psID,which_ru);
+ p = rf_AllocPSStatus(raidPtr);
+ p->next = hdr->chain; hdr->chain = p;
+
+ p->parityStripeID = psID;
+ p->which_ru = which_ru;
+ p->flags = flags;
+ p->rbuf = NULL;
+ p->writeRbuf = NULL;
+ p->blockCount = 0;
+ p->procWaitList = NULL;
+ p->blockWaitList = NULL;
+ p->bufWaitList = NULL;
+ *created = 1;
+ } else if (p) { /* we didn't create, but we want to specify some new status */
+ p->flags |= flags; /* add in whatever flags we're specifying */
+ }
+ if (p && (flags & RF_PSS_RECON_BLOCKED)) {
+ int tid;
+ rf_get_threadid(tid);
+ p->blockCount++; /* if we're asking to block recon, bump the count */
+ Dprintf3("[%d] Blocked recon on psid %ld. count now %d\n",tid,psID,p->blockCount);
+ }
+ return(p);
+}
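+
+/*
+ * Illustrative usage sketch for rf_LookupRUStatus() above (a hypothetical
+ * caller, not part of RAIDframe): callers hash the parity stripe ID to find
+ * the chain header, take its mutex, and only then do the lookup.
+ */
+#if 0
+static RF_ReconParityStripeStatus_t *
+example_lookup(RF_Raid_t *raidPtr, RF_PSStatusHeader_t *pssTable,
+    RF_StripeNum_t psID, RF_ReconUnitNum_t which_ru)
+{
+	RF_PSStatusHeader_t *hdr = &pssTable[RF_HASH_PSID(raidPtr, psID)];
+	RF_ReconParityStripeStatus_t *pssPtr;
+	int created;
+
+	RF_LOCK_MUTEX(hdr->mutex);
+	pssPtr = rf_LookupRUStatus(raidPtr, pssTable, psID, which_ru,
+	    RF_PSS_CREATE, &created);
+	/* ... use or update *pssPtr while the chain mutex is held ... */
+	RF_UNLOCK_MUTEX(hdr->mutex);
+	return(pssPtr);
+}
+#endif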
+
+/* deletes an entry from the parity stripe status table. typically used
+ * when an entry has been allocated solely to block reconstruction, and
+ * no recon was requested while recon was blocked. Assumes the hash
+ * chain is ALREADY LOCKED.
+ */
+void rf_PSStatusDelete(raidPtr, pssTable, pssPtr)
+ RF_Raid_t *raidPtr;
+ RF_PSStatusHeader_t *pssTable;
+ RF_ReconParityStripeStatus_t *pssPtr;
+{
+ RF_PSStatusHeader_t *hdr = &(pssTable[ RF_HASH_PSID(raidPtr,pssPtr->parityStripeID) ] );
+ RF_ReconParityStripeStatus_t *p = hdr->chain, *pt = NULL;
+
+ while (p) {
+ if (p == pssPtr) {
+ if (pt) pt->next = p->next; else hdr->chain = p->next;
+ p->next = NULL;
+ rf_FreePSStatus(raidPtr, p);
+ return;
+ }
+ pt = p; p=p->next;
+ }
+ RF_ASSERT(0); /* we must find it here */
+}
+
+/* deletes an entry from the ps status table after reconstruction has completed */
+void rf_RemoveFromActiveReconTable(raidPtr, row, psid, which_ru)
+ RF_Raid_t *raidPtr;
+ RF_RowCol_t row;
+ RF_ReconUnitNum_t which_ru;
+ RF_StripeNum_t psid;
+{
+ RF_PSStatusHeader_t *hdr = &(raidPtr->reconControl[row]->pssTable[ RF_HASH_PSID(raidPtr,psid) ]);
+ RF_ReconParityStripeStatus_t *p, *pt;
+ RF_CallbackDesc_t *cb, *cb1;
+
+ RF_LOCK_MUTEX( hdr->mutex );
+ for (pt=NULL, p = hdr->chain; p; pt=p,p=p->next) {
+ if ((p->parityStripeID == psid) && (p->which_ru == which_ru))
+ break;
+ }
+ if (p == NULL) {
+ rf_PrintPSStatusTable(raidPtr, row);
+ }
+ RF_ASSERT(p); /* it must be there */
+
+ Dprintf2("PSS: deleting pss for psid %ld ru %d\n",psid,which_ru);
+
+ /* delete this entry from the hash chain */
+ if (pt) pt->next = p->next;
+ else hdr->chain = p->next;
+ p->next = NULL;
+
+ RF_UNLOCK_MUTEX( hdr->mutex );
+
+	/* wake up anyone waiting on the parity stripe ID */
+ cb = p->procWaitList;
+ p->procWaitList = NULL;
+ while (cb) {
+ Dprintf1("Waking up access waiting on parity stripe ID %ld\n",p->parityStripeID);
+ cb1 = cb->next;
+ (cb->callbackFunc)(cb->callbackArg);
+
+ /* THIS IS WHAT THE ORIGINAL CODE HAD... the extra 0 is bogus, IMHO */
+ /* (cb->callbackFunc)(cb->callbackArg, 0); */
+ rf_FreeCallbackDesc(cb);
+ cb = cb1;
+ }
+
+ rf_FreePSStatus(raidPtr, p);
+}
+
+RF_ReconParityStripeStatus_t *rf_AllocPSStatus(raidPtr)
+ RF_Raid_t *raidPtr;
+{
+ RF_ReconParityStripeStatus_t *p;
+
+ RF_FREELIST_GET_INIT_ARG(raidPtr->pss_freelist,p,next,(RF_ReconParityStripeStatus_t *),init_pss,raidPtr);
+ if (p) {
+ bzero(p->issued, raidPtr->numCol);
+ }
+ p->next = NULL;
+ /* no need to initialize here b/c the only place we're called from is the above Lookup */
+ return(p);
+}
+
+void rf_FreePSStatus(raidPtr, p)
+ RF_Raid_t *raidPtr;
+ RF_ReconParityStripeStatus_t *p;
+{
+ RF_ASSERT(p->procWaitList == NULL);
+ RF_ASSERT(p->blockWaitList == NULL);
+ RF_ASSERT(p->bufWaitList == NULL);
+
+ RF_FREELIST_FREE_CLEAN_ARG(raidPtr->pss_freelist,p,next,clean_pss,raidPtr);
+}
+
+static void RealPrintPSStatusTable(raidPtr, pssTable)
+ RF_Raid_t *raidPtr;
+ RF_PSStatusHeader_t *pssTable;
+{
+ int i, j, procsWaiting, blocksWaiting, bufsWaiting;
+ RF_ReconParityStripeStatus_t *p;
+ RF_CallbackDesc_t *cb;
+
+ printf("\nParity Stripe Status Table\n");
+ for (i=0; i< raidPtr->pssTableSize; i++) {
+ for (p = pssTable[i].chain; p; p=p->next) {
+ procsWaiting = blocksWaiting = bufsWaiting = 0;
+ for (cb = p->procWaitList; cb; cb=cb->next) procsWaiting++;
+ for (cb = p->blockWaitList; cb; cb=cb->next) blocksWaiting++;
+ for (cb = p->bufWaitList; cb; cb=cb->next) bufsWaiting++;
+ printf("PSID %ld RU %d : blockCount %d %d/%d/%d proc/block/buf waiting, issued ",
+ (long)p->parityStripeID, p->which_ru, p->blockCount, procsWaiting, blocksWaiting, bufsWaiting);
+ for (j=0;j<raidPtr->numCol; j++) printf("%c", (p->issued[j]) ? '1' : '0');
+ if (!p->flags) printf(" flags: (none)");
+ else {
+ if (p->flags & RF_PSS_UNDER_RECON) printf(" under-recon");
+ if (p->flags & RF_PSS_FORCED_ON_WRITE) printf(" forced-w");
+ if (p->flags & RF_PSS_FORCED_ON_READ) printf(" forced-r");
+ if (p->flags & RF_PSS_RECON_BLOCKED) printf(" blocked");
+ if (p->flags & RF_PSS_BUFFERWAIT) printf(" bufwait");
+ }
+ printf("\n");
+ }
+ }
+}
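+
+/*
+ * Editorial illustration (values made up): for a 4-column array with one
+ * process blocking recon and reads already issued on columns 1 and 2, the
+ * loop above would print a line of the form
+ *
+ *   PSID 42 RU 0 : blockCount 1 0/0/0 proc/block/buf waiting, issued 0110 blocked
+ */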
+
+void rf_PrintPSStatusTable(raidPtr, row)
+ RF_Raid_t *raidPtr;
+ RF_RowCol_t row;
+{
+ RF_PSStatusHeader_t *pssTable = raidPtr->reconControl[row]->pssTable;
+ RealPrintPSStatusTable(raidPtr, pssTable);
+}
diff --git a/sys/dev/raidframe/rf_psstatus.h b/sys/dev/raidframe/rf_psstatus.h
new file mode 100644
index 00000000000..eaca5822094
--- /dev/null
+++ b/sys/dev/raidframe/rf_psstatus.h
@@ -0,0 +1,154 @@
+/* $OpenBSD: rf_psstatus.h,v 1.1 1999/01/11 14:29:41 niklas Exp $ */
+/* $NetBSD: rf_psstatus.h,v 1.1 1998/11/13 04:20:32 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*****************************************************************************
+ *
+ * psstatus.h
+ *
+ * The reconstruction code maintains a bunch of status related to the parity
+ * stripes that are currently under reconstruction. This header file defines
+ * the status structures.
+ *
+ *****************************************************************************/
+
+/* :
+ * Log: rf_psstatus.h,v
+ * Revision 1.16 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.15 1996/06/07 22:26:27 jimz
+ * type-ify which_ru (RF_ReconUnitNum_t)
+ *
+ * Revision 1.14 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.13 1996/06/05 18:06:02 jimz
+ * Major code cleanup. The Great Renaming is now done.
+ * Better modularity. Better typing. Fixed a bunch of
+ * synchronization bugs. Made a lot of global stuff
+ * per-desc or per-array. Removed dead code.
+ *
+ * Revision 1.12 1996/06/02 17:31:48 jimz
+ * Moved a lot of global stuff into array structure, where it belongs.
+ * Fixed up paritylogging, pss modules in this manner. Some general
+ * code cleanup. Removed lots of dead code, some dead files.
+ *
+ * Revision 1.11 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.10 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.9 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.8 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.7 1995/11/30 16:17:28 wvcii
+ * added copyright info
+ *
+ */
+
+#ifndef _RF__RF_PSSTATUS_H_
+#define _RF__RF_PSSTATUS_H_
+
+#include "rf_types.h"
+#include "rf_threadstuff.h"
+#include "rf_callback.h"
+
+#define RF_PS_MAX_BUFS 10 /* max number of bufs we'll accumulate before we do an XOR */
+
+#define RF_PSS_DEFAULT_TABLESIZE 200
+
+/*
+ * Macros to acquire/release the mutex lock on a parity stripe status
+ * descriptor. Note that we use just one lock for the whole hash chain.
+ */
+#define RF_HASH_PSID(_raid_,_psid_) ( (_psid_) % ((_raid_)->pssTableSize) ) /* simple hash function */
+#define RF_LOCK_PSS_MUTEX(_raidPtr, _row, _psid) \
+ RF_LOCK_MUTEX((_raidPtr)->reconControl[_row]->pssTable[ RF_HASH_PSID(_raidPtr,_psid) ].mutex)
+#define RF_UNLOCK_PSS_MUTEX(_raidPtr, _row, _psid) \
+ RF_UNLOCK_MUTEX((_raidPtr)->reconControl[_row]->pssTable[ RF_HASH_PSID(_raidPtr,_psid) ].mutex)
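+
+/*
+ * Editorial example of how the hash and lock macros above fit together
+ * (not part of the original import): with the default table size of 200,
+ * parity stripe ID 1234 hashes to bucket 1234 % 200 = 34, so every PSS
+ * entry whose ID is congruent to 34 mod 200 lives on that chain and is
+ * protected by that one bucket's mutex.  Any walk or update of the chain
+ * is bracketed by RF_LOCK_PSS_MUTEX(raidPtr, row, 1234) and
+ * RF_UNLOCK_PSS_MUTEX(raidPtr, row, 1234).
+ */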
+
+struct RF_ReconParityStripeStatus_s {
+ RF_StripeNum_t parityStripeID; /* the parity stripe ID */
+	RF_ReconUnitNum_t which_ru; /* which reconstruction unit within the indicated parity stripe */
+ RF_PSSFlags_t flags; /* flags indicating various conditions */
+ void *rbuf; /* this is the accumulating xor sum */
+ void *writeRbuf; /* DEBUG ONLY: a pointer to the rbuf after it has filled & been sent to disk */
+ void *rbufsForXor[RF_PS_MAX_BUFS]; /* these are buffers still to be xored into the accumulating sum */
+ int xorBufCount; /* num buffers waiting to be xored */
+ int blockCount; /* count of # proc that have blocked recon on this parity stripe */
+ char *issued; /* issued[i]==1 <=> column i has already issued a read request for the indicated RU */
+ RF_CallbackDesc_t *procWaitList; /* list of user procs waiting for recon to be done */
+ RF_CallbackDesc_t *blockWaitList;/* list of disks blocked waiting for user write to complete */
+ RF_CallbackDesc_t *bufWaitList; /* list of disks blocked waiting to acquire a buffer for this RU */
+ RF_ReconParityStripeStatus_t *next;
+};
+
+struct RF_PSStatusHeader_s {
+ RF_DECLARE_MUTEX(mutex) /* mutex for this hash chain */
+ RF_ReconParityStripeStatus_t *chain; /* the hash chain */
+};
+
+/* masks for the "flags" field above */
+#define RF_PSS_NONE 0x00000000 /* no flags */
+#define RF_PSS_UNDER_RECON 0x00000001 /* this parity stripe is currently under reconstruction */
+#define RF_PSS_FORCED_ON_WRITE 0x00000002 /* indicates a recon was forced due to a user-write operation */
+#define RF_PSS_FORCED_ON_READ 0x00000004 /* ditto for read, but not currently implemented */
+#define RF_PSS_RECON_BLOCKED 0x00000008 /* reconstruction is currently blocked due to a pending user I/O */
+#define RF_PSS_CREATE 0x00000010 /* tells LookupRUStatus to create the entry */
+#define RF_PSS_BUFFERWAIT 0x00000020 /* someone is waiting for a buffer for this RU */
+
+int rf_ConfigurePSStatus(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr,
+ RF_Config_t *cfgPtr);
+
+RF_PSStatusHeader_t *rf_MakeParityStripeStatusTable(RF_Raid_t *raidPtr);
+void rf_FreeParityStripeStatusTable(RF_Raid_t *raidPtr,
+ RF_PSStatusHeader_t *pssTable);
+RF_ReconParityStripeStatus_t *rf_LookupRUStatus(RF_Raid_t *raidPtr,
+ RF_PSStatusHeader_t *pssTable, RF_StripeNum_t psID,
+ RF_ReconUnitNum_t which_ru, RF_PSSFlags_t flags, int *created);
+void rf_PSStatusDelete(RF_Raid_t *raidPtr, RF_PSStatusHeader_t *pssTable,
+ RF_ReconParityStripeStatus_t *pssPtr);
+void rf_RemoveFromActiveReconTable(RF_Raid_t *raidPtr, RF_RowCol_t row,
+ RF_StripeNum_t psid, RF_ReconUnitNum_t which_ru);
+RF_ReconParityStripeStatus_t *rf_AllocPSStatus(RF_Raid_t *raidPtr);
+void rf_FreePSStatus(RF_Raid_t *raidPtr, RF_ReconParityStripeStatus_t *p);
+void rf_PrintPSStatusTable(RF_Raid_t *raidPtr, RF_RowCol_t row);
+
+#endif /* !_RF__RF_PSSTATUS_H_ */
diff --git a/sys/dev/raidframe/rf_raid.h b/sys/dev/raidframe/rf_raid.h
new file mode 100644
index 00000000000..278cc9f507a
--- /dev/null
+++ b/sys/dev/raidframe/rf_raid.h
@@ -0,0 +1,437 @@
+/* $OpenBSD: rf_raid.h,v 1.1 1999/01/11 14:29:41 niklas Exp $ */
+/* $NetBSD: rf_raid.h,v 1.1 1998/11/13 04:20:32 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/**********************************************
+ * rf_raid.h -- main header file for RAID driver
+ **********************************************/
+
+/*
+ * :
+ * Log: rf_raid.h,v
+ * Revision 1.48 1996/08/20 22:33:54 jimz
+ * make hist_diskreq a doubly-indexed array
+ *
+ * Revision 1.47 1996/07/15 05:40:41 jimz
+ * some recon datastructure cleanup
+ * better handling of multiple failures
+ * added undocumented double-recon test
+ *
+ * Revision 1.46 1996/07/10 22:28:51 jimz
+ * get rid of obsolete row statuses (dead,degraded2)
+ *
+ * Revision 1.45 1996/06/14 14:56:29 jimz
+ * make engine threading stuff ifndef SIMULATE
+ *
+ * Revision 1.44 1996/06/14 14:16:54 jimz
+ * move in engine node queue, atomicity control
+ *
+ * Revision 1.43 1996/06/12 04:41:26 jimz
+ * tweaks to make genplot work with user-level driver
+ * (mainly change stat collection)
+ *
+ * Revision 1.42 1996/06/11 10:57:17 jimz
+ * add recon_done_procs, recon_done_proc_mutex
+ *
+ * Revision 1.41 1996/06/11 01:26:48 jimz
+ * added mechanism for user-level to sync diskthread startup,
+ * shutdown
+ *
+ * Revision 1.40 1996/06/10 14:18:58 jimz
+ * move user, throughput stats into per-array structure
+ *
+ * Revision 1.39 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.38 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.37 1996/06/05 19:38:32 jimz
+ * fixed up disk queueing types config
+ * added sstf disk queueing
+ * fixed exit bug on diskthreads (ref-ing bad mem)
+ *
+ * Revision 1.36 1996/06/05 18:06:02 jimz
+ * Major code cleanup. The Great Renaming is now done.
+ * Better modularity. Better typing. Fixed a bunch of
+ * synchronization bugs. Made a lot of global stuff
+ * per-desc or per-array. Removed dead code.
+ *
+ * Revision 1.35 1996/06/03 23:28:26 jimz
+ * more bugfixes
+ * check in tree to sync for IPDS runs with current bugfixes
+ * there still may be a problem with threads in the script test
+ * getting I/Os stuck- not trivially reproducible (runs ~50 times
+ * in a row without getting stuck)
+ *
+ * Revision 1.34 1996/06/02 17:31:48 jimz
+ * Moved a lot of global stuff into array structure, where it belongs.
+ * Fixed up paritylogging, pss modules in this manner. Some general
+ * code cleanup. Removed lots of dead code, some dead files.
+ *
+ * Revision 1.33 1996/05/30 23:22:16 jimz
+ * bugfixes of serialization, timing problems
+ * more cleanup
+ *
+ * Revision 1.32 1996/05/30 11:29:41 jimz
+ * Numerous bug fixes. Stripe lock release code disagreed with the taking code
+ * about when stripes should be locked (I made it consistent: no parity, no lock)
+ * There was a lot of extra serialization of I/Os which I've removed- a lot of
+ * it was to calculate values for the cache code, which is no longer with us.
+ * More types, function, macro cleanup. Added code to properly quiesce the array
+ * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
+ * before. Fixed memory allocation, freeing bugs.
+ *
+ * Revision 1.31 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.30 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.29 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.28 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.27 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.26 1996/05/08 21:01:24 jimz
+ * fixed up enum type names that were conflicting with other
+ * enums and function names (ie, "panic")
+ * future naming trends will be towards RF_ and rf_ for
+ * everything raidframe-related
+ *
+ * Revision 1.25 1996/05/02 14:57:55 jimz
+ * add sectorMask
+ *
+ * Revision 1.24 1996/04/22 15:53:13 jimz
+ * MAX_RAIDS -> NRAIDFRAME
+ *
+ * Revision 1.23 1995/12/14 18:39:46 jimz
+ * convert to rf_types.h types
+ *
+ * Revision 1.22 1995/12/06 15:02:26 root
+ * added copyright info
+ *
+ * Revision 1.21 1995/10/09 17:39:24 jimz
+ * added info for tracking number of outstanding accesses
+ * at user-level
+ *
+ * Revision 1.20 1995/09/30 20:37:46 jimz
+ * added acc_totals to Raid for kernel
+ *
+ * Revision 1.19 1995/09/19 22:57:14 jimz
+ * add cache of raidid for kernel
+ *
+ * Revision 1.18 1995/09/18 16:50:04 jimz
+ * added RF_MAX_DISKS (for config ioctls)
+ *
+ * Revision 1.17 1995/09/07 19:02:31 jimz
+ * mods to get raidframe to compile and link
+ * in kernel environment
+ *
+ * Revision 1.16 1995/07/21 19:29:51 robby
+ * added some info for the idler to the Raid
+ *
+ * Revision 1.15 1995/07/16 03:19:14 cfb
+ * added cachePtr to *raidPtr
+ *
+ * Revision 1.14 1995/06/23 13:39:36 robby
+ * updated to prototypes in rf_layout.h
+ *
+ */
+
+#ifndef _RF__RF_RAID_H_
+#define _RF__RF_RAID_H_
+
+#ifdef _KERNEL
+#define KERNEL
+#endif
+
+#include "rf_archs.h"
+#include "rf_types.h"
+#include "rf_threadstuff.h"
+
+#ifdef _KERNEL
+#if defined(__NetBSD__)
+#include "rf_netbsd.h"
+#elif defined(__OpenBSD__)
+#include "rf_openbsd.h"
+#endif
+#endif
+
+#ifdef KERNEL
+/* XXX Needs to be added. GO
+#include <raidframe.h>
+*/
+#include <sys/disklabel.h>
+#else /* KERNEL */
+#include <stdio.h>
+#include <assert.h>
+#endif /* KERNEL */
+#include <sys/types.h>
+
+#include "rf_alloclist.h"
+#include "rf_stripelocks.h"
+#include "rf_layout.h"
+#include "rf_disks.h"
+#include "rf_debugMem.h"
+#include "rf_diskqueue.h"
+#include "rf_reconstruct.h"
+#include "rf_acctrace.h"
+
+#if RF_INCLUDE_PARITYLOGGING > 0
+#include "rf_paritylog.h"
+#endif /* RF_INCLUDE_PARITYLOGGING > 0 */
+
+#define RF_MAX_DISKS 128 /* max disks per array */
+#if defined(__NetBSD__) || defined(__OpenBSD__)
+#define RF_DEV2RAIDID(_dev) (DISKUNIT(_dev))
+#else
+#define RF_DEV2RAIDID(_dev) (minor(_dev)>>6) /* convert dev_t to raid id */
+#endif
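+
+/*
+ * Editorial note: in the non-BSD case the raid unit lives in the upper bits
+ * of the minor number, so minors 0-63 map to raid 0, 64-127 to raid 1, and
+ * so on; in the NetBSD/OpenBSD case DISKUNIT() extracts the unit from the
+ * device number instead.
+ */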
+
+/*
+ * Each row in the array is a distinct parity group, so
+ * each has its own status, which is one of the following.
+ */
+typedef enum RF_RowStatus_e {
+ rf_rs_optimal,
+ rf_rs_degraded,
+ rf_rs_reconstructing,
+ rf_rs_reconfigured
+} RF_RowStatus_t;
+
+struct RF_CumulativeStats_s {
+	struct timeval start; /* the time when the stats were last started */
+ struct timeval stop; /* the time when the stats were last stopped */
+ long sum_io_us; /* sum of all user response times (us) */
+ long num_ios; /* total number of I/Os serviced */
+ long num_sect_moved; /* total number of sectors read or written */
+};
+
+struct RF_ThroughputStats_s {
+	RF_DECLARE_MUTEX(mutex) /* mutex protecting these throughput statistics */
+ struct timeval start; /* timer started when numOutstandingRequests moves from 0 to 1 */
+ struct timeval stop; /* timer stopped when numOutstandingRequests moves from 1 to 0 */
+ RF_uint64 sum_io_us; /* total time timer is enabled */
+ RF_uint64 num_ios; /* total number of ios processed by RAIDframe */
+ long num_out_ios; /* number of outstanding ios */
+};
+
+#ifdef SIMULATE
+typedef struct RF_PendingRecon_s RF_PendingRecon_t;
+struct RF_PendingRecon_s {
+ RF_RowCol_t row;
+ RF_RowCol_t col;
+ RF_PendingRecon_t *next;
+};
+#endif /* SIMULATE */
+
+struct RF_Raid_s {
+ /* This portion never changes, and can be accessed without locking */
+ /* an exception is Disks[][].status, which requires locking when it is changed */
+ u_int numRow; /* number of rows of disks, typically == # of ranks */
+ u_int numCol; /* number of columns of disks, typically == # of disks/rank */
+ u_int numSpare; /* number of spare disks */
+ int maxQueueDepth; /* max disk queue depth */
+ RF_SectorCount_t totalSectors; /* total number of sectors in the array */
+ RF_SectorCount_t sectorsPerDisk; /* number of sectors on each disk */
+ u_int logBytesPerSector; /* base-2 log of the number of bytes in a sector */
+ u_int bytesPerSector; /* bytes in a sector */
+ RF_int32 sectorMask; /* mask of bytes-per-sector */
+
+ RF_RaidLayout_t Layout; /* all information related to layout */
+ RF_RaidDisk_t **Disks; /* all information related to physical disks */
+ RF_DiskQueue_t **Queues; /* all information related to disk queues */
+ /* NOTE: This is an anchor point via which the queues can be accessed,
+ * but the enqueue/dequeue routines in diskqueue.c use a local copy of
+ * this pointer for the actual accesses.
+ */
+ /* The remainder of the structure can change, and therefore requires locking on reads and updates */
+ RF_DECLARE_MUTEX(mutex) /* mutex used to serialize access to the fields below */
+ RF_RowStatus_t *status; /* the status of each row in the array */
+ int valid; /* indicates successful configuration */
+ RF_LockTableEntry_t *lockTable; /* stripe-lock table */
+	RF_LockTableEntry_t *quiesceLock; /* quiescence lock table */
+ int numFailures; /* total number of failures in the array */
+
+ /*
+ * Cleanup stuff
+ */
+ RF_ShutdownList_t *shutdownList; /* shutdown activities */
+ RF_AllocListElem_t *cleanupList; /* memory to be freed at shutdown time */
+
+ /*
+ * Recon stuff
+ */
+ RF_HeadSepLimit_t headSepLimit;
+ int numFloatingReconBufs;
+ int reconInProgress;
+#ifdef SIMULATE
+ RF_PendingRecon_t *pendingRecon;
+#endif /* SIMULATE */
+ RF_DECLARE_COND(waitForReconCond)
+ RF_RaidReconDesc_t *reconDesc; /* reconstruction descriptor */
+ RF_ReconCtrl_t **reconControl; /* reconstruction control structure pointers for each row in the array */
+
+#if !defined(KERNEL) && !defined(SIMULATE)
+ /*
+ * Disk thread stuff
+ */
+ int diskthreads_created;
+ int diskthreads_running;
+ int diskthreads_shutdown;
+ RF_DECLARE_MUTEX(diskthread_count_mutex)
+ RF_DECLARE_COND(diskthread_count_cond)
+#endif /* !KERNEL && !SIMULATE */
+
+ /*
+ * Array-quiescence stuff
+ */
+ RF_DECLARE_MUTEX(access_suspend_mutex)
+ RF_DECLARE_COND(quiescent_cond)
+ RF_IoCount_t accesses_suspended;
+ RF_IoCount_t accs_in_flight;
+ int access_suspend_release;
+ int waiting_for_quiescence;
+ RF_CallbackDesc_t *quiesce_wait_list;
+
+ /*
+ * Statistics
+ */
+#if !defined(KERNEL) && !defined(SIMULATE)
+ RF_ThroughputStats_t throughputstats;
+#endif /* !KERNEL && !SIMULATE */
+ RF_CumulativeStats_t userstats;
+
+ /*
+ * Engine thread control
+ */
+ RF_DECLARE_MUTEX(node_queue_mutex)
+ RF_DECLARE_COND(node_queue_cond)
+ RF_DagNode_t *node_queue;
+#ifndef SIMULATE
+ RF_Thread_t engine_thread;
+ RF_ThreadGroup_t engine_tg;
+#endif /* !SIMULATE */
+ int shutdown_engine;
+ int dags_in_flight; /* debug */
+
+ /*
+ * PSS (Parity Stripe Status) stuff
+ */
+ RF_FreeList_t *pss_freelist;
+ long pssTableSize;
+
+ /*
+ * Reconstruction stuff
+ */
+ int procsInBufWait;
+ int numFullReconBuffers;
+ RF_AccTraceEntry_t *recon_tracerecs;
+ unsigned long accumXorTimeUs;
+ RF_ReconDoneProc_t *recon_done_procs;
+ RF_DECLARE_MUTEX(recon_done_proc_mutex)
+
+#if !defined(KERNEL) && !defined(SIMULATE)
+ RF_Thread_t **diskthreads, *sparediskthreads; /* thread descriptors for disk threads in user-level version */
+#endif /* !KERNEL && !SIMULATE */
+
+ /*
+ * nAccOutstanding, waitShutdown protected by desc freelist lock
+ * (This may seem strange, since that's a central serialization point
+ * for a per-array piece of data, but otherwise, it'd be an extra
+ * per-array lock, and that'd only be less efficient...)
+ */
+ RF_DECLARE_COND(outstandingCond)
+ int waitShutdown;
+ int nAccOutstanding;
+
+ RF_DiskId_t **diskids;
+ RF_DiskId_t *sparediskids;
+
+#ifdef KERNEL
+ int raidid;
+#endif /* KERNEL */
+ RF_AccTotals_t acc_totals;
+ int keep_acc_totals;
+
+#ifdef _KERNEL
+ struct raidcinfo **raid_cinfo; /* array of component info */
+ struct proc *proc; /* XXX shouldn't be needed here.. :-p */
+#endif
+
+ int terminate_disk_queues;
+
+ /*
+ * XXX
+ *
+ * config-specific information should be moved
+ * somewhere else, or at least hung off this
+ * in some generic way
+ */
+
+ /* used by rf_compute_workload_shift */
+ RF_RowCol_t hist_diskreq[RF_MAXROW][RF_MAXCOL];
+
+ /* used by declustering */
+ int noRotate;
+
+#if RF_INCLUDE_PARITYLOGGING > 0
+ /* used by parity logging */
+ RF_SectorCount_t regionLogCapacity;
+ RF_ParityLogQueue_t parityLogPool; /* pool of unused parity logs */
+ RF_RegionInfo_t *regionInfo; /* array of region state */
+ int numParityLogs;
+ int numSectorsPerLog;
+ int regionParityRange;
+ int logsInUse; /* debugging */
+ RF_ParityLogDiskQueue_t parityLogDiskQueue; /* state of parity logging disk work */
+ RF_RegionBufferQueue_t regionBufferPool; /* buffers for holding region log */
+ RF_RegionBufferQueue_t parityBufferPool; /* buffers for holding parity */
+ caddr_t parityLogBufferHeap; /* pool of unused parity logs */
+#ifndef SIMULATE
+ RF_Thread_t pLogDiskThreadHandle;
+#endif /* !SIMULATE */
+
+#endif /* RF_INCLUDE_PARITYLOGGING > 0 */
+};
+
+#endif /* !_RF__RF_RAID_H_ */
diff --git a/sys/dev/raidframe/rf_raid0.c b/sys/dev/raidframe/rf_raid0.c
new file mode 100644
index 00000000000..c81068affd9
--- /dev/null
+++ b/sys/dev/raidframe/rf_raid0.c
@@ -0,0 +1,242 @@
+/* $OpenBSD: rf_raid0.c,v 1.1 1999/01/11 14:29:41 niklas Exp $ */
+/* $NetBSD: rf_raid0.c,v 1.1 1998/11/13 04:20:33 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/***************************************
+ *
+ * rf_raid0.c -- implements RAID Level 0
+ *
+ ***************************************/
+
+/*
+ * :
+ * Log: rf_raid0.c,v
+ * Revision 1.24 1996/07/31 16:56:18 jimz
+ * dataBytesPerStripe, sectorsPerDisk init arch-indep.
+ *
+ * Revision 1.23 1996/07/27 23:36:08 jimz
+ * Solaris port of simulator
+ *
+ * Revision 1.22 1996/07/18 22:57:14 jimz
+ * port simulator to AIX
+ *
+ * Revision 1.21 1996/06/19 22:07:34 jimz
+ * added parity verify
+ *
+ * Revision 1.20 1996/06/17 14:38:33 jimz
+ * properly #if out RF_DEMO code
+ * fix bug in MakeConfig that was causing weird behavior
+ * in configuration routines (config was not zeroed at start)
+ * clean up genplot handling of stacks
+ *
+ * Revision 1.19 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.18 1996/06/07 22:26:27 jimz
+ * type-ify which_ru (RF_ReconUnitNum_t)
+ *
+ * Revision 1.17 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.16 1996/06/03 23:28:26 jimz
+ * more bugfixes
+ * check in tree to sync for IPDS runs with current bugfixes
+ * there still may be a problem with threads in the script test
+ * getting I/Os stuck- not trivially reproducible (runs ~50 times
+ * in a row without getting stuck)
+ *
+ * Revision 1.15 1996/05/31 22:26:54 jimz
+ * fix a lot of mapping problems, memory allocation problems
+ * found some weird lock issues, fixed 'em
+ * more code cleanup
+ *
+ * Revision 1.14 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.13 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.12 1996/05/24 01:59:45 jimz
+ * another checkpoint in code cleanup for release
+ * time to sync kernel tree
+ *
+ * Revision 1.11 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.10 1996/05/03 19:37:32 wvcii
+ * moved dag creation routines to dag library
+ *
+ * Revision 1.9 1995/12/12 18:10:06 jimz
+ * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT
+ * fix 80-column brain damage in comments
+ *
+ * Revision 1.8 1995/12/06 15:06:36 root
+ * added copyright info
+ *
+ * Revision 1.7 1995/11/17 18:57:15 wvcii
+ * added prototyping to MapParity
+ *
+ * Revision 1.6 1995/11/16 13:53:51 wvcii
+ * fixed bug in CreateRAID0WriteDAG prototype
+ *
+ * Revision 1.5 1995/11/07 15:22:01 wvcii
+ * changed RAID0DagSelect prototype
+ * function no longer generates numHdrSucc, numTermAnt
+ *
+ * Revision 1.4 1995/06/23 13:39:17 robby
+ * updated to prototypes in rf_layout.h
+ *
+ */
+
+#include "rf_types.h"
+#include "rf_raid.h"
+#include "rf_raid0.h"
+#include "rf_dag.h"
+#include "rf_dagffrd.h"
+#include "rf_dagffwr.h"
+#include "rf_dagutils.h"
+#include "rf_dagfuncs.h"
+#include "rf_threadid.h"
+#include "rf_general.h"
+#include "rf_configure.h"
+#include "rf_parityscan.h"
+
+typedef struct RF_Raid0ConfigInfo_s {
+ RF_RowCol_t *stripeIdentifier;
+} RF_Raid0ConfigInfo_t;
+
+int rf_ConfigureRAID0(
+ RF_ShutdownList_t **listp,
+ RF_Raid_t *raidPtr,
+ RF_Config_t *cfgPtr)
+{
+ RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
+ RF_Raid0ConfigInfo_t *info;
+ RF_RowCol_t i;
+
+ /* create a RAID level 0 configuration structure */
+ RF_MallocAndAdd(info, sizeof(RF_Raid0ConfigInfo_t), (RF_Raid0ConfigInfo_t *), raidPtr->cleanupList);
+ if (info == NULL)
+ return(ENOMEM);
+ layoutPtr->layoutSpecificInfo = (void *)info;
+
+ RF_MallocAndAdd(info->stripeIdentifier, raidPtr->numCol * sizeof(RF_RowCol_t), (RF_RowCol_t *), raidPtr->cleanupList);
+ if (info->stripeIdentifier == NULL)
+ return(ENOMEM);
+ for (i=0; i<raidPtr->numCol; i++)
+ info->stripeIdentifier[i] = i;
+
+ RF_ASSERT(raidPtr->numRow == 1);
+ raidPtr->totalSectors = layoutPtr->stripeUnitsPerDisk * raidPtr->numCol * layoutPtr->sectorsPerStripeUnit;
+ layoutPtr->numStripe = layoutPtr->stripeUnitsPerDisk;
+ layoutPtr->dataSectorsPerStripe = raidPtr->numCol * layoutPtr->sectorsPerStripeUnit;
+ layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit << raidPtr->logBytesPerSector;
+ layoutPtr->numDataCol = raidPtr->numCol;
+ layoutPtr->numParityCol = 0;
+ return(0);
+}
+
+void rf_MapSectorRAID0(
+ RF_Raid_t *raidPtr,
+ RF_RaidAddr_t raidSector,
+ RF_RowCol_t *row,
+ RF_RowCol_t *col,
+ RF_SectorNum_t *diskSector,
+ int remap)
+{
+ RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit;
+ *row = 0;
+ *col = SUID % raidPtr->numCol;
+ *diskSector = (SUID / raidPtr->numCol) * raidPtr->Layout.sectorsPerStripeUnit +
+ (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
+}
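+
+/*
+ * Worked example (editorial): with sectorsPerStripeUnit = 32 and numCol = 4,
+ * raidSector 1000 gives SUID = 1000 / 32 = 31, so col = 31 % 4 = 3 and
+ * diskSector = (31 / 4) * 32 + (1000 % 32) = 7 * 32 + 8 = 232.
+ */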
+
+void rf_MapParityRAID0(
+ RF_Raid_t *raidPtr,
+ RF_RaidAddr_t raidSector,
+ RF_RowCol_t *row,
+ RF_RowCol_t *col,
+ RF_SectorNum_t *diskSector,
+ int remap)
+{
+ *row = *col = 0;
+ *diskSector = 0;
+}
+
+void rf_IdentifyStripeRAID0(
+ RF_Raid_t *raidPtr,
+ RF_RaidAddr_t addr,
+ RF_RowCol_t **diskids,
+ RF_RowCol_t *outRow)
+{
+ RF_Raid0ConfigInfo_t *info;
+
+ info = raidPtr->Layout.layoutSpecificInfo;
+ *diskids = info->stripeIdentifier;
+ *outRow = 0;
+}
+
+void rf_MapSIDToPSIDRAID0(
+ RF_RaidLayout_t *layoutPtr,
+ RF_StripeNum_t stripeID,
+ RF_StripeNum_t *psID,
+ RF_ReconUnitNum_t *which_ru)
+{
+ *which_ru = 0;
+ *psID = stripeID;
+}
+
+void rf_RAID0DagSelect(
+ RF_Raid_t *raidPtr,
+ RF_IoType_t type,
+ RF_AccessStripeMap_t *asmap,
+ RF_VoidFuncPtr *createFunc)
+{
+ *createFunc = ((type == RF_IO_TYPE_READ) ?
+ (RF_VoidFuncPtr)rf_CreateFaultFreeReadDAG : (RF_VoidFuncPtr)rf_CreateRAID0WriteDAG);
+}
+
+int rf_VerifyParityRAID0(
+ RF_Raid_t *raidPtr,
+ RF_RaidAddr_t raidAddr,
+ RF_PhysDiskAddr_t *parityPDA,
+ int correct_it,
+ RF_RaidAccessFlags_t flags)
+{
+ /*
+ * No parity is always okay.
+ */
+ return(RF_PARITY_OKAY);
+}
diff --git a/sys/dev/raidframe/rf_raid0.h b/sys/dev/raidframe/rf_raid0.h
new file mode 100644
index 00000000000..fe90ff49c73
--- /dev/null
+++ b/sys/dev/raidframe/rf_raid0.h
@@ -0,0 +1,111 @@
+/* $OpenBSD: rf_raid0.h,v 1.1 1999/01/11 14:29:41 niklas Exp $ */
+/* $NetBSD: rf_raid0.h,v 1.1 1998/11/13 04:20:33 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* rf_raid0.h - header file for RAID Level 0 */
+
+/*
+ * :
+ * Log: rf_raid0.h,v
+ * Revision 1.15 1996/07/27 23:36:08 jimz
+ * Solaris port of simulator
+ *
+ * Revision 1.14 1996/06/19 22:07:42 jimz
+ * added parity verify
+ *
+ * Revision 1.13 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.12 1996/06/07 22:26:27 jimz
+ * type-ify which_ru (RF_ReconUnitNum_t)
+ *
+ * Revision 1.11 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.10 1996/06/03 23:28:26 jimz
+ * more bugfixes
+ * check in tree to sync for IPDS runs with current bugfixes
+ * there still may be a problem with threads in the script test
+ * getting I/Os stuck- not trivially reproducible (runs ~50 times
+ * in a row without getting stuck)
+ *
+ * Revision 1.9 1996/05/31 22:26:54 jimz
+ * fix a lot of mapping problems, memory allocation problems
+ * found some weird lock issues, fixed 'em
+ * more code cleanup
+ *
+ * Revision 1.8 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.7 1996/05/24 01:59:45 jimz
+ * another checkpoint in code cleanup for release
+ * time to sync kernel tree
+ *
+ * Revision 1.6 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.5 1995/12/06 15:02:36 root
+ * added copyright info
+ *
+ * Revision 1.4 1995/11/17 18:58:33 wvcii
+ * added prototyping to MapParity
+ *
+ * Revision 1.3 1995/11/07 15:21:00 wvcii
+ * changed RAID0DagSelect prototype
+ *
+ * Revision 1.2 1995/06/23 13:39:10 robby
+ * updated to prototypes in rf_layout.h
+ *
+ */
+
+#ifndef _RF__RF_RAID0_H_
+#define _RF__RF_RAID0_H_
+
+int rf_ConfigureRAID0(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr,
+ RF_Config_t *cfgPtr);
+void rf_MapSectorRAID0(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector,
+ RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap);
+void rf_MapParityRAID0(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector,
+ RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap);
+void rf_IdentifyStripeRAID0(RF_Raid_t *raidPtr, RF_RaidAddr_t addr,
+ RF_RowCol_t **diskids, RF_RowCol_t *outRow);
+void rf_MapSIDToPSIDRAID0(RF_RaidLayout_t *layoutPtr,
+ RF_StripeNum_t stripeID, RF_StripeNum_t *psID,
+ RF_ReconUnitNum_t *which_ru);
+void rf_RAID0DagSelect(RF_Raid_t *raidPtr, RF_IoType_t type,
+ RF_AccessStripeMap_t *asmap, RF_VoidFuncPtr *createFunc);
+int rf_VerifyParityRAID0(RF_Raid_t *raidPtr, RF_RaidAddr_t raidAddr,
+ RF_PhysDiskAddr_t *parityPDA, int correct_it, RF_RaidAccessFlags_t flags);
+
+#endif /* !_RF__RF_RAID0_H_ */
diff --git a/sys/dev/raidframe/rf_raid1.c b/sys/dev/raidframe/rf_raid1.c
new file mode 100644
index 00000000000..e941bf384b2
--- /dev/null
+++ b/sys/dev/raidframe/rf_raid1.c
@@ -0,0 +1,881 @@
+/* $OpenBSD: rf_raid1.c,v 1.1 1999/01/11 14:29:42 niklas Exp $ */
+/* $NetBSD: rf_raid1.c,v 1.1 1998/11/13 04:20:33 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: William V. Courtright II
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*****************************************************************************
+ *
+ * rf_raid1.c -- implements RAID Level 1
+ *
+ *****************************************************************************/
+
+/*
+ * :
+ * Log: rf_raid1.c,v
+ * Revision 1.46 1996/11/05 21:10:40 jimz
+ * failed pda generalization
+ *
+ * Revision 1.45 1996/07/31 16:56:18 jimz
+ * dataBytesPerStripe, sectorsPerDisk init arch-indep.
+ *
+ * Revision 1.44 1996/07/30 03:06:43 jimz
+ * get rid of extra rf_threadid.h include
+ *
+ * Revision 1.43 1996/07/27 23:36:08 jimz
+ * Solaris port of simulator
+ *
+ * Revision 1.42 1996/07/22 19:52:16 jimz
+ * switched node params to RF_DagParam_t, a union of
+ * a 64-bit int and a void *, for better portability
+ * attempted hpux port, but failed partway through for
+ * lack of a single C compiler capable of compiling all
+ * source files
+ *
+ * Revision 1.41 1996/07/18 22:57:14 jimz
+ * port simulator to AIX
+ *
+ * Revision 1.40 1996/07/17 14:31:19 jimz
+ * minor cleanup for readability
+ *
+ * Revision 1.39 1996/07/15 17:22:18 jimz
+ * nit-pick code cleanup
+ * resolve stdlib problems on DEC OSF
+ *
+ * Revision 1.38 1996/07/15 02:56:31 jimz
+ * fixed dag selection to deal with failed + recon to spare disks
+ * enhanced recon, parity check debugging
+ *
+ * Revision 1.37 1996/07/13 00:00:59 jimz
+ * sanitized generalized reconstruction architecture
+ * cleaned up head sep, rbuf problems
+ *
+ * Revision 1.36 1996/07/11 19:08:00 jimz
+ * generalize reconstruction mechanism
+ * allow raid1 reconstructs via copyback (done with array
+ * quiesced, not online, therefore not disk-directed)
+ *
+ * Revision 1.35 1996/07/10 23:01:24 jimz
+ * Better commenting of VerifyParity (for posterity)
+ *
+ * Revision 1.34 1996/07/10 22:29:45 jimz
+ * VerifyParityRAID1: corrected return values for stripes in degraded mode
+ *
+ * Revision 1.33 1996/07/10 16:05:39 jimz
+ * fixed a couple minor bugs in VerifyParityRAID1
+ * added code to correct bad RAID1 parity
+ *
+ * Revision 1.32 1996/06/20 18:47:04 jimz
+ * fix up verification bugs
+ *
+ * Revision 1.31 1996/06/20 15:38:59 jimz
+ * added parity verification
+ * can't correct bad parity yet, but can return pass/fail
+ *
+ * Revision 1.30 1996/06/19 22:23:01 jimz
+ * parity verification is now a layout-configurable thing
+ * not all layouts currently support it (correctly, anyway)
+ *
+ * Revision 1.29 1996/06/11 08:54:27 jimz
+ * improved error-checking at configuration time
+ *
+ * Revision 1.28 1996/06/10 18:25:24 wvcii
+ * fixed bug in rf_IdentifyStripeRAID1 - added array initialization
+ *
+ * Revision 1.27 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.26 1996/06/07 22:26:27 jimz
+ * type-ify which_ru (RF_ReconUnitNum_t)
+ *
+ * Revision 1.25 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.24 1996/06/06 17:29:43 jimz
+ * use CreateMirrorIdleReadDAG for mirrored read
+ *
+ * Revision 1.23 1996/06/03 23:28:26 jimz
+ * more bugfixes
+ * check in tree to sync for IPDS runs with current bugfixes
+ * there still may be a problem with threads in the script test
+ * getting I/Os stuck- not trivially reproducible (runs ~50 times
+ * in a row without getting stuck)
+ *
+ * Revision 1.22 1996/06/02 17:31:48 jimz
+ * Moved a lot of global stuff into array structure, where it belongs.
+ * Fixed up paritylogging, pss modules in this manner. Some general
+ * code cleanup. Removed lots of dead code, some dead files.
+ *
+ * Revision 1.21 1996/05/31 22:26:54 jimz
+ * fix a lot of mapping problems, memory allocation problems
+ * found some weird lock issues, fixed 'em
+ * more code cleanup
+ *
+ * Revision 1.20 1996/05/30 23:22:16 jimz
+ * bugfixes of serialization, timing problems
+ * more cleanup
+ *
+ * Revision 1.19 1996/05/30 11:29:41 jimz
+ * Numerous bug fixes. Stripe lock release code disagreed with the taking code
+ * about when stripes should be locked (I made it consistent: no parity, no lock)
+ * There was a lot of extra serialization of I/Os which I've removed- a lot of
+ * it was to calculate values for the cache code, which is no longer with us.
+ * More types, function, macro cleanup. Added code to properly quiesce the array
+ * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
+ * before. Fixed memory allocation, freeing bugs.
+ *
+ * Revision 1.18 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.17 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.16 1996/05/24 04:28:55 jimz
+ * release cleanup ckpt
+ *
+ * Revision 1.15 1996/05/24 01:59:45 jimz
+ * another checkpoint in code cleanup for release
+ * time to sync kernel tree
+ *
+ * Revision 1.14 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.13 1996/05/03 19:36:22 wvcii
+ * moved dag creation routines to dag library
+ *
+ * Revision 1.12 1996/02/23 01:38:16 amiri
+ * removed chained declustering special case in SelectIdleDisk
+ *
+ * Revision 1.11 1996/02/22 16:47:18 amiri
+ * disabled shortest queue optimization for chained declustering
+ *
+ * Revision 1.10 1995/12/12 18:10:06 jimz
+ * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT
+ * fix 80-column brain damage in comments
+ *
+ * Revision 1.9 1995/12/04 19:21:28 wvcii
+ * modified SelectIdleDisk to take a mirror node as a parameter and
+ * conditionally swap params 0 (data pda) and 4 (mirror pda).
+ * modified CreateRaidOneReadDAG so that it creates the DAG itself
+ * as opposed to reusing code in CreateNonredundantDAG.
+ *
+ * Revision 1.8 1995/11/30 16:07:45 wvcii
+ * added copyright info
+ *
+ * Revision 1.7 1995/11/16 14:46:18 wvcii
+ * fixed bugs in mapping and degraded dag creation, added comments
+ *
+ * Revision 1.6 1995/11/14 22:29:16 wvcii
+ * fixed bugs in dag creation
+ *
+ * Revision 1.5 1995/11/07 15:23:33 wvcii
+ * changed RAID1DagSelect prototype
+ * function no longer generates numHdrSucc, numTermAnt
+ * changed dag creation routines:
+ * term node generated during dag creation
+ * encoded commit nodes, barrier, antecedent types
+ *
+ * Revision 1.4 1995/10/10 19:09:21 wvcii
+ * write dag now handles non-aligned accesses
+ *
+ * Revision 1.3 1995/10/05 02:32:56 jimz
+ * ifdef'd out queue locking for load balancing
+ *
+ * Revision 1.2 1995/10/04 07:04:40 wvcii
+ * reads are now scheduled according to disk queue length.
+ * queue length is the sum of number of ios queued in raidframe as well as those at the disk.
+ * reads are sent to the disk with the shortest queue.
+ * testing against user disks successful, sim & kernel untested.
+ *
+ * Revision 1.1 1995/10/04 03:53:23 wvcii
+ * Initial revision
+ *
+ *
+ */
+
+#include "rf_raid.h"
+#include "rf_raid1.h"
+#include "rf_dag.h"
+#include "rf_dagffrd.h"
+#include "rf_dagffwr.h"
+#include "rf_dagdegrd.h"
+#include "rf_dagutils.h"
+#include "rf_dagfuncs.h"
+#include "rf_threadid.h"
+#include "rf_diskqueue.h"
+#include "rf_general.h"
+#include "rf_utils.h"
+#include "rf_parityscan.h"
+#include "rf_mcpair.h"
+#include "rf_layout.h"
+#include "rf_map.h"
+#include "rf_engine.h"
+#include "rf_reconbuffer.h"
+#include "rf_sys.h"
+
+typedef struct RF_Raid1ConfigInfo_s {
+ RF_RowCol_t **stripeIdentifier;
+} RF_Raid1ConfigInfo_t;
+
+/* start of day code specific to RAID level 1 */
+int rf_ConfigureRAID1(
+ RF_ShutdownList_t **listp,
+ RF_Raid_t *raidPtr,
+ RF_Config_t *cfgPtr)
+{
+ RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
+ RF_Raid1ConfigInfo_t *info;
+ RF_RowCol_t i;
+
+ /* create a RAID level 1 configuration structure */
+ RF_MallocAndAdd(info, sizeof(RF_Raid1ConfigInfo_t), (RF_Raid1ConfigInfo_t *), raidPtr->cleanupList);
+ if (info == NULL)
+ return(ENOMEM);
+ layoutPtr->layoutSpecificInfo = (void *) info;
+
+ /* ... and fill it in. */
+ info->stripeIdentifier = rf_make_2d_array(raidPtr->numCol / 2, 2, raidPtr->cleanupList);
+ if (info->stripeIdentifier == NULL)
+ return(ENOMEM);
+ for (i = 0; i < (raidPtr->numCol / 2); i ++) {
+ info->stripeIdentifier[i][0] = (2 * i);
+ info->stripeIdentifier[i][1] = (2 * i) + 1;
+ }
+
+ RF_ASSERT(raidPtr->numRow == 1);
+
+	/* this implementation of RAID level 1 uses one row of numCol disks and allows numCol / 2
+ * stripes per row. A stripe consists of a single data unit and a single parity (mirror) unit.
+ * stripe id = raidAddr / stripeUnitSize
+ */
+ raidPtr->totalSectors = layoutPtr->stripeUnitsPerDisk * (raidPtr->numCol / 2) * layoutPtr->sectorsPerStripeUnit;
+ layoutPtr->numStripe = layoutPtr->stripeUnitsPerDisk * (raidPtr->numCol / 2);
+ layoutPtr->dataSectorsPerStripe = layoutPtr->sectorsPerStripeUnit;
+ layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit << raidPtr->logBytesPerSector;
+ layoutPtr->numDataCol = 1;
+ layoutPtr->numParityCol = 1;
+ return(0);
+}
+
+
+/* returns the physical disk location of the primary copy in the mirror pair */
+void rf_MapSectorRAID1(
+ RF_Raid_t *raidPtr,
+ RF_RaidAddr_t raidSector,
+ RF_RowCol_t *row,
+ RF_RowCol_t *col,
+ RF_SectorNum_t *diskSector,
+ int remap)
+{
+ RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit;
+ RF_RowCol_t mirrorPair = SUID % (raidPtr->numCol / 2);
+
+ *row = 0;
+ *col = 2 * mirrorPair;
+ *diskSector = ((SUID / (raidPtr->numCol / 2)) * raidPtr->Layout.sectorsPerStripeUnit) + (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
+}
+
+
+/* Map Parity
+ *
+ * returns the physical disk location of the secondary copy in the mirror
+ * pair
+ */
+void rf_MapParityRAID1(
+ RF_Raid_t *raidPtr,
+ RF_RaidAddr_t raidSector,
+ RF_RowCol_t *row,
+ RF_RowCol_t *col,
+ RF_SectorNum_t *diskSector,
+ int remap)
+{
+ RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit;
+ RF_RowCol_t mirrorPair = SUID % (raidPtr->numCol / 2);
+
+ *row = 0;
+ *col = (2 * mirrorPair) + 1;
+
+ *diskSector = ((SUID / (raidPtr->numCol / 2)) * raidPtr->Layout.sectorsPerStripeUnit) + (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
+}
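+
+/*
+ * Worked example (editorial): with numCol = 4 there are numCol / 2 = 2
+ * mirror pairs, (0,1) and (2,3).  With sectorsPerStripeUnit = 32,
+ * raidSector 1000 gives SUID = 31 and mirrorPair = 31 % 2 = 1, so the
+ * primary copy maps to column 2 and the mirror copy to column 3, both at
+ * diskSector = (31 / 2) * 32 + (1000 % 32) = 15 * 32 + 8 = 488.
+ */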
+
+
+/* IdentifyStripeRAID1
+ *
+ * returns a list of disks for a given redundancy group
+ */
+void rf_IdentifyStripeRAID1(
+ RF_Raid_t *raidPtr,
+ RF_RaidAddr_t addr,
+ RF_RowCol_t **diskids,
+ RF_RowCol_t *outRow)
+{
+ RF_StripeNum_t stripeID = rf_RaidAddressToStripeID(&raidPtr->Layout, addr);
+ RF_Raid1ConfigInfo_t *info = raidPtr->Layout.layoutSpecificInfo;
+ RF_ASSERT(stripeID >= 0);
+ RF_ASSERT(addr >= 0);
+ *outRow = 0;
+ *diskids = info->stripeIdentifier[ stripeID % (raidPtr->numCol/2)];
+ RF_ASSERT(*diskids);
+}
+
+
+/* MapSIDToPSIDRAID1
+ *
+ * maps a logical stripe to a stripe in the redundant array
+ */
+void rf_MapSIDToPSIDRAID1(
+ RF_RaidLayout_t *layoutPtr,
+ RF_StripeNum_t stripeID,
+ RF_StripeNum_t *psID,
+ RF_ReconUnitNum_t *which_ru)
+{
+ *which_ru = 0;
+ *psID = stripeID;
+}
+
+
+
+/******************************************************************************
+ * select a graph to perform a single-stripe access
+ *
+ * Parameters: raidPtr - description of the physical array
+ * type - type of operation (read or write) requested
+ * asmap - logical & physical addresses for this access
+ * createFunc - name of function to use to create the graph
+ *****************************************************************************/
+
+void rf_RAID1DagSelect(
+ RF_Raid_t *raidPtr,
+ RF_IoType_t type,
+ RF_AccessStripeMap_t *asmap,
+ RF_VoidFuncPtr *createFunc)
+{
+ RF_RowCol_t frow, fcol, or, oc;
+ RF_PhysDiskAddr_t *failedPDA;
+ int prior_recon, tid;
+ RF_RowStatus_t rstat;
+ RF_SectorNum_t oo;
+
+
+ RF_ASSERT(RF_IO_IS_R_OR_W(type));
+
+ if (asmap->numDataFailed + asmap->numParityFailed > 1) {
+ RF_ERRORMSG("Multiple disks failed in a single group! Aborting I/O operation.\n");
+ *createFunc = NULL;
+ return;
+ }
+
+ if (asmap->numDataFailed + asmap->numParityFailed) {
+ /*
+ * We've got a fault. Re-map to spare space, iff applicable.
+ * Shouldn't the arch-independent code do this for us?
+ * Anyway, it turns out if we don't do this here, then when
+ * we're reconstructing, writes go only to the surviving
+ * original disk, and aren't reflected on the reconstructed
+ * spare. Oops. --jimz
+ */
+ failedPDA = asmap->failedPDAs[0];
+ frow = failedPDA->row;
+ fcol = failedPDA->col;
+ rstat = raidPtr->status[frow];
+ prior_recon = (rstat == rf_rs_reconfigured) || (
+ (rstat == rf_rs_reconstructing) ?
+ rf_CheckRUReconstructed(raidPtr->reconControl[frow]->reconMap, failedPDA->startSector) : 0
+ );
+ if (prior_recon) {
+ or = frow;
+ oc = fcol;
+ oo = failedPDA->startSector;
+ /*
+ * If we did distributed sparing, we'd monkey with that here.
+			 * But we don't, so we just redirect to the dedicated spare.
+ */
+ failedPDA->row = raidPtr->Disks[frow][fcol].spareRow;
+ failedPDA->col = raidPtr->Disks[frow][fcol].spareCol;
+ /*
+ * Redirect other components, iff necessary. This looks
+ * pretty suspicious to me, but it's what the raid5
+ * DAG select does.
+ */
+ if (asmap->parityInfo->next) {
+ if (failedPDA == asmap->parityInfo) {
+ failedPDA->next->row = failedPDA->row;
+ failedPDA->next->col = failedPDA->col;
+ }
+ else {
+ if (failedPDA == asmap->parityInfo->next) {
+ asmap->parityInfo->row = failedPDA->row;
+ asmap->parityInfo->col = failedPDA->col;
+ }
+ }
+ }
+ if (rf_dagDebug || rf_mapDebug) {
+ rf_get_threadid(tid);
+ printf("[%d] Redirected type '%c' r %d c %d o %ld -> r %d c %d o %ld\n",
+ tid, type, or, oc, (long)oo, failedPDA->row, failedPDA->col,
+ (long)failedPDA->startSector);
+ }
+ asmap->numDataFailed = asmap->numParityFailed = 0;
+ }
+ }
+ if (type == RF_IO_TYPE_READ) {
+ if (asmap->numDataFailed == 0)
+ *createFunc = (RF_VoidFuncPtr)rf_CreateMirrorIdleReadDAG;
+ else
+ *createFunc = (RF_VoidFuncPtr)rf_CreateRaidOneDegradedReadDAG;
+ }
+ else {
+ *createFunc = (RF_VoidFuncPtr)rf_CreateRaidOneWriteDAG;
+ }
+}
+
+int rf_VerifyParityRAID1(
+ RF_Raid_t *raidPtr,
+ RF_RaidAddr_t raidAddr,
+ RF_PhysDiskAddr_t *parityPDA,
+ int correct_it,
+ RF_RaidAccessFlags_t flags)
+{
+ int nbytes, bcount, stripeWidth, ret, i, j, tid=0, nbad, *bbufs;
+ RF_DagNode_t *blockNode, *unblockNode, *wrBlock;
+ RF_DagHeader_t *rd_dag_h, *wr_dag_h;
+ RF_AccessStripeMapHeader_t *asm_h;
+ RF_AllocListElem_t *allocList;
+ RF_AccTraceEntry_t tracerec;
+ RF_ReconUnitNum_t which_ru;
+ RF_RaidLayout_t *layoutPtr;
+ RF_AccessStripeMap_t *aasm;
+ RF_SectorCount_t nsector;
+ RF_RaidAddr_t startAddr;
+ char *buf, *buf1, *buf2;
+ RF_PhysDiskAddr_t *pda;
+ RF_StripeNum_t psID;
+ RF_MCPair_t *mcpair;
+
+ if (rf_verifyParityDebug) {
+ rf_get_threadid(tid);
+ }
+
+ layoutPtr = &raidPtr->Layout;
+ startAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, raidAddr);
+ nsector = parityPDA->numSector;
+ nbytes = rf_RaidAddressToByte(raidPtr, nsector);
+ psID = rf_RaidAddressToParityStripeID(layoutPtr, raidAddr, &which_ru);
+
+ asm_h = NULL;
+ rd_dag_h = wr_dag_h = NULL;
+ mcpair = NULL;
+
+ ret = RF_PARITY_COULD_NOT_VERIFY;
+
+ rf_MakeAllocList(allocList);
+ if (allocList == NULL)
+ return(RF_PARITY_COULD_NOT_VERIFY);
+ mcpair = rf_AllocMCPair();
+ if (mcpair == NULL)
+ goto done;
+ RF_ASSERT(layoutPtr->numDataCol == layoutPtr->numParityCol);
+ stripeWidth = layoutPtr->numDataCol + layoutPtr->numParityCol;
+ bcount = nbytes*(layoutPtr->numDataCol + layoutPtr->numParityCol);
+ RF_MallocAndAdd(buf, bcount, (char *), allocList);
+ if (buf == NULL)
+ goto done;
+ if (rf_verifyParityDebug) {
+ printf("[%d] RAID1 parity verify: buf=%lx bcount=%d (%lx - %lx)\n",
+ tid, (long)buf, bcount, (long)buf, (long)buf+bcount);
+ }
+
+ /*
+ * Generate a DAG which will read the entire stripe- then we can
+ * just compare data chunks versus "parity" chunks.
+ */
+
+ rd_dag_h = rf_MakeSimpleDAG(raidPtr, stripeWidth, nbytes, buf,
+ rf_DiskReadFunc, rf_DiskReadUndoFunc, "Rod", allocList, flags,
+ RF_IO_NORMAL_PRIORITY);
+ if (rd_dag_h == NULL)
+ goto done;
+ blockNode = rd_dag_h->succedents[0];
+ unblockNode = blockNode->succedents[0]->succedents[0];
+
+ /*
+ * Map the access to physical disk addresses (PDAs)- this will
+ * get us both a list of data addresses, and "parity" addresses
+ * (which are really mirror copies).
+ */
+ asm_h = rf_MapAccess(raidPtr, startAddr, layoutPtr->dataSectorsPerStripe,
+ buf, RF_DONT_REMAP);
+ aasm = asm_h->stripeMap;
+
+ buf1 = buf;
+ /*
+ * Loop through the data blocks, setting up read nodes for each.
+ */
+ for(pda=aasm->physInfo,i=0;i<layoutPtr->numDataCol;i++,pda=pda->next)
+ {
+ RF_ASSERT(pda);
+
+ rf_RangeRestrictPDA(raidPtr, parityPDA, pda, 0, 1);
+
+ RF_ASSERT(pda->numSector != 0);
+ if (rf_TryToRedirectPDA(raidPtr, pda, 0)) {
+ /* cannot verify parity with dead disk */
+ goto done;
+ }
+ pda->bufPtr = buf1;
+ blockNode->succedents[i]->params[0].p = pda;
+ blockNode->succedents[i]->params[1].p = buf1;
+ blockNode->succedents[i]->params[2].v = psID;
+ blockNode->succedents[i]->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ buf1 += nbytes;
+ }
+ RF_ASSERT(pda == NULL);
+ /*
+ * keep i, buf1 running
+ *
+ * Loop through parity blocks, setting up read nodes for each.
+ */
+ for(pda=aasm->parityInfo;i<layoutPtr->numDataCol+layoutPtr->numParityCol;i++,pda=pda->next)
+ {
+ RF_ASSERT(pda);
+ rf_RangeRestrictPDA(raidPtr, parityPDA, pda, 0, 1);
+ RF_ASSERT(pda->numSector != 0);
+ if (rf_TryToRedirectPDA(raidPtr, pda, 0)) {
+ /* cannot verify parity with dead disk */
+ goto done;
+ }
+ pda->bufPtr = buf1;
+ blockNode->succedents[i]->params[0].p = pda;
+ blockNode->succedents[i]->params[1].p = buf1;
+ blockNode->succedents[i]->params[2].v = psID;
+ blockNode->succedents[i]->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ buf1 += nbytes;
+ }
+ RF_ASSERT(pda == NULL);
+
+ bzero((char *)&tracerec, sizeof(tracerec));
+ rd_dag_h->tracerec = &tracerec;
+
+ if (rf_verifyParityDebug > 1) {
+ printf("[%d] RAID1 parity verify read dag:\n", tid);
+ rf_PrintDAGList(rd_dag_h);
+ }
+
+ RF_LOCK_MUTEX(mcpair->mutex);
+ mcpair->flag = 0;
+ rf_DispatchDAG(rd_dag_h, (void (*)(void *))rf_MCPairWakeupFunc,
+ (void *)mcpair);
+ while (mcpair->flag == 0) {
+ RF_WAIT_MCPAIR(mcpair);
+ }
+ RF_UNLOCK_MUTEX(mcpair->mutex);
+
+ if (rd_dag_h->status != rf_enable) {
+ RF_ERRORMSG("Unable to verify raid1 parity: can't read stripe\n");
+ ret = RF_PARITY_COULD_NOT_VERIFY;
+ goto done;
+ }
+
+ /*
+ * buf1 is the beginning of the data blocks chunk
+ * buf2 is the beginning of the parity blocks chunk
+ */
+ buf1 = buf;
+ buf2 = buf + (nbytes * layoutPtr->numDataCol);
+ ret = RF_PARITY_OKAY;
+ /*
+ * bbufs is "bad bufs"- an array whose entries are the data
+ * column numbers where we had miscompares. (That is, column 0
+ * and column 1 of the array are mirror copies, and are considered
+ * "data column 0" for this purpose).
+ */
+ RF_MallocAndAdd(bbufs, layoutPtr->numParityCol*sizeof(int), (int *),
+ allocList);
+ nbad = 0;
+ /*
+ * Check data vs "parity" (mirror copy).
+ */
+ for(i=0;i<layoutPtr->numDataCol;i++) {
+ if (rf_verifyParityDebug) {
+ printf("[%d] RAID1 parity verify %d bytes: i=%d buf1=%lx buf2=%lx buf=%lx\n",
+ tid, nbytes, i, (long)buf1, (long)buf2, (long)buf);
+ }
+		if (bcmp(buf1, buf2, nbytes)) {
+ if (rf_verifyParityDebug > 1) {
+ for(j=0;j<nbytes;j++) {
+ if (buf1[j] != buf2[j])
+ break;
+ }
+ printf("psid=%ld j=%d\n", (long)psID, j);
+ printf("buf1 %02x %02x %02x %02x %02x\n", buf1[0]&0xff,
+ buf1[1]&0xff, buf1[2]&0xff, buf1[3]&0xff, buf1[4]&0xff);
+ printf("buf2 %02x %02x %02x %02x %02x\n", buf2[0]&0xff,
+ buf2[1]&0xff, buf2[2]&0xff, buf2[3]&0xff, buf2[4]&0xff);
+ }
+ if (rf_verifyParityDebug) {
+ printf("[%d] RAID1: found bad parity, i=%d\n", tid, i);
+ }
+ /*
+ * Parity is bad. Keep track of which columns were bad.
+ */
+ if (bbufs)
+ bbufs[nbad] = i;
+ nbad++;
+ ret = RF_PARITY_BAD;
+ }
+ buf1 += nbytes;
+ buf2 += nbytes;
+ }
+
+ if ((ret != RF_PARITY_OKAY) && correct_it) {
+ ret = RF_PARITY_COULD_NOT_CORRECT;
+ if (rf_verifyParityDebug) {
+ printf("[%d] RAID1 parity verify: parity not correct\n", tid);
+ }
+ if (bbufs == NULL)
+ goto done;
+ /*
+ * Make a DAG with one write node for each bad unit. We'll simply
+ * write the contents of the data unit onto the parity unit for
+ * correction. (It's possible that the mirror copy was the correct
+ * copy, and that we're spooging good data by writing bad over it,
+		 * but there's no way we can know that.)
+ */
+ wr_dag_h = rf_MakeSimpleDAG(raidPtr, nbad, nbytes, buf,
+ rf_DiskWriteFunc, rf_DiskWriteUndoFunc, "Wnp", allocList, flags,
+ RF_IO_NORMAL_PRIORITY);
+ if (wr_dag_h == NULL)
+ goto done;
+ wrBlock = wr_dag_h->succedents[0];
+ /*
+ * Fill in a write node for each bad compare.
+ */
+ for(i=0;i<nbad;i++) {
+ j = i+layoutPtr->numDataCol;
+ pda = blockNode->succedents[j]->params[0].p;
+ pda->bufPtr = blockNode->succedents[i]->params[1].p;
+ wrBlock->succedents[i]->params[0].p = pda;
+ wrBlock->succedents[i]->params[1].p = pda->bufPtr;
+ wrBlock->succedents[i]->params[2].v = psID;
+			wrBlock->succedents[i]->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
+ }
+ bzero((char *)&tracerec, sizeof(tracerec));
+ wr_dag_h->tracerec = &tracerec;
+ if (rf_verifyParityDebug > 1) {
+ printf("Parity verify write dag:\n");
+ rf_PrintDAGList(wr_dag_h);
+ }
+ RF_LOCK_MUTEX(mcpair->mutex);
+ mcpair->flag = 0;
+ /* fire off the write DAG */
+ rf_DispatchDAG(wr_dag_h, (void (*)(void *))rf_MCPairWakeupFunc,
+ (void *)mcpair);
+ while (!mcpair->flag) {
+ RF_WAIT_COND(mcpair->cond, mcpair->mutex);
+ }
+ RF_UNLOCK_MUTEX(mcpair->mutex);
+ if (wr_dag_h->status != rf_enable) {
+ RF_ERRORMSG("Unable to correct RAID1 parity in VerifyParity\n");
+ goto done;
+ }
+ ret = RF_PARITY_CORRECTED;
+ }
+
+done:
+ /*
+ * All done. We might've gotten here without doing part of the function,
+ * so cleanup what we have to and return our running status.
+ */
+ if (asm_h)
+ rf_FreeAccessStripeMap(asm_h);
+ if (rd_dag_h)
+ rf_FreeDAG(rd_dag_h);
+ if (wr_dag_h)
+ rf_FreeDAG(wr_dag_h);
+ if (mcpair)
+ rf_FreeMCPair(mcpair);
+ rf_FreeAllocList(allocList);
+ if (rf_verifyParityDebug) {
+ printf("[%d] RAID1 parity verify, returning %d\n", tid, ret);
+ }
+ return(ret);
+}
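+
+/*
+ * Illustrative sketch only (not part of this import): the heart of the
+ * verify pass above is a chunk-by-chunk comparison of the data half of
+ * the read buffer against the mirror half.  The layout and names below
+ * are assumptions chosen for the example, not RAIDframe interfaces.
+ */
+#include <string.h>
+
+static int
+example_mirror_compare(const char *buf, int numDataCol, int nbytes)
+{
+	const char *data = buf;
+	const char *mirror = buf + (long)numDataCol * nbytes;
+	int i, nbad = 0;
+
+	for (i = 0; i < numDataCol; i++) {
+		/* column i miscompares if its data and mirror chunks differ */
+		if (memcmp(data + (long)i * nbytes,
+		    mirror + (long)i * nbytes, nbytes) != 0)
+			nbad++;
+	}
+	return (nbad);	/* 0 means the "parity" (mirror) is okay */
+}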
+
+int rf_SubmitReconBufferRAID1(rbuf, keep_it, use_committed)
+ RF_ReconBuffer_t *rbuf; /* the recon buffer to submit */
+ int keep_it; /* whether we can keep this buffer or we have to return it */
+ int use_committed; /* whether to use a committed or an available recon buffer */
+{
+ RF_ReconParityStripeStatus_t *pssPtr;
+ RF_ReconCtrl_t *reconCtrlPtr;
+ RF_RaidLayout_t *layoutPtr;
+ int tid=0, retcode, created;
+ RF_CallbackDesc_t *cb, *p;
+ RF_ReconBuffer_t *t;
+ RF_Raid_t *raidPtr;
+ caddr_t ta;
+
+ retcode = 0;
+ created = 0;
+
+	RF_ASSERT(rbuf);
+
+	raidPtr = rbuf->raidPtr;
+	layoutPtr = &raidPtr->Layout;
+	reconCtrlPtr = raidPtr->reconControl[rbuf->row];
+
+	RF_ASSERT(rbuf->col != reconCtrlPtr->fcol);
+
+ if (rf_reconbufferDebug) {
+ rf_get_threadid(tid);
+ printf("[%d] RAID1 reconbuffer submission r%d c%d psid %ld ru%d (failed offset %ld)\n",
+ tid, rbuf->row, rbuf->col, (long)rbuf->parityStripeID, rbuf->which_ru,
+ (long)rbuf->failedDiskSectorOffset);
+ }
+
+ if (rf_reconDebug) {
+ printf("RAID1 reconbuffer submit psid %ld buf %lx\n",
+ (long)rbuf->parityStripeID, (long)rbuf->buffer);
+ printf("RAID1 psid %ld %02x %02x %02x %02x %02x\n",
+ (long)rbuf->parityStripeID,
+			rbuf->buffer[0]&0xff, rbuf->buffer[1]&0xff, rbuf->buffer[2]&0xff,
+			rbuf->buffer[3]&0xff, rbuf->buffer[4]&0xff);
+ }
+
+ RF_LOCK_PSS_MUTEX(raidPtr,rbuf->row,rbuf->parityStripeID);
+
+ RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex);
+
+ pssPtr = rf_LookupRUStatus(raidPtr, reconCtrlPtr->pssTable,
+ rbuf->parityStripeID, rbuf->which_ru, RF_PSS_NONE, &created);
+ RF_ASSERT(pssPtr); /* if it didn't exist, we wouldn't have gotten an rbuf for it */
+
+ /*
+ * Since this is simple mirroring, the first submission for a stripe is also
+ * treated as the last.
+ */
+
+ t = NULL;
+ if (keep_it) {
+ if (rf_reconbufferDebug) {
+ printf("[%d] RAID1 rbuf submission: keeping rbuf\n", tid);
+ }
+ t = rbuf;
+ }
+ else {
+ if (use_committed) {
+ if (rf_reconbufferDebug) {
+ printf("[%d] RAID1 rbuf submission: using committed rbuf\n", tid);
+ }
+ t = reconCtrlPtr->committedRbufs;
+ RF_ASSERT(t);
+ reconCtrlPtr->committedRbufs = t->next;
+ t->next = NULL;
+ }
+ else if (reconCtrlPtr->floatingRbufs) {
+ if (rf_reconbufferDebug) {
+ printf("[%d] RAID1 rbuf submission: using floating rbuf\n", tid);
+ }
+ t = reconCtrlPtr->floatingRbufs;
+ reconCtrlPtr->floatingRbufs = t->next;
+ t->next = NULL;
+ }
+ }
+ if (t == NULL) {
+ if (rf_reconbufferDebug) {
+ printf("[%d] RAID1 rbuf submission: waiting for rbuf\n", tid);
+ }
+ RF_ASSERT((keep_it == 0) && (use_committed == 0));
+ raidPtr->procsInBufWait++;
+ if ((raidPtr->procsInBufWait == (raidPtr->numCol-1))
+ && (raidPtr->numFullReconBuffers == 0))
+ {
+ /* ruh-ro */
+ RF_ERRORMSG("Buffer wait deadlock\n");
+ rf_PrintPSStatusTable(raidPtr, rbuf->row);
+ RF_PANIC();
+ }
+ pssPtr->flags |= RF_PSS_BUFFERWAIT;
+ cb = rf_AllocCallbackDesc();
+ cb->row = rbuf->row;
+ cb->col = rbuf->col;
+ cb->callbackArg.v = rbuf->parityStripeID;
+ cb->callbackArg2.v = rbuf->which_ru;
+ cb->next = NULL;
+ if (reconCtrlPtr->bufferWaitList == NULL) {
+ /* we are the wait list- lucky us */
+ reconCtrlPtr->bufferWaitList = cb;
+ }
+ else {
+ /* append to wait list */
+ for(p=reconCtrlPtr->bufferWaitList;p->next;p=p->next);
+ p->next = cb;
+ }
+ retcode = 1;
+ goto out;
+ }
+ if (t != rbuf) {
+ t->row = rbuf->row;
+ t->col = reconCtrlPtr->fcol;
+ t->parityStripeID = rbuf->parityStripeID;
+ t->which_ru = rbuf->which_ru;
+ t->failedDiskSectorOffset = rbuf->failedDiskSectorOffset;
+ t->spRow = rbuf->spRow;
+ t->spCol = rbuf->spCol;
+ t->spOffset = rbuf->spOffset;
+ /* Swap buffers. DANCE! */
+ ta = t->buffer;
+ t->buffer = rbuf->buffer;
+ rbuf->buffer = ta;
+ }
+ /*
+ * Use the rbuf we've been given as the target.
+ */
+ RF_ASSERT(pssPtr->rbuf == NULL);
+ pssPtr->rbuf = t;
+
+ t->count = 1;
+	/*
+	 * Below we pass 1 for numDataCol (matching the count set just above),
+	 * so this single submission always marks the rbuf full.
+	 */
+ rf_CheckForFullRbuf(raidPtr, reconCtrlPtr, pssPtr, 1);
+
+out:
+ RF_UNLOCK_PSS_MUTEX( raidPtr,rbuf->row,rbuf->parityStripeID);
+ RF_UNLOCK_MUTEX( reconCtrlPtr->rb_mutex );
+ if (rf_reconbufferDebug) {
+ printf("[%d] RAID1 rbuf submission: returning %d\n", tid, retcode);
+ }
+ return(retcode);
+}
diff --git a/sys/dev/raidframe/rf_raid1.h b/sys/dev/raidframe/rf_raid1.h
new file mode 100644
index 00000000000..9ce0cb64067
--- /dev/null
+++ b/sys/dev/raidframe/rf_raid1.h
@@ -0,0 +1,130 @@
+/* $OpenBSD: rf_raid1.h,v 1.1 1999/01/11 14:29:42 niklas Exp $ */
+/* $NetBSD: rf_raid1.h,v 1.1 1998/11/13 04:20:33 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: William V. Courtright II
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* header file for RAID Level 1 */
+
+/*
+ * :
+ * Log: rf_raid1.h,v
+ * Revision 1.17 1996/07/27 23:36:08 jimz
+ * Solaris port of simulator
+ *
+ * Revision 1.16 1996/07/13 00:00:59 jimz
+ * sanitized generalized reconstruction architecture
+ * cleaned up head sep, rbuf problems
+ *
+ * Revision 1.15 1996/07/11 19:08:00 jimz
+ * generalize reconstruction mechanism
+ * allow raid1 reconstructs via copyback (done with array
+ * quiesced, not online, therefore not disk-directed)
+ *
+ * Revision 1.14 1996/06/19 22:23:01 jimz
+ * parity verification is now a layout-configurable thing
+ * not all layouts currently support it (correctly, anyway)
+ *
+ * Revision 1.13 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.12 1996/06/07 22:26:27 jimz
+ * type-ify which_ru (RF_ReconUnitNum_t)
+ *
+ * Revision 1.11 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.10 1996/06/03 23:28:26 jimz
+ * more bugfixes
+ * check in tree to sync for IPDS runs with current bugfixes
+ * there still may be a problem with threads in the script test
+ * getting I/Os stuck- not trivially reproducible (runs ~50 times
+ * in a row without getting stuck)
+ *
+ * Revision 1.9 1996/05/31 22:26:54 jimz
+ * fix a lot of mapping problems, memory allocation problems
+ * found some weird lock issues, fixed 'em
+ * more code cleanup
+ *
+ * Revision 1.8 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.7 1996/05/24 01:59:45 jimz
+ * another checkpoint in code cleanup for release
+ * time to sync kernel tree
+ *
+ * Revision 1.6 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.5 1996/05/03 19:35:34 wvcii
+ * moved dags to dag library
+ *
+ * Revision 1.4 1995/11/30 16:07:26 wvcii
+ * added copyright info
+ *
+ * Revision 1.3 1995/11/16 14:56:41 wvcii
+ * updated prototypes
+ *
+ * Revision 1.2 1995/11/07 15:23:01 wvcii
+ * changed RAID1DagSelect prototype
+ * function no longer generates numHdrSucc, numTermAnt
+ *
+ * Revision 1.1 1995/10/04 03:52:59 wvcii
+ * Initial revision
+ *
+ *
+ */
+
+#ifndef _RF__RF_RAID1_H_
+#define _RF__RF_RAID1_H_
+
+#include "rf_types.h"
+
+int rf_ConfigureRAID1(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr,
+ RF_Config_t *cfgPtr);
+void rf_MapSectorRAID1(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector,
+ RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap);
+void rf_MapParityRAID1(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector,
+ RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap);
+void rf_IdentifyStripeRAID1(RF_Raid_t *raidPtr, RF_RaidAddr_t addr,
+ RF_RowCol_t **diskids, RF_RowCol_t *outRow);
+void rf_MapSIDToPSIDRAID1(RF_RaidLayout_t *layoutPtr,
+ RF_StripeNum_t stripeID, RF_StripeNum_t *psID,
+ RF_ReconUnitNum_t *which_ru);
+void rf_RAID1DagSelect(RF_Raid_t *raidPtr, RF_IoType_t type,
+ RF_AccessStripeMap_t *asmap, RF_VoidFuncPtr *createFunc);
+int rf_VerifyParityRAID1(RF_Raid_t *raidPtr, RF_RaidAddr_t raidAddr,
+ RF_PhysDiskAddr_t *parityPDA, int correct_it, RF_RaidAccessFlags_t flags);
+int rf_SubmitReconBufferRAID1(RF_ReconBuffer_t *rbuf, int keep_it,
+ int use_committed);
+
+#endif /* !_RF__RF_RAID1_H_ */
diff --git a/sys/dev/raidframe/rf_raid4.c b/sys/dev/raidframe/rf_raid4.c
new file mode 100644
index 00000000000..5a2c0da50bf
--- /dev/null
+++ b/sys/dev/raidframe/rf_raid4.c
@@ -0,0 +1,225 @@
+/* $OpenBSD: rf_raid4.c,v 1.1 1999/01/11 14:29:43 niklas Exp $ */
+/* $NetBSD: rf_raid4.c,v 1.1 1998/11/13 04:20:33 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Rachad Youssef
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/***************************************
+ *
+ * rf_raid4.c -- implements RAID Level 4
+ *
+ ***************************************/
+
+/*
+ * :
+ * Log: rf_raid4.c,v
+ * Revision 1.24 1996/07/31 16:56:18 jimz
+ * dataBytesPerStripe, sectorsPerDisk init arch-indep.
+ *
+ * Revision 1.23 1996/07/18 22:57:14 jimz
+ * port simulator to AIX
+ *
+ * Revision 1.22 1996/07/13 00:00:59 jimz
+ * sanitized generalized reconstruction architecture
+ * cleaned up head sep, rbuf problems
+ *
+ * Revision 1.21 1996/06/11 08:54:27 jimz
+ * improved error-checking at configuration time
+ *
+ * Revision 1.20 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.19 1996/06/07 22:26:27 jimz
+ * type-ify which_ru (RF_ReconUnitNum_t)
+ *
+ * Revision 1.18 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.17 1996/06/03 23:28:26 jimz
+ * more bugfixes
+ * check in tree to sync for IPDS runs with current bugfixes
+ * there still may be a problem with threads in the script test
+ * getting I/Os stuck- not trivially reproducible (runs ~50 times
+ * in a row without getting stuck)
+ *
+ * Revision 1.16 1996/05/31 22:26:54 jimz
+ * fix a lot of mapping problems, memory allocation problems
+ * found some weird lock issues, fixed 'em
+ * more code cleanup
+ *
+ * Revision 1.15 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.14 1996/05/24 01:59:45 jimz
+ * another checkpoint in code cleanup for release
+ * time to sync kernel tree
+ *
+ * Revision 1.13 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.12 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.11 1996/05/03 19:39:41 wvcii
+ * added includes for dag library
+ *
+ * Revision 1.10 1995/12/12 18:10:06 jimz
+ * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT
+ * fix 80-column brain damage in comments
+ *
+ * Revision 1.9 1995/12/06 15:02:46 root
+ * added copyright info
+ *
+ * Revision 1.8 1995/11/17 18:57:32 wvcii
+ * added prototyping to MapParity
+ *
+ * Revision 1.7 1995/06/23 13:38:58 robby
+ * updated to prototypes in rf_layout.h
+ *
+ */
+
+#include "rf_raid.h"
+#include "rf_dag.h"
+#include "rf_dagutils.h"
+#include "rf_dagfuncs.h"
+#include "rf_dagffrd.h"
+#include "rf_dagffwr.h"
+#include "rf_dagdegrd.h"
+#include "rf_dagdegwr.h"
+#include "rf_threadid.h"
+#include "rf_raid4.h"
+#include "rf_general.h"
+
+typedef struct RF_Raid4ConfigInfo_s {
+ RF_RowCol_t *stripeIdentifier; /* filled in at config time & used by IdentifyStripe */
+} RF_Raid4ConfigInfo_t;
+
+
+
+int rf_ConfigureRAID4(
+ RF_ShutdownList_t **listp,
+ RF_Raid_t *raidPtr,
+ RF_Config_t *cfgPtr)
+{
+ RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
+ RF_Raid4ConfigInfo_t *info;
+ int i;
+
+ /* create a RAID level 4 configuration structure ... */
+ RF_MallocAndAdd(info, sizeof(RF_Raid4ConfigInfo_t), (RF_Raid4ConfigInfo_t *), raidPtr->cleanupList);
+ if (info == NULL)
+ return(ENOMEM);
+ layoutPtr->layoutSpecificInfo = (void *) info;
+
+ /* ... and fill it in. */
+ RF_MallocAndAdd(info->stripeIdentifier, raidPtr->numCol * sizeof(RF_RowCol_t), (RF_RowCol_t *), raidPtr->cleanupList);
+ if (info->stripeIdentifier == NULL)
+ return(ENOMEM);
+ for (i=0; i<raidPtr->numCol; i++)
+ info->stripeIdentifier[i] = i;
+
+ RF_ASSERT(raidPtr->numRow == 1);
+
+ /* fill in the remaining layout parameters */
+ layoutPtr->numStripe = layoutPtr->stripeUnitsPerDisk;
+ layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit << raidPtr->logBytesPerSector;
+ layoutPtr->numDataCol = raidPtr->numCol-1;
+ layoutPtr->dataSectorsPerStripe = layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit;
+ layoutPtr->numParityCol = 1;
+ raidPtr->totalSectors = layoutPtr->stripeUnitsPerDisk * layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit;
+
+ return(0);
+}
+
+int rf_GetDefaultNumFloatingReconBuffersRAID4(RF_Raid_t *raidPtr)
+{
+ return(20);
+}
+
+RF_HeadSepLimit_t rf_GetDefaultHeadSepLimitRAID4(RF_Raid_t *raidPtr)
+{
+ return(20);
+}
+
+void rf_MapSectorRAID4(
+ RF_Raid_t *raidPtr,
+ RF_RaidAddr_t raidSector,
+ RF_RowCol_t *row,
+ RF_RowCol_t *col,
+ RF_SectorNum_t *diskSector,
+ int remap)
+{
+ RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit;
+ *row = 0;
+ *col = SUID % raidPtr->Layout.numDataCol;
+ *diskSector = (SUID / (raidPtr->Layout.numDataCol)) * raidPtr->Layout.sectorsPerStripeUnit +
+ (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
+}
+
+void rf_MapParityRAID4(
+ RF_Raid_t *raidPtr,
+ RF_RaidAddr_t raidSector,
+ RF_RowCol_t *row,
+ RF_RowCol_t *col,
+ RF_SectorNum_t *diskSector,
+ int remap)
+{
+ RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit;
+
+ *row = 0;
+ *col = raidPtr->Layout.numDataCol;
+ *diskSector =(SUID / (raidPtr->Layout.numDataCol)) * raidPtr->Layout.sectorsPerStripeUnit +
+ (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
+}
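+
+/*
+ * Illustrative sketch only (not part of this import): the RAID4 address
+ * arithmetic above, worked with plain integers.  The geometry constants
+ * (4 data columns, parity on column 4, 32 sectors per stripe unit) are
+ * assumptions chosen just for the example.
+ */
+#include <stdio.h>
+
+int
+main(void)
+{
+	unsigned long raidSector = 200;
+	unsigned long sectorsPerSU = 32;
+	unsigned long numDataCol = 4;
+	unsigned long suid = raidSector / sectorsPerSU;			/* 6 */
+	unsigned long dataCol = suid % numDataCol;			/* 2 */
+	unsigned long diskSector = (suid / numDataCol) * sectorsPerSU +
+	    raidSector % sectorsPerSU;					/* 40 */
+
+	/* data lands on column 2; the fixed parity column is numDataCol (4) */
+	printf("data: col %lu sector %lu; parity: col %lu sector %lu\n",
+	    dataCol, diskSector, numDataCol, diskSector);
+	return (0);
+}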
+
+void rf_IdentifyStripeRAID4(
+ RF_Raid_t *raidPtr,
+ RF_RaidAddr_t addr,
+ RF_RowCol_t **diskids,
+ RF_RowCol_t *outRow)
+{
+ RF_Raid4ConfigInfo_t *info = raidPtr->Layout.layoutSpecificInfo;
+
+ *outRow = 0;
+ *diskids = info->stripeIdentifier;
+}
+
+void rf_MapSIDToPSIDRAID4(
+ RF_RaidLayout_t *layoutPtr,
+ RF_StripeNum_t stripeID,
+ RF_StripeNum_t *psID,
+ RF_ReconUnitNum_t *which_ru)
+{
+ *which_ru = 0;
+ *psID = stripeID;
+}
diff --git a/sys/dev/raidframe/rf_raid4.h b/sys/dev/raidframe/rf_raid4.h
new file mode 100644
index 00000000000..81f8e5375d3
--- /dev/null
+++ b/sys/dev/raidframe/rf_raid4.h
@@ -0,0 +1,109 @@
+/* $OpenBSD: rf_raid4.h,v 1.1 1999/01/11 14:29:43 niklas Exp $ */
+/* $NetBSD: rf_raid4.h,v 1.1 1998/11/13 04:20:33 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Rachad Youssef
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* rf_raid4.h header file for RAID Level 4 */
+
+/*
+ * :
+ * Log: rf_raid4.h,v
+ * Revision 1.15 1996/07/27 23:36:08 jimz
+ * Solaris port of simulator
+ *
+ * Revision 1.14 1996/07/13 00:00:59 jimz
+ * sanitized generalized reconstruction architecture
+ * cleaned up head sep, rbuf problems
+ *
+ * Revision 1.13 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.12 1996/06/07 22:26:27 jimz
+ * type-ify which_ru (RF_ReconUnitNum_t)
+ *
+ * Revision 1.11 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.10 1996/06/03 23:28:26 jimz
+ * more bugfixes
+ * check in tree to sync for IPDS runs with current bugfixes
+ * there still may be a problem with threads in the script test
+ * getting I/Os stuck- not trivially reproducible (runs ~50 times
+ * in a row without getting stuck)
+ *
+ * Revision 1.9 1996/05/31 22:26:54 jimz
+ * fix a lot of mapping problems, memory allocation problems
+ * found some weird lock issues, fixed 'em
+ * more code cleanup
+ *
+ * Revision 1.8 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.7 1996/05/24 01:59:45 jimz
+ * another checkpoint in code cleanup for release
+ * time to sync kernel tree
+ *
+ * Revision 1.6 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.5 1995/12/06 15:07:03 root
+ * added copyright info
+ *
+ * Revision 1.4 1995/11/17 18:58:46 wvcii
+ * added prototyping to MapParity
+ *
+ * Revision 1.3 1995/06/23 13:38:46 robby
+ * updated to prototypes in rf_layout.h
+ *
+ */
+
+#ifndef _RF__RF_RAID4_H_
+#define _RF__RF_RAID4_H_
+
+int rf_ConfigureRAID4(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr,
+ RF_Config_t *cfgPtr);
+int rf_GetDefaultNumFloatingReconBuffersRAID4(RF_Raid_t *raidPtr);
+RF_HeadSepLimit_t rf_GetDefaultHeadSepLimitRAID4(RF_Raid_t *raidPtr);
+void rf_MapSectorRAID4(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector,
+ RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap);
+void rf_MapParityRAID4(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector,
+ RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap);
+void rf_IdentifyStripeRAID4(RF_Raid_t *raidPtr, RF_RaidAddr_t addr,
+ RF_RowCol_t **diskids, RF_RowCol_t *outRow);
+void rf_MapSIDToPSIDRAID4(RF_RaidLayout_t *layoutPtr,
+ RF_StripeNum_t stripeID, RF_StripeNum_t *psID,
+ RF_ReconUnitNum_t *which_ru);
+void rf_RAID4DagSelect(RF_Raid_t *raidPtr, RF_IoType_t type,
+ RF_AccessStripeMap_t *asmap, RF_VoidFuncPtr *createFunc);
+
+#endif /* !_RF__RF_RAID4_H_ */
diff --git a/sys/dev/raidframe/rf_raid5.c b/sys/dev/raidframe/rf_raid5.c
new file mode 100644
index 00000000000..febb9f51f44
--- /dev/null
+++ b/sys/dev/raidframe/rf_raid5.c
@@ -0,0 +1,403 @@
+/* $OpenBSD: rf_raid5.c,v 1.1 1999/01/11 14:29:43 niklas Exp $ */
+/* $NetBSD: rf_raid5.c,v 1.1 1998/11/13 04:20:33 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/******************************************************************************
+ *
+ * rf_raid5.c -- implements RAID Level 5
+ *
+ *****************************************************************************/
+
+/*
+ * :
+ * Log: rf_raid5.c,v
+ * Revision 1.26 1996/11/05 21:10:40 jimz
+ * failed pda generalization
+ *
+ * Revision 1.25 1996/07/31 16:56:18 jimz
+ * dataBytesPerStripe, sectorsPerDisk init arch-indep.
+ *
+ * Revision 1.24 1996/07/18 22:57:14 jimz
+ * port simulator to AIX
+ *
+ * Revision 1.23 1996/07/13 00:00:59 jimz
+ * sanitized generalized reconstruction architecture
+ * cleaned up head sep, rbuf problems
+ *
+ * Revision 1.22 1996/06/11 08:54:27 jimz
+ * improved error-checking at configuration time
+ *
+ * Revision 1.21 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.20 1996/06/07 22:26:27 jimz
+ * type-ify which_ru (RF_ReconUnitNum_t)
+ *
+ * Revision 1.19 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.18 1996/06/05 18:06:02 jimz
+ * Major code cleanup. The Great Renaming is now done.
+ * Better modularity. Better typing. Fixed a bunch of
+ * synchronization bugs. Made a lot of global stuff
+ * per-desc or per-array. Removed dead code.
+ *
+ * Revision 1.17 1996/06/03 23:28:26 jimz
+ * more bugfixes
+ * check in tree to sync for IPDS runs with current bugfixes
+ * there still may be a problem with threads in the script test
+ * getting I/Os stuck- not trivially reproducible (runs ~50 times
+ * in a row without getting stuck)
+ *
+ * Revision 1.16 1996/06/02 17:31:48 jimz
+ * Moved a lot of global stuff into array structure, where it belongs.
+ * Fixed up paritylogging, pss modules in this manner. Some general
+ * code cleanup. Removed lots of dead code, some dead files.
+ *
+ * Revision 1.15 1996/05/31 22:26:54 jimz
+ * fix a lot of mapping problems, memory allocation problems
+ * found some weird lock issues, fixed 'em
+ * more code cleanup
+ *
+ * Revision 1.14 1996/05/30 23:22:16 jimz
+ * bugfixes of serialization, timing problems
+ * more cleanup
+ *
+ * Revision 1.13 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.12 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.11 1996/05/24 01:59:45 jimz
+ * another checkpoint in code cleanup for release
+ * time to sync kernel tree
+ *
+ * Revision 1.10 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.9 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.8 1996/05/03 19:38:58 wvcii
+ * moved dag creation routines to dag library
+ *
+ * Revision 1.7 1995/12/12 18:10:06 jimz
+ * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT
+ * fix 80-column brain damage in comments
+ *
+ * Revision 1.6 1995/12/06 15:04:28 root
+ * added copyright info
+ *
+ * Revision 1.5 1995/11/17 18:59:41 wvcii
+ * added prototyping to MapParity
+ *
+ * Revision 1.4 1995/06/23 13:38:21 robby
+ * updated to prototypes in rf_layout.h
+ *
+ */
+
+#include "rf_types.h"
+#include "rf_raid.h"
+#include "rf_raid5.h"
+#include "rf_dag.h"
+#include "rf_dagffrd.h"
+#include "rf_dagffwr.h"
+#include "rf_dagdegrd.h"
+#include "rf_dagdegwr.h"
+#include "rf_dagutils.h"
+#include "rf_threadid.h"
+#include "rf_general.h"
+#include "rf_map.h"
+#include "rf_utils.h"
+
+typedef struct RF_Raid5ConfigInfo_s {
+ RF_RowCol_t **stripeIdentifier; /* filled in at config time and used by IdentifyStripe */
+} RF_Raid5ConfigInfo_t;
+
+int rf_ConfigureRAID5(
+ RF_ShutdownList_t **listp,
+ RF_Raid_t *raidPtr,
+ RF_Config_t *cfgPtr)
+{
+ RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
+ RF_Raid5ConfigInfo_t *info;
+ RF_RowCol_t i, j, startdisk;
+
+ /* create a RAID level 5 configuration structure */
+ RF_MallocAndAdd(info, sizeof(RF_Raid5ConfigInfo_t), (RF_Raid5ConfigInfo_t *), raidPtr->cleanupList);
+ if (info == NULL)
+ return(ENOMEM);
+ layoutPtr->layoutSpecificInfo = (void *) info;
+
+ RF_ASSERT(raidPtr->numRow == 1);
+
+ /* the stripe identifier must identify the disks in each stripe,
+ * IN THE ORDER THAT THEY APPEAR IN THE STRIPE.
+ */
+ info->stripeIdentifier = rf_make_2d_array(raidPtr->numCol, raidPtr->numCol, raidPtr->cleanupList);
+ if (info->stripeIdentifier == NULL)
+ return(ENOMEM);
+ startdisk = 0;
+ for (i=0; i<raidPtr->numCol; i++) {
+ for (j=0; j<raidPtr->numCol; j++) {
+ info->stripeIdentifier[i][j] = (startdisk + j) % raidPtr->numCol;
+ }
+ if ((--startdisk) < 0) startdisk = raidPtr->numCol-1;
+ }
+
+ /* fill in the remaining layout parameters */
+ layoutPtr->numStripe = layoutPtr->stripeUnitsPerDisk;
+ layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit << raidPtr->logBytesPerSector;
+ layoutPtr->numDataCol = raidPtr->numCol-1;
+ layoutPtr->dataSectorsPerStripe = layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit;
+ layoutPtr->numParityCol = 1;
+ layoutPtr->dataStripeUnitsPerDisk = layoutPtr->stripeUnitsPerDisk;
+
+ raidPtr->totalSectors = layoutPtr->stripeUnitsPerDisk * layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit;
+
+ return(0);
+}
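+
+/*
+ * Illustrative sketch only (not part of this import): prints the rotated
+ * stripeIdentifier table that the loop above builds, for an assumed
+ * 4-column array.  Row i gives the disk order of stripes whose
+ * stripeID % numCol == i.
+ */
+#include <stdio.h>
+
+int
+main(void)
+{
+	int numCol = 4, startdisk = 0, i, j;
+
+	for (i = 0; i < numCol; i++) {
+		for (j = 0; j < numCol; j++)
+			printf("%d ", (startdisk + j) % numCol);
+		printf("\n");	/* prints 0 1 2 3 / 3 0 1 2 / 2 3 0 1 / 1 2 3 0 */
+		if (--startdisk < 0)
+			startdisk = numCol - 1;
+	}
+	return (0);
+}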
+
+int rf_GetDefaultNumFloatingReconBuffersRAID5(RF_Raid_t *raidPtr)
+{
+ return(20);
+}
+
+RF_HeadSepLimit_t rf_GetDefaultHeadSepLimitRAID5(RF_Raid_t *raidPtr)
+{
+ return(10);
+}
+
+#if !defined(__NetBSD__) && !defined(__OpenBSD__) && !defined(_KERNEL)
+/* not currently used */
+int rf_ShutdownRAID5(RF_Raid_t *raidPtr)
+{
+ return(0);
+}
+#endif
+
+void rf_MapSectorRAID5(
+ RF_Raid_t *raidPtr,
+ RF_RaidAddr_t raidSector,
+ RF_RowCol_t *row,
+ RF_RowCol_t *col,
+ RF_SectorNum_t *diskSector,
+ int remap)
+{
+ RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit;
+ *row = 0;
+ *col = (SUID % raidPtr->numCol);
+ *diskSector = (SUID / (raidPtr->Layout.numDataCol)) * raidPtr->Layout.sectorsPerStripeUnit +
+ (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
+}
+
+void rf_MapParityRAID5(
+ RF_Raid_t *raidPtr,
+ RF_RaidAddr_t raidSector,
+ RF_RowCol_t *row,
+ RF_RowCol_t *col,
+ RF_SectorNum_t *diskSector,
+ int remap)
+{
+ RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit;
+
+ *row = 0;
+	*col = raidPtr->Layout.numDataCol -
+	    (SUID / raidPtr->Layout.numDataCol) % raidPtr->numCol;
+	*diskSector = (SUID / raidPtr->Layout.numDataCol) * raidPtr->Layout.sectorsPerStripeUnit +
+	    (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
+}
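+
+/*
+ * Illustrative sketch only (not part of this import): how the parity
+ * column rotates across stripes under the mapping above, for an assumed
+ * 4-column array (3 data + 1 parity).  Stripe s puts its parity on
+ * column numDataCol - (s % numCol).
+ */
+#include <stdio.h>
+
+int
+main(void)
+{
+	int numCol = 4, numDataCol = 3, stripe;
+
+	/* prints columns 3, 2, 1, 0, 3, 2, 1, 0 for stripes 0..7 */
+	for (stripe = 0; stripe < 8; stripe++)
+		printf("stripe %d: parity on column %d\n",
+		    stripe, numDataCol - (stripe % numCol));
+	return (0);
+}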
+
+void rf_IdentifyStripeRAID5(
+ RF_Raid_t *raidPtr,
+ RF_RaidAddr_t addr,
+ RF_RowCol_t **diskids,
+ RF_RowCol_t *outRow)
+{
+ RF_StripeNum_t stripeID = rf_RaidAddressToStripeID(&raidPtr->Layout, addr);
+ RF_Raid5ConfigInfo_t *info = (RF_Raid5ConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
+
+ *outRow = 0;
+ *diskids = info->stripeIdentifier[ stripeID % raidPtr->numCol ];
+}
+
+void rf_MapSIDToPSIDRAID5(
+ RF_RaidLayout_t *layoutPtr,
+ RF_StripeNum_t stripeID,
+ RF_StripeNum_t *psID,
+ RF_ReconUnitNum_t *which_ru)
+{
+ *which_ru = 0;
+ *psID = stripeID;
+}
+
+/* Select an algorithm for performing an access.  Returns, through
+ * createFunc, a pointer to the function that will create the DAG for
+ * this access, or NULL if no suitable DAG creation function exists.
+ */
+void rf_RaidFiveDagSelect(
+ RF_Raid_t *raidPtr,
+ RF_IoType_t type,
+ RF_AccessStripeMap_t *asmap,
+ RF_VoidFuncPtr *createFunc)
+{
+ RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
+ RF_PhysDiskAddr_t *failedPDA=NULL;
+ RF_RowCol_t frow, fcol;
+ RF_RowStatus_t rstat;
+ int prior_recon;
+ int tid;
+
+ RF_ASSERT(RF_IO_IS_R_OR_W(type));
+
+ if (asmap->numDataFailed + asmap->numParityFailed > 1) {
+ RF_ERRORMSG("Multiple disks failed in a single group! Aborting I/O operation.\n");
+ /* *infoFunc = */ *createFunc = NULL;
+ return;
+ } else if (asmap->numDataFailed + asmap->numParityFailed == 1) {
+
+ /* if under recon & already reconstructed, redirect the access to the spare drive
+ * and eliminate the failure indication
+ */
+ failedPDA = asmap->failedPDAs[0];
+ frow = failedPDA->row; fcol = failedPDA->col;
+ rstat = raidPtr->status[failedPDA->row];
+ prior_recon = (rstat == rf_rs_reconfigured) || (
+ (rstat == rf_rs_reconstructing) ?
+ rf_CheckRUReconstructed(raidPtr->reconControl[frow]->reconMap, failedPDA->startSector) : 0
+ );
+ if (prior_recon) {
+ RF_RowCol_t or = failedPDA->row,oc=failedPDA->col;
+ RF_SectorNum_t oo=failedPDA->startSector;
+
+ if (layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) { /* redirect to dist spare space */
+
+ if (failedPDA == asmap->parityInfo) {
+
+ /* parity has failed */
+ (layoutPtr->map->MapParity)(raidPtr, failedPDA->raidAddress, &failedPDA->row,
+ &failedPDA->col, &failedPDA->startSector, RF_REMAP);
+
+ if (asmap->parityInfo->next) { /* redir 2nd component, if any */
+ RF_PhysDiskAddr_t *p = asmap->parityInfo->next;
+ RF_SectorNum_t SUoffs = p->startSector % layoutPtr->sectorsPerStripeUnit;
+ p->row = failedPDA->row;
+ p->col = failedPDA->col;
+ p->startSector = rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr, failedPDA->startSector) +
+ SUoffs; /* cheating: startSector is not really a RAID address */
+ }
+
+ } else if (asmap->parityInfo->next && failedPDA == asmap->parityInfo->next) {
+ RF_ASSERT(0); /* should not ever happen */
+ } else {
+
+ /* data has failed */
+ (layoutPtr->map->MapSector)(raidPtr, failedPDA->raidAddress, &failedPDA->row,
+ &failedPDA->col, &failedPDA->startSector, RF_REMAP);
+
+ }
+
+ } else { /* redirect to dedicated spare space */
+
+ failedPDA->row = raidPtr->Disks[frow][fcol].spareRow;
+ failedPDA->col = raidPtr->Disks[frow][fcol].spareCol;
+
+ /* the parity may have two distinct components, both of which may need to be redirected */
+ if (asmap->parityInfo->next) {
+ if (failedPDA == asmap->parityInfo) {
+ failedPDA->next->row = failedPDA->row;
+ failedPDA->next->col = failedPDA->col;
+ } else if (failedPDA == asmap->parityInfo->next) { /* paranoid: should never occur */
+ asmap->parityInfo->row = failedPDA->row;
+ asmap->parityInfo->col = failedPDA->col;
+ }
+ }
+ }
+
+ RF_ASSERT(failedPDA->col != -1);
+
+ if (rf_dagDebug || rf_mapDebug) {
+ rf_get_threadid(tid);
+ printf("[%d] Redirected type '%c' r %d c %d o %ld -> r %d c %d o %ld\n",
+ tid,type,or,oc,(long)oo,failedPDA->row,failedPDA->col,
+ (long)failedPDA->startSector);
+ }
+
+ asmap->numDataFailed = asmap->numParityFailed = 0;
+ }
+
+ }
+
+	/* All DAGs begin and end with a block/unblock node, so the header-
+	 * successor and terminator-antecedent counts are always 1; those
+	 * counts are managed entirely within the DAG creation routines and
+	 * are not manipulated here. */
+ if (type == RF_IO_TYPE_READ) {
+ if (asmap->numDataFailed == 0)
+ *createFunc = (RF_VoidFuncPtr)rf_CreateFaultFreeReadDAG;
+ else
+ *createFunc = (RF_VoidFuncPtr)rf_CreateRaidFiveDegradedReadDAG;
+ } else {
+
+		/* If mirroring, always use large writes.  If the access requires
+		 * two distinct parity updates, always do a small write.  If the
+		 * stripe contains a failure but the access does not, do a small
+		 * write.  The first conditional (numStripeUnitsAccessed <=
+		 * numDataCol/2) uses less-than-or-equal rather than just
+		 * less-than because when G is 3 or 4, numDataCol/2 is 1, and
+		 * single-stripe-unit updates should use just one data disk.
+		 */
+ if ( (asmap->numDataFailed + asmap->numParityFailed) == 0) {
+ if (rf_suppressLocksAndLargeWrites ||
+ (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) && (layoutPtr->numDataCol!=1)) ||
+ (asmap->parityInfo->next!=NULL) || rf_CheckStripeForFailures(raidPtr, asmap))) {
+ *createFunc = (RF_VoidFuncPtr)rf_CreateSmallWriteDAG;
+ }
+ else
+ *createFunc = (RF_VoidFuncPtr)rf_CreateLargeWriteDAG;
+ }
+ else {
+ if (asmap->numParityFailed == 1)
+ *createFunc = (RF_VoidFuncPtr)rf_CreateNonRedundantWriteDAG;
+ else
+ if (asmap->numStripeUnitsAccessed != 1 && failedPDA->numSector != layoutPtr->sectorsPerStripeUnit)
+ *createFunc = NULL;
+ else
+ *createFunc = (RF_VoidFuncPtr)rf_CreateDegradedWriteDAG;
+ }
+ }
+}
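+
+/*
+ * Illustrative sketch only (not part of this import): the fault-free
+ * write-path choice made above, reduced to a standalone predicate.  The
+ * parameter names are assumptions for the example; a return of 1 means
+ * "small (read-modify-write) write", 0 means "large (reconstruct) write".
+ */
+static int
+example_use_small_write(int suAccessed, int numDataCol,
+    int twoParityComponents, int stripeHasFailure, int suppressLargeWrites)
+{
+	if (suppressLargeWrites || twoParityComponents || stripeHasFailure)
+		return (1);
+	/* <= numDataCol/2 so that single-unit updates on 3- or 4-disk
+	 * arrays still take the small-write path */
+	return (suAccessed <= numDataCol / 2 && numDataCol != 1);
+}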
diff --git a/sys/dev/raidframe/rf_raid5.h b/sys/dev/raidframe/rf_raid5.h
new file mode 100644
index 00000000000..a6ffc971ca4
--- /dev/null
+++ b/sys/dev/raidframe/rf_raid5.h
@@ -0,0 +1,113 @@
+/* $OpenBSD: rf_raid5.h,v 1.1 1999/01/11 14:29:43 niklas Exp $ */
+/* $NetBSD: rf_raid5.h,v 1.1 1998/11/13 04:20:33 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* rf_raid5.h - header file for RAID Level 5 */
+
+/*
+ * :
+ * Log: rf_raid5.h,v
+ * Revision 1.15 1996/07/27 23:36:08 jimz
+ * Solaris port of simulator
+ *
+ * Revision 1.14 1996/07/13 00:00:59 jimz
+ * sanitized generalized reconstruction architecture
+ * cleaned up head sep, rbuf problems
+ *
+ * Revision 1.13 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.12 1996/06/07 22:26:27 jimz
+ * type-ify which_ru (RF_ReconUnitNum_t)
+ *
+ * Revision 1.11 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.10 1996/06/03 23:28:26 jimz
+ * more bugfixes
+ * check in tree to sync for IPDS runs with current bugfixes
+ * there still may be a problem with threads in the script test
+ * getting I/Os stuck- not trivially reproducible (runs ~50 times
+ * in a row without getting stuck)
+ *
+ * Revision 1.9 1996/05/31 22:26:54 jimz
+ * fix a lot of mapping problems, memory allocation problems
+ * found some weird lock issues, fixed 'em
+ * more code cleanup
+ *
+ * Revision 1.8 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.7 1996/05/24 01:59:45 jimz
+ * another checkpoint in code cleanup for release
+ * time to sync kernel tree
+ *
+ * Revision 1.6 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.5 1995/12/06 15:04:35 root
+ * added copyright info
+ *
+ * Revision 1.4 1995/11/17 19:09:08 wvcii
+ * added prototyping to MapParity
+ *
+ * Revision 1.3 1995/11/07 15:25:40 wvcii
+ * changed RAIDFiveDagSelect prototype
+ * function no longer generates numHdrSucc, numTermAnt
+ *
+ * Revision 1.2 1995/06/23 13:37:53 robby
+ * updated to prototypes in rf_layout.h
+ *
+ */
+
+#ifndef _RF__RF_RAID5_H_
+#define _RF__RF_RAID5_H_
+
+int rf_ConfigureRAID5(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr,
+ RF_Config_t *cfgPtr);
+int rf_GetDefaultNumFloatingReconBuffersRAID5(RF_Raid_t *raidPtr);
+RF_HeadSepLimit_t rf_GetDefaultHeadSepLimitRAID5(RF_Raid_t *raidPtr);
+void rf_MapSectorRAID5(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector,
+ RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap);
+void rf_MapParityRAID5(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector,
+ RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap);
+void rf_IdentifyStripeRAID5(RF_Raid_t *raidPtr, RF_RaidAddr_t addr,
+ RF_RowCol_t **diskids, RF_RowCol_t *outRow);
+void rf_MapSIDToPSIDRAID5(RF_RaidLayout_t *layoutPtr,
+ RF_StripeNum_t stripeID, RF_StripeNum_t *psID,
+ RF_ReconUnitNum_t *which_ru);
+void rf_RaidFiveDagSelect(RF_Raid_t *raidPtr, RF_IoType_t type,
+ RF_AccessStripeMap_t *asmap, RF_VoidFuncPtr *createFunc);
+
+#endif /* !_RF__RF_RAID5_H_ */
diff --git a/sys/dev/raidframe/rf_raid5_rotatedspare.c b/sys/dev/raidframe/rf_raid5_rotatedspare.c
new file mode 100644
index 00000000000..ca103f2116a
--- /dev/null
+++ b/sys/dev/raidframe/rf_raid5_rotatedspare.c
@@ -0,0 +1,250 @@
+/* $OpenBSD: rf_raid5_rotatedspare.c,v 1.1 1999/01/11 14:29:44 niklas Exp $ */
+/* $NetBSD: rf_raid5_rotatedspare.c,v 1.1 1998/11/13 04:20:33 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Khalil Amiri
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/**************************************************************************
+ *
+ * rf_raid5_rotated_spare.c -- implements RAID Level 5 with rotated sparing
+ *
+ **************************************************************************/
+
+/* :
+ * Log: rf_raid5_rotatedspare.c,v
+ * Revision 1.22 1996/07/31 16:56:18 jimz
+ * dataBytesPerStripe, sectorsPerDisk init arch-indep.
+ *
+ * Revision 1.21 1996/07/29 14:05:12 jimz
+ * fix numPUs/numRUs confusion (everything is now numRUs)
+ * clean up some commenting, return values
+ *
+ * Revision 1.20 1996/07/18 22:57:14 jimz
+ * port simulator to AIX
+ *
+ * Revision 1.19 1996/07/13 00:00:59 jimz
+ * sanitized generalized reconstruction architecture
+ * cleaned up head sep, rbuf problems
+ *
+ * Revision 1.18 1996/06/19 17:53:48 jimz
+ * move GetNumSparePUs, InstallSpareTable ops into layout switch
+ *
+ * Revision 1.17 1996/06/11 08:54:27 jimz
+ * improved error-checking at configuration time
+ *
+ * Revision 1.16 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.15 1996/06/07 22:26:27 jimz
+ * type-ify which_ru (RF_ReconUnitNum_t)
+ *
+ * Revision 1.14 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.13 1996/06/03 23:28:26 jimz
+ * more bugfixes
+ * check in tree to sync for IPDS runs with current bugfixes
+ * there still may be a problem with threads in the script test
+ * getting I/Os stuck- not trivially reproducible (runs ~50 times
+ * in a row without getting stuck)
+ *
+ * Revision 1.12 1996/06/02 17:31:48 jimz
+ * Moved a lot of global stuff into array structure, where it belongs.
+ * Fixed up paritylogging, pss modules in this manner. Some general
+ * code cleanup. Removed lots of dead code, some dead files.
+ *
+ * Revision 1.11 1996/05/31 22:26:54 jimz
+ * fix a lot of mapping problems, memory allocation problems
+ * found some weird lock issues, fixed 'em
+ * more code cleanup
+ *
+ * Revision 1.10 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.9 1996/05/24 01:59:45 jimz
+ * another checkpoint in code cleanup for release
+ * time to sync kernel tree
+ *
+ * Revision 1.8 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.7 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.6 1996/05/03 19:48:36 wvcii
+ * removed include of rf_redstripe.h
+ *
+ * Revision 1.5 1995/12/12 18:10:06 jimz
+ * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT
+ * fix 80-column brain damage in comments
+ *
+ * Revision 1.4 1995/12/06 15:05:53 root
+ * added copyright info
+ *
+ * Revision 1.3 1995/11/19 21:26:29 amiri
+ * Added an assert to make sure numCol >= 3
+ *
+ * Revision 1.2 1995/11/17 19:03:18 wvcii
+ * added prototyping to MapParity
+ *
+ */
+
+#include "rf_raid.h"
+#include "rf_raid5.h"
+#include "rf_dag.h"
+#include "rf_dagutils.h"
+#include "rf_dagfuncs.h"
+#include "rf_threadid.h"
+#include "rf_general.h"
+#include "rf_utils.h"
+#include "rf_raid5_rotatedspare.h"
+
+typedef struct RF_Raid5RSConfigInfo_s {
+ RF_RowCol_t **stripeIdentifier; /* filled in at config time & used by IdentifyStripe */
+} RF_Raid5RSConfigInfo_t;
+
+int rf_ConfigureRAID5_RS(
+ RF_ShutdownList_t **listp,
+ RF_Raid_t *raidPtr,
+ RF_Config_t *cfgPtr)
+{
+ RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
+ RF_Raid5RSConfigInfo_t *info;
+ RF_RowCol_t i, j, startdisk;
+
+ /* create a RAID level 5 configuration structure */
+ RF_MallocAndAdd(info, sizeof(RF_Raid5RSConfigInfo_t), (RF_Raid5RSConfigInfo_t *), raidPtr->cleanupList);
+ if (info == NULL)
+ return(ENOMEM);
+ layoutPtr->layoutSpecificInfo = (void *) info;
+
+ RF_ASSERT(raidPtr->numRow == 1);
+ RF_ASSERT(raidPtr->numCol >= 3);
+
+ /* the stripe identifier must identify the disks in each stripe,
+ * IN THE ORDER THAT THEY APPEAR IN THE STRIPE.
+ */
+ info->stripeIdentifier = rf_make_2d_array(raidPtr->numCol, raidPtr->numCol, raidPtr->cleanupList);
+ if (info->stripeIdentifier == NULL)
+ return(ENOMEM);
+ startdisk = 0;
+ for (i=0; i<raidPtr->numCol; i++) {
+ for (j=0; j<raidPtr->numCol; j++) {
+ info->stripeIdentifier[i][j] = (startdisk + j) % raidPtr->numCol;
+ }
+ if ((--startdisk) < 0) startdisk = raidPtr->numCol-1;
+ }
+
+ /* fill in the remaining layout parameters */
+ layoutPtr->numStripe = layoutPtr->stripeUnitsPerDisk;
+ layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit << raidPtr->logBytesPerSector;
+ layoutPtr->numDataCol = raidPtr->numCol-2;
+ layoutPtr->dataSectorsPerStripe = layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit;
+ layoutPtr->numParityCol = 1;
+ layoutPtr->dataStripeUnitsPerDisk = layoutPtr->stripeUnitsPerDisk;
+ raidPtr->sectorsPerDisk = layoutPtr->stripeUnitsPerDisk * layoutPtr->sectorsPerStripeUnit;
+
+ raidPtr->totalSectors = layoutPtr->stripeUnitsPerDisk * layoutPtr->numDataCol * layoutPtr->sectorsPerStripeUnit;
+
+ return(0);
+}
+
+RF_ReconUnitCount_t rf_GetNumSpareRUsRAID5_RS(RF_Raid_t *raidPtr)
+{
+ return ( raidPtr->Layout.stripeUnitsPerDisk / raidPtr->numCol );
+}
+
+void rf_MapSectorRAID5_RS(
+ RF_Raid_t *raidPtr,
+ RF_RaidAddr_t raidSector,
+ RF_RowCol_t *row,
+ RF_RowCol_t *col,
+ RF_SectorNum_t *diskSector,
+ int remap)
+{
+ RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit;
+
+ *row = 0;
+	if (remap) {
+		*col = raidPtr->numCol - 1 -
+		    (1 + SUID / raidPtr->Layout.numDataCol) % raidPtr->numCol;
+		/* the spare unit rotates with parity; the line above maps to parity */
+		*col = (*col + 1) % raidPtr->numCol;
+	}
+	else {
+		*col = (SUID + (SUID / raidPtr->Layout.numDataCol)) % raidPtr->numCol;
+	}
+ *diskSector = (SUID / (raidPtr->Layout.numDataCol)) * raidPtr->Layout.sectorsPerStripeUnit +
+ (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
+}
+
+void rf_MapParityRAID5_RS(
+ RF_Raid_t *raidPtr,
+ RF_RaidAddr_t raidSector,
+ RF_RowCol_t *row,
+ RF_RowCol_t *col,
+ RF_SectorNum_t *diskSector,
+ int remap)
+{
+ RF_StripeNum_t SUID = raidSector / raidPtr->Layout.sectorsPerStripeUnit;
+
+ *row = 0;
+ *col = raidPtr->numCol-1-(1+SUID/raidPtr->Layout.numDataCol)%raidPtr->numCol;
+ *diskSector =(SUID / (raidPtr->Layout.numDataCol)) * raidPtr->Layout.sectorsPerStripeUnit +
+ (raidSector % raidPtr->Layout.sectorsPerStripeUnit);
+ if (remap)
+ *col = (*col+1)%raidPtr->numCol;
+}
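+
+/*
+ * Illustrative sketch only (not part of this import): where parity and
+ * its rotated spare land per stripe under the two mappings above, for an
+ * assumed 5-column array (3 data + 1 parity + 1 spare unit per stripe).
+ */
+#include <stdio.h>
+
+int
+main(void)
+{
+	int numCol = 5, stripe;
+
+	for (stripe = 0; stripe < 6; stripe++) {
+		int parity = numCol - 1 - (1 + stripe) % numCol;
+		int spare = (parity + 1) % numCol;	/* spare rotates with parity */
+		printf("stripe %d: parity col %d, spare col %d\n",
+		    stripe, parity, spare);
+	}
+	return (0);
+}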
+
+void rf_IdentifyStripeRAID5_RS(
+ RF_Raid_t *raidPtr,
+ RF_RaidAddr_t addr,
+ RF_RowCol_t **diskids,
+ RF_RowCol_t *outRow)
+{
+ RF_StripeNum_t stripeID = rf_RaidAddressToStripeID(&raidPtr->Layout, addr);
+ RF_Raid5RSConfigInfo_t *info = (RF_Raid5RSConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
+ *outRow = 0;
+ *diskids = info->stripeIdentifier[ stripeID % raidPtr->numCol ];
+
+}
+
+void rf_MapSIDToPSIDRAID5_RS(
+ RF_RaidLayout_t *layoutPtr,
+ RF_StripeNum_t stripeID,
+ RF_StripeNum_t *psID,
+ RF_ReconUnitNum_t *which_ru)
+{
+ *which_ru = 0;
+ *psID = stripeID;
+}
+
diff --git a/sys/dev/raidframe/rf_raid5_rotatedspare.h b/sys/dev/raidframe/rf_raid5_rotatedspare.h
new file mode 100644
index 00000000000..e144b00f6d0
--- /dev/null
+++ b/sys/dev/raidframe/rf_raid5_rotatedspare.h
@@ -0,0 +1,105 @@
+/* $OpenBSD: rf_raid5_rotatedspare.h,v 1.1 1999/01/11 14:29:44 niklas Exp $ */
+/* $NetBSD: rf_raid5_rotatedspare.h,v 1.1 1998/11/13 04:20:33 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Khalil Amiri
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* rf_raid5_rotatedspare.h - header file for RAID Level 5 with rotated sparing */
+
+/* :
+ * Log: rf_raid5_rotatedspare.h,v
+ * Revision 1.13 1996/07/29 14:05:12 jimz
+ * fix numPUs/numRUs confusion (everything is now numRUs)
+ * clean up some commenting, return values
+ *
+ * Revision 1.12 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.11 1996/06/07 22:26:27 jimz
+ * type-ify which_ru (RF_ReconUnitNum_t)
+ *
+ * Revision 1.10 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.9 1996/06/05 18:06:02 jimz
+ * Major code cleanup. The Great Renaming is now done.
+ * Better modularity. Better typing. Fixed a bunch of
+ * synchronization bugs. Made a lot of global stuff
+ * per-desc or per-array. Removed dead code.
+ *
+ * Revision 1.8 1996/06/03 23:28:26 jimz
+ * more bugfixes
+ * check in tree to sync for IPDS runs with current bugfixes
+ * there still may be a problem with threads in the script test
+ * getting I/Os stuck- not trivially reproducible (runs ~50 times
+ * in a row without getting stuck)
+ *
+ * Revision 1.7 1996/05/31 22:26:54 jimz
+ * fix a lot of mapping problems, memory allocation problems
+ * found some weird lock issues, fixed 'em
+ * more code cleanup
+ *
+ * Revision 1.6 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.5 1996/05/24 01:59:45 jimz
+ * another checkpoint in code cleanup for release
+ * time to sync kernel tree
+ *
+ * Revision 1.4 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.3 1995/12/06 15:06:00 root
+ * added copyright info
+ *
+ * Revision 1.2 1995/11/17 19:09:54 wvcii
+ * added prototyping to MapParity
+ *
+ */
+
+#ifndef _RF__RF_RAID5_ROTATEDSPARE_H_
+#define _RF__RF_RAID5_ROTATEDSPARE_H_
+
+int rf_ConfigureRAID5_RS(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr,
+ RF_Config_t *cfgPtr);
+RF_ReconUnitCount_t rf_GetNumSpareRUsRAID5_RS(RF_Raid_t *raidPtr);
+void rf_MapSectorRAID5_RS(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector,
+ RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap);
+void rf_MapParityRAID5_RS(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector,
+ RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap);
+void rf_IdentifyStripeRAID5_RS(RF_Raid_t *raidPtr, RF_RaidAddr_t addr,
+ RF_RowCol_t **diskids, RF_RowCol_t *outRow);
+void rf_MapSIDToPSIDRAID5_RS(RF_RaidLayout_t *layoutPtr,
+ RF_StripeNum_t stripeID, RF_StripeNum_t *psID,
+ RF_ReconUnitNum_t *which_ru);
+
+#endif /* !_RF__RF_RAID5_ROTATEDSPARE_H_ */
diff --git a/sys/dev/raidframe/rf_raidframe.h b/sys/dev/raidframe/rf_raidframe.h
new file mode 100644
index 00000000000..e316dd09eb4
--- /dev/null
+++ b/sys/dev/raidframe/rf_raidframe.h
@@ -0,0 +1,165 @@
+/* $OpenBSD: rf_raidframe.h,v 1.1 1999/01/11 14:29:44 niklas Exp $ */
+/* $NetBSD: rf_raidframe.h,v 1.1 1998/11/13 04:20:33 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*****************************************************
+ *
+ * rf_raidframe.h
+ *
+ * main header file for using raidframe in the kernel.
+ *
+ *****************************************************/
+
+/*
+ * :
+ *
+ * Log: rf_raidframe.h,v
+ * Revision 1.21 1996/06/17 03:00:15 jimz
+ * Change RAIDFRAME_GET_INFO interface to work around ioctl
+ * size limitation problem. This operation now takes a pointer
+ * to a pointer, and does its own copyout() (so it can transfer
+ * more than 8k at a time).
+ *
+ * Revision 1.20 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.19 1996/06/03 23:28:26 jimz
+ * more bugfixes
+ * check in tree to sync for IPDS runs with current bugfixes
+ * there still may be a problem with threads in the script test
+ * getting I/Os stuck- not trivially reproducible (runs ~50 times
+ * in a row without getting stuck)
+ *
+ * Revision 1.18 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.17 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.16 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.15 1996/05/02 22:09:48 jimz
+ * change devs and spares in device_config to RF_RaidDisk_t
+ *
+ * Revision 1.14 1995/12/06 15:03:33 root
+ * added copyright info
+ *
+ * Revision 1.13 1995/09/30 20:39:54 jimz
+ * added new ioctls:
+ * RAIDFRAME_RESET_ACCTOTALS
+ * RAIDFRAME_GET_ACCTOTALS
+ * RAIDFRAME_KEEP_ACCTOTALS
+ *
+ * Revision 1.12 1995/09/25 20:11:51 wvcii
+ * Added #include "rf_raid.h"
+ *
+ *
+ */
+
+#ifndef _RF__RF_RAIDFRAME_H_
+#define _RF__RF_RAIDFRAME_H_
+
+#include "rf_types.h"
+#include "rf_configure.h"
+#include "rf_disks.h"
+#include "rf_raid.h"
+
+struct rf_test_acc { /* used by RAIDFRAME_TEST_ACC ioctl */
+ RF_SectorNum_t startSector; /* raidAddress */
+ RF_SectorCount_t numSector; /* number of sectors to xfer */
+ char *buf; /* data buffer */
+ void *returnBufs[10]; /* for async accs only, completed I/Os returned */
+ struct rf_test_acc *next; /* for making lists */
+ RF_IoType_t type; /* (see rf_types.h for RF_IO_TYPE_*) */
+ struct rf_test_acc *myaddr; /* user-address of this struct */
+ void *bp; /* used in-kernel: need not be set by user */
+};
+
+typedef RF_uint32 RF_ReconReqFlags_t;
+
+struct rf_recon_req { /* used to tell the kernel to fail a disk */
+ RF_RowCol_t row, col;
+ RF_ReconReqFlags_t flags;
+ void *raidPtr; /* used internally; need not be set at ioctl time */
+ struct rf_recon_req *next; /* used internally; need not be set at ioctl time */
+};
+
+struct RF_SparetWait_s {
+ int C, G, fcol; /* C = # disks in row, G = # units in stripe, fcol = which disk has failed */
+
+ RF_StripeCount_t SUsPerPU; /* this stuff is the info required to create a spare table */
+ int TablesPerSpareRegion;
+ int BlocksPerTable;
+ RF_StripeCount_t TableDepthInPUs;
+ RF_StripeCount_t SpareSpaceDepthPerRegionInSUs;
+
+ RF_SparetWait_t *next; /* used internally; need not be set at ioctl time */
+};
+
+typedef struct RF_DeviceConfig_s {
+ u_int rows;
+ u_int cols;
+ u_int maxqdepth;
+ int ndevs;
+ RF_RaidDisk_t devs[RF_MAX_DISKS];
+ int nspares;
+ RF_RaidDisk_t spares[RF_MAX_DISKS];
+} RF_DeviceConfig_t;
+
+
+/* flags that can be put in the rf_recon_req structure */
+#define RF_FDFLAGS_NONE 0x0 /* just fail the disk */
+#define RF_FDFLAGS_RECON 0x1 /* fail and initiate recon */
+
+#define RF_SCSI_DISK_MAJOR 8 /* the device major number for disks in the system */
+
+#define RAIDFRAME_CONFIGURE _IOW ('r', 1, void *) /* configure the driver */
+#define RAIDFRAME_SHUTDOWN _IO ('r', 2) /* shutdown the driver */
+#define RAIDFRAME_TUR _IOW ('r', 3, dev_t) /* debug only: test unit ready */
+#define RAIDFRAME_TEST_ACC _IOWR('r', 4, struct rf_test_acc) /* run a test access */
+#define RAIDFRAME_FAIL_DISK _IOW ('r', 5, struct rf_recon_req) /* fail a disk & optionally start recon */
+#define RAIDFRAME_CHECKRECON _IOWR('r', 6, int) /* get reconstruction % complete on indicated row */
+#define RAIDFRAME_REWRITEPARITY _IO ('r', 7) /* rewrite (initialize) all parity */
+#define RAIDFRAME_COPYBACK _IO ('r', 8) /* copy reconstructed data back to replaced disk */
+#define RAIDFRAME_SPARET_WAIT _IOR ('r', 9, RF_SparetWait_t) /* does not return until kernel needs a spare table */
+#define RAIDFRAME_SEND_SPARET _IOW ('r', 10, void *) /* used to send a spare table down into the kernel */
+#define RAIDFRAME_ABORT_SPARET_WAIT _IO ('r', 11) /* used to wake up the sparemap daemon & tell it to exit */
+#define RAIDFRAME_START_ATRACE _IO ('r', 12) /* start tracing accesses */
+#define RAIDFRAME_STOP_ATRACE _IO ('r', 13) /* stop tracing accesses */
+#define RAIDFRAME_GET_SIZE _IOR ('r', 14, int) /* get size (# sectors) in raid device */
+#define RAIDFRAME_GET_INFO _IOWR('r', 15, RF_DeviceConfig_t *) /* get configuration */
+#define RAIDFRAME_RESET_ACCTOTALS _IO ('r', 16) /* reset AccTotals for device */
+#define RAIDFRAME_GET_ACCTOTALS _IOR ('r', 17, RF_AccTotals_t) /* retrieve AccTotals for device */
+#define RAIDFRAME_KEEP_ACCTOTALS _IOW ('r', 18, int) /* turn AccTotals on or off for device */
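+
+/*
+ * Illustrative sketch of how a userland management tool might drive the
+ * ioctls above.  The helper below is hypothetical and kept out of the
+ * build; it only shows the intended calling convention.
+ */
+#if 0
+#include <sys/ioctl.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+static int
+example_fail_disk(const char *raiddev, int row, int col)
+{
+	struct rf_recon_req req;
+	int fd, size;
+
+	fd = open(raiddev, O_RDWR);
+	if (fd < 0)
+		return (-1);
+
+	memset(&req, 0, sizeof(req));
+	req.row = row;				/* which disk to fail */
+	req.col = col;
+	req.flags = RF_FDFLAGS_RECON;		/* fail it and start recon */
+	if (ioctl(fd, RAIDFRAME_FAIL_DISK, &req) < 0) {
+		close(fd);
+		return (-1);
+	}
+
+	if (ioctl(fd, RAIDFRAME_GET_SIZE, &size) == 0)
+		printf("array size: %d sectors\n", size);
+
+	close(fd);
+	return (0);
+}
+#endif /* 0 */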
+
+#endif /* !_RF__RF_RAIDFRAME_H_ */
diff --git a/sys/dev/raidframe/rf_randmacros.h b/sys/dev/raidframe/rf_randmacros.h
new file mode 100644
index 00000000000..c3536e0c613
--- /dev/null
+++ b/sys/dev/raidframe/rf_randmacros.h
@@ -0,0 +1,228 @@
+/* $OpenBSD: rf_randmacros.h,v 1.1 1999/01/11 14:29:45 niklas Exp $ */
+/* $NetBSD: rf_randmacros.h,v 1.1 1998/11/13 04:20:33 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* rf_randmacros.h
+ * some macros to simplify using random in a multithreaded environment
+ */
+
+/* :
+ * Log: rf_randmacros.h,v
+ * Revision 1.17 1996/08/12 22:37:57 jimz
+ * use regular random() stuff for AIX
+ *
+ * Revision 1.16 1996/08/11 00:41:03 jimz
+ * fix up for aix4
+ *
+ * Revision 1.15 1996/07/29 05:22:34 jimz
+ * use rand/srand on hpux
+ *
+ * Revision 1.14 1996/07/28 20:31:39 jimz
+ * i386netbsd port
+ * true/false fixup
+ *
+ * Revision 1.13 1996/07/27 23:36:08 jimz
+ * Solaris port of simulator
+ *
+ * Revision 1.12 1996/07/22 19:52:16 jimz
+ * switched node params to RF_DagParam_t, a union of
+ * a 64-bit int and a void *, for better portability
+ * attempted hpux port, but failed partway through for
+ * lack of a single C compiler capable of compiling all
+ * source files
+ *
+ * Revision 1.11 1996/07/18 22:57:14 jimz
+ * port simulator to AIX
+ *
+ * Revision 1.10 1996/07/15 17:22:18 jimz
+ * nit-pick code cleanup
+ * resolve stdlib problems on DEC OSF
+ *
+ * Revision 1.9 1996/06/09 02:36:46 jimz
+ * lots of little crufty cleanup- fixup whitespace
+ * issues, comment #ifdefs, improve typing in some
+ * places (esp size-related)
+ *
+ * Revision 1.8 1996/06/03 23:28:26 jimz
+ * more bugfixes
+ * check in tree to sync for IPDS runs with current bugfixes
+ * there still may be a problem with threads in the script test
+ * getting I/Os stuck- not trivially reproducible (runs ~50 times
+ * in a row without getting stuck)
+ *
+ * Revision 1.7 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.6 1996/05/21 18:52:56 jimz
+ * mask out highest bit from RANDOM (was causing angst)
+ *
+ * Revision 1.5 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.4 1995/12/06 15:05:41 root
+ * added copyright info
+ *
+ */
+
+#ifndef _RF__RF_RANDMACROS_H_
+#define _RF__RF_RANDMACROS_H_
+
+#ifndef KERNEL
+
+#ifdef __osf__
+/*
+ * Okay, here's the deal. The DEC man page for initstate_r() sez:
+ *
+ * int initstate_r(unsigned seed, char *state, int size, char **retval,
+ * struct random_data *rand_data);
+ *
+ * That wouldn't bug me so much, if /usr/include/random.h on the alpha
+ * didn't say:
+ *
+ * int initstate_r(unsigned, char *, int, RANDMOD *);
+ *
+ * Most of the other random functions have similar problems (docs
+ * don't match random.h). This is the case for random_r(), for
+ * instance. Generally, I'm inclined to trust the code over the
+ * documentation. Problem is, I have no clue what the arguments for
+ * the prototyped versions are, since they don't have descriptive names
+ * comma the bastards.
+ *
+ * Update: I looked at the DU sources to get this straightened out.
+ * The docs are correct, and everything in random.h is wrong. Uh, that's
+ * really cool or something. Not. I'm going to try slapping in prototypes
+ * that match my view of the universe, here.
+ *
+ * Okay, now let's have some more fun. /usr/include/stdlib.h also defines
+ * all this stuff, only differently. I mean differently from random.h,
+ * _and_ differently from the source. How cool is _that_?
+ *
+ * --jimz
+ */
+#ifndef _NO_PROTO
+#define _NO_PROTO
+#define _RF_SPANKME
+#endif /* !_NO_PROTO */
+#include <random.h>
+#ifdef _RF_SPANKME
+#undef _NO_PROTO
+#undef _RF_SPANKME
+#endif /* _RF_SPANKME */
+
+extern int initstate_r(unsigned seed, char *arg_state, int n, char **retval,
+ struct random_data *rand_data);
+extern int random_r(int *retval, struct random_data *rand_data);
+#endif /* __osf__ */
+#ifdef SIMULATE
+#if defined(DEC_OSF) || defined(hpux)
+extern int random(void);
+extern int srandom(unsigned);
+#endif /* DEC_OSF || hpux */
+#if defined(AIX) && RF_AIXVERS == 3
+extern int random(void);
+extern int srandom(unsigned);
+#endif /* AIX && RF_AIXVERS == 3 */
+#endif /* SIMULATE */
+
+#define RF_FASTRANDOM 0 /* when >0 make RANDOM a macro instead of a function */
+
+#ifdef __osf__
+long rf_do_random(long *rval, struct random_data *rdata); /* in utils.c */
+#endif /* __osf__ */
+
+#ifndef SIMULATE
+
+#ifdef __osf__
+/*
+ * Mark's original comment about this rigamarole was, "What a pile of crap."
+ */
+#define RF_DECLARE_RANDOM \
+ struct random_data randdata; \
+ long randstate[64+1]; \
+ char *stptr = ((char *) randstate)+4; \
+ char *randst; \
+ long randval
+
+#define RF_DECLARE_STATIC_RANDOM \
+ static struct random_data randdata_st; \
+ static long randstate_st[64+1]; \
+ static char *stptr_st = ((char *) randstate_st)+4; \
+ static char *randst_st; \
+ long randval_st;
+
+#define RF_INIT_RANDOM(_s_) \
+ randdata.state = NULL; \
+ initstate_r((unsigned) (_s_), stptr, 64, &randst, &randdata);
+
+#define RF_INIT_STATIC_RANDOM(_s_) \
+ randdata_st.state = NULL; \
+ initstate_r((unsigned) (_s_), stptr_st, 64, &randst_st, &randdata_st);
+
+#if RF_FASTRANDOM > 0
+#define RF_RANDOM() (random_r(&randval, &randdata),randval)
+#define RF_STATIC_RANDOM() (random_r(&randval_st, &randdata_st),randval_st)
+#else /* RF_FASTRANDOM > 0 */
+#define RF_RANDOM() (rf_do_random(&randval, &randdata)&0x7fffffffffffffff)
+#define RF_STATIC_RANDOM() rf_do_random(&randval_st, &randdata_st)
+#endif /* RF_FASTRANDOM > 0 */
+
+#define RF_SRANDOM(_s_) srandom_r((_s_), &randdata)
+#define RF_STATIC_SRANDOM(_s_) srandom_r((_s_), &randdata_st)
+#endif /* __osf__ */
+
+#ifdef AIX
+#define RF_INIT_STATIC_RANDOM(_s_)
+#define RF_DECLARE_STATIC_RANDOM static int rf_rand_decl##__LINE__
+#define RF_DECLARE_RANDOM int rf_rand_decl##__LINE__
+#define RF_RANDOM() random()
+#define RF_STATIC_RANDOM() random()
+#define RF_INIT_RANDOM(_n_) srandom(_n_)
+#endif /* AIX */
+
+#else /* !SIMULATE */
+
+#define RF_INIT_STATIC_RANDOM(_s_)
+#define RF_DECLARE_STATIC_RANDOM static int rf_rand_decl##__LINE__
+#define RF_DECLARE_RANDOM int rf_rand_decl##__LINE__
+#if defined(sun) || defined(hpux)
+#define RF_RANDOM() rand()
+#define RF_STATIC_RANDOM() rand()
+#define RF_INIT_RANDOM(_n_) srand(_n_)
+#else /* sun || hpux */
+#define RF_RANDOM() random()
+#define RF_STATIC_RANDOM() random()
+#define RF_INIT_RANDOM(_n_) srandom(_n_)
+#endif /* sun || hpux */
+
+#endif /* !SIMULATE */
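+
+/*
+ * Minimal usage sketch for the macros above.  The function below is
+ * hypothetical and excluded from the build; it only shows the intended
+ * calling convention: declare the generator state, seed it once, then
+ * draw values.
+ */
+#if 0
+static long
+rf_example_draw(unsigned seed)
+{
+	RF_DECLARE_RANDOM;		/* per-caller generator state */
+
+	RF_INIT_RANDOM(seed);		/* seed (and set up state where needed) */
+	return (RF_RANDOM());		/* each call yields a new value */
+}
+#endif /* 0 */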
+
+#endif /* !KERNEL */
+
+#endif /* !_RF__RF_RANDMACROS_H_ */
diff --git a/sys/dev/raidframe/rf_reconbuffer.c b/sys/dev/raidframe/rf_reconbuffer.c
new file mode 100644
index 00000000000..2c24e47c111
--- /dev/null
+++ b/sys/dev/raidframe/rf_reconbuffer.c
@@ -0,0 +1,538 @@
+/* $OpenBSD: rf_reconbuffer.c,v 1.1 1999/01/11 14:29:45 niklas Exp $ */
+/* $NetBSD: rf_reconbuffer.c,v 1.1 1998/11/13 04:20:33 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/***************************************************
+ *
+ * rf_reconbuffer.c -- reconstruction buffer manager
+ *
+ ***************************************************/
+
+/* :
+ * Log: rf_reconbuffer.c,v
+ * Revision 1.33 1996/07/27 23:36:08 jimz
+ * Solaris port of simulator
+ *
+ * Revision 1.32 1996/07/17 21:00:58 jimz
+ * clean up timer interface, tracing
+ *
+ * Revision 1.31 1996/07/13 00:00:59 jimz
+ * sanitized generalized reconstruction architecture
+ * cleaned up head sep, rbuf problems
+ *
+ * Revision 1.30 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.29 1996/06/06 01:23:58 jimz
+ * don't free reconCtrlPtr until after all fields have been used out of it
+ *
+ * Revision 1.28 1996/06/05 18:06:02 jimz
+ * Major code cleanup. The Great Renaming is now done.
+ * Better modularity. Better typing. Fixed a bunch of
+ * synchronization bugs. Made a lot of global stuff
+ * per-desc or per-array. Removed dead code.
+ *
+ * Revision 1.27 1996/06/03 23:28:26 jimz
+ * more bugfixes
+ * check in tree to sync for IPDS runs with current bugfixes
+ * there still may be a problem with threads in the script test
+ * getting I/Os stuck- not trivially reproducible (runs ~50 times
+ * in a row without getting stuck)
+ *
+ * Revision 1.26 1996/06/02 17:31:48 jimz
+ * Moved a lot of global stuff into array structure, where it belongs.
+ * Fixed up paritylogging, pss modules in this manner. Some general
+ * code cleanup. Removed lots of dead code, some dead files.
+ *
+ * Revision 1.25 1996/05/31 22:26:54 jimz
+ * fix a lot of mapping problems, memory allocation problems
+ * found some weird lock issues, fixed 'em
+ * more code cleanup
+ *
+ * Revision 1.24 1996/05/30 12:59:18 jimz
+ * make etimer happier, more portable
+ *
+ * Revision 1.23 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.22 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.21 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.20 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.19 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.18 1995/12/12 18:10:06 jimz
+ * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT
+ * fix 80-column brain damage in comments
+ *
+ * Revision 1.17 1995/12/06 15:03:24 root
+ * added copyright info
+ *
+ */
+
+#ifdef _KERNEL
+#define KERNEL
+#endif
+
+#include "rf_raid.h"
+#include "rf_reconbuffer.h"
+#include "rf_acctrace.h"
+#include "rf_etimer.h"
+#include "rf_general.h"
+#include "rf_debugprint.h"
+#include "rf_revent.h"
+#include "rf_reconutil.h"
+#include "rf_nwayxor.h"
+
+#ifdef KERNEL
+#define Dprintf1(s,a) if (rf_reconbufferDebug) printf(s,a)
+#define Dprintf2(s,a,b) if (rf_reconbufferDebug) printf(s,a,b)
+#define Dprintf3(s,a,b,c) if (rf_reconbufferDebug) printf(s,a,b,c)
+#define Dprintf4(s,a,b,c,d) if (rf_reconbufferDebug) printf(s,a,b,c,d)
+#define Dprintf5(s,a,b,c,d,e) if (rf_reconbufferDebug) printf(s,a,b,c,d,e)
+#else /* KERNEL */
+#define Dprintf1(s,a) if (rf_reconbufferDebug) rf_debug_printf(s,(void *)((unsigned long)a),NULL,NULL,NULL,NULL,NULL,NULL,NULL)
+#define Dprintf2(s,a,b) if (rf_reconbufferDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),NULL,NULL,NULL,NULL,NULL,NULL)
+#define Dprintf3(s,a,b,c) if (rf_reconbufferDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),NULL,NULL,NULL,NULL,NULL)
+#define Dprintf4(s,a,b,c,d) if (rf_reconbufferDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),NULL,NULL,NULL,NULL)
+#define Dprintf5(s,a,b,c,d,e) if (rf_reconbufferDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),NULL,NULL,NULL)
+#endif /* KERNEL */
+
+#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL)
+
+/* XXX XXX XXX This is wrong, for a number of reasons:
+   a) thread_block doesn't exist with UVM
+   b) The prototype being used here is wrong for the regular VM
+      (the regular VM expects a (char *) as an argument). I don't put
+      that in here as this code uses thread_block with no arguments... :-/
+
+*/
+#if 0
+void thread_block(void);
+#endif
+#endif
+
+/*****************************************************************************************
+ *
+ * Submit a reconstruction buffer to the manager for XOR.
+ * We can only submit a buffer if (1) we can xor into an existing buffer, which means
+ * we don't have to acquire a new one, (2) we can acquire a floating
+ * recon buffer, or (3) the caller has indicated that we are allowed to keep the
+ * submitted buffer.
+ *
+ * Returns non-zero if and only if we were not able to submit.
+ * In this case, we append the current disk ID to the wait list on the indicated
+ * RU, so that it will be re-enabled when we acquire a buffer for this RU.
+ *
+ ****************************************************************************************/
+
+/* just to make the code below more readable */
+#define BUFWAIT_APPEND(_cb_, _pssPtr_, _row_, _col_) \
+ _cb_ = rf_AllocCallbackDesc(); \
+ (_cb_)->row = (_row_); (_cb_)->col = (_col_); (_cb_)->next = (_pssPtr_)->bufWaitList; (_pssPtr_)->bufWaitList = (_cb_);
+
+/*
+ * nWayXorFuncs[i] is a pointer to a function that will xor "i"
+ * bufs into the accumulating sum.
+ */
+static RF_VoidFuncPtr nWayXorFuncs[] = {
+ NULL,
+ (RF_VoidFuncPtr)rf_nWayXor1,
+ (RF_VoidFuncPtr)rf_nWayXor2,
+ (RF_VoidFuncPtr)rf_nWayXor3,
+ (RF_VoidFuncPtr)rf_nWayXor4,
+ (RF_VoidFuncPtr)rf_nWayXor5,
+ (RF_VoidFuncPtr)rf_nWayXor6,
+ (RF_VoidFuncPtr)rf_nWayXor7,
+ (RF_VoidFuncPtr)rf_nWayXor8,
+ (RF_VoidFuncPtr)rf_nWayXor9
+};
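+
+/*
+ * Conceptual sketch (illustrative only, not the actual rf_nWayXor* code in
+ * rf_nwayxor.c) of what each table entry does: XOR the source buffers, one
+ * long word at a time, into the destination buffer handed to it by
+ * rf_MultiWayReconXor() below.
+ */
+#if 0
+static void
+example_nway_xor(RF_ReconBuffer_t **srcs, int nsrc, RF_ReconBuffer_t *dest,
+	int len)
+{
+	long *d = (long *) dest->buffer;
+	long *s;
+	int i, w;
+
+	for (i = 0; i < nsrc; i++) {
+		s = (long *) srcs[i]->buffer;
+		for (w = 0; w < len; w++)
+			d[w] ^= s[w];	/* accumulate the reconstructed data */
+	}
+}
+#endif /* 0 */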
+
+int rf_SubmitReconBuffer(rbuf, keep_it, use_committed)
+ RF_ReconBuffer_t *rbuf; /* the recon buffer to submit */
+ int keep_it; /* whether we can keep this buffer or we have to return it */
+ int use_committed; /* whether to use a committed or an available recon buffer */
+{
+ RF_LayoutSW_t *lp;
+ int rc;
+
+ lp = rbuf->raidPtr->Layout.map;
+ rc = lp->SubmitReconBuffer(rbuf, keep_it, use_committed);
+ return(rc);
+}
+
+int rf_SubmitReconBufferBasic(rbuf, keep_it, use_committed)
+ RF_ReconBuffer_t *rbuf; /* the recon buffer to submit */
+ int keep_it; /* whether we can keep this buffer or we have to return it */
+ int use_committed; /* whether to use a committed or an available recon buffer */
+{
+ RF_Raid_t *raidPtr = rbuf->raidPtr;
+ RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
+ RF_ReconCtrl_t *reconCtrlPtr = raidPtr->reconControl[rbuf->row];
+ RF_ReconParityStripeStatus_t *pssPtr;
+ RF_ReconBuffer_t *targetRbuf, *t = NULL; /* temporary rbuf pointers */
+ caddr_t ta; /* temporary data buffer pointer */
+ RF_CallbackDesc_t *cb, *p;
+ int retcode = 0, created = 0;
+
+ RF_Etimer_t timer;
+
+ /* makes no sense to have a submission from the failed disk */
+ RF_ASSERT(rbuf);
+ RF_ASSERT(rbuf->col != reconCtrlPtr->fcol);
+
+ Dprintf5("RECON: submission by row %d col %d for psid %ld ru %d (failed offset %ld)\n",
+ rbuf->row, rbuf->col, (long)rbuf->parityStripeID, rbuf->which_ru, (long)rbuf->failedDiskSectorOffset);
+
+ RF_LOCK_PSS_MUTEX(raidPtr,rbuf->row,rbuf->parityStripeID);
+
+ RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex);
+
+ pssPtr = rf_LookupRUStatus(raidPtr, reconCtrlPtr->pssTable, rbuf->parityStripeID, rbuf->which_ru, RF_PSS_NONE, &created);
+ RF_ASSERT(pssPtr); /* if it didn't exist, we wouldn't have gotten an rbuf for it */
+
+ /* check to see if enough buffers have accumulated to do an XOR. If so, there's no need to
+ * acquire a floating rbuf. Before we can do any XORing, we must have acquired a destination
+ * buffer. If we have, then we can go ahead and do the XOR if (1) including this buffer, enough
+ * bufs have accumulated, or (2) this is the last submission for this stripe.
+ * Otherwise, we have to go acquire a floating rbuf.
+ */
+
+ targetRbuf = (RF_ReconBuffer_t *) pssPtr->rbuf;
+ if ( (targetRbuf != NULL) &&
+ ((pssPtr->xorBufCount == rf_numBufsToAccumulate-1) || (targetRbuf->count + pssPtr->xorBufCount + 1 == layoutPtr->numDataCol)) ) {
+ pssPtr->rbufsForXor[ pssPtr->xorBufCount++ ] = rbuf; /* install this buffer */
+ Dprintf3("RECON: row %d col %d invoking a %d-way XOR\n",rbuf->row, rbuf->col,pssPtr->xorBufCount);
+ RF_ETIMER_START(timer);
+ rf_MultiWayReconXor(raidPtr, pssPtr);
+ RF_ETIMER_STOP(timer); RF_ETIMER_EVAL(timer);
+ raidPtr->accumXorTimeUs += RF_ETIMER_VAL_US(timer);
+ if (!keep_it) {
+ raidPtr->recon_tracerecs[rbuf->col].xor_us = RF_ETIMER_VAL_US(timer);
+ RF_ETIMER_STOP(raidPtr->recon_tracerecs[rbuf->col].recon_timer);
+ RF_ETIMER_EVAL(raidPtr->recon_tracerecs[rbuf->col].recon_timer);
+ raidPtr->recon_tracerecs[rbuf->col].specific.recon.recon_return_to_submit_us +=
+ RF_ETIMER_VAL_US(raidPtr->recon_tracerecs[rbuf->col].recon_timer);
+ RF_ETIMER_START(raidPtr->recon_tracerecs[rbuf->col].recon_timer);
+
+ rf_LogTraceRec(raidPtr, &raidPtr->recon_tracerecs[rbuf->col]);
+ }
+ rf_CheckForFullRbuf(raidPtr, reconCtrlPtr, pssPtr, layoutPtr->numDataCol);
+
+ /* if use_committed is on, we _must_ consume a buffer off the committed list. */
+ if (use_committed) {
+ t = reconCtrlPtr->committedRbufs;
+ RF_ASSERT(t);
+ reconCtrlPtr->committedRbufs = t->next;
+ rf_ReleaseFloatingReconBuffer(raidPtr, rbuf->row, t);
+ }
+ if (keep_it) {
+ RF_UNLOCK_PSS_MUTEX( raidPtr,rbuf->row,rbuf->parityStripeID);
+ RF_UNLOCK_MUTEX( reconCtrlPtr->rb_mutex );
+ rf_FreeReconBuffer(rbuf);
+ return(retcode);
+ }
+ goto out;
+ }
+
+ /* set the value of "t", which we'll use as the rbuf from here on */
+ if (keep_it) {
+ t = rbuf;
+ }
+ else {
+ if (use_committed) { /* if a buffer has been committed to us, use it */
+ t = reconCtrlPtr->committedRbufs;
+ RF_ASSERT(t);
+ reconCtrlPtr->committedRbufs = t->next;
+ t->next = NULL;
+ } else if (reconCtrlPtr->floatingRbufs) {
+ t = reconCtrlPtr->floatingRbufs;
+ reconCtrlPtr->floatingRbufs = t->next;
+ t->next = NULL;
+ }
+ }
+
+ /* If we weren't able to acquire a buffer,
+ * append to the end of the buf list in the recon ctrl struct.
+ */
+ if (!t) {
+ RF_ASSERT(!keep_it && !use_committed);
+ Dprintf2("RECON: row %d col %d failed to acquire floating rbuf\n",rbuf->row, rbuf->col);
+
+ raidPtr->procsInBufWait++;
+ if ( (raidPtr->procsInBufWait == raidPtr->numCol -1) && (raidPtr->numFullReconBuffers == 0)) {
+ printf("Buffer wait deadlock detected. Exiting.\n");
+ rf_PrintPSStatusTable(raidPtr, rbuf->row);
+ RF_PANIC();
+ }
+ pssPtr->flags |= RF_PSS_BUFFERWAIT;
+ cb = rf_AllocCallbackDesc(); /* append to buf wait list in recon ctrl structure */
+ cb->row = rbuf->row; cb->col = rbuf->col;
+ cb->callbackArg.v = rbuf->parityStripeID;
+ cb->callbackArg2.v = rbuf->which_ru;
+ cb->next = NULL;
+ if (!reconCtrlPtr->bufferWaitList) reconCtrlPtr->bufferWaitList = cb;
+ else { /* might want to maintain head/tail pointers here rather than search for end of list */
+ for (p = reconCtrlPtr->bufferWaitList; p->next; p=p->next);
+ p->next = cb;
+ }
+ retcode = 1;
+ goto out;
+ }
+ Dprintf2("RECON: row %d col %d acquired rbuf\n",rbuf->row, rbuf->col);
+ RF_ETIMER_STOP(raidPtr->recon_tracerecs[rbuf->col].recon_timer);
+ RF_ETIMER_EVAL(raidPtr->recon_tracerecs[rbuf->col].recon_timer);
+ raidPtr->recon_tracerecs[rbuf->col].specific.recon.recon_return_to_submit_us +=
+ RF_ETIMER_VAL_US(raidPtr->recon_tracerecs[rbuf->col].recon_timer);
+ RF_ETIMER_START(raidPtr->recon_tracerecs[rbuf->col].recon_timer);
+
+ rf_LogTraceRec(raidPtr, &raidPtr->recon_tracerecs[rbuf->col]);
+
+ /* initialize the buffer */
+ if (t!=rbuf) {
+ t->row = rbuf->row; t->col = reconCtrlPtr->fcol;
+ t->parityStripeID = rbuf->parityStripeID;
+ t->which_ru = rbuf->which_ru;
+ t->failedDiskSectorOffset = rbuf->failedDiskSectorOffset;
+ t->spRow=rbuf->spRow;
+ t->spCol=rbuf->spCol;
+ t->spOffset=rbuf->spOffset;
+
+ ta = t->buffer; t->buffer = rbuf->buffer; rbuf->buffer = ta; /* swap buffers */
+ }
+
+ /* the first installation always gets installed as the destination buffer.
+ * subsequent installations get stacked up to allow for multi-way XOR
+ */
+ if (!pssPtr->rbuf) {pssPtr->rbuf = t; t->count = 1;}
+ else pssPtr->rbufsForXor[ pssPtr->xorBufCount++ ] = t; /* install this buffer */
+
+ rf_CheckForFullRbuf(raidPtr, reconCtrlPtr, pssPtr, layoutPtr->numDataCol); /* the buffer is full if G=2 */
+
+out:
+ RF_UNLOCK_PSS_MUTEX( raidPtr,rbuf->row,rbuf->parityStripeID);
+ RF_UNLOCK_MUTEX( reconCtrlPtr->rb_mutex );
+ return(retcode);
+}
+
+int rf_MultiWayReconXor(raidPtr, pssPtr)
+ RF_Raid_t *raidPtr;
+ RF_ReconParityStripeStatus_t *pssPtr; /* the pss descriptor for this parity stripe */
+{
+ int i, numBufs = pssPtr->xorBufCount;
+ int numBytes = rf_RaidAddressToByte(raidPtr, raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU);
+ RF_ReconBuffer_t **rbufs = (RF_ReconBuffer_t **) pssPtr->rbufsForXor;
+ RF_ReconBuffer_t *targetRbuf = (RF_ReconBuffer_t *) pssPtr->rbuf;
+
+ RF_ASSERT(pssPtr->rbuf != NULL);
+ RF_ASSERT(numBufs > 0 && numBufs < RF_PS_MAX_BUFS);
+#ifdef KERNEL
+#if !defined(__NetBSD__) && !defined(__OpenBSD__)
+ thread_block(); /* yield the processor before doing a big XOR */
+#endif
+#endif /* KERNEL */
+ /*
+ * XXX
+ *
+ * What if more than 9 bufs?
+ */
+ nWayXorFuncs[numBufs](pssPtr->rbufsForXor, targetRbuf, numBytes/sizeof(long));
+
+  /* release all the reconstruction buffers except the last one, which belongs to
+   * the disk whose submission caused this XOR to take place
+ */
+ for (i=0; i < numBufs-1; i++) {
+ if (rbufs[i]->type == RF_RBUF_TYPE_FLOATING) rf_ReleaseFloatingReconBuffer(raidPtr, rbufs[i]->row, rbufs[i]);
+ else if (rbufs[i]->type == RF_RBUF_TYPE_FORCED) rf_FreeReconBuffer(rbufs[i]);
+ else RF_ASSERT(0);
+ }
+ targetRbuf->count += pssPtr->xorBufCount;
+ pssPtr->xorBufCount = 0;
+ return(0);
+}
+
+/* removes one full buffer from one of the full-buffer lists and returns it.
+ *
+ * ASSUMES THE RB_MUTEX IS UNLOCKED AT ENTRY.
+ */
+RF_ReconBuffer_t *rf_GetFullReconBuffer(reconCtrlPtr)
+ RF_ReconCtrl_t *reconCtrlPtr;
+{
+ RF_ReconBuffer_t *p;
+
+ RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex);
+
+ if ( (p=reconCtrlPtr->priorityList) != NULL) {
+ reconCtrlPtr->priorityList = p->next;
+ p->next = NULL;
+ goto out;
+ }
+ if ( (p=reconCtrlPtr->fullBufferList) != NULL) {
+ reconCtrlPtr->fullBufferList = p->next;
+ p->next = NULL;
+ goto out;
+ }
+
+out:
+ RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex);
+ return(p);
+}
+
+
+/* if the reconstruction buffer is full, move it to the full list, which is maintained
+ * sorted by failed disk sector offset
+ *
+ * ASSUMES THE RB_MUTEX IS LOCKED AT ENTRY.
+ */
+int rf_CheckForFullRbuf(raidPtr, reconCtrl, pssPtr, numDataCol)
+ RF_Raid_t *raidPtr;
+ RF_ReconCtrl_t *reconCtrl;
+ RF_ReconParityStripeStatus_t *pssPtr;
+ int numDataCol;
+{
+ RF_ReconBuffer_t *p, *pt, *rbuf = (RF_ReconBuffer_t *) pssPtr->rbuf;
+
+ if (rbuf->count == numDataCol) {
+ raidPtr->numFullReconBuffers++;
+ Dprintf2("RECON: rbuf for psid %ld ru %d has filled\n",
+ (long)rbuf->parityStripeID, rbuf->which_ru);
+ if (!reconCtrl->fullBufferList || (rbuf->failedDiskSectorOffset < reconCtrl->fullBufferList->failedDiskSectorOffset)) {
+ Dprintf2("RECON: rbuf for psid %ld ru %d is head of list\n",
+ (long)rbuf->parityStripeID, rbuf->which_ru);
+ rbuf->next = reconCtrl->fullBufferList;
+ reconCtrl->fullBufferList = rbuf;
+ }
+ else {
+ for (pt = reconCtrl->fullBufferList, p = pt->next; p && p->failedDiskSectorOffset < rbuf->failedDiskSectorOffset; pt=p, p=p->next);
+ rbuf->next = p;
+ pt->next = rbuf;
+ Dprintf2("RECON: rbuf for psid %ld ru %d is in list\n",
+ (long)rbuf->parityStripeID, rbuf->which_ru);
+ }
+#if 0
+ pssPtr->writeRbuf = pssPtr->rbuf; /* DEBUG ONLY: we like to be able to find this rbuf while it's awaiting write */
+#else
+ rbuf->pssPtr = pssPtr;
+#endif
+ pssPtr->rbuf = NULL;
+ rf_CauseReconEvent(raidPtr, rbuf->row, rbuf->col, NULL, RF_REVENT_BUFREADY);
+ }
+ return(0);
+}
+
+
+/* release a floating recon buffer for someone else to use.
+ * assumes the rb_mutex is LOCKED at entry
+ */
+void rf_ReleaseFloatingReconBuffer(raidPtr, row, rbuf)
+ RF_Raid_t *raidPtr;
+ RF_RowCol_t row;
+ RF_ReconBuffer_t *rbuf;
+{
+ RF_ReconCtrl_t *rcPtr = raidPtr->reconControl[row];
+ RF_CallbackDesc_t *cb;
+
+ Dprintf2("RECON: releasing rbuf for psid %ld ru %d\n",
+ (long)rbuf->parityStripeID, rbuf->which_ru);
+
+ /* if anyone is waiting on buffers, wake one of them up. They will subsequently wake up anyone
+ * else waiting on their RU
+ */
+ if (rcPtr->bufferWaitList) {
+ rbuf->next = rcPtr->committedRbufs;
+ rcPtr->committedRbufs = rbuf;
+ cb = rcPtr->bufferWaitList;
+ rcPtr->bufferWaitList = cb->next;
+ rf_CauseReconEvent(raidPtr, cb->row, cb->col, (void *) 1, RF_REVENT_BUFCLEAR); /* arg==1 => we've committed a buffer */
+ rf_FreeCallbackDesc(cb);
+ raidPtr->procsInBufWait--;
+ } else {
+ rbuf->next = rcPtr->floatingRbufs;
+ rcPtr->floatingRbufs = rbuf;
+ }
+}
+
+/* release any disk that is waiting on a buffer for the indicated RU.
+ * assumes the rb_mutex is LOCKED at entry
+ */
+void rf_ReleaseBufferWaiters(raidPtr, pssPtr)
+ RF_Raid_t *raidPtr;
+ RF_ReconParityStripeStatus_t *pssPtr;
+{
+ RF_CallbackDesc_t *cb1, *cb = pssPtr->bufWaitList;
+
+ Dprintf2("RECON: releasing buf waiters for psid %ld ru %d\n",
+ (long)pssPtr->parityStripeID, pssPtr->which_ru);
+ pssPtr->flags &= ~RF_PSS_BUFFERWAIT;
+ while (cb) {
+ cb1 = cb->next;
+ cb->next = NULL;
+ rf_CauseReconEvent(raidPtr, cb->row, cb->col, (void *) 0, RF_REVENT_BUFCLEAR); /* arg==0 => we haven't committed a buffer */
+ rf_FreeCallbackDesc(cb);
+ cb = cb1;
+ }
+ pssPtr->bufWaitList = NULL;
+}
+
+/* when reconstruction is forced on an RU, there may be some disks waiting to
+ * acquire a buffer for that RU. Since we allocate a new buffer as part of
+ * the forced-reconstruction process, we no longer have to wait for any
+ * buffers, so we wake up any waiter that we find in the bufferWaitList
+ *
+ * assumes the rb_mutex is LOCKED at entry
+ */
+void rf_ReleaseBufferWaiter(rcPtr, rbuf)
+ RF_ReconCtrl_t *rcPtr;
+ RF_ReconBuffer_t *rbuf;
+{
+ RF_CallbackDesc_t *cb, *cbt;
+
+ for (cbt = NULL, cb = rcPtr->bufferWaitList; cb; cbt = cb, cb=cb->next) {
+ if ( (cb->callbackArg.v == rbuf->parityStripeID) && ( cb->callbackArg2.v == rbuf->which_ru)) {
+ Dprintf2("RECON: Dropping row %d col %d from buffer wait list\n", cb->row, cb->col);
+ if (cbt) cbt->next = cb->next;
+ else rcPtr->bufferWaitList = cb->next;
+ rf_CauseReconEvent((RF_Raid_t *) rbuf->raidPtr, cb->row, cb->col, (void *) 0, RF_REVENT_BUFREADY); /* arg==0 => no committed buffer */
+ rf_FreeCallbackDesc(cb);
+ return;
+ }
+ }
+}
diff --git a/sys/dev/raidframe/rf_reconbuffer.h b/sys/dev/raidframe/rf_reconbuffer.h
new file mode 100644
index 00000000000..61ec9c1c4ff
--- /dev/null
+++ b/sys/dev/raidframe/rf_reconbuffer.h
@@ -0,0 +1,98 @@
+/* $OpenBSD: rf_reconbuffer.h,v 1.1 1999/01/11 14:29:45 niklas Exp $ */
+/* $NetBSD: rf_reconbuffer.h,v 1.1 1998/11/13 04:20:33 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*******************************************************************
+ *
+ * rf_reconbuffer.h -- header file for reconstruction buffer manager
+ *
+ *******************************************************************/
+
+/* :
+ * Log: rf_reconbuffer.h,v
+ * Revision 1.9 1996/07/13 00:00:59 jimz
+ * sanitized generalized reconstruction architecture
+ * cleaned up head sep, rbuf problems
+ *
+ * Revision 1.8 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.7 1996/06/05 18:06:02 jimz
+ * Major code cleanup. The Great Renaming is now done.
+ * Better modularity. Better typing. Fixed a bunch of
+ * synchronization bugs. Made a lot of global stuff
+ * per-desc or per-array. Removed dead code.
+ *
+ * Revision 1.6 1996/06/03 23:28:26 jimz
+ * more bugfixes
+ * check in tree to sync for IPDS runs with current bugfixes
+ * there still may be a problem with threads in the script test
+ * getting I/Os stuck- not trivially reproducible (runs ~50 times
+ * in a row without getting stuck)
+ *
+ * Revision 1.5 1996/06/02 17:31:48 jimz
+ * Moved a lot of global stuff into array structure, where it belongs.
+ * Fixed up paritylogging, pss modules in this manner. Some general
+ * code cleanup. Removed lots of dead code, some dead files.
+ *
+ * Revision 1.4 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.3 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.2 1995/12/06 15:04:47 root
+ * added copyright info
+ *
+ */
+
+#ifndef _RF__RF_RECONBUFFER_H_
+#define _RF__RF_RECONBUFFER_H_
+
+#include "rf_types.h"
+#include "rf_reconstruct.h"
+
+int rf_SubmitReconBuffer(RF_ReconBuffer_t *rbuf, int keep_it,
+	int use_committed);
+int rf_SubmitReconBufferBasic(RF_ReconBuffer_t *rbuf, int keep_it,
+	int use_committed);
+int rf_MultiWayReconXor(RF_Raid_t *raidPtr,
+ RF_ReconParityStripeStatus_t *pssPtr);
+RF_ReconBuffer_t *rf_GetFullReconBuffer(RF_ReconCtrl_t *reconCtrlPtr);
+int rf_CheckForFullRbuf(RF_Raid_t *raidPtr, RF_ReconCtrl_t *reconCtrl,
+ RF_ReconParityStripeStatus_t *pssPtr, int numDataCol);
+void rf_ReleaseFloatingReconBuffer(RF_Raid_t *raidPtr, RF_RowCol_t row,
+ RF_ReconBuffer_t *rbuf);
+void rf_ReleaseBufferWaiters(RF_Raid_t *raidPtr,
+ RF_ReconParityStripeStatus_t *pssPtr);
+void rf_ReleaseBufferWaiter(RF_ReconCtrl_t *rcPtr, RF_ReconBuffer_t *rbuf);
+
+#endif /* !_RF__RF_RECONBUFFER_H_ */
diff --git a/sys/dev/raidframe/rf_reconmap.c b/sys/dev/raidframe/rf_reconmap.c
new file mode 100644
index 00000000000..565a4ca616c
--- /dev/null
+++ b/sys/dev/raidframe/rf_reconmap.c
@@ -0,0 +1,459 @@
+/* $OpenBSD: rf_reconmap.c,v 1.1 1999/01/11 14:29:46 niklas Exp $ */
+/* $NetBSD: rf_reconmap.c,v 1.1 1998/11/13 04:20:33 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*************************************************************************
+ * rf_reconmap.c
+ *
+ * code to maintain a map of what sectors have/have not been reconstructed
+ *
+ *************************************************************************/
+
+/* :
+ * Log: rf_reconmap.c,v
+ * Revision 1.23 1996/07/27 23:36:08 jimz
+ * Solaris port of simulator
+ *
+ * Revision 1.22 1996/07/15 17:22:18 jimz
+ * nit-pick code cleanup
+ * resolve stdlib problems on DEC OSF
+ *
+ * Revision 1.21 1996/06/17 14:38:33 jimz
+ * properly #if out RF_DEMO code
+ * fix bug in MakeConfig that was causing weird behavior
+ * in configuration routines (config was not zeroed at start)
+ * clean up genplot handling of stacks
+ *
+ * Revision 1.20 1996/06/09 02:36:46 jimz
+ * lots of little crufty cleanup- fixup whitespace
+ * issues, comment #ifdefs, improve typing in some
+ * places (esp size-related)
+ *
+ * Revision 1.19 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.18 1996/06/05 18:06:02 jimz
+ * Major code cleanup. The Great Renaming is now done.
+ * Better modularity. Better typing. Fixed a bunch of
+ * synchronization bugs. Made a lot of global stuff
+ * per-desc or per-array. Removed dead code.
+ *
+ * Revision 1.17 1996/05/31 22:26:54 jimz
+ * fix a lot of mapping problems, memory allocation problems
+ * found some weird lock issues, fixed 'em
+ * more code cleanup
+ *
+ * Revision 1.16 1996/05/30 23:22:16 jimz
+ * bugfixes of serialization, timing problems
+ * more cleanup
+ *
+ * Revision 1.15 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.14 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.13 1996/05/24 04:40:57 jimz
+ * don't do recon meter demo stuff in kernel
+ *
+ * Revision 1.12 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.11 1996/05/20 16:14:50 jimz
+ * switch to rf_{mutex,cond}_{init,destroy}
+ *
+ * Revision 1.10 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.9 1995/12/12 18:10:06 jimz
+ * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT
+ * fix 80-column brain damage in comments
+ *
+ * Revision 1.8 1995/12/06 15:05:23 root
+ * added copyright info
+ *
+ */
+
+#include "rf_raid.h"
+#include <sys/time.h>
+#include "rf_general.h"
+#include "rf_utils.h"
+#if RF_DEMO > 0
+#include "rf_demo.h"
+#endif /* RF_DEMO > 0 */
+#include "rf_sys.h"
+
+/* special pointer values indicating that a reconstruction unit
+ * has been either totally reconstructed or not at all. Both
+ * are illegal pointer values, so you have to be careful not to
+ * dereference through them. RU_NOTHING must be zero, since
+ * MakeReconMap uses bzero to initialize the structure. These are used
+ * only at the head of the list.
+ */
+#define RU_ALL ((RF_ReconMapListElem_t *) -1)
+#define RU_NOTHING ((RF_ReconMapListElem_t *) 0)
+
+/* used to mark the end of the list */
+#define RU_NIL ((RF_ReconMapListElem_t *) 0)
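+
+/*
+ * Illustrative life cycle of one mapPtr->status[] entry (sector numbers are
+ * made up): it starts as RU_NOTHING, becomes a sorted list of reconstructed
+ * ranges as rf_ReconMapUpdate() records them, and collapses to RU_ALL once
+ * a single range covers the whole reconstruction unit:
+ *
+ *	RU_NOTHING			(nothing reconstructed yet)
+ *	[0..31] -> [64..127]		(two disjoint ranges recorded)
+ *	[0..127]			(gap filled in; ranges merged by crunch_list())
+ *	RU_ALL				(entire RU reconstructed; list freed)
+ */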
+
+
+static void compact_stat_entry(RF_Raid_t *raidPtr, RF_ReconMap_t *mapPtr,
+ int i);
+static void crunch_list(RF_ReconMap_t *mapPtr, RF_ReconMapListElem_t *listPtr);
+static RF_ReconMapListElem_t *MakeReconMapListElem(RF_SectorNum_t startSector,
+ RF_SectorNum_t stopSector, RF_ReconMapListElem_t *next);
+static void FreeReconMapListElem(RF_ReconMap_t *mapPtr,
+ RF_ReconMapListElem_t *p);
+static void update_size(RF_ReconMap_t *mapPtr, int size);
+static void PrintList(RF_ReconMapListElem_t *listPtr);
+
+/*-----------------------------------------------------------------------------
+ *
+ * Creates and initializes new Reconstruction map
+ *
+ *-----------------------------------------------------------------------------*/
+
+RF_ReconMap_t *rf_MakeReconMap(raidPtr, ru_sectors, disk_sectors, spareUnitsPerDisk)
+ RF_Raid_t *raidPtr;
+ RF_SectorCount_t ru_sectors; /* size of reconstruction unit in sectors */
+ RF_SectorCount_t disk_sectors; /* size of disk in sectors */
+ RF_ReconUnitCount_t spareUnitsPerDisk; /* zero unless distributed sparing */
+{
+ RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
+ RF_ReconUnitCount_t num_rus = layoutPtr->stripeUnitsPerDisk / layoutPtr->SUsPerRU;
+ RF_ReconMap_t *p;
+ int rc;
+
+ RF_Malloc(p, sizeof(RF_ReconMap_t), (RF_ReconMap_t *));
+ p->sectorsPerReconUnit = ru_sectors;
+ p->sectorsInDisk = disk_sectors;
+
+ p->totalRUs = num_rus;
+ p->spareRUs = spareUnitsPerDisk;
+ p->unitsLeft = num_rus - spareUnitsPerDisk;
+
+ RF_Malloc(p->status, num_rus * sizeof(RF_ReconMapListElem_t *), (RF_ReconMapListElem_t **));
+ RF_ASSERT(p->status != (RF_ReconMapListElem_t **) NULL);
+
+ (void) bzero((char *) p->status, num_rus * sizeof(RF_ReconMapListElem_t *));
+
+ p->size = sizeof(RF_ReconMap_t) + num_rus * sizeof(RF_ReconMapListElem_t *);
+ p->maxSize = p->size;
+
+ rc = rf_mutex_init(&p->mutex);
+ if (rc) {
+ RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ RF_Free(p->status, num_rus * sizeof(RF_ReconMapListElem_t *));
+ RF_Free(p, sizeof(RF_ReconMap_t));
+ return(NULL);
+ }
+ return(p);
+}
+
+
+/*-----------------------------------------------------------------------------
+ *
+ * marks a new set of sectors as reconstructed. All the possible mergings get
+ * complicated. To simplify matters, the approach I take is to just dump
+ * something into the list, and then clean it up (i.e. merge elements and
+ * eliminate redundant ones) in a second pass over the list (compact_stat_entry()).
+ * Not 100% efficient, since a structure can be allocated and then immediately
+ * freed, but it keeps this code from becoming (more of) a nightmare of
+ * special cases. The only thing that compact_stat_entry() assumes is that the
+ * list is sorted by startSector, and so this is the only condition I maintain
+ * here. (MCH)
+ *
+ *-----------------------------------------------------------------------------*/
+
+void rf_ReconMapUpdate(raidPtr, mapPtr, startSector, stopSector)
+ RF_Raid_t *raidPtr;
+ RF_ReconMap_t *mapPtr;
+ RF_SectorNum_t startSector;
+ RF_SectorNum_t stopSector;
+{
+ RF_SectorCount_t sectorsPerReconUnit = mapPtr->sectorsPerReconUnit;
+ RF_SectorNum_t i, first_in_RU, last_in_RU;
+ RF_ReconMapListElem_t *p, *pt;
+
+ RF_LOCK_MUTEX(mapPtr->mutex);
+ RF_ASSERT(startSector >=0 && stopSector < mapPtr->sectorsInDisk && stopSector > startSector);
+
+ while (startSector <= stopSector) {
+ i = startSector/mapPtr->sectorsPerReconUnit;
+ first_in_RU = i*sectorsPerReconUnit;
+ last_in_RU = first_in_RU + sectorsPerReconUnit -1 ;
+ p = mapPtr->status[i];
+ if (p!=RU_ALL) {
+ if (p==RU_NOTHING || p->startSector > startSector ) { /* insert at front of list */
+
+ mapPtr->status[i] = MakeReconMapListElem(startSector, RF_MIN(stopSector,last_in_RU), (p==RU_NOTHING) ? NULL : p);
+ update_size(mapPtr, sizeof(RF_ReconMapListElem_t));
+
+ } else { /* general case */
+ do { /* search for place to insert */
+ pt = p; p = p->next;
+ } while (p && (p->startSector < startSector));
+ pt->next = MakeReconMapListElem(startSector,RF_MIN(stopSector,last_in_RU),p);
+ update_size(mapPtr, sizeof(RF_ReconMapListElem_t));
+ }
+ compact_stat_entry(raidPtr, mapPtr, i);
+ }
+ startSector = RF_MIN(stopSector, last_in_RU) +1;
+ }
+ RF_UNLOCK_MUTEX(mapPtr->mutex);
+}
+
+
+
+/*-----------------------------------------------------------------------------
+ *
+ * performs whatever list compactions can be done, and frees any space
+ * that is no longer necessary. Assumes only that the list is sorted
+ * by startSector. crunch_list() compacts a single list as much as possible,
+ * and the second block of code deletes the entire list if possible.
+ * crunch_list() is also called from MakeReconMapAccessList().
+ *
+ * When a recon unit is detected to be fully reconstructed, we set the
+ * corresponding bit in the parity stripe map so that the head follow
+ * code will not select this parity stripe again. This is redundant (but
+ * harmless) when compact_stat_entry is called from the reconstruction code,
+ * but necessary when called from the user-write code.
+ *
+ *-----------------------------------------------------------------------------*/
+
+static void compact_stat_entry(raidPtr, mapPtr, i)
+ RF_Raid_t *raidPtr;
+ RF_ReconMap_t *mapPtr;
+ int i;
+{
+ RF_SectorCount_t sectorsPerReconUnit = mapPtr->sectorsPerReconUnit;
+ RF_ReconMapListElem_t *p = mapPtr->status[i];
+
+ crunch_list(mapPtr, p);
+
+ if ((p->startSector == i*sectorsPerReconUnit) &&
+ (p->stopSector == i*sectorsPerReconUnit +sectorsPerReconUnit -1)) {
+ mapPtr->status[i] = RU_ALL;
+ mapPtr->unitsLeft--;
+ FreeReconMapListElem(mapPtr,p);
+ }
+}
+
+static void crunch_list(mapPtr, listPtr)
+ RF_ReconMap_t *mapPtr;
+ RF_ReconMapListElem_t *listPtr;
+{
+ RF_ReconMapListElem_t *pt, *p = listPtr;
+
+ if (!p) return;
+ pt = p; p = p->next;
+ while (p) {
+ if (pt->stopSector >= p->startSector-1) {
+ pt->stopSector = RF_MAX(pt->stopSector, p->stopSector);
+ pt->next = p->next;
+ FreeReconMapListElem(mapPtr, p);
+ p = pt->next;
+ }
+ else {
+ pt = p;
+ p = p->next;
+ }
+ }
+}
+
+/*-----------------------------------------------------------------------------
+ *
+ * Allocate and fill a new list element
+ *
+ *-----------------------------------------------------------------------------*/
+
+static RF_ReconMapListElem_t *MakeReconMapListElem(
+ RF_SectorNum_t startSector,
+ RF_SectorNum_t stopSector,
+ RF_ReconMapListElem_t *next)
+{
+ RF_ReconMapListElem_t *p;
+
+ RF_Malloc(p, sizeof(RF_ReconMapListElem_t), (RF_ReconMapListElem_t *));
+ if (p == NULL)
+ return(NULL);
+ p->startSector = startSector;
+ p->stopSector = stopSector;
+ p->next = next;
+ return(p);
+}
+
+/*-----------------------------------------------------------------------------
+ *
+ * Free a list element
+ *
+ *-----------------------------------------------------------------------------*/
+
+static void FreeReconMapListElem(mapPtr,p)
+ RF_ReconMap_t *mapPtr;
+ RF_ReconMapListElem_t *p;
+{
+ int delta;
+
+ if (mapPtr) {
+ delta = 0 - (int)sizeof(RF_ReconMapListElem_t);
+ update_size(mapPtr, delta);
+ }
+ RF_Free(p, sizeof(*p));
+}
+
+/*-----------------------------------------------------------------------------
+ *
+ * Free an entire status structure. Inefficient, but can be called at any time.
+ *
+ *-----------------------------------------------------------------------------*/
+void rf_FreeReconMap(mapPtr)
+ RF_ReconMap_t *mapPtr;
+{
+ RF_ReconMapListElem_t *p, *q;
+ RF_ReconUnitCount_t numRUs;
+ RF_ReconUnitNum_t i;
+
+ numRUs = mapPtr->sectorsInDisk / mapPtr->sectorsPerReconUnit;
+ if (mapPtr->sectorsInDisk % mapPtr->sectorsPerReconUnit)
+ numRUs++;
+
+ for (i=0; i<numRUs; i++) {
+ p = mapPtr->status[i];
+ while (p != RU_NOTHING && p != RU_ALL) {
+ q = p; p = p->next;
+ RF_Free(q, sizeof(*q));
+ }
+ }
+ rf_mutex_destroy(&mapPtr->mutex);
+ RF_Free(mapPtr->status, mapPtr->totalRUs * sizeof(RF_ReconMapListElem_t *));
+ RF_Free(mapPtr, sizeof(RF_ReconMap_t));
+}
+
+/*-----------------------------------------------------------------------------
+ *
+ * returns nonzero if the indicated RU has been reconstructed already
+ *
+ *---------------------------------------------------------------------------*/
+
+int rf_CheckRUReconstructed(mapPtr, startSector)
+ RF_ReconMap_t *mapPtr;
+ RF_SectorNum_t startSector;
+{
+ RF_ReconMapListElem_t *l; /* used for searching */
+ RF_ReconUnitNum_t i;
+
+ i = startSector / mapPtr->sectorsPerReconUnit;
+ l = mapPtr->status[i];
+ return( (l == RU_ALL) ? 1 : 0 );
+}
+
+RF_ReconUnitCount_t rf_UnitsLeftToReconstruct(mapPtr)
+ RF_ReconMap_t *mapPtr;
+{
+ RF_ASSERT(mapPtr != NULL);
+ return( mapPtr->unitsLeft );
+}
+
+/* updates the size fields of a status descriptor */
+static void update_size(mapPtr, size)
+ RF_ReconMap_t *mapPtr;
+ int size;
+{
+ mapPtr->size += size;
+ mapPtr->maxSize = RF_MAX(mapPtr->size, mapPtr->maxSize);
+}
+
+static void PrintList(listPtr)
+ RF_ReconMapListElem_t *listPtr;
+{
+ while (listPtr) {
+ printf("%d,%d -> ",(int)listPtr->startSector,(int)listPtr->stopSector);
+ listPtr = listPtr->next;
+ }
+ printf("\n");
+}
+
+void rf_PrintReconMap(raidPtr, mapPtr, frow, fcol)
+ RF_Raid_t *raidPtr;
+ RF_ReconMap_t *mapPtr;
+ RF_RowCol_t frow;
+ RF_RowCol_t fcol;
+{
+ RF_ReconUnitCount_t numRUs;
+ RF_ReconMapListElem_t *p;
+ RF_ReconUnitNum_t i;
+
+ numRUs = mapPtr->totalRUs;
+ if (mapPtr->sectorsInDisk % mapPtr->sectorsPerReconUnit)
+ numRUs++;
+
+ for (i=0; i<numRUs; i++) {
+ p = mapPtr->status[i];
+ if (p==RU_ALL) /*printf("[%d] ALL\n",i)*/;
+ else if (p == RU_NOTHING) {
+ printf("%d: Unreconstructed\n",i);
+ } else {
+ printf("%d: ", i);
+ PrintList(p);
+ }
+ }
+}
+
+void rf_PrintReconSchedule(mapPtr, starttime)
+ RF_ReconMap_t *mapPtr;
+ struct timeval *starttime;
+{
+ static int old_pctg = -1;
+ struct timeval tv, diff;
+ int new_pctg;
+
+ new_pctg = 100 - (rf_UnitsLeftToReconstruct(mapPtr) * 100 / mapPtr->totalRUs);
+ if (new_pctg != old_pctg) {
+ RF_GETTIME(tv);
+ RF_TIMEVAL_DIFF(starttime, &tv, &diff);
+#if RF_DEMO > 0
+ if (rf_demoMode) {
+ rf_update_recon_meter(new_pctg);
+ }
+ else {
+ printf("%d %d.%06d\n",new_pctg, diff.tv_sec, diff.tv_usec);
+ }
+#else /* RF_DEMO > 0 */
+ printf("%d %d.%06d\n",(int)new_pctg, (int)diff.tv_sec, (int)diff.tv_usec);
+#endif /* RF_DEMO > 0 */
+ old_pctg = new_pctg;
+ }
+}
diff --git a/sys/dev/raidframe/rf_reconmap.h b/sys/dev/raidframe/rf_reconmap.h
new file mode 100644
index 00000000000..5d03baefb1b
--- /dev/null
+++ b/sys/dev/raidframe/rf_reconmap.h
@@ -0,0 +1,114 @@
+/* $OpenBSD: rf_reconmap.h,v 1.1 1999/01/11 14:29:46 niklas Exp $ */
+/* $NetBSD: rf_reconmap.h,v 1.1 1998/11/13 04:20:33 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/******************************************************************************
+ * rf_reconmap.h -- Header file describing reconstruction status data structure
+ ******************************************************************************/
+
+/* :
+ * Log: rf_reconmap.h,v
+ * Revision 1.10 1996/08/01 15:59:25 jimz
+ * minor cleanup
+ *
+ * Revision 1.9 1996/06/09 02:36:46 jimz
+ * lots of little crufty cleanup- fixup whitespace
+ * issues, comment #ifdefs, improve typing in some
+ * places (esp size-related)
+ *
+ * Revision 1.8 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.7 1996/06/05 18:06:02 jimz
+ * Major code cleanup. The Great Renaming is now done.
+ * Better modularity. Better typing. Fixed a bunch of
+ * synchronization bugs. Made a lot of global stuff
+ * per-desc or per-array. Removed dead code.
+ *
+ * Revision 1.6 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.5 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.4 1995/12/06 15:04:01 root
+ * added copyright info
+ *
+ */
+
+#ifndef _RF__RF_RECONMAP_H_
+#define _RF__RF_RECONMAP_H_
+
+#include "rf_types.h"
+#include "rf_threadstuff.h"
+
+/*
+ * Main reconstruction status descriptor. size and maxsize are used for
+ * monitoring only: they have no function for reconstruction.
+ */
+struct RF_ReconMap_s {
+ RF_SectorCount_t sectorsPerReconUnit; /* sectors per reconstruct unit */
+ RF_SectorCount_t sectorsInDisk; /* total sectors in disk */
+ RF_SectorCount_t unitsLeft; /* recon units left to recon */
+ RF_ReconUnitCount_t totalRUs; /* total recon units on disk */
+ RF_ReconUnitCount_t spareRUs; /* total number of spare RUs on failed disk */
+ RF_StripeCount_t totalParityStripes; /* total number of parity stripes in array */
+ u_int size; /* overall size of this structure */
+ u_int maxSize; /* maximum size so far */
+ RF_ReconMapListElem_t **status; /* array of ptrs to list elements */
+ RF_DECLARE_MUTEX(mutex)
+};
+
+/* a list element */
+struct RF_ReconMapListElem_s {
+ RF_SectorNum_t startSector; /* bounding sect nums on this block */
+ RF_SectorNum_t stopSector;
+ RF_ReconMapListElem_t *next; /* next element in list */
+};
+
+RF_ReconMap_t *rf_MakeReconMap(RF_Raid_t *raidPtr, RF_SectorCount_t ru_sectors,
+ RF_SectorCount_t disk_sectors, RF_ReconUnitCount_t spareUnitsPerDisk);
+
+void rf_ReconMapUpdate(RF_Raid_t *raidPtr, RF_ReconMap_t *mapPtr,
+ RF_SectorNum_t startSector, RF_SectorNum_t stopSector);
+
+void rf_FreeReconMap(RF_ReconMap_t *mapPtr);
+
+int rf_CheckRUReconstructed(RF_ReconMap_t *mapPtr, RF_SectorNum_t startSector);
+
+RF_ReconUnitCount_t rf_UnitsLeftToReconstruct(RF_ReconMap_t *mapPtr);
+
+void rf_PrintReconMap(RF_Raid_t *raidPtr, RF_ReconMap_t *mapPtr,
+ RF_RowCol_t frow, RF_RowCol_t fcol);
+
+void rf_PrintReconSchedule(RF_ReconMap_t *mapPtr, struct timeval *starttime);
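+
+/*
+ * Minimal usage sketch (illustrative only; the reconstruction code drives
+ * these calls in practice, and the function below is hypothetical):
+ */
+#if 0
+static void
+rf_example_track_recon(RF_Raid_t *raidPtr, RF_SectorCount_t ruSectors,
+	RF_SectorCount_t diskSectors)
+{
+	RF_ReconMap_t *map;
+
+	/* one map per failed disk; no distributed sparing in this sketch */
+	map = rf_MakeReconMap(raidPtr, ruSectors, diskSectors, 0);
+
+	/* record that the first reconstruction unit has been rebuilt */
+	rf_ReconMapUpdate(raidPtr, map, 0, ruSectors - 1);
+
+	if (rf_CheckRUReconstructed(map, 0))
+		printf("first RU done, %lu RUs left\n",
+		    (unsigned long) rf_UnitsLeftToReconstruct(map));
+
+	rf_FreeReconMap(map);
+}
+#endif /* 0 */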
+
+#endif /* !_RF__RF_RECONMAP_H_ */
diff --git a/sys/dev/raidframe/rf_reconstruct.c b/sys/dev/raidframe/rf_reconstruct.c
new file mode 100644
index 00000000000..7df351a7ec0
--- /dev/null
+++ b/sys/dev/raidframe/rf_reconstruct.c
@@ -0,0 +1,1595 @@
+/* $OpenBSD: rf_reconstruct.c,v 1.1 1999/01/11 14:29:46 niklas Exp $ */
+/* $NetBSD: rf_reconstruct.c,v 1.1 1998/11/13 04:20:33 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/************************************************************
+ *
+ * rf_reconstruct.c -- code to perform on-line reconstruction
+ *
+ ************************************************************/
+
+/*
+ * :
+ * Log: rf_reconstruct.c,v
+ * Revision 1.65 1996/08/06 22:24:56 jimz
+ * get rid of sys/buf.h on linux
+ *
+ * Revision 1.64 1996/07/30 04:28:53 jimz
+ * include rf_types.h first
+ *
+ * Revision 1.63 1996/07/27 23:36:08 jimz
+ * Solaris port of simulator
+ *
+ * Revision 1.62 1996/07/17 21:00:58 jimz
+ * clean up timer interface, tracing
+ *
+ * Revision 1.61 1996/07/15 05:40:41 jimz
+ * some recon datastructure cleanup
+ * better handling of multiple failures
+ * added undocumented double-recon test
+ *
+ * Revision 1.60 1996/07/15 02:57:18 jimz
+ * added debugging (peek at first couple bytes of recon buffers
+ * as they go by)
+ *
+ * Revision 1.59 1996/07/13 00:00:59 jimz
+ * sanitized generalized reconstruction architecture
+ * cleaned up head sep, rbuf problems
+ *
+ * Revision 1.58 1996/07/11 19:08:00 jimz
+ * generalize reconstruction mechanism
+ * allow raid1 reconstructs via copyback (done with array
+ * quiesced, not online, therefore not disk-directed)
+ *
+ * Revision 1.57 1996/06/17 14:38:33 jimz
+ * properly #if out RF_DEMO code
+ * fix bug in MakeConfig that was causing weird behavior
+ * in configuration routines (config was not zeroed at start)
+ * clean up genplot handling of stacks
+ *
+ * Revision 1.56 1996/06/17 03:24:59 jimz
+ * include shutdown.h for define of now-macroized ShutdownCreate
+ *
+ * Revision 1.55 1996/06/11 10:58:36 jimz
+ * get rid of simulator-testcode artifacts
+ * add generic ReconDoneProc mechanism instead
+ *
+ * Revision 1.54 1996/06/10 14:18:58 jimz
+ * move user, throughput stats into per-array structure
+ *
+ * Revision 1.53 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.52 1996/06/09 02:36:46 jimz
+ * lots of little crufty cleanup- fixup whitespace
+ * issues, comment #ifdefs, improve typing in some
+ * places (esp size-related)
+ *
+ * Revision 1.51 1996/06/07 22:26:27 jimz
+ * type-ify which_ru (RF_ReconUnitNum_t)
+ *
+ * Revision 1.50 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.49 1996/06/06 01:24:36 jimz
+ * don't get rid of reconCtrlPtr until we're done with it
+ *
+ * Revision 1.48 1996/06/05 18:06:02 jimz
+ * Major code cleanup. The Great Renaming is now done.
+ * Better modularity. Better typing. Fixed a bunch of
+ * synchronization bugs. Made a lot of global stuff
+ * per-desc or per-array. Removed dead code.
+ *
+ * Revision 1.47 1996/06/03 23:28:26 jimz
+ * more bugfixes
+ * check in tree to sync for IPDS runs with current bugfixes
+ * there still may be a problem with threads in the script test
+ * getting I/Os stuck- not trivially reproducible (runs ~50 times
+ * in a row without getting stuck)
+ *
+ * Revision 1.46 1996/06/02 17:31:48 jimz
+ * Moved a lot of global stuff into array structure, where it belongs.
+ * Fixed up paritylogging, pss modules in this manner. Some general
+ * code cleanup. Removed lots of dead code, some dead files.
+ *
+ * Revision 1.45 1996/05/31 22:26:54 jimz
+ * fix a lot of mapping problems, memory allocation problems
+ * found some weird lock issues, fixed 'em
+ * more code cleanup
+ *
+ * Revision 1.44 1996/05/30 23:22:16 jimz
+ * bugfixes of serialization, timing problems
+ * more cleanup
+ *
+ * Revision 1.43 1996/05/30 11:29:41 jimz
+ * Numerous bug fixes. Stripe lock release code disagreed with the taking code
+ * about when stripes should be locked (I made it consistent: no parity, no lock)
+ * There was a lot of extra serialization of I/Os which I've removed- a lot of
+ * it was to calculate values for the cache code, which is no longer with us.
+ * More types, function, macro cleanup. Added code to properly quiesce the array
+ * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
+ * before. Fixed memory allocation, freeing bugs.
+ *
+ * Revision 1.42 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.41 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.40 1996/05/24 04:40:40 jimz
+ * don't do demoMode stuff in kernel
+ *
+ * Revision 1.39 1996/05/24 01:59:45 jimz
+ * another checkpoint in code cleanup for release
+ * time to sync kernel tree
+ *
+ * Revision 1.38 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.37 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.36 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.35 1996/05/01 16:28:16 jimz
+ * don't include ccmn.h
+ *
+ * Revision 1.34 1995/12/12 18:10:06 jimz
+ * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT
+ * fix 80-column brain damage in comments
+ *
+ * Revision 1.33 1995/12/06 15:05:09 root
+ * added copyright info
+ *
+ * Revision 1.32 1995/11/17 19:04:11 wvcii
+ * added prototyping to ComputePSDiskOffsets
+ * prow and pcol now type int (were u_int)
+ *
+ * Revision 1.31 1995/11/17 01:39:35 amiri
+ * isolated some demo related stuff
+ *
+ * Revision 1.30 1995/10/18 19:33:14 amiri
+ * removed fflush (stdin/stdout) calls from ReconstructFailedDisk
+ *
+ * Revision 1.29 1995/10/11 10:20:33 jimz
+ * #if 0'd problem code for sigmetrics
+ *
+ * Revision 1.28 1995/10/10 23:18:15 amiri
+ * added fflushes to stdin/stdout before requesting
+ * input in demo mode.
+ *
+ * Revision 1.27 1995/10/10 19:24:47 amiri
+ * took out update_mode (for demo) from
+ * KERNEL source.
+ *
+ * Revision 1.26 1995/10/09 23:35:48 amiri
+ * added support for more meters in recon. demo
+ *
+ * Revision 1.25 1995/07/03 18:14:30 holland
+ * changed the way the number of floating recon bufs &
+ * the head sep limit get set
+ *
+ * Revision 1.24 1995/07/02 15:07:42 holland
+ * bug fixes related to getting distributed sparing numbers
+ *
+ * Revision 1.23 1995/06/23 13:36:36 robby
+ * updated to prototypes in rf_layout.h
+ *
+*/
+
+#ifdef _KERNEL
+#define KERNEL
+#endif
+
+#include "rf_types.h"
+#include <sys/time.h>
+#ifndef LINUX
+#include <sys/buf.h>
+#endif /* !LINUX */
+#include <sys/errno.h>
+#include "rf_raid.h"
+#include "rf_reconutil.h"
+#include "rf_revent.h"
+#include "rf_reconbuffer.h"
+#include "rf_threadid.h"
+#include "rf_acctrace.h"
+#include "rf_etimer.h"
+#include "rf_dag.h"
+#include "rf_desc.h"
+#include "rf_general.h"
+#include "rf_freelist.h"
+#include "rf_debugprint.h"
+#include "rf_driver.h"
+#include "rf_utils.h"
+#include "rf_cpuutil.h"
+#include "rf_shutdown.h"
+#include "rf_sys.h"
+
+#if RF_DEMO > 0
+#include "rf_demo.h"
+#endif /* RF_DEMO > 0 */
+
+#ifdef KERNEL
+#include "rf_kintf.h"
+#endif /* KERNEL */
+
+/* setting these to -1 causes them to be set to their default values if not set by debug options */
+
+#define Dprintf(s) if (rf_reconDebug) rf_debug_printf(s,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL)
+#define Dprintf1(s,a) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),NULL,NULL,NULL,NULL,NULL,NULL,NULL)
+#define Dprintf2(s,a,b) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),NULL,NULL,NULL,NULL,NULL,NULL)
+#define Dprintf3(s,a,b,c) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),NULL,NULL,NULL,NULL,NULL)
+#define Dprintf4(s,a,b,c,d) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),NULL,NULL,NULL,NULL)
+#define Dprintf5(s,a,b,c,d,e) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),NULL,NULL,NULL)
+#define Dprintf6(s,a,b,c,d,e,f) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),(void *)((unsigned long)f),NULL,NULL)
+#define Dprintf7(s,a,b,c,d,e,f,g) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),(void *)((unsigned long)f),(void *)((unsigned long)g),NULL)
+#define Dprintf8(s,a,b,c,d,e,f,g,h) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),(void *)((unsigned long)f),(void *)((unsigned long)g),(void *)((unsigned long)h))
+
+#define DDprintf1(s,a) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),NULL,NULL,NULL,NULL,NULL,NULL,NULL)
+#define DDprintf2(s,a,b) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),NULL,NULL,NULL,NULL,NULL,NULL)
+#define DDprintf3(s,a,b,c) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),NULL,NULL,NULL,NULL,NULL)
+#define DDprintf4(s,a,b,c,d) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),NULL,NULL,NULL,NULL)
+#define DDprintf5(s,a,b,c,d,e) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),NULL,NULL,NULL)
+#define DDprintf6(s,a,b,c,d,e,f) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),(void *)((unsigned long)f),NULL,NULL)
+#define DDprintf7(s,a,b,c,d,e,f,g) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),(void *)((unsigned long)f),(void *)((unsigned long)g),NULL)
+#define DDprintf8(s,a,b,c,d,e,f,g,h) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),(void *)((unsigned long)f),(void *)((unsigned long)g),(void *)((unsigned long)h))
+
+#ifdef KERNEL
+static RF_Thread_t recon_thr_handle;
+static int recon_thread_initialized = 0;
+#endif /* KERNEL */
+
+static RF_FreeList_t *rf_recond_freelist;
+#define RF_MAX_FREE_RECOND 4
+#define RF_RECOND_INC 1
+
+static RF_RaidReconDesc_t *AllocRaidReconDesc(RF_Raid_t *raidPtr,
+ RF_RowCol_t row, RF_RowCol_t col, RF_RaidDisk_t *spareDiskPtr,
+ int numDisksDone, RF_RowCol_t srow, RF_RowCol_t scol);
+static void FreeReconDesc(RF_RaidReconDesc_t *reconDesc);
+static int ProcessReconEvent(RF_Raid_t *raidPtr, RF_RowCol_t frow,
+ RF_ReconEvent_t *event);
+static int IssueNextReadRequest(RF_Raid_t *raidPtr, RF_RowCol_t row,
+ RF_RowCol_t col);
+static int TryToRead(RF_Raid_t *raidPtr, RF_RowCol_t row, RF_RowCol_t col);
+static int ComputePSDiskOffsets(RF_Raid_t *raidPtr, RF_StripeNum_t psid,
+ RF_RowCol_t row, RF_RowCol_t col, RF_SectorNum_t *outDiskOffset,
+ RF_SectorNum_t *outFailedDiskSectorOffset, RF_RowCol_t *spRow,
+ RF_RowCol_t *spCol, RF_SectorNum_t *spOffset);
+static int IssueNextWriteRequest(RF_Raid_t *raidPtr, RF_RowCol_t row);
+static int ReconReadDoneProc(void *arg, int status);
+static int ReconWriteDoneProc(void *arg, int status);
+static void CheckForNewMinHeadSep(RF_Raid_t *raidPtr, RF_RowCol_t row,
+ RF_HeadSepLimit_t hsCtr);
+static int CheckHeadSeparation(RF_Raid_t *raidPtr, RF_PerDiskReconCtrl_t *ctrl,
+ RF_RowCol_t row, RF_RowCol_t col, RF_HeadSepLimit_t hsCtr,
+ RF_ReconUnitNum_t which_ru);
+static int CheckForcedOrBlockedReconstruction(RF_Raid_t *raidPtr,
+ RF_ReconParityStripeStatus_t *pssPtr, RF_PerDiskReconCtrl_t *ctrl,
+ RF_RowCol_t row, RF_RowCol_t col, RF_StripeNum_t psid,
+ RF_ReconUnitNum_t which_ru);
+static void ForceReconReadDoneProc(void *arg, int status);
+
+static void rf_ShutdownReconstruction(void *);
+
+
+struct RF_ReconDoneProc_s {
+ void (*proc)(RF_Raid_t *, void *);
+ void *arg;
+ RF_ReconDoneProc_t *next;
+};
+
+static RF_FreeList_t *rf_rdp_freelist;
+#define RF_MAX_FREE_RDP 4
+#define RF_RDP_INC 1
+
+static void SignalReconDone(RF_Raid_t *raidPtr)
+{
+ RF_ReconDoneProc_t *p;
+
+ RF_LOCK_MUTEX(raidPtr->recon_done_proc_mutex);
+ for(p=raidPtr->recon_done_procs;p;p=p->next) {
+ p->proc(raidPtr, p->arg);
+ }
+ RF_UNLOCK_MUTEX(raidPtr->recon_done_proc_mutex);
+}
+
+int rf_RegisterReconDoneProc(
+ RF_Raid_t *raidPtr,
+ void (*proc)(RF_Raid_t *, void *),
+ void *arg,
+ RF_ReconDoneProc_t **handlep)
+{
+ RF_ReconDoneProc_t *p;
+
+ RF_FREELIST_GET(rf_rdp_freelist,p,next,(RF_ReconDoneProc_t *));
+ if (p == NULL)
+ return(ENOMEM);
+ p->proc = proc;
+ p->arg = arg;
+ RF_LOCK_MUTEX(raidPtr->recon_done_proc_mutex);
+ p->next = raidPtr->recon_done_procs;
+ raidPtr->recon_done_procs = p;
+ RF_UNLOCK_MUTEX(raidPtr->recon_done_proc_mutex);
+ if (handlep)
+ *handlep = p;
+ return(0);
+}
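+
+/*
+ * Usage sketch (illustrative): a module that wants to be notified when a
+ * reconstruction on this array finishes registers a handler once, e.g.
+ *
+ *	static void my_recon_done(RF_Raid_t *raidPtr, void *arg)
+ *	{
+ *		... arg is whatever was supplied at registration time ...
+ *	}
+ *
+ *	rf_RegisterReconDoneProc(raidPtr, my_recon_done, NULL, NULL);
+ *
+ * my_recon_done is a placeholder name; passing NULL for the handle pointer
+ * simply means the caller does not need to keep it.
+ */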
+
+/*****************************************************************************************
+ *
+ * Sets up the parameters that will be used by the reconstruction process.
+ * Currently there are none, except for those that the layout-specific
+ * configuration routine (e.g. rf_ConfigureDeclustered) sets up.
+ *
+ * In the kernel, we also fire off the recon thread.
+ *
+ ****************************************************************************************/
+static void rf_ShutdownReconstruction(ignored)
+ void *ignored;
+{
+ RF_FREELIST_DESTROY(rf_recond_freelist,next,(RF_RaidReconDesc_t *));
+ RF_FREELIST_DESTROY(rf_rdp_freelist,next,(RF_ReconDoneProc_t *));
+}
+
+int rf_ConfigureReconstruction(listp)
+ RF_ShutdownList_t **listp;
+{
+ int rc;
+
+ RF_FREELIST_CREATE(rf_recond_freelist, RF_MAX_FREE_RECOND,
+ RF_RECOND_INC, sizeof(RF_RaidReconDesc_t));
+ if (rf_recond_freelist == NULL)
+ return(ENOMEM);
+ RF_FREELIST_CREATE(rf_rdp_freelist, RF_MAX_FREE_RDP,
+ RF_RDP_INC, sizeof(RF_ReconDoneProc_t));
+ if (rf_rdp_freelist == NULL) {
+ RF_FREELIST_DESTROY(rf_recond_freelist,next,(RF_RaidReconDesc_t *));
+ return(ENOMEM);
+ }
+ rc = rf_ShutdownCreate(listp, rf_ShutdownReconstruction, NULL);
+ if (rc) {
+ RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n",
+ __FILE__, __LINE__, rc);
+ rf_ShutdownReconstruction(NULL);
+ return(rc);
+ }
+
+#ifdef KERNEL
+ if (!recon_thread_initialized) {
+ RF_CREATE_THREAD(recon_thr_handle, rf_ReconKernelThread, NULL);
+ recon_thread_initialized = 1;
+ }
+#endif /* KERNEL */
+
+ return(0);
+}
+
+static RF_RaidReconDesc_t *AllocRaidReconDesc(raidPtr, row, col, spareDiskPtr, numDisksDone, srow, scol)
+ RF_Raid_t *raidPtr;
+ RF_RowCol_t row;
+ RF_RowCol_t col;
+ RF_RaidDisk_t *spareDiskPtr;
+ int numDisksDone;
+ RF_RowCol_t srow;
+ RF_RowCol_t scol;
+{
+
+ RF_RaidReconDesc_t *reconDesc;
+
+ RF_FREELIST_GET(rf_recond_freelist,reconDesc,next,(RF_RaidReconDesc_t *));
+
+ reconDesc->raidPtr = raidPtr;
+ reconDesc->row = row;
+ reconDesc->col = col;
+ reconDesc->spareDiskPtr=spareDiskPtr;
+ reconDesc->numDisksDone=numDisksDone;
+ reconDesc->srow=srow;
+ reconDesc->scol=scol;
+ reconDesc->state = 0;
+ reconDesc->next = NULL;
+
+ return(reconDesc);
+}
+
+static void FreeReconDesc(reconDesc)
+ RF_RaidReconDesc_t *reconDesc;
+{
+#if RF_RECON_STATS > 0
+ printf("RAIDframe: %lu recon event waits, %lu recon delays\n",
+ (long)reconDesc->numReconEventWaits, (long)reconDesc->numReconExecDelays);
+#endif /* RF_RECON_STATS > 0 */
+#ifdef KERNEL
+ printf("RAIDframe: %lu max exec ticks\n",
+ (long)reconDesc->maxReconExecTicks);
+#endif /* KERNEL */
+#if (RF_RECON_STATS > 0) || defined(KERNEL)
+ printf("\n");
+#endif /* (RF_RECON_STATS > 0) || KERNEL */
+ RF_FREELIST_FREE(rf_recond_freelist,reconDesc,next);
+}
+
+
+/*****************************************************************************************
+ *
+ * primary routine to reconstruct a failed disk. This should be called from
+ * within its own thread. It won't return until reconstruction completes,
+ * fails, or is aborted.
+ ****************************************************************************************/
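+/*
+ * Caller sketch (illustrative): the kernel recon thread ends up doing roughly
+ *
+ *	rc = rf_ReconstructFailedDisk(raidPtr, row, col);
+ *	if (rc)
+ *		printf("raidframe: reconstruction of r%d c%d failed (%d)\n",
+ *		    row, col, rc);
+ *
+ * where row and col identify the failed component.  The message text is a
+ * placeholder, not something this file prints.
+ */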
+int rf_ReconstructFailedDisk(raidPtr, row, col)
+ RF_Raid_t *raidPtr;
+ RF_RowCol_t row;
+ RF_RowCol_t col;
+{
+#ifdef SIMULATE
+ RF_PendingRecon_t *pend;
+ RF_RowCol_t r, c;
+#endif /* SIMULATE */
+ RF_LayoutSW_t *lp;
+ int rc;
+
+ lp = raidPtr->Layout.map;
+ if (lp->SubmitReconBuffer) {
+ /*
+ * The current infrastructure only supports reconstructing one
+ * disk at a time for each array.
+ */
+#ifdef SIMULATE
+ if (raidPtr->reconInProgress) {
+ RF_Malloc(pend, sizeof(RF_PendingRecon_t), (RF_PendingRecon_t *));
+ pend->row = row;
+ pend->col = col;
+ pend->next = raidPtr->pendingRecon;
+ raidPtr->pendingRecon = pend;
+ /* defer until current recon completes */
+ return(0);
+ }
+ raidPtr->reconInProgress++;
+#else /* SIMULATE */
+ RF_LOCK_MUTEX(raidPtr->mutex);
+ while (raidPtr->reconInProgress) {
+ RF_WAIT_COND(raidPtr->waitForReconCond, raidPtr->mutex);
+ }
+ raidPtr->reconInProgress++;
+ RF_UNLOCK_MUTEX(raidPtr->mutex);
+#endif /* SIMULATE */
+ rc = rf_ReconstructFailedDiskBasic(raidPtr, row, col);
+ }
+ else {
+ RF_ERRORMSG1("RECON: no way to reconstruct failed disk for arch %c\n",
+ lp->parityConfig);
+ rc = EIO;
+ }
+#ifdef SIMULATE
+ pend = raidPtr->pendingRecon;
+ if (pend) {
+ /* launch next recon */
+ raidPtr->pendingRecon = pend->next;
+ r = pend->row;
+ c = pend->col;
+ RF_Free(pend, sizeof(RF_PendingRecon_t));
+ return(rf_ReconstructFailedDisk(raidPtr, r, c));
+ }
+#else /* SIMULATE */
+ RF_LOCK_MUTEX(raidPtr->mutex);
+ raidPtr->reconInProgress--;
+ RF_UNLOCK_MUTEX(raidPtr->mutex);
+ RF_SIGNAL_COND(raidPtr->waitForReconCond);
+#if 1
+#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL)
+ wakeup(&raidPtr->waitForReconCond); /* XXX Methinks this will be needed
+ at some point... GO*/
+#endif
+#endif
+#endif /* SIMULATE */
+ return(rc);
+}
+
+int rf_ReconstructFailedDiskBasic(raidPtr, row, col)
+ RF_Raid_t *raidPtr;
+ RF_RowCol_t row;
+ RF_RowCol_t col;
+{
+ RF_RaidDisk_t *spareDiskPtr = NULL;
+ RF_RaidReconDesc_t *reconDesc;
+ RF_RowCol_t srow, scol;
+ int numDisksDone=0, rc;
+
+ /* first look for a spare drive onto which to reconstruct the data */
+ /* spare disk descriptors are stored in row 0. This may have to change eventually */
+
+ RF_LOCK_MUTEX(raidPtr->mutex);
+ RF_ASSERT (raidPtr->Disks[row][col].status == rf_ds_failed);
+
+ if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
+ if (raidPtr->status[row] != rf_rs_degraded) {
+ RF_ERRORMSG2("Unable to reconstruct disk at row %d col %d because status not degraded\n",row,col);
+ RF_UNLOCK_MUTEX(raidPtr->mutex);
+ return(EINVAL);
+ }
+ srow = row;
+ scol = (-1);
+ }
+ else {
+ srow = 0;
+ for (scol=raidPtr->numCol; scol<raidPtr->numCol + raidPtr->numSpare; scol++) {
+ if (raidPtr->Disks[srow][scol].status == rf_ds_spare) {
+ spareDiskPtr = &raidPtr->Disks[srow][scol];
+ spareDiskPtr->status = rf_ds_used_spare;
+ break;
+ }
+ }
+ if (!spareDiskPtr) {
+ RF_ERRORMSG2("Unable to reconstruct disk at row %d col %d because no spares are available\n",row,col);
+ RF_UNLOCK_MUTEX(raidPtr->mutex);
+ return(ENOSPC);
+ }
+
+#if RF_DEMO > 0
+ if (!rf_demoMode) {
+#endif /* RF_DEMO > 0 */
+ printf("RECON: initiating reconstruction on row %d col %d -> spare at row %d col %d\n",row, col, srow, scol);
+#if RF_DEMO > 0
+ }
+#endif /* RF_DEMO > 0 */
+ }
+ RF_UNLOCK_MUTEX(raidPtr->mutex);
+
+ reconDesc = AllocRaidReconDesc((void *) raidPtr, row, col,spareDiskPtr, numDisksDone, srow , scol);
+ raidPtr->reconDesc = (void *) reconDesc;
+#if RF_RECON_STATS > 0
+ reconDesc->hsStallCount = 0;
+ reconDesc->numReconExecDelays = 0;
+ reconDesc->numReconEventWaits = 0;
+#endif /* RF_RECON_STATS > 0 */
+#ifdef KERNEL
+ reconDesc->reconExecTimerRunning = 0;
+ reconDesc->reconExecTicks = 0;
+ reconDesc->maxReconExecTicks = 0;
+#endif /* KERNEL */
+#if RF_DEMO > 0 && !defined(SIMULATE)
+ if (rf_demoMode) {
+ char cbuf[10];
+ printf("About to start reconstruction, hit return to continue:");
+ gets(cbuf);
+ }
+#endif /* RF_DEMO > 0 && !SIMULATE */
+ rc = rf_ContinueReconstructFailedDisk(reconDesc);
+ return(rc);
+}
+
+
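+/*
+ * Overview of the state machine driven below (reconDesc->state):
+ *   0 -- quiesce the array so recon cannot race with pending user writes
+ *   1 -- install the recon control structure, mark the disk/row as
+ *        reconstructing, and issue the initial read on every surviving disk
+ *   2 -- resume user requests
+ *   3 -- process recon events until every surviving disk has been fully read
+ *   4 -- drain the remaining writes to the spare
+ *   5/6 -- quiesce again, then mark the disk as spared and restore row status
+ *   7 -- resume requests, report statistics, and free the recon state
+ */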
+int rf_ContinueReconstructFailedDisk(reconDesc)
+ RF_RaidReconDesc_t *reconDesc;
+{
+ RF_Raid_t *raidPtr=reconDesc->raidPtr;
+ RF_RowCol_t row=reconDesc->row;
+ RF_RowCol_t col=reconDesc->col;
+ RF_RowCol_t srow=reconDesc->srow;
+ RF_RowCol_t scol=reconDesc->scol;
+ RF_ReconMap_t *mapPtr;
+
+ RF_ReconEvent_t *event;
+ struct timeval etime, elpsd;
+ unsigned long xor_s, xor_resid_us;
+ int retcode,i, ds;
+
+ switch (reconDesc->state)
+ {
+
+
+ case 0:
+
+ raidPtr->accumXorTimeUs = 0;
+
+ /* create one trace record per physical disk */
+ RF_Malloc(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t), (RF_AccTraceEntry_t *));
+
+ /* quiesce the array prior to starting recon. this is needed to assure no nasty interactions
+ * with pending user writes. We need to do this before we change the disk or row status.
+ */
+ reconDesc->state=1;
+
+ Dprintf("RECON: begin request suspend\n");
+ retcode = rf_SuspendNewRequestsAndWait(raidPtr);
+ Dprintf("RECON: end request suspend\n");
+ rf_StartUserStats(raidPtr); /* zero out the stats kept on user accs */
+
+#ifdef SIMULATE
+ if (retcode) return(0);
+#endif /* SIMULATE */
+
+ /* fall through to state 1 */
+
+ case 1:
+
+ RF_LOCK_MUTEX(raidPtr->mutex);
+
+ /* create the reconstruction control pointer and install it in the right slot */
+ raidPtr->reconControl[row] = rf_MakeReconControl(reconDesc, row, col, srow, scol);
+ mapPtr=raidPtr->reconControl[row]->reconMap;
+ raidPtr->status[row] = rf_rs_reconstructing;
+ raidPtr->Disks[row][col].status = rf_ds_reconstructing;
+ raidPtr->Disks[row][col].spareRow = srow;
+ raidPtr->Disks[row][col].spareCol = scol;
+
+ RF_UNLOCK_MUTEX(raidPtr->mutex);
+
+ RF_GETTIME(raidPtr->reconControl[row]->starttime);
+#if RF_DEMO > 0
+ if (rf_demoMode) {
+ rf_demo_update_mode(RF_DEMO_RECON);
+ rf_startup_recon_demo(rf_demoMeterVpos, raidPtr->numCol,
+ raidPtr->Layout.numDataCol+raidPtr->Layout.numParityCol, 0);
+ }
+#endif /* RF_DEMO > 0 */
+
+ /* now start up the actual reconstruction: issue a read for each surviving disk */
+ rf_start_cpu_monitor();
+ reconDesc->numDisksDone = 0;
+ for (i=0; i<raidPtr->numCol; i++) {
+ if (i != col) {
+ /* find and issue the next I/O on the indicated disk */
+ if (IssueNextReadRequest(raidPtr, row, i)) {
+ Dprintf2("RECON: done issuing for r%d c%d\n", row, i);
+ reconDesc->numDisksDone++;
+ }
+ }
+ }
+
+ case 2:
+ Dprintf("RECON: resume requests\n");
+ rf_ResumeNewRequests(raidPtr);
+
+
+ reconDesc->state=3;
+
+ case 3:
+
+ /* process reconstruction events until all disks report that they've completed all work */
+ mapPtr=raidPtr->reconControl[row]->reconMap;
+
+
+
+ while (reconDesc->numDisksDone < raidPtr->numCol-1) {
+
+ event = rf_GetNextReconEvent(reconDesc, row, (void (*)(void *))rf_ContinueReconstructFailedDisk,reconDesc);
+#ifdef SIMULATE
+ if (event==NULL) {return(0);}
+#else /* SIMULATE */
+ RF_ASSERT(event);
+#endif /* SIMULATE */
+
+ if (ProcessReconEvent(raidPtr, row, event)) reconDesc->numDisksDone++;
+ raidPtr->reconControl[row]->percentComplete = 100 - (rf_UnitsLeftToReconstruct(mapPtr) * 100 / mapPtr->totalRUs);
+#if RF_DEMO > 0
+ if (rf_prReconSched || rf_demoMode)
+#else /* RF_DEMO > 0 */
+ if (rf_prReconSched)
+#endif /* RF_DEMO > 0 */
+ {
+ rf_PrintReconSchedule(raidPtr->reconControl[row]->reconMap, &(raidPtr->reconControl[row]->starttime));
+ }
+ }
+
+
+
+ reconDesc->state=4;
+
+
+ case 4:
+ mapPtr=raidPtr->reconControl[row]->reconMap;
+ if (rf_reconDebug) {
+ printf("RECON: all reads completed\n");
+ }
+
+
+
+ /* at this point all the reads have completed. We now wait for any pending writes
+ * to complete, and then we're done
+ */
+
+ while (rf_UnitsLeftToReconstruct(raidPtr->reconControl[row]->reconMap) > 0) {
+
+ event = rf_GetNextReconEvent(reconDesc, row, (void (*)(void *))rf_ContinueReconstructFailedDisk,reconDesc);
+#ifdef SIMULATE
+ if (event==NULL) {return(0);}
+#else /* SIMULATE */
+ RF_ASSERT(event);
+#endif /* SIMULATE */
+
+ (void) ProcessReconEvent(raidPtr, row, event); /* ignore return code */
+ raidPtr->reconControl[row]->percentComplete = 100 - (rf_UnitsLeftToReconstruct(mapPtr) * 100 / mapPtr->totalRUs);
+#if RF_DEMO > 0
+ if (rf_prReconSched || rf_demoMode)
+#else /* RF_DEMO > 0 */
+ if (rf_prReconSched)
+#endif /* RF_DEMO > 0 */
+ {
+ rf_PrintReconSchedule(raidPtr->reconControl[row]->reconMap, &(raidPtr->reconControl[row]->starttime));
+ }
+ }
+ reconDesc->state=5;
+
+ case 5:
+ rf_stop_cpu_monitor();
+
+ /* Success: mark the dead disk as reconstructed. We quiesce the array here to assure no
+ * nasty interactions with pending user accesses when we free up the psstatus structure
+ * as part of FreeReconControl()
+ */
+
+
+
+ reconDesc->state=6;
+
+ retcode = rf_SuspendNewRequestsAndWait(raidPtr);
+ rf_StopUserStats(raidPtr);
+ rf_PrintUserStats(raidPtr); /* print out the stats on user accs accumulated during recon */
+
+#ifdef SIMULATE
+ if (retcode) return(0);
+#endif /* SIMULATE */
+
+ /* fall through to state 6 */
+ case 6:
+
+
+
+ RF_LOCK_MUTEX(raidPtr->mutex);
+ raidPtr->numFailures--;
+ ds = (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE);
+ raidPtr->Disks[row][col].status = (ds) ? rf_ds_dist_spared : rf_ds_spared;
+ raidPtr->status[row] = (ds) ? rf_rs_reconfigured : rf_rs_optimal;
+ RF_UNLOCK_MUTEX(raidPtr->mutex);
+ RF_GETTIME(etime);
+ RF_TIMEVAL_DIFF(&(raidPtr->reconControl[row]->starttime), &etime, &elpsd);
+
+ /* XXX -- why is state 7 different from state 6 if there is no return() here? -- XXX
+ * Note that I set elpsd above & use it below, so if you put a return
+ * here you'll have to fix this. (also, FreeReconControl is called below)
+ */
+
+ case 7:
+
+ rf_ResumeNewRequests(raidPtr);
+
+#if RF_DEMO > 0
+ if (rf_demoMode) {
+ rf_finish_recon_demo(&elpsd);
+ }
+ else {
+#endif /* RF_DEMO > 0 */
+ printf("Reconstruction of disk at row %d col %d completed and spare disk reassigned\n", row, col);
+ xor_s = raidPtr->accumXorTimeUs/1000000;
+ xor_resid_us = raidPtr->accumXorTimeUs%1000000;
+ printf("Recon time was %d.%06d seconds, accumulated XOR time was %ld us (%ld.%06ld)\n",
+ (int)elpsd.tv_sec,(int)elpsd.tv_usec,raidPtr->accumXorTimeUs,xor_s,xor_resid_us);
+ printf(" (start time %d sec %d usec, end time %d sec %d usec)\n",
+ (int)raidPtr->reconControl[row]->starttime.tv_sec,
+ (int)raidPtr->reconControl[row]->starttime.tv_usec,
+ (int)etime.tv_sec, (int)etime.tv_usec);
+ rf_print_cpu_util("reconstruction");
+#if RF_RECON_STATS > 0
+ printf("Total head-sep stall count was %d\n",
+ (int)reconDesc->hsStallCount);
+#endif /* RF_RECON_STATS > 0 */
+#if RF_DEMO > 0
+ }
+#endif /* RF_DEMO > 0 */
+ rf_FreeReconControl(raidPtr, row);
+ RF_Free(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t));
+ FreeReconDesc(reconDesc);
+
+ }
+
+ SignalReconDone(raidPtr);
+ return (0);
+}
+
+/*****************************************************************************************
+ * do the right thing upon each reconstruction event.
+ * returns nonzero if and only if there is nothing left unread on the indicated disk
+ ****************************************************************************************/
+static int ProcessReconEvent(raidPtr, frow, event)
+ RF_Raid_t *raidPtr;
+ RF_RowCol_t frow;
+ RF_ReconEvent_t *event;
+{
+ int retcode = 0, submitblocked;
+ RF_ReconBuffer_t *rbuf;
+ RF_SectorCount_t sectorsPerRU;
+
+ Dprintf1("RECON: ProcessReconEvent type %d\n", event->type);
+ switch(event->type) {
+
+ /* a read I/O has completed */
+ case RF_REVENT_READDONE:
+ rbuf = raidPtr->reconControl[frow]->perDiskInfo[event->col].rbuf;
+ Dprintf3("RECON: READDONE EVENT: row %d col %d psid %ld\n",
+ frow, event->col, rbuf->parityStripeID);
+ Dprintf7("RECON: done read psid %ld buf %lx %02x %02x %02x %02x %02x\n",
+ rbuf->parityStripeID, rbuf->buffer, rbuf->buffer[0]&0xff, rbuf->buffer[1]&0xff,
+ rbuf->buffer[2]&0xff, rbuf->buffer[3]&0xff, rbuf->buffer[4]&0xff);
+ rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
+ submitblocked = rf_SubmitReconBuffer(rbuf, 0, 0);
+ Dprintf1("RECON: submitblocked=%d\n", submitblocked);
+ if (!submitblocked) retcode = IssueNextReadRequest(raidPtr, frow, event->col);
+ break;
+
+ /* a write I/O has completed */
+ case RF_REVENT_WRITEDONE:
+ if (rf_floatingRbufDebug) {
+ rf_CheckFloatingRbufCount(raidPtr, 1);
+ }
+ sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU;
+ rbuf = (RF_ReconBuffer_t *) event->arg;
+ rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
+ Dprintf3("RECON: WRITEDONE EVENT: psid %d ru %d (%d %% complete)\n",
+ rbuf->parityStripeID, rbuf->which_ru, raidPtr->reconControl[frow]->percentComplete);
+ rf_ReconMapUpdate(raidPtr, raidPtr->reconControl[frow]->reconMap,
+ rbuf->failedDiskSectorOffset, rbuf->failedDiskSectorOffset + sectorsPerRU -1);
+ rf_RemoveFromActiveReconTable(raidPtr, frow, rbuf->parityStripeID, rbuf->which_ru);
+
+ if (rbuf->type == RF_RBUF_TYPE_FLOATING) {
+ RF_LOCK_MUTEX(raidPtr->reconControl[frow]->rb_mutex);
+ raidPtr->numFullReconBuffers--;
+ rf_ReleaseFloatingReconBuffer(raidPtr, frow, rbuf);
+ RF_UNLOCK_MUTEX(raidPtr->reconControl[frow]->rb_mutex);
+ } else if (rbuf->type == RF_RBUF_TYPE_FORCED) rf_FreeReconBuffer(rbuf);
+ else RF_ASSERT(0);
+ break;
+
+ case RF_REVENT_BUFCLEAR: /* A buffer-stall condition has been cleared */
+ Dprintf2("RECON: BUFCLEAR EVENT: row %d col %d\n",frow, event->col);
+ submitblocked = rf_SubmitReconBuffer(raidPtr->reconControl[frow]->perDiskInfo[event->col].rbuf, 0, (int) (long)event->arg);
+ RF_ASSERT(!submitblocked); /* we wouldn't have gotten the BUFCLEAR event if we couldn't submit */
+ retcode = IssueNextReadRequest(raidPtr, frow, event->col);
+ break;
+
+ case RF_REVENT_BLOCKCLEAR: /* A user-write reconstruction blockage has been cleared */
+ DDprintf2("RECON: BLOCKCLEAR EVENT: row %d col %d\n",frow, event->col);
+ retcode = TryToRead(raidPtr, frow, event->col);
+ break;
+
+ case RF_REVENT_HEADSEPCLEAR: /* A max-head-separation reconstruction blockage has been cleared */
+ Dprintf2("RECON: HEADSEPCLEAR EVENT: row %d col %d\n",frow, event->col);
+ retcode = TryToRead(raidPtr, frow, event->col);
+ break;
+
+ /* a buffer has become ready to write */
+ case RF_REVENT_BUFREADY:
+ Dprintf2("RECON: BUFREADY EVENT: row %d col %d\n",frow, event->col);
+ retcode = IssueNextWriteRequest(raidPtr, frow);
+ if (rf_floatingRbufDebug) {
+ rf_CheckFloatingRbufCount(raidPtr, 1);
+ }
+ break;
+
+ /* we need to skip the current RU entirely because it got recon'd while we were waiting for something else to happen */
+ case RF_REVENT_SKIP:
+ DDprintf2("RECON: SKIP EVENT: row %d col %d\n",frow, event->col);
+ retcode = IssueNextReadRequest(raidPtr, frow, event->col);
+ break;
+
+ /* a forced-reconstruction read access has completed. Just submit the buffer */
+ case RF_REVENT_FORCEDREADDONE:
+ rbuf = (RF_ReconBuffer_t *) event->arg;
+ rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
+ DDprintf2("RECON: FORCEDREADDONE EVENT: row %d col %d\n",frow, event->col);
+ submitblocked = rf_SubmitReconBuffer(rbuf, 1, 0);
+ RF_ASSERT(!submitblocked);
+ break;
+
+ default:
+ RF_PANIC();
+ }
+ rf_FreeReconEventDesc(event);
+ return(retcode);
+}
+
+/*****************************************************************************************
+ *
+ * find the next thing that's needed on the indicated disk, and issue a read
+ * request for it. We assume that the reconstruction buffer associated with this
+ * process is free to receive the data. If reconstruction is blocked on the
+ * indicated RU, we issue a blockage-release request instead of a physical disk
+ * read request. If the current disk gets too far ahead of the others, we issue
+ * a head-separation wait request and return.
+ *
+ * ctrl->{ru_count, curPSID, diskOffset} and rbuf->failedDiskSectorOffset are
+ * maintained to point to the unit we're currently accessing. Note that this deviates
+ * from the standard C idiom of having counters point to the next thing to be
+ * accessed. This allows us to easily retry when we're blocked by head separation
+ * or reconstruction-blockage events.
+ *
+ * returns nonzero if and only if there is nothing left unread on the indicated disk
+ ****************************************************************************************/
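+/*
+ * Worked example (illustrative numbers): with SUsPerPU == 4, SUsPerRU == 1
+ * and sectorsPerStripeUnit == 64, we get RUsPerPU == 4 and sectorsPerRU == 64,
+ * so each parity stripe ID visited below covers four reconstruction units of
+ * 64 sectors apiece on the failed disk.
+ */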
+static int IssueNextReadRequest(raidPtr, row, col)
+ RF_Raid_t *raidPtr;
+ RF_RowCol_t row;
+ RF_RowCol_t col;
+{
+ RF_PerDiskReconCtrl_t *ctrl = &raidPtr->reconControl[row]->perDiskInfo[col];
+ RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
+ RF_ReconBuffer_t *rbuf = ctrl->rbuf;
+ RF_ReconUnitCount_t RUsPerPU = layoutPtr->SUsPerPU / layoutPtr->SUsPerRU;
+ RF_SectorCount_t sectorsPerRU = layoutPtr->sectorsPerStripeUnit * layoutPtr->SUsPerRU;
+ int do_new_check = 0, retcode = 0, status;
+
+ /* if we are currently the slowest disk, mark that we have to do a new check */
+ if (ctrl->headSepCounter <= raidPtr->reconControl[row]->minHeadSepCounter) do_new_check = 1;
+
+ while (1) {
+
+ ctrl->ru_count++;
+ if (ctrl->ru_count < RUsPerPU) {
+ ctrl->diskOffset += sectorsPerRU;
+ rbuf->failedDiskSectorOffset += sectorsPerRU;
+ } else {
+ ctrl->curPSID++;
+ ctrl->ru_count = 0;
+ /* code left over from when head-sep was based on parity stripe id */
+ if (ctrl->curPSID >= raidPtr->reconControl[row]->lastPSID) {
+ CheckForNewMinHeadSep(raidPtr, row, ++(ctrl->headSepCounter));
+ return(1); /* finito! */
+ }
+
+ /* find the disk offsets of the start of the parity stripe on both the current disk and the failed disk.
+ * skip this entire parity stripe if either disk does not appear in the indicated PS
+ */
+ status = ComputePSDiskOffsets(raidPtr, ctrl->curPSID, row, col, &ctrl->diskOffset, &rbuf->failedDiskSectorOffset,
+ &rbuf->spRow, &rbuf->spCol, &rbuf->spOffset);
+ if (status) {
+ ctrl->ru_count = RUsPerPU-1; continue;
+ }
+ }
+ rbuf->which_ru = ctrl->ru_count;
+
+ /* skip this RU if it's already been reconstructed */
+ if (rf_CheckRUReconstructed(raidPtr->reconControl[row]->reconMap, rbuf->failedDiskSectorOffset)) {
+ Dprintf2("Skipping psid %ld ru %d: already reconstructed\n",ctrl->curPSID,ctrl->ru_count);
+ continue;
+ }
+ break;
+ }
+ ctrl->headSepCounter++;
+ if (do_new_check) CheckForNewMinHeadSep(raidPtr, row, ctrl->headSepCounter); /* update min if needed */
+
+
+ /* at this point, we have definitely decided what to do, and we have only to see if we can actually do it now */
+ rbuf->parityStripeID = ctrl->curPSID;
+ rbuf->which_ru = ctrl->ru_count;
+ bzero((char *)&raidPtr->recon_tracerecs[col], sizeof(raidPtr->recon_tracerecs[col]));
+ raidPtr->recon_tracerecs[col].reconacc = 1;
+ RF_ETIMER_START(raidPtr->recon_tracerecs[col].recon_timer);
+ retcode = TryToRead(raidPtr, row, col);
+ return(retcode);
+}
+
+/* tries to issue the next read on the indicated disk. We may be blocked by (a) the heads being too
+ * far apart, or (b) recon on the indicated RU being blocked due to a write by a user thread.
+ * In this case, we issue a head-sep or blockage wait request, which will cause this same routine
+ * to be invoked again later when the blockage has cleared.
+ */
+static int TryToRead(raidPtr, row, col)
+ RF_Raid_t *raidPtr;
+ RF_RowCol_t row;
+ RF_RowCol_t col;
+{
+ RF_PerDiskReconCtrl_t *ctrl = &raidPtr->reconControl[row]->perDiskInfo[col];
+ RF_SectorCount_t sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU;
+ RF_StripeNum_t psid = ctrl->curPSID;
+ RF_ReconUnitNum_t which_ru = ctrl->ru_count;
+ RF_DiskQueueData_t *req;
+ int status, created = 0;
+ RF_ReconParityStripeStatus_t *pssPtr;
+
+ /* if the current disk is too far ahead of the others, issue a head-separation wait and return */
+ if (CheckHeadSeparation(raidPtr, ctrl, row, col, ctrl->headSepCounter, which_ru)) return(0);
+ RF_LOCK_PSS_MUTEX(raidPtr, row, psid);
+ pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl[row]->pssTable, psid, which_ru, RF_PSS_CREATE, &created);
+
+ /* if recon is blocked on the indicated parity stripe, issue a block-wait request and return.
+ * this also must mark the indicated RU in the stripe as under reconstruction if not blocked.
+ */
+ status = CheckForcedOrBlockedReconstruction(raidPtr, pssPtr, ctrl, row, col, psid, which_ru);
+ if (status == RF_PSS_RECON_BLOCKED) {
+ Dprintf2("RECON: Stalling psid %ld ru %d: recon blocked\n",psid,which_ru);
+ goto out;
+ } else if (status == RF_PSS_FORCED_ON_WRITE) {
+ rf_CauseReconEvent(raidPtr, row, col, NULL, RF_REVENT_SKIP);
+ goto out;
+ }
+
+ /* make one last check to be sure that the indicated RU didn't get reconstructed while
+ * we were waiting for something else to happen. This is unfortunate in that it causes
+ * us to make this check twice in the normal case. Might want to make some attempt to
+ * re-work this so that we only do this check if we've definitely blocked on one of the
+ * above checks. When this condition is detected, we may have just created a bogus
+ * status entry, which we need to delete.
+ */
+ if (rf_CheckRUReconstructed(raidPtr->reconControl[row]->reconMap, ctrl->rbuf->failedDiskSectorOffset)) {
+ Dprintf2("RECON: Skipping psid %ld ru %d: prior recon after stall\n",psid,which_ru);
+ if (created) rf_PSStatusDelete(raidPtr, raidPtr->reconControl[row]->pssTable, pssPtr);
+ rf_CauseReconEvent(raidPtr, row, col, NULL, RF_REVENT_SKIP);
+ goto out;
+ }
+
+ /* found something to read. issue the I/O */
+ Dprintf5("RECON: Read for psid %ld on row %d col %d offset %ld buf %lx\n",
+ psid, row, col, ctrl->diskOffset, ctrl->rbuf->buffer);
+ RF_ETIMER_STOP(raidPtr->recon_tracerecs[col].recon_timer);
+ RF_ETIMER_EVAL(raidPtr->recon_tracerecs[col].recon_timer);
+ raidPtr->recon_tracerecs[col].specific.recon.recon_start_to_fetch_us =
+ RF_ETIMER_VAL_US(raidPtr->recon_tracerecs[col].recon_timer);
+ RF_ETIMER_START(raidPtr->recon_tracerecs[col].recon_timer);
+
+ /* should be ok to use a NULL proc pointer here, all the bufs we use should be in kernel space */
+ req = rf_CreateDiskQueueData(RF_IO_TYPE_READ, ctrl->diskOffset, sectorsPerRU, ctrl->rbuf->buffer, psid, which_ru,
+ ReconReadDoneProc, (void *) ctrl, NULL, &raidPtr->recon_tracerecs[col], (void *)raidPtr, 0, NULL);
+
+ RF_ASSERT(req); /* XXX -- fix this -- XXX */
+
+ ctrl->rbuf->arg = (void *) req;
+ rf_DiskIOEnqueue(&raidPtr->Queues[row][col], req, RF_IO_RECON_PRIORITY);
+ pssPtr->issued[col] = 1;
+
+out:
+ RF_UNLOCK_PSS_MUTEX(raidPtr, row, psid);
+ return(0);
+}
+
+
+/* given a parity stripe ID, we want to find out whether both the current disk and the
+ * failed disk exist in that parity stripe. If not, we want to skip this whole PS.
+ * If so, we want to find the disk offset of the start of the PS on both the current
+ * disk and the failed disk.
+ *
+ * this works by getting a list of disks comprising the indicated parity stripe, and
+ * searching the list for the current and failed disks. Once we've decided they both
+ * exist in the parity stripe, we need to decide whether each is data or parity,
+ * so that we'll know which mapping function to call to get the corresponding disk
+ * offsets.
+ *
+ * this is kind of unpleasant, but doing it this way allows the reconstruction code
+ * to use parity stripe IDs rather than physical disk addresses to march through the
+ * failed disk, which greatly simplifies a lot of code, as well as eliminating the
+ * need for a reverse-mapping function. I also think it will execute faster, since
+ * the calls to the mapping module are kept to a minimum.
+ *
+ * ASSUMES THAT THE STRIPE IDENTIFIER IDENTIFIES THE DISKS COMPRISING THE STRIPE
+ * IN THE CORRECT ORDER
+ */
+static int ComputePSDiskOffsets(
+ RF_Raid_t *raidPtr, /* raid descriptor */
+ RF_StripeNum_t psid, /* parity stripe identifier */
+ RF_RowCol_t row, /* row and column of disk to find the offsets for */
+ RF_RowCol_t col,
+ RF_SectorNum_t *outDiskOffset,
+ RF_SectorNum_t *outFailedDiskSectorOffset,
+ RF_RowCol_t *spRow, /* OUT: row,col of spare unit for failed unit */
+ RF_RowCol_t *spCol,
+ RF_SectorNum_t *spOffset) /* OUT: offset into disk containing spare unit */
+{
+ RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
+ RF_RowCol_t fcol = raidPtr->reconControl[row]->fcol;
+ RF_RaidAddr_t sosRaidAddress; /* start-of-stripe */
+ RF_RowCol_t *diskids;
+ u_int i, j, k, i_offset, j_offset;
+ RF_RowCol_t prow, pcol;
+ int testcol, testrow;
+ RF_RowCol_t stripe;
+ RF_SectorNum_t poffset;
+ char i_is_parity=0, j_is_parity=0;
+ RF_RowCol_t stripeWidth = layoutPtr->numDataCol + layoutPtr->numParityCol;
+
+ /* get a listing of the disks comprising that stripe */
+ sosRaidAddress = rf_ParityStripeIDToRaidAddress(layoutPtr, psid);
+ (layoutPtr->map->IdentifyStripe)(raidPtr, sosRaidAddress, &diskids, &stripe);
+ RF_ASSERT(diskids);
+
+ /* reject this entire parity stripe if it does not contain the indicated disk or it does not contain the failed disk */
+ if (row != stripe)
+ goto skipit;
+ for (i=0; i<stripeWidth; i++) {
+ if (col == diskids[i])
+ break;
+ }
+ if (i == stripeWidth)
+ goto skipit;
+ for (j=0; j<stripeWidth; j++) {
+ if (fcol == diskids[j])
+ break;
+ }
+ if (j == stripeWidth) {
+ goto skipit;
+ }
+
+ /* find out which disk the parity is on */
+ (layoutPtr->map->MapParity)(raidPtr, sosRaidAddress, &prow, &pcol, &poffset, RF_DONT_REMAP);
+
+ /* find out if either the current RU or the failed RU is parity */
+ /* also, if the parity occurs in this stripe prior to the data and/or failed col, we need to decrement i and/or j */
+ for (k=0; k<stripeWidth; k++)
+ if (diskids[k] == pcol)
+ break;
+ RF_ASSERT(k < stripeWidth);
+ i_offset = i; j_offset=j;
+ if (k < i) i_offset--; else if (k==i) {i_is_parity = 1; i_offset = 0;} /* set offsets to zero to disable multiply below */
+ if (k < j) j_offset--; else if (k==j) {j_is_parity = 1; j_offset = 0;}
+
+ /* at this point, [ij]_is_parity tells us whether the [current,failed] disk is parity at
+ * the start of this RU, and, if data, "[ij]_offset" tells us how far into the stripe
+ * the [current,failed] disk is.
+ */
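+ /*
+ * Concrete case (illustrative): in a 5-column stripe with parity found at
+ * k == 1 and the current disk found at i == 3, i_offset becomes 2, i.e. the
+ * current disk holds the third data unit of the stripe once the parity
+ * column is skipped.
+ */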
+
+ /* call the mapping routine to get the offset into the current disk, repeat for failed disk. */
+ if (i_is_parity)
+ layoutPtr->map->MapParity(raidPtr, sosRaidAddress + i_offset * layoutPtr->sectorsPerStripeUnit, &testrow, &testcol, outDiskOffset, RF_DONT_REMAP);
+ else
+ layoutPtr->map->MapSector(raidPtr, sosRaidAddress + i_offset * layoutPtr->sectorsPerStripeUnit, &testrow, &testcol, outDiskOffset, RF_DONT_REMAP);
+
+ RF_ASSERT(row == testrow && col == testcol);
+
+ if (j_is_parity)
+ layoutPtr->map->MapParity(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, &testrow, &testcol, outFailedDiskSectorOffset, RF_DONT_REMAP);
+ else
+ layoutPtr->map->MapSector(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, &testrow, &testcol, outFailedDiskSectorOffset, RF_DONT_REMAP);
+ RF_ASSERT(row == testrow && fcol == testcol);
+
+ /* now locate the spare unit for the failed unit */
+ if (layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) {
+ if (j_is_parity)
+ layoutPtr->map->MapParity(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, spRow, spCol, spOffset, RF_REMAP);
+ else
+ layoutPtr->map->MapSector(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, spRow, spCol, spOffset, RF_REMAP);
+ } else {
+ *spRow = raidPtr->reconControl[row]->spareRow;
+ *spCol = raidPtr->reconControl[row]->spareCol;
+ *spOffset = *outFailedDiskSectorOffset;
+ }
+
+ return(0);
+
+skipit:
+ Dprintf3("RECON: Skipping psid %ld: nothing needed from r%d c%d\n",
+ psid, row, col);
+ return(1);
+}
+
+/* this is called when a buffer has become ready to write to the replacement disk */
+static int IssueNextWriteRequest(raidPtr, row)
+ RF_Raid_t *raidPtr;
+ RF_RowCol_t row;
+{
+ RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
+ RF_SectorCount_t sectorsPerRU = layoutPtr->sectorsPerStripeUnit * layoutPtr->SUsPerRU;
+ RF_RowCol_t fcol = raidPtr->reconControl[row]->fcol;
+ RF_ReconBuffer_t *rbuf;
+ RF_DiskQueueData_t *req;
+
+ rbuf = rf_GetFullReconBuffer(raidPtr->reconControl[row]);
+ RF_ASSERT(rbuf); /* there must be one available, or we wouldn't have gotten the event that sent us here */
+ RF_ASSERT(rbuf->pssPtr);
+
+ rbuf->pssPtr->writeRbuf = rbuf;
+ rbuf->pssPtr = NULL;
+
+ Dprintf7("RECON: New write (r %d c %d offs %d) for psid %ld ru %d (failed disk offset %ld) buf %lx\n",
+ rbuf->spRow, rbuf->spCol, rbuf->spOffset, rbuf->parityStripeID,
+ rbuf->which_ru, rbuf->failedDiskSectorOffset, rbuf->buffer);
+ Dprintf6("RECON: new write psid %ld %02x %02x %02x %02x %02x\n",
+ rbuf->parityStripeID, rbuf->buffer[0]&0xff, rbuf->buffer[1]&0xff,
+ rbuf->buffer[2]&0xff, rbuf->buffer[3]&0xff, rbuf->buffer[4]&0xff);
+
+ /* should be ok to use a NULL b_proc here b/c all addrs should be in kernel space */
+ req = rf_CreateDiskQueueData(RF_IO_TYPE_WRITE, rbuf->spOffset,
+ sectorsPerRU, rbuf->buffer,
+ rbuf->parityStripeID, rbuf->which_ru,
+ ReconWriteDoneProc, (void *) rbuf, NULL,
+ &raidPtr->recon_tracerecs[fcol],
+ (void *)raidPtr, 0, NULL);
+
+ RF_ASSERT(req); /* XXX -- fix this -- XXX */
+
+ rbuf->arg = (void *) req;
+ rf_DiskIOEnqueue(&raidPtr->Queues[rbuf->spRow][rbuf->spCol], req, RF_IO_RECON_PRIORITY);
+
+ return(0);
+}
+
+/* this gets called upon the completion of a reconstruction read operation
+ * the arg is a pointer to the per-disk reconstruction control structure
+ * for the process that just finished a read.
+ *
+ * called at interrupt context in the kernel, so don't do anything illegal here.
+ */
+static int ReconReadDoneProc(arg, status)
+ void *arg;
+ int status;
+{
+ RF_PerDiskReconCtrl_t *ctrl = (RF_PerDiskReconCtrl_t *) arg;
+ RF_Raid_t *raidPtr = ctrl->reconCtrl->reconDesc->raidPtr;
+
+ if (status) {
+ /*
+ * XXX
+ */
+ printf("Recon read failed!\n");
+ RF_PANIC();
+ }
+
+ RF_ETIMER_STOP(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
+ RF_ETIMER_EVAL(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
+ raidPtr->recon_tracerecs[ctrl->col].specific.recon.recon_fetch_to_return_us =
+ RF_ETIMER_VAL_US(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
+ RF_ETIMER_START(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
+
+ rf_CauseReconEvent(raidPtr, ctrl->row, ctrl->col, NULL, RF_REVENT_READDONE);
+ return(0);
+}
+
+/* this gets called upon the completion of a reconstruction write operation.
+ * the arg is a pointer to the rbuf that was just written
+ *
+ * called at interrupt context in the kernel, so don't do anything illegal here.
+ */
+static int ReconWriteDoneProc(arg, status)
+ void *arg;
+ int status;
+{
+ RF_ReconBuffer_t *rbuf = (RF_ReconBuffer_t *) arg;
+
+ Dprintf2("Reconstruction completed on psid %ld ru %d\n",rbuf->parityStripeID, rbuf->which_ru);
+ if (status) {printf("Recon write failed!\n"); /*fprintf(stderr,"Recon write failed!\n");*/ RF_PANIC();}
+ rf_CauseReconEvent((RF_Raid_t *) rbuf->raidPtr, rbuf->row, rbuf->col, arg, RF_REVENT_WRITEDONE);
+ return(0);
+}
+
+
+/* computes a new minimum head sep, and wakes up anyone who needs to be woken as a result */
+static void CheckForNewMinHeadSep(raidPtr, row, hsCtr)
+ RF_Raid_t *raidPtr;
+ RF_RowCol_t row;
+ RF_HeadSepLimit_t hsCtr;
+{
+ RF_ReconCtrl_t *reconCtrlPtr = raidPtr->reconControl[row];
+ RF_HeadSepLimit_t new_min;
+ RF_RowCol_t i;
+ RF_CallbackDesc_t *p;
+ RF_ASSERT(hsCtr >= reconCtrlPtr->minHeadSepCounter); /* from the definition of a minimum */
+
+
+ RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex);
+
+ new_min = ~ (1L<< (8*sizeof(long)-1)); /* 0x7FFF....FFF */
+ for (i=0; i<raidPtr->numCol; i++) if (i != reconCtrlPtr->fcol) {
+ if (reconCtrlPtr->perDiskInfo[i].headSepCounter < new_min) new_min = reconCtrlPtr->perDiskInfo[i].headSepCounter;
+ }
+
+ /* set the new minimum and wake up anyone who can now run again */
+ if (new_min != reconCtrlPtr->minHeadSepCounter) {
+ reconCtrlPtr->minHeadSepCounter = new_min;
+ Dprintf1("RECON: new min head pos counter val is %ld\n",new_min);
+ while (reconCtrlPtr->headSepCBList) {
+ if (reconCtrlPtr->headSepCBList->callbackArg.v > new_min) break;
+ p = reconCtrlPtr->headSepCBList;
+ reconCtrlPtr->headSepCBList = p->next;
+ p->next = NULL;
+ rf_CauseReconEvent(raidPtr, p->row, p->col, NULL, RF_REVENT_HEADSEPCLEAR);
+ rf_FreeCallbackDesc(p);
+ }
+
+ }
+
+ RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex);
+}
+
+/* checks to see that the maximum head separation will not be violated
+ * if we initiate a reconstruction I/O on the indicated disk. Limiting the
+ * maximum head separation between two disks eliminates the nasty buffer-stall
+ * conditions that occur when one disk races ahead of the others and consumes
+ * all of the floating recon buffers. This code is complex and unpleasant
+ * but it's necessary to avoid some very nasty, albeit fairly rare,
+ * reconstruction behavior.
+ *
+ * returns non-zero if and only if we have to stop working on the indicated disk
+ * due to a head-separation delay.
+ */
+static int CheckHeadSeparation(
+ RF_Raid_t *raidPtr,
+ RF_PerDiskReconCtrl_t *ctrl,
+ RF_RowCol_t row,
+ RF_RowCol_t col,
+ RF_HeadSepLimit_t hsCtr,
+ RF_ReconUnitNum_t which_ru)
+{
+ RF_ReconCtrl_t *reconCtrlPtr = raidPtr->reconControl[row];
+ RF_CallbackDesc_t *cb, *p, *pt;
+ int retval = 0, tid;
+
+ /* if we're too far ahead of the slowest disk, stop working on this disk
+ * until the slower ones catch up. We do this by scheduling a wakeup callback
+ * for the time when the slowest disk has caught up. We define "caught up"
+ * with 20% hysteresis, i.e. the head separation must have fallen to at most
+ * 80% of the max allowable head separation before we'll wake up.
+ *
+ */
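+ /*
+ * Example (illustrative numbers): with raidPtr->headSepLimit == 1000 and
+ * ctrl->headSepCounter == 1500, the callback below is armed for a minimum
+ * counter of 1500 - 1000 + 1000/5 == 700, so this disk resumes once the
+ * slowest disk reaches 700, leaving at most 800 (80% of the limit) of
+ * separation at wakeup.
+ */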
+ rf_get_threadid(tid);
+ RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex);
+ if ((raidPtr->headSepLimit >= 0) &&
+ ((ctrl->headSepCounter - reconCtrlPtr->minHeadSepCounter) > raidPtr->headSepLimit))
+ {
+ Dprintf6("[%d] RECON: head sep stall: row %d col %d hsCtr %ld minHSCtr %ld limit %ld\n",
+ tid,row,col,ctrl->headSepCounter, reconCtrlPtr->minHeadSepCounter, raidPtr->headSepLimit);
+ cb = rf_AllocCallbackDesc();
+ /* the minHeadSepCounter value we have to get to before we'll wake up. build in 20% hysteresis. */
+ cb->callbackArg.v = (ctrl->headSepCounter - raidPtr->headSepLimit + raidPtr->headSepLimit/5);
+ cb->row = row; cb->col = col;
+ cb->next = NULL;
+
+ /* insert this callback descriptor into the sorted list of pending head-sep callbacks */
+ p = reconCtrlPtr->headSepCBList;
+ if (!p) reconCtrlPtr->headSepCBList = cb;
+ else if (cb->callbackArg.v < p->callbackArg.v) {
+ cb->next = reconCtrlPtr->headSepCBList;
+ reconCtrlPtr->headSepCBList = cb;
+ }
+ else {
+ for (pt=p, p=p->next; p && (p->callbackArg.v < cb->callbackArg.v); pt=p,p=p->next);
+ cb->next = p;
+ pt->next = cb;
+ }
+ retval = 1;
+#if RF_RECON_STATS > 0
+ ctrl->reconCtrl->reconDesc->hsStallCount++;
+#endif /* RF_RECON_STATS > 0 */
+ }
+ RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex);
+
+ return(retval);
+}
+
+/* checks to see if reconstruction has been either forced or blocked by a user operation.
+ * if forced, we skip this RU entirely.
+ * else if blocked, put ourselves on the wait list.
+ * else return 0.
+ *
+ * ASSUMES THE PSS MUTEX IS LOCKED UPON ENTRY
+ */
+static int CheckForcedOrBlockedReconstruction(
+ RF_Raid_t *raidPtr,
+ RF_ReconParityStripeStatus_t *pssPtr,
+ RF_PerDiskReconCtrl_t *ctrl,
+ RF_RowCol_t row,
+ RF_RowCol_t col,
+ RF_StripeNum_t psid,
+ RF_ReconUnitNum_t which_ru)
+{
+ RF_CallbackDesc_t *cb;
+ int retcode = 0;
+
+ if ((pssPtr->flags & RF_PSS_FORCED_ON_READ) || (pssPtr->flags & RF_PSS_FORCED_ON_WRITE)) retcode = RF_PSS_FORCED_ON_WRITE;
+ else if (pssPtr->flags & RF_PSS_RECON_BLOCKED) {
+ Dprintf4("RECON: row %d col %d blocked at psid %ld ru %d\n",row, col, psid, which_ru);
+ cb = rf_AllocCallbackDesc(); /* append ourselves to the blockage-wait list */
+ cb->row = row; cb->col = col;
+ cb->next = pssPtr->blockWaitList;
+ pssPtr->blockWaitList = cb;
+ retcode = RF_PSS_RECON_BLOCKED;
+ }
+
+ if (!retcode) pssPtr->flags |= RF_PSS_UNDER_RECON; /* mark this RU as under reconstruction */
+
+ return(retcode);
+}
+
+/* if reconstruction is currently ongoing for the indicated stripeID, reconstruction
+ * is forced to completion and we return non-zero to indicate that the caller must
+ * wait. If not, then reconstruction is blocked on the indicated stripe and the
+ * routine returns zero. If and only if we return non-zero, we'll cause the cbFunc
+ * to get invoked with the cbArg when the reconstruction has completed.
+ */
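+/*
+ * Caller sketch (illustrative): one plausible pattern for a user access that
+ * must not race with reconstruction of this stripe is
+ *
+ *	if (rf_ForceOrBlockRecon(raidPtr, asmap, cbFunc, cbArg) == 0) {
+ *		... recon is now blocked on the stripe; perform the access ...
+ *		rf_UnblockRecon(raidPtr, asmap);
+ *	} else {
+ *		... wait; cbFunc(raidPtr, cbArg) runs once forced recon finishes ...
+ *	}
+ *
+ * cbFunc and cbArg are whatever continuation the caller supplies.
+ */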
+int rf_ForceOrBlockRecon(raidPtr, asmap, cbFunc, cbArg)
+ RF_Raid_t *raidPtr;
+ RF_AccessStripeMap_t *asmap;
+ void (*cbFunc)(RF_Raid_t *,void *);
+ void *cbArg;
+{
+ RF_RowCol_t row = asmap->physInfo->row; /* which row of the array we're working on */
+ RF_StripeNum_t stripeID = asmap->stripeID; /* the stripe ID we're forcing recon on */
+ RF_SectorCount_t sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU; /* num sects in one RU */
+ RF_ReconParityStripeStatus_t *pssPtr; /* a pointer to the parity stripe status structure */
+ RF_StripeNum_t psid; /* parity stripe id */
+ RF_SectorNum_t offset, fd_offset; /* disk offset, failed-disk offset */
+ RF_RowCol_t *diskids;
+ RF_RowCol_t stripe;
+ int tid;
+ RF_ReconUnitNum_t which_ru; /* RU within parity stripe */
+ RF_RowCol_t fcol, diskno, i;
+ RF_ReconBuffer_t *new_rbuf; /* ptr to newly allocated rbufs */
+ RF_DiskQueueData_t *req; /* disk I/O req to be enqueued */
+ RF_CallbackDesc_t *cb;
+ int created = 0, nPromoted;
+
+ rf_get_threadid(tid);
+ psid = rf_MapStripeIDToParityStripeID(&raidPtr->Layout, stripeID, &which_ru);
+
+ RF_LOCK_PSS_MUTEX(raidPtr, row, psid);
+
+ pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl[row]->pssTable, psid, which_ru, RF_PSS_CREATE|RF_PSS_RECON_BLOCKED, &created);
+
+ /* if recon is not ongoing on this PS, just return */
+ if (!(pssPtr->flags & RF_PSS_UNDER_RECON)) {
+ RF_UNLOCK_PSS_MUTEX(raidPtr, row, psid);
+ return(0);
+ }
+
+ /* otherwise, we have to wait for reconstruction to complete on this RU. */
+ /* In order to avoid waiting for a potentially large number of low-priority accesses to
+ * complete, we force a normal-priority (i.e. not low-priority) reconstruction
+ * on this RU.
+ */
+ if (!(pssPtr->flags & RF_PSS_FORCED_ON_WRITE) && !(pssPtr->flags & RF_PSS_FORCED_ON_READ)) {
+ DDprintf1("Forcing recon on psid %ld\n",psid);
+ pssPtr->flags |= RF_PSS_FORCED_ON_WRITE; /* mark this RU as under forced recon */
+ pssPtr->flags &= ~RF_PSS_RECON_BLOCKED; /* clear the blockage that we just set */
+ fcol = raidPtr->reconControl[row]->fcol;
+
+ /* get a listing of the disks comprising the indicated stripe */
+ (raidPtr->Layout.map->IdentifyStripe)(raidPtr, asmap->raidAddress, &diskids, &stripe);
+ RF_ASSERT(row == stripe);
+
+ /* For previously issued reads, elevate them to normal priority. If the I/O has already completed,
+ * it won't be found in the queue, and hence this will be a no-op.
+ * For unissued reads, allocate buffers and issue new reads. The fact that we've set the
+ * FORCED bit means that the regular recon procs will not re-issue these reqs
+ */
+ for (i=0; i<raidPtr->Layout.numDataCol+raidPtr->Layout.numParityCol; i++) if ( (diskno = diskids[i]) != fcol) {
+ if (pssPtr->issued[diskno]) {
+ nPromoted = rf_DiskIOPromote(&raidPtr->Queues[row][diskno], psid, which_ru);
+ if (rf_reconDebug && nPromoted) printf("[%d] promoted read from row %d col %d\n",tid,row,diskno);
+ } else {
+ new_rbuf = rf_MakeReconBuffer(raidPtr, row, diskno, RF_RBUF_TYPE_FORCED); /* create new buf */
+ ComputePSDiskOffsets(raidPtr, psid, row, diskno, &offset, &fd_offset,
+ &new_rbuf->spRow, &new_rbuf->spCol, &new_rbuf->spOffset); /* find offsets & spare location */
+ new_rbuf->parityStripeID = psid; /* fill in the buffer */
+ new_rbuf->which_ru = which_ru;
+ new_rbuf->failedDiskSectorOffset = fd_offset;
+ new_rbuf->priority = RF_IO_NORMAL_PRIORITY;
+
+ /* use NULL b_proc b/c all addrs should be in kernel space */
+ req = rf_CreateDiskQueueData(RF_IO_TYPE_READ, offset + which_ru * sectorsPerRU, sectorsPerRU, new_rbuf->buffer,
+ psid, which_ru, (int (*)(void *, int))ForceReconReadDoneProc, (void *) new_rbuf, NULL,
+ NULL,(void *)raidPtr, 0, NULL);
+
+ RF_ASSERT(req); /* XXX -- fix this -- XXX */
+
+ new_rbuf->arg = req;
+ rf_DiskIOEnqueue(&raidPtr->Queues[row][diskno], req, RF_IO_NORMAL_PRIORITY); /* enqueue the I/O */
+ Dprintf3("[%d] Issued new read req on row %d col %d\n",tid,row,diskno);
+ }
+ }
+
+ /* if the write is sitting in the disk queue, elevate its priority */
+ if (rf_DiskIOPromote(&raidPtr->Queues[row][fcol], psid, which_ru)) printf("[%d] promoted write to row %d col %d\n",tid,row,fcol);
+ }
+
+ /* install a callback descriptor to be invoked when recon completes on this parity stripe. */
+ cb = rf_AllocCallbackDesc();
+ /* XXX the following is bogus.. These functions don't really match!! GO */
+ cb->callbackFunc = (void (*)(RF_CBParam_t))cbFunc;
+ cb->callbackArg.p = (void *) cbArg;
+ cb->next = pssPtr->procWaitList;
+ pssPtr->procWaitList = cb;
+ DDprintf2("[%d] Waiting for forced recon on psid %ld\n",tid,psid);
+
+ RF_UNLOCK_PSS_MUTEX(raidPtr, row, psid);
+ return(1);
+}
+
+/* called upon the completion of a forced reconstruction read.
+ * all we do is schedule the FORCEDREADDONE event.
+ * called at interrupt context in the kernel, so don't do anything illegal here.
+ */
+static void ForceReconReadDoneProc(arg, status)
+ void *arg;
+ int status;
+{
+ RF_ReconBuffer_t *rbuf = arg;
+
+ if (status) {printf("Forced recon read failed!\n"); /*fprintf(stderr,"Forced recon read failed!\n");*/ RF_PANIC();}
+ rf_CauseReconEvent((RF_Raid_t *) rbuf->raidPtr, rbuf->row, rbuf->col, (void *) rbuf, RF_REVENT_FORCEDREADDONE);
+}
+
+/* releases a block on the reconstruction of the indicated stripe */
+int rf_UnblockRecon(raidPtr, asmap)
+ RF_Raid_t *raidPtr;
+ RF_AccessStripeMap_t *asmap;
+{
+ RF_RowCol_t row = asmap->origRow;
+ RF_StripeNum_t stripeID = asmap->stripeID;
+ RF_ReconParityStripeStatus_t *pssPtr;
+ RF_ReconUnitNum_t which_ru;
+ RF_StripeNum_t psid;
+ int tid, created = 0;
+ RF_CallbackDesc_t *cb;
+
+ rf_get_threadid(tid);
+ psid = rf_MapStripeIDToParityStripeID(&raidPtr->Layout, stripeID, &which_ru);
+ RF_LOCK_PSS_MUTEX( raidPtr, row, psid);
+ pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl[row]->pssTable, psid, which_ru, RF_PSS_NONE, &created);
+
+ /* When recon is forced, the pss desc can get deleted before we get back to unblock recon.
+ * But, this can _only_ happen when recon is forced.
+ * It would be good to put some kind of sanity check here, but how to decide if recon
+ * was just forced or not?
+ */
+ if (!pssPtr) {
+ /*printf("Warning: no pss descriptor upon unblock on psid %ld RU %d\n",psid,which_ru);*/
+ if (rf_reconDebug || rf_pssDebug) printf("Warning: no pss descriptor upon unblock on psid %ld RU %d\n",(long)psid,which_ru);
+ goto out;
+ }
+
+ pssPtr->blockCount--;
+ Dprintf3("[%d] unblocking recon on psid %ld: blockcount is %d\n",tid,psid,pssPtr->blockCount);
+ if (pssPtr->blockCount == 0) { /* if recon blockage has been released */
+
+ /* unblock recon before calling CauseReconEvent in case CauseReconEvent causes us to
+ * try to issue a new read before returning here.
+ */
+ pssPtr->flags &= ~RF_PSS_RECON_BLOCKED;
+
+
+ while (pssPtr->blockWaitList) { /* spin through the block-wait list and release all the waiters */
+ cb = pssPtr->blockWaitList;
+ pssPtr->blockWaitList = cb->next;
+ cb->next = NULL;
+ rf_CauseReconEvent(raidPtr, cb->row, cb->col, NULL, RF_REVENT_BLOCKCLEAR);
+ rf_FreeCallbackDesc(cb);
+ }
+ if (!(pssPtr->flags & RF_PSS_UNDER_RECON)) { /* if no recon was requested while recon was blocked */
+ rf_PSStatusDelete(raidPtr, raidPtr->reconControl[row]->pssTable, pssPtr);
+ }
+ }
+
+out:
+ RF_UNLOCK_PSS_MUTEX( raidPtr, row, psid );
+ return(0);
+}
diff --git a/sys/dev/raidframe/rf_reconstruct.h b/sys/dev/raidframe/rf_reconstruct.h
new file mode 100644
index 00000000000..5913e626609
--- /dev/null
+++ b/sys/dev/raidframe/rf_reconstruct.h
@@ -0,0 +1,258 @@
+/* $OpenBSD: rf_reconstruct.h,v 1.1 1999/01/11 14:29:47 niklas Exp $ */
+/* $NetBSD: rf_reconstruct.h,v 1.1 1998/11/13 04:20:34 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*********************************************************
+ * rf_reconstruct.h -- header file for reconstruction code
+ *********************************************************/
+
+/* :
+ * Log: rf_reconstruct.h,v
+ * Revision 1.25 1996/08/01 15:57:24 jimz
+ * minor cleanup
+ *
+ * Revision 1.24 1996/07/22 19:52:16 jimz
+ * switched node params to RF_DagParam_t, a union of
+ * a 64-bit int and a void *, for better portability
+ * attempted hpux port, but failed partway through for
+ * lack of a single C compiler capable of compiling all
+ * source files
+ *
+ * Revision 1.23 1996/07/15 05:40:41 jimz
+ * some recon datastructure cleanup
+ * better handling of multiple failures
+ * added undocumented double-recon test
+ *
+ * Revision 1.22 1996/07/13 00:00:59 jimz
+ * sanitized generalized reconstruction architecture
+ * cleaned up head sep, rbuf problems
+ *
+ * Revision 1.21 1996/07/11 19:08:00 jimz
+ * generalize reconstruction mechanism
+ * allow raid1 reconstructs via copyback (done with array
+ * quiesced, not online, therefore not disk-directed)
+ *
+ * Revision 1.20 1996/06/11 10:57:30 jimz
+ * add rf_RegisterReconDoneProc
+ *
+ * Revision 1.19 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.18 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.17 1996/06/05 18:06:02 jimz
+ * Major code cleanup. The Great Renaming is now done.
+ * Better modularity. Better typing. Fixed a bunch of
+ * synchronization bugs. Made a lot of global stuff
+ * per-desc or per-array. Removed dead code.
+ *
+ * Revision 1.16 1996/06/03 23:28:26 jimz
+ * more bugfixes
+ * check in tree to sync for IPDS runs with current bugfixes
+ * there still may be a problem with threads in the script test
+ * getting I/Os stuck- not trivially reproducible (runs ~50 times
+ * in a row without getting stuck)
+ *
+ * Revision 1.15 1996/06/02 17:31:48 jimz
+ * Moved a lot of global stuff into array structure, where it belongs.
+ * Fixed up paritylogging, pss modules in this manner. Some general
+ * code cleanup. Removed lots of dead code, some dead files.
+ *
+ * Revision 1.14 1996/05/31 22:26:54 jimz
+ * fix a lot of mapping problems, memory allocation problems
+ * found some weird lock issues, fixed 'em
+ * more code cleanup
+ *
+ * Revision 1.13 1996/05/30 11:29:41 jimz
+ * Numerous bug fixes. Stripe lock release code disagreed with the taking code
+ * about when stripes should be locked (I made it consistent: no parity, no lock)
+ * There was a lot of extra serialization of I/Os which I've removed- a lot of
+ * it was to calculate values for the cache code, which is no longer with us.
+ * More types, function, macro cleanup. Added code to properly quiesce the array
+ * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
+ * before. Fixed memory allocation, freeing bugs.
+ *
+ * Revision 1.12 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.11 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.10 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.9 1995/12/06 15:04:55 root
+ * added copyright info
+ *
+ */
+
+#ifndef _RF__RF_RECONSTRUCT_H_
+#define _RF__RF_RECONSTRUCT_H_
+
+#include "rf_types.h"
+#include <sys/time.h>
+#include "rf_reconmap.h"
+#include "rf_psstatus.h"
+
+/* reconstruction configuration information */
+struct RF_ReconConfig_s {
+ unsigned numFloatingReconBufs; /* number of floating recon bufs to use */
+ RF_HeadSepLimit_t headSepLimit; /* how far apart the heads are allowed to become, in parity stripes */
+};
+
+/* a reconstruction buffer */
+struct RF_ReconBuffer_s {
+ RF_Raid_t *raidPtr; /* void * to avoid recursive includes */
+ caddr_t buffer; /* points to the data */
+ RF_StripeNum_t parityStripeID; /* the parity stripe that this data relates to */
+ int which_ru; /* which reconstruction unit within the PSS */
+ RF_SectorNum_t failedDiskSectorOffset;/* the offset into the failed disk */
+ RF_RowCol_t row, col; /* which disk this buffer belongs to or is targeted at */
+ RF_StripeCount_t count; /* counts the # of SUs installed so far */
+ int priority; /* used to force hi priority recon */
+ RF_RbufType_t type; /* FORCED or FLOATING */
+ char *arrived; /* [x] = 1/0 if SU from disk x has/hasn't arrived */
+ RF_ReconBuffer_t *next; /* used for buffer management */
+ void *arg; /* generic field for general use */
+ RF_RowCol_t spRow, spCol; /* spare disk to which this buf should be written */
+ /* if dist sparing off, always identifies the replacement disk */
+ RF_SectorNum_t spOffset; /* offset into the spare disk */
+ /* if dist sparing off, identical to failedDiskSectorOffset */
+ RF_ReconParityStripeStatus_t *pssPtr; /* debug- pss associated with issue-pending write */
+};
+
+/* a reconstruction event descriptor. The event types currently are:
+ * RF_REVENT_READDONE -- a read operation has completed
+ * RF_REVENT_WRITEDONE -- a write operation has completed
+ * RF_REVENT_BUFREADY -- the buffer manager has produced a full buffer
+ * RF_REVENT_BLOCKCLEAR -- a reconstruction blockage has been cleared
+ * RF_REVENT_BUFCLEAR -- the buffer manager has released a process blocked on submission
+ * RF_REVENT_HEADSEPCLEAR -- a head-separation blockage has been cleared
+ * RF_REVENT_SKIP -- we need to skip the current RU and go on to the next one, typ. b/c we found recon forced
+ * RF_REVENT_FORCEDREADDONE -- a forced-reconstruction read operation has completed
+ */
+typedef enum RF_Revent_e {
+ RF_REVENT_READDONE,
+ RF_REVENT_WRITEDONE,
+ RF_REVENT_BUFREADY,
+ RF_REVENT_BLOCKCLEAR,
+ RF_REVENT_BUFCLEAR,
+ RF_REVENT_HEADSEPCLEAR,
+ RF_REVENT_SKIP,
+ RF_REVENT_FORCEDREADDONE
+} RF_Revent_t;
+
+struct RF_ReconEvent_s {
+ RF_Revent_t type; /* what kind of event has occurred */
+ RF_RowCol_t col; /* row ID is implicit in the queue in which the event is placed */
+ void *arg; /* a generic argument */
+ RF_ReconEvent_t *next;
+};
+
+/*
+ * Reconstruction control information maintained per-disk
+ * (for surviving disks)
+ */
+struct RF_PerDiskReconCtrl_s {
+ RF_ReconCtrl_t *reconCtrl;
+ RF_RowCol_t row, col; /* to make this structure self-identifying */
+ RF_StripeNum_t curPSID; /* the next parity stripe ID to check on this disk */
+ RF_HeadSepLimit_t headSepCounter; /* counter used to control maximum head separation */
+ RF_SectorNum_t diskOffset; /* the offset into the indicated disk of the current PU */
+ RF_ReconUnitNum_t ru_count; /* this counts off the recon units within each parity unit */
+ RF_ReconBuffer_t *rbuf; /* the recon buffer assigned to this disk */
+};
+
+/* main reconstruction control structure */
+struct RF_ReconCtrl_s {
+ RF_RaidReconDesc_t *reconDesc;
+ RF_RowCol_t fcol; /* which column has failed */
+ RF_PerDiskReconCtrl_t *perDiskInfo; /* information maintained per-disk */
+ RF_ReconMap_t *reconMap; /* map of what has/has not been reconstructed */
+ RF_RowCol_t spareRow; /* which of the spare disks we're using */
+ RF_RowCol_t spareCol;
+ RF_StripeNum_t lastPSID; /* the ID of the last parity stripe we want reconstructed */
+ int percentComplete; /* percentage completion of reconstruction */
+
+ /* reconstruction event queue */
+ RF_ReconEvent_t *eventQueue; /* queue of pending reconstruction events */
+ RF_DECLARE_MUTEX(eq_mutex) /* mutex for locking event queue */
+ RF_DECLARE_COND(eq_cond) /* condition variable for signalling recon events */
+ int eq_count; /* debug only */
+
+ /* reconstruction buffer management */
+ RF_DECLARE_MUTEX(rb_mutex) /* mutex for messing around with recon buffers */
+ RF_ReconBuffer_t *floatingRbufs; /* available floating reconstruction buffers */
+ RF_ReconBuffer_t *committedRbufs; /* recon buffers that have been committed to some waiting disk */
+ RF_ReconBuffer_t *fullBufferList; /* full buffers waiting to be written out */
+ RF_ReconBuffer_t *priorityList; /* full buffers that have been elevated to higher priority */
+ RF_CallbackDesc_t *bufferWaitList; /* disks that are currently blocked waiting for buffers */
+
+ /* parity stripe status table */
+ RF_PSStatusHeader_t *pssTable; /* stores the reconstruction status of active parity stripes */
+
+ /* maximum-head separation control */
+ RF_HeadSepLimit_t minHeadSepCounter; /* the minimum hs counter over all disks */
+ RF_CallbackDesc_t *headSepCBList; /* list of callbacks to be done as minPSID advances */
+
+ /* performance monitoring */
+ struct timeval starttime; /* recon start time */
+
+ void (*continueFunc)(void *); /* function to call when io returns*/
+ void *continueArg; /* argument for Func */
+};
+
+/* the default priority for reconstruction accesses */
+#define RF_IO_RECON_PRIORITY RF_IO_LOW_PRIORITY
+
+int rf_ConfigureReconstruction(RF_ShutdownList_t **listp);
+
+int rf_ReconstructFailedDisk(RF_Raid_t *raidPtr, RF_RowCol_t row,
+ RF_RowCol_t col);
+
+int rf_ReconstructFailedDiskBasic(RF_Raid_t *raidPtr, RF_RowCol_t row,
+ RF_RowCol_t col);
+
+int rf_ContinueReconstructFailedDisk(RF_RaidReconDesc_t *reconDesc);
+
+int rf_ForceOrBlockRecon(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
+ void (*cbFunc)(RF_Raid_t *,void *), void *cbArg);
+
+int rf_UnblockRecon(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap);
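+
+/*
+ * Intended call pattern (a sketch inferred from the function bodies in
+ * rf_reconstruct.c, not verbatim caller code): rf_ForceOrBlockRecon()
+ * returns 0 when no reconstruction is in progress on the parity stripe
+ * touched by asmap; the access may proceed, and the blockage installed by
+ * the call is later released with rf_UnblockRecon(raidPtr, asmap). It
+ * returns 1 when reconstruction is in progress, in which case cbFunc(raidPtr,
+ * cbArg) is invoked once the forced reconstruction of that parity stripe
+ * completes.
+ */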
+
+int rf_RegisterReconDoneProc(RF_Raid_t *raidPtr, void (*proc)(RF_Raid_t *, void *), void *arg,
+ RF_ReconDoneProc_t **handlep);
+
+#endif /* !_RF__RF_RECONSTRUCT_H_ */
diff --git a/sys/dev/raidframe/rf_reconstub.c b/sys/dev/raidframe/rf_reconstub.c
new file mode 100644
index 00000000000..2502462ea8b
--- /dev/null
+++ b/sys/dev/raidframe/rf_reconstub.c
@@ -0,0 +1,88 @@
+/* $OpenBSD: rf_reconstub.c,v 1.1 1999/01/11 14:29:47 niklas Exp $ */
+/* $NetBSD: rf_reconstub.c,v 1.1 1998/11/13 04:20:34 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/**************************************************************************
+ *
+ * rf_reconstub.c -- stub routines used when you don't want reconstruction
+ * in some particular instantiation of the raidframe
+ *
+ * this file also contains stubs for some reconstruction-related
+ * routines that we don't want compiled into the kernel.
+ *
+ * The OSF/1 kernel configuration includes an option "raidframe_recon". If
+ * enabled, most of this file is ifdef'd out.
+ *
+ **************************************************************************/
+
+/* :
+ * Log: rf_reconstub.c,v
+ * Revision 1.9 1996/05/24 01:59:45 jimz
+ * another checkpoint in code cleanup for release
+ * time to sync kernel tree
+ *
+ * Revision 1.8 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.7 1996/04/03 23:25:33 jimz
+ * make inclusion of raidframe_recon.h #ifdef KERNEL
+ *
+ * Revision 1.6 1995/12/06 15:06:54 root
+ * added copyright info
+ *
+ */
+
+#ifdef KERNEL
+#include <raidframe_recon.h>
+#endif /* KERNEL */
+#include <sys/errno.h>
+
+#if RAIDFRAME_RECON == 0
+
+int rf_ConfigureReconstruction() { return(0); }
+int rf_ConfigureReconEvent() { return(0); }
+int rf_ConfigurePSStatus() { return(0); }
+int rf_ConfigureNWayXor() { return(0); }
+int rf_ConfigureCopyback() { return(0); }
+int rf_ShutdownCopyback() { return(0); }
+int rf_ShutdownReconstruction() { return(0); }
+int rf_ShutdownReconEvent() { return(0); }
+int rf_ShutdownPSStatus() { return(0); }
+int rf_ShutdownNWayXor() { return(0); }
+
+int rf_ForceOrBlockRecon() { return(0); }
+int rf_UnblockRecon() { return(0); }
+int rf_ReconstructFailedDisk() { return(ENOTTY); }
+int rf_CheckRUReconstructed() { return(0); }
+
+void rf_start_cpu_monitor() {}
+void rf_stop_cpu_monitor() {}
+void rf_print_cpu_util() {}
+
+#endif /* RAIDFRAME_RECON == 0 */
diff --git a/sys/dev/raidframe/rf_reconutil.c b/sys/dev/raidframe/rf_reconutil.c
new file mode 100644
index 00000000000..51267198a7e
--- /dev/null
+++ b/sys/dev/raidframe/rf_reconutil.c
@@ -0,0 +1,408 @@
+/* $OpenBSD: rf_reconutil.c,v 1.1 1999/01/11 14:29:47 niklas Exp $ */
+/* $NetBSD: rf_reconutil.c,v 1.1 1998/11/13 04:20:34 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/********************************************
+ * rf_reconutil.c -- reconstruction utilities
+ ********************************************/
+
+/* :
+ * Log: rf_reconutil.c,v
+ * Revision 1.32 1996/07/29 14:05:12 jimz
+ * fix numPUs/numRUs confusion (everything is now numRUs)
+ * clean up some commenting, return values
+ *
+ * Revision 1.31 1996/07/15 05:40:41 jimz
+ * some recon datastructure cleanup
+ * better handling of multiple failures
+ * added undocumented double-recon test
+ *
+ * Revision 1.30 1996/07/13 00:00:59 jimz
+ * sanitized generalized reconstruction architecture
+ * cleaned up head sep, rbuf problems
+ *
+ * Revision 1.29 1996/06/19 17:53:48 jimz
+ * move GetNumSparePUs, InstallSpareTable ops into layout switch
+ *
+ * Revision 1.28 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.27 1996/06/05 18:06:02 jimz
+ * Major code cleanup. The Great Renaming is now done.
+ * Better modularity. Better typing. Fixed a bunch of
+ * synchronization bugs. Made a lot of global stuff
+ * per-desc or per-array. Removed dead code.
+ *
+ * Revision 1.26 1996/06/03 23:28:26 jimz
+ * more bugfixes
+ * check in tree to sync for IPDS runs with current bugfixes
+ * there still may be a problem with threads in the script test
+ * getting I/Os stuck- not trivially reproducible (runs ~50 times
+ * in a row without getting stuck)
+ *
+ * Revision 1.25 1996/06/02 17:31:48 jimz
+ * Moved a lot of global stuff into array structure, where it belongs.
+ * Fixed up paritylogging, pss modules in this manner. Some general
+ * code cleanup. Removed lots of dead code, some dead files.
+ *
+ * Revision 1.24 1996/05/31 22:26:54 jimz
+ * fix a lot of mapping problems, memory allocation problems
+ * found some weird lock issues, fixed 'em
+ * more code cleanup
+ *
+ * Revision 1.23 1996/05/30 23:22:16 jimz
+ * bugfixes of serialization, timing problems
+ * more cleanup
+ *
+ * Revision 1.22 1996/05/30 11:29:41 jimz
+ * Numerous bug fixes. Stripe lock release code disagreed with the taking code
+ * about when stripes should be locked (I made it consistent: no parity, no lock)
+ * There was a lot of extra serialization of I/Os which I've removed- a lot of
+ * it was to calculate values for the cache code, which is no longer with us.
+ * More types, function, macro cleanup. Added code to properly quiesce the array
+ * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
+ * before. Fixed memory allocation, freeing bugs.
+ *
+ * Revision 1.21 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.20 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.19 1996/05/20 16:14:55 jimz
+ * switch to rf_{mutex,cond}_{init,destroy}
+ *
+ * Revision 1.18 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.17 1995/12/12 18:10:06 jimz
+ * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT
+ * fix 80-column brain damage in comments
+ *
+ * Revision 1.16 1995/12/06 15:05:31 root
+ * added copyright info
+ *
+ */
+
+#include "rf_types.h"
+#include "rf_raid.h"
+#include "rf_desc.h"
+#include "rf_reconutil.h"
+#include "rf_reconbuffer.h"
+#include "rf_general.h"
+#include "rf_decluster.h"
+#include "rf_raid5_rotatedspare.h"
+#include "rf_interdecluster.h"
+#include "rf_chaindecluster.h"
+
+/*******************************************************************
+ * allocates/frees the reconstruction control information structures
+ *******************************************************************/
+RF_ReconCtrl_t *rf_MakeReconControl(reconDesc, frow, fcol, srow, scol)
+ RF_RaidReconDesc_t *reconDesc;
+ RF_RowCol_t frow; /* failed row and column */
+ RF_RowCol_t fcol;
+ RF_RowCol_t srow; /* identifies which spare we're using */
+ RF_RowCol_t scol;
+{
+ RF_Raid_t *raidPtr = reconDesc->raidPtr;
+ RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
+ RF_ReconUnitCount_t RUsPerPU = layoutPtr->SUsPerPU / layoutPtr->SUsPerRU;
+ RF_ReconUnitCount_t numSpareRUs;
+ RF_ReconCtrl_t *reconCtrlPtr;
+ RF_ReconBuffer_t *rbuf;
+ RF_LayoutSW_t *lp;
+ int retcode, rc;
+ RF_RowCol_t i;
+
+ lp = raidPtr->Layout.map;
+
+ /* make and zero the global reconstruction structure and the per-disk structure */
+ RF_Calloc(reconCtrlPtr, 1, sizeof(RF_ReconCtrl_t), (RF_ReconCtrl_t *));
+ RF_Calloc(reconCtrlPtr->perDiskInfo, raidPtr->numCol, sizeof(RF_PerDiskReconCtrl_t), (RF_PerDiskReconCtrl_t *)); /* this zeros it */
+ reconCtrlPtr->reconDesc = reconDesc;
+ reconCtrlPtr->fcol = fcol;
+ reconCtrlPtr->spareRow = srow;
+ reconCtrlPtr->spareCol = scol;
+ reconCtrlPtr->lastPSID = layoutPtr->numStripe/layoutPtr->SUsPerPU;
+ reconCtrlPtr->percentComplete = 0;
+
+ /* initialize each per-disk recon information structure */
+ for (i=0; i<raidPtr->numCol; i++) {
+ reconCtrlPtr->perDiskInfo[i].reconCtrl = reconCtrlPtr;
+ reconCtrlPtr->perDiskInfo[i].row = frow;
+ reconCtrlPtr->perDiskInfo[i].col = i;
+ reconCtrlPtr->perDiskInfo[i].curPSID = -1; /* make it appear as if we just finished an RU */
+ reconCtrlPtr->perDiskInfo[i].ru_count = RUsPerPU-1;
+ }
+
+ /* Get the number of spare units per disk and the sparemap in case spare is distributed */
+
+ if (lp->GetNumSpareRUs) {
+ numSpareRUs = lp->GetNumSpareRUs(raidPtr);
+ }
+ else {
+ numSpareRUs = 0;
+ }
+
+ /*
+ * Not all distributed sparing archs need dynamic mappings
+ */
+ if (lp->InstallSpareTable) {
+ retcode = rf_InstallSpareTable(raidPtr, frow, fcol);
+ if (retcode) {
+ RF_PANIC(); /* XXX fix this*/
+ }
+ }
+
+ /* make the reconstruction map */
+ reconCtrlPtr->reconMap = rf_MakeReconMap(raidPtr, (int) (layoutPtr->SUsPerRU * layoutPtr->sectorsPerStripeUnit),
+ raidPtr->sectorsPerDisk, numSpareRUs);
+
+ /* make the per-disk reconstruction buffers */
+ for (i=0; i<raidPtr->numCol; i++) {
+ reconCtrlPtr->perDiskInfo[i].rbuf = (i==fcol) ? NULL : rf_MakeReconBuffer(raidPtr, frow, i, RF_RBUF_TYPE_EXCLUSIVE);
+ }
+
+ /* initialize the event queue */
+ rc = rf_mutex_init(&reconCtrlPtr->eq_mutex);
+ if (rc) {
+ /* XXX deallocate, cleanup */
+ RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ return(NULL);
+ }
+ rc = rf_cond_init(&reconCtrlPtr->eq_cond);
+ if (rc) {
+ /* XXX deallocate, cleanup */
+ RF_ERRORMSG3("Unable to init cond file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ return(NULL);
+ }
+ reconCtrlPtr->eventQueue = NULL;
+ reconCtrlPtr->eq_count = 0;
+
+ /* make the floating recon buffers and append them to the free list */
+ rc = rf_mutex_init(&reconCtrlPtr->rb_mutex);
+ if (rc) {
+ /* XXX deallocate, cleanup */
+ RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ return(NULL);
+ }
+ reconCtrlPtr->fullBufferList= NULL;
+ reconCtrlPtr->priorityList = NULL;
+ reconCtrlPtr->floatingRbufs = NULL;
+ reconCtrlPtr->committedRbufs= NULL;
+ for (i=0; i<raidPtr->numFloatingReconBufs; i++) {
+ rbuf = rf_MakeReconBuffer(raidPtr, frow, fcol, RF_RBUF_TYPE_FLOATING);
+ rbuf->next = reconCtrlPtr->floatingRbufs;
+ reconCtrlPtr->floatingRbufs = rbuf;
+ }
+
+ /* create the parity stripe status table */
+ reconCtrlPtr->pssTable = rf_MakeParityStripeStatusTable(raidPtr);
+
+ /* set the initial min head sep counter val */
+ reconCtrlPtr->minHeadSepCounter = 0;
+
+ return(reconCtrlPtr);
+}
+
+void rf_FreeReconControl(raidPtr, row)
+ RF_Raid_t *raidPtr;
+ RF_RowCol_t row;
+{
+ RF_ReconCtrl_t *reconCtrlPtr = raidPtr->reconControl[row];
+ RF_ReconBuffer_t *t;
+ RF_ReconUnitNum_t i;
+
+ RF_ASSERT(reconCtrlPtr);
+ for (i=0; i<raidPtr->numCol; i++) if (reconCtrlPtr->perDiskInfo[i].rbuf) rf_FreeReconBuffer(reconCtrlPtr->perDiskInfo[i].rbuf);
+ for (i=0; i<raidPtr->numFloatingReconBufs; i++) {
+ t = reconCtrlPtr->floatingRbufs;
+ RF_ASSERT(t);
+ reconCtrlPtr->floatingRbufs = t->next;
+ rf_FreeReconBuffer(t);
+ }
+ rf_mutex_destroy(&reconCtrlPtr->rb_mutex);
+ rf_mutex_destroy(&reconCtrlPtr->eq_mutex);
+ rf_cond_destroy(&reconCtrlPtr->eq_cond);
+ rf_FreeReconMap(reconCtrlPtr->reconMap);
+ rf_FreeParityStripeStatusTable(raidPtr, reconCtrlPtr->pssTable);
+ RF_Free(reconCtrlPtr->perDiskInfo, raidPtr->numCol * sizeof(RF_PerDiskReconCtrl_t));
+ RF_Free(reconCtrlPtr, sizeof(*reconCtrlPtr));
+}
+
+
+/******************************************************************************
+ * computes the default head separation limit
+ *****************************************************************************/
+RF_HeadSepLimit_t rf_GetDefaultHeadSepLimit(raidPtr)
+ RF_Raid_t *raidPtr;
+{
+ RF_HeadSepLimit_t hsl;
+ RF_LayoutSW_t *lp;
+
+ lp = raidPtr->Layout.map;
+ if (lp->GetDefaultHeadSepLimit == NULL)
+ return(-1);
+ hsl = lp->GetDefaultHeadSepLimit(raidPtr);
+ return(hsl);
+}
+
+
+/******************************************************************************
+ * computes the default number of floating recon buffers
+ *****************************************************************************/
+int rf_GetDefaultNumFloatingReconBuffers(raidPtr)
+ RF_Raid_t *raidPtr;
+{
+ RF_LayoutSW_t *lp;
+ int nrb;
+
+ lp = raidPtr->Layout.map;
+ if (lp->GetDefaultNumFloatingReconBuffers == NULL)
+ return(3 * raidPtr->numCol);
+ nrb = lp->GetDefaultNumFloatingReconBuffers(raidPtr);
+ return(nrb);
+}
+
+
+/******************************************************************************
+ * creates and initializes a reconstruction buffer
+ *****************************************************************************/
+RF_ReconBuffer_t *rf_MakeReconBuffer(
+ RF_Raid_t *raidPtr,
+ RF_RowCol_t row,
+ RF_RowCol_t col,
+ RF_RbufType_t type)
+{
+ RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
+ RF_ReconBuffer_t *t;
+ u_int recon_buffer_size = rf_RaidAddressToByte(raidPtr, layoutPtr->SUsPerRU * layoutPtr->sectorsPerStripeUnit);
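+ /* e.g. with 1 SU per RU, 32 sectors per SU and 512-byte sectors this is
+ * a 16 KB buffer (illustrative numbers only; the real values come from the
+ * layout configuration, and we assume rf_RaidAddressToByte converts a
+ * sector count to bytes, as its name suggests) */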
+
+ RF_Malloc(t, sizeof(RF_ReconBuffer_t), (RF_ReconBuffer_t *));
+ RF_Malloc(t->buffer, recon_buffer_size, (caddr_t));
+ RF_Malloc(t->arrived, raidPtr->numCol * sizeof(char), (char *));
+ t->raidPtr = raidPtr;
+ t->row = row; t->col = col;
+ t->priority = RF_IO_RECON_PRIORITY;
+ t->type = type;
+ t->pssPtr = NULL;
+ t->next = NULL;
+ return(t);
+}
+
+/******************************************************************************
+ * frees a reconstruction buffer
+ *****************************************************************************/
+void rf_FreeReconBuffer(rbuf)
+ RF_ReconBuffer_t *rbuf;
+{
+ RF_Raid_t *raidPtr = rbuf->raidPtr;
+ u_int recon_buffer_size = rf_RaidAddressToByte(raidPtr, raidPtr->Layout.SUsPerRU * raidPtr->Layout.sectorsPerStripeUnit);
+
+ RF_Free(rbuf->arrived, raidPtr->numCol * sizeof(char));
+ RF_Free(rbuf->buffer, recon_buffer_size);
+ RF_Free(rbuf, sizeof(*rbuf));
+}
+
+
+/******************************************************************************
+ * debug only: sanity check the number of floating recon bufs in use
+ *****************************************************************************/
+void rf_CheckFloatingRbufCount(raidPtr, dolock)
+ RF_Raid_t *raidPtr;
+ int dolock;
+{
+ RF_ReconParityStripeStatus_t *p;
+ RF_PSStatusHeader_t *pssTable;
+ RF_ReconBuffer_t *rbuf;
+ int i, j, sum = 0;
+ RF_RowCol_t frow=0;
+
+ for (i=0; i<raidPtr->numRow; i++)
+ if (raidPtr->reconControl[i]) {
+ frow = i;
+ break;
+ }
+ RF_ASSERT(frow >= 0);
+
+ if (dolock)
+ RF_LOCK_MUTEX(raidPtr->reconControl[frow]->rb_mutex);
+ pssTable = raidPtr->reconControl[frow]->pssTable;
+
+ for (i=0; i<raidPtr->pssTableSize; i++) {
+ RF_LOCK_MUTEX(pssTable[i].mutex);
+ for (p = pssTable[i].chain; p; p=p->next) {
+ rbuf = (RF_ReconBuffer_t *) p->rbuf;
+ if (rbuf && rbuf->type == RF_RBUF_TYPE_FLOATING)
+ sum++;
+
+ rbuf = (RF_ReconBuffer_t *) p->writeRbuf;
+ if (rbuf && rbuf->type == RF_RBUF_TYPE_FLOATING)
+ sum++;
+
+ for (j=0; j<p->xorBufCount; j++) {
+ rbuf = (RF_ReconBuffer_t *) p->rbufsForXor[j];
+ RF_ASSERT(rbuf);
+ if (rbuf->type == RF_RBUF_TYPE_FLOATING)
+ sum++;
+ }
+ }
+ RF_UNLOCK_MUTEX(pssTable[i].mutex);
+ }
+
+ for (rbuf = raidPtr->reconControl[frow]->floatingRbufs; rbuf; rbuf = rbuf->next) {
+ if (rbuf->type == RF_RBUF_TYPE_FLOATING)
+ sum++;
+ }
+ for (rbuf = raidPtr->reconControl[frow]->committedRbufs; rbuf; rbuf = rbuf->next) {
+ if (rbuf->type == RF_RBUF_TYPE_FLOATING)
+ sum++;
+ }
+ for (rbuf = raidPtr->reconControl[frow]->fullBufferList; rbuf; rbuf = rbuf->next) {
+ if (rbuf->type == RF_RBUF_TYPE_FLOATING)
+ sum++;
+ }
+ for (rbuf = raidPtr->reconControl[frow]->priorityList; rbuf; rbuf = rbuf->next) {
+ if (rbuf->type == RF_RBUF_TYPE_FLOATING)
+ sum++;
+ }
+
+ RF_ASSERT(sum == raidPtr->numFloatingReconBufs);
+
+ if (dolock)
+ RF_UNLOCK_MUTEX(raidPtr->reconControl[frow]->rb_mutex);
+}
diff --git a/sys/dev/raidframe/rf_reconutil.h b/sys/dev/raidframe/rf_reconutil.h
new file mode 100644
index 00000000000..f4ea1c6f5f7
--- /dev/null
+++ b/sys/dev/raidframe/rf_reconutil.h
@@ -0,0 +1,96 @@
+/* $OpenBSD: rf_reconutil.h,v 1.1 1999/01/11 14:29:48 niklas Exp $ */
+/* $NetBSD: rf_reconutil.h,v 1.1 1998/11/13 04:20:34 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/************************************************************
+ * rf_reconutil.h -- header file for reconstruction utilities
+ ************************************************************/
+
+/* :
+ * Log: rf_reconutil.h,v
+ * Revision 1.10 1996/07/15 05:40:41 jimz
+ * some recon datastructure cleanup
+ * better handling of multiple failures
+ * added undocumented double-recon test
+ *
+ * Revision 1.9 1996/07/13 00:00:59 jimz
+ * sanitized generalized reconstruction architecture
+ * cleaned up head sep, rbuf problems
+ *
+ * Revision 1.8 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.7 1996/06/05 18:06:02 jimz
+ * Major code cleanup. The Great Renaming is now done.
+ * Better modularity. Better typing. Fixed a bunch of
+ * synchronization bugs. Made a lot of global stuff
+ * per-desc or per-array. Removed dead code.
+ *
+ * Revision 1.6 1996/06/03 23:28:26 jimz
+ * more bugfixes
+ * check in tree to sync for IPDS runs with current bugfixes
+ * there still may be a problem with threads in the script test
+ * getting I/Os stuck- not trivially reproducible (runs ~50 times
+ * in a row without getting stuck)
+ *
+ * Revision 1.5 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.4 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.3 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.2 1995/12/06 15:06:47 root
+ * added copyright info
+ *
+ */
+
+#ifndef _RF__RF_RECONUTIL_H_
+#define _RF__RF_RECONUTIL_H_
+
+#include "rf_types.h"
+#include "rf_reconstruct.h"
+
+RF_ReconCtrl_t *rf_MakeReconControl(RF_RaidReconDesc_t *reconDesc,
+ RF_RowCol_t frow, RF_RowCol_t fcol, RF_RowCol_t srow, RF_RowCol_t scol);
+void rf_FreeReconControl(RF_Raid_t *raidPtr, RF_RowCol_t row);
+RF_HeadSepLimit_t rf_GetDefaultHeadSepLimit(RF_Raid_t *raidPtr);
+int rf_GetDefaultNumFloatingReconBuffers(RF_Raid_t *raidPtr);
+RF_ReconBuffer_t *rf_MakeReconBuffer(RF_Raid_t *raidPtr, RF_RowCol_t row,
+ RF_RowCol_t col, RF_RbufType_t type);
+void rf_FreeReconBuffer(RF_ReconBuffer_t *rbuf);
+void rf_CheckFloatingRbufCount(RF_Raid_t *raidPtr, int dolock);
+
+#endif /* !_RF__RF_RECONUTIL_H_ */
diff --git a/sys/dev/raidframe/rf_revent.c b/sys/dev/raidframe/rf_revent.c
new file mode 100644
index 00000000000..c4236962b64
--- /dev/null
+++ b/sys/dev/raidframe/rf_revent.c
@@ -0,0 +1,306 @@
+/* $OpenBSD: rf_revent.c,v 1.1 1999/01/11 14:29:48 niklas Exp $ */
+/* $NetBSD: rf_revent.c,v 1.1 1998/11/13 04:20:34 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author:
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+/*
+ * revent.c -- reconstruction event handling code
+ */
+/*
+ * :
+ * Log: rf_revent.c,v
+ * Revision 1.22 1996/08/11 00:41:11 jimz
+ * extern hz only for kernel
+ *
+ * Revision 1.21 1996/07/15 05:40:41 jimz
+ * some recon datastructure cleanup
+ * better handling of multiple failures
+ * added undocumented double-recon test
+ *
+ * Revision 1.20 1996/06/17 03:18:04 jimz
+ * include shutdown.h for macroized ShutdownCreate
+ *
+ * Revision 1.19 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.18 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.17 1996/06/05 18:06:02 jimz
+ * Major code cleanup. The Great Renaming is now done.
+ * Better modularity. Better typing. Fixed a bunch of
+ * synchronization bugs. Made a lot of global stuff
+ * per-desc or per-array. Removed dead code.
+ *
+ * Revision 1.16 1996/06/03 23:28:26 jimz
+ * more bugfixes
+ * check in tree to sync for IPDS runs with current bugfixes
+ * there still may be a problem with threads in the script test
+ * getting I/Os stuck- not trivially reproducible (runs ~50 times
+ * in a row without getting stuck)
+ *
+ * Revision 1.15 1996/05/30 12:59:18 jimz
+ * make etimer happier, more portable
+ *
+ * Revision 1.14 1996/05/20 16:13:40 jimz
+ * switch to rf_{mutex,cond}_{init,destroy}
+ * use RF_FREELIST for revents
+ *
+ * Revision 1.13 1996/05/18 20:09:47 jimz
+ * bit of cleanup to compile cleanly in kernel, once again
+ *
+ * Revision 1.12 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ */
+
+#ifdef _KERNEL
+#define KERNEL
+#endif
+
+#include <sys/errno.h>
+
+#include "rf_raid.h"
+#include "rf_revent.h"
+#include "rf_etimer.h"
+#include "rf_general.h"
+#include "rf_freelist.h"
+#include "rf_desc.h"
+#include "rf_shutdown.h"
+
+static RF_FreeList_t *rf_revent_freelist;
+#define RF_MAX_FREE_REVENT 128
+#define RF_REVENT_INC 8
+#define RF_REVENT_INITIAL 8
+
+
+#ifdef KERNEL
+
+#include <sys/proc.h>
+
+extern int hz;
+
+#if !defined(__NetBSD__) && !defined(__OpenBSD__)
+#define DO_WAIT(_rc) mpsleep(&(_rc)->eventQueue, PZERO, "raidframe eventq", 0, \
+ (void *) simple_lock_addr((_rc)->eq_mutex), MS_LOCK_SIMPLE)
+#else
+#define DO_WAIT(_rc) tsleep(&(_rc)->eventQueue, PRIBIO | PCATCH, "raidframe eventq", 0)
+#endif
+
+#define DO_SIGNAL(_rc) wakeup(&(_rc)->eventQueue)
+
+#else /* KERNEL */
+
+#define DO_WAIT(_rc) RF_WAIT_COND((_rc)->eq_cond, (_rc)->eq_mutex)
+#define DO_SIGNAL(_rc) RF_SIGNAL_COND((_rc)->eq_cond)
+
+#endif /* KERNEL */
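+
+/*
+ * Descriptive note: in the kernel build, waiters sleep on the address of the
+ * event queue and are woken with wakeup(); in the user-level build the
+ * generic RF_WAIT_COND/RF_SIGNAL_COND wrappers on eq_cond/eq_mutex are used.
+ */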
+
+static void rf_ShutdownReconEvent(void *);
+
+static RF_ReconEvent_t *GetReconEventDesc(RF_RowCol_t row, RF_RowCol_t col,
+ void *arg, RF_Revent_t type);
+RF_ReconEvent_t *rf_GetNextReconEvent(RF_RaidReconDesc_t *,
+ RF_RowCol_t, void (*continueFunc)(void *),
+ void *);
+
+static void rf_ShutdownReconEvent(ignored)
+ void *ignored;
+{
+ RF_FREELIST_DESTROY(rf_revent_freelist,next,(RF_ReconEvent_t *));
+}
+
+int rf_ConfigureReconEvent(listp)
+ RF_ShutdownList_t **listp;
+{
+ int rc;
+
+ RF_FREELIST_CREATE(rf_revent_freelist, RF_MAX_FREE_REVENT,
+ RF_REVENT_INC, sizeof(RF_ReconEvent_t));
+ if (rf_revent_freelist == NULL)
+ return(ENOMEM);
+ rc = rf_ShutdownCreate(listp, rf_ShutdownReconEvent, NULL);
+ if (rc) {
+ RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ rf_ShutdownReconEvent(NULL);
+ return(rc);
+ }
+ RF_FREELIST_PRIME(rf_revent_freelist, RF_REVENT_INITIAL,next,
+ (RF_ReconEvent_t *));
+ return(0);
+}
+
+/* returns the next reconstruction event, blocking the calling thread until
+ * one becomes available. In the SIMULATE build it returns NULL when the
+ * queue is empty instead of blocking.
+ */
+
+RF_ReconEvent_t *rf_GetNextReconEvent(reconDesc, row, continueFunc, continueArg)
+ RF_RaidReconDesc_t *reconDesc;
+ RF_RowCol_t row;
+ void (*continueFunc)(void *);
+ void *continueArg;
+{
+ RF_Raid_t *raidPtr = reconDesc->raidPtr;
+ RF_ReconCtrl_t *rctrl = raidPtr->reconControl[row];
+ RF_ReconEvent_t *event;
+
+ RF_ASSERT( row >= 0 && row <= raidPtr->numRow );
+ RF_LOCK_MUTEX(rctrl->eq_mutex);
+ RF_ASSERT( (rctrl->eventQueue==NULL) == (rctrl->eq_count == 0)); /* q null and count==0 must be equivalent conditions */
+
+
+ rctrl->continueFunc=continueFunc;
+ rctrl->continueArg=continueArg;
+
+#ifdef SIMULATE
+ if (!rctrl->eventQueue) {
+ RF_UNLOCK_MUTEX(rctrl->eq_mutex);
+ return (NULL);
+ }
+#else /* SIMULATE */
+
+#ifdef KERNEL
+
+/* mpsleep timeout value: secs = timo_val/hz. 'ticks' here is defined as cycle-counter ticks, not softclock ticks */
+#define MAX_RECON_EXEC_TICKS 15000000 /* 150 MHz => this many ticks in 100 ms */
+#define RECON_DELAY_MS 25
+#define RECON_TIMO ((RECON_DELAY_MS * hz) / 1000)
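+/* e.g. with the traditional hz = 100 (an assumed, platform-dependent value),
+ * RECON_TIMO = (25 * 100) / 1000 = 2 softclock ticks, i.e. roughly 20 ms of
+ * delay each time the thread has run for MAX_RECON_EXEC_TICKS without blocking.
+ */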
+
+ /* We are not pre-emptible in the kernel, but we don't want to run forever. If we run w/o blocking
+ * for more than MAX_RECON_EXEC_TICKS ticks of the cycle counter, delay for RECON_DELAY_MS before continuing.
+ * This may murder us with context switches, so we may need to increase both MAX_RECON_EXEC_TICKS and RECON_DELAY_MS.
+ */
+ if (reconDesc->reconExecTimerRunning) {
+ int status;
+
+ RF_ETIMER_STOP(reconDesc->recon_exec_timer);
+ RF_ETIMER_EVAL(reconDesc->recon_exec_timer);
+ reconDesc->reconExecTicks += RF_ETIMER_VAL_TICKS(reconDesc->recon_exec_timer);
+ if (reconDesc->reconExecTicks > reconDesc->maxReconExecTicks)
+ reconDesc->maxReconExecTicks = reconDesc->reconExecTicks;
+ if (reconDesc->reconExecTicks >= MAX_RECON_EXEC_TICKS) {
+ /* we've been running too long. delay for RECON_DELAY_MS */
+#if RF_RECON_STATS > 0
+ reconDesc->numReconExecDelays++;
+#endif /* RF_RECON_STATS > 0 */
+#if !defined(__NetBSD__) && !defined(__OpenBSD__)
+ status = mpsleep(&reconDesc->reconExecTicks, PZERO, "recon delay", RECON_TIMO, (void *) simple_lock_addr(rctrl->eq_mutex), MS_LOCK_SIMPLE);
+#else
+ status = tsleep(&reconDesc->reconExecTicks, PRIBIO | PCATCH, "recon delay", RECON_TIMO );
+#endif
+ RF_ASSERT(status == EWOULDBLOCK);
+ reconDesc->reconExecTicks = 0;
+ }
+ }
+
+#endif /* KERNEL */
+
+ while (!rctrl->eventQueue) {
+#if RF_RECON_STATS > 0
+ reconDesc->numReconEventWaits++;
+#endif /* RF_RECON_STATS > 0 */
+ DO_WAIT(rctrl);
+#ifdef KERNEL
+ reconDesc->reconExecTicks = 0; /* we've just waited */
+#endif /* KERNEL */
+ }
+
+#endif /* SIMULATE */
+
+#ifdef KERNEL
+ reconDesc->reconExecTimerRunning = 1;
+ RF_ETIMER_START(reconDesc->recon_exec_timer);
+#endif /* KERNEL */
+
+ event = rctrl->eventQueue;
+ rctrl->eventQueue = event->next;
+ event->next = NULL;
+ rctrl->eq_count--;
+ RF_ASSERT( (rctrl->eventQueue==NULL) == (rctrl->eq_count == 0)); /* q null and count==0 must be equivalent conditions */
+ RF_UNLOCK_MUTEX(rctrl->eq_mutex);
+ return(event);
+}
+
+/* enqueues a reconstruction event on the indicated queue */
+void rf_CauseReconEvent(raidPtr, row, col, arg, type)
+ RF_Raid_t *raidPtr;
+ RF_RowCol_t row;
+ RF_RowCol_t col;
+ void *arg;
+ RF_Revent_t type;
+{
+ RF_ReconCtrl_t *rctrl = raidPtr->reconControl[row];
+ RF_ReconEvent_t *event = GetReconEventDesc(row, col, arg, type);
+
+ if (type == RF_REVENT_BUFCLEAR) {
+ RF_ASSERT(col != rctrl->fcol);
+ }
+
+ RF_ASSERT( row >= 0 && row <= raidPtr->numRow && col >=0 && col <= raidPtr->numCol );
+ RF_LOCK_MUTEX(rctrl->eq_mutex);
+ RF_ASSERT( (rctrl->eventQueue==NULL) == (rctrl->eq_count == 0)); /* q null and count==0 must be equivalent conditions */
+ event->next = rctrl->eventQueue;
+ rctrl->eventQueue = event;
+ rctrl->eq_count++;
+ RF_UNLOCK_MUTEX(rctrl->eq_mutex);
+
+#ifndef SIMULATE
+ DO_SIGNAL(rctrl);
+#else /* !SIMULATE */
+ (rctrl->continueFunc)(rctrl->continueArg);
+#endif /* !SIMULATE */
+}
+
+/* allocates and initializes a recon event descriptor */
+static RF_ReconEvent_t *GetReconEventDesc(row, col, arg, type)
+ RF_RowCol_t row;
+ RF_RowCol_t col;
+ void *arg;
+ RF_Revent_t type;
+{
+ RF_ReconEvent_t *t;
+
+ RF_FREELIST_GET(rf_revent_freelist,t,next,(RF_ReconEvent_t *));
+ if (t == NULL)
+ return(NULL);
+ t->col = col;
+ t->arg = arg;
+ t->type = type;
+ return(t);
+}
+
+void rf_FreeReconEventDesc(event)
+ RF_ReconEvent_t *event;
+{
+ RF_FREELIST_FREE(rf_revent_freelist,event,next);
+}
diff --git a/sys/dev/raidframe/rf_revent.h b/sys/dev/raidframe/rf_revent.h
new file mode 100644
index 00000000000..7029a8ef74d
--- /dev/null
+++ b/sys/dev/raidframe/rf_revent.h
@@ -0,0 +1,82 @@
+/* $OpenBSD: rf_revent.h,v 1.1 1999/01/11 14:29:48 niklas Exp $ */
+/* $NetBSD: rf_revent.h,v 1.1 1998/11/13 04:20:34 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author:
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*******************************************************************
+ *
+ * rf_revent.h -- header file for reconstruction event handling code
+ *
+ *******************************************************************/
+
+/* :
+ * Log: rf_revent.h,v
+ * Revision 1.7 1996/07/15 05:40:41 jimz
+ * some recon datastructure cleanup
+ * better handling of multiple failures
+ * added undocumented double-recon test
+ *
+ * Revision 1.6 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.5 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.4 1996/06/05 18:06:02 jimz
+ * Major code cleanup. The Great Renaming is now done.
+ * Better modularity. Better typing. Fixed a bunch of
+ * synchronization bugs. Made a lot of global stuff
+ * per-desc or per-array. Removed dead code.
+ *
+ * Revision 1.3 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.2 1995/12/06 15:04:20 root
+ * added copyright info
+ *
+ */
+
+#ifndef _RF__RF_REVENT_H_
+#define _RF__RF_REVENT_H_
+
+#include "rf_types.h"
+
+int rf_ConfigureReconEvent(RF_ShutdownList_t **listp);
+
+RF_ReconEvent_t *rf_GetNextReconEvent(RF_RaidReconDesc_t *reconDesc,
+ RF_RowCol_t row, void (*continueFunc)(void *), void *continueArg);
+
+void rf_CauseReconEvent(RF_Raid_t *raidPtr, RF_RowCol_t row, RF_RowCol_t col,
+ void *arg, RF_Revent_t type);
+
+void rf_FreeReconEventDesc(RF_ReconEvent_t *event);
+
+#endif /* !_RF__RF_REVENT_H_ */
diff --git a/sys/dev/raidframe/rf_rst.h b/sys/dev/raidframe/rf_rst.h
new file mode 100644
index 00000000000..06e66275cd2
--- /dev/null
+++ b/sys/dev/raidframe/rf_rst.h
@@ -0,0 +1,78 @@
+/* $OpenBSD: rf_rst.h,v 1.1 1999/01/11 14:29:49 niklas Exp $ */
+/* $NetBSD: rf_rst.h,v 1.1 1998/11/13 04:20:34 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* rf_rst.h - defines raidSim trace entry */
+
+/* :
+ * Log: rf_rst.h,v
+ * Revision 1.7 1996/07/18 22:57:14 jimz
+ * port simulator to AIX
+ *
+ * Revision 1.6 1996/06/03 23:28:26 jimz
+ * more bugfixes
+ * check in tree to sync for IPDS runs with current bugfixes
+ * there still may be a problem with threads in the script test
+ * getting I/Os stuck- not trivially reproducible (runs ~50 times
+ * in a row without getting stuck)
+ *
+ * Revision 1.5 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.4 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.3 1995/12/06 15:03:15 root
+ * added copyright info
+ *
+ */
+
+#ifndef _RF__RF_RST_H_
+#define _RF__RF_RST_H_
+
+#include "rf_types.h"
+
+typedef struct RF_ScriptTraceEntry_s {
+ RF_int32 blkno;
+ RF_int32 size;
+ double delay;
+ RF_int16 pid;
+ RF_int8 op;
+ RF_int8 async_flag;
+} RF_ScriptTraceEntry_t;
+
+typedef struct RF_ScriptTraceEntryList_s RF_ScriptTraceEntryList_t;
+struct RF_ScriptTraceEntryList_s {
+ RF_ScriptTraceEntry_t entry;
+ RF_ScriptTraceEntryList_t *next;
+};
+
+#endif /* !_RF__RF_RST_H_ */
diff --git a/sys/dev/raidframe/rf_shutdown.c b/sys/dev/raidframe/rf_shutdown.c
new file mode 100644
index 00000000000..3e0dfc96a37
--- /dev/null
+++ b/sys/dev/raidframe/rf_shutdown.c
@@ -0,0 +1,114 @@
+/* $OpenBSD: rf_shutdown.c,v 1.1 1999/01/11 14:29:49 niklas Exp $ */
+/* $NetBSD: rf_shutdown.c,v 1.1 1998/11/13 04:20:34 oster Exp $ */
+/*
+ * rf_shutdown.c
+ */
+/*
+ * Copyright (c) 1996 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+/*
+ * Maintain lists of cleanup functions. Also, mechanisms for coordinating
+ * thread startup and shutdown.
+ */
+
+#include "rf_types.h"
+#include "rf_threadstuff.h"
+#include "rf_shutdown.h"
+#include "rf_debugMem.h"
+#include "rf_freelist.h"
+#include "rf_threadid.h"
+
+static void rf_FreeShutdownEnt(RF_ShutdownList_t *ent)
+{
+#ifdef KERNEL
+ FREE(ent, M_DEVBUF);
+#else /* KERNEL */
+ free(ent);
+#endif /* KERNEL */
+}
+
+int _rf_ShutdownCreate(
+ RF_ShutdownList_t **listp,
+ void (*cleanup)(void *arg),
+ void *arg,
+ char *file,
+ int line)
+{
+ RF_ShutdownList_t *ent;
+
+ /*
+ * Have to directly allocate memory here, since we start up before
+ * and shutdown after RAIDframe internal allocation system.
+ */
+#ifdef KERNEL
+ ent = (RF_ShutdownList_t *)malloc( sizeof(RF_ShutdownList_t), M_DEVBUF, M_WAITOK);
+#if 0
+ MALLOC(ent, RF_ShutdownList_t *, sizeof(RF_ShutdownList_t), M_DEVBUF, M_WAITOK);
+#endif
+#else /* KERNEL */
+ ent = (RF_ShutdownList_t *)malloc(sizeof(RF_ShutdownList_t));
+#endif /* KERNEL */
+ if (ent == NULL)
+ return(ENOMEM);
+ ent->cleanup = cleanup;
+ ent->arg = arg;
+ ent->file = file;
+ ent->line = line;
+ ent->next = *listp;
+ *listp = ent;
+ return(0);
+}
+
+int rf_ShutdownList(RF_ShutdownList_t **list)
+{
+ RF_ShutdownList_t *r, *next;
+ char *file;
+ int line;
+
+ for(r=*list;r;r=next) {
+ next = r->next;
+ file = r->file;
+ line = r->line;
+
+ if (rf_shutdownDebug) {
+ int tid;
+ rf_get_threadid(tid);
+ printf("[%d] call shutdown, created %s:%d\n", tid, file, line);
+ }
+
+ r->cleanup(r->arg);
+
+ if (rf_shutdownDebug) {
+ int tid;
+ rf_get_threadid(tid);
+ printf("[%d] completed shutdown, created %s:%d\n", tid, file, line);
+ }
+
+ rf_FreeShutdownEnt(r);
+ }
+ *list = NULL;
+ return(0);
+}
diff --git a/sys/dev/raidframe/rf_shutdown.h b/sys/dev/raidframe/rf_shutdown.h
new file mode 100644
index 00000000000..bddfe7f9c0d
--- /dev/null
+++ b/sys/dev/raidframe/rf_shutdown.h
@@ -0,0 +1,68 @@
+/* $OpenBSD: rf_shutdown.h,v 1.1 1999/01/11 14:29:49 niklas Exp $ */
+/* $NetBSD: rf_shutdown.h,v 1.1 1998/11/13 04:20:34 oster Exp $ */
+/*
+ * rf_shutdown.h
+ */
+/*
+ * Copyright (c) 1996 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+/*
+ * Maintain lists of cleanup functions. Also, mechanisms for coordinating
+ * thread startup and shutdown.
+ */
+
+#ifndef _RF__RF_SHUTDOWN_H_
+#define _RF__RF_SHUTDOWN_H_
+
+#include "rf_types.h"
+#include "rf_threadstuff.h"
+
+/*
+ * Important note: the shutdown list is run like a stack, new
+ * entries pushed on top. Therefore, the most recently added
+ * entry (last started) is the first removed (stopped). This
+ * should handle inter-subsystem dependencies pretty nicely: if
+ * one subsystem is up when you start another, it will still be
+ * up when you shut the other down. Hopefully, this subsystem
+ * will remove more complexity than it introduces.
+ */
+
+struct RF_ShutdownList_s {
+ void (*cleanup)(void *arg);
+ void *arg;
+ char *file;
+ int line;
+ RF_ShutdownList_t *next;
+};
+
+#define rf_ShutdownCreate(_listp_,_func_,_arg_) \
+ _rf_ShutdownCreate(_listp_,_func_,_arg_,__FILE__,__LINE__)
+
+int _rf_ShutdownCreate(RF_ShutdownList_t **listp, void (*cleanup)(void *arg),
+ void *arg, char *file, int line);
+int rf_ShutdownList(RF_ShutdownList_t **listp);
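+
+/*
+ * Typical use (a sketch; "rf_ShutdownFoo" is a hypothetical cleanup routine):
+ *
+ *     rc = rf_ShutdownCreate(listp, rf_ShutdownFoo, NULL);
+ *     if (rc) {
+ *             rf_ShutdownFoo(NULL);
+ *             return(rc);
+ *     }
+ *
+ * This mirrors how rf_ConfigureReconEvent() registers rf_ShutdownReconEvent
+ * in rf_revent.c; rf_ShutdownList(listp) later pops and runs the registered
+ * hooks in LIFO order.
+ */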
+
+#endif /* !_RF__RF_SHUTDOWN_H_ */
diff --git a/sys/dev/raidframe/rf_sstf.c b/sys/dev/raidframe/rf_sstf.c
new file mode 100644
index 00000000000..21d97eef046
--- /dev/null
+++ b/sys/dev/raidframe/rf_sstf.c
@@ -0,0 +1,717 @@
+/* $OpenBSD: rf_sstf.c,v 1.1 1999/01/11 14:29:50 niklas Exp $ */
+/* $NetBSD: rf_sstf.c,v 1.1 1998/11/13 04:20:34 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*******************************************************************************
+ *
+ * sstf.c -- prioritized shortest seek time first disk queueing code
+ *
+ ******************************************************************************/
+
+/*
+ * :
+ * Log: rf_sstf.c,v
+ * Revision 1.7 1996/06/19 14:09:56 jimz
+ * SstfPeek wasn't calling closest_to_arm() properly- would bogart
+ * low priority I/Os
+ *
+ * Revision 1.6 1996/06/18 20:53:11 jimz
+ * fix up disk queueing (remove configure routine,
+ * add shutdown list arg to create routines)
+ *
+ * Revision 1.5 1996/06/13 20:42:13 jimz
+ * add scan, cscan
+ *
+ * Revision 1.4 1996/06/07 22:26:27 jimz
+ * type-ify which_ru (RF_ReconUnitNum_t)
+ *
+ * Revision 1.3 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.2 1996/06/06 01:11:35 jimz
+ * fixed many priority-related bugs
+ *
+ * Revision 1.1 1996/06/05 19:17:40 jimz
+ * Initial revision
+ *
+ */
+
+#include "rf_alloclist.h"
+#include "rf_stripelocks.h"
+#include "rf_layout.h"
+#include "rf_diskqueue.h"
+#include "rf_sstf.h"
+#include "rf_debugMem.h"
+#include "rf_general.h"
+#include "rf_threadid.h"
+#include "rf_options.h"
+
+#define DIR_LEFT 1
+#define DIR_RIGHT 2
+#define DIR_EITHER 3
+
+#define SNUM_DIFF(_a_,_b_) (((_a_)>(_b_))?((_a_)-(_b_)):((_b_)-(_a_)))
+
+#define QSUM(_sstfq_) (((_sstfq_)->lopri.qlen)+((_sstfq_)->left.qlen)+((_sstfq_)->right.qlen))
+
+
+static void do_sstf_ord_q(RF_DiskQueueData_t **,
+ RF_DiskQueueData_t **,
+ RF_DiskQueueData_t *);
+
+static RF_DiskQueueData_t *closest_to_arm(RF_SstfQ_t *,
+ RF_SectorNum_t,
+ int *,
+ int);
+static void do_dequeue(RF_SstfQ_t *, RF_DiskQueueData_t *);
+
+
+static void do_sstf_ord_q(queuep, tailp, req)
+ RF_DiskQueueData_t **queuep;
+ RF_DiskQueueData_t **tailp;
+ RF_DiskQueueData_t *req;
+{
+ RF_DiskQueueData_t *r, *s;
+
+ if (*queuep == NULL) {
+ *queuep = req;
+ *tailp = req;
+ req->next = NULL;
+ req->prev = NULL;
+ return;
+ }
+ if (req->sectorOffset <= (*queuep)->sectorOffset) {
+ req->next = *queuep;
+ req->prev = NULL;
+ (*queuep)->prev = req;
+ *queuep = req;
+ return;
+ }
+ if (req->sectorOffset > (*tailp)->sectorOffset) {
+ /* optimization */
+ r = NULL;
+ s = *tailp;
+ goto q_at_end;
+ }
+ for(s=NULL,r=*queuep;r;s=r,r=r->next) {
+ if (r->sectorOffset >= req->sectorOffset) {
+ /* insert after s, before r */
+ RF_ASSERT(s);
+ req->next = r;
+ r->prev = req;
+ s->next = req;
+ req->prev = s;
+ return;
+ }
+ }
+q_at_end:
+ /* insert after s, at end of queue */
+ RF_ASSERT(r == NULL);
+ RF_ASSERT(s);
+ RF_ASSERT(s == (*tailp));
+ req->next = NULL;
+ req->prev = s;
+ s->next = req;
+ *tailp = req;
+}
+
+/* for removing from head-of-queue */
+#define DO_HEAD_DEQ(_r_,_q_) { \
+ _r_ = (_q_)->queue; \
+ RF_ASSERT((_r_) != NULL); \
+ (_q_)->queue = (_r_)->next; \
+ (_q_)->qlen--; \
+ if ((_q_)->qlen == 0) { \
+ RF_ASSERT((_r_) == (_q_)->qtail); \
+ RF_ASSERT((_q_)->queue == NULL); \
+ (_q_)->qtail = NULL; \
+ } \
+ else { \
+ RF_ASSERT((_q_)->queue->prev == (_r_)); \
+ (_q_)->queue->prev = NULL; \
+ } \
+}
+
+/* for removing from end-of-queue */
+#define DO_TAIL_DEQ(_r_,_q_) { \
+ _r_ = (_q_)->qtail; \
+ RF_ASSERT((_r_) != NULL); \
+ (_q_)->qtail = (_r_)->prev; \
+ (_q_)->qlen--; \
+ if ((_q_)->qlen == 0) { \
+ RF_ASSERT((_r_) == (_q_)->queue); \
+ RF_ASSERT((_q_)->qtail == NULL); \
+ (_q_)->queue = NULL; \
+ } \
+ else { \
+ RF_ASSERT((_q_)->qtail->next == (_r_)); \
+ (_q_)->qtail->next = NULL; \
+ } \
+}
+
+#define DO_BEST_DEQ(_l_,_r_,_q_) { \
+ if (SNUM_DIFF((_q_)->queue->sectorOffset,_l_) \
+ < SNUM_DIFF((_q_)->qtail->sectorOffset,_l_)) \
+ { \
+ DO_HEAD_DEQ(_r_,_q_); \
+ } \
+ else { \
+ DO_TAIL_DEQ(_r_,_q_); \
+ } \
+}
+
+static RF_DiskQueueData_t *closest_to_arm(queue, arm_pos, dir, allow_reverse)
+ RF_SstfQ_t *queue;
+ RF_SectorNum_t arm_pos;
+ int *dir;
+ int allow_reverse;
+{
+ RF_SectorNum_t best_pos_l=0, this_pos_l=0, last_pos=0;
+ RF_SectorNum_t best_pos_r=0, this_pos_r=0;
+ RF_DiskQueueData_t *r, *best_l, *best_r;
+
+ best_r = best_l = NULL;
+ for(r=queue->queue;r;r=r->next) {
+ if (r->sectorOffset < arm_pos) {
+ if (best_l == NULL) {
+ best_l = r;
+ last_pos = best_pos_l = this_pos_l;
+ }
+ else {
+ this_pos_l = arm_pos - r->sectorOffset;
+ if (this_pos_l < best_pos_l) {
+ best_l = r;
+ last_pos = best_pos_l = this_pos_l;
+ }
+ else {
+ last_pos = this_pos_l;
+ }
+ }
+ }
+ else {
+ if (best_r == NULL) {
+ best_r = r;
+ last_pos = best_pos_r = this_pos_r;
+ }
+ else {
+ this_pos_r = r->sectorOffset - arm_pos;
+ if (this_pos_r < best_pos_r) {
+ best_r = r;
+ last_pos = best_pos_r = this_pos_r;
+ }
+ else {
+ last_pos = this_pos_r;
+ }
+ if (this_pos_r > last_pos) {
+ /* getting farther away */
+ break;
+ }
+ }
+ }
+ }
+ if ((best_r == NULL) && (best_l == NULL))
+ return(NULL);
+ if ((*dir == DIR_RIGHT) && best_r)
+ return(best_r);
+ if ((*dir == DIR_LEFT) && best_l)
+ return(best_l);
+ if (*dir == DIR_EITHER) {
+ if (best_l == NULL)
+ return(best_r);
+ if (best_r == NULL)
+ return(best_l);
+ if (best_pos_r < best_pos_l)
+ return(best_r);
+ else
+ return(best_l);
+ }
+ /*
+ * Nothing in the direction we want to go. Reverse or
+ * reset the arm. We know we have an I/O in the other
+ * direction.
+ */
+ if (allow_reverse) {
+ if (*dir == DIR_RIGHT) {
+ *dir = DIR_LEFT;
+ return(best_l);
+ }
+ else {
+ *dir = DIR_RIGHT;
+ return(best_r);
+ }
+ }
+ /*
+ * Reset (beginning of queue).
+ */
+ RF_ASSERT(*dir == DIR_RIGHT);
+ return(queue->queue);
+}
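
closest_to_arm above implements the shortest-seek choice: among queued requests, pick the one whose sectorOffset is nearest the current arm position, subject to the sweep direction. A simplified, self-contained sketch of that selection over a plain array (hypothetical data, not RAIDframe types):

    #include <stdio.h>
    #include <stdlib.h>

    /* shortest-seek pick: smallest |offset - arm| wins */
    static long closest(const long *offs, int n, long arm)
    {
            long best = offs[0], bestd = labs(offs[0] - arm);
            int i;

            for (i = 1; i < n; i++) {
                    long d = labs(offs[i] - arm);
                    if (d < bestd) {
                            bestd = d;
                            best = offs[i];
                    }
            }
            return (best);
    }

    int main(void)
    {
            long q[] = { 10, 480, 500, 900 };

            /* arm at 495: 500 is picked (distance 5 beats 480's 15) */
            printf("%ld\n", closest(q, 4, 495));
            return (0);
    }
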
+
+void *rf_SstfCreate(sect_per_disk, cl_list, listp)
+ RF_SectorCount_t sect_per_disk;
+ RF_AllocListElem_t *cl_list;
+ RF_ShutdownList_t **listp;
+{
+ RF_Sstf_t *sstfq;
+
+ RF_CallocAndAdd(sstfq, 1, sizeof(RF_Sstf_t), (RF_Sstf_t *), cl_list);
+ sstfq->dir = DIR_EITHER;
+ sstfq->allow_reverse = 1;
+ return((void *)sstfq);
+}
+
+void *rf_ScanCreate(sect_per_disk, cl_list, listp)
+ RF_SectorCount_t sect_per_disk;
+ RF_AllocListElem_t *cl_list;
+ RF_ShutdownList_t **listp;
+{
+ RF_Sstf_t *scanq;
+
+ RF_CallocAndAdd(scanq, 1, sizeof(RF_Sstf_t), (RF_Sstf_t *), cl_list);
+ scanq->dir = DIR_RIGHT;
+ scanq->allow_reverse = 1;
+ return((void *)scanq);
+}
+
+void *rf_CscanCreate(sect_per_disk, cl_list, listp)
+ RF_SectorCount_t sect_per_disk;
+ RF_AllocListElem_t *cl_list;
+ RF_ShutdownList_t **listp;
+{
+ RF_Sstf_t *cscanq;
+
+ RF_CallocAndAdd(cscanq, 1, sizeof(RF_Sstf_t), (RF_Sstf_t *), cl_list);
+ cscanq->dir = DIR_RIGHT;
+ return((void *)cscanq);
+}
+
+void rf_SstfEnqueue(qptr, req, priority)
+ void *qptr;
+ RF_DiskQueueData_t *req;
+ int priority;
+{
+ RF_Sstf_t *sstfq;
+
+ sstfq = (RF_Sstf_t *)qptr;
+
+ if (priority == RF_IO_LOW_PRIORITY) {
+ if (rf_sstfDebug || rf_scanDebug || rf_cscanDebug) {
+ RF_DiskQueue_t *dq;
+ int tid;
+ rf_get_threadid(tid);
+ dq = (RF_DiskQueue_t *)req->queue;
+ printf("[%d] ENQ lopri %d,%d queues are %d,%d,%d\n",
+ tid, dq->row, dq->col, sstfq->left.qlen, sstfq->right.qlen,
+ sstfq->lopri.qlen);
+ }
+ do_sstf_ord_q(&sstfq->lopri.queue, &sstfq->lopri.qtail, req);
+ sstfq->lopri.qlen++;
+ }
+ else {
+ if (req->sectorOffset < sstfq->last_sector) {
+ do_sstf_ord_q(&sstfq->left.queue, &sstfq->left.qtail, req);
+ sstfq->left.qlen++;
+ }
+ else {
+ do_sstf_ord_q(&sstfq->right.queue, &sstfq->right.qtail, req);
+ sstfq->right.qlen++;
+ }
+ }
+}
+
+static void do_dequeue(queue, req)
+ RF_SstfQ_t *queue;
+ RF_DiskQueueData_t *req;
+{
+ RF_DiskQueueData_t *req2;
+
+ if (rf_sstfDebug || rf_scanDebug || rf_cscanDebug) {
+ int tid;
+ rf_get_threadid(tid);
+ printf("[%d] do_dequeue\n", tid);
+ }
+ if (req == queue->queue) {
+ DO_HEAD_DEQ(req2,queue);
+ RF_ASSERT(req2 == req);
+ }
+ else if (req == queue->qtail) {
+ DO_TAIL_DEQ(req2,queue);
+ RF_ASSERT(req2 == req);
+ }
+ else {
+ /* dequeue from middle of list */
+ RF_ASSERT(req->next);
+ RF_ASSERT(req->prev);
+ queue->qlen--;
+ req->next->prev = req->prev;
+ req->prev->next = req->next;
+ req->next = req->prev = NULL;
+ }
+}
+
+RF_DiskQueueData_t *rf_SstfDequeue(qptr)
+ void *qptr;
+{
+ RF_DiskQueueData_t *req=NULL;
+ RF_Sstf_t *sstfq;
+
+ sstfq = (RF_Sstf_t *)qptr;
+
+ if (rf_sstfDebug) {
+ RF_DiskQueue_t *dq;
+ int tid;
+ rf_get_threadid(tid);
+ dq = (RF_DiskQueue_t *)req->queue;
+ RF_ASSERT(QSUM(sstfq)==dq->queueLength);
+ printf("[%d] sstf: Dequeue %d,%d queues are %d,%d,%d\n", tid,
+ dq->row, dq->col, sstfq->left.qlen, sstfq->right.qlen,
+ sstfq->lopri.qlen);
+ }
+ if (sstfq->left.queue == NULL) {
+ RF_ASSERT(sstfq->left.qlen == 0);
+ if (sstfq->right.queue == NULL) {
+ RF_ASSERT(sstfq->right.qlen == 0);
+ if (sstfq->lopri.queue == NULL) {
+ RF_ASSERT(sstfq->lopri.qlen == 0);
+ return(NULL);
+ }
+ if (rf_sstfDebug) {
+ int tid;
+ rf_get_threadid(tid);
+				printf("[%d] sstf: check for close lopri\n", tid);
+ }
+ req = closest_to_arm(&sstfq->lopri, sstfq->last_sector,
+ &sstfq->dir, sstfq->allow_reverse);
+ if (rf_sstfDebug) {
+ int tid;
+ rf_get_threadid(tid);
+				printf("[%d] sstf: closest_to_arm said %lx\n", tid, (long)req);
+ }
+ if (req == NULL)
+ return(NULL);
+ do_dequeue(&sstfq->lopri, req);
+ }
+ else {
+ DO_BEST_DEQ(sstfq->last_sector,req,&sstfq->right);
+ }
+ }
+ else {
+ if (sstfq->right.queue == NULL) {
+ RF_ASSERT(sstfq->right.qlen == 0);
+ DO_BEST_DEQ(sstfq->last_sector,req,&sstfq->left);
+ }
+ else {
+ if (SNUM_DIFF(sstfq->last_sector,sstfq->right.queue->sectorOffset)
+ < SNUM_DIFF(sstfq->last_sector,sstfq->left.qtail->sectorOffset))
+ {
+ DO_HEAD_DEQ(req,&sstfq->right);
+ }
+ else {
+ DO_TAIL_DEQ(req,&sstfq->left);
+ }
+ }
+ }
+ RF_ASSERT(req);
+ sstfq->last_sector = req->sectorOffset;
+ return(req);
+}
+
+RF_DiskQueueData_t *rf_ScanDequeue(qptr)
+ void *qptr;
+{
+ RF_DiskQueueData_t *req=NULL;
+ RF_Sstf_t *scanq;
+
+ scanq = (RF_Sstf_t *)qptr;
+
+ if (rf_scanDebug) {
+ RF_DiskQueue_t *dq;
+ int tid;
+ rf_get_threadid(tid);
+ dq = (RF_DiskQueue_t *)req->queue;
+ RF_ASSERT(QSUM(scanq)==dq->queueLength);
+ printf("[%d] scan: Dequeue %d,%d queues are %d,%d,%d\n", tid,
+ dq->row, dq->col, scanq->left.qlen, scanq->right.qlen,
+ scanq->lopri.qlen);
+ }
+ if (scanq->left.queue == NULL) {
+ RF_ASSERT(scanq->left.qlen == 0);
+ if (scanq->right.queue == NULL) {
+ RF_ASSERT(scanq->right.qlen == 0);
+ if (scanq->lopri.queue == NULL) {
+ RF_ASSERT(scanq->lopri.qlen == 0);
+ return(NULL);
+ }
+ req = closest_to_arm(&scanq->lopri, scanq->last_sector,
+ &scanq->dir, scanq->allow_reverse);
+ if (req == NULL)
+ return(NULL);
+ do_dequeue(&scanq->lopri, req);
+ }
+ else {
+ scanq->dir = DIR_RIGHT;
+ DO_HEAD_DEQ(req,&scanq->right);
+ }
+ }
+ else if (scanq->right.queue == NULL) {
+ RF_ASSERT(scanq->right.qlen == 0);
+ RF_ASSERT(scanq->left.queue);
+ scanq->dir = DIR_LEFT;
+ DO_TAIL_DEQ(req,&scanq->left);
+ }
+ else {
+ RF_ASSERT(scanq->right.queue);
+ RF_ASSERT(scanq->left.queue);
+ if (scanq->dir == DIR_RIGHT) {
+ DO_HEAD_DEQ(req,&scanq->right);
+ }
+ else {
+ DO_TAIL_DEQ(req,&scanq->left);
+ }
+ }
+ RF_ASSERT(req);
+ scanq->last_sector = req->sectorOffset;
+ return(req);
+}
+
+RF_DiskQueueData_t *rf_CscanDequeue(qptr)
+ void *qptr;
+{
+ RF_DiskQueueData_t *req=NULL;
+ RF_Sstf_t *cscanq;
+
+ cscanq = (RF_Sstf_t *)qptr;
+
+ RF_ASSERT(cscanq->dir == DIR_RIGHT);
+ if (rf_cscanDebug) {
+ RF_DiskQueue_t *dq;
+ int tid;
+ rf_get_threadid(tid);
+ dq = (RF_DiskQueue_t *)req->queue;
+ RF_ASSERT(QSUM(cscanq)==dq->queueLength);
+ printf("[%d] scan: Dequeue %d,%d queues are %d,%d,%d\n", tid,
+ dq->row, dq->col, cscanq->left.qlen, cscanq->right.qlen,
+ cscanq->lopri.qlen);
+ }
+ if (cscanq->right.queue) {
+ DO_HEAD_DEQ(req,&cscanq->right);
+ }
+ else {
+ RF_ASSERT(cscanq->right.qlen == 0);
+ if (cscanq->left.queue == NULL) {
+ RF_ASSERT(cscanq->left.qlen == 0);
+ if (cscanq->lopri.queue == NULL) {
+ RF_ASSERT(cscanq->lopri.qlen == 0);
+ return(NULL);
+ }
+ req = closest_to_arm(&cscanq->lopri, cscanq->last_sector,
+ &cscanq->dir, cscanq->allow_reverse);
+ if (req == NULL)
+ return(NULL);
+ do_dequeue(&cscanq->lopri, req);
+ }
+ else {
+ /*
+			 * There are I/Os to the left of the arm. Swing
+			 * back (swap queues).
+ */
+ cscanq->right = cscanq->left;
+ cscanq->left.qlen = 0;
+ cscanq->left.queue = cscanq->left.qtail = NULL;
+ DO_HEAD_DEQ(req,&cscanq->right);
+ }
+ }
+ RF_ASSERT(req);
+ cscanq->last_sector = req->sectorOffset;
+ return(req);
+}
+
+RF_DiskQueueData_t *rf_SstfPeek(qptr)
+ void *qptr;
+{
+ RF_DiskQueueData_t *req;
+ RF_Sstf_t *sstfq;
+
+ sstfq = (RF_Sstf_t *)qptr;
+
+ if ((sstfq->left.queue == NULL) && (sstfq->right.queue == NULL)) {
+ req = closest_to_arm(&sstfq->lopri, sstfq->last_sector, &sstfq->dir,
+ sstfq->allow_reverse);
+ }
+ else {
+ if (sstfq->left.queue == NULL)
+ req = sstfq->right.queue;
+ else {
+ if (sstfq->right.queue == NULL)
+ req = sstfq->left.queue;
+ else {
+ if (SNUM_DIFF(sstfq->last_sector,sstfq->right.queue->sectorOffset)
+ <SNUM_DIFF(sstfq->last_sector,sstfq->left.qtail->sectorOffset))
+ {
+ req = sstfq->right.queue;
+ }
+ else {
+ req = sstfq->left.qtail;
+ }
+ }
+ }
+ }
+ if (req == NULL) {
+ RF_ASSERT(QSUM(sstfq) == 0);
+ }
+ return(req);
+}
+
+RF_DiskQueueData_t *rf_ScanPeek(qptr)
+ void *qptr;
+{
+ RF_DiskQueueData_t *req;
+ RF_Sstf_t *scanq;
+ int dir;
+
+ scanq = (RF_Sstf_t *)qptr;
+ dir = scanq->dir;
+
+ if (scanq->left.queue == NULL) {
+ RF_ASSERT(scanq->left.qlen == 0);
+ if (scanq->right.queue == NULL) {
+ RF_ASSERT(scanq->right.qlen == 0);
+ if (scanq->lopri.queue == NULL) {
+ RF_ASSERT(scanq->lopri.qlen == 0);
+ return(NULL);
+ }
+ req = closest_to_arm(&scanq->lopri, scanq->last_sector,
+ &dir, scanq->allow_reverse);
+ }
+ else {
+ req = scanq->right.queue;
+ }
+ }
+ else if (scanq->right.queue == NULL) {
+ RF_ASSERT(scanq->right.qlen == 0);
+ RF_ASSERT(scanq->left.queue);
+ req = scanq->left.qtail;
+ }
+ else {
+ RF_ASSERT(scanq->right.queue);
+ RF_ASSERT(scanq->left.queue);
+ if (scanq->dir == DIR_RIGHT) {
+ req = scanq->right.queue;
+ }
+ else {
+ req = scanq->left.qtail;
+ }
+ }
+ if (req == NULL) {
+ RF_ASSERT(QSUM(scanq) == 0);
+ }
+ return(req);
+}
+
+RF_DiskQueueData_t *rf_CscanPeek(qptr)
+ void *qptr;
+{
+ RF_DiskQueueData_t *req;
+ RF_Sstf_t *cscanq;
+
+ cscanq = (RF_Sstf_t *)qptr;
+
+ RF_ASSERT(cscanq->dir == DIR_RIGHT);
+ if (cscanq->right.queue) {
+ req = cscanq->right.queue;
+ }
+ else {
+ RF_ASSERT(cscanq->right.qlen == 0);
+ if (cscanq->left.queue == NULL) {
+ RF_ASSERT(cscanq->left.qlen == 0);
+ if (cscanq->lopri.queue == NULL) {
+ RF_ASSERT(cscanq->lopri.qlen == 0);
+ return(NULL);
+ }
+ req = closest_to_arm(&cscanq->lopri, cscanq->last_sector,
+ &cscanq->dir, cscanq->allow_reverse);
+ }
+ else {
+ /*
+			 * There are I/Os to the left of the arm. We'll end
+			 * up swinging back.
+ */
+ req = cscanq->left.queue;
+ }
+ }
+ if (req == NULL) {
+ RF_ASSERT(QSUM(cscanq) == 0);
+ }
+ return(req);
+}
+
+int rf_SstfPromote(qptr, parityStripeID, which_ru)
+ void *qptr;
+ RF_StripeNum_t parityStripeID;
+ RF_ReconUnitNum_t which_ru;
+{
+ RF_DiskQueueData_t *r, *next;
+ RF_Sstf_t *sstfq;
+ int n;
+
+ sstfq = (RF_Sstf_t *)qptr;
+
+ n = 0;
+ if (rf_sstfDebug || rf_scanDebug || rf_cscanDebug) {
+ int tid;
+ rf_get_threadid(tid);
+ printf("[%d] promote %ld %d queues are %d,%d,%d\n",
+ tid, (long)parityStripeID, (int)which_ru,
+ sstfq->left.qlen,
+ sstfq->right.qlen,
+ sstfq->lopri.qlen);
+ }
+ for(r=sstfq->lopri.queue;r;r=next) {
+ next = r->next;
+ if (rf_sstfDebug || rf_scanDebug || rf_cscanDebug) {
+ int tid;
+ rf_get_threadid(tid);
+ printf("[%d] check promote %lx\n", tid, (long)r);
+ }
+ if ((r->parityStripeID == parityStripeID)
+ && (r->which_ru == which_ru))
+ {
+ do_dequeue(&sstfq->lopri, r);
+ rf_SstfEnqueue(qptr, r, RF_IO_NORMAL_PRIORITY);
+ n++;
+ }
+ }
+ if (rf_sstfDebug || rf_scanDebug || rf_cscanDebug) {
+ int tid;
+ rf_get_threadid(tid);
+ printf("[%d] promoted %d matching I/Os queues are %d,%d,%d\n",
+ tid, n, sstfq->left.qlen, sstfq->right.qlen, sstfq->lopri.qlen);
+ }
+ return(n);
+}
diff --git a/sys/dev/raidframe/rf_sstf.h b/sys/dev/raidframe/rf_sstf.h
new file mode 100644
index 00000000000..9d81a090826
--- /dev/null
+++ b/sys/dev/raidframe/rf_sstf.h
@@ -0,0 +1,90 @@
+/* $OpenBSD: rf_sstf.h,v 1.1 1999/01/11 14:29:50 niklas Exp $ */
+/* $NetBSD: rf_sstf.h,v 1.1 1998/11/13 04:20:34 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* :
+ * Log: rf_sstf.h,v
+ * Revision 1.6 1996/06/18 20:53:11 jimz
+ * fix up disk queueing (remove configure routine,
+ * add shutdown list arg to create routines)
+ *
+ * Revision 1.5 1996/06/13 20:42:08 jimz
+ * add scan, cscan
+ *
+ * Revision 1.4 1996/06/07 22:26:27 jimz
+ * type-ify which_ru (RF_ReconUnitNum_t)
+ *
+ * Revision 1.3 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.2 1996/06/06 01:22:24 jimz
+ * minor cleanup
+ *
+ * Revision 1.1 1996/06/05 19:17:40 jimz
+ * Initial revision
+ *
+ */
+
+#ifndef _RF__RF_SSTF_H_
+#define _RF__RF_SSTF_H_
+
+#include "rf_diskqueue.h"
+
+typedef struct RF_SstfQ_s {
+ RF_DiskQueueData_t *queue;
+ RF_DiskQueueData_t *qtail;
+ int qlen;
+} RF_SstfQ_t;
+
+typedef struct RF_Sstf_s {
+ RF_SstfQ_t left;
+ RF_SstfQ_t right;
+ RF_SstfQ_t lopri;
+ RF_SectorNum_t last_sector;
+ int dir;
+ int allow_reverse;
+} RF_Sstf_t;
+
+void *rf_SstfCreate(RF_SectorCount_t sect_per_disk,
+ RF_AllocListElem_t *cl_list, RF_ShutdownList_t **listp);
+void *rf_ScanCreate(RF_SectorCount_t sect_per_disk,
+ RF_AllocListElem_t *cl_list, RF_ShutdownList_t **listp);
+void *rf_CscanCreate(RF_SectorCount_t sect_per_disk,
+ RF_AllocListElem_t *cl_list, RF_ShutdownList_t **listp);
+void rf_SstfEnqueue(void *qptr, RF_DiskQueueData_t *req, int priority);
+RF_DiskQueueData_t *rf_SstfDequeue(void *qptr);
+RF_DiskQueueData_t *rf_SstfPeek(void *qptr);
+int rf_SstfPromote(void *qptr, RF_StripeNum_t parityStripeID,
+ RF_ReconUnitNum_t which_ru);
+RF_DiskQueueData_t *rf_ScanDequeue(void *qptr);
+RF_DiskQueueData_t *rf_ScanPeek(void *qptr);
+RF_DiskQueueData_t *rf_CscanDequeue(void *qptr);
+RF_DiskQueueData_t *rf_CscanPeek(void *qptr);
+
+#endif /* !_RF__RF_SSTF_H_ */
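
rf_sstf.h above exposes three interchangeable queueing policies (SSTF, SCAN, CSCAN) behind one set of entry-point shapes; SCAN and CSCAN share rf_SstfEnqueue and rf_SstfPromote and differ only in their dequeue/peek routines. The struct below is a hypothetical illustration of how those entry points group into a policy record; the real dispatch lives in rf_diskqueue.c and may be shaped differently:

    #include "rf_sstf.h"

    /* hypothetical policy record; field names are illustrative only */
    struct disk_queue_policy_example {
            void *(*create)(RF_SectorCount_t, RF_AllocListElem_t *,
                RF_ShutdownList_t **);
            void (*enqueue)(void *, RF_DiskQueueData_t *, int);
            RF_DiskQueueData_t *(*dequeue)(void *);
            RF_DiskQueueData_t *(*peek)(void *);
            int (*promote)(void *, RF_StripeNum_t, RF_ReconUnitNum_t);
    };

    static const struct disk_queue_policy_example sstf_policy = {
            rf_SstfCreate, rf_SstfEnqueue, rf_SstfDequeue,
            rf_SstfPeek, rf_SstfPromote
    };
    static const struct disk_queue_policy_example scan_policy = {
            rf_ScanCreate, rf_SstfEnqueue, rf_ScanDequeue,
            rf_ScanPeek, rf_SstfPromote
    };
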
diff --git a/sys/dev/raidframe/rf_states.c b/sys/dev/raidframe/rf_states.c
new file mode 100644
index 00000000000..1bad7bd4ab7
--- /dev/null
+++ b/sys/dev/raidframe/rf_states.c
@@ -0,0 +1,873 @@
+/* $OpenBSD: rf_states.c,v 1.1 1999/01/11 14:29:50 niklas Exp $ */
+/* $NetBSD: rf_states.c,v 1.2 1998/11/13 13:47:56 drochner Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland, William V. Courtright II, Robby Findler
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*
+ * :
+ * Log: rf_states.c,v
+ * Revision 1.45 1996/07/28 20:31:39 jimz
+ * i386netbsd port
+ * true/false fixup
+ *
+ * Revision 1.44 1996/07/27 23:36:08 jimz
+ * Solaris port of simulator
+ *
+ * Revision 1.43 1996/07/22 19:52:16 jimz
+ * switched node params to RF_DagParam_t, a union of
+ * a 64-bit int and a void *, for better portability
+ * attempted hpux port, but failed partway through for
+ * lack of a single C compiler capable of compiling all
+ * source files
+ *
+ * Revision 1.42 1996/07/17 21:00:58 jimz
+ * clean up timer interface, tracing
+ *
+ * Revision 1.41 1996/07/11 19:08:00 jimz
+ * generalize reconstruction mechanism
+ * allow raid1 reconstructs via copyback (done with array
+ * quiesced, not online, therefore not disk-directed)
+ *
+ * Revision 1.40 1996/06/17 14:38:33 jimz
+ * properly #if out RF_DEMO code
+ * fix bug in MakeConfig that was causing weird behavior
+ * in configuration routines (config was not zeroed at start)
+ * clean up genplot handling of stacks
+ *
+ * Revision 1.39 1996/06/11 18:12:17 jimz
+ * got rid of evil race condition in LastState
+ *
+ * Revision 1.38 1996/06/10 14:18:58 jimz
+ * move user, throughput stats into per-array structure
+ *
+ * Revision 1.37 1996/06/09 02:36:46 jimz
+ * lots of little crufty cleanup- fixup whitespace
+ * issues, comment #ifdefs, improve typing in some
+ * places (esp size-related)
+ *
+ * Revision 1.36 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.35 1996/06/05 18:06:02 jimz
+ * Major code cleanup. The Great Renaming is now done.
+ * Better modularity. Better typing. Fixed a bunch of
+ * synchronization bugs. Made a lot of global stuff
+ * per-desc or per-array. Removed dead code.
+ *
+ * Revision 1.34 1996/06/03 23:28:26 jimz
+ * more bugfixes
+ * check in tree to sync for IPDS runs with current bugfixes
+ * there still may be a problem with threads in the script test
+ * getting I/Os stuck- not trivially reproducible (runs ~50 times
+ * in a row without getting stuck)
+ *
+ * Revision 1.33 1996/05/31 22:26:54 jimz
+ * fix a lot of mapping problems, memory allocation problems
+ * found some weird lock issues, fixed 'em
+ * more code cleanup
+ *
+ * Revision 1.32 1996/05/30 12:59:18 jimz
+ * make etimer happier, more portable
+ *
+ * Revision 1.31 1996/05/30 11:29:41 jimz
+ * Numerous bug fixes. Stripe lock release code disagreed with the taking code
+ * about when stripes should be locked (I made it consistent: no parity, no lock)
+ * There was a lot of extra serialization of I/Os which I've removed- a lot of
+ * it was to calculate values for the cache code, which is no longer with us.
+ * More types, function, macro cleanup. Added code to properly quiesce the array
+ * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
+ * before. Fixed memory allocation, freeing bugs.
+ *
+ * Revision 1.30 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.29 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.28 1996/05/24 04:28:55 jimz
+ * release cleanup ckpt
+ *
+ * Revision 1.27 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.26 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.25 1996/05/20 19:31:46 jimz
+ * straighten out syntax problems
+ *
+ * Revision 1.24 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.23 1996/05/16 23:37:33 jimz
+ * fix misspelled "else"
+ *
+ * Revision 1.22 1996/05/15 22:33:32 jimz
+ * appropriately #ifdef cache stuff
+ *
+ * Revision 1.21 1996/05/06 22:09:20 wvcii
+ * rf_State_ExecuteDAG now only executes the first dag
+ * of each parity stripe in a multi-stripe access
+ *
+ * rf_State_ProcessDAG now executes all dags in a
+ * multi-stripe access except the first dag of each stripe.
+ *
+ * Revision 1.20 1995/12/12 18:10:06 jimz
+ * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT
+ * fix 80-column brain damage in comments
+ *
+ * Revision 1.19 1995/11/19 16:29:50 wvcii
+ * replaced LaunchDAGState with CreateDAGState, ExecuteDAGState
+ * created rf_ContinueDagAccess
+ *
+ * Revision 1.18 1995/11/07 15:37:23 wvcii
+ * deleted states SendDAGState, RetryDAGState
+ * added states: LaunchDAGState, ProcessDAGState
+ * code no longer has a hard-coded retry count of 1 but will support
+ * retries until a dag can not be found (selected) to perform the user request
+ *
+ * Revision 1.17 1995/10/09 23:36:08 amiri
+ * *** empty log message ***
+ *
+ * Revision 1.16 1995/10/09 18:36:58 jimz
+ * moved call to StopThroughput for user-level driver to rf_driver.c
+ *
+ * Revision 1.15 1995/10/09 18:07:23 wvcii
+ * lastState now call rf_StopThroughputStats
+ *
+ * Revision 1.14 1995/10/05 18:56:31 jimz
+ * no-op file if !INCLUDE_VS
+ *
+ * Revision 1.13 1995/09/30 20:38:24 jimz
+ * LogTraceRec now takes a Raid * as its first argument
+ *
+ * Revision 1.12 1995/09/19 22:58:54 jimz
+ * integrate DKUSAGE into raidframe
+ *
+ * Revision 1.11 1995/09/07 01:26:55 jimz
+ * Achieve basic compilation in kernel. Kernel functionality
+ * is not guaranteed at all, but it'll compile. Mostly. I hope.
+ *
+ * Revision 1.10 1995/07/26 03:28:31 robby
+ * intermediary checkin
+ *
+ * Revision 1.9 1995/07/23 02:50:33 robby
+ * oops. fixed boo boo
+ *
+ * Revision 1.8 1995/07/22 22:54:54 robby
+ * removed incorrect comment
+ *
+ * Revision 1.7 1995/07/21 19:30:26 robby
+ * added idle state for rf_when-idle.c
+ *
+ * Revision 1.6 1995/07/10 19:06:28 rachad
+ * *** empty log message ***
+ *
+ * Revision 1.5 1995/07/10 17:30:38 robby
+ * added virtual striping lock states
+ *
+ * Revision 1.4 1995/07/08 18:05:39 rachad
+ * Linked up Claudsons code with the real cache
+ *
+ * Revision 1.3 1995/07/06 14:38:50 robby
+ * changed get_thread_id to get_threadid
+ *
+ * Revision 1.2 1995/07/06 14:24:15 robby
+ * added log
+ *
+ */
+
+#ifdef _KERNEL
+#define KERNEL
+#endif
+
+#ifdef KERNEL
+#if !defined(__NetBSD__) && !defined(__OpenBSD__)
+#include <dkusage.h>
+#endif /* !__NetBSD__ && !__OpenBSD__ */
+#endif /* KERNEL */
+
+#include <sys/errno.h>
+
+#include "rf_archs.h"
+#include "rf_threadstuff.h"
+#include "rf_raid.h"
+#include "rf_dag.h"
+#include "rf_desc.h"
+#include "rf_aselect.h"
+#include "rf_threadid.h"
+#include "rf_general.h"
+#include "rf_states.h"
+#include "rf_dagutils.h"
+#include "rf_driver.h"
+#include "rf_engine.h"
+#include "rf_map.h"
+#include "rf_etimer.h"
+
+#if defined(KERNEL) && (DKUSAGE > 0)
+#include <sys/dkusage.h>
+#include <io/common/iotypes.h>
+#include <io/cam/dec_cam.h>
+#include <io/cam/cam.h>
+#include <io/cam/pdrv.h>
+#endif /* KERNEL && DKUSAGE > 0 */
+
+/* prototypes for some of the available states.
+
+ States must:
+
+ - not block.
+
+ - either schedule rf_ContinueRaidAccess as a callback and return
+ RF_TRUE, or complete all of their work and return RF_FALSE.
+
+ - increment desc->state when they have finished their work.
+*/
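
A hypothetical state function following the protocol above (rf_State_IncrAccessCount below is a real example of the same shape):

    /* sketch only: do bounded, non-blocking work, advance the state
     * index, and tell rf_ContinueRaidAccess whether we suspended */
    static int rf_State_ExampleNoop(RF_RaidAccessDesc_t *desc)
    {
            /* ... non-blocking work on desc would go here ... */

            desc->state++;          /* hand off to the next state */
            return RF_FALSE;        /* not suspended: keep looping */
    }
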
+
+
+#ifdef SIMULATE
+extern int global_async_flag;
+#endif /* SIMULATE */
+
+static char *StateName(RF_AccessState_t state)
+{
+ switch (state) {
+ case rf_QuiesceState: return "QuiesceState";
+ case rf_MapState: return "MapState";
+ case rf_LockState: return "LockState";
+ case rf_CreateDAGState: return "CreateDAGState";
+ case rf_ExecuteDAGState: return "ExecuteDAGState";
+ case rf_ProcessDAGState: return "ProcessDAGState";
+ case rf_CleanupState: return "CleanupState";
+ case rf_LastState: return "LastState";
+ case rf_IncrAccessesCountState: return "IncrAccessesCountState";
+ case rf_DecrAccessesCountState: return "DecrAccessesCountState";
+ default: return "!!! UnnamedState !!!";
+ }
+}
+
+void rf_ContinueRaidAccess(RF_RaidAccessDesc_t *desc)
+{
+ int suspended = RF_FALSE;
+ int current_state_index = desc->state;
+ RF_AccessState_t current_state = desc->states[current_state_index];
+
+#ifdef SIMULATE
+ rf_SetCurrentOwner(desc->owner);
+#endif /* SIMULATE */
+
+ do {
+
+ current_state_index = desc->state;
+ current_state = desc->states [current_state_index];
+
+ switch (current_state) {
+
+ case rf_QuiesceState: suspended = rf_State_Quiesce(desc);
+ break;
+ case rf_IncrAccessesCountState: suspended = rf_State_IncrAccessCount(desc);
+ break;
+ case rf_MapState: suspended = rf_State_Map(desc);
+ break;
+ case rf_LockState: suspended = rf_State_Lock(desc);
+ break;
+ case rf_CreateDAGState: suspended = rf_State_CreateDAG(desc);
+ break;
+ case rf_ExecuteDAGState: suspended = rf_State_ExecuteDAG(desc);
+ break;
+ case rf_ProcessDAGState: suspended = rf_State_ProcessDAG(desc);
+ break;
+ case rf_CleanupState: suspended = rf_State_Cleanup(desc);
+ break;
+ case rf_DecrAccessesCountState: suspended = rf_State_DecrAccessCount(desc);
+ break;
+ case rf_LastState: suspended = rf_State_LastState(desc);
+ break;
+ }
+
+ /* after this point, we cannot dereference desc since desc may
+ have been freed. desc is only freed in LastState, so if we
+	   re-enter this function or loop back up, desc should be valid. */
+
+ if (rf_printStatesDebug) {
+ int tid;
+ rf_get_threadid (tid);
+
+ printf ("[%d] State: %-24s StateIndex: %3i desc: 0x%ld %s\n",
+ tid, StateName(current_state), current_state_index, (long)desc,
+ suspended ? "callback scheduled" : "looping");
+ }
+ } while (!suspended && current_state != rf_LastState);
+
+ return;
+}
+
+
+void rf_ContinueDagAccess (RF_DagList_t *dagList)
+{
+ RF_AccTraceEntry_t *tracerec = &(dagList->desc->tracerec);
+ RF_RaidAccessDesc_t *desc;
+ RF_DagHeader_t *dag_h;
+ RF_Etimer_t timer;
+ int i;
+
+ desc = dagList->desc;
+
+ timer = tracerec->timer;
+ RF_ETIMER_STOP(timer);
+ RF_ETIMER_EVAL(timer);
+ tracerec->specific.user.exec_us = RF_ETIMER_VAL_US(timer);
+ RF_ETIMER_START(tracerec->timer);
+
+ /* skip to dag which just finished */
+ dag_h = dagList->dags;
+ for (i = 0; i < dagList->numDagsDone; i++) {
+ dag_h = dag_h->next;
+ }
+
+ /* check to see if retry is required */
+ if (dag_h->status == rf_rollBackward) {
+ /* when a dag fails, mark desc status as bad and allow all other dags
+ * in the desc to execute to completion. then, free all dags and start over */
+ desc->status = 1; /* bad status */
+#if RF_DEMO > 0
+ if (!rf_demoMode)
+#endif /* RF_DEMO > 0 */
+ {
+ printf("[%d] DAG failure: %c addr 0x%lx (%ld) nblk 0x%x (%d) buf 0x%lx\n",
+ desc->tid, desc->type, (long)desc->raidAddress,
+ (long)desc->raidAddress,(int)desc->numBlocks,
+ (int)desc->numBlocks, (unsigned long) (desc->bufPtr));
+ }
+ }
+
+ dagList->numDagsDone++;
+ rf_ContinueRaidAccess(desc);
+}
+
+
+int rf_State_LastState(RF_RaidAccessDesc_t *desc)
+{
+ void (*callbackFunc)(RF_CBParam_t) = desc->callbackFunc;
+ RF_CBParam_t callbackArg;
+
+ callbackArg.p = desc->callbackArg;
+
+#ifdef SIMULATE
+ int tid;
+ rf_get_threadid(tid);
+
+ if (rf_accessDebug)
+ printf("async_flag set to %d\n",global_async_flag);
+ global_async_flag=desc->async_flag;
+ if (rf_accessDebug)
+ printf("Will now do clean up for %d\n",rf_GetCurrentOwner());
+ rf_FreeRaidAccDesc(desc);
+
+ if (callbackFunc)
+ callbackFunc(callbackArg);
+#else /* SIMULATE */
+
+#ifndef KERNEL
+
+ if (!(desc->flags & RF_DAG_NONBLOCKING_IO)) {
+ /* bummer that we have to take another lock here */
+ RF_LOCK_MUTEX(desc->mutex);
+ RF_ASSERT(desc->flags&RF_DAG_ACCESS_COMPLETE);
+ RF_SIGNAL_COND(desc->cond); /* DoAccess frees the desc in the blocking-I/O case */
+ RF_UNLOCK_MUTEX(desc->mutex);
+ }
+ else
+ rf_FreeRaidAccDesc(desc);
+
+ if (callbackFunc)
+ callbackFunc(callbackArg);
+
+#else /* KERNEL */
+	if (!(desc->flags & RF_DAG_TEST_ACCESS)) { /* don't biodone if this is a test access */
+#if DKUSAGE > 0
+ RF_DKU_END_IO(((RF_Raid_t *)desc->raidPtr)->raidid,(struct buf *)desc->bp);
+#else
+ RF_DKU_END_IO(((RF_Raid_t *)desc->raidPtr)->raidid);
+#endif /* DKUSAGE > 0 */
+ /* printf("Calling biodone on 0x%x\n",desc->bp); */
+ biodone(desc->bp); /* access came through ioctl */
+ }
+
+ if (callbackFunc) callbackFunc(callbackArg);
+ rf_FreeRaidAccDesc(desc);
+
+#endif /* ! KERNEL */
+#endif /* SIMULATE */
+
+ return RF_FALSE;
+}
+
+int rf_State_IncrAccessCount(RF_RaidAccessDesc_t *desc)
+{
+ RF_Raid_t *raidPtr;
+
+ raidPtr = desc->raidPtr;
+ /* Bummer. We have to do this to be 100% safe w.r.t. the increment below */
+ RF_LOCK_MUTEX(raidPtr->access_suspend_mutex);
+ raidPtr->accs_in_flight++; /* used to detect quiescence */
+ RF_UNLOCK_MUTEX(raidPtr->access_suspend_mutex);
+
+ desc->state++;
+ return RF_FALSE;
+}
+
+int rf_State_DecrAccessCount(RF_RaidAccessDesc_t *desc)
+{
+ RF_Raid_t *raidPtr;
+
+ raidPtr = desc->raidPtr;
+
+ RF_LOCK_MUTEX(raidPtr->access_suspend_mutex);
+ raidPtr->accs_in_flight--;
+ if (raidPtr->accesses_suspended && raidPtr->accs_in_flight == 0) {
+ rf_SignalQuiescenceLock(raidPtr, raidPtr->reconDesc);
+ }
+ rf_UpdateUserStats(raidPtr, RF_ETIMER_VAL_US(desc->timer), desc->numBlocks);
+ RF_UNLOCK_MUTEX(raidPtr->access_suspend_mutex);
+
+ desc->state++;
+ return RF_FALSE;
+}
+
+int rf_State_Quiesce(RF_RaidAccessDesc_t *desc)
+{
+ RF_AccTraceEntry_t *tracerec = &desc->tracerec;
+ RF_Etimer_t timer;
+ int suspended = RF_FALSE;
+ RF_Raid_t *raidPtr;
+
+ raidPtr = desc->raidPtr;
+
+ RF_ETIMER_START(timer);
+ RF_ETIMER_START(desc->timer);
+
+ RF_LOCK_MUTEX(raidPtr->access_suspend_mutex);
+ if (raidPtr->accesses_suspended) {
+ RF_CallbackDesc_t *cb;
+ cb = rf_AllocCallbackDesc();
+ /* XXX the following cast is quite bogus... rf_ContinueRaidAccess
+ takes a (RF_RaidAccessDesc_t *) as an argument.. GO */
+ cb->callbackFunc = (void (*)(RF_CBParam_t))rf_ContinueRaidAccess;
+ cb->callbackArg.p = (void *) desc;
+ cb->next = raidPtr->quiesce_wait_list;
+ raidPtr->quiesce_wait_list = cb;
+ suspended = RF_TRUE;
+ }
+
+ RF_UNLOCK_MUTEX(raidPtr->access_suspend_mutex);
+
+ RF_ETIMER_STOP(timer);
+ RF_ETIMER_EVAL(timer);
+ tracerec->specific.user.suspend_ovhd_us += RF_ETIMER_VAL_US(timer);
+
+ if (suspended && rf_quiesceDebug)
+ printf("Stalling access due to quiescence lock\n");
+
+ desc->state++;
+ return suspended;
+}
+
+int rf_State_Map(RF_RaidAccessDesc_t *desc)
+{
+ RF_Raid_t *raidPtr = desc->raidPtr;
+ RF_AccTraceEntry_t *tracerec = &desc->tracerec;
+ RF_Etimer_t timer;
+
+ RF_ETIMER_START(timer);
+
+ if (!(desc->asmap = rf_MapAccess(raidPtr, desc->raidAddress, desc->numBlocks,
+ desc->bufPtr, RF_DONT_REMAP)))
+ RF_PANIC();
+
+ RF_ETIMER_STOP(timer);
+ RF_ETIMER_EVAL(timer);
+ tracerec->specific.user.map_us = RF_ETIMER_VAL_US(timer);
+
+ desc->state ++;
+ return RF_FALSE;
+}
+
+int rf_State_Lock(RF_RaidAccessDesc_t *desc)
+{
+ RF_AccTraceEntry_t *tracerec = &desc->tracerec;
+ RF_Raid_t *raidPtr = desc->raidPtr;
+ RF_AccessStripeMapHeader_t *asmh = desc->asmap;
+ RF_AccessStripeMap_t *asm_p;
+ RF_Etimer_t timer;
+ int suspended = RF_FALSE;
+
+ RF_ETIMER_START(timer);
+ if (!(raidPtr->Layout.map->flags & RF_NO_STRIPE_LOCKS)) {
+ RF_StripeNum_t lastStripeID = -1;
+
+ /* acquire each lock that we don't already hold */
+ for (asm_p = asmh->stripeMap; asm_p; asm_p = asm_p->next) {
+ RF_ASSERT(RF_IO_IS_R_OR_W(desc->type));
+ if (!rf_suppressLocksAndLargeWrites &&
+ asm_p->parityInfo &&
+ !(desc->flags& RF_DAG_SUPPRESS_LOCKS) &&
+ !(asm_p->flags & RF_ASM_FLAGS_LOCK_TRIED))
+ {
+ asm_p->flags |= RF_ASM_FLAGS_LOCK_TRIED;
+ RF_ASSERT(asm_p->stripeID > lastStripeID); /* locks must be acquired
+ hierarchically */
+ lastStripeID = asm_p->stripeID;
+ /* XXX the cast to (void (*)(RF_CBParam_t)) below is bogus! GO */
+ RF_INIT_LOCK_REQ_DESC(asm_p->lockReqDesc, desc->type,
+ (void (*)(struct buf *))rf_ContinueRaidAccess, desc, asm_p,
+ raidPtr->Layout.dataSectorsPerStripe);
+ if (rf_AcquireStripeLock(raidPtr->lockTable, asm_p->stripeID,
+ &asm_p->lockReqDesc))
+ {
+ suspended = RF_TRUE;
+ break;
+ }
+ }
+
+ if (desc->type == RF_IO_TYPE_WRITE &&
+ raidPtr->status[asm_p->physInfo->row] == rf_rs_reconstructing)
+ {
+ if (! (asm_p->flags & RF_ASM_FLAGS_FORCE_TRIED) ) {
+ int val;
+
+ asm_p->flags |= RF_ASM_FLAGS_FORCE_TRIED;
+ /* XXX the cast below is quite bogus!!! XXX GO */
+ val = rf_ForceOrBlockRecon(raidPtr, asm_p,
+ (void (*)(RF_Raid_t *,void *))rf_ContinueRaidAccess, desc);
+ if (val == 0) {
+ asm_p->flags |= RF_ASM_FLAGS_RECON_BLOCKED;
+ }
+ else {
+ suspended = RF_TRUE;
+ break;
+ }
+ }
+ else {
+ if (rf_pssDebug) {
+ printf("[%d] skipping force/block because already done, psid %ld\n",
+ desc->tid,(long)asm_p->stripeID);
+ }
+ }
+ }
+ else {
+ if (rf_pssDebug) {
+ printf("[%d] skipping force/block because not write or not under recon, psid %ld\n",
+ desc->tid,(long)asm_p->stripeID);
+ }
+ }
+ }
+
+ RF_ETIMER_STOP(timer);
+ RF_ETIMER_EVAL(timer);
+ tracerec->specific.user.lock_us += RF_ETIMER_VAL_US(timer);
+
+ if (suspended)
+ return(RF_TRUE);
+ }
+
+ desc->state++;
+ return(RF_FALSE);
+}
+
+/*
+ * the following three states create, execute, and post-process dags
+ * the error recovery unit is a single dag.
+ * by default, SelectAlgorithm creates an array of dags, one per parity stripe
+ * in some tricky cases, multiple dags per stripe are created
+ * - dags within a parity stripe are executed sequentially (arbitrary order)
+ * - dags for distinct parity stripes are executed concurrently
+ *
+ * repeat until all dags complete successfully -or- dag selection fails
+ *
+ * while !done
+ * create dag(s) (SelectAlgorithm)
+ * if dag
+ * execute dag (DispatchDAG)
+ * if dag successful
+ * done (SUCCESS)
+ * else
+ * !done (RETRY - start over with new dags)
+ * else
+ * done (FAIL)
+ */
+int rf_State_CreateDAG (RF_RaidAccessDesc_t *desc)
+{
+ RF_AccTraceEntry_t *tracerec = &desc->tracerec;
+ RF_Etimer_t timer;
+ RF_DagHeader_t *dag_h;
+ int i, selectStatus;
+
+ /* generate a dag for the access, and fire it off. When the dag
+ completes, we'll get re-invoked in the next state. */
+ RF_ETIMER_START(timer);
+ /* SelectAlgorithm returns one or more dags */
+ selectStatus = rf_SelectAlgorithm(desc, desc->flags|RF_DAG_SUPPRESS_LOCKS);
+ if (rf_printDAGsDebug)
+ for (i = 0; i < desc->numStripes; i++)
+ rf_PrintDAGList(desc->dagArray[i].dags);
+ RF_ETIMER_STOP(timer);
+ RF_ETIMER_EVAL(timer);
+ /* update time to create all dags */
+ tracerec->specific.user.dag_create_us = RF_ETIMER_VAL_US(timer);
+
+ desc->status = 0; /* good status */
+
+ if (selectStatus) {
+ /* failed to create a dag */
+ /* this happens when there are too many faults or incomplete dag libraries */
+		printf("[Failed to create a DAG]\n");
+ RF_PANIC();
+ }
+ else {
+ /* bind dags to desc */
+ for (i = 0; i < desc->numStripes; i++) {
+ dag_h = desc->dagArray[i].dags;
+ while (dag_h) {
+#ifdef KERNEL
+ dag_h->bp = (struct buf *) desc->bp;
+#endif /* KERNEL */
+ dag_h->tracerec = tracerec;
+ dag_h = dag_h->next;
+ }
+ }
+ desc->flags |= RF_DAG_DISPATCH_RETURNED;
+ desc->state++; /* next state should be rf_State_ExecuteDAG */
+ }
+ return RF_FALSE;
+}
+
+
+
+/* the access has an array of dagLists, one dagList per parity stripe.
+ * fire the first dag in each parity stripe (dagList).
+ * dags within a stripe (dagList) must be executed sequentially
+ * - this preserves atomic parity update
+ * dags for independent parity groups (stripes) are fired concurrently */
+
+int rf_State_ExecuteDAG(RF_RaidAccessDesc_t *desc)
+{
+ int i;
+ RF_DagHeader_t *dag_h;
+ RF_DagList_t *dagArray = desc->dagArray;
+
+ /* next state is always rf_State_ProcessDAG
+ * important to do this before firing the first dag
+ * (it may finish before we leave this routine) */
+ desc->state++;
+
+ /* sweep dag array, a stripe at a time, firing the first dag in each stripe */
+ for (i = 0; i < desc->numStripes; i++) {
+ RF_ASSERT(dagArray[i].numDags > 0);
+ RF_ASSERT(dagArray[i].numDagsDone == 0);
+ RF_ASSERT(dagArray[i].numDagsFired == 0);
+ RF_ETIMER_START(dagArray[i].tracerec.timer);
+ /* fire first dag in this stripe */
+ dag_h = dagArray[i].dags;
+ RF_ASSERT(dag_h);
+ dagArray[i].numDagsFired++;
+ /* XXX Yet another case where we pass in a conflicting function pointer
+ :-( XXX GO */
+ rf_DispatchDAG(dag_h, (void (*)(void *))rf_ContinueDagAccess, &dagArray[i]);
+ }
+
+ /* the DAG will always call the callback, even if there was no
+ * blocking, so we are always suspended in this state */
+ return RF_TRUE;
+}
+
+
+
+/* rf_State_ProcessDAG is entered when a dag completes.
+ * first, check to see if all dags in the access have completed
+ * if not, fire as many dags as possible */
+
+int rf_State_ProcessDAG(RF_RaidAccessDesc_t *desc)
+{
+ RF_AccessStripeMapHeader_t *asmh = desc->asmap;
+ RF_Raid_t *raidPtr = desc->raidPtr;
+ RF_DagHeader_t *dag_h;
+ int i, j, done = RF_TRUE;
+ RF_DagList_t *dagArray = desc->dagArray;
+ RF_Etimer_t timer;
+
+ /* check to see if this is the last dag */
+ for (i = 0; i < desc->numStripes; i++)
+ if (dagArray[i].numDags != dagArray[i].numDagsDone)
+ done = RF_FALSE;
+
+ if (done) {
+ if (desc->status) {
+ /* a dag failed, retry */
+ RF_ETIMER_START(timer);
+ /* free all dags */
+ for (i = 0; i < desc->numStripes; i++) {
+ rf_FreeDAG(desc->dagArray[i].dags);
+ }
+ rf_MarkFailuresInASMList(raidPtr, asmh);
+ /* back up to rf_State_CreateDAG */
+ desc->state = desc->state - 2;
+ return RF_FALSE;
+ }
+ else {
+ /* move on to rf_State_Cleanup */
+ desc->state++;
+ }
+ return RF_FALSE;
+ }
+ else {
+ /* more dags to execute */
+ /* see if any are ready to be fired. if so, fire them */
+ /* don't fire the initial dag in a list, it's fired in rf_State_ExecuteDAG */
+ for (i = 0; i < desc->numStripes; i++) {
+ if ((dagArray[i].numDagsDone < dagArray[i].numDags)
+ && (dagArray[i].numDagsDone == dagArray[i].numDagsFired)
+ && (dagArray[i].numDagsFired > 0)) {
+ RF_ETIMER_START(dagArray[i].tracerec.timer);
+ /* fire next dag in this stripe */
+ /* first, skip to next dag awaiting execution */
+ dag_h = dagArray[i].dags;
+ for (j = 0; j < dagArray[i].numDagsDone; j++)
+ dag_h = dag_h->next;
+ dagArray[i].numDagsFired++;
+ /* XXX and again we pass a different function pointer.. GO */
+ rf_DispatchDAG(dag_h, (void (*)(void *))rf_ContinueDagAccess,
+ &dagArray[i]);
+ }
+ }
+ return RF_TRUE;
+ }
+}
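
Taken together, rf_State_CreateDAG, rf_State_ExecuteDAG, and rf_State_ProcessDAG realize the retry loop sketched in the comment before rf_State_CreateDAG. A compressed, hypothetical control-flow sketch with stand-in stubs (not the actual RAIDframe calls):

    /* hypothetical stand-ins for the real machinery */
    static int select_dags(void)     { return 0; }  /* cf. rf_SelectAlgorithm */
    static int execute_dags(void)    { return 0; }  /* cf. rf_DispatchDAG + callbacks */
    static void free_dags(void)      { }            /* cf. rf_FreeDAG */
    static void remap_failures(void) { }            /* cf. rf_MarkFailuresInASMList */

    /* retry loop: rebuild DAGs after a rollback; give up only when
     * DAG selection itself fails (rf_State_ProcessDAG backs the state
     * index up by two to re-enter rf_State_CreateDAG) */
    static int run_access_sketch(void)
    {
            for (;;) {
                    if (select_dags() != 0)
                            return (-1);            /* FAIL */
                    if (execute_dags() == 0)
                            return (0);             /* SUCCESS */
                    free_dags();
                    remap_failures();               /* RETRY */
            }
    }
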
+
+/* only make it this far if all dags complete successfully */
+int rf_State_Cleanup(RF_RaidAccessDesc_t *desc)
+{
+ RF_AccTraceEntry_t *tracerec = &desc->tracerec;
+ RF_AccessStripeMapHeader_t *asmh = desc->asmap;
+ RF_Raid_t *raidPtr = desc->raidPtr;
+ RF_AccessStripeMap_t *asm_p;
+ RF_DagHeader_t *dag_h;
+ RF_Etimer_t timer;
+ int tid, i;
+
+ desc->state ++;
+
+ rf_get_threadid(tid);
+
+ timer = tracerec->timer;
+ RF_ETIMER_STOP(timer);
+ RF_ETIMER_EVAL(timer);
+ tracerec->specific.user.dag_retry_us = RF_ETIMER_VAL_US(timer);
+
+ /* the RAID I/O is complete. Clean up. */
+ tracerec->specific.user.dag_retry_us = 0;
+
+ RF_ETIMER_START(timer);
+ if (desc->flags & RF_DAG_RETURN_DAG) {
+ /* copy dags into paramDAG */
+ *(desc->paramDAG) = desc->dagArray[0].dags;
+ dag_h = *(desc->paramDAG);
+ for (i = 1; i < desc->numStripes; i++) {
+ /* concatenate dags from remaining stripes */
+ RF_ASSERT(dag_h);
+ while (dag_h->next)
+ dag_h = dag_h->next;
+ dag_h->next = desc->dagArray[i].dags;
+ }
+ }
+ else {
+ /* free all dags */
+ for (i = 0; i < desc->numStripes; i++) {
+ rf_FreeDAG(desc->dagArray[i].dags);
+ }
+ }
+
+ RF_ETIMER_STOP(timer);
+ RF_ETIMER_EVAL(timer);
+ tracerec->specific.user.cleanup_us = RF_ETIMER_VAL_US(timer);
+
+ RF_ETIMER_START(timer);
+ if (!(raidPtr->Layout.map->flags & RF_NO_STRIPE_LOCKS)) {
+ for (asm_p = asmh->stripeMap; asm_p; asm_p = asm_p->next) {
+ if (!rf_suppressLocksAndLargeWrites &&
+ asm_p->parityInfo &&
+ !(desc->flags&RF_DAG_SUPPRESS_LOCKS))
+ {
+ RF_ASSERT_VALID_LOCKREQ(&asm_p->lockReqDesc);
+ rf_ReleaseStripeLock(raidPtr->lockTable, asm_p->stripeID,
+ &asm_p->lockReqDesc);
+ }
+ if (asm_p->flags & RF_ASM_FLAGS_RECON_BLOCKED) {
+ rf_UnblockRecon(raidPtr, asm_p);
+ }
+ }
+ }
+
+#ifdef SIMULATE
+	/* refresh current owner in case blocked I/Os were allowed to run */
+ rf_SetCurrentOwner(desc->owner);
+#endif /* SIMULATE */
+
+ RF_ETIMER_STOP(timer);
+ RF_ETIMER_EVAL(timer);
+ tracerec->specific.user.lock_us += RF_ETIMER_VAL_US(timer);
+
+ RF_ETIMER_START(timer);
+ if (desc->flags & RF_DAG_RETURN_ASM)
+ *(desc->paramASM) = asmh;
+ else
+ rf_FreeAccessStripeMap(asmh);
+ RF_ETIMER_STOP(timer);
+ RF_ETIMER_EVAL(timer);
+ tracerec->specific.user.cleanup_us += RF_ETIMER_VAL_US(timer);
+
+ RF_ETIMER_STOP(desc->timer);
+ RF_ETIMER_EVAL(desc->timer);
+
+ timer = desc->tracerec.tot_timer;
+ RF_ETIMER_STOP(timer);
+ RF_ETIMER_EVAL(timer);
+ desc->tracerec.total_us = RF_ETIMER_VAL_US(timer);
+
+ rf_LogTraceRec(raidPtr, tracerec);
+
+ desc->flags |= RF_DAG_ACCESS_COMPLETE;
+
+ return RF_FALSE;
+}
diff --git a/sys/dev/raidframe/rf_states.h b/sys/dev/raidframe/rf_states.h
new file mode 100644
index 00000000000..2e2895caa5e
--- /dev/null
+++ b/sys/dev/raidframe/rf_states.h
@@ -0,0 +1,70 @@
+/* $OpenBSD: rf_states.h,v 1.1 1999/01/11 14:29:51 niklas Exp $ */
+/* $NetBSD: rf_states.h,v 1.1 1998/11/13 04:20:34 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland, William V. Courtright II, Robby Findler
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* :
+ * Log: rf_states.h,v
+ * Revision 1.5 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.4 1996/05/24 04:28:55 jimz
+ * release cleanup ckpt
+ *
+ * Revision 1.3 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.2 1996/05/06 22:08:28 wvcii
+ * added copyright info and change log
+ *
+ * Revision 1.1 1995/07/06 14:23:39 robby
+ * Initial revision
+ *
+ */
+
+#ifndef _RF__RF_STATES_H_
+#define _RF__RF_STATES_H_
+
+#include "rf_types.h"
+
+void rf_ContinueRaidAccess(RF_RaidAccessDesc_t *desc);
+void rf_ContinueDagAccess(RF_DagList_t *dagList);
+int rf_State_LastState(RF_RaidAccessDesc_t *desc);
+int rf_State_IncrAccessCount(RF_RaidAccessDesc_t *desc);
+int rf_State_DecrAccessCount(RF_RaidAccessDesc_t *desc);
+int rf_State_Quiesce(RF_RaidAccessDesc_t *desc);
+int rf_State_Map(RF_RaidAccessDesc_t *desc);
+int rf_State_Lock(RF_RaidAccessDesc_t *desc);
+int rf_State_CreateDAG(RF_RaidAccessDesc_t *desc);
+int rf_State_ExecuteDAG(RF_RaidAccessDesc_t *desc);
+int rf_State_ProcessDAG(RF_RaidAccessDesc_t *desc);
+int rf_State_Cleanup(RF_RaidAccessDesc_t *desc);
+
+#endif /* !_RF__RF_STATES_H_ */
diff --git a/sys/dev/raidframe/rf_stripelocks.c b/sys/dev/raidframe/rf_stripelocks.c
new file mode 100644
index 00000000000..c9b9502ad70
--- /dev/null
+++ b/sys/dev/raidframe/rf_stripelocks.c
@@ -0,0 +1,642 @@
+/* $OpenBSD: rf_stripelocks.c,v 1.1 1999/01/11 14:29:51 niklas Exp $ */
+/* $NetBSD: rf_stripelocks.c,v 1.1 1998/11/13 04:20:34 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Authors: Mark Holland, Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* :
+ * Log: rf_stripelocks.c,v
+ * Revision 1.35 1996/06/10 12:50:57 jimz
+ * Add counters to freelists to track number of allocations, frees,
+ * grows, max size, etc. Adjust a couple sets of PRIME params based
+ * on the results.
+ *
+ * Revision 1.34 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.33 1996/06/09 02:36:46 jimz
+ * lots of little crufty cleanup- fixup whitespace
+ * issues, comment #ifdefs, improve typing in some
+ * places (esp size-related)
+ *
+ * Revision 1.32 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.31 1996/06/05 18:06:02 jimz
+ * Major code cleanup. The Great Renaming is now done.
+ * Better modularity. Better typing. Fixed a bunch of
+ * synchronization bugs. Made a lot of global stuff
+ * per-desc or per-array. Removed dead code.
+ *
+ * Revision 1.30 1996/06/03 23:28:26 jimz
+ * more bugfixes
+ * check in tree to sync for IPDS runs with current bugfixes
+ * there still may be a problem with threads in the script test
+ * getting I/Os stuck- not trivially reproducible (runs ~50 times
+ * in a row without getting stuck)
+ *
+ * Revision 1.29 1996/05/30 23:22:16 jimz
+ * bugfixes of serialization, timing problems
+ * more cleanup
+ *
+ * Revision 1.28 1996/05/30 11:29:41 jimz
+ * Numerous bug fixes. Stripe lock release code disagreed with the taking code
+ * about when stripes should be locked (I made it consistent: no parity, no lock)
+ * There was a lot of extra serialization of I/Os which I've removed- a lot of
+ * it was to calculate values for the cache code, which is no longer with us.
+ * More types, function, macro cleanup. Added code to properly quiesce the array
+ * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
+ * before. Fixed memory allocation, freeing bugs.
+ *
+ * Revision 1.27 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.26 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.25 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.24 1996/05/20 16:15:00 jimz
+ * switch to rf_{mutex,cond}_{init,destroy}
+ *
+ * Revision 1.23 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.22 1996/05/16 22:28:11 jimz
+ * misc cleanup
+ *
+ * Revision 1.21 1996/05/15 23:39:52 jimz
+ * remove #if 0 code
+ *
+ * Revision 1.20 1996/05/15 23:37:38 jimz
+ * convert to using RF_FREELIST stuff for StripeLockDesc allocation
+ *
+ * Revision 1.19 1996/05/08 18:00:53 jimz
+ * fix number of args to debug printf
+ *
+ * Revision 1.18 1996/05/06 22:33:07 jimz
+ * added better debug info
+ *
+ * Revision 1.17 1996/05/06 22:09:01 wvcii
+ * added copyright info and change log
+ *
+ */
+
+/*
+ * stripelocks.c -- code to lock stripes for read and write access
+ *
+ * The code distinguishes between read locks and write locks. There can be
+ * as many readers to a given stripe as desired. When a write request comes
+ * in, no further readers are allowed to enter, and all subsequent requests
+ * are queued in FIFO order. When the number of readers goes to zero, the
+ * writer is given the lock. When a writer releases the lock, the list of
+ * queued requests is scanned, and all readers up to the next writer are
+ * given the lock.
+ *
+ * The lock table size must be a power of two, but HASH_STRIPEID (which masks
+ * the stripe ID with the table size minus one) is the only place that relies on this.
+ *
+ * The code now supports "range locks". When you ask to lock a stripe, you
+ * specify a range of addresses in that stripe that you want to lock. When
+ * you acquire the lock, you've locked only this range of addresses, and
+ * other threads can concurrently read/write any non-overlapping portions
+ * of the stripe. The "addresses" that you lock are abstract in that you
+ * can pass in anything you like. The expectation is that you'll pass in
+ * the range of physical disk offsets of the parity bits you're planning
+ * to update. The idea behind this, of course, is to allow sub-stripe
+ * locking. The implementation is perhaps not the best imaginable; in the
+ * worst case a lock release is O(n^2) in the total number of outstanding
+ * requests to a given stripe. Note that if you're striping with a
+ * stripe unit size equal to an entire disk (i.e. not striping), there will
+ * be only one stripe and you may spend some significant number of cycles
+ * searching through stripe lock descriptors.
+ */
+
+#ifdef _KERNEL
+#define KERNEL
+#endif
+
+#include "rf_types.h"
+#include "rf_raid.h"
+#include "rf_stripelocks.h"
+#include "rf_alloclist.h"
+#include "rf_threadid.h"
+#include "rf_general.h"
+#include "rf_freelist.h"
+#include "rf_debugprint.h"
+#include "rf_driver.h"
+#include "rf_shutdown.h"
+
+#define Dprintf1(s,a) rf_debug_printf(s,(void *)((unsigned long)a),NULL,NULL,NULL,NULL,NULL,NULL,NULL)
+#define Dprintf2(s,a,b) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),NULL,NULL,NULL,NULL,NULL,NULL)
+#define Dprintf3(s,a,b,c) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),NULL,NULL,NULL,NULL,NULL)
+#define Dprintf4(s,a,b,c,d) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),NULL,NULL,NULL,NULL)
+#define Dprintf5(s,a,b,c,d,e) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),NULL,NULL,NULL)
+#define Dprintf6(s,a,b,c,d,e,f) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),(void *)((unsigned long)f),NULL,NULL)
+#define Dprintf7(s,a,b,c,d,e,f,g) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),(void *)((unsigned long)f),(void *)((unsigned long)g),NULL)
+#define Dprintf8(s,a,b,c,d,e,f,g,h) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),(void *)((unsigned long)f),(void *)((unsigned long)g),(void *)((unsigned long)h))
+
+#ifndef KERNEL
+#define FLUSH fflush(stdout)
+#else /* !KERNEL */
+#define FLUSH
+#endif /* !KERNEL */
+
+#define HASH_STRIPEID(_sid_) ( (_sid_) & (rf_lockTableSize-1) )
+#define MAX_FREELIST 100
+
+static void AddToWaitersQueue(RF_LockTableEntry_t *lockTable, RF_StripeLockDesc_t *lockDesc, RF_LockReqDesc_t *lockReqDesc);
+static RF_StripeLockDesc_t *AllocStripeLockDesc(RF_StripeNum_t stripeID);
+static void FreeStripeLockDesc(RF_StripeLockDesc_t *p);
+static void PrintLockedStripes(RF_LockTableEntry_t *lockTable);
+
+/* determines if two ranges overlap. always yields false if either start value is negative */
+#define SINGLE_RANGE_OVERLAP(_strt1, _stop1, _strt2, _stop2) \
+ ( (_strt1 >= 0) && (_strt2 >= 0) && (RF_MAX(_strt1, _strt2) <= RF_MIN(_stop1, _stop2)) )
+
+/* determines if any of the ranges specified in the two lock descriptors overlap each other */
+#define RANGE_OVERLAP(_cand, _pred) \
+ ( SINGLE_RANGE_OVERLAP((_cand)->start, (_cand)->stop, (_pred)->start, (_pred)->stop ) || \
+ SINGLE_RANGE_OVERLAP((_cand)->start2, (_cand)->stop2, (_pred)->start, (_pred)->stop ) || \
+ SINGLE_RANGE_OVERLAP((_cand)->start, (_cand)->stop, (_pred)->start2, (_pred)->stop2) || \
+ SINGLE_RANGE_OVERLAP((_cand)->start2, (_cand)->stop2, (_pred)->start2, (_pred)->stop2) )
+
+/* Determines if a candidate lock request conflicts with a predecessor lock req.
+ * Note that the arguments are not interchangeable.
+ * The rules are:
+ * a candidate read conflicts with a predecessor write if any ranges overlap
+ * a candidate write conflicts with a predecessor read if any ranges overlap
+ * a candidate write conflicts with a predecessor write if any ranges overlap
+ */
+#define STRIPELOCK_CONFLICT(_cand, _pred) \
+ RANGE_OVERLAP((_cand), (_pred)) && \
+ ( ( (((_cand)->type == RF_IO_TYPE_READ) && ((_pred)->type == RF_IO_TYPE_WRITE)) || \
+ (((_cand)->type == RF_IO_TYPE_WRITE) && ((_pred)->type == RF_IO_TYPE_READ)) || \
+ (((_cand)->type == RF_IO_TYPE_WRITE) && ((_pred)->type == RF_IO_TYPE_WRITE)) \
+ ) \
+ )
+
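+#if 0
+/*
+ * Illustrative sketch only (never compiled): how the conflict rules above
+ * evaluate for two hypothetical lock requests.  The sector numbers are
+ * made up for the example.
+ */
+static int rf_ExampleConflict()
+{
+  RF_LockReqDesc_t cand, pred;
+
+  cand.type = RF_IO_TYPE_WRITE;   /* candidate write over sectors 100-149 */
+  cand.start = 100;  cand.stop = 149;
+  cand.start2 = -1;  cand.stop2 = -1;   /* no second range */
+
+  pred.type = RF_IO_TYPE_READ;    /* granted read over sectors 140-199 */
+  pred.start = 140;  pred.stop = 199;
+  pred.start2 = -1;  pred.stop2 = -1;
+
+  /* ranges overlap (140-149) and one of the two is a write => conflict */
+  return (STRIPELOCK_CONFLICT(&cand, &pred));
+}
+#endif /* 0 */
+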
+static RF_FreeList_t *rf_stripelock_freelist;
+#define RF_MAX_FREE_STRIPELOCK 128
+#define RF_STRIPELOCK_INC 8
+#define RF_STRIPELOCK_INITIAL 32
+
+static void rf_ShutdownStripeLockFreeList(void *);
+static void rf_RaidShutdownStripeLocks(void *);
+
+static void rf_ShutdownStripeLockFreeList(ignored)
+ void *ignored;
+{
+ RF_FREELIST_DESTROY(rf_stripelock_freelist,next,(RF_StripeLockDesc_t *));
+}
+
+int rf_ConfigureStripeLockFreeList(listp)
+ RF_ShutdownList_t **listp;
+{
+ unsigned mask;
+ int rc;
+
+ RF_FREELIST_CREATE(rf_stripelock_freelist, RF_MAX_FREE_STRIPELOCK,
+ RF_STRIPELOCK_INITIAL,sizeof(RF_StripeLockDesc_t));
+ rc = rf_ShutdownCreate(listp, rf_ShutdownStripeLockFreeList, NULL);
+ if (rc) {
+ RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n",
+ __FILE__, __LINE__, rc);
+ rf_ShutdownStripeLockFreeList(NULL);
+ return(rc);
+ }
+ RF_FREELIST_PRIME(rf_stripelock_freelist,RF_STRIPELOCK_INITIAL,next,
+ (RF_StripeLockDesc_t *));
+ for (mask=0x1; mask; mask<<=1)
+ if (rf_lockTableSize==mask)
+ break;
+ if (!mask) {
+ printf("[WARNING: lock table size must be a power of two. Setting to %d.]\n",RF_DEFAULT_LOCK_TABLE_SIZE);
+ rf_lockTableSize = RF_DEFAULT_LOCK_TABLE_SIZE;
+ }
+ return(0);
+}
+
+RF_LockTableEntry_t *rf_MakeLockTable()
+{
+ RF_LockTableEntry_t *lockTable;
+ int i, rc;
+
+ RF_Calloc(lockTable, ((int) rf_lockTableSize), sizeof(RF_LockTableEntry_t), (RF_LockTableEntry_t *));
+ if (lockTable == NULL)
+ return(NULL);
+ for (i=0; i<rf_lockTableSize; i++) {
+ rc = rf_mutex_init(&lockTable[i].mutex);
+ if (rc) {
+ RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d\n", __FILE__,
+ __LINE__, rc);
+ /* XXX clean up other mutexes */
+ return(NULL);
+ }
+ }
+ return(lockTable);
+}
+
+void rf_ShutdownStripeLocks(RF_LockTableEntry_t *lockTable)
+{
+ int i;
+
+ if (rf_stripeLockDebug) {
+ PrintLockedStripes(lockTable);
+ }
+ for (i=0; i<rf_lockTableSize; i++) {
+ rf_mutex_destroy(&lockTable[i].mutex);
+ }
+ RF_Free(lockTable, rf_lockTableSize*sizeof(RF_LockTableEntry_t));
+}
+
+static void rf_RaidShutdownStripeLocks(arg)
+ void *arg;
+{
+ RF_Raid_t *raidPtr = (RF_Raid_t *)arg;
+ rf_ShutdownStripeLocks(raidPtr->lockTable);
+}
+
+int rf_ConfigureStripeLocks(
+ RF_ShutdownList_t **listp,
+ RF_Raid_t *raidPtr,
+ RF_Config_t *cfgPtr)
+{
+ int rc;
+
+ raidPtr->lockTable = rf_MakeLockTable();
+ if (raidPtr->lockTable == NULL)
+ return(ENOMEM);
+ rc = rf_ShutdownCreate(listp, rf_RaidShutdownStripeLocks, raidPtr);
+ if (rc) {
+ RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n",
+ __FILE__, __LINE__, rc);
+ rf_ShutdownStripeLocks(raidPtr->lockTable);
+ return(rc);
+ }
+ return(0);
+}
+
+/* Returns 0 if you've got the lock, and non-zero if you have to wait.
+ * If and only if you have to wait, the cbFunc in the lock request
+ * descriptor gets invoked with cbArg when you are granted the lock.
+ */
+int rf_AcquireStripeLock(
+ RF_LockTableEntry_t *lockTable,
+ RF_StripeNum_t stripeID,
+ RF_LockReqDesc_t *lockReqDesc)
+{
+ RF_StripeLockDesc_t *lockDesc;
+ RF_LockReqDesc_t *p;
+ int tid=0, hashval = HASH_STRIPEID(stripeID);
+ int retcode = 0;
+
+ RF_ASSERT(RF_IO_IS_R_OR_W(lockReqDesc->type));
+
+ if (rf_stripeLockDebug) {
+ rf_get_threadid(tid);
+    if (stripeID == -1) Dprintf1("[%d] Lock acquisition suppressed (stripeID == -1)\n",tid);
+ else {
+ Dprintf8("[%d] Trying to acquire stripe lock table 0x%lx SID %ld type %c range %ld-%ld, range2 %ld-%ld hashval %d\n",
+ tid, (unsigned long) lockTable, stripeID, lockReqDesc->type, lockReqDesc->start,
+ lockReqDesc->stop, lockReqDesc->start2, lockReqDesc->stop2);
+ Dprintf3("[%d] lock %ld hashval %d\n", tid, stripeID, hashval);
+ FLUSH;
+ }
+ }
+ if (stripeID == -1) return(0);
+ lockReqDesc->next = NULL; /* just to be sure */
+
+ RF_LOCK_MUTEX(lockTable[hashval].mutex);
+ for (lockDesc = lockTable[hashval].descList; lockDesc; lockDesc=lockDesc->next) {
+ if (lockDesc->stripeID == stripeID) break;
+ }
+
+ if (!lockDesc) { /* no entry in table => no one reading or writing */
+ lockDesc = AllocStripeLockDesc(stripeID);
+ lockDesc->next = lockTable[hashval].descList;
+ lockTable[hashval].descList = lockDesc;
+ if (lockReqDesc->type == RF_IO_TYPE_WRITE) lockDesc->nWriters++;
+ lockDesc->granted = lockReqDesc;
+ if (rf_stripeLockDebug) {Dprintf7("[%d] no one waiting: lock %ld %c %ld-%ld %ld-%ld granted\n",
+ tid,stripeID,lockReqDesc->type,lockReqDesc->start,lockReqDesc->stop,lockReqDesc->start2,lockReqDesc->stop2); FLUSH;}
+ } else {
+
+ if (lockReqDesc->type == RF_IO_TYPE_WRITE) lockDesc->nWriters++;
+
+ if (lockDesc->nWriters == 0) { /* no need to search any lists if there are no writers anywhere */
+ lockReqDesc->next = lockDesc->granted;
+ lockDesc->granted = lockReqDesc;
+ if (rf_stripeLockDebug) {Dprintf7("[%d] no writers: lock %ld %c %ld-%ld %ld-%ld granted\n",
+ tid,stripeID,lockReqDesc->type,lockReqDesc->start,lockReqDesc->stop,lockReqDesc->start2,lockReqDesc->stop2); FLUSH;}
+ } else {
+
+ /* search the granted & waiting lists for a conflict. stop searching as soon as we find one */
+ retcode = 0;
+ for (p = lockDesc->granted; p; p=p->next) if (STRIPELOCK_CONFLICT(lockReqDesc, p)) {retcode = 1; break;}
+ if (!retcode) for (p = lockDesc->waitersH; p; p=p->next) if (STRIPELOCK_CONFLICT(lockReqDesc, p)) {retcode = 2; break;}
+
+ if (!retcode) {
+ lockReqDesc->next = lockDesc->granted; /* no conflicts found => grant lock */
+ lockDesc->granted = lockReqDesc;
+ if (rf_stripeLockDebug) {
+ Dprintf7("[%d] no conflicts: lock %ld %c %ld-%ld %ld-%ld granted\n",
+ tid,stripeID,lockReqDesc->type,lockReqDesc->start,lockReqDesc->stop,
+ lockReqDesc->start2,lockReqDesc->stop2);
+ FLUSH;
+ }
+ } else {
+ if (rf_stripeLockDebug) {
+ Dprintf6("[%d] conflict: lock %ld %c %ld-%ld hashval=%d not granted\n",
+ tid,stripeID,lockReqDesc->type,lockReqDesc->start,lockReqDesc->stop,
+ hashval);
+ Dprintf3("[%d] lock %ld retcode=%d\n", tid, stripeID, retcode);
+ FLUSH;
+ }
+ AddToWaitersQueue(lockTable, lockDesc, lockReqDesc); /* conflict => the current access must wait */
+ }
+ }
+ }
+
+ RF_UNLOCK_MUTEX(lockTable[hashval].mutex);
+ return(retcode);
+}
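+
+#if 0
+/*
+ * Illustrative sketch only (never compiled): the intended calling pattern
+ * for rf_AcquireStripeLock/rf_ReleaseStripeLock.  The function names and
+ * sector numbers are made up; the real callers live in the DAG code, and
+ * the lock request descriptor must point to storage that stays valid until
+ * the lock is released (callers embed it in their access state).
+ */
+static void rf_ExampleLockCallback(struct buf *arg)
+{
+  /* the lock has now been granted; resume the suspended operation here */
+}
+
+static void rf_ExampleParityUpdate(RF_Raid_t *raidPtr, RF_StripeNum_t sid, RF_LockReqDesc_t *lreq)
+{
+  lreq->type = RF_IO_TYPE_WRITE;
+  lreq->start = 1024;  lreq->stop = 1031;   /* parity range, made up */
+  lreq->start2 = -1;   lreq->stop2 = -1;    /* no second range */
+  lreq->next = NULL;   lreq->templink = NULL;
+  lreq->cbFunc = rf_ExampleLockCallback;
+  lreq->cbArg = NULL;
+
+  if (rf_AcquireStripeLock(raidPtr->lockTable, sid, lreq) != 0)
+    return;   /* we will be called back when granted; do not block here */
+
+  /* ... read, update, and rewrite the locked parity range ... */
+
+  rf_ReleaseStripeLock(raidPtr->lockTable, sid, lreq);
+}
+#endif /* 0 */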
+
+void rf_ReleaseStripeLock(
+ RF_LockTableEntry_t *lockTable,
+ RF_StripeNum_t stripeID,
+ RF_LockReqDesc_t *lockReqDesc)
+{
+ RF_StripeLockDesc_t *lockDesc, *ld_t;
+ RF_LockReqDesc_t *lr, *lr_t, *callbacklist, *t;
+ RF_IoType_t type = lockReqDesc->type;
+ int tid=0, hashval = HASH_STRIPEID(stripeID);
+ int release_it, consider_it;
+ RF_LockReqDesc_t *candidate, *candidate_t, *predecessor;
+
+ RF_ASSERT(RF_IO_IS_R_OR_W(type));
+
+ if (rf_stripeLockDebug) {
+ rf_get_threadid(tid);
+    if (stripeID == -1) Dprintf1("[%d] Lock release suppressed (stripeID == -1)\n",tid);
+ else {Dprintf8("[%d] Releasing stripe lock on stripe ID %ld, type %c range %ld-%ld %ld-%ld table 0x%lx\n",
+ tid,stripeID,lockReqDesc->type,lockReqDesc->start,lockReqDesc->stop,lockReqDesc->start2,lockReqDesc->stop2, lockTable); FLUSH;}
+ }
+
+ if (stripeID == -1) return;
+
+ RF_LOCK_MUTEX(lockTable[hashval].mutex);
+
+ /* find the stripe lock descriptor */
+ for (ld_t = NULL, lockDesc = lockTable[hashval].descList; lockDesc; ld_t = lockDesc, lockDesc=lockDesc->next) {
+ if (lockDesc->stripeID == stripeID) break;
+ }
+ RF_ASSERT(lockDesc); /* major error to release a lock that doesn't exist */
+
+ /* find the stripe lock request descriptor & delete it from the list */
+ for (lr_t = NULL, lr = lockDesc->granted; lr; lr_t = lr, lr=lr->next) if (lr == lockReqDesc) break;
+
+ RF_ASSERT(lr && (lr == lockReqDesc)); /* major error to release a lock that hasn't been granted */
+ if (lr_t) lr_t->next = lr->next; else {
+ RF_ASSERT(lr == lockDesc->granted);
+ lockDesc->granted = lr->next;
+ }
+ lr->next = NULL;
+
+ if (lockReqDesc->type == RF_IO_TYPE_WRITE) lockDesc->nWriters--;
+
+ /* search through the waiters list to see if anyone needs to be woken up.
+ * for each such descriptor in the wait list, we check it against everything granted and against
+ * everything _in front_ of it in the waiters queue. If it conflicts with none of these, we release it.
+ *
+ * DON'T TOUCH THE TEMPLINK POINTER OF ANYTHING IN THE GRANTED LIST HERE. This will roach the case where
+ * the callback tries to acquire a new lock in the same stripe. There are some asserts to try and detect this.
+ *
+ * We apply 2 performance optimizations:
+ * (1) if releasing this lock results in no more writers to this stripe, we just release everybody waiting,
+ * since we place no restrictions on the number of concurrent reads.
+ * (2) we consider as candidates for wakeup only those waiters that have a range overlap with either
+ * the descriptor being woken up or with something in the callbacklist (i.e. something we've just now woken up).
+ * This allows us to avoid the long evaluation for some descriptors.
+ */
+
+ callbacklist = NULL;
+ if (lockDesc->nWriters == 0) { /* performance tweak (1) */
+ while (lockDesc->waitersH) {
+
+ lr = lockDesc->waitersH; /* delete from waiters list */
+ lockDesc->waitersH = lr->next;
+
+ RF_ASSERT(lr->type == RF_IO_TYPE_READ);
+
+ lr->next = lockDesc->granted; /* add to granted list */
+ lockDesc->granted = lr;
+
+ RF_ASSERT(!lr->templink);
+ lr->templink = callbacklist; /* put on callback list so that we'll invoke callback below */
+ callbacklist = lr;
+ if (rf_stripeLockDebug) {Dprintf8("[%d] No writers: granting lock stripe ID %ld, type %c range %ld-%ld %ld-%ld table 0x%lx\n",
+ tid,stripeID,lr->type,lr->start,lr->stop,lr->start2,lr->stop2,(unsigned long) lockTable); FLUSH;}
+ }
+ lockDesc->waitersT = NULL; /* we've purged the whole waiters list */
+
+ } else for (candidate_t = NULL, candidate = lockDesc->waitersH; candidate; ) {
+
+ /* performance tweak (2) */
+ consider_it = 0;
+ if (RANGE_OVERLAP(lockReqDesc, candidate)) consider_it = 1;
+ else for (t = callbacklist; t; t=t->templink) if (RANGE_OVERLAP(t, candidate)) {
+ consider_it = 1;
+ break;
+ }
+ if (!consider_it) {
+ if (rf_stripeLockDebug) {Dprintf8("[%d] No overlap: rejecting candidate stripeID %ld, type %c range %ld-%ld %ld-%ld table 0x%lx\n",
+ tid, stripeID, candidate->type, candidate->start, candidate->stop, candidate->start2, candidate->stop2,
+ (unsigned long) lockTable); FLUSH;}
+ candidate_t = candidate; candidate = candidate->next;
+ continue;
+ }
+
+
+ /* we have a candidate for release. check to make sure it is not blocked by any granted locks */
+ release_it = 1;
+ for (predecessor = lockDesc->granted; predecessor; predecessor = predecessor->next) {
+ if (STRIPELOCK_CONFLICT(candidate, predecessor)) {
+ if (rf_stripeLockDebug) {
+ Dprintf8("[%d] Conflicts with granted lock: rejecting candidate stripeID %ld, type %c range %ld-%ld %ld-%ld table 0x%lx\n",
+ tid, stripeID, candidate->type, candidate->start, candidate->stop, candidate->start2, candidate->stop2,
+ (unsigned long) lockTable); FLUSH;
+ }
+ release_it = 0; break;
+ }
+ }
+
+    /* now check to see if the candidate is blocked by any waiters that occur before it in the wait queue */
+ if (release_it) for (predecessor = lockDesc->waitersH; predecessor != candidate; predecessor = predecessor->next) {
+ if (STRIPELOCK_CONFLICT(candidate, predecessor)) {
+ if (rf_stripeLockDebug) {
+ Dprintf8("[%d] Conflicts with waiting lock: rejecting candidate stripeID %ld, type %c range %ld-%ld %ld-%ld table 0x%lx\n",
+ tid, stripeID, candidate->type, candidate->start, candidate->stop, candidate->start2, candidate->stop2,
+ (unsigned long) lockTable); FLUSH;
+ }
+ release_it = 0; break;
+ }
+ }
+
+ /* release it if indicated */
+ if (release_it) {
+ if (rf_stripeLockDebug) {Dprintf8("[%d] Granting lock to candidate stripeID %ld, type %c range %ld-%ld %ld-%ld table 0x%lx\n",
+ tid, stripeID, candidate->type, candidate->start, candidate->stop, candidate->start2, candidate->stop2,
+ (unsigned long) lockTable); FLUSH;}
+ if (candidate_t) {
+ candidate_t->next = candidate->next;
+ if (lockDesc->waitersT == candidate) lockDesc->waitersT = candidate_t; /* cannot be waitersH since candidate_t is not NULL */
+ } else {
+ RF_ASSERT(candidate == lockDesc->waitersH);
+ lockDesc->waitersH = lockDesc->waitersH->next;
+ if (!lockDesc->waitersH) lockDesc->waitersT = NULL;
+ }
+ candidate->next = lockDesc->granted; /* move it to the granted list */
+ lockDesc->granted = candidate;
+
+ RF_ASSERT(!candidate->templink);
+ candidate->templink = callbacklist; /* put it on the list of things to be called after we release the mutex */
+ callbacklist = candidate;
+
+ if (!candidate_t) candidate = lockDesc->waitersH; else candidate = candidate_t->next; /* continue with the rest of the list */
+ } else {
+ candidate_t = candidate; candidate = candidate->next; /* continue with the rest of the list */
+ }
+ }
+
+ /* delete the descriptor if no one is waiting or active */
+ if (!lockDesc->granted && !lockDesc->waitersH) {
+ RF_ASSERT(lockDesc->nWriters == 0);
+ if (rf_stripeLockDebug) {
+ Dprintf3("[%d] Last lock released (table 0x%lx): deleting desc for stripeID %ld\n",tid,(unsigned long) lockTable, stripeID); FLUSH;
+ }
+ if (ld_t) ld_t->next = lockDesc->next; else {
+ RF_ASSERT(lockDesc == lockTable[hashval].descList);
+ lockTable[hashval].descList = lockDesc->next;
+ }
+ FreeStripeLockDesc(lockDesc);
+ lockDesc = NULL; /* only for the ASSERT below */
+ }
+
+ RF_UNLOCK_MUTEX(lockTable[hashval].mutex);
+
+ /* now that we've unlocked the mutex, invoke the callback on all the descriptors in the list */
+ RF_ASSERT(!( (callbacklist) && (!lockDesc) )); /* if we deleted the descriptor, we should have no callbacks to do */
+ for (candidate = callbacklist; candidate; ) {
+ t = candidate;
+ candidate = candidate->templink;
+ t->templink = NULL;
+ (t->cbFunc)(t->cbArg);
+ }
+}
+
+/* must have the indicated lock table mutex upon entry */
+static void AddToWaitersQueue(
+ RF_LockTableEntry_t *lockTable,
+ RF_StripeLockDesc_t *lockDesc,
+ RF_LockReqDesc_t *lockReqDesc)
+{
+ int tid;
+
+ if (rf_stripeLockDebug) {
+ rf_get_threadid(tid);
+ Dprintf3("[%d] Waiting on lock for stripe %ld table 0x%lx\n", tid, lockDesc->stripeID, (unsigned long) lockTable); FLUSH;
+ }
+ if (!lockDesc->waitersH) {
+ lockDesc->waitersH = lockDesc->waitersT = lockReqDesc;
+ } else {
+ lockDesc->waitersT->next = lockReqDesc;
+ lockDesc->waitersT = lockReqDesc;
+ }
+}
+
+static RF_StripeLockDesc_t *AllocStripeLockDesc(RF_StripeNum_t stripeID)
+{
+ RF_StripeLockDesc_t *p;
+
+ RF_FREELIST_GET(rf_stripelock_freelist,p,next,(RF_StripeLockDesc_t *));
+ if (p) {
+ p->stripeID = stripeID;
+ }
+ return(p);
+}
+
+static void FreeStripeLockDesc(RF_StripeLockDesc_t *p)
+{
+ RF_FREELIST_FREE(rf_stripelock_freelist,p,next);
+}
+
+static void PrintLockedStripes(lockTable)
+ RF_LockTableEntry_t *lockTable;
+{
+ int i, j, foundone = 0, did;
+ RF_StripeLockDesc_t *p;
+ RF_LockReqDesc_t *q;
+
+ RF_LOCK_MUTEX(rf_printf_mutex);
+ printf("Locked stripes:\n");
+ for (i=0; i<rf_lockTableSize; i++) if (lockTable[i].descList) {
+ foundone = 1;
+ for (p = lockTable[i].descList; p; p=p->next) {
+ printf("Stripe ID 0x%lx (%d) nWriters %d\n",
+ (long)p->stripeID, (int)p->stripeID, p->nWriters);
+
+ if (! (p->granted) ) printf("Granted: (none)\n"); else printf("Granted:\n");
+ for (did=1,j=0,q = p->granted; q; j++,q=q->next) {
+ printf(" %c(%ld-%ld",q->type,(long)q->start,(long)q->stop);
+ if (q->start2 != -1) printf(",%ld-%ld) ",(long)q->start2,
+ (long)q->stop2); else printf(") ");
+ if (j && !(j%4)) {printf("\n"); did=1;} else did=0;
+ }
+ if (!did) printf("\n");
+
+ if (! (p->waitersH) ) printf("Waiting: (none)\n"); else printf("Waiting:\n");
+ for (did=1,j=0,q = p->waitersH; q; j++,q=q->next) {
+ printf("%c(%ld-%ld",q->type,(long)q->start,(long)q->stop);
+ if (q->start2 != -1) printf(",%ld-%ld) ",(long)q->start2,(long)q->stop2); else printf(") ");
+ if (j && !(j%4)) {printf("\n "); did=1;} else did=0;
+ }
+ if (!did) printf("\n");
+ }
+ }
+ if (!foundone) printf("(none)\n"); else printf("\n");
+ RF_UNLOCK_MUTEX(rf_printf_mutex);
+}
diff --git a/sys/dev/raidframe/rf_stripelocks.h b/sys/dev/raidframe/rf_stripelocks.h
new file mode 100644
index 00000000000..46412504247
--- /dev/null
+++ b/sys/dev/raidframe/rf_stripelocks.h
@@ -0,0 +1,170 @@
+/* $OpenBSD: rf_stripelocks.h,v 1.1 1999/01/11 14:29:51 niklas Exp $ */
+/* $NetBSD: rf_stripelocks.h,v 1.1 1998/11/13 04:20:34 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* :
+ * Log: rf_stripelocks.h,v
+ * Revision 1.22 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.21 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.20 1996/06/05 18:06:02 jimz
+ * Major code cleanup. The Great Renaming is now done.
+ * Better modularity. Better typing. Fixed a bunch of
+ * synchronization bugs. Made a lot of global stuff
+ * per-desc or per-array. Removed dead code.
+ *
+ * Revision 1.19 1996/05/30 11:29:41 jimz
+ * Numerous bug fixes. Stripe lock release code disagreed with the taking code
+ * about when stripes should be locked (I made it consistent: no parity, no lock)
+ * There was a lot of extra serialization of I/Os which I've removed- a lot of
+ * it was to calculate values for the cache code, which is no longer with us.
+ * More types, function, macro cleanup. Added code to properly quiesce the array
+ * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
+ * before. Fixed memory allocation, freeing bugs.
+ *
+ * Revision 1.18 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.17 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.16 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.15 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.14 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.13 1996/05/06 22:08:46 wvcii
+ * added copyright info and change log
+ *
+ */
+
+/*****************************************************************************
+ *
+ * stripelocks.h -- header file for locking stripes
+ *
+ * Note that these functions are called from the execution routines of certain
+ * DAG Nodes, and so they must be NON-BLOCKING to assure maximum parallelism
+ * in the DAG. Accordingly, when a node wants to acquire a lock, it calls
+ * AcquireStripeLock, supplying a pointer to a callback function. If the lock
+ * is free at the time of the call, 0 is returned, indicating that the lock
+ * has been acquired. If the lock is not free, a nonzero value is returned,
+ * and a copy of the callback function pointer and argument is held in the
+ * lock table. When the lock becomes free, the callback function is invoked.
+ *
+ *****************************************************************************/
+
+#ifndef _RF__RF_STRIPELOCKS_H_
+#define _RF__RF_STRIPELOCKS_H_
+
+#include <sys/buf.h>
+
+#include "rf_types.h"
+#include "rf_threadstuff.h"
+#include "rf_general.h"
+
+struct RF_LockReqDesc_s {
+ RF_IoType_t type; /* read or write */
+ RF_int64 start, stop; /* start and end of range to be locked */
+ RF_int64 start2, stop2; /* start and end of 2nd range to be locked */
+ void (*cbFunc)(struct buf *);/* callback function */
+ void *cbArg; /* argument to callback function */
+ RF_LockReqDesc_t *next; /* next element in chain */
+ RF_LockReqDesc_t *templink; /* for making short-lived lists of request descriptors */
+};
+
+#define RF_ASSERT_VALID_LOCKREQ(_lr_) { \
+ RF_ASSERT(RF_IO_IS_R_OR_W((_lr_)->type)); \
+}
+
+struct RF_StripeLockDesc_s {
+ RF_StripeNum_t stripeID; /* the stripe ID */
+ RF_LockReqDesc_t *granted; /* unordered list of granted requests */
+ RF_LockReqDesc_t *waitersH; /* FIFO queue of all waiting reqs, both read and write (Head and Tail) */
+ RF_LockReqDesc_t *waitersT;
+ int nWriters; /* number of writers either granted or waiting */
+ RF_StripeLockDesc_t *next; /* for hash table collision resolution */
+};
+
+struct RF_LockTableEntry_s {
+ RF_DECLARE_MUTEX(mutex) /* mutex on this hash chain */
+ RF_StripeLockDesc_t *descList; /* hash chain of lock descriptors */
+};
+
+/*
+ * Initializes a lock request descriptor. _defSize is the number of sectors
+ * that we lock when there is no parity information in the ASM (e.g. RAID0).
+ */
+
+#define RF_INIT_LOCK_REQ_DESC(_lrd, _typ, _cbf, _cba, _asm, _defSize) \
+ { \
+ (_lrd).type = _typ; \
+ (_lrd).start2 = -1; \
+ (_lrd).stop2 = -1; \
+ if ((_asm)->parityInfo) { \
+ (_lrd).start = (_asm)->parityInfo->startSector; \
+ (_lrd).stop = (_asm)->parityInfo->startSector + (_asm)->parityInfo->numSector-1; \
+ if ((_asm)->parityInfo->next) { \
+ (_lrd).start2 = (_asm)->parityInfo->next->startSector; \
+ (_lrd).stop2 = (_asm)->parityInfo->next->startSector + (_asm)->parityInfo->next->numSector-1; \
+ } \
+ } else { \
+ (_lrd).start = 0; \
+ (_lrd).stop = (_defSize); \
+ } \
+ (_lrd).templink= NULL; \
+ (_lrd).cbFunc = (_cbf); \
+ (_lrd).cbArg = (void *) (_cba); \
+ }
+
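+/*
+ * Example (illustrative, hypothetical values): if (_asm)->parityInfo covers
+ * 8 sectors starting at sector 1024 and has one further extent of 8 sectors
+ * starting at sector 2048, the macro leaves start=1024, stop=1031,
+ * start2=2048, stop2=2055.  With no parity information it locks the range
+ * [0, _defSize] and leaves start2 = stop2 = -1, which the lock code treats
+ * as "no second range".
+ */
+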
+int rf_ConfigureStripeLockFreeList(RF_ShutdownList_t **listp);
+RF_LockTableEntry_t *rf_MakeLockTable(void);
+void rf_ShutdownStripeLocks(RF_LockTableEntry_t *lockTable);
+int rf_ConfigureStripeLocks(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr,
+ RF_Config_t *cfgPtr);
+int rf_AcquireStripeLock(RF_LockTableEntry_t *lockTable,
+ RF_StripeNum_t stripeID, RF_LockReqDesc_t *lockReqDesc);
+void rf_ReleaseStripeLock(RF_LockTableEntry_t *lockTable,
+ RF_StripeNum_t stripeID, RF_LockReqDesc_t *lockReqDesc);
+
+#endif /* !_RF__RF_STRIPELOCKS_H_ */
diff --git a/sys/dev/raidframe/rf_strutils.c b/sys/dev/raidframe/rf_strutils.c
new file mode 100644
index 00000000000..1c42b6b6b56
--- /dev/null
+++ b/sys/dev/raidframe/rf_strutils.c
@@ -0,0 +1,62 @@
+/* $OpenBSD: rf_strutils.c,v 1.1 1999/01/11 14:29:51 niklas Exp $ */
+/* $NetBSD: rf_strutils.c,v 1.1 1998/11/13 04:20:35 oster Exp $ */
+/*
+ * rf_strutils.c
+ *
+ * String-parsing funcs
+ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+/*
+ * rf_strutils.c -- some simple utilities for munging on strings.
+ * I put them in a file by themselves because they're needed in
+ * setconfig, in the user-level driver, and in the kernel.
+ *
+ * :
+ * Log: rf_strutils.c,v
+ * Revision 1.2 1996/06/02 17:31:48 jimz
+ * Moved a lot of global stuff into array structure, where it belongs.
+ * Fixed up paritylogging, pss modules in this manner. Some general
+ * code cleanup. Removed lots of dead code, some dead files.
+ *
+ */
+
+#include "rf_utils.h"
+
+/* finds a non-white character in the line */
+char *rf_find_non_white(char *p)
+{
+ for (; *p != '\0' && (*p == ' ' || *p == '\t'); p++);
+ return(p);
+}
+
+/* finds a white character in the line */
+char *rf_find_white(char *p)
+{
+ for (; *p != '\0' && (*p != ' ' && *p != '\t'); p++);
+ return(p);
+}
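+
+#if 0
+/*
+ * Illustrative sketch only (never compiled): pulling the first
+ * whitespace-delimited token out of a configuration line.
+ */
+static void rf_example_first_token(char *line, char **startp, char **endp)
+{
+  *startp = rf_find_non_white(line);   /* first character of the token */
+  *endp = rf_find_white(*startp);      /* one past its last character */
+}
+#endif /* 0 */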
diff --git a/sys/dev/raidframe/rf_sys.c b/sys/dev/raidframe/rf_sys.c
new file mode 100644
index 00000000000..e6eb17bb7ef
--- /dev/null
+++ b/sys/dev/raidframe/rf_sys.c
@@ -0,0 +1,260 @@
+/* $OpenBSD: rf_sys.c,v 1.1 1999/01/11 14:29:53 niklas Exp $ */
+/* $NetBSD: rf_sys.c,v 1.1 1998/11/13 04:20:35 oster Exp $ */
+/*
+ * rf_sys.c
+ *
+ * Jim Zelenka, CMU/SCS, 14 June 1996
+ */
+/*
+ * Copyright (c) 1996 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+#ifdef _KERNEL
+#define KERNEL
+#endif
+
+#include "rf_types.h"
+#include "rf_sys.h"
+#ifndef KERNEL
+#include <errno.h>
+#include <fcntl.h>
+#include <nlist.h>
+#include <stdio.h>
+#include <unistd.h>
+#endif /* !KERNEL */
+#include <sys/param.h>
+#if !defined(sun) && !defined(__NetBSD__) && !defined(__OpenBSD__) && !defined(LINUX) && (!defined(MACH) || defined(__osf__))
+#include <sys/sysinfo.h>
+#endif /* !sun && !__NetBSD__ && !__OpenBSD__ && !LINUX && (!MACH || __osf__) */
+#include <sys/time.h>
+#ifdef __osf__
+#include <machine/rpb.h>
+#include <machine/hal/hal_sysinfo.h>
+#endif /* __osf__ */
+#include "rf_etimer.h"
+#include "rf_general.h"
+#include "rf_threadstuff.h"
+
+#ifdef KERNEL
+extern struct rpb *rpb;
+#endif /* KERNEL */
+
+/* timer stuff */
+#ifdef __alpha
+long rf_timer_max_val;
+long rf_timer_ticks_per_second;
+unsigned long rf_timer_ticks_per_usec;
+#endif /* __alpha */
+
+
+#if defined(__NetBSD__) || defined(__OpenBSD__)
+long rf_timer_max_val;
+long rf_timer_ticks_per_second;
+unsigned long rf_timer_ticks_per_usec;
+#endif /* __NetBSD__ || __OpenBSD__ */
+
+#if !defined(KERNEL) && !defined(SIMULATE) && (RF_UTILITY == 0)
+pthread_attr_t raidframe_attr_default;
+
+int rf_thread_create(
+ RF_Thread_t *thread,
+ pthread_attr_t attr,
+ void (*func)(),
+ RF_ThreadArg_t arg)
+{
+ int rc;
+
+#ifdef __osf__
+ rc = pthread_create(thread, attr, (pthread_startroutine_t)func, arg);
+#endif /* __osf__ */
+#ifdef AIX
+ rc = pthread_create(thread, &attr, (void *(*)(void *))func, arg);
+#endif /* AIX */
+ if (rc)
+ return(errno);
+ rc = pthread_detach(thread);
+ if (rc) {
+ /* don't return error, because the thread exists, and must be cleaned up */
+ RF_ERRORMSG1("RAIDFRAME WARNING: failed detaching thread %lx\n", thread);
+ }
+ return(0);
+}
+#endif /* !KERNEL && !SIMULATE && (RF_UTILITY == 0) */
+
+#if defined(__osf__) && !defined(KERNEL)
+int rf_get_cpu_ticks_per_sec(long *ticksp)
+{
+ char *kmemdevname, buf[sizeof(struct rpb)+8];
+ char *memdevname, kernel_name[MAXPATHLEN+1];
+ struct nlist nl[2], *np;
+ unsigned long rpb_addr;
+ int kfd, rc, fd, bad;
+ struct rpb rpb;
+ off_t off;
+
+ kmemdevname = "/dev/kmem";
+ memdevname = "/dev/mem";
+
+ np = &nl[0];
+ bzero((char *)np, sizeof(nl));
+ nl[0].n_name = "pmap_physhwrpb";
+ nl[1].n_name = NULL;
+
+ bad = 0;
+
+ /* get running kernel name */
+ bzero(kernel_name, MAXPATHLEN+1);
+ kernel_name[0] = '/';
+ rc = getsysinfo(GSI_BOOTEDFILE, &kernel_name[1], MAXPATHLEN, 0, 0);
+ if (rc != 1) {
+ RF_ERRORMSG("RAIDFRAME: cannot get booted kernel name\n");
+ if (errno)
+ return(errno);
+ else
+ return(EIO);
+ }
+
+ rc = nlist(kernel_name, np);
+ if (rc) {
+ RF_ERRORMSG1("RAIDFRAME: cannot nlist %s\n", kernel_name);
+ return(EIO);
+ }
+
+ if (np->n_type == 0) {
+ RF_ERRORMSG1("RAIDFRAME: cannot usefully nlist %s\n", kernel_name);
+ return(EIO);
+ }
+
+ kfd = open(kmemdevname, O_RDONLY);
+ if (kfd < 0) {
+ perror(kmemdevname);
+ return(errno);
+ }
+ fd = open(memdevname, O_RDONLY);
+ if (fd < 0) {
+    perror(memdevname);
+ return(errno);
+ }
+
+ /*
+ * pmap_physhwrpb is a variable in the kernel containing the physical
+ * address of the hardware RPB. We'll just find that variable and
+ * read it, then use that as a physical memory address to read the
+ * rpb itself.
+ */
+
+ off = lseek(kfd, np->n_value, SEEK_SET);
+ if (off != np->n_value) {
+ RF_ERRORMSG("RAIDFRAME: cannot seek to address of hwrpb addr\n");
+ return(EIO);
+ }
+
+ rc = read(kfd, &rpb_addr, sizeof(rpb_addr));
+ if (rc != sizeof(rpb_addr)) {
+ RF_ERRORMSG("RAIDFRAME: cannot read address of hwrpb addr\n");
+    if (rc < 0)
+      bad = errno;
+    else
+      bad = EIO;
+ goto isbad;
+ }
+
+ off = lseek(fd, rpb_addr, SEEK_SET);
+ if (off != rpb_addr) {
+ RF_ERRORMSG("RAIDFRAME: cannot seek to rpb addr\n");
+ bad = EIO;
+ goto isbad;
+ }
+
+ rc = read(fd, &rpb, sizeof(rpb));
+ if (rc != sizeof(rpb)) {
+ RF_ERRORMSG1("RAIDFRAME: cannot read rpb (rc=%d)\n", rc);
+    if (rc < 0)
+      bad = errno;
+    else
+      bad = EIO;
+ goto isbad;
+ }
+
+ /*
+ * One extra sanity check: the RPB is self-identifying.
+ * This field is guaranteed to have the value
+ * 0x0000004250525748, always.
+ */
+ if (rpb.rpb_string != 0x0000004250525748) {
+ bad = EIO;
+ goto isbad;
+ }
+
+isbad:
+ if (bad) {
+ RF_ERRORMSG("ERROR: rpb failed validation\n");
+ RF_ERRORMSG1("RAIDFRAME: perhaps %s has changed since booting?\n",
+ kernel_name);
+ return(bad);
+ }
+
+ *ticksp = rpb.rpb_counter;
+
+ close(kfd);
+ close(fd);
+
+ return(0);
+}
+#endif /* __osf__ && !KERNEL */
+
+int rf_ConfigureEtimer(listp)
+ RF_ShutdownList_t **listp;
+{
+#ifdef __osf__
+ int rc;
+
+#ifdef KERNEL
+ rf_timer_ticks_per_second = rpb->rpb_counter;
+#else /* KERNEL */
+ rc = rf_get_cpu_ticks_per_sec(&rf_timer_ticks_per_second);
+ if (rc)
+ return(rc);
+#endif /* KERNEL */
+ rf_timer_max_val = RF_DEF_TIMER_MAX_VAL;
+ rf_timer_ticks_per_usec = rf_timer_ticks_per_second/1000000;
+#endif /* __osf__ */
+#if defined(NETBSD_ALPHA) || defined(OPENBSD_ALPHA)
+ /*
+ * XXX cgd fix this
+ */
+ rf_timer_ticks_per_second = 233100233;
+ rf_timer_max_val = RF_DEF_TIMER_MAX_VAL;
+ rf_timer_ticks_per_usec = rf_timer_ticks_per_second/1000000;
+#endif /* NETBSD_ALPHA || OPENBSD_ALPHA */
+#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL)
+ /* XXX just picking some random values to keep things happy... without these
+ set, stuff will panic on division by zero errors!! */
+ rf_timer_ticks_per_second = 233100233;
+ rf_timer_max_val = RF_DEF_TIMER_MAX_VAL;
+ rf_timer_ticks_per_usec = rf_timer_ticks_per_second/1000000;
+
+#endif
+ return(0);
+}
diff --git a/sys/dev/raidframe/rf_sys.h b/sys/dev/raidframe/rf_sys.h
new file mode 100644
index 00000000000..f9606708c2e
--- /dev/null
+++ b/sys/dev/raidframe/rf_sys.h
@@ -0,0 +1,69 @@
+/* $OpenBSD: rf_sys.h,v 1.1 1999/01/11 14:29:53 niklas Exp $ */
+/* $NetBSD: rf_sys.h,v 1.1 1998/11/13 04:20:35 oster Exp $ */
+/*
+ * rf_sys.h
+ *
+ * Jim Zelenka, CMU/SCS, 14 June 1996
+ */
+/*
+ * Copyright (c) 1996 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+#ifndef _RF__RF_SYS_H_
+#define _RF__RF_SYS_H_
+
+#include "rf_types.h"
+
+int rf_ConfigureEtimer(RF_ShutdownList_t **listp);
+
+#if defined(__osf__) && !defined(KERNEL)
+int rf_get_cpu_ticks_per_sec(long *ticksp);
+#endif /* __osf__ && !KERNEL */
+
+#ifdef AIX
+#include <nlist.h>
+#include <sys/time.h>
+#if RF_AIXVers == 3
+int gettimeofday(struct timeval *tp, struct timezone *tzp);
+#endif /* RF_AIXVers == 3 */
+int knlist(struct nlist *namelist, int nel, int size);
+int ffs(int index);
+#endif /* AIX */
+
+#ifdef sun
+#define bcopy(a,b,n) memcpy(b,a,n)
+#define bzero(b,n) memset(b,0,n)
+#define bcmp(a,b,n) memcmp(a,b,n)
+#endif /* sun */
+
+#ifdef __GNUC__
+/* we use gcc -Wall to check our anal-retentiveness level, occasionally */
+#if defined(DEC_OSF) && !defined(KERNEL)
+extern int ioctl(int fd, int req, ...);
+#endif /* DEC_OSF && !KERNEL */
+#endif /* __GNUC__ */
+
+#endif /* !_RF__RF_SYS_H_ */
diff --git a/sys/dev/raidframe/rf_threadid.h b/sys/dev/raidframe/rf_threadid.h
new file mode 100644
index 00000000000..ef77020b554
--- /dev/null
+++ b/sys/dev/raidframe/rf_threadid.h
@@ -0,0 +1,230 @@
+/* $OpenBSD: rf_threadid.h,v 1.1 1999/01/11 14:29:53 niklas Exp $ */
+/* $NetBSD: rf_threadid.h,v 1.1 1998/11/13 04:20:35 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Daniel Stodolsky, Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/* rf_threadid.h
+ *
+ * simple macros to register and lookup integer identifiers for threads.
+ * must include pthread.h before including this
+ *
+ * This is one of two places where the pthreads package is used explicitly.
+ * The other is in threadstuff.h
+ *
+ * none of this is used in the kernel, so it all gets compiled out if KERNEL is defined
+ */
+
+/* :
+ * Log: rf_threadid.h,v
+ * Revision 1.17 1996/08/12 20:11:17 jimz
+ * fix up for AIX4
+ *
+ * Revision 1.16 1996/06/05 18:06:02 jimz
+ * Major code cleanup. The Great Renaming is now done.
+ * Better modularity. Better typing. Fixed a bunch of
+ * synchronization bugs. Made a lot of global stuff
+ * per-desc or per-array. Removed dead code.
+ *
+ * Revision 1.15 1996/06/03 23:28:26 jimz
+ * more bugfixes
+ * check in tree to sync for IPDS runs with current bugfixes
+ * there still may be a problem with threads in the script test
+ * getting I/Os stuck- not trivially reproducible (runs ~50 times
+ * in a row without getting stuck)
+ *
+ * Revision 1.14 1996/05/30 11:29:41 jimz
+ * Numerous bug fixes. Stripe lock release code disagreed with the taking code
+ * about when stripes should be locked (I made it consistent: no parity, no lock)
+ * There was a lot of extra serialization of I/Os which I've removed- a lot of
+ * it was to calculate values for the cache code, which is no longer with us.
+ * More types, function, macro cleanup. Added code to properly quiesce the array
+ * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
+ * before. Fixed memory allocation, freeing bugs.
+ *
+ * Revision 1.13 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.12 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.11 1996/05/20 16:13:46 jimz
+ * switch to rf_{mutex,cond}_{init,destroy}
+ *
+ * Revision 1.10 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.9 1996/05/17 13:29:06 jimz
+ * did a dance on get_threadid such that it will do the pthread_attr_t -> int
+ * assignment without warnings, even on really anal compilers
+ *
+ * Revision 1.8 1995/12/06 15:15:00 root
+ * added copyright info
+ *
+ */
+
+#ifndef _RF__RF_THREADID_H_
+#define _RF__RF_THREADID_H_
+
+#ifdef _KERNEL
+#define KERNEL
+#endif
+
+#ifndef SIMULATE
+#ifndef KERNEL
+
+/*
+ * User
+ */
+
+#include "rf_threadstuff.h"
+
+extern int rf_numThrsRegistered;
+extern pthread_key_t rf_thread_id_key;
+RF_DECLARE_EXTERN_MUTEX(rf_threadid_mutex)
+
+#define RF_THREAD_MAX 200
+
+/* these must be at global scope since a function is declared; the macro should be invoked at exactly one place in the code */
+#define RF_DECLARE_GLOBAL_THREADID \
+ int rf_numThrsRegistered = 0; \
+ pthread_key_t rf_thread_id_key; \
+ RF_DECLARE_MUTEX(rf_threadid_mutex) \
+ RF_Thread_t rf_regdThrs[RF_THREAD_MAX]; \
+ void rf_ThreadIdEmptyFunc() {}
+
+/* setup must be called exactly once, i.e. it can't be called by each thread */
+
+#ifdef AIX
+typedef void (*pthread_destructor_t)(void *);
+#endif /* AIX */
+
+#ifdef __osf__
+#define rf_setup_threadid() { \
+ extern void rf_ThreadIdEmptyFunc(); \
+ pthread_keycreate(&rf_thread_id_key, (pthread_destructor_t) rf_ThreadIdEmptyFunc); \
+ rf_mutex_init(&rf_threadid_mutex); /* XXX check return val */ \
+ rf_numThrsRegistered = 0; \
+}
+#endif /* __osf__ */
+
+#ifdef AIX
+#define rf_setup_threadid() { \
+ extern void rf_ThreadIdEmptyFunc(); \
+ pthread_key_create(&rf_thread_id_key, (pthread_destructor_t) rf_ThreadIdEmptyFunc); \
+ rf_mutex_init(&rf_threadid_mutex); /* XXX check return val */ \
+ rf_numThrsRegistered = 0; \
+}
+#endif /* AIX */
+
+#define rf_shutdown_threadid() { \
+ rf_mutex_destroy(&rf_threadid_mutex); \
+}
+
+#ifdef __osf__
+typedef pthread_addr_t RF_THID_cast_t;
+#endif /* __osf__ */
+
+#ifdef AIX
+typedef void *RF_THID_cast_t;
+#endif /* AIX */
+
+#define rf_assign_threadid() {RF_LOCK_MUTEX(rf_threadid_mutex); \
+ if (pthread_setspecific(rf_thread_id_key, (RF_THID_cast_t) ((unsigned long)(rf_numThrsRegistered++)))) { RF_PANIC(); } \
+ RF_UNLOCK_MUTEX(rf_threadid_mutex);}
+
+#ifdef __osf__
+#define rf_get_threadid(_id_) { \
+ RF_THID_cast_t _val; \
+ unsigned long _val2; \
+ if (pthread_getspecific(rf_thread_id_key, &_val)) \
+ RF_PANIC(); \
+ (_val2) = (unsigned long)_val; \
+ (_id_) = (int)_val2; \
+}
+#endif /* __osf__ */
+
+#ifdef AIX
+#define rf_get_threadid(_id_) { \
+ RF_THID_cast_t _val; \
+ unsigned long _val2; \
+ _val = pthread_getspecific(rf_thread_id_key); \
+ (_val2) = (unsigned long)_val; \
+ (_id_) = (int)_val2; \
+}
+#endif /* AIX */
+
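+#if 0
+/*
+ * Illustrative sketch only (never compiled): the expected user-level usage.
+ * rf_setup_threadid() must already have been called exactly once, e.g. by
+ * the main thread, before any thread runs this.
+ */
+static void rf_example_thread_body(void *arg)
+{
+  int tid;
+
+  rf_assign_threadid();   /* register this thread; done once per thread */
+  rf_get_threadid(tid);   /* small integer id, handy for debug printfs */
+  printf("[%d] thread starting\n", tid);
+}
+#endif /* 0 */
+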
+#else /* KERNEL */
+
+/*
+ * Kernel
+ */
+
+#if !defined(__NetBSD__) && !defined(__OpenBSD__)
+#include <kern/task.h>
+#include <kern/thread.h>
+#include <mach/machine/vm_param.h>
+#endif
+
+#define RF_DECLARE_GLOBAL_THREADID
+#define rf_setup_threadid()
+#define rf_shutdown_threadid()
+#define rf_assign_threadid()
+
+
+
+#if defined(__NetBSD__) || defined(__OpenBSD__)
+
+#define rf_get_threadid(_id_) _id_ = 0;
+
+#else
+#define rf_get_threadid(_id_) { \
+ thread_t thread = current_thread(); \
+ _id_ = (int)(((thread->thread_self)>>(8*sizeof(int *)))&0x0fffffff); \
+}
+#endif /* __NetBSD__ || __OpenBSD__ */
+#endif /* KERNEL */
+
+#else /* SIMULATE */
+
+/*
+ * Simulator
+ */
+
+#include "rf_diskevent.h"
+
+#define RF_DECLARE_GLOBAL_THREADID
+#define rf_setup_threadid()
+#define rf_shutdown_threadid()
+#define rf_assign_threadid()
+
+#define rf_get_threadid(_id_) _id_ = rf_GetCurrentOwner()
+
+#endif /* SIMULATE */
+#endif /* !_RF__RF_THREADID_H_ */
diff --git a/sys/dev/raidframe/rf_threadstuff.c b/sys/dev/raidframe/rf_threadstuff.c
new file mode 100644
index 00000000000..0de5f36c679
--- /dev/null
+++ b/sys/dev/raidframe/rf_threadstuff.c
@@ -0,0 +1,477 @@
+/* $OpenBSD: rf_threadstuff.c,v 1.1 1999/01/11 14:29:53 niklas Exp $ */
+/* $NetBSD: rf_threadstuff.c,v 1.1 1998/11/13 04:20:35 oster Exp $ */
+/*
+ * rf_threadstuff.c
+ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+#ifdef _KERNEL
+#define KERNEL
+#endif
+
+
+#include "rf_types.h"
+#include "rf_threadstuff.h"
+#include "rf_general.h"
+#include "rf_shutdown.h"
+
+static void mutex_destroyer(void *);
+static void cond_destroyer(void *);
+void thread_wakeup(void *);
+
+/*
+ * Shared stuff
+ */
+
+static void mutex_destroyer(arg)
+ void *arg;
+{
+ int rc;
+
+ rc = rf_mutex_destroy(arg);
+ if (rc) {
+ RF_ERRORMSG1("RAIDFRAME: Error %d auto-destroying mutex\n", rc);
+ }
+}
+
+static void cond_destroyer(arg)
+ void *arg;
+{
+ int rc;
+
+ rc = rf_cond_destroy(arg);
+ if (rc) {
+ RF_ERRORMSG1("RAIDFRAME: Error %d auto-destroying condition\n", rc);
+ }
+}
+
+int _rf_create_managed_mutex(listp, m, file, line)
+ RF_ShutdownList_t **listp;
+ RF_DECLARE_MUTEX(*m)
+ char *file;
+ int line;
+{
+ int rc, rc1;
+
+ rc = rf_mutex_init(m);
+ if (rc)
+ return(rc);
+ rc = _rf_ShutdownCreate(listp, mutex_destroyer, (void *)m, file, line);
+ if (rc) {
+ RF_ERRORMSG1("RAIDFRAME: Error %d adding shutdown entry\n", rc);
+ rc1 = rf_mutex_destroy(m);
+ if (rc1) {
+ RF_ERRORMSG1("RAIDFRAME: Error %d destroying mutex\n", rc1);
+ }
+ }
+ return(rc);
+}
+
+int _rf_create_managed_cond(listp, c, file, line)
+ RF_ShutdownList_t **listp;
+ RF_DECLARE_COND(*c)
+ char *file;
+ int line;
+{
+ int rc, rc1;
+
+ rc = rf_cond_init(c);
+ if (rc)
+ return(rc);
+ rc = _rf_ShutdownCreate(listp, cond_destroyer, (void *)c, file, line);
+ if (rc) {
+ RF_ERRORMSG1("RAIDFRAME: Error %d adding shutdown entry\n", rc);
+ rc1 = rf_cond_destroy(c);
+ if (rc1) {
+ RF_ERRORMSG1("RAIDFRAME: Error %d destroying cond\n", rc1);
+ }
+ }
+ return(rc);
+}
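+
+#if 0
+/*
+ * Illustrative sketch only (never compiled): the usual way a configuration
+ * routine uses the managed variants above, so that the mutex is destroyed
+ * automatically when the shutdown list is run.  The mutex and function
+ * names are made up; RF_DECLARE_STATIC_MUTEX and rf_create_managed_mutex
+ * are declared in rf_threadstuff.h.
+ */
+RF_DECLARE_STATIC_MUTEX(rf_example_mutex)
+
+static int rf_ExampleConfigure(RF_ShutdownList_t **listp)
+{
+  int rc;
+
+  rc = rf_create_managed_mutex(listp, &rf_example_mutex);
+  if (rc)
+    return(rc);   /* nothing to clean up; the mutex was never created */
+  /* ... from here on, RF_LOCK_MUTEX(rf_example_mutex) as usual ... */
+  return(0);
+}
+#endif /* 0 */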
+
+int _rf_init_managed_threadgroup(listp, g, file, line)
+ RF_ShutdownList_t **listp;
+ RF_ThreadGroup_t *g;
+ char *file;
+ int line;
+{
+ int rc;
+
+ rc = _rf_create_managed_mutex(listp, &g->mutex, file, line);
+ if (rc)
+ return(rc);
+ rc = _rf_create_managed_cond(listp, &g->cond, file, line);
+ if (rc)
+ return(rc);
+ g->created = g->running = g->shutdown = 0;
+ return(0);
+}
+
+int _rf_destroy_threadgroup(g, file, line)
+ RF_ThreadGroup_t *g;
+ char *file;
+ int line;
+{
+ int rc1, rc2;
+
+#if RF_DEBUG_ATOMIC > 0
+ rc1 = _rf_mutex_destroy(&g->mutex, file, line);
+ rc2 = _rf_cond_destroy(&g->cond, file, line);
+#else /* RF_DEBUG_ATOMIC > 0 */
+ rc1 = rf_mutex_destroy(&g->mutex);
+ rc2 = rf_cond_destroy(&g->cond);
+#endif /* RF_DEBUG_ATOMIC > 0 */
+ if (rc1)
+ return(rc1);
+ return(rc2);
+}
+
+int _rf_init_threadgroup(g, file, line)
+ RF_ThreadGroup_t *g;
+ char *file;
+ int line;
+{
+ int rc;
+
+#if RF_DEBUG_ATOMIC > 0
+ rc = _rf_mutex_init(&g->mutex, file, line);
+ if (rc)
+ return(rc);
+ rc = _rf_cond_init(&g->cond, file, line);
+ if (rc) {
+ _rf_mutex_destroy(&g->mutex, file, line);
+ return(rc);
+ }
+#else /* RF_DEBUG_ATOMIC > 0 */
+ rc = rf_mutex_init(&g->mutex);
+ if (rc)
+ return(rc);
+ rc = rf_cond_init(&g->cond);
+ if (rc) {
+ rf_mutex_destroy(&g->mutex);
+ return(rc);
+ }
+#endif /* RF_DEBUG_ATOMIC > 0 */
+ g->created = g->running = g->shutdown = 0;
+ return(0);
+}
+
+/*
+ * User
+ */
+
+#if !defined(KERNEL) && !defined(SIMULATE)
+
+#if RF_DEBUG_ATOMIC > 0
+
+static RF_ATEnt_t rf_atent_list;
+static RF_ATEnt_t *rf_atent_done_list=NULL;
+
+static pthread_mutex_t rf_atent_mutex;
+
+void rf_atent_init()
+{
+ int rc;
+
+ rc = pthread_mutex_init(&rf_atent_mutex, pthread_mutexattr_default);
+ if (rc) {
+ fprintf(stderr, "ERROR: rc=%d creating rf_atent_mutex\n", rc);
+ fflush(stderr);
+ RF_PANIC();
+ }
+ rf_atent_list.next = rf_atent_list.prev = &rf_atent_list;
+}
+
+#define ATENT_TYPE(_e_) ((((_e_)->type == 0)||((_e_)->type > 2)) ? 0 : (_e_)->type)
+#define ATENT_OTYPE(_e_) ((((_e_)->otype == 0)||((_e_)->otype > 2)) ? 0 : (_e_)->otype)
+
+void rf_atent_shutdown()
+{
+ int rc, num_freed[3], num_not_freed[3];
+ RF_ATEnt_t *r, *n;
+
+ num_freed[0] = num_freed[1] = num_freed[2] = 0;
+ num_not_freed[0] = num_not_freed[1] = num_not_freed[2] = 0;
+ printf("rf_atent_shutdown:\n");
+ for(r=rf_atent_list.next;r!=&rf_atent_list;r=r->next) {
+ printf("r=%lx type=%d file=%s line=%d\n", r, r->type, r->file, r->line);
+ num_not_freed[ATENT_TYPE(r)]++;
+ }
+ rc = pthread_mutex_destroy(&rf_atent_mutex);
+ if (rc) {
+ fprintf(stderr, "ERROR: rc=%d destroying rf_atent_mutex\n", rc);
+ fflush(stderr);
+ RF_PANIC();
+ }
+ for(r=rf_atent_done_list;r;r=n) {
+ n = r->next;
+ num_freed[ATENT_OTYPE(r)]++;
+ free(r);
+ }
+ printf("%d mutexes not freed %d conditions not freed %d bogus not freed\n",
+ num_not_freed[1], num_not_freed[2], num_not_freed[0]);
+ printf("%d mutexes freed %d conditions freed %d bogus freed\n",
+ num_freed[1], num_freed[2], num_freed[0]);
+ fflush(stdout);
+ fflush(stderr);
+}
+
+static RF_ATEnt_t *AllocATEnt(file,line)
+ char *file;
+ int line;
+{
+ RF_ATEnt_t *t;
+
+ t = (RF_ATEnt_t *)malloc(sizeof(RF_ATEnt_t));
+ if (t == NULL) {
+ RF_PANIC();
+ }
+ t->file = file;
+ t->line = line;
+ t->type = 0;
+ return(t);
+}
+
+static void FreeATEnt(t)
+ RF_ATEnt_t *t;
+{
+ t->otype = t->type;
+ t->type = 0;
+ t->next = rf_atent_done_list;
+ rf_atent_done_list = t;
+}
+
+int _rf_mutex_init(m, file, line)
+ RF_ATEnt_t **m;
+ char *file;
+ int line;
+{
+ RF_ATEnt_t *a;
+ int rc;
+
+ a = AllocATEnt(file,line);
+ rc = pthread_mutex_init(&a->m, pthread_mutexattr_default);
+ if (rc == 0) {
+ pthread_mutex_lock(&rf_atent_mutex);
+ a->next = rf_atent_list.next;
+ a->prev = &rf_atent_list;
+ a->type = RF_ATENT_M;
+ a->next->prev = a;
+ a->prev->next = a;
+ pthread_mutex_unlock(&rf_atent_mutex);
+ }
+ else {
+ fprintf(stderr, "ERROR: rc=%d allocating mutex %s:%d\n",
+ rc, file, line);
+ fflush(stderr);
+ RF_PANIC();
+ }
+ *m = a;
+ return(0);
+}
+
+int _rf_mutex_destroy(m, file, line)
+ RF_ATEnt_t **m;
+ char *file;
+ int line;
+{
+ RF_ATEnt_t *r;
+ int rc;
+
+ r = *m;
+ rc = pthread_mutex_destroy(&r->m);
+ if (rc) {
+ fprintf(stderr, "ERROR: rc=%d destroying mutex %s:%d\n",
+ rc, file, line);
+ fflush(stderr);
+ RF_PANIC();
+ }
+ pthread_mutex_lock(&rf_atent_mutex);
+ r->next->prev = r->prev;
+ r->prev->next = r->next;
+ FreeATEnt(r);
+ pthread_mutex_unlock(&rf_atent_mutex);
+ *m = NULL;
+ return(0);
+}
+
+int _rf_cond_init(c, file, line)
+ RF_ATEnt_t **c;
+ char *file;
+ int line;
+{
+ RF_ATEnt_t *a;
+ int rc;
+
+ a = AllocATEnt(file,line);
+ rc = pthread_cond_init(&a->c, pthread_condattr_default);
+ if (rc == 0) {
+ pthread_mutex_lock(&rf_atent_mutex);
+ a->next = rf_atent_list.next;
+ a->prev = &rf_atent_list;
+ a->next->prev = a;
+ a->prev->next = a;
+ a->type = RF_ATENT_C;
+ pthread_mutex_unlock(&rf_atent_mutex);
+ }
+ else {
+ fprintf(stderr, "ERROR: rc=%d allocating cond %s:%d\n",
+ rc, file, line);
+ fflush(stderr);
+ RF_PANIC();
+ }
+ *c = a;
+ return(0);
+}
+
+int _rf_cond_destroy(c, file, line)
+ RF_ATEnt_t **c;
+ char *file;
+ int line;
+{
+ RF_ATEnt_t *r;
+ int rc;
+
+ r = *c;
+ rc = pthread_cond_destroy(&r->c);
+ if (rc) {
+ fprintf(stderr, "ERROR: rc=%d destroying cond %s:%d\n",
+ rc, file, line);
+ fflush(stderr);
+ RF_PANIC();
+ }
+ pthread_mutex_lock(&rf_atent_mutex);
+ r->next->prev = r->prev;
+ r->prev->next = r->next;
+ FreeATEnt(r);
+ pthread_mutex_unlock(&rf_atent_mutex);
+ *c = NULL;
+ return(0);
+}
+
+#else /* RF_DEBUG_ATOMIC > 0 */
+
+int rf_mutex_init(m)
+ pthread_mutex_t *m;
+{
+#ifdef __osf__
+ return(pthread_mutex_init(m, pthread_mutexattr_default));
+#endif /* __osf__ */
+#ifdef AIX
+ return(pthread_mutex_init(m, &pthread_mutexattr_default));
+#endif /* AIX */
+}
+
+int rf_mutex_destroy(m)
+ pthread_mutex_t *m;
+{
+ return(pthread_mutex_destroy(m));
+}
+
+int rf_cond_init(c)
+ pthread_cond_t *c;
+{
+#ifdef __osf__
+ return(pthread_cond_init(c, pthread_condattr_default));
+#endif /* __osf__ */
+#ifdef AIX
+ return(pthread_cond_init(c, &pthread_condattr_default));
+#endif /* AIX */
+}
+
+int rf_cond_destroy(c)
+ pthread_cond_t *c;
+{
+ return(pthread_cond_destroy(c));
+}
+
+#endif /* RF_DEBUG_ATOMIC > 0 */
+
+#endif /* !KERNEL && !SIMULATE */
+
+/*
+ * Kernel
+ */
+#ifdef KERNEL
+int rf_mutex_init(m)
+ decl_simple_lock_data(,*m)
+{
+ simple_lock_init(m);
+ return(0);
+}
+
+int rf_mutex_destroy(m)
+ decl_simple_lock_data(,*m)
+{
+ return(0);
+}
+
+int rf_cond_init(c)
+ RF_DECLARE_COND(*c)
+{
+ *c = 0; /* no reason */
+ return(0);
+}
+
+int rf_cond_destroy(c)
+ RF_DECLARE_COND(*c)
+{
+ return(0);
+}
+
+
+#endif /* KERNEL */
+
+/*
+ * Simulator
+ */
+#ifdef SIMULATE
+int rf_mutex_init(m)
+ RF_DECLARE_MUTEX(*m)
+{
+ return(0);
+}
+
+int rf_mutex_destroy(m)
+ RF_DECLARE_MUTEX(*m)
+{
+ return(0);
+}
+
+int rf_cond_init(c)
+ RF_DECLARE_COND(*c)
+{
+ return(0);
+}
+
+int rf_cond_destroy(c)
+ RF_DECLARE_COND(*c)
+{
+ return(0);
+}
+#endif /* SIMULATE */
diff --git a/sys/dev/raidframe/rf_threadstuff.h b/sys/dev/raidframe/rf_threadstuff.h
new file mode 100644
index 00000000000..1437b2b0edf
--- /dev/null
+++ b/sys/dev/raidframe/rf_threadstuff.h
@@ -0,0 +1,465 @@
+/* $OpenBSD: rf_threadstuff.h,v 1.1 1999/01/11 14:29:54 niklas Exp $ */
+/* $NetBSD: rf_threadstuff.h,v 1.1 1998/11/13 04:20:35 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland, Daniel Stodolsky, Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/*
+ * threadstuff.h -- definitions for threads, locks, and synchronization
+ *
+ * The purpose of this file is to provide some illusion of portability.
+ * If the functions below can be implemented with the same semantics on
+ * some new system, then at least the synchronization and thread control
+ * part of the code should not require modification to port to a new machine.
+ * the only other place where the pthread package is explicitly used is
+ * threadid.h
+ *
+ * this file should be included above stdio.h to get some necessary defines.
+ *
+ */
+
+/* :
+ * Log: rf_threadstuff.h,v
+ * Revision 1.38 1996/08/12 22:37:47 jimz
+ * add AIX stuff for user driver
+ *
+ * Revision 1.37 1996/08/11 00:47:09 jimz
+ * make AIX friendly
+ *
+ * Revision 1.36 1996/07/23 22:06:59 jimz
+ * add rf_destroy_threadgroup
+ *
+ * Revision 1.35 1996/07/23 21:31:16 jimz
+ * add init_threadgroup
+ *
+ * Revision 1.34 1996/07/18 22:57:14 jimz
+ * port simulator to AIX
+ *
+ * Revision 1.33 1996/07/15 17:22:18 jimz
+ * nit-pick code cleanup
+ * resolve stdlib problems on DEC OSF
+ *
+ * Revision 1.32 1996/06/17 03:01:11 jimz
+ * get rid of JOIN stuff
+ *
+ * Revision 1.31 1996/06/14 23:15:38 jimz
+ * attempt to deal with thread GC problem
+ *
+ * Revision 1.30 1996/06/11 18:12:36 jimz
+ * get rid of JOIN operations
+ * use ThreadGroup stuff instead
+ * fix some allocation/deallocation and sync bugs
+ *
+ * Revision 1.29 1996/06/11 13:48:10 jimz
+ * make kernel RF_THREAD_CREATE give back happier return vals
+ *
+ * Revision 1.28 1996/06/10 16:40:01 jimz
+ * break user-level stuff out into lib+apps
+ *
+ * Revision 1.27 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.26 1996/06/09 02:36:46 jimz
+ * lots of little crufty cleanup- fixup whitespace
+ * issues, comment #ifdefs, improve typing in some
+ * places (esp size-related)
+ *
+ * Revision 1.25 1996/06/05 18:06:02 jimz
+ * Major code cleanup. The Great Renaming is now done.
+ * Better modularity. Better typing. Fixed a bunch of
+ * synchronization bugs. Made a lot of global stuff
+ * per-desc or per-array. Removed dead code.
+ *
+ * Revision 1.24 1996/05/30 11:29:41 jimz
+ * Numerous bug fixes. Stripe lock release code disagreed with the taking code
+ * about when stripes should be locked (I made it consistent: no parity, no lock)
+ * There was a lot of extra serialization of I/Os which I've removed- a lot of
+ * it was to calculate values for the cache code, which is no longer with us.
+ * More types, function, macro cleanup. Added code to properly quiesce the array
+ * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
+ * before. Fixed memory allocation, freeing bugs.
+ *
+ * Revision 1.23 1996/05/20 19:31:54 jimz
+ * add atomic debug (mutex and cond leak finder) stuff
+ *
+ * Revision 1.22 1996/05/20 16:24:49 jimz
+ * get happy in simulator
+ *
+ * Revision 1.21 1996/05/20 16:15:07 jimz
+ * switch to rf_{mutex,cond}_{init,destroy}
+ *
+ * Revision 1.20 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.19 1996/05/09 17:16:53 jimz
+ * correct arg to JOIN_THREAD
+ *
+ * Revision 1.18 1995/12/12 18:10:06 jimz
+ * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT
+ * fix 80-column brain damage in comments
+ *
+ * Revision 1.17 1995/12/06 15:15:21 root
+ * added copyright info
+ *
+ */
+
+#ifndef _RF__RF_THREADSTUFF_H_
+#define _RF__RF_THREADSTUFF_H_
+
+#include "rf_types.h"
+
+#define rf_create_managed_mutex(a,b) _rf_create_managed_mutex(a,b,__FILE__,__LINE__)
+#define rf_create_managed_cond(a,b) _rf_create_managed_cond(a,b,__FILE__,__LINE__)
+#define rf_init_managed_threadgroup(a,b) _rf_init_managed_threadgroup(a,b,__FILE__,__LINE__)
+#define rf_init_threadgroup(a) _rf_init_threadgroup(a,__FILE__,__LINE__)
+#define rf_destroy_threadgroup(a) _rf_destroy_threadgroup(a,__FILE__,__LINE__)
+
+int _rf_init_threadgroup(RF_ThreadGroup_t *g, char *file, int line);
+int _rf_destroy_threadgroup(RF_ThreadGroup_t *g, char *file, int line);
+int _rf_init_managed_threadgroup(RF_ShutdownList_t **listp,
+ RF_ThreadGroup_t *g, char *file, int line);
+
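+/*
+ * Usage sketch (illustrative only: "listp", "rc", and rf_example_mutex are
+ * placeholder names, not part of this interface).  The portability macros
+ * above are intended to be used roughly as follows:
+ *
+ *	RF_DECLARE_STATIC_MUTEX(rf_example_mutex)
+ *
+ *	rc = rf_create_managed_mutex(listp, &rf_example_mutex);
+ *	if (rc) {
+ *		(handle the failure)
+ *	}
+ *	RF_LOCK_MUTEX(rf_example_mutex);
+ *	(critical section)
+ *	RF_UNLOCK_MUTEX(rf_example_mutex);
+ *
+ * The "managed" creation macros take a shutdown list, presumably so the
+ * mutex or condition is destroyed automatically when that list is run.
+ */
+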
+#ifndef SIMULATE /* will null out all these calls */
+#ifndef KERNEL
+
+#if defined(__osf__) || defined(AIX)
+#include <pthread.h>
+#endif /* __osf__ || AIX */
+
+#define RF_DEBUG_ATOMIC 0
+
+#if RF_DEBUG_ATOMIC > 0
+#define RF_ATENT_M 1
+#define RF_ATENT_C 2
+typedef struct RF_ATEnt_s RF_ATEnt_t;
+struct RF_ATEnt_s {
+ char *file;
+ int line;
+ pthread_mutex_t m;
+ pthread_cond_t c;
+ int type;
+ int otype;
+ RF_ATEnt_t *next;
+ RF_ATEnt_t *prev;
+};
+
+#define RF_DECLARE_MUTEX(_m_) RF_ATEnt_t *_m_;
+#define RF_DECLARE_STATIC_MUTEX(_m_) static RF_ATEnt_t *_m_;
+#define RF_DECLARE_EXTERN_MUTEX(_m_) extern RF_ATEnt_t *_m_;
+#define RF_DECLARE_COND(_c_) RF_ATEnt_t *_c_;
+#define RF_DECLARE_STATIC_COND(_c_) static RF_ATEnt_t *_c_;
+#define RF_DECLARE_EXTERN_COND(_c_) extern RF_ATEnt_t *_c_;
+
+int _rf_mutex_init(RF_ATEnt_t **m, char *file, int line);
+int _rf_mutex_destroy(RF_ATEnt_t **m, char *file, int line);
+int _rf_cond_init(RF_ATEnt_t **c, char *file, int line);
+int _rf_cond_destroy(RF_ATEnt_t **c, char *file, int line);
+void rf_atent_init(void);
+void rf_atent_shutdown(void);
+
+#define rf_mutex_init(_m_) _rf_mutex_init(_m_,__FILE__,__LINE__)
+#define rf_mutex_destroy(_m_) _rf_mutex_destroy(_m_,__FILE__,__LINE__)
+#define rf_cond_init(_m_) _rf_cond_init(_m_,__FILE__,__LINE__)
+#define rf_cond_destroy(_m_) _rf_cond_destroy(_m_,__FILE__,__LINE__)
+
+#define RF_LOCK_MUTEX(_a_) {RF_ASSERT((_a_)->type == RF_ATENT_M); pthread_mutex_lock(&((_a_)->m));}
+#define RF_UNLOCK_MUTEX(_a_) {RF_ASSERT((_a_)->type == RF_ATENT_M); pthread_mutex_unlock(&((_a_)->m));}
+
+#define RF_WAIT_COND(_c_,_m_) { \
+ RF_ASSERT((_c_)->type == RF_ATENT_C); \
+ RF_ASSERT((_m_)->type == RF_ATENT_M); \
+ pthread_cond_wait( &((_c_)->c), &((_m_)->m) ); \
+}
+#define RF_SIGNAL_COND(_c_) {RF_ASSERT((_c_)->type == RF_ATENT_C); pthread_cond_signal( &((_c_)->c));}
+#define RF_BROADCAST_COND(_c_) {RF_ASSERT((_c_)->type == RF_ATENT_C); pthread_cond_broadcast(&((_c_)->c));}
+
+#else /* RF_DEBUG_ATOMIC > 0 */
+
+/* defining these as macros allows us to NULL them out in the kernel */
+#define RF_DECLARE_MUTEX(_m_) pthread_mutex_t _m_;
+#define RF_DECLARE_STATIC_MUTEX(_m_) static pthread_mutex_t _m_;
+#define RF_DECLARE_EXTERN_MUTEX(_m_) extern pthread_mutex_t _m_;
+#define RF_DECLARE_COND(_c_) pthread_cond_t _c_;
+#define RF_DECLARE_STATIC_COND(_c_) static pthread_cond_t _c_;
+#define RF_DECLARE_EXTERN_COND(_c_) extern pthread_cond_t _c_;
+
+int rf_mutex_init(pthread_mutex_t *m);
+int rf_mutex_destroy(pthread_mutex_t *m);
+int rf_cond_init(pthread_cond_t *c);
+int rf_cond_destroy(pthread_cond_t *c);
+
+#define RF_LOCK_MUTEX(_m_) {pthread_mutex_lock(&(_m_));}
+#define RF_UNLOCK_MUTEX(_m_) pthread_mutex_unlock(&(_m_))
+
+#define RF_WAIT_COND(_c_,_m_) pthread_cond_wait( &(_c_), &(_m_) )
+#define RF_SIGNAL_COND(_c_) pthread_cond_signal( &(_c_) )
+#define RF_BROADCAST_COND(_c_) pthread_cond_broadcast(&(_c_))
+
+#endif /* RF_DEBUG_ATOMIC > 0 */
+
+int _rf_create_managed_mutex(RF_ShutdownList_t **listp, pthread_mutex_t *m, char *file, int line);
+int _rf_create_managed_cond(RF_ShutdownList_t **listp, pthread_cond_t *c, char *file, int line);
+
+typedef pthread_t RF_Thread_t;
+#ifdef __osf__
+typedef pthread_addr_t RF_ThreadArg_t; /* the argument to a thread function */
+#else /* __osf__ */
+typedef void *RF_ThreadArg_t; /* the argument to a thread function */
+#endif /* __osf__ */
+typedef pthread_attr_t RF_ThreadAttr_t; /* a thread creation attribute structure */
+
+#ifdef __osf__
+#define RF_EXIT_THREAD(_status_) pthread_exit( (pthread_addr_t) (_status_) )
+#else /* __osf__ */
+#define RF_EXIT_THREAD(_status_) pthread_exit( (void *) (_status_) )
+#endif /* __osf__ */
+#define RF_DELAY_THREAD(_secs_, _msecs_) {struct timespec interval; \
+ interval.tv_sec = (_secs_); \
+ interval.tv_nsec = (_msecs_)*1000000; \
+ pthread_delay_np(&interval); \
+ }
+#define RF_DELAY_THREAD_TS(_ts_) pthread_delay_np(&(_ts_))
+
+#ifdef __osf__
+#define RF_THREAD_ATTR_CREATE(_attr_) pthread_attr_create( &(_attr_) )
+#define RF_THREAD_ATTR_DELETE(_attr_) pthread_attr_delete( &(_attr_) )
+#endif /* __osf__ */
+#ifdef AIX
+#define RF_THREAD_ATTR_CREATE(_attr_) pthread_attr_init( &(_attr_) )
+#define RF_THREAD_ATTR_DELETE(_attr_) pthread_attr_destroy( &(_attr_) )
+#endif /* AIX */
+#define RF_THREAD_ATTR_SETSTACKSIZE(_attr_,_sz_) pthread_attr_setstacksize(&(_attr_), (long) (_sz_))
+#define RF_THREAD_ATTR_GETSTACKSIZE(_attr_) pthread_attr_getstacksize(_attr_)
+#define RF_THREAD_ATTR_SETSCHED(_attr_,_sched_) pthread_attr_setsched(&(_attr_), (_sched_))
+#define RF_CREATE_ATTR_THREAD(_handle_, _attr_, _func_, _arg_) \
+ pthread_create(&(_handle_), (_attr_), (pthread_startroutine_t) (_func_), (_arg_))
+
+
+extern pthread_attr_t raidframe_attr_default;
+int rf_thread_create(RF_Thread_t *thread, pthread_attr_t attr,
+ void (*func)(), RF_ThreadArg_t arg);
+
+#define RF_CREATE_THREAD(_handle_, _func_, _arg_) \
+ rf_thread_create(&(_handle_), raidframe_attr_default, (_func_), (_arg_))
+
+#else /* KERNEL */
+#if defined(__NetBSD__) || defined(__OpenBSD__)
+#include <sys/lock.h>
+#define decl_simple_lock_data(a,b) a struct simplelock b;
+#define simple_lock_addr(a) ((struct simplelock *)&(a))
+#else
+#include <kern/task.h>
+#include <kern/thread.h>
+#include <kern/lock.h>
+#include <kern/sched_prim.h>
+#define decl_simple_lock_data(a,b) a int (b);
+#endif /* __NetBSD__ || __OpenBSD__ */
+
+#if defined(__NetBSD__) || defined(__OpenBSD__)
+typedef struct proc *RF_Thread_t;
+#else
+typedef thread_t RF_Thread_t;
+#endif
+typedef void *RF_ThreadArg_t;
+
+#define RF_DECLARE_MUTEX(_m_) decl_simple_lock_data(,(_m_))
+#define RF_DECLARE_STATIC_MUTEX(_m_) decl_simple_lock_data(static,(_m_))
+#define RF_DECLARE_EXTERN_MUTEX(_m_) decl_simple_lock_data(extern,(_m_))
+
+#define RF_DECLARE_COND(_c_) int _c_;
+#define RF_DECLARE_STATIC_COND(_c_) static int _c_;
+#define RF_DECLARE_EXTERN_COND(_c_) extern int _c_;
+
+#define RF_LOCK_MUTEX(_m_) simple_lock(&(_m_))
+#define RF_UNLOCK_MUTEX(_m_) simple_unlock(&(_m_))
+
+
+#if defined(__NetBSD__) || defined(__OpenBSD__)
+#include <sys/types.h>
+#include <sys/kthread.h>
+/*
+ * In Net- and OpenBSD, kernel threads are simply processes which share several
+ * substructures and never run in userspace.
+ *
+ * XXX Note, Net- and OpenBSD do not yet have a wakeup_one(), so we always
+ * XXX get a thundering herd whenever a condition is signalled.
+ */
+#define RF_WAIT_COND(_c_,_m_) { \
+ RF_UNLOCK_MUTEX(_m_); \
+ tsleep(&_c_, PRIBIO | PCATCH, "rfwcond", 0); \
+ RF_LOCK_MUTEX(_m_); \
+}
+#define RF_SIGNAL_COND(_c_) wakeup(&(_c_))
+#define RF_BROADCAST_COND(_c_) wakeup(&(_c_))
+#define RF_CREATE_THREAD(_handle_, _func_, _arg_) \
+ kthread_create((void (*) __P((void *)))(_func_), (void *)(_arg_), \
+ (struct proc **)&(_handle_), "raid")
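+
+/*
+ * Since wakeup() wakes every process sleeping on the channel, RF_WAIT_COND()
+ * is meant to be used in the usual "recheck the predicate" style.
+ * Illustrative sketch (num_pending and rf_example_mutex are placeholder
+ * names):
+ *
+ *	RF_LOCK_MUTEX(rf_example_mutex);
+ *	while (num_pending > 0)
+ *		RF_WAIT_COND(num_pending, rf_example_mutex);
+ *	RF_UNLOCK_MUTEX(rf_example_mutex);
+ *
+ * The macro drops and retakes the mutex around tsleep(), so the predicate
+ * must be re-evaluated after every wakeup.
+ */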
+#else /* ! __NetBSD__ && ! __OpenBSD__ */
+/*
+ * Digital UNIX/Mach threads.
+ */
+#define RF_WAIT_COND(_c_,_m_) { \
+ assert_wait((vm_offset_t)&(_c_), TRUE); \
+ RF_UNLOCK_MUTEX(_m_); \
+ thread_block(); \
+ RF_LOCK_MUTEX(_m_); \
+}
+#define RF_SIGNAL_COND(_c_) thread_wakeup_one(((vm_offset_t)&(_c_)))
+#define RF_BROADCAST_COND(_c_) thread_wakeup(((vm_offset_t)&(_c_)))
+extern task_t first_task;
+#define RF_CREATE_THREAD(_handle_, _func_, _arg_) \
+ (((_handle_ = kernel_thread_w_arg(first_task, (void (*)())_func_, (void *)(_arg_))) != THREAD_NULL) ? 0 : ENOMEM)
+#endif /* __NetBSD__ || __OpenBSD__ */
+#endif /* KERNEL */
+#else /* SIMULATE */
+
+#define RF_DECLARE_MUTEX(_m_) int _m_;
+#define RF_DECLARE_STATIC_MUTEX(_m_) static int _m_;
+#define RF_DECLARE_EXTERN_MUTEX(_m_) extern int _m_;
+#define RF_DECLARE_COND(_c_) int _c_;
+#define RF_DECLARE_STATIC_COND(_c_) static int _c_;
+#define RF_DECLARE_EXTERN_COND(_c_) extern int _c_;
+
+extern int rf_mutex_init(int *m);
+extern int rf_mutex_destroy(int *m);
+extern int rf_cond_init(int *c);
+extern int rf_cond_destroy(int *c);
+
+int rf_mutex_init(int *m);
+int rf_mutex_destroy(int *m);
+int _rf_create_managed_mutex(RF_ShutdownList_t **listp, int *m, char *file, int line);
+int _rf_create_managed_cond(RF_ShutdownList_t **listp, int *m, char *file, int line);
+
+typedef void *RF_ThreadArg_t; /* the argument to a thread function */
+
+#define RF_LOCK_MUTEX(_m_)
+#define RF_UNLOCK_MUTEX(_m_)
+
+#define RF_WAIT_COND(_c_,_m_)
+#define RF_SIGNAL_COND(_c_)
+#define RF_BROADCAST_COND(_c_)
+
+#define RF_EXIT_THREAD(_status_)
+#define RF_DELAY_THREAD(_secs_, _msecs_)
+
+#define RF_THREAD_ATTR_CREATE(_attr_) ;
+#define RF_THREAD_ATTR_DELETE(_attr_) ;
+#define RF_THREAD_ATTR_SETSTACKSIZE(_attr_,_sz_) ;
+#define RF_THREAD_ATTR_SETSCHED(_attr_,_sched_) ;
+#define RF_CREATE_ATTR_THREAD(_handle_, _attr_, _func_, _arg_) ;
+
+#define RF_CREATE_THREAD(_handle_, _func_, _arg_) 1
+
+#endif /* SIMULATE */
+
+struct RF_ThreadGroup_s {
+ int created;
+ int running;
+ int shutdown;
+ RF_DECLARE_MUTEX(mutex)
+ RF_DECLARE_COND(cond)
+};
+
+/*
+ * Someone has started a thread in the group
+ */
+#define RF_THREADGROUP_STARTED(_g_) { \
+ RF_LOCK_MUTEX((_g_)->mutex); \
+ (_g_)->created++; \
+ RF_UNLOCK_MUTEX((_g_)->mutex); \
+}
+
+/*
+ * Thread announcing that it is now running
+ */
+#define RF_THREADGROUP_RUNNING(_g_) { \
+ RF_LOCK_MUTEX((_g_)->mutex); \
+ (_g_)->running++; \
+ RF_UNLOCK_MUTEX((_g_)->mutex); \
+ RF_SIGNAL_COND((_g_)->cond); \
+}
+
+/*
+ * Thread announcing that it is now done
+ */
+#define RF_THREADGROUP_DONE(_g_) { \
+ RF_LOCK_MUTEX((_g_)->mutex); \
+ (_g_)->shutdown++; \
+ RF_UNLOCK_MUTEX((_g_)->mutex); \
+ RF_SIGNAL_COND((_g_)->cond); \
+}
+
+/*
+ * Wait for all threads to start running
+ */
+#define RF_THREADGROUP_WAIT_START(_g_) { \
+ RF_LOCK_MUTEX((_g_)->mutex); \
+ while((_g_)->running < (_g_)->created) { \
+ RF_WAIT_COND((_g_)->cond, (_g_)->mutex); \
+ } \
+ RF_UNLOCK_MUTEX((_g_)->mutex); \
+}
+
+/*
+ * Wait for all threads to stop running
+ */
+#if !defined(__NetBSD__) && !defined(__OpenBSD__)
+#define RF_THREADGROUP_WAIT_STOP(_g_) { \
+ RF_LOCK_MUTEX((_g_)->mutex); \
+ RF_ASSERT((_g_)->running == (_g_)->created); \
+ while((_g_)->shutdown < (_g_)->running) { \
+ RF_WAIT_COND((_g_)->cond, (_g_)->mutex); \
+ } \
+ RF_UNLOCK_MUTEX((_g_)->mutex); \
+}
+#else
+ /* XXX Note that we've removed the assert. That should get put back
+ in once we actually get something like a kernel thread running */
+#define RF_THREADGROUP_WAIT_STOP(_g_) { \
+ RF_LOCK_MUTEX((_g_)->mutex); \
+ while((_g_)->shutdown < (_g_)->running) { \
+ RF_WAIT_COND((_g_)->cond, (_g_)->mutex); \
+ } \
+ RF_UNLOCK_MUTEX((_g_)->mutex); \
+}
+#endif
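+
+/*
+ * Illustrative life cycle of a thread group (rf_ExampleWorker, th, arg and g
+ * are placeholder names; only the macros and functions declared in this file
+ * are real interfaces):
+ *
+ *	creator:
+ *		rf_init_threadgroup(&g);
+ *		if (RF_CREATE_THREAD(th, rf_ExampleWorker, arg) == 0)
+ *			RF_THREADGROUP_STARTED(&g);
+ *		RF_THREADGROUP_WAIT_START(&g);
+ *		...later, after asking the workers to shut down...
+ *		RF_THREADGROUP_WAIT_STOP(&g);
+ *		rf_destroy_threadgroup(&g);
+ *
+ *	each worker:
+ *		RF_THREADGROUP_RUNNING(&g);	(as soon as it starts running)
+ *		...do its work...
+ *		RF_THREADGROUP_DONE(&g);	(just before it exits)
+ */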
+
+#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL)
+
+int rf_mutex_init(struct simplelock *);
+int rf_mutex_destroy(struct simplelock *);
+int _rf_create_managed_mutex(RF_ShutdownList_t **, struct simplelock *,
+ char *, int);
+int _rf_create_managed_cond(RF_ShutdownList_t **listp, int *,
+ char *file, int line);
+
+int rf_cond_init(int *c); /* XXX need to write?? */
+int rf_cond_destroy(int *c); /* XXX need to write?? */
+#endif
+#endif /* !_RF__RF_THREADSTUFF_H_ */
diff --git a/sys/dev/raidframe/rf_types.h b/sys/dev/raidframe/rf_types.h
new file mode 100644
index 00000000000..6df3e9e5d78
--- /dev/null
+++ b/sys/dev/raidframe/rf_types.h
@@ -0,0 +1,583 @@
+/* $OpenBSD: rf_types.h,v 1.1 1999/01/11 14:29:54 niklas Exp $ */
+/* $NetBSD: rf_types.h,v 1.2 1998/11/16 04:14:10 mycroft Exp $ */
+/*
+ * rf_types.h
+ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Jim Zelenka
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+/***********************************************************
+ *
+ * rf_types.h -- standard types for RAIDframe
+ *
+ ***********************************************************/
+/*
+ * :
+ * Log: rf_types.h,v
+ * Revision 1.35 1996/08/09 18:48:29 jimz
+ * correct mips definition
+ *
+ * Revision 1.34 1996/08/07 22:50:14 jimz
+ * monkey with linux includes to get a good compile
+ *
+ * Revision 1.33 1996/08/07 21:09:28 jimz
+ * add SGI mips stuff (note: 64-bit stuff may be wrong, I didn't have
+ * a machine to test on)
+ *
+ * Revision 1.32 1996/08/06 22:24:27 jimz
+ * add LINUX_I386
+ *
+ * Revision 1.31 1996/07/31 16:30:12 jimz
+ * move in RF_LONGSHIFT
+ *
+ * Revision 1.30 1996/07/30 04:51:58 jimz
+ * ultrix port
+ *
+ * Revision 1.29 1996/07/29 16:37:34 jimz
+ * define DEC_OSF for osf/1 kernel
+ *
+ * Revision 1.28 1996/07/28 20:31:39 jimz
+ * i386netbsd port
+ * true/false fixup
+ *
+ * Revision 1.27 1996/07/27 23:36:08 jimz
+ * Solaris port of simulator
+ *
+ * Revision 1.26 1996/07/27 18:40:24 jimz
+ * cleanup sweep
+ *
+ * Revision 1.25 1996/07/22 19:52:16 jimz
+ * switched node params to RF_DagParam_t, a union of
+ * a 64-bit int and a void *, for better portability
+ * attempted hpux port, but failed partway through for
+ * lack of a single C compiler capable of compiling all
+ * source files
+ *
+ * Revision 1.24 1996/07/18 22:57:14 jimz
+ * port simulator to AIX
+ *
+ * Revision 1.23 1996/07/15 17:22:18 jimz
+ * nit-pick code cleanup
+ * resolve stdlib problems on DEC OSF
+ *
+ * Revision 1.22 1996/06/11 18:11:57 jimz
+ * add ThreadGroup
+ *
+ * Revision 1.21 1996/06/11 10:58:47 jimz
+ * add RF_ReconDoneProc_t
+ *
+ * Revision 1.20 1996/06/10 14:18:58 jimz
+ * move user, throughput stats into per-array structure
+ *
+ * Revision 1.19 1996/06/10 11:55:47 jimz
+ * Straightened out some per-array/not-per-array distinctions, fixed
+ * a couple bugs related to confusion. Added shutdown lists. Removed
+ * layout shutdown function (now subsumed by shutdown lists).
+ *
+ * Revision 1.18 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.17 1996/06/05 19:38:32 jimz
+ * fixed up disk queueing types config
+ * added sstf disk queueing
+ * fixed exit bug on diskthreads (ref-ing bad mem)
+ *
+ * Revision 1.16 1996/06/05 18:06:02 jimz
+ * Major code cleanup. The Great Renaming is now done.
+ * Better modularity. Better typing. Fixed a bunch of
+ * synchronization bugs. Made a lot of global stuff
+ * per-desc or per-array. Removed dead code.
+ *
+ * Revision 1.15 1996/06/03 23:28:26 jimz
+ * more bugfixes
+ * check in tree to sync for IPDS runs with current bugfixes
+ * there still may be a problem with threads in the script test
+ * getting I/Os stuck- not trivially reproducible (runs ~50 times
+ * in a row without getting stuck)
+ *
+ * Revision 1.14 1996/06/02 17:31:48 jimz
+ * Moved a lot of global stuff into array structure, where it belongs.
+ * Fixed up paritylogging, pss modules in this manner. Some general
+ * code cleanup. Removed lots of dead code, some dead files.
+ *
+ * Revision 1.13 1996/05/31 22:26:54 jimz
+ * fix a lot of mapping problems, memory allocation problems
+ * found some weird lock issues, fixed 'em
+ * more code cleanup
+ *
+ * Revision 1.12 1996/05/30 23:22:16 jimz
+ * bugfixes of serialization, timing problems
+ * more cleanup
+ *
+ * Revision 1.11 1996/05/30 11:29:41 jimz
+ * Numerous bug fixes. Stripe lock release code disagreed with the taking code
+ * about when stripes should be locked (I made it consistent: no parity, no lock)
+ * There was a lot of extra serialization of I/Os which I've removed- a lot of
+ * it was to calculate values for the cache code, which is no longer with us.
+ * More types, function, macro cleanup. Added code to properly quiesce the array
+ * on shutdown. Made a lot of stuff array-specific which was (bogusly) general
+ * before. Fixed memory allocation, freeing bugs.
+ *
+ * Revision 1.10 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.9 1996/05/24 22:17:04 jimz
+ * continue code + namespace cleanup
+ * typed a bunch of flags
+ *
+ * Revision 1.8 1996/05/24 04:28:55 jimz
+ * release cleanup ckpt
+ *
+ * Revision 1.7 1996/05/24 01:59:45 jimz
+ * another checkpoint in code cleanup for release
+ * time to sync kernel tree
+ *
+ * Revision 1.6 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.5 1996/05/23 00:33:23 jimz
+ * code cleanup: move all debug decls to rf_options.c, all extern
+ * debug decls to rf_options.h, all debug vars preceded by rf_
+ *
+ * Revision 1.4 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.3 1996/05/10 16:22:46 jimz
+ * RF_offset -> RF_Offset
+ * add RF_SectorCount
+ *
+ * Revision 1.2 1996/05/02 14:58:50 jimz
+ * switch to _t for non-base-integral types
+ *
+ * Revision 1.1 1995/12/14 18:36:51 jimz
+ * Initial revision
+ *
+ */
+
+#ifndef _RF__RF_TYPES_H_
+#define _RF__RF_TYPES_H_
+
+
+#ifdef _KERNEL
+#define KERNEL
+#endif
+
+#include "rf_archs.h"
+
+#ifndef KERNEL
+#ifdef LINUX
+#include <stdlib.h>
+#include <sys/types.h>
+#endif /* LINUX */
+#include <fcntl.h>
+#include <stdio.h>
+
+#ifdef __osf__
+/*
+ * The following monkeying is to get around some problems with
+ * conflicting definitions in /usr/include/random.h and /usr/include/stdlib.h
+ * on Digital Unix. They
+ * (1) define the same symbols
+ * (2) differently than one another
+ * (3) also differently from the DU libc sources
+ * This loses, bad.
+ */
+#include <standards.h>
+#include <cma.h>
+#ifdef _OSF_SOURCE
+#undef _OSF_SOURCE
+#define _RF_SPANKME
+#endif /* _OSF_SOURCE */
+#endif /* __osf__ */
+#include <stdlib.h>
+#ifdef __osf__
+#ifdef _RF_SPANKME
+#undef _RF_SPANKME
+#define _OSF_SOURCE
+#endif /* _RF_SPANKME */
+#endif /* __osf__ */
+
+#include <string.h>
+#include <unistd.h>
+#endif /* !KERNEL */
+#include <sys/errno.h>
+#include <sys/types.h>
+
+#ifdef AIX
+#include <sys/stream.h>
+#endif /* AIX */
+
+#if defined(hpux) || defined(__hpux)
+/*
+ * Yeah, we get one of hpux or __hpux, but not both. This is because
+ * HP didn't really want to provide an ANSI C compiler. Apparently, they
+ * don't like standards. This explains a lot about their API. You might
+ * try using gcc, but you'll discover that it's sufficiently buggy that
+ * it can't even compile the core library.
+ *
+ * Hatred update: c89, the one thing which could both handle prototypes,
+ * and compile /usr/include/sys/timeout.h, can't do 64-bit ints.
+ *
+ * Note: the hpux port is incomplete. Why? Well, because I can't find
+ * a working C compiler. I've tried cc (both with and without -Ae),
+ * c89, and gcc, all with and without -D_HPUX_SOURCE. Sod it.
+ *
+ * -Jim Zelenka, 22 July 1996
+ */
+#ifndef hpux
+#define hpux
+#endif /* !hpux */
+#include <sys/hpibio.h>
+#endif /* hpux || __hpux*/
+
+#ifdef sun
+#ifndef KERNEL
+#include <errno.h>
+#endif /* !KERNEL */
+#endif /* sun */
+
+#if defined(OSF) && defined(__alpha) && defined(KERNEL)
+#ifndef DEC_OSF
+#define DEC_OSF
+#endif /* !DEC_OSF */
+#endif /* OSF && __alpha && KERNEL */
+
+#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(KERNEL)
+#include <sys/uio.h>
+#include <sys/param.h>
+#include <sys/lock.h>
+
+/* XXX not sure about these... */
+/* #define PZERO 0 */ /* actually defined in <sys/param.h> */
+#define MS_LOCK_SIMPLE 1
+
+#define TRUE 1 /* XXX why isn't this done somewhere already!! */
+
+#endif /* (__NetBSD__ || __OpenBSD__) && KERNEL */
+
+/*
+ * First, define system-dependent types and constants.
+ *
+ * If the machine is big-endian, RF_BIG_ENDIAN should be 1.
+ * Otherwise, it should be 0.
+ *
+ * The various integer types should be self-explanatory; we
+ * use these elsewhere to avoid size confusion.
+ *
+ * LONGSHIFT is lg(sizeof(long)) (that is, log base two of sizeof(long)).
+ *
+ */
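+/*
+ * For example (illustrative), defining RF_LONGSHIFT this way lets a byte
+ * count be converted to a count of longs with a shift:
+ *
+ *	nlongs = nbytes >> RF_LONGSHIFT;
+ *
+ * since RF_LONGSHIFT is 2 where sizeof(long) == 4 and 3 where
+ * sizeof(long) == 8.
+ */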
+
+#if defined(__NetBSD__) || defined(__OpenBSD__)
+
+#include <sys/types.h>
+#include <machine/endian.h>
+#include <machine/limits.h>
+
+#if BYTE_ORDER == BIG_ENDIAN
+#define RF_IS_BIG_ENDIAN 1
+#elif BYTE_ORDER == LITTLE_ENDIAN
+#define RF_IS_BIG_ENDIAN 0
+#else
+#error byte order not defined
+#endif
+typedef int8_t RF_int8;
+typedef u_int8_t RF_uint8;
+typedef int16_t RF_int16;
+typedef u_int16_t RF_uint16;
+typedef int32_t RF_int32;
+typedef u_int32_t RF_uint32;
+typedef int64_t RF_int64;
+typedef u_int64_t RF_uint64;
+#if LONG_BIT == 32
+#define RF_LONGSHIFT 2
+#elif LONG_BIT == 64
+#define RF_LONGSHIFT 3
+#else
+#error word size not defined
+#endif
+
+#else /* __NetBSD__ || __OpenBSD__ */
+
+#ifdef __alpha
+#define RF_IS_BIG_ENDIAN 0
+typedef signed char RF_int8;
+typedef unsigned char RF_uint8;
+typedef short RF_int16;
+typedef unsigned short RF_uint16;
+typedef int RF_int32;
+typedef unsigned int RF_uint32;
+typedef long RF_int64;
+typedef unsigned long RF_uint64;
+#define RF_LONGSHIFT 3
+#endif /* __alpha */
+
+#ifdef _IBMR2
+#define RF_IS_BIG_ENDIAN 1
+typedef signed char RF_int8;
+typedef unsigned char RF_uint8;
+typedef short RF_int16;
+typedef unsigned short RF_uint16;
+typedef int RF_int32;
+typedef unsigned int RF_uint32;
+typedef long long RF_int64;
+typedef unsigned long long RF_uint64;
+#define RF_LONGSHIFT 2
+#endif /* _IBMR2 */
+
+#ifdef hpux
+#define RF_IS_BIG_ENDIAN 1
+typedef signed char RF_int8;
+typedef unsigned char RF_uint8;
+typedef short RF_int16;
+typedef unsigned short RF_uint16;
+typedef int RF_int32;
+typedef unsigned int RF_uint32;
+typedef long long RF_int64;
+typedef unsigned long long RF_uint64;
+#define RF_LONGSHIFT 2
+#endif /* hpux */
+
+#ifdef sun
+#define RF_IS_BIG_ENDIAN 1
+typedef char RF_int8;
+typedef unsigned char RF_uint8;
+typedef short RF_int16;
+typedef unsigned short RF_uint16;
+typedef int RF_int32;
+typedef unsigned int RF_uint32;
+typedef long long RF_int64;
+typedef unsigned long long RF_uint64;
+#define RF_LONGSHIFT 2
+#endif /* sun */
+
+#if defined(NETBSD_I386) || defined(OPENBSD_I386) || defined(LINUX_I386)
+#define RF_IS_BIG_ENDIAN 0
+typedef char RF_int8;
+typedef unsigned char RF_uint8;
+typedef short RF_int16;
+typedef unsigned short RF_uint16;
+typedef int RF_int32;
+typedef unsigned int RF_uint32;
+typedef long long RF_int64;
+typedef unsigned long long RF_uint64;
+#define RF_LONGSHIFT 2
+#endif /* NETBSD_I386 || OPENBSD_I386 || LINUX_I386 */
+
+#if defined(mips) && !defined(SGI) && !defined(__NetBSD__) && !defined(__OpenBSD__)
+#define RF_IS_BIG_ENDIAN 0
+typedef char RF_int8;
+typedef unsigned char RF_uint8;
+typedef short RF_int16;
+typedef unsigned short RF_uint16;
+typedef int RF_int32;
+typedef unsigned int RF_uint32;
+typedef long long RF_int64;
+typedef unsigned long long RF_uint64;
+#define RF_LONGSHIFT 2
+#endif /* mips && !SGI */
+
+#ifdef SGI
+#if _MIPS_SZLONG == 64
+#define RF_IS_BIG_ENDIAN 1
+typedef signed char RF_int8;
+typedef unsigned char RF_uint8;
+typedef short RF_int16;
+typedef unsigned short RF_uint16;
+typedef int RF_int32;
+typedef unsigned int RF_uint32;
+typedef long RF_int64;
+typedef unsigned long RF_uint64;
+#define RF_LONGSHIFT 3
+#endif /* _MIPS_SZLONG == 64 */
+#if _MIPS_SZLONG == 32
+#define RF_IS_BIG_ENDIAN 1
+typedef char RF_int8;
+typedef unsigned char RF_uint8;
+typedef short RF_int16;
+typedef unsigned short RF_uint16;
+typedef int RF_int32;
+typedef unsigned int RF_uint32;
+typedef long long RF_int64;
+typedef unsigned long long RF_uint64;
+#define RF_LONGSHIFT 2
+#endif /* _MIPS_SZLONG == 32 */
+#endif /* SGI */
+
+#endif /* __NetBSD__ || __OpenBSD__ */
+
+/*
+ * These are just zero and non-zero. We don't use "TRUE"
+ * and "FALSE" because there's too much nonsense trying
+ * to get them defined exactly once on every platform, given
+ * the different places they may be defined in system header
+ * files.
+ */
+#define RF_TRUE 1
+#define RF_FALSE 0
+
+/*
+ * Now, some generic types
+ */
+typedef RF_uint64 RF_IoCount_t;
+typedef RF_uint64 RF_Offset_t;
+typedef RF_uint32 RF_PSSFlags_t;
+typedef RF_uint64 RF_SectorCount_t;
+typedef RF_uint64 RF_StripeCount_t;
+typedef RF_int64 RF_SectorNum_t; /* these are signed so we can set them to (-1) for "uninitialized" */
+typedef RF_int64 RF_StripeNum_t;
+typedef RF_int64 RF_RaidAddr_t;
+typedef int RF_RowCol_t; /* signed so it can be (-1) */
+typedef RF_int64 RF_HeadSepLimit_t;
+typedef RF_int64 RF_ReconUnitCount_t;
+typedef int RF_ReconUnitNum_t;
+
+typedef char RF_ParityConfig_t;
+
+typedef char RF_DiskQueueType_t[1024];
+#define RF_DISK_QUEUE_TYPE_NONE ""
+
+/* values for the 'type' field in a reconstruction buffer */
+typedef int RF_RbufType_t;
+#define RF_RBUF_TYPE_EXCLUSIVE 0 /* this buf assigned exclusively to one disk */
+#define RF_RBUF_TYPE_FLOATING 1 /* this is a floating recon buf */
+#define RF_RBUF_TYPE_FORCED 2 /* this rbuf was allocated to complete a forced recon */
+
+typedef char RF_IoType_t;
+#define RF_IO_TYPE_READ 'r'
+#define RF_IO_TYPE_WRITE 'w'
+#define RF_IO_TYPE_NOP 'n'
+#define RF_IO_IS_R_OR_W(_type_) (((_type_) == RF_IO_TYPE_READ) \
+ || ((_type_) == RF_IO_TYPE_WRITE))
+
+#ifdef SIMULATE
+typedef double RF_TICS_t;
+typedef int RF_Owner_t;
+#endif /* SIMULATE */
+
+typedef void (*RF_VoidFuncPtr)(void *,...);
+
+typedef RF_uint32 RF_AccessStripeMapFlags_t;
+typedef RF_uint32 RF_DiskQueueDataFlags_t;
+typedef RF_uint32 RF_DiskQueueFlags_t;
+typedef RF_uint32 RF_RaidAccessFlags_t;
+
+#define RF_DISKQUEUE_DATA_FLAGS_NONE ((RF_DiskQueueDataFlags_t)0)
+
+typedef struct RF_AccessStripeMap_s RF_AccessStripeMap_t;
+typedef struct RF_AccessStripeMapHeader_s RF_AccessStripeMapHeader_t;
+typedef struct RF_AllocListElem_s RF_AllocListElem_t;
+typedef struct RF_CallbackDesc_s RF_CallbackDesc_t;
+typedef struct RF_ChunkDesc_s RF_ChunkDesc_t;
+typedef struct RF_CommonLogData_s RF_CommonLogData_t;
+typedef struct RF_Config_s RF_Config_t;
+typedef struct RF_CumulativeStats_s RF_CumulativeStats_t;
+typedef struct RF_DagHeader_s RF_DagHeader_t;
+typedef struct RF_DagList_s RF_DagList_t;
+typedef struct RF_DagNode_s RF_DagNode_t;
+typedef struct RF_DeclusteredConfigInfo_s RF_DeclusteredConfigInfo_t;
+typedef struct RF_DiskId_s RF_DiskId_t;
+typedef struct RF_DiskMap_s RF_DiskMap_t;
+typedef struct RF_DiskQueue_s RF_DiskQueue_t;
+typedef struct RF_DiskQueueData_s RF_DiskQueueData_t;
+typedef struct RF_DiskQueueSW_s RF_DiskQueueSW_t;
+typedef struct RF_Etimer_s RF_Etimer_t;
+typedef struct RF_EventCreate_s RF_EventCreate_t;
+typedef struct RF_FreeList_s RF_FreeList_t;
+typedef struct RF_LockReqDesc_s RF_LockReqDesc_t;
+typedef struct RF_LockTableEntry_s RF_LockTableEntry_t;
+typedef struct RF_MCPair_s RF_MCPair_t;
+typedef struct RF_OwnerInfo_s RF_OwnerInfo_t;
+typedef struct RF_ParityLog_s RF_ParityLog_t;
+typedef struct RF_ParityLogAppendQueue_s RF_ParityLogAppendQueue_t;
+typedef struct RF_ParityLogData_s RF_ParityLogData_t;
+typedef struct RF_ParityLogDiskQueue_s RF_ParityLogDiskQueue_t;
+typedef struct RF_ParityLogQueue_s RF_ParityLogQueue_t;
+typedef struct RF_ParityLogRecord_s RF_ParityLogRecord_t;
+typedef struct RF_PerDiskReconCtrl_s RF_PerDiskReconCtrl_t;
+typedef struct RF_PSStatusHeader_s RF_PSStatusHeader_t;
+typedef struct RF_PhysDiskAddr_s RF_PhysDiskAddr_t;
+typedef struct RF_PropHeader_s RF_PropHeader_t;
+typedef struct RF_Raid_s RF_Raid_t;
+typedef struct RF_RaidAccessDesc_s RF_RaidAccessDesc_t;
+typedef struct RF_RaidDisk_s RF_RaidDisk_t;
+typedef struct RF_RaidLayout_s RF_RaidLayout_t;
+typedef struct RF_RaidReconDesc_s RF_RaidReconDesc_t;
+typedef struct RF_ReconBuffer_s RF_ReconBuffer_t;
+typedef struct RF_ReconConfig_s RF_ReconConfig_t;
+typedef struct RF_ReconCtrl_s RF_ReconCtrl_t;
+typedef struct RF_ReconDoneProc_s RF_ReconDoneProc_t;
+typedef struct RF_ReconEvent_s RF_ReconEvent_t;
+typedef struct RF_ReconMap_s RF_ReconMap_t;
+typedef struct RF_ReconMapListElem_s RF_ReconMapListElem_t;
+typedef struct RF_ReconParityStripeStatus_s RF_ReconParityStripeStatus_t;
+typedef struct RF_RedFuncs_s RF_RedFuncs_t;
+typedef struct RF_RegionBufferQueue_s RF_RegionBufferQueue_t;
+typedef struct RF_RegionInfo_s RF_RegionInfo_t;
+typedef struct RF_ShutdownList_s RF_ShutdownList_t;
+typedef struct RF_SpareTableEntry_s RF_SpareTableEntry_t;
+typedef struct RF_SparetWait_s RF_SparetWait_t;
+typedef struct RF_StripeLockDesc_s RF_StripeLockDesc_t;
+typedef struct RF_ThreadGroup_s RF_ThreadGroup_t;
+typedef struct RF_ThroughputStats_s RF_ThroughputStats_t;
+
+/*
+ * Important assumptions regarding ordering of the states in this list
+ * have been made!!!
+ * Before disturbing this ordering, look at code in rf_states.c
+ */
+typedef enum RF_AccessState_e {
+ /* original states */
+	rf_QuiesceState, /* handles quiescence for reconstruction */
+ rf_IncrAccessesCountState, /* count accesses in flight */
+ rf_DecrAccessesCountState,
+ rf_MapState, /* map access to disk addresses */
+ rf_LockState, /* take stripe locks */
+ rf_CreateDAGState, /* create DAGs */
+ rf_ExecuteDAGState, /* execute DAGs */
+ rf_ProcessDAGState, /* DAGs are completing- check if correct, or if we need to retry */
+ rf_CleanupState, /* release stripe locks, clean up */
+ rf_LastState /* must be the last state */
+} RF_AccessState_t;
+
+#define RF_MAXROW 10 /* these are arbitrary and can be modified at will */
+#define RF_MAXCOL 40
+#define RF_MAXSPARE 10
+#define RF_MAXDBGV 75 /* max number of debug variables */
+
+union RF_GenericParam_u {
+ void *p;
+ RF_uint64 v;
+};
+typedef union RF_GenericParam_u RF_DagParam_t;
+typedef union RF_GenericParam_u RF_CBParam_t;
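+
+/*
+ * A generic parameter can carry either a pointer or a 64-bit integral value,
+ * e.g. (illustrative; bufPtr and sectorNum are placeholder names):
+ *
+ *	RF_DagParam_t param;
+ *
+ *	param.p = (void *) bufPtr;		(pointer payload)
+ *	param.v = (RF_uint64) sectorNum;	(integral payload)
+ */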
+
+#endif /* _RF__RF_TYPES_H_ */
diff --git a/sys/dev/raidframe/rf_utils.c b/sys/dev/raidframe/rf_utils.c
new file mode 100644
index 00000000000..be379ed8e58
--- /dev/null
+++ b/sys/dev/raidframe/rf_utils.c
@@ -0,0 +1,231 @@
+/* $OpenBSD: rf_utils.c,v 1.1 1999/01/11 14:29:54 niklas Exp $ */
+/* $NetBSD: rf_utils.c,v 1.1 1998/11/13 04:20:35 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/****************************************
+ *
+ * rf_utils.c -- various support routines
+ *
+ ****************************************/
+
+/* :
+ * Log: rf_utils.c,v
+ * Revision 1.20 1996/07/27 23:36:08 jimz
+ * Solaris port of simulator
+ *
+ * Revision 1.19 1996/07/22 19:52:16 jimz
+ * switched node params to RF_DagParam_t, a union of
+ * a 64-bit int and a void *, for better portability
+ * attempted hpux port, but failed partway through for
+ * lack of a single C compiler capable of compiling all
+ * source files
+ *
+ * Revision 1.18 1996/07/15 17:22:18 jimz
+ * nit-pick code cleanup
+ * resolve stdlib problems on DEC OSF
+ *
+ * Revision 1.17 1996/06/09 02:36:46 jimz
+ * lots of little crufty cleanup- fixup whitespace
+ * issues, comment #ifdefs, improve typing in some
+ * places (esp size-related)
+ *
+ * Revision 1.16 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.15 1996/06/03 23:28:26 jimz
+ * more bugfixes
+ * check in tree to sync for IPDS runs with current bugfixes
+ * there still may be a problem with threads in the script test
+ * getting I/Os stuck- not trivially reproducible (runs ~50 times
+ * in a row without getting stuck)
+ *
+ * Revision 1.14 1996/06/02 17:31:48 jimz
+ * Moved a lot of global stuff into array structure, where it belongs.
+ * Fixed up paritylogging, pss modules in this manner. Some general
+ * code cleanup. Removed lots of dead code, some dead files.
+ *
+ * Revision 1.13 1996/05/27 18:56:37 jimz
+ * more code cleanup
+ * better typing
+ * compiles in all 3 environments
+ *
+ * Revision 1.12 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.11 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.10 1995/12/06 15:17:44 root
+ * added copyright info
+ *
+ */
+
+#include "rf_threadstuff.h"
+
+#ifdef _KERNEL
+#define KERNEL
+#endif
+
+#ifndef KERNEL
+#include <stdio.h>
+#endif /* !KERNEL */
+#include <sys/time.h>
+
+#include "rf_threadid.h"
+#include "rf_utils.h"
+#include "rf_debugMem.h"
+#include "rf_alloclist.h"
+#include "rf_general.h"
+#include "rf_sys.h"
+
+#ifndef KERNEL
+#include "rf_randmacros.h"
+#endif /* !KERNEL */
+
+/* creates & zeros 2-d array with b rows and k columns (MCH) */
+RF_RowCol_t **rf_make_2d_array(b, k, allocList)
+ int b;
+ int k;
+ RF_AllocListElem_t *allocList;
+{
+ RF_RowCol_t **retval, i;
+
+ RF_MallocAndAdd(retval, b * sizeof(RF_RowCol_t *), (RF_RowCol_t **), allocList);
+ for (i=0; i<b; i++) {
+ RF_MallocAndAdd(retval[i], k * sizeof(RF_RowCol_t), (RF_RowCol_t *), allocList);
+ (void) bzero((char *) retval[i], k*sizeof(RF_RowCol_t));
+ }
+ return(retval);
+}
+
+void rf_free_2d_array(a, b, k)
+ RF_RowCol_t **a;
+ int b;
+ int k;
+{
+ RF_RowCol_t i;
+
+ for (i=0; i<b; i++)
+ RF_Free(a[i], k*sizeof(RF_RowCol_t));
+	RF_Free(a, b*sizeof(RF_RowCol_t *));
+}
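+
+/*
+ * Illustrative pairing of the two routines above (numRows, numCols and
+ * allocList are placeholder names).  Note that rf_free_2d_array must be
+ * given the same b and k that were used at creation time, since RF_Free
+ * needs the original allocation sizes:
+ *
+ *	RF_RowCol_t **map = rf_make_2d_array(numRows, numCols, allocList);
+ *	...
+ *	rf_free_2d_array(map, numRows, numCols);
+ */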
+
+
+/* creates & zeros a 1-d array with c columns */
+RF_RowCol_t *rf_make_1d_array(c, allocList)
+ int c;
+ RF_AllocListElem_t *allocList;
+{
+ RF_RowCol_t *retval;
+
+ RF_MallocAndAdd(retval, c * sizeof(RF_RowCol_t), (RF_RowCol_t *), allocList);
+ (void) bzero((char *) retval, c*sizeof(RF_RowCol_t));
+ return(retval);
+}
+
+void rf_free_1d_array(a, n)
+ RF_RowCol_t *a;
+ int n;
+{
+ RF_Free(a, n * sizeof(RF_RowCol_t));
+}
+
+/* Euclid's algorithm: finds and returns the greatest common divisor
+ * between m and n. (MCH)
+ */
+int rf_gcd(m, n)
+ int m;
+ int n;
+{
+ int t;
+
+ while (m>0) {
+ t = n % m;
+ n = m;
+ m = t;
+ }
+ return(n);
+}
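+
+/*
+ * For example, rf_gcd(12, 18) returns 6, and rf_gcd(0, n) returns n; this is
+ * the standard remainder form of Euclid's algorithm.
+ */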
+
+#if !defined(KERNEL) && !defined(SIMULATE) && defined(__osf__)
+/* this is used to generate a random number when _FASTRANDOM is off
+ * in randmacros.h
+ */
+long rf_do_random(rval, rdata)
+ long *rval;
+ struct random_data *rdata;
+{
+ int a, b;
+ long c;
+ /*
+ * random_r() generates random 32-bit values. OR them together.
+ */
+ if (random_r(&a, rdata)!=0) {
+ fprintf(stderr,"Yikes! call to random_r failed\n");
+ exit(1);
+ }
+ if (random_r(&b, rdata)!=0) {
+ fprintf(stderr,"Yikes! call to random_r failed\n");
+ exit(1);
+ }
+ c = ((long)a)<<32;
+ *rval = c|b;
+ return(*rval);
+}
+#endif /* !KERNEL && !SIMULATE && __osf__ */
+
+/* these convert between text and integer. Apparently the regular C macros
+ * for doing this are not available in the kernel
+ */
+
+#define ISDIGIT(x) ( (x) >= '0' && (x) <= '9' )
+#define ISHEXCHAR(x) ( ((x) >= 'a' && (x) <= 'f') || ((x) >= 'A' && (x) <= 'F') )
+#define ISHEX(x) ( ISDIGIT(x) || ISHEXCHAR(x) )
+#define HC2INT(x) ( ((x) >= 'a' && (x) <= 'f') ? (x) - 'a' + 10 : \
+ ( ((x) >= 'A' && (x) <= 'F') ? (x) - 'A' + 10 : (x - '0') ) )
+
+int rf_atoi(p)
+ char *p;
+{
+ int val = 0, negate = 0;
+
+ if (*p == '-') {negate=1; p++;}
+ for ( ; ISDIGIT(*p); p++) val = 10 * val + (*p - '0');
+ return((negate) ? -val : val);
+}
+
+int rf_htoi(p)
+ char *p;
+{
+ int val = 0;
+	for ( ; ISHEX(*p); p++) val = 16 * val + HC2INT(*p);
+ return(val);
+}
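+
+/*
+ * For example, rf_atoi("-42") returns -42 and rf_htoi("ff") returns 255.
+ * Neither routine skips leading whitespace or a "0x" prefix; conversion
+ * simply stops at the first character that is not part of the number.
+ */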
diff --git a/sys/dev/raidframe/rf_utils.h b/sys/dev/raidframe/rf_utils.h
new file mode 100644
index 00000000000..73eede8f131
--- /dev/null
+++ b/sys/dev/raidframe/rf_utils.h
@@ -0,0 +1,90 @@
+/* $OpenBSD: rf_utils.h,v 1.1 1999/01/11 14:29:55 niklas Exp $ */
+/* $NetBSD: rf_utils.h,v 1.1 1998/11/13 04:20:35 oster Exp $ */
+/*
+ * Copyright (c) 1995 Carnegie-Mellon University.
+ * All rights reserved.
+ *
+ * Author: Mark Holland
+ *
+ * Permission to use, copy, modify and distribute this software and
+ * its documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ *
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
+ * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ *
+ * Carnegie Mellon requests users of this software to return to
+ *
+ * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
+ * School of Computer Science
+ * Carnegie Mellon University
+ * Pittsburgh PA 15213-3890
+ *
+ * any improvements or extensions that they make and grant Carnegie the
+ * rights to redistribute these changes.
+ */
+
+/***************************************
+ *
+ * rf_utils.h -- header file for rf_utils.c
+ *
+ ***************************************/
+
+/* :
+ * Log: rf_utils.h,v
+ * Revision 1.7 1996/06/07 21:33:04 jimz
+ * begin using consistent types for sector numbers,
+ * stripe numbers, row+col numbers, recon unit numbers
+ *
+ * Revision 1.6 1996/06/02 17:31:48 jimz
+ * Moved a lot of global stuff into array structure, where it belongs.
+ * Fixed up paritylogging, pss modules in this manner. Some general
+ * code cleanup. Removed lots of dead code, some dead files.
+ *
+ * Revision 1.5 1996/05/23 21:46:35 jimz
+ * checkpoint in code cleanup (release prep)
+ * lots of types, function names have been fixed
+ *
+ * Revision 1.4 1996/05/18 19:51:34 jimz
+ * major code cleanup- fix syntax, make some types consistent,
+ * add prototypes, clean out dead code, et cetera
+ *
+ * Revision 1.3 1995/12/06 15:17:53 root
+ * added copyright info
+ *
+ */
+
+#ifndef _RF__RF_UTILS_H_
+#define _RF__RF_UTILS_H_
+
+#include "rf_types.h"
+#include "rf_alloclist.h"
+#include "rf_threadstuff.h"
+
+char *rf_find_non_white(char *p);
+char *rf_find_white(char *p);
+RF_RowCol_t **rf_make_2d_array(int b, int k, RF_AllocListElem_t *allocList);
+RF_RowCol_t *rf_make_1d_array(int c, RF_AllocListElem_t *allocList);
+void rf_free_2d_array(RF_RowCol_t **a, int b, int k);
+void rf_free_1d_array(RF_RowCol_t *a, int n);
+int rf_gcd(int m, int n);
+int rf_atoi(char *p);
+int rf_htoi(char *p);
+
+#define RF_USEC_PER_SEC 1000000
+#define RF_TIMEVAL_DIFF(_start_,_end_,_diff_) { \
+ if ((_end_)->tv_usec < (_start_)->tv_usec) { \
+ (_diff_)->tv_usec = ((_end_)->tv_usec + RF_USEC_PER_SEC) \
+ - (_start_)->tv_usec; \
+ (_diff_)->tv_sec = ((_end_)->tv_sec-1) - (_start_)->tv_sec; \
+ } \
+ else { \
+ (_diff_)->tv_usec = (_end_)->tv_usec - (_start_)->tv_usec; \
+ (_diff_)->tv_sec = (_end_)->tv_sec - (_start_)->tv_sec; \
+ } \
+}
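+
+/*
+ * Illustrative use (the variable names are arbitrary): given two struct
+ * timeval samples taken around an operation, RF_TIMEVAL_DIFF computes the
+ * elapsed time, normalizing the microseconds field into [0, 1000000):
+ *
+ *	struct timeval start, end, diff;
+ *	...take the start and end samples...
+ *	RF_TIMEVAL_DIFF(&start, &end, &diff);
+ */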
+
+#endif /* !_RF__RF_UTILS_H_ */