/*-
 * Copyright (c) 1997, 1998, 1999
 *  Nan Yang Computer Services Limited.  All rights reserved.
 *
 *  Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project.
 *
 *  Written by Greg Lehey
 *
 *  This software is distributed under the so-called ``Berkeley
 *  License'':
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by Nan Yang Computer
 *      Services Limited.
 * 4. Neither the name of the Company nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * This software is provided ``as is'', and any express or implied
 * warranties, including, but not limited to, the implied warranties of
 * merchantability and fitness for a particular purpose are disclaimed.
 * In no event shall the company or contributors be liable for any
 * direct, indirect, incidental, special, exemplary, or consequential
 * damages (including, but not limited to, procurement of substitute
 * goods or services; loss of use, data, or profits; or business
 * interruption) however caused and on any theory of liability, whether
 * in contract, strict liability, or tort (including negligence or
 * otherwise) arising in any way out of the use of this software, even if
 * advised of the possibility of such damage.
 *
 * $Id: vinumrequest.c,v 1.26 1999/12/30 07:38:33 grog Exp grog $
 * $FreeBSD$
 */

#include <dev/vinum/vinumhdr.h>
#include <dev/vinum/request.h>
#include <sys/resourcevar.h>

enum requeststatus bre(struct request *rq,
    int plexno,
    daddr_t * diskstart,
    daddr_t diskend);
enum requeststatus bre5(struct request *rq,
    int plexno,
    daddr_t * diskstart,
    daddr_t diskend);
enum requeststatus build_read_request(struct request *rq, int volplexno);
enum requeststatus build_write_request(struct request *rq);
enum requeststatus build_rq_buffer(struct rqelement *rqe, struct plex *plex);
int find_alternate_sd(struct request *rq);
int check_range_covered(struct request *);
void complete_rqe(struct buf *bp);
void complete_raid5_write(struct rqelement *);
int abortrequest(struct request *rq, int error);
void sdio_done(struct buf *bp);
int vinum_bounds_check(struct buf *bp, struct volume *vol);
caddr_t allocdatabuf(struct rqelement *rqe);
void freedatabuf(struct rqelement *rqe);

#ifdef VINUMDEBUG
struct rqinfo rqinfo[RQINFO_SIZE];
struct rqinfo *rqip = rqinfo;

void
logrq(enum rqinfo_type type, union rqinfou info, struct buf *ubp)
{
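    /*
     * Block interrupts: the trace buffer is also written from the
     * bottom half (I/O completion), so protect rqip while we use it.
     */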
    int s = splhigh();

    microtime(&rqip->timestamp);			    /* when did this happen? */
    rqip->type = type;
    rqip->bp = ubp;					    /* user buffer */
    switch (type) {
    case loginfo_user_bp:
    case loginfo_user_bpl:
    case loginfo_sdio:					    /* subdisk I/O */
    case loginfo_sdiol:					    /* subdisk I/O launch */
    case loginfo_sdiodone:				    /* subdisk I/O complete */
	bcopy(info.bp, &rqip->info.b, sizeof(struct buf));
	rqip->devmajor = major(info.bp->b_dev);
	rqip->devminor = minor(info.bp->b_dev);
	break;

    case loginfo_iodone:
    case loginfo_rqe:
    case loginfo_raid5_data:
    case loginfo_raid5_parity:
	bcopy(info.rqe, &rqip->info.rqe, sizeof(struct rqelement));
	rqip->devmajor = major(info.rqe->b.b_dev);
	rqip->devminor = minor(info.rqe->b.b_dev);
	break;

    case loginfo_lockwait:
    case loginfo_lock:
    case loginfo_unlock:
	bcopy(info.lockinfo, &rqip->info.lockinfo, sizeof(struct rangelock));

	break;

    case loginfo_unused:
	break;
    }
    rqip++;
    if (rqip >= &rqinfo[RQINFO_SIZE])			    /* wrap around */
	rqip = rqinfo;
    splx(s);
}

#endif

void
vinumstrategy(struct bio *biop)
{
    struct buf *bp = (struct buf *) biop;
    int volno;
    struct volume *vol = NULL;

    switch (DEVTYPE(bp->b_dev)) {
    case VINUM_SD_TYPE:
    case VINUM_RAWSD_TYPE:
	sdio(bp);
	return;

	/*
	 * In fact, vinum doesn't handle drives: they're
	 * handled directly by the disk drivers
	 */
    case VINUM_DRIVE_TYPE:
    default:
	bp->b_error = EIO;				    /* I/O error */
	bp->b_io.bio_flags |= BIO_ERROR;
	bufdone(bp);
	return;

    case VINUM_VOLUME_TYPE:				    /* volume I/O */
	volno = Volno(bp->b_dev);
	vol = &VOL[volno];
	if (vol->state != volume_up) {			    /* can't access this volume */
	    bp->b_error = EIO;				    /* I/O error */
	    bp->b_io.bio_flags |= BIO_ERROR;
	    bufdone(bp);
	    return;
	}
	if (vinum_bounds_check(bp, vol) <= 0) {		    /* don't like them bounds */
	    bufdone(bp);
	    return;
	}
	/* FALLTHROUGH */
	/*
	 * Plex I/O is pretty much the same as volume I/O
	 * for a single plex.  Indicate this by passing a NULL
	 * pointer (set above) for the volume
	 */
    case VINUM_PLEX_TYPE:
    case VINUM_RAWPLEX_TYPE:
	bp->b_resid = bp->b_bcount;			    /* transfer everything */
	vinumstart(bp, 0);
	return;
    }
}

/*
 * Start a transfer.  Return -1 on error,
 * 0 if OK, 1 if we need to retry.
 * Parameter reviveok is set when doing
 * transfers for revives: it allows transfers to
 * be started immediately when a revive is in
 * progress.  During revive, normal transfers
 * are queued if they share address space with
 * a currently active revive operation.
 */
int
vinumstart(struct buf *bp, int reviveok)
{
    int plexno;
    int maxplex;					    /* maximum number of plexes to handle */
    struct volume *vol;
    struct request *rq;					    /* build up our request here */
    enum requeststatus status;

#if VINUMDEBUG
    if (debug & DEBUG_LASTREQS)
	logrq(loginfo_user_bp, (union rqinfou) bp, bp);
#endif

    if ((bp->b_bcount % DEV_BSIZE) != 0) {		    /* bad length */
	bp->b_error = EINVAL;				    /* invalid size */
	bp->b_io.bio_flags |= BIO_ERROR;
	bufdone(bp);
	return -1;
    }
    rq = (struct request *) Malloc(sizeof(struct request)); /* allocate a request struct */
    if (rq == NULL) {					    /* can't do it */
	bp->b_error = ENOMEM;				    /* can't get memory */
	bp->b_io.bio_flags |= BIO_ERROR;
	bufdone(bp);
	return -1;
    }
    bzero(rq, sizeof(struct request));

    /*
     * Note the volume.  The volume pointer can be NULL, which
     * the request building functions use as an
     * indication of single plex I/O.
     */
    rq->bp = bp;					    /* and the user buffer struct */

    if (DEVTYPE(bp->b_dev) == VINUM_VOLUME_TYPE) {	    /* it's a volume, */
	rq->volplex.volno = Volno(bp->b_dev);		    /* get the volume number */
	vol = &VOL[rq->volplex.volno];			    /* and point to it */
	vol->active++;					    /* one more active request */
	maxplex = vol->plexes;				    /* consider all its plexes */
    } else {
	vol = NULL;					    /* no volume */
	rq->volplex.plexno = Plexno(bp->b_dev);		    /* point to the plex */
	rq->isplex = 1;					    /* note that it's a plex */
	maxplex = 1;					    /* just the one plex */
    }

    if (bp->b_iocmd == BIO_READ) {
	/*
	 * This is a read request.  Decide
	 * which plex to read from.
	 *
	 * There's a potential race condition here,
	 * since we're not locked, and we could end
	 * up multiply incrementing the round-robin
	 * counter.  This doesn't have any serious
	 * effects, however.
	 */
	if (vol != NULL) {
	    plexno = vol->preferred_plex;		    /* get the plex to use */
	    if (plexno < 0) {				    /* round robin */
		plexno = vol->last_plex_read;
		vol->last_plex_read++;
		if (vol->last_plex_read >= vol->plexes)	    /* got to the end? */
		    vol->last_plex_read = 0;		    /* wrap around */
	    }
	    status = build_read_request(rq, plexno);	    /* build a request */
	} else {
	    daddr_t diskaddr = bp->b_blkno;		    /* start offset of transfer */
	    status = bre(rq,				    /* build a request list */
		rq->volplex.plexno,
		&diskaddr,
		diskaddr + (bp->b_bcount / DEV_BSIZE));
	}

	if ((status > REQUEST_RECOVERED)		    /* can't satisfy it */
	||(bp->b_flags & B_DONE)) {			    /* XXX shouldn't get this without bad status */
	    if (status == REQUEST_DOWN) {		    /* not enough subdisks */
		bp->b_error = EIO;			    /* I/O error */
		bp->b_io.bio_flags |= BIO_ERROR;
	    }
	    bufdone(bp);
	    freerq(rq);
	    return -1;
	}
	return launch_requests(rq, reviveok);		    /* now start the requests if we can */
    } else
	/*
	 * This is a write operation.  We write to all plexes.  If this is
	 * a RAID-4 or RAID-5 plex, we must also update the parity stripe.
	 */
    {
	if (vol != NULL)
	    status = build_write_request(rq);		    /* write to all plexes of the volume */
	else {						    /* plex I/O */
	    daddr_t diskstart;

	    diskstart = bp->b_blkno;			    /* start offset of transfer */
	    status = bre(rq,
		Plexno(bp->b_dev),
		&diskstart,
		bp->b_blkno + (bp->b_bcount / DEV_BSIZE));  /* build requests for the plex */
	}
	if ((status > REQUEST_RECOVERED)		    /* can't satisfy it */
	||(bp->b_flags & B_DONE)) {			    /* XXX shouldn't get this without bad status */
	    if (status == REQUEST_DOWN) {		    /* not enough subdisks */
		bp->b_error = EIO;			    /* I/O error */
		bp->b_io.bio_flags |= BIO_ERROR;
	    }
	    if ((bp->b_flags & B_DONE) == 0)
		bufdone(bp);
	    freerq(rq);
	    return -1;
	}
	return launch_requests(rq, reviveok);		    /* now start the requests if we can */
    }
}

/*
 * Call the low-level strategy routines to
 * perform the requests in a struct request
 */
int
launch_requests(struct request *rq, int reviveok)
{
    struct rqgroup *rqg;
    int rqno;						    /* loop index */
    struct rqelement *rqe;				    /* current element */
    struct drive *drive;
    int rcount;						    /* request count */

    /*
     * First find out whether we're reviving, and the
     * request contains a conflict.  If so, we hang
     * the request off plex->waitlist of the first
     * plex we find which is reviving
     */

    if ((rq->flags & XFR_REVIVECONFLICT)		    /* possible revive conflict */
    &&(!reviveok)) {					    /* and we don't want to do it now, */
	struct sd *sd;
	struct request *waitlist;			    /* point to the waitlist */

	sd = &SD[rq->sdno];
	if (sd->waitlist != NULL) {			    /* something there already, */
	    waitlist = sd->waitlist;
	    while (waitlist->next != NULL)		    /* find the end */
		waitlist = waitlist->next;
	    waitlist->next = rq;			    /* hook our request there */
	} else
	    sd->waitlist = rq;				    /* hook our request at the front */

#if VINUMDEBUG
	if (debug & DEBUG_REVIVECONFLICT)
	    log(LOG_DEBUG,
		"Revive conflict sd %d: %p\n%s dev %d.%d, offset 0x%x, length %ld\n",
		rq->sdno,
		rq,
		rq->bp->b_iocmd == BIO_READ ? "Read" : "Write",
		major(rq->bp->b_dev),
		minor(rq->bp->b_dev),
		rq->bp->b_blkno,
		rq->bp->b_bcount);
#endif
	return 0;					    /* and get out of here */
    }
    rq->active = 0;					    /* nothing yet */
#if VINUMDEBUG
    if (debug & DEBUG_ADDRESSES)
	log(LOG_DEBUG,
	    "Request: %p\n%s dev %d.%d, offset 0x%x, length %ld\n",
	    rq,
	    rq->bp->b_iocmd == BIO_READ ? "Read" : "Write",
	    major(rq->bp->b_dev),
	    minor(rq->bp->b_dev),
	    rq->bp->b_blkno,
	    rq->bp->b_bcount);
    vinum_conf.lastrq = rq;
    vinum_conf.lastbuf = rq->bp;
    if (debug & DEBUG_LASTREQS)
	logrq(loginfo_user_bpl, (union rqinfou) rq->bp, rq->bp);
#endif

    /*
     * We used to have an splbio() here anyway, out
     * of superstition.  With the division of labour
     * below (first count the requests, then issue
     * them), it looks as if we don't need this
     * splbio() protection.  In fact, as dillon
     * points out, there's a race condition
     * incrementing and decrementing rq->active and
     * rqg->active.  This splbio() didn't help
     * there, because the device strategy routine
     * can sleep.  Solve this by putting shorter
     * duration locks on the code.
     */
    /*
     * This loop happens without any participation
     * of the bottom half, so it requires no
     * protection.
     */
    for (rqg = rq->rqg; rqg != NULL; rqg = rqg->next) {	    /* through the whole request chain */
	rqg->active = rqg->count;			    /* they're all active */
	for (rqno = 0; rqno < rqg->count; rqno++) {
	    rqe = &rqg->rqe[rqno];
	    if (rqe->flags & XFR_BAD_SUBDISK)		    /* this subdisk is bad, */
		rqg->active--;				    /* one less active request */
	}
	if (rqg->active)				    /* we have at least one active request, */
	    rq->active++;				    /* one more active request group */
    }

    /*
     * Now fire off the requests.  In this loop the
     * bottom half could be completing requests
     * before we finish, so we need splbio() protection.
     */
    for (rqg = rq->rqg; rqg != NULL;) {			    /* through the whole request chain */
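	/*
	 * A non-negative lockbase means the request builder wants a
	 * range lock on the plex before this group is issued; the
	 * RAID-4/5 code uses this to keep parity updates consistent.
	 */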
	if (rqg->lockbase >= 0)				    /* this rqg needs a lock first */
	    rqg->lock = lockrange(rqg->lockbase, rqg->rq->bp, &PLEX[rqg->plexno]);
	rcount = rqg->count;
	for (rqno = 0; rqno < rcount;) {
	    rqe = &rqg->rqe[rqno];

	    /*
	     * Point to next rqg before the bottom end
	     * changes the structures.
	     */
	    if (++rqno >= rcount)
		rqg = rqg->next;
	    if ((rqe->flags & XFR_BAD_SUBDISK) == 0) {	    /* this subdisk is good, */
		drive = &DRIVE[rqe->driveno];		    /* look at drive */
		drive->active++;
		if (drive->active >= drive->maxactive)
		    drive->maxactive = drive->active;
		vinum_conf.active++;
		if (vinum_conf.active >= vinum_conf.maxactive)
		    vinum_conf.maxactive = vinum_conf.active;

#ifdef VINUMDEBUG
		if (debug & DEBUG_ADDRESSES)
		    log(LOG_DEBUG,
			"  %s dev %d.%d, sd %d, offset 0x%x, devoffset 0x%x, length %ld\n",
			rqe->b.b_iocmd == BIO_READ ? "Read" : "Write",
			major(rqe->b.b_dev),
			minor(rqe->b.b_dev),
			rqe->sdno,
			(u_int) (rqe->b.b_blkno - SD[rqe->sdno].driveoffset),
			rqe->b.b_blkno,
			rqe->b.b_bcount);
		if (debug & DEBUG_LASTREQS)
		    logrq(loginfo_rqe, (union rqinfou) rqe, rq->bp);
#endif
		/* fire off the request */
		DEV_STRATEGY(&rqe->b, 0);
	    }
	}
    }
    return 0;
}

/*
 * define the low-level requests needed to perform a
 * high-level I/O operation for a specific plex 'plexno'.
 *
 * Return REQUEST_OK if all subdisks involved in the request are up,
 * REQUEST_DOWN if some subdisks are not up, and REQUEST_EOF if the
 * request is at least partially outside the bounds of the subdisks.
 *
 * Modify the pointer *diskstart to point to the end address.  On
 * read, return on the first bad subdisk, so that the caller
 * (build_read_request) can try alternatives.
 *
 * On entry to this routine, the rqg structures are not assigned.  The
 * assignment is performed by expandrq().  Strictly speaking, the
 * elements rqe->sdno of all entries should be set to -1, since 0
 * (from bzero) is a valid subdisk number.  We avoid this problem by
 * initializing the ones we use, and not looking at the others (index
 * >= rqg->requests).
 */
enum requeststatus
bre(struct request *rq,
    int plexno,
    daddr_t * diskaddr,
    daddr_t diskend)
{
    int sdno;
    struct sd *sd;
    struct rqgroup *rqg;
    struct buf *bp;					    /* user's bp */
    struct plex *plex;
    enum requeststatus status;				    /* return value */
    daddr_t plexoffset;					    /* offset of transfer in plex */
    daddr_t stripebase;					    /* base address of stripe (1st subdisk) */
    daddr_t stripeoffset;				    /* offset in stripe */
    daddr_t blockoffset;				    /* offset in stripe on subdisk */
    struct rqelement *rqe;				    /* point to this request information */
    daddr_t diskstart = *diskaddr;			    /* remember where this transfer starts */
    enum requeststatus s;				    /* temp return value */

    bp = rq->bp;					    /* buffer pointer */
    status = REQUEST_OK;				    /* return value: OK until proven otherwise */
    plex = &PLEX[plexno];				    /* point to the plex */

    switch (plex->organization) {
    case plex_concat:
	sd = NULL;					    /* (keep compiler quiet) */
	for (sdno = 0; sdno < plex->subdisks; sdno++) {
	    sd = &SD[plex->sdnos[sdno]];
	    if (*diskaddr < sd->plexoffset)		    /* we must have a hole, */
		status = REQUEST_DEGRADED;		    /* note the fact */
	    if (*diskaddr < (sd->plexoffset + sd->sectors)) { /* the request starts in this subdisk */
		rqg = allocrqg(rq, 1);			    /* space for the request */
		if (rqg == NULL) {			    /* malloc failed */
		    bp->b_error = ENOMEM;
		    bp->b_io.bio_flags |= BIO_ERROR;
		    bufdone(bp);
		    return REQUEST_ENOMEM;
		}
		rqg->plexno = plexno;

		rqe = &rqg->rqe[0];			    /* point to the element */
		rqe->rqg = rqg;				    /* group */
		rqe->sdno = sd->sdno;			    /* put in the subdisk number */
		plexoffset = *diskaddr;			    /* start offset in plex */
		rqe->sdoffset = plexoffset - sd->plexoffset; /* start offset in subdisk */
		rqe->useroffset = plexoffset - diskstart;   /* start offset in user buffer */
		rqe->dataoffset = 0;
		rqe->datalen = min(diskend - *diskaddr,	    /* number of sectors to transfer in this sd */
		    sd->sectors - rqe->sdoffset);
		rqe->groupoffset = 0;			    /* no groups for concatenated plexes */
		rqe->grouplen = 0;
		rqe->buflen = rqe->datalen;		    /* buffer length is data buffer length */
		rqe->flags = 0;
		rqe->driveno = sd->driveno;
		if (sd->state != sd_up) {		    /* *now* we find the sd is down */
		    s = checksdstate(sd, rq, *diskaddr, diskend); /* do we need to change state? */
		    if (s == REQUEST_DOWN) {		    /* down? */
			rqe->flags = XFR_BAD_SUBDISK;	    /* yup */
			if (rq->bp->b_iocmd == BIO_READ)    /* read request, */
			    return REQUEST_DEGRADED;	    /* give up here */
			/*
			 * If we're writing, don't give up
			 * because of a bad subdisk.  Go
			 * through to the bitter end, but note
			 * which ones we can't access.
			 */
			status = REQUEST_DEGRADED;	    /* can't do it all */
		    }
		}
		*diskaddr += rqe->datalen;		    /* bump the address */
		if (build_rq_buffer(rqe, plex)) {	    /* build the buffer */
		    deallocrqg(rqg);
		    bp->b_error = ENOMEM;
		    bp->b_io.bio_flags |= BIO_ERROR;
		    bufdone(bp);
		    return REQUEST_ENOMEM;		    /* can't do it */
		}
	    }
	    if (*diskaddr == diskend)			    /* we're finished, */
		break;					    /* get out of here */
	}
	/*
	 * We've got to the end of the plex.  Have we got to the end of
	 * the transfer?  It would seem that having an offset beyond the
	 * end of the subdisk is an error, but in fact it can happen if
	 * the volume has another plex of different size.  There's a valid
	 * question as to why you would want to do this, but currently
	 * it's allowed.
	 *
	 * In a previous version, I returned REQUEST_DOWN here.  I think
	 * REQUEST_EOF is more appropriate now.
	 */
	if (diskend > sd->sectors + sd->plexoffset)	    /* pointing beyond EOF? */
	    status = REQUEST_EOF;
	break;

    case plex_striped:
	{
	    while (*diskaddr < diskend) {		    /* until we get it all sorted out */
		if (*diskaddr >= plex->length)		    /* beyond the end of the plex */
		    return REQUEST_EOF;			    /* can't continue */

		/* The offset of the start address from the start of the stripe. */
		stripeoffset = *diskaddr % (plex->stripesize * plex->subdisks);

		/* The plex-relative address of the start of the stripe. */
		stripebase = *diskaddr - stripeoffset;

		/* The number of the subdisk in which the start is located. */
		sdno = stripeoffset / plex->stripesize;

		/* The offset from the beginning of the stripe on this subdisk. */
		blockoffset = stripeoffset % plex->stripesize;

		sd = &SD[plex->sdnos[sdno]];		    /* the subdisk in question */
		rqg = allocrqg(rq, 1);			    /* space for the request */
		if (rqg == NULL) {			    /* malloc failed */
		    bp->b_error = ENOMEM;
		    bp->b_io.bio_flags |= BIO_ERROR;
		    bufdone(bp);
		    return REQUEST_ENOMEM;
		}
		rqg->plexno = plexno;

		rqe = &rqg->rqe[0];			    /* point to the element */
		rqe->rqg = rqg;
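		/*
		 * Each subdisk holds 1/subdisks of the plex address
		 * space, so stripebase / plex->subdisks is the offset
		 * of this stripe within the subdisk; add the offset
		 * within the stripe to get the subdisk start address.
		 */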
		rqe->sdoffset = stripebase / plex->subdisks + blockoffset; /* start offset in this subdisk */
		rqe->useroffset = *diskaddr - diskstart;    /* The offset of the start in the user buffer */
		rqe->dataoffset = 0;
		rqe->datalen = min(diskend - *diskaddr,	    /* the amount remaining to transfer */
		    plex->stripesize - blockoffset);	    /* and the amount left in this stripe */
		rqe->groupoffset = 0;			    /* no groups for striped plexes */
		rqe->grouplen = 0;
		rqe->buflen = rqe->datalen;		    /* buffer length is data buffer length */
		rqe->flags = 0;
		rqe->sdno = sd->sdno;			    /* put in the subdisk number */
		rqe->driveno = sd->driveno;

		if (sd->state != sd_up) {		    /* *now* we find the sd is down */
		    s = checksdstate(sd, rq, *diskaddr, diskend); /* do we need to change state? */
		    if (s == REQUEST_DOWN) {		    /* down? */
			rqe->flags = XFR_BAD_SUBDISK;	    /* yup */
			if (rq->bp->b_iocmd == BIO_READ)    /* read request, */
			    return REQUEST_DEGRADED;	    /* give up here */
			/*
			 * If we're writing, don't give up
			 * because of a bad subdisk.  Go through
			 * to the bitter end, but note which
			 * ones we can't access.
			 */
			status = REQUEST_DEGRADED;	    /* can't do it all */
		    }
		}
		/*
		 * It would seem that having an offset
		 * beyond the end of the subdisk is an
		 * error, but in fact it can happen if the
		 * volume has another plex of different
		 * size.  There's a valid question as to why
		 * you would want to do this, but currently
		 * it's allowed.
		 */
		if (rqe->sdoffset + rqe->datalen > sd->sectors) { /* ends beyond the end of the subdisk? */
		    rqe->datalen = sd->sectors - rqe->sdoffset;	/* truncate */
#if VINUMDEBUG
		    if (debug & DEBUG_EOFINFO) {	    /* tell on the request */
			log(LOG_DEBUG,
			    "vinum: EOF on plex %s, sd %s offset %x (user offset %x)\n",
			    plex->name,
			    sd->name,
			    (u_int) sd->sectors,
			    bp->b_blkno);
			log(LOG_DEBUG,
			    "vinum: stripebase %x, stripeoffset %x, blockoffset %x\n",
			    stripebase,
			    stripeoffset,
			    blockoffset);
		    }
#endif
		}
		if (build_rq_buffer(rqe, plex)) {	    /* build the buffer */
		    deallocrqg(rqg);
		    bp->b_error = ENOMEM;
		    bp->b_io.bio_flags |= BIO_ERROR;
		    bufdone(bp);
		    return REQUEST_ENOMEM;		    /* can't do it */
		}
		*diskaddr += rqe->datalen;		    /* look at the remainder */
		if ((*diskaddr < diskend)		    /* didn't finish the request on this stripe */
		&&(*diskaddr < plex->length)) {		    /* and there's more to come */
		    plex->multiblock++;			    /* count another one */
		    if (sdno == plex->subdisks - 1)	    /* last subdisk, */
			plex->multistripe++;		    /* another stripe as well */
		}
	    }
	}
	break;

	/*
	 * RAID-4 and RAID-5 are complicated enough to have their own
	 * function.
	 */
    case plex_raid4:
    case plex_raid5:
	status = bre5(rq, plexno, diskaddr, diskend);
	break;

    default:
	log(LOG_ERR, "vinum: invalid plex type %d in bre\n", plex->organization);
	status = REQUEST_DOWN;				    /* can't access it */
    }

    return status;
}

/*
 * Build up a request structure for reading volumes.
 * This function is not needed for plex reads, since there's
 * no recovery if a plex read can't be satisfied.
 */
enum requeststatus
build_read_request(struct request *rq,			    /* request */
    int plexindex)
{							    /* index in the volume's plex table */
    struct buf *bp;
    daddr_t startaddr;					    /* offset of previous part of transfer */
    daddr_t diskaddr;					    /* offset of current part of transfer */
    daddr_t diskend;					    /* and end offset of transfer */
    int plexno;						    /* plex index in vinum_conf */
    struct rqgroup *rqg;				    /* point to the request we're working on */
    struct volume *vol;					    /* volume in question */
    int recovered = 0;					    /* set if we recover a read */
    enum requeststatus status = REQUEST_OK;
    int plexmask;					    /* bit mask of plexes, for recovery */

    bp = rq->bp;					    /* buffer pointer */
    diskaddr = bp->b_blkno;				    /* start offset of transfer */
    diskend = diskaddr + (bp->b_bcount / DEV_BSIZE);	    /* and end offset of transfer */
    rqg = &rq->rqg[plexindex];				    /* plex request */
    vol = &VOL[rq->volplex.volno];			    /* point to volume */

    while (diskaddr < diskend) {			    /* build up request components */
	startaddr = diskaddr;
	status = bre(rq, vol->plex[plexindex], &diskaddr, diskend); /* build up a request */
	switch (status) {
	case REQUEST_OK:
	    continue;

	case REQUEST_RECOVERED:
	    /*
	     * XXX FIXME if we have more than one plex, and we can
	     * satisfy the request from another, don't use the
	     * recovered request, since it's more expensive.
	     */
	    recovered = 1;
	    break;

	case REQUEST_ENOMEM:
	    return status;
	    /*
	     * If we get here, our request is not complete.  Try
	     * to fill in the missing parts from another plex.
	     * This can happen multiple times in this function,
	     * and we reinitialize the plex mask each time, since
	     * we could have a hole in our plexes.
	     */
	case REQUEST_EOF:
	case REQUEST_DOWN:				    /* can't access the plex */
	case REQUEST_DEGRADED:				    /* can't access the plex */
	    plexmask = ((1 << vol->plexes) - 1)		    /* all plexes in the volume */
	    &~(1 << plexindex);				    /* except for the one we were looking at */
	    for (plexno = 0; plexno < vol->plexes; plexno++) {
		if (plexmask == 0)			    /* no plexes left to try */
		    return REQUEST_DOWN;		    /* failed */
		diskaddr = startaddr;			    /* start at the beginning again */
		if (plexmask & (1 << plexno)) {		    /* we haven't tried this plex yet */
		    bre(rq, vol->plex[plexno], &diskaddr, diskend); /* try a request */
		    if (diskaddr > startaddr) {		    /* we satisfied another part */
			recovered = 1;			    /* we recovered from the problem */
			status = REQUEST_OK;		    /* don't complain about it */
			break;
		    }
		}
	    }
	    if (diskaddr == startaddr)			    /* didn't get any further, */
		return status;
	}
	if (recovered)
	    vol->recovered_reads += recovered;		    /* adjust our recovery count */
    }
    return status;
}

/*
 * Build up a request structure for writes.
 * Return the best (lowest) status achieved by bre() over all the
 * plexes: REQUEST_OK if at least one plex can satisfy the whole
 * write, otherwise the least serious of the per-plex failures.
 */
enum requeststatus
build_write_request(struct request *rq)
{							    /* request */
    struct buf *bp;
    daddr_t diskstart;					    /* offset of current part of transfer */
    daddr_t diskend;					    /* and end offset of transfer */
    int plexno;						    /* plex index in vinum_conf */
    struct volume *vol;					    /* volume in question */
    enum requeststatus status;

    bp = rq->bp;					    /* buffer pointer */
    vol = &VOL[rq->volplex.volno];			    /* point to volume */
    diskend = bp->b_blkno + (bp->b_bcount / DEV_BSIZE);	    /* end offset of transfer */
    status = REQUEST_DOWN;				    /* assume the worst */
    for (plexno = 0; plexno < vol->plexes; plexno++) {
	diskstart = bp->b_blkno;			    /* start offset of transfer */
	/*
	 * Build requests for the plex.
	 * We take the best possible result here (min,
	 * not max): we're happy if we can write at all
	 */
	status = min(status, bre(rq,
		vol->plex[plexno],
		&diskstart,
		diskend));
    }
    return status;
}

/* Fill in the struct buf part of a request element. */
enum requeststatus
build_rq_buffer(struct rqelement *rqe, struct plex *plex)
{
    struct sd *sd;					    /* point to subdisk */
    struct volume *vol;
    struct buf *bp;
    struct buf *ubp;					    /* user (high level) buffer header */

    vol = &VOL[rqe->rqg->rq->volplex.volno];
    sd = &SD[rqe->sdno];				    /* point to subdisk */
    bp = &rqe->b;
    ubp = rqe->rqg->rq->bp;				    /* pointer to user buffer header */

    /* Initialize the buf struct */
    /* copy these flags from user bp */
    bp->b_flags = ubp->b_flags & (B_NOCACHE | B_ASYNC);
    bp->b_io.bio_flags = ubp->b_io.bio_flags & BIO_ORDERED;
    bp->b_iocmd = ubp->b_iocmd;
    BUF_LOCKINIT(bp);					    /* get a lock for the buffer */
    BUF_LOCK(bp, LK_EXCLUSIVE);				    /* and lock it */
    bp->b_iodone = complete_rqe;
    /*
     * You'd think that we wouldn't need to even
     * build the request buffer for a dead subdisk,
     * but in some cases we need information like
     * the user buffer address.  Err on the side of
     * generosity and supply what we can.  That
     * obviously doesn't include drive information
     * when the drive is dead.
     */
    if ((rqe->flags & XFR_BAD_SUBDISK) == 0)		    /* subdisk is accessible, */
	bp->b_dev = DRIVE[rqe->driveno].dev;		    /* drive device */
    bp->b_blkno = rqe->sdoffset + sd->driveoffset;	    /* start address */
    bp->b_bcount = rqe->buflen << DEV_BSHIFT;		    /* number of bytes to transfer */
    bp->b_resid = bp->b_bcount;				    /* and it's still all waiting */
    bp->b_bufsize = bp->b_bcount;			    /* and buffer size */
    bp->b_rcred = FSCRED;				    /* we have the file system credentials */
    bp->b_wcred = FSCRED;				    /* we have the file system credentials */

    if (rqe->flags & XFR_MALLOCED) {			    /* this operation requires a malloced buffer */
	bp->b_data = Malloc(bp->b_bcount);		    /* get a buffer to put it in */
	if (bp->b_data == NULL) {			    /* failed */
	    abortrequest(rqe->rqg->rq, ENOMEM);
	    return REQUEST_ENOMEM;			    /* no memory */
	}
    } else
	/*
	 * Point directly to user buffer data.  This means
	 * that we don't need to do anything when we have
	 * finished the transfer
	 */
	bp->b_data = ubp->b_data + rqe->useroffset * DEV_BSIZE;
    /*
     * On a recovery read, we perform an XOR of
     * all blocks to the user buffer.  To make
     * this work, we first clean out the buffer
     */
    if ((rqe->flags & (XFR_RECOVERY_READ | XFR_BAD_SUBDISK))
	== (XFR_RECOVERY_READ | XFR_BAD_SUBDISK)) {	    /* bad subdisk of a recovery read */
	int length = rqe->grouplen << DEV_BSHIFT;	    /* and count involved */
	char *data = (char *) &rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT]; /* destination */

	bzero(data, length);				    /* clean it out */
    }
    return 0;
}

/*
 * Abort a request: free resources and complete the
 * user request with the specified error
 */
int
abortrequest(struct request *rq, int error)
{
    struct buf *bp = rq->bp;				    /* user buffer */

    bp->b_error = error;
    freerq(rq);						    /* free everything we're doing */
    bp->b_io.bio_flags |= BIO_ERROR;
    bufdone(bp);
    return error;					    /* and give up */
}

/*
 * Check that our transfer will cover the
 * complete address space of the user request.
 *
 * Return 1 if it can, otherwise 0
 */
int
check_range_covered(struct request *rq)
{
    return 1;
}

/* Perform I/O on a subdisk */
void
sdio(struct buf *bp)
{
    int s;						    /* spl */
    struct sd *sd;
    struct sdbuf *sbp;
    daddr_t endoffset;
    struct drive *drive;

#if VINUMDEBUG
    if (debug & DEBUG_LASTREQS)
	logrq(loginfo_sdio, (union rqinfou) bp, bp);
#endif
    sd = &SD[Sdno(bp->b_dev)];				    /* point to the subdisk */
    drive = &DRIVE[sd->driveno];

    if (drive->state != drive_up) {
	if (sd->state >= sd_crashed) {
	    if (bp->b_iocmd == BIO_WRITE)		    /* writing, */
		set_sd_state(sd->sdno, sd_stale, setstate_force);
	    else
		set_sd_state(sd->sdno, sd_crashed, setstate_force);
	}
	bp->b_error = EIO;
	bp->b_io.bio_flags |= BIO_ERROR;
	bufdone(bp);
	return;
    }
    /*
     * We allow access to any kind of subdisk as long as we can expect
     * to get the I/O performed.
     */
    if (sd->state < sd_empty) {				    /* nothing to talk to, */
	bp->b_error = EIO;
	bp->b_io.bio_flags |= BIO_ERROR;
	bufdone(bp);
	return;
    }
    /* Get a buffer */
    sbp = (struct sdbuf *) Malloc(sizeof(struct sdbuf));
    if (sbp == NULL) {
	bp->b_error = ENOMEM;
	bp->b_io.bio_flags |= BIO_ERROR;
	bufdone(bp);
	return;
    }
    bzero(sbp, sizeof(struct sdbuf));			    /* start with nothing */
    sbp->b.b_flags = bp->b_flags;
    sbp->b.b_iocmd = bp->b_iocmd;
    sbp->b.b_bufsize = bp->b_bufsize;			    /* buffer size */
    sbp->b.b_bcount = bp->b_bcount;			    /* number of bytes to transfer */
    sbp->b.b_resid = bp->b_resid;			    /* and amount waiting */
    sbp->b.b_dev = DRIVE[sd->driveno].dev;		    /* device */
    sbp->b.b_data = bp->b_data;				    /* data buffer */
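    /* translate the subdisk-relative block number to a drive-relative address */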
    sbp->b.b_blkno = bp->b_blkno + sd->driveoffset;
    sbp->b.b_iodone = sdio_done;			    /* come here on completion */
    BUF_LOCKINIT(&sbp->b);				    /* get a lock for the buffer */
    BUF_LOCK(&sbp->b, LK_EXCLUSIVE);			    /* and lock it */
    sbp->bp = bp;					    /* note the address of the original header */
    sbp->sdno = sd->sdno;				    /* note for statistics */
    sbp->driveno = sd->driveno;
    endoffset = bp->b_blkno + sbp->b.b_bcount / DEV_BSIZE;  /* final sector offset */
    if (endoffset > sd->sectors) {			    /* beyond the end */
	sbp->b.b_bcount -= (endoffset - sd->sectors) * DEV_BSIZE; /* trim */
	if (sbp->b.b_bcount <= 0) {			    /* nothing to transfer */
	    bp->b_resid = bp->b_bcount;			    /* nothing transferred */
	    bufdone(bp);
	    Free(sbp);
	    return;
	}
    }
#if VINUMDEBUG
    if (debug & DEBUG_ADDRESSES)
	log(LOG_DEBUG,
	    "  %s dev %d.%d, sd %d, offset 0x%x, devoffset 0x%x, length %ld\n",
	    sbp->b.b_iocmd == BIO_READ ? "Read" : "Write",
	    major(sbp->b.b_dev),
	    minor(sbp->b.b_dev),
	    sbp->sdno,
	    (u_int) (sbp->b.b_blkno - SD[sbp->sdno].driveoffset),
	    (int) sbp->b.b_blkno,
	    sbp->b.b_bcount);
#endif
    s = splbio();
#if VINUMDEBUG
    if (debug & DEBUG_LASTREQS)
	logrq(loginfo_sdiol, (union rqinfou) &sbp->b, &sbp->b);
#endif
    DEV_STRATEGY(&sbp->b, 0);
    splx(s);
}

/*
 * Simplified version of bounds_check_with_label
 * Determine the size of the transfer, and make sure it is
 * within the boundaries of the partition. Adjust transfer
 * if needed, and signal errors or early completion.
 *
 * Volumes are simpler than disk slices: they only contain
 * one component (though we call them a, b and c to make
 * system utilities happy), and they always take up the
 * complete space of the "partition".
 *
 * I'm still not happy with this: why should the label be
 * protected?  If it weren't so damned difficult to write
 * one in the first place (because it's protected), it wouldn't
 * be a problem.
 */
int
vinum_bounds_check(struct buf *bp, struct volume *vol)
{
    int maxsize = vol->size;				    /* size of the partition (sectors) */
    int size = (bp->b_bcount + DEV_BSIZE - 1) >> DEV_BSHIFT; /* size of this request (sectors) */

    /* Would this transfer overwrite the disk label? */
    if (bp->b_blkno <= LABELSECTOR			    /* starts before or at the label */
#if LABELSECTOR != 0
	&& bp->b_blkno + size > LABELSECTOR		    /* and finishes after */
#endif
	&& (!(vol->flags & VF_RAW))			    /* and it's not raw */
	&& (bp->b_iocmd == BIO_WRITE)			    /* and it's a write */
	&& !(vol->flags & (VF_WLABEL | VF_LABELLING))) {    /* and we're not allowed to write the label */
	bp->b_error = EROFS;				    /* read-only */
	bp->b_io.bio_flags |= BIO_ERROR;
	return -1;
    }
    if (size == 0)					    /* no transfer specified, */
	return 0;					    /* treat as EOF */
    /* beyond partition? */
    if (bp->b_blkno < 0					    /* negative start */
	|| bp->b_blkno + size > maxsize) {		    /* or goes beyond the end of the partition */
	/* if exactly at end of disk, return an EOF */
	if (bp->b_blkno == maxsize) {
	    bp->b_resid = bp->b_bcount;
	    return 0;
	}
	/* or truncate if part of it fits */
	size = maxsize - bp->b_blkno;
	if (size <= 0) {				    /* nothing to transfer */
	    bp->b_error = EINVAL;
	    bp->b_io.bio_flags |= BIO_ERROR;
	    return -1;
	}
	bp->b_bcount = size << DEV_BSHIFT;
    }
    bp->b_pblkno = bp->b_blkno;
    return 1;
}

/*
 * Allocate a request group and hook
 * it into the list for rq
 */
struct rqgroup *
allocrqg(struct request *rq, int elements)
{
    struct rqgroup *rqg;				    /* the one we're going to allocate */
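    /* group header plus the trailing array of request elements */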
    int size = sizeof(struct rqgroup) + elements * sizeof(struct rqelement);

    rqg = (struct rqgroup *) Malloc(size);
    if (rqg != NULL) {					    /* malloc OK, */
	if (rq->rqg)					    /* we already have requests */
	    rq->lrqg->next = rqg;			    /* hang it off the end */
	else						    /* first request */
	    rq->rqg = rqg;				    /* at the start */
	rq->lrqg = rqg;					    /* this one is the last in the list */

	bzero(rqg, size);				    /* no old junk */
	rqg->rq = rq;					    /* point back to the parent request */
	rqg->count = elements;				    /* number of requests in the group */
	rqg->lockbase = -1;				    /* no lock required yet */
    }
    return rqg;
}

/*
 * Deallocate a request group out of a chain.  We do
 * this by linear search: the chain is short, this
 * almost never happens, and currently it can only
 * happen to the first member of the chain.
 */
void
deallocrqg(struct rqgroup *rqg)
{
    struct rqgroup *rqgc = rqg->rq->rqg;		    /* point to the request chain */

    if (rqg->lock)					    /* got a lock? */
	unlockrange(rqg->plexno, rqg->lock);		    /* yes, free it */
    if (rqgc == rqg)					    /* we're first in line */
	rqg->rq->rqg = rqg->next;			    /* unhook ourselves */
    else {
	while ((rqgc->next != NULL)			    /* find the group */
	&&(rqgc->next != rqg))
	    rqgc = rqgc->next;
	if (rqgc->next == NULL)
	    log(LOG_ERR,
		"vinum deallocrqg: rqg %p not found in request %p\n",
		rqg,
		rqg->rq);
	else
	    rqgc->next = rqg->next;			    /* make the chain jump over us */
    }
    Free(rqg);
}

/* Local Variables: */
/* fill-column: 50 */
/* End: */