summaryrefslogtreecommitdiffstats
path: root/sys/geom/raid/g_raid.h
blob: 1c14ad6738bf16a7791c67c62d3792b18a9da62d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
/*-
 * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#ifndef	_G_RAID_H_
#define	_G_RAID_H_

#include <sys/param.h>
#include <sys/kobj.h>
#include <sys/bio.h>
#include <sys/time.h>

/*
 * Identification constants of the RAID GEOM class.  Their consumers are not
 * visible in this header, so the notes below describe the values only.
 */
/* Class name string; presumably the name given to g_raid_class -- confirm. */
#define	G_RAID_CLASS_NAME	"RAID"

/* Magic string for this class (where it is stored/checked is not shown). */
#define	G_RAID_MAGIC		"GEOM::RAID"

/* Version number, currently 0 (what exactly it versions is not shown). */
#define	G_RAID_VERSION		0

/*
 * Forward declarations.  The metadata and transformation objects are
 * referenced through pointers (g_raid_softc.sc_md, g_raid_volume.v_tr)
 * before their full definitions further down in this header.
 */
struct g_raid_md_object;
struct g_raid_tr_object;

/*
 * Device flag bits.  They are 64-bit (ULL) constants, the same width as the
 * uint64_t flag words declared in this header (sc_flags, d_flags); which
 * word actually carries them is not visible here -- TODO confirm.
 * Judging by the names they suppress automatic synchronization in general
 * (NOAUTOSYNC) and after a failure (NOFAILSYNC) -- confirm against the
 * implementation.
 * G_RAID_DEVICE_FLAG_MASK is the OR of every flag defined above; extend it
 * whenever a new flag is added.
 */
#define	G_RAID_DEVICE_FLAG_NOAUTOSYNC	0x0000000000000001ULL
#define	G_RAID_DEVICE_FLAG_NOFAILSYNC	0x0000000000000002ULL
#define	G_RAID_DEVICE_FLAG_MASK	(G_RAID_DEVICE_FLAG_NOAUTOSYNC | \
					 G_RAID_DEVICE_FLAG_NOFAILSYNC)

#ifdef _KERNEL
/*
 * Globals defined by the implementation, not by this header.  Only
 * g_raid_debug is consumed here: it is the verbosity level that the
 * G_RAID_DEBUG(), G_RAID_DEBUG1() and G_RAID_LOGREQ() macros compare their
 * `lvl' argument against.  The purpose of the other variables is not
 * visible in this header; judging by their names they are tunables for
 * spare handling, a read error threshold and a start timeout, and
 * g_raid_class is the GEOM class object -- TODO confirm.
 */
extern u_int g_raid_aggressive_spare;
extern u_int g_raid_debug;
extern int g_raid_read_err_thresh;
extern u_int g_raid_start_timeout;
extern struct g_class g_raid_class;

/*
 * Print a debug message when the global verbosity g_raid_debug is at least
 * `lvl'.  `fmt' must be a string literal (it is concatenated with the prefix
 * and a trailing newline); further arguments are passed on to printf().
 * With verbosity 0 the plain "GEOM_RAID: " prefix is used, otherwise the
 * message level is included in the prefix.
 */
#define	G_RAID_DEBUG(lvl, fmt, ...)	do {				\
	if (g_raid_debug < (lvl))					\
		break;		/* Message is too verbose. */		\
	if (g_raid_debug == 0)						\
		printf("GEOM_RAID: " fmt "\n",				\
		    ## __VA_ARGS__);					\
	else								\
		printf("GEOM_RAID[%u]: " fmt "\n",			\
		    lvl, ## __VA_ARGS__);				\
} while (0)
/*
 * Same as G_RAID_DEBUG(), but the message is additionally prefixed with
 * (sc)->sc_name, i.e. it is attributed to one particular node.  Nothing is
 * printed (and `sc' is not dereferenced) unless g_raid_debug >= lvl.
 */
#define	G_RAID_DEBUG1(lvl, sc, fmt, ...)	do {			\
	if (g_raid_debug < (lvl))					\
		break;		/* Message is too verbose. */		\
	if (g_raid_debug == 0)						\
		printf("GEOM_RAID: %s: " fmt "\n",			\
		    (sc)->sc_name, ## __VA_ARGS__);			\
	else								\
		printf("GEOM_RAID[%u]: %s: " fmt "\n",			\
		    lvl, (sc)->sc_name, ## __VA_ARGS__);		\
} while (0)
/*
 * Log a message about one I/O request when g_raid_debug >= lvl.  The
 * formatted message (prefixed like G_RAID_DEBUG() and followed by a space)
 * is printed first, then the request itself via g_print_bio(bp), then a
 * newline.
 */
#define	G_RAID_LOGREQ(lvl, bp, fmt, ...)	do {			\
	if (g_raid_debug < (lvl))					\
		break;		/* Message is too verbose. */		\
	if (g_raid_debug == 0) {					\
		printf("GEOM_RAID: " fmt " ", ## __VA_ARGS__);		\
	} else {							\
		printf("GEOM_RAID[%u]: " fmt " ",			\
		    lvl, ## __VA_ARGS__);				\
	}								\
	g_print_bio(bp);						\
	printf("\n");							\
} while (0)

/*
 * Flags we use to distinguish I/O initiated by the TR layer to maintain
 * the volume's characteristics, fix subdisks, extra copies of data, etc.
 *
 * G_RAID_BIO_FLAG_SYNC		I/O to update an extra copy of the data
 *				for RAID volumes that maintain extra data
 *				and need to rebuild that data.
 * G_RAID_BIO_FLAG_REMAP	I/O done to try to provoke a subdisk into
 *				doing some desirable action such as bad
 *				block remapping after we detect a bad part
 *				of the disk.
 * G_RAID_BIO_FLAG_LOCKED	I/O holds range lock that should be released.
 *
 * and the following meta item:
 * G_RAID_BIO_FLAG_SPECIAL	Any of the I/O flags that need to make it
 *				through the range locking which would
 *				otherwise defer the I/O until after that
 *				range is unlocked.
 */
/* I/O that updates an extra copy of the data. */
#define	G_RAID_BIO_FLAG_SYNC		0x01
/* I/O issued to provoke a subdisk into e.g. bad block remapping. */
#define	G_RAID_BIO_FLAG_REMAP		0x02
/* Mask (SYNC | REMAP) of the flags that must pass through range locking. */
#define	G_RAID_BIO_FLAG_SPECIAL \
		(G_RAID_BIO_FLAG_SYNC|G_RAID_BIO_FLAG_REMAP)
/* I/O holds a range lock; note this bit is not part of the SPECIAL mask. */
#define	G_RAID_BIO_FLAG_LOCKED		0x80

/*
 * Descriptor of one locked region of a volume.  Descriptors are linked on
 * the volume's v_locks list ("List of locked regions") through l_next.
 * The remaining field notes are inferred from the field names and from the
 * g_raid_lock_range(vol, off, len, ignore, argp) prototype -- confirm
 * against the implementation.
 */
struct g_raid_lock {
	off_t			 l_offset;	/* Start of the region. */
	off_t			 l_length;	/* Length of the region. */
	void			*l_callback_arg; /* Presumably the caller's argp. */
	int			 l_pending;	/* Presumably count of I/O still
						   pending in the region. */
	LIST_ENTRY(g_raid_lock)	 l_next;	/* Linkage on v_locks. */
};

/*
 * Event flag bits.  Presumably these are the values carried in
 * g_raid_event.e_flags and passed as `flags' to g_raid_event_send():
 * VOLUME/SUBDISK/DISK would then name the kind of object e_tgt points to,
 * WAIT would ask the sender to wait for completion and DONE would mark a
 * processed event -- TODO confirm against the implementation.
 */
#define	G_RAID_EVENT_WAIT	0x01
#define	G_RAID_EVENT_VOLUME	0x02
#define	G_RAID_EVENT_SUBDISK	0x04
#define	G_RAID_EVENT_DISK	0x08
#define	G_RAID_EVENT_DONE	0x10
/*
 * One queued event.  Events are linked through e_next on the node's
 * sc_events queue ("Worker events queue" in struct g_raid_softc).
 */
struct g_raid_event {
	void			*e_tgt;		/* Target object of the event. */
	int			 e_event;	/* Event code (G_RAID_*_E_*,
						   presumably). */
	int			 e_flags;	/* G_RAID_EVENT_* (presumably). */
	int			 e_error;	/* Presumably the result of
						   handling the event. */
	TAILQ_ENTRY(g_raid_event) e_next;	/* Linkage on sc_events. */
};
/*
 * Disk states.  Presumably these are the values kept in g_raid_disk.d_state
 * and accepted by g_raid_change_disk_state() / g_raid_disk_state2str() --
 * confirm against the implementation.
 */
#define G_RAID_DISK_S_NONE		0x00	/* State is unknown. */
#define G_RAID_DISK_S_OFFLINE		0x01	/* Missing disk placeholder. */
#define G_RAID_DISK_S_FAILED		0x02	/* Failed. */
#define G_RAID_DISK_S_STALE_FAILED	0x03	/* Old failed. */
#define G_RAID_DISK_S_SPARE		0x04	/* Hot-spare. */
#define G_RAID_DISK_S_STALE		0x05	/* Old disk, unused now. */
#define G_RAID_DISK_S_ACTIVE		0x06	/* Operational. */

/* Disk event codes (the only one defined: the disk went away). */
#define G_RAID_DISK_E_DISCONNECTED	0x01

/*
 * Per-disk state of a RAID node.
 * A disk is linked into its node's sc_disks list through d_next, and the
 * subdisks that live on it are linked on d_subdisks (through
 * g_raid_subdisk.sd_next).  d_last_offset and d_load are what the
 * G_RAID_SUBDISK_POS() and G_RAID_SUBDISK_LOAD() macros read.
 */
struct g_raid_disk {
	struct g_raid_softc	*d_softc;	/* Back-pointer to softc. */
	struct g_consumer	*d_consumer;	/* GEOM disk consumer. */
	void			*d_md_data;	/* Disk's metadata storage. */
	struct g_kerneldump	 d_kd;		/* Kernel dumping method/args. */
	uint64_t		 d_flags;	/* Additional flags. */
	u_int			 d_state;	/* Disk state. */
	u_int			 d_load;	/* Disk average load. */
	off_t			 d_last_offset;	/* Last head offset. */
	int			 d_read_errs;	/* Count of the read errors */
	TAILQ_HEAD(, g_raid_subdisk)	 d_subdisks; /* List of subdisks. */
	TAILQ_ENTRY(g_raid_disk)	 d_next;	/* Next disk in the node. */
};

/*
 * Subdisk states.  Presumably these are the values kept in
 * g_raid_subdisk.sd_state and accepted by g_raid_change_subdisk_state() /
 * g_raid_subdisk_state2str() -- confirm against the implementation.
 */
#define G_RAID_SUBDISK_S_NONE		0x00	/* Absent. */
#define G_RAID_SUBDISK_S_FAILED		0x01	/* Failed. */
#define G_RAID_SUBDISK_S_NEW		0x02	/* Blank. */
#define G_RAID_SUBDISK_S_REBUILD	0x03	/* Blank + rebuild. */
#define G_RAID_SUBDISK_S_UNINITIALIZED	0x04	/* Disk of the new volume. */
#define G_RAID_SUBDISK_S_STALE		0x05	/* Dirty. */
#define G_RAID_SUBDISK_S_RESYNC		0x06	/* Dirty + check/repair. */
#define G_RAID_SUBDISK_S_ACTIVE		0x07	/* Usable. */

/*
 * Subdisk event codes.  Codes from G_RAID_SUBDISK_E_FIRST_TR_PRIVATE (0x80)
 * upwards are left to the transformation modules for their private events.
 */
#define G_RAID_SUBDISK_E_NEW		0x01	/* A new subdisk has arrived */
#define G_RAID_SUBDISK_E_FAILED		0x02	/* A subdisk failed, but remains in volume */
#define G_RAID_SUBDISK_E_DISCONNECTED	0x03	/* A subdisk removed from volume. */
#define G_RAID_SUBDISK_E_FIRST_TR_PRIVATE 0x80	/* translation private events */

/*
 * Last head offset of the underlying disk expressed relative to the start
 * of the subdisk (d_last_offset - sd_offset), or 0 when the subdisk has no
 * disk attached.  Evaluates `sd' more than once.
 */
#define G_RAID_SUBDISK_POS(sd)						\
    (!(sd)->sd_disk ? 0 :						\
     ((sd)->sd_disk->d_last_offset - (sd)->sd_offset))
#define G_RAID_SUBDISK_TRACK_SIZE	(1 * 1024 * 1024)
/*
 * Average load (d_load) of the underlying disk, or 0 when the subdisk has
 * no disk attached.  Evaluates `sd' more than once.
 */
#define G_RAID_SUBDISK_LOAD(sd)						\
    (!(sd)->sd_disk ? 0 : ((sd)->sd_disk->d_load))
#define G_RAID_SUBDISK_LOAD_SCALE	256

/*
 * A subdisk: the part of one disk that belongs to one volume.
 * Subdisks are stored by value in their volume (g_raid_volume.v_subdisks[])
 * and are linked through sd_next on the d_subdisks list of the disk they
 * live on.  sd_disk may be NULL -- G_RAID_SUBDISK_POS() and
 * G_RAID_SUBDISK_LOAD() explicitly check for that.
 */
struct g_raid_subdisk {
	struct g_raid_softc	*sd_softc;	/* Back-pointer to softc. */
	struct g_raid_disk	*sd_disk;	/* Where this subdisk lives. */
	struct g_raid_volume	*sd_volume;	/* Volume, sd is a part of. */
	off_t			 sd_offset;	/* Offset on the disk. */
	off_t			 sd_size;	/* Size on the disk. */
	u_int			 sd_pos;	/* Position in volume. */
	u_int			 sd_state;	/* Subdisk state. */
	off_t			 sd_rebuild_pos; /* Rebuild position. */
	int			 sd_recovery;	/* Count of recovery reqs. */
	TAILQ_ENTRY(g_raid_subdisk)	 sd_next; /* Next subdisk on disk. */
};

/* Capacity of g_raid_volume.v_subdisks[]: most subdisks one volume can have. */
#define G_RAID_MAX_SUBDISKS	16
/* Size in bytes of the g_raid_volume.v_name[] buffer. */
#define G_RAID_MAX_VOLUMENAME	32

/*
 * Volume states.  Presumably the values kept in g_raid_volume.v_state and
 * accepted by g_raid_change_volume_state() / g_raid_volume_state2str() --
 * confirm against the implementation.
 */
#define G_RAID_VOLUME_S_STARTING	0x00
#define G_RAID_VOLUME_S_BROKEN		0x01
#define G_RAID_VOLUME_S_DEGRADED	0x02
#define G_RAID_VOLUME_S_SUBOPTIMAL	0x03
#define G_RAID_VOLUME_S_OPTIMAL		0x04
#define G_RAID_VOLUME_S_UNSUPPORTED	0x05
#define G_RAID_VOLUME_S_STOPPED		0x06

/*
 * Return non-zero iff `state' is one of the "alive" volume states
 * (DEGRADED, SUBOPTIMAL or OPTIMAL).
 *
 * This is a function rather than a bare macro expression so that the
 * argument of G_RAID_VOLUME_S_ALIVE() is evaluated exactly once; the
 * previous macro compared its argument up to three times, which gave wrong
 * answers for arguments with side effects.
 */
static __inline int
g_raid_volume_s_alive(int state)
{

	return (state == G_RAID_VOLUME_S_DEGRADED ||
	    state == G_RAID_VOLUME_S_SUBOPTIMAL ||
	    state == G_RAID_VOLUME_S_OPTIMAL);
}
#define G_RAID_VOLUME_S_ALIVE(s)	g_raid_volume_s_alive(s)

/*
 * Volume event codes.  Their exact meaning is defined by the code that
 * handles them, which is not part of this header; judging by the names
 * DOWN/UP report a change in availability and START/STARTMD request the
 * start of a volume (STARTMD presumably on behalf of the metadata module)
 * -- TODO confirm.
 */
#define G_RAID_VOLUME_E_DOWN		0x00
#define G_RAID_VOLUME_E_UP		0x01
#define G_RAID_VOLUME_E_START		0x10
#define G_RAID_VOLUME_E_STARTMD		0x11

/*
 * RAID level codes, presumably the values of g_raid_volume.v_raid_level
 * and of the `level' argument of g_raid_volume_level2str() -- confirm.
 * Note that the codes are not ordered by value in this list.
 */
#define G_RAID_VOLUME_RL_RAID0		0x00
#define G_RAID_VOLUME_RL_RAID1		0x01
#define G_RAID_VOLUME_RL_RAID3		0x03
#define G_RAID_VOLUME_RL_RAID4		0x04
#define G_RAID_VOLUME_RL_RAID5		0x05
#define G_RAID_VOLUME_RL_RAID6		0x06
#define G_RAID_VOLUME_RL_RAID1E		0x11
#define G_RAID_VOLUME_RL_SINGLE		0x0f
#define G_RAID_VOLUME_RL_CONCAT		0x1f
#define G_RAID_VOLUME_RL_RAID5E		0x15
#define G_RAID_VOLUME_RL_RAID5EE	0x25
#define G_RAID_VOLUME_RL_UNKNOWN	0xff

/*
 * RAID level qualifier codes, presumably the values of
 * g_raid_volume.v_raid_level_qualifier -- confirm.
 */
#define G_RAID_VOLUME_RLQ_NONE		0x00
#define G_RAID_VOLUME_RLQ_UNKNOWN	0xff

/*
 * NOTE(review): this forward declaration is redundant -- the tag is already
 * introduced by struct g_raid_subdisk above (sd_volume) and the full
 * definition follows immediately.  It is harmless.
 */
struct g_raid_volume;

/*
 * One volume (array) of a RAID node.
 * The volume embeds its subdisks by value in v_subdisks[] (at most
 * G_RAID_MAX_SUBDISKS of them), is linked into its node's sc_volumes list
 * through v_next, and additionally sits on a global list through
 * v_global_next (the head of that list is not declared in this header).
 * v_locks holds the struct g_raid_lock descriptors of locked regions.
 */
struct g_raid_volume {
	struct g_raid_softc	*v_softc;	/* Back-pointer to softc. */
	struct g_provider	*v_provider;	/* GEOM provider. */
	struct g_raid_subdisk	 v_subdisks[G_RAID_MAX_SUBDISKS];
						/* Subdisks of this volume. */
	void			*v_md_data;	/* Volume's metadata storage. */
	struct g_raid_tr_object	*v_tr;		/* Transformation object. */
	char			 v_name[G_RAID_MAX_VOLUMENAME];
						/* Volume name. */
	u_int			 v_state;	/* Volume state. */
	u_int			 v_raid_level;	/* Array RAID level. */
	u_int			 v_raid_level_qualifier; /* RAID level det. */
	u_int			 v_disks_count;	/* Number of disks in array. */
	u_int			 v_strip_size;	/* Array strip size. */
	u_int			 v_sectorsize;	/* Volume sector size. */
	off_t			 v_mediasize;	/* Volume media size.  */
	struct bio_queue_head	 v_inflight;	/* In-flight write requests. */
	struct bio_queue_head	 v_locked;	/* Blocked I/O requests. */
	LIST_HEAD(, g_raid_lock) v_locks;	 /* List of locked regions. */
	int			 v_pending_lock; /* writes to locked region */
	int			 v_dirty;	/* Volume is DIRTY. */
	struct timeval		 v_last_done;	/* Time of the last I/O. */
	time_t			 v_last_write;	/* Time of the last write. */
	u_int			 v_writes;	/* Number of active writes. */
	struct root_hold_token	*v_rootmount;	/* Root mount delay token. */
	int			 v_starting;	/* Volume is starting */
	int			 v_stopping;	/* Volume is stopping */
	int			 v_provider_open; /* Number of opens. */
	int			 v_global_id;	/* Global volume ID (rX). */
	TAILQ_ENTRY(g_raid_volume)	 v_next; /* List of volumes entry. */
	LIST_ENTRY(g_raid_volume)	 v_global_next; /* Global list entry. */
};

/*
 * Node event codes.  Their handling is not part of this header; judging by
 * the names WAKE wakes the node's worker and START starts the node --
 * TODO confirm.
 */
#define G_RAID_NODE_E_WAKE	0x00
#define G_RAID_NODE_E_START	0x01

/*
 * Per-node state ("softc") of the RAID class.
 * A node owns a list of volumes (sc_volumes, linked through
 * g_raid_volume.v_next) and a list of disks (sc_disks, linked through
 * g_raid_disk.d_next), plus the event and I/O queues of its worker process.
 * As their comments say, sc_lock is the main node lock and sc_queue_mtx
 * the lock for the worker queues.
 */
struct g_raid_softc {
	struct g_raid_md_object	*sc_md;		/* Metadata object. */
	struct g_geom		*sc_geom;	/* GEOM class instance. */
	uint64_t		 sc_flags;	/* Additional flags. */
	TAILQ_HEAD(, g_raid_volume)	 sc_volumes;	/* List of volumes. */
	TAILQ_HEAD(, g_raid_disk)	 sc_disks;	/* List of disks. */
	struct sx		 sc_lock;	/* Main node lock. */
	struct proc		*sc_worker;	/* Worker process. */
	struct mtx		 sc_queue_mtx;	/* Worker queues lock. */
	TAILQ_HEAD(, g_raid_event) sc_events;	/* Worker events queue. */
	struct bio_queue_head	 sc_queue;	/* Worker I/O queue. */
	int			 sc_stopping;	/* Node is stopping */
};
/*
 * Shorthand: (sc)->sc_name expands to (sc)->sc_geom->name, i.e. the node is
 * named after its geom.  Used by G_RAID_DEBUG1().  Being a plain object-like
 * macro, it rewrites every identifier `sc_name' in files that include this
 * header.
 */
#define	sc_name	sc_geom->name

/*
 * KOBJ parent class of metadata processing modules.
 * KOBJ_CLASS_FIELDS must stay first so that the structure can be used as a
 * kobj class (cf. kobj(9)).  mdc_list links the class on a list of
 * metadata classes whose head is not declared in this header; mdc_priority
 * is presumably used to order or choose between classes -- TODO confirm.
 */
struct g_raid_md_class {
	KOBJ_CLASS_FIELDS;
	int		 mdc_priority;
	LIST_ENTRY(g_raid_md_class) mdc_list;
};

/*
 * KOBJ instance of metadata processing module.
 * One such object is referenced from each node (g_raid_softc.sc_md);
 * mdo_softc points back at that node and mdo_class at the class the object
 * was instantiated from (presumably -- the instantiation code is not in
 * this header).  KOBJ_FIELDS must stay first (cf. kobj(9)).
 */
struct g_raid_md_object {
	KOBJ_FIELDS;
	struct g_raid_md_class	*mdo_class;
	struct g_raid_softc	*mdo_softc;	/* Back-pointer to softc. */
};

/*
 * Module event handler used by every metadata module: it is the event
 * function placed in the moduledata_t built by G_RAID_MD_DECLARE().
 * Defined elsewhere.
 */
int g_raid_md_modevent(module_t, int, void *);

/*
 * Registration boilerplate for a metadata module called `name'.
 * Expands to
 *  - a static moduledata_t `name_mod' holding the stringified name,
 *    g_raid_md_modevent and a pointer to `name_class' (which the caller
 *    must have defined),
 *  - DECLARE_MODULE() for it at SI_SUB_DRIVERS / SI_ORDER_SECOND, and
 *  - MODULE_DEPEND() on geom_raid (versions 0, 0, 0).
 * The expansion does not end in a semicolon; the caller supplies it.
 */
#define	G_RAID_MD_DECLARE(name)					\
    static moduledata_t name##_mod = {				\
	#name,							\
	g_raid_md_modevent,					\
	&name##_class						\
    };								\
    DECLARE_MODULE(name, name##_mod, SI_SUB_DRIVERS, SI_ORDER_SECOND);	\
    MODULE_DEPEND(name, geom_raid, 0, 0, 0)

/*
 * KOBJ parent class of data transformation modules.
 * KOBJ_CLASS_FIELDS must stay first so that the structure can be used as a
 * kobj class (cf. kobj(9)).  trc_list links the class on a list of
 * transformation classes whose head is not declared in this header;
 * trc_priority is presumably used to order or choose between classes --
 * TODO confirm.
 */
struct g_raid_tr_class {
	KOBJ_CLASS_FIELDS;
	int		 trc_priority;
	LIST_ENTRY(g_raid_tr_class) trc_list;
};

/*
 * KOBJ instance of data transformation module.
 * One such object is referenced from each volume (g_raid_volume.v_tr);
 * tro_volume points back at that volume and tro_class at the class the
 * object was instantiated from (presumably -- the instantiation code is
 * not in this header).  KOBJ_FIELDS must stay first (cf. kobj(9)).
 */
struct g_raid_tr_object {
	KOBJ_FIELDS;
	struct g_raid_tr_class	*tro_class;
	struct g_raid_volume 	*tro_volume;	/* Back-pointer to volume. */
};

/*
 * Module event handler used by every transformation module: it is the
 * event function placed in the moduledata_t built by G_RAID_TR_DECLARE().
 * Defined elsewhere.
 */
int g_raid_tr_modevent(module_t, int, void *);

/*
 * Registration boilerplate for a transformation module called `name'.
 * Expands to
 *  - a static moduledata_t `name_mod' holding the stringified name,
 *    g_raid_tr_modevent and a pointer to `name_class' (which the caller
 *    must have defined),
 *  - DECLARE_MODULE() for it at SI_SUB_DRIVERS / SI_ORDER_FIRST (metadata
 *    modules declared with G_RAID_MD_DECLARE() use SI_ORDER_SECOND), and
 *  - MODULE_DEPEND() on geom_raid (versions 0, 0, 0).
 * The expansion does not end in a semicolon; the caller supplies it.
 */
#define	G_RAID_TR_DECLARE(name)					\
    static moduledata_t name##_mod = {				\
	#name,							\
	g_raid_tr_modevent,					\
	&name##_class						\
    };								\
    DECLARE_MODULE(name, name##_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST);	\
    MODULE_DEPEND(name, geom_raid, 0, 0, 0)

/*
 * Conversions between numeric codes and strings; implemented elsewhere.
 * Judging by the names, the *2str() functions return a printable name for
 * a RAID level/qualifier pair or for a volume, subdisk or disk state, and
 * g_raid_volume_str2level() parses a level name into *level and *qual.
 * Return value conventions (e.g. for unknown input) are not visible in this
 * header -- TODO confirm against the implementation.
 */
const char * g_raid_volume_level2str(int level, int qual);
int g_raid_volume_str2level(const char *str, int *level, int *qual);
const char * g_raid_volume_state2str(int state);
const char * g_raid_subdisk_state2str(int state);
const char * g_raid_disk_state2str(int state);

/*
 * Object creation; implemented elsewhere.  Judging by the signatures:
 * g_raid_create_node() creates a node of class `mp' bound to the metadata
 * object `md', g_raid_create_node_format() creates one for a metadata
 * format given by name and hands back its geom through *gp,
 * g_raid_create_volume() and g_raid_create_disk() add a volume/disk to the
 * node `sc', and g_raid_get_diskname() returns a printable name of a disk.
 * Failure conventions and string lifetime are not visible here --
 * TODO confirm against the implementation.
 */
struct g_raid_softc * g_raid_create_node(struct g_class *mp,
    const char *name, struct g_raid_md_object *md);
int g_raid_create_node_format(const char *format, struct g_geom **gp);
struct g_raid_volume * g_raid_create_volume(struct g_raid_softc *sc,
    const char *name, int id);
struct g_raid_disk * g_raid_create_disk(struct g_raid_softc *sc);
const char * g_raid_get_diskname(struct g_raid_disk *disk);

/*
 * Start a volume.  Implemented elsewhere; the meaning of the int result is
 * not visible in this header (presumably 0 on success) -- TODO confirm.
 */
int g_raid_start_volume(struct g_raid_volume *vol);

/*
 * Destruction of nodes, volumes and disks.  Implemented elsewhere; the
 * meaning of the int results and of the `worker' argument (presumably
 * non-zero when called from the node's worker) is not visible in this
 * header -- TODO confirm.
 */
int g_raid_destroy_node(struct g_raid_softc *sc, int worker);
int g_raid_destroy_volume(struct g_raid_volume *vol);
int g_raid_destroy_disk(struct g_raid_disk *disk);

/*
 * I/O entry points; implemented elsewhere.  Judging by the names:
 * g_raid_iodone() completes request `bp' with `error',
 * g_raid_subdisk_iostart() issues `bp' to subdisk `sd', and
 * g_raid_subdisk_kerneldump() writes part of a kernel dump to `sd'
 * (cf. g_raid_disk.d_kd) -- TODO confirm against the implementation.
 */
void g_raid_iodone(struct bio *bp, int error);
void g_raid_subdisk_iostart(struct g_raid_subdisk *sd, struct bio *bp);
int g_raid_subdisk_kerneldump(struct g_raid_subdisk *sd,
    void *virtual, vm_offset_t physical, off_t offset, size_t length);

/*
 * GEOM consumer helpers; implemented elsewhere.  Judging by the names,
 * g_raid_open_consumer() opens the provider called `name' on behalf of
 * node `sc' and returns the resulting consumer (failure value presumably
 * NULL), and g_raid_kill_consumer() releases such a consumer --
 * TODO confirm against the implementation.
 */
struct g_consumer *g_raid_open_consumer(struct g_raid_softc *sc,
    const char *name);
void g_raid_kill_consumer(struct g_raid_softc *sc, struct g_consumer *cp);

/*
 * State handling; implemented elsewhere.  The g_raid_change_*_state()
 * functions take the object and its new state, presumably one of the
 * G_RAID_DISK_S_*, G_RAID_SUBDISK_S_* and G_RAID_VOLUME_S_* values
 * respectively; what g_raid_report_disk_state() reports, and to whom, is
 * not visible in this header -- TODO confirm.
 */
void g_raid_report_disk_state(struct g_raid_disk *disk);
void g_raid_change_disk_state(struct g_raid_disk *disk, int state);
void g_raid_change_subdisk_state(struct g_raid_subdisk *sd, int state);
void g_raid_change_volume_state(struct g_raid_volume *vol, int state);

/*
 * Metadata update and failure reporting; implemented elsewhere.  Both take
 * the node plus optional-looking object pointers (volume/subdisk/disk);
 * whether NULL is accepted for them, and how the work is passed to the
 * node's metadata object, is not visible in this header -- TODO confirm.
 */
void g_raid_write_metadata(struct g_raid_softc *sc, struct g_raid_volume *vol,
    struct g_raid_subdisk *sd, struct g_raid_disk *disk);
void g_raid_fail_disk(struct g_raid_softc *sc,
    struct g_raid_subdisk *sd, struct g_raid_disk *disk);

/*
 * Helpers for transformation modules; implemented elsewhere.  The "_common"
 * suffix suggests default implementations of flush and kernel dump handling
 * that a transformation module can reuse -- TODO confirm.
 */
void g_raid_tr_flush_common(struct g_raid_tr_object *tr, struct bio *bp);
int g_raid_tr_kerneldump_common(struct g_raid_tr_object *tr,
    void *virtual, vm_offset_t physical, off_t offset, size_t length);

/*
 * Counting and lookup helpers; implemented elsewhere.  Judging by the
 * signatures, g_raid_ndisks() / g_raid_nsubdisks() count the disks of a
 * node / subdisks of a volume that are in `state', g_raid_nopens() counts
 * opens on a node, and g_raid_get_subdisk() returns a subdisk of `vol' in
 * `state'.  Whether a wildcard state exists and what is returned when
 * nothing matches is not visible in this header -- TODO confirm.
 */
u_int g_raid_ndisks(struct g_raid_softc *sc, int state);
u_int g_raid_nsubdisks(struct g_raid_volume *vol, int state);
u_int g_raid_nopens(struct g_raid_softc *sc);
struct g_raid_subdisk * g_raid_get_subdisk(struct g_raid_volume *vol,
    int state);
/*
 * Destroy modes, presumably the values accepted as `how' by
 * g_raid_destroy().  How the modes differ is not visible in this header;
 * the names suggest increasing forcefulness -- TODO confirm.
 */
#define	G_RAID_DESTROY_SOFT		0
#define	G_RAID_DESTROY_DELAYED	1
#define	G_RAID_DESTROY_HARD		2
/* Destroy node `sc' in the requested mode; implemented elsewhere. */
int g_raid_destroy(struct g_raid_softc *sc, int how);
/*
 * Send `event' to the object `arg'; implemented elsewhere.  `flags' is
 * presumably a combination of G_RAID_EVENT_* telling, among other things,
 * what kind of object `arg' is -- TODO confirm.
 */
int g_raid_event_send(void *arg, int event, int flags);
/*
 * Lock / unlock the region [off, off + len) of volume `vol'; implemented
 * elsewhere.  `argp' is presumably kept in g_raid_lock.l_callback_arg and
 * `ignore' presumably names a request that must not be counted as
 * conflicting with the lock -- TODO confirm.
 */
int g_raid_lock_range(struct g_raid_volume *vol, off_t off, off_t len,
    struct bio *ignore, void *argp);
int g_raid_unlock_range(struct g_raid_volume *vol, off_t off, off_t len);

/*
 * Declares g_raid_ctl through the g_ctl_req_t typedef, which comes from
 * outside this header (presumably the GEOM function type for class control
 * requests -- confirm).
 */
g_ctl_req_t g_raid_ctl;
#endif	/* _KERNEL */

#endif	/* !_G_RAID_H_ */
OpenPOWER on IntegriCloud