Print this page
expandable RAID-Z
| Split |
Close |
| Expand all |
| Collapse all |
--- old/usr/src/uts/common/fs/zfs/vdev_raidz.c
+++ new/usr/src/uts/common/fs/zfs/vdev_raidz.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
24 24 * Use is subject to license terms.
25 25 */
26 26
27 27 #pragma ident "%Z%%M% %I% %E% SMI"
28 28
29 29 #include <sys/zfs_context.h>
30 30 #include <sys/spa.h>
31 31 #include <sys/vdev_impl.h>
32 32 #include <sys/zio.h>
33 33 #include <sys/zio_checksum.h>
34 34 #include <sys/fs/zfs.h>
35 35 #include <sys/fm/fs/zfs.h>
36 36
37 37 /*
38 38 * Virtual device vector for RAID-Z.
39 39 *
40 40 * This vdev supports both single and double parity. For single parity, we
41 41 * use a simple XOR of all the data columns. For double parity, we use both
42 42 * the simple XOR as well as a technique described in "The mathematics of
43 43 * RAID-6" by H. Peter Anvin. This technique defines a Galois field, GF(2^8),
44 44 * over the integers expressible in a single byte. Briefly, the operations on
45 45 * the field are defined as follows:
46 46 *
47 47 * o addition (+) is represented by a bitwise XOR
48 48 * o subtraction (-) is therefore identical to addition: A + B = A - B
49 49 * o multiplication of A by 2 is defined by the following bitwise expression:
50 50 * (A * 2)_7 = A_6
51 51 * (A * 2)_6 = A_5
52 52 * (A * 2)_5 = A_4
53 53 * (A * 2)_4 = A_3 + A_7
54 54 * (A * 2)_3 = A_2 + A_7
55 55 * (A * 2)_2 = A_1 + A_7
56 56 * (A * 2)_1 = A_0
57 57 * (A * 2)_0 = A_7
58 58 *
59 59 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
60 60 *
61 61 * Observe that any number in the field (except for 0) can be expressed as a
62 62 * power of 2 -- a generator for the field. We store a table of the powers of
63 63 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
64 64 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
65 65 * than field addition). The inverse of a field element A (A^-1) is A^254.
66 66 *
67 67 * The two parity columns, P and Q, over several data columns, D_0, ... D_n-1,
68 68 * can be expressed by field operations:
69 69 *
70 70 * P = D_0 + D_1 + ... + D_n-2 + D_n-1
71 71 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
72 72 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
73 73 *
74 74 * See the reconstruction code below for how P and Q can be used individually or
75 75 * in concert to recover missing data columns.
76 76 */
77 77
/*
 * One column of a RAID-Z I/O map: the portion of a zio that is sent to a
 * single child device.  vdev_raidz_map_alloc() fills in the geometry and the
 * I/O paths below update the status fields.
 */
typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;
87 87
/*
 * Per-zio RAID-Z map.  It is allocated with room for rm_cols entries in
 * rm_col[] (see the offsetof() sizing in vdev_raidz_map_alloc() and
 * vdev_raidz_map_free()); the parity columns come first, followed by the
 * data columns starting at index rm_firstdatacol.
 */
typedef struct raidz_map {
	uint64_t rm_cols;		/* Column count */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;
97 97
/* Indices of the P and Q parity columns within rm_col[]. */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1

/* Largest parity count accepted by vdev_raidz_open(). */
#define	VDEV_RAIDZ_MAXPARITY	2

/*
 * Multiply a single field element (one byte) by 2 in GF(2^8).  The shifted
 * value is masked back to eight bits so that the result is always a valid
 * field element even when the expression is evaluated in a wider type; without
 * the mask an input with the high bit set would yield a 9-bit value.
 * Note that the argument is evaluated twice.
 */
#define	VDEV_RAIDZ_MUL_2(a) \
	((((a) << 1) & 0xff) ^ (((a) & 0x80) ? 0x1d : 0))
104 104
/*
 * These two tables represent powers and logs of 2 in the Galois field defined
 * above.  These values were computed by repeatedly multiplying by 2 as above.
 *
 * vdev_raidz_pow2[i] holds 2^i.  The last entry (index 255) is 0x01 again,
 * the same value as index 0, which lets vdev_raidz_exp2() index the table
 * with an exponent of exactly 255.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/*
 * Base-2 logarithms in the field: vdev_raidz_log2[v] is the exponent e for
 * which vdev_raidz_pow2[e] == v.  Zero has no logarithm; its slot holds 0x00
 * as a placeholder and vdev_raidz_exp2() special-cases a == 0 before looking
 * anything up here.
 */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
177 177
178 178 /*
179 179 * Multiply a given number by 2 raised to the given power.
180 180 */
181 181 static uint8_t
182 182 vdev_raidz_exp2(uint_t a, int exp)
183 183 {
184 184 if (a == 0)
185 185 return (0);
186 186
187 187 ASSERT(exp >= 0);
188 188 ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
189 189
190 190 exp += vdev_raidz_log2[a];
191 191 if (exp > 255)
192 192 exp -= 255;
193 193
194 194 return (vdev_raidz_pow2[exp]);
195 195 }
196 196
197 197 static raidz_map_t *
198 198 vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
199 199 uint64_t nparity)
200 200 {
201 201 raidz_map_t *rm;
202 202 uint64_t b = zio->io_offset >> unit_shift;
203 203 uint64_t s = zio->io_size >> unit_shift;
204 204 uint64_t f = b % dcols;
205 205 uint64_t o = (b / dcols) << unit_shift;
206 206 uint64_t q, r, c, bc, col, acols, coff, devidx;
207 207
208 208 q = s / (dcols - nparity);
209 209 r = s - q * (dcols - nparity);
210 210 bc = (r == 0 ? 0 : r + nparity);
211 211
212 212 acols = (q == 0 ? bc : dcols);
213 213
214 214 rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP);
215 215
216 216 rm->rm_cols = acols;
217 217 rm->rm_bigcols = bc;
218 218 rm->rm_asize = 0;
219 219 rm->rm_missingdata = 0;
220 220 rm->rm_missingparity = 0;
221 221 rm->rm_firstdatacol = nparity;
222 222
223 223 for (c = 0; c < acols; c++) {
224 224 col = f + c;
225 225 coff = o;
226 226 if (col >= dcols) {
227 227 col -= dcols;
228 228 coff += 1ULL << unit_shift;
229 229 }
230 230 rm->rm_col[c].rc_devidx = col;
231 231 rm->rm_col[c].rc_offset = coff;
232 232 rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift;
233 233 rm->rm_col[c].rc_data = NULL;
234 234 rm->rm_col[c].rc_error = 0;
235 235 rm->rm_col[c].rc_tried = 0;
236 236 rm->rm_col[c].rc_skipped = 0;
237 237 rm->rm_asize += rm->rm_col[c].rc_size;
238 238 }
239 239
240 240 rm->rm_asize = roundup(rm->rm_asize, (nparity + 1) << unit_shift);
241 241
242 242 for (c = 0; c < rm->rm_firstdatacol; c++)
243 243 rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
244 244
245 245 rm->rm_col[c].rc_data = zio->io_data;
246 246
247 247 for (c = c + 1; c < acols; c++)
248 248 rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
249 249 rm->rm_col[c - 1].rc_size;
250 250
251 251 /*
252 252 * If all data stored spans all columns, there's a danger that parity
253 253 * will always be on the same device and, since parity isn't read
254 254 * during normal operation, that that device's I/O bandwidth won't be
255 255 * used effectively. We therefore switch the parity every 1MB.
256 256 *
257 257 * ... at least that was, ostensibly, the theory. As a practical
258 258 * matter unless we juggle the parity between all devices evenly, we
259 259 * won't see any benefit. Further, occasional writes that aren't a
260 260 * multiple of the LCM of the number of children and the minimum
261 261 * stripe width are sufficient to avoid pessimal behavior.
262 262 * Unfortunately, this decision created an implicit on-disk format
263 263 * requirement that we need to support for all eternity, but only
264 264 * for single-parity RAID-Z.
265 265 */
266 266 ASSERT(rm->rm_cols >= 2);
267 267 ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
268 268
269 269 if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
270 270 devidx = rm->rm_col[0].rc_devidx;
271 271 o = rm->rm_col[0].rc_offset;
272 272 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
273 273 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
274 274 rm->rm_col[1].rc_devidx = devidx;
275 275 rm->rm_col[1].rc_offset = o;
276 276 }
277 277
278 278 zio->io_vsd = rm;
279 279 return (rm);
280 280 }
281 281
282 282 static void
283 283 vdev_raidz_map_free(zio_t *zio)
284 284 {
285 285 raidz_map_t *rm = zio->io_vsd;
286 286 int c;
287 287
288 288 for (c = 0; c < rm->rm_firstdatacol; c++)
289 289 zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
290 290
291 291 kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols]));
292 292 zio->io_vsd = NULL;
293 293 }
294 294
295 295 static void
296 296 vdev_raidz_generate_parity_p(raidz_map_t *rm)
297 297 {
298 298 uint64_t *p, *src, pcount, ccount, i;
299 299 int c;
300 300
301 301 pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
302 302
303 303 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
304 304 src = rm->rm_col[c].rc_data;
305 305 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
306 306 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
307 307
308 308 if (c == rm->rm_firstdatacol) {
309 309 ASSERT(ccount == pcount);
310 310 for (i = 0; i < ccount; i++, p++, src++) {
311 311 *p = *src;
312 312 }
313 313 } else {
314 314 ASSERT(ccount <= pcount);
315 315 for (i = 0; i < ccount; i++, p++, src++) {
316 316 *p ^= *src;
317 317 }
318 318 }
319 319 }
320 320 }
321 321
322 322 static void
323 323 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
324 324 {
325 325 uint64_t *q, *p, *src, pcount, ccount, mask, i;
326 326 int c;
327 327
328 328 pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
329 329 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
330 330 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
331 331
332 332 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
333 333 src = rm->rm_col[c].rc_data;
334 334 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
335 335 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
336 336 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
337 337
338 338 if (c == rm->rm_firstdatacol) {
339 339 ASSERT(ccount == pcount || ccount == 0);
340 340 for (i = 0; i < ccount; i++, p++, q++, src++) {
341 341 *q = *src;
342 342 *p = *src;
343 343 }
344 344 for (; i < pcount; i++, p++, q++, src++) {
345 345 *q = 0;
346 346 *p = 0;
347 347 }
348 348 } else {
349 349 ASSERT(ccount <= pcount);
350 350
351 351 /*
352 352 * Rather than multiplying each byte individually (as
353 353 * described above), we are able to handle 8 at once
354 354 * by generating a mask based on the high bit in each
355 355 * byte and using that to conditionally XOR in 0x1d.
356 356 */
357 357 for (i = 0; i < ccount; i++, p++, q++, src++) {
358 358 mask = *q & 0x8080808080808080ULL;
359 359 mask = (mask << 1) - (mask >> 7);
360 360 *q = ((*q << 1) & 0xfefefefefefefefeULL) ^
361 361 (mask & 0x1d1d1d1d1d1d1d1dULL);
362 362 *q ^= *src;
363 363 *p ^= *src;
364 364 }
365 365
366 366 /*
367 367 * Treat short columns as though they are full of 0s.
368 368 */
369 369 for (; i < pcount; i++, q++) {
370 370 mask = *q & 0x8080808080808080ULL;
371 371 mask = (mask << 1) - (mask >> 7);
372 372 *q = ((*q << 1) & 0xfefefefefefefefeULL) ^
373 373 (mask & 0x1d1d1d1d1d1d1d1dULL);
374 374 }
375 375 }
376 376 }
377 377 }
378 378
379 379 static void
380 380 vdev_raidz_reconstruct_p(raidz_map_t *rm, int x)
381 381 {
382 382 uint64_t *dst, *src, xcount, ccount, count, i;
383 383 int c;
384 384
385 385 xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
386 386 ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
387 387 ASSERT(xcount > 0);
388 388
389 389 src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
390 390 dst = rm->rm_col[x].rc_data;
391 391 for (i = 0; i < xcount; i++, dst++, src++) {
392 392 *dst = *src;
393 393 }
394 394
395 395 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
396 396 src = rm->rm_col[c].rc_data;
397 397 dst = rm->rm_col[x].rc_data;
398 398
399 399 if (c == x)
400 400 continue;
401 401
402 402 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
403 403 count = MIN(ccount, xcount);
404 404
405 405 for (i = 0; i < count; i++, dst++, src++) {
406 406 *dst ^= *src;
407 407 }
408 408 }
409 409 }
410 410
411 411 static void
412 412 vdev_raidz_reconstruct_q(raidz_map_t *rm, int x)
413 413 {
414 414 uint64_t *dst, *src, xcount, ccount, count, mask, i;
415 415 uint8_t *b;
416 416 int c, j, exp;
417 417
418 418 xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
419 419 ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
420 420
421 421 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
422 422 src = rm->rm_col[c].rc_data;
423 423 dst = rm->rm_col[x].rc_data;
424 424
425 425 if (c == x)
426 426 ccount = 0;
427 427 else
428 428 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
429 429
430 430 count = MIN(ccount, xcount);
431 431
432 432 if (c == rm->rm_firstdatacol) {
433 433 for (i = 0; i < count; i++, dst++, src++) {
434 434 *dst = *src;
435 435 }
436 436 for (; i < xcount; i++, dst++) {
437 437 *dst = 0;
438 438 }
439 439
440 440 } else {
441 441 /*
442 442 * For an explanation of this, see the comment in
443 443 * vdev_raidz_generate_parity_pq() above.
444 444 */
445 445 for (i = 0; i < count; i++, dst++, src++) {
446 446 mask = *dst & 0x8080808080808080ULL;
447 447 mask = (mask << 1) - (mask >> 7);
448 448 *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
449 449 (mask & 0x1d1d1d1d1d1d1d1dULL);
450 450 *dst ^= *src;
451 451 }
452 452
453 453 for (; i < xcount; i++, dst++) {
454 454 mask = *dst & 0x8080808080808080ULL;
455 455 mask = (mask << 1) - (mask >> 7);
456 456 *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
457 457 (mask & 0x1d1d1d1d1d1d1d1dULL);
458 458 }
459 459 }
460 460 }
461 461
462 462 src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
463 463 dst = rm->rm_col[x].rc_data;
464 464 exp = 255 - (rm->rm_cols - 1 - x);
465 465
466 466 for (i = 0; i < xcount; i++, dst++, src++) {
467 467 *dst ^= *src;
468 468 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
469 469 *b = vdev_raidz_exp2(*b, exp);
470 470 }
471 471 }
472 472 }
473 473
474 474 static void
475 475 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y)
476 476 {
477 477 uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
478 478 void *pdata, *qdata;
479 479 uint64_t xsize, ysize, i;
480 480
481 481 ASSERT(x < y);
482 482 ASSERT(x >= rm->rm_firstdatacol);
483 483 ASSERT(y < rm->rm_cols);
484 484
485 485 ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
486 486
487 487 /*
488 488 * Move the parity data aside -- we're going to compute parity as
489 489 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
490 490 * reuse the parity generation mechanism without trashing the actual
491 491 * parity so we make those columns appear to be full of zeros by
492 492 * setting their lengths to zero.
493 493 */
494 494 pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
495 495 qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
496 496 xsize = rm->rm_col[x].rc_size;
497 497 ysize = rm->rm_col[y].rc_size;
498 498
499 499 rm->rm_col[VDEV_RAIDZ_P].rc_data =
500 500 zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
501 501 rm->rm_col[VDEV_RAIDZ_Q].rc_data =
502 502 zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
503 503 rm->rm_col[x].rc_size = 0;
504 504 rm->rm_col[y].rc_size = 0;
505 505
506 506 vdev_raidz_generate_parity_pq(rm);
507 507
508 508 rm->rm_col[x].rc_size = xsize;
509 509 rm->rm_col[y].rc_size = ysize;
510 510
511 511 p = pdata;
512 512 q = qdata;
513 513 pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
514 514 qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
515 515 xd = rm->rm_col[x].rc_data;
516 516 yd = rm->rm_col[y].rc_data;
517 517
518 518 /*
519 519 * We now have:
520 520 * Pxy = P + D_x + D_y
521 521 * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
522 522 *
523 523 * We can then solve for D_x:
524 524 * D_x = A * (P + Pxy) + B * (Q + Qxy)
525 525 * where
526 526 * A = 2^(x - y) * (2^(x - y) + 1)^-1
527 527 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
528 528 *
529 529 * With D_x in hand, we can easily solve for D_y:
530 530 * D_y = P + Pxy + D_x
531 531 */
532 532
533 533 a = vdev_raidz_pow2[255 + x - y];
534 534 b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
535 535 tmp = 255 - vdev_raidz_log2[a ^ 1];
536 536
537 537 aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
538 538 bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
539 539
540 540 for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
541 541 *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
542 542 vdev_raidz_exp2(*q ^ *qxy, bexp);
543 543
544 544 if (i < ysize)
545 545 *yd = *p ^ *pxy ^ *xd;
546 546 }
547 547
548 548 zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
549 549 rm->rm_col[VDEV_RAIDZ_P].rc_size);
550 550 zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
551 551 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
552 552
553 553 /*
554 554 * Restore the saved parity data.
555 555 */
556 556 rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
557 557 rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
558 558 }
559 559
560 560
561 561 static int
562 562 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
563 563 {
564 564 vdev_t *cvd;
565 565 uint64_t nparity = vd->vdev_nparity;
566 566 int c, error;
567 567 int lasterror = 0;
568 568 int numerrors = 0;
569 569
570 570 ASSERT(nparity > 0);
571 571
572 572 if (nparity > VDEV_RAIDZ_MAXPARITY ||
573 573 vd->vdev_children < nparity + 1) {
574 574 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
575 575 return (EINVAL);
576 576 }
577 577
578 578 for (c = 0; c < vd->vdev_children; c++) {
579 579 cvd = vd->vdev_child[c];
580 580
581 581 if ((error = vdev_open(cvd)) != 0) {
582 582 lasterror = error;
583 583 numerrors++;
584 584 continue;
585 585 }
586 586
587 587 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
588 588 *ashift = MAX(*ashift, cvd->vdev_ashift);
589 589 }
590 590
591 591 *asize *= vd->vdev_children;
592 592
593 593 if (numerrors > nparity) {
594 594 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
595 595 return (lasterror);
596 596 }
597 597
598 598 return (0);
599 599 }
600 600
601 601 static void
602 602 vdev_raidz_close(vdev_t *vd)
603 603 {
604 604 int c;
605 605
606 606 for (c = 0; c < vd->vdev_children; c++)
607 607 vdev_close(vd->vdev_child[c]);
608 608 }
609 609
610 610 static uint64_t
611 611 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
612 612 {
613 613 uint64_t asize;
614 614 uint64_t ashift = vd->vdev_top->vdev_ashift;
615 615 uint64_t cols = vd->vdev_children;
616 616 uint64_t nparity = vd->vdev_nparity;
617 617
618 618 asize = ((psize - 1) >> ashift) + 1;
619 619 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
620 620 asize = roundup(asize, nparity + 1) << ashift;
621 621
622 622 return (asize);
623 623 }
624 624
625 625 static void
626 626 vdev_raidz_child_done(zio_t *zio)
627 627 {
628 628 raidz_col_t *rc = zio->io_private;
629 629
630 630 rc->rc_error = zio->io_error;
631 631 rc->rc_tried = 1;
632 632 rc->rc_skipped = 0;
633 633 }
634 634
635 635 static void
636 636 vdev_raidz_repair_done(zio_t *zio)
637 637 {
638 638 ASSERT(zio->io_private == zio->io_parent);
639 639 vdev_raidz_map_free(zio->io_private);
640 640 }
641 641
642 642 static int
643 643 vdev_raidz_io_start(zio_t *zio)
644 644 {
645 645 vdev_t *vd = zio->io_vd;
646 646 vdev_t *tvd = vd->vdev_top;
647 647 vdev_t *cvd;
648 648 blkptr_t *bp = zio->io_bp;
649 649 raidz_map_t *rm;
650 650 raidz_col_t *rc;
651 651 int c;
652 652
653 653 rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
654 654 vd->vdev_nparity);
655 655
656 656 ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
657 657
658 658 if (zio->io_type == ZIO_TYPE_WRITE) {
659 659 /*
660 660 * Generate RAID parity in the first virtual columns.
661 661 */
662 662 if (rm->rm_firstdatacol == 1)
663 663 vdev_raidz_generate_parity_p(rm);
664 664 else
665 665 vdev_raidz_generate_parity_pq(rm);
666 666
667 667 for (c = 0; c < rm->rm_cols; c++) {
668 668 rc = &rm->rm_col[c];
669 669 cvd = vd->vdev_child[rc->rc_devidx];
670 670 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
671 671 rc->rc_offset, rc->rc_data, rc->rc_size,
672 672 zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
673 673 vdev_raidz_child_done, rc));
674 674 }
675 675
676 676 return (zio_wait_for_children_done(zio));
677 677 }
678 678
679 679 ASSERT(zio->io_type == ZIO_TYPE_READ);
680 680
681 681 /*
682 682 * Iterate over the columns in reverse order so that we hit the parity
683 683 * last -- any errors along the way will force us to read the parity
684 684 * data.
685 685 */
686 686 for (c = rm->rm_cols - 1; c >= 0; c--) {
687 687 rc = &rm->rm_col[c];
688 688 cvd = vd->vdev_child[rc->rc_devidx];
689 689 if (!vdev_readable(cvd)) {
690 690 if (c >= rm->rm_firstdatacol)
691 691 rm->rm_missingdata++;
692 692 else
693 693 rm->rm_missingparity++;
694 694 rc->rc_error = ENXIO;
695 695 rc->rc_tried = 1; /* don't even try */
696 696 rc->rc_skipped = 1;
697 697 continue;
698 698 }
699 699 if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) {
700 700 if (c >= rm->rm_firstdatacol)
701 701 rm->rm_missingdata++;
702 702 else
703 703 rm->rm_missingparity++;
704 704 rc->rc_error = ESTALE;
705 705 rc->rc_skipped = 1;
706 706 continue;
707 707 }
708 708 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
709 709 (zio->io_flags & ZIO_FLAG_SCRUB)) {
710 710 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
711 711 rc->rc_offset, rc->rc_data, rc->rc_size,
712 712 zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
713 713 vdev_raidz_child_done, rc));
714 714 }
715 715 }
716 716
717 717 return (zio_wait_for_children_done(zio));
718 718 }
719 719
720 720 /*
721 721 * Report a checksum error for a child of a RAID-Z device.
722 722 */
723 723 static void
724 724 raidz_checksum_error(zio_t *zio, raidz_col_t *rc)
725 725 {
726 726 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
727 727 dprintf_bp(zio->io_bp, "imputed checksum error on %s: ",
728 728 vdev_description(vd));
729 729
730 730 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
731 731 mutex_enter(&vd->vdev_stat_lock);
732 732 vd->vdev_stat.vs_checksum_errors++;
733 733 mutex_exit(&vd->vdev_stat_lock);
734 734 }
735 735
736 736 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE))
737 737 zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
738 738 zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size);
739 739 }
740 740
741 741 /*
742 742 * Generate the parity from the data columns. If we tried and were able to
743 743 * read the parity without error, verify that the generated parity matches the
744 744 * data we read. If it doesn't, we fire off a checksum error. Return the
745 745 * number such failures.
746 746 */
747 747 static int
748 748 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
749 749 {
750 750 void *orig[VDEV_RAIDZ_MAXPARITY];
751 751 int c, ret = 0;
752 752 raidz_col_t *rc;
753 753
754 754 for (c = 0; c < rm->rm_firstdatacol; c++) {
755 755 rc = &rm->rm_col[c];
756 756 if (!rc->rc_tried || rc->rc_error != 0)
757 757 continue;
758 758 orig[c] = zio_buf_alloc(rc->rc_size);
759 759 bcopy(rc->rc_data, orig[c], rc->rc_size);
760 760 }
761 761
762 762 if (rm->rm_firstdatacol == 1)
763 763 vdev_raidz_generate_parity_p(rm);
764 764 else
765 765 vdev_raidz_generate_parity_pq(rm);
766 766
767 767 for (c = 0; c < rm->rm_firstdatacol; c++) {
768 768 rc = &rm->rm_col[c];
769 769 if (!rc->rc_tried || rc->rc_error != 0)
770 770 continue;
771 771 if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
772 772 raidz_checksum_error(zio, rc);
773 773 rc->rc_error = ECKSUM;
774 774 ret++;
775 775 }
776 776 zio_buf_free(orig[c], rc->rc_size);
777 777 }
778 778
779 779 return (ret);
780 780 }
781 781
/*
 * Counters bumped with atomic_inc_64() by vdev_raidz_io_done() each time a
 * read is successfully repaired, keyed by which parity was used for the
 * reconstruction: P alone, Q alone, or both P and Q.
 */
static uint64_t raidz_corrected_p;
static uint64_t raidz_corrected_q;
static uint64_t raidz_corrected_pq;
785 785
786 786 static int
787 787 vdev_raidz_io_done(zio_t *zio)
788 788 {
789 789 vdev_t *vd = zio->io_vd;
790 790 vdev_t *cvd;
791 791 raidz_map_t *rm = zio->io_vsd;
792 792 raidz_col_t *rc, *rc1;
793 793 int unexpected_errors = 0;
794 794 int parity_errors = 0;
795 795 int parity_untried = 0;
796 796 int data_errors = 0;
797 797 int n, c, c1;
798 798
799 799 ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */
800 800
801 801 zio->io_error = 0;
802 802 zio->io_numerrors = 0;
803 803
804 804 ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
805 805 ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
806 806
807 807 for (c = 0; c < rm->rm_cols; c++) {
808 808 rc = &rm->rm_col[c];
809 809
810 810 /*
811 811 * We preserve any EIOs because those may be worth retrying;
812 812 * whereas ECKSUM and ENXIO are more likely to be persistent.
813 813 */
814 814 if (rc->rc_error) {
815 815 if (zio->io_error != EIO)
816 816 zio->io_error = rc->rc_error;
817 817
818 818 if (c < rm->rm_firstdatacol)
819 819 parity_errors++;
820 820 else
821 821 data_errors++;
822 822
823 823 if (!rc->rc_skipped)
824 824 unexpected_errors++;
825 825
826 826 zio->io_numerrors++;
827 827 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
828 828 parity_untried++;
829 829 }
830 830 }
831 831
832 832 if (zio->io_type == ZIO_TYPE_WRITE) {
833 833 /*
834 834 * If this is not a failfast write, and we were able to
835 835 * write enough columns to reconstruct the data, good enough.
836 836 */
837 837 /* XXPOLICY */
838 838 if (zio->io_numerrors <= rm->rm_firstdatacol &&
839 839 !(zio->io_flags & ZIO_FLAG_FAILFAST))
840 840 zio->io_error = 0;
841 841
842 842 vdev_raidz_map_free(zio);
843 843
844 844 return (ZIO_PIPELINE_CONTINUE);
845 845 }
846 846
847 847 ASSERT(zio->io_type == ZIO_TYPE_READ);
848 848 /*
849 849 * There are three potential phases for a read:
850 850 * 1. produce valid data from the columns read
851 851 * 2. read all disks and try again
852 852 * 3. perform combinatorial reconstruction
853 853 *
854 854 * Each phase is progressively both more expensive and less likely to
855 855 * occur. If we encounter more errors than we can repair or all phases
856 856 * fail, we have no choice but to return an error.
857 857 */
858 858
859 859 /*
860 860 * If the number of errors we saw was correctable -- less than or equal
861 861 * to the number of parity disks read -- attempt to produce data that
862 862 * has a valid checksum. Naturally, this case applies in the absence of
863 863 * any errors.
864 864 */
865 865 if (zio->io_numerrors <= rm->rm_firstdatacol - parity_untried) {
866 866 switch (data_errors) {
867 867 case 0:
868 868 if (zio_checksum_error(zio) == 0) {
869 869 zio->io_error = 0;
870 870
871 871 /*
872 872 * If we read parity information (unnecessarily
873 873 * as it happens since no reconstruction was
874 874 * needed) regenerate and verify the parity.
875 875 * We also regenerate parity when resilvering
876 876 * so we can write it out to the failed device
877 877 * later.
878 878 */
879 879 if (parity_errors + parity_untried <
880 880 rm->rm_firstdatacol ||
881 881 (zio->io_flags & ZIO_FLAG_RESILVER)) {
882 882 n = raidz_parity_verify(zio, rm);
883 883 unexpected_errors += n;
884 884 ASSERT(parity_errors + n <=
885 885 rm->rm_firstdatacol);
886 886 }
887 887 goto done;
888 888 }
889 889 break;
890 890
891 891 case 1:
892 892 /*
893 893 * We either attempt to read all the parity columns or
894 894 * none of them. If we didn't try to read parity, we
895 895 * wouldn't be here in the correctable case. There must
896 896 * also have been fewer parity errors than parity
897 897 * columns or, again, we wouldn't be in this code path.
898 898 */
899 899 ASSERT(parity_untried == 0);
900 900 ASSERT(parity_errors < rm->rm_firstdatacol);
901 901
902 902 /*
903 903 * Find the column that reported the error.
904 904 */
905 905 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
906 906 rc = &rm->rm_col[c];
907 907 if (rc->rc_error != 0)
908 908 break;
909 909 }
910 910 ASSERT(c != rm->rm_cols);
911 911 ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
912 912 rc->rc_error == ESTALE);
913 913
914 914 if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
915 915 vdev_raidz_reconstruct_p(rm, c);
916 916 } else {
917 917 ASSERT(rm->rm_firstdatacol > 1);
918 918 vdev_raidz_reconstruct_q(rm, c);
919 919 }
920 920
921 921 if (zio_checksum_error(zio) == 0) {
922 922 zio->io_error = 0;
923 923 if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0)
924 924 atomic_inc_64(&raidz_corrected_p);
925 925 else
926 926 atomic_inc_64(&raidz_corrected_q);
927 927
928 928 /*
929 929 * If there's more than one parity disk that
930 930 * was successfully read, confirm that the
931 931 * other parity disk produced the correct data.
932 932 * This routine is suboptimal in that it
933 933 * regenerates both the parity we wish to test
934 934 * as well as the parity we just used to
935 935 * perform the reconstruction, but this should
936 936 * be a relatively uncommon case, and can be
937 937 * optimized if it becomes a problem.
938 938 * We also regenerate parity when resilvering
939 939 * so we can write it out to the failed device
940 940 * later.
941 941 */
942 942 if (parity_errors < rm->rm_firstdatacol - 1 ||
943 943 (zio->io_flags & ZIO_FLAG_RESILVER)) {
944 944 n = raidz_parity_verify(zio, rm);
945 945 unexpected_errors += n;
946 946 ASSERT(parity_errors + n <=
947 947 rm->rm_firstdatacol);
948 948 }
949 949
950 950 goto done;
951 951 }
952 952 break;
953 953
954 954 case 2:
955 955 /*
956 956 * Two data column errors require double parity.
957 957 */
958 958 ASSERT(rm->rm_firstdatacol == 2);
959 959
960 960 /*
961 961 * Find the two columns that reported errors.
962 962 */
963 963 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
964 964 rc = &rm->rm_col[c];
965 965 if (rc->rc_error != 0)
966 966 break;
967 967 }
968 968 ASSERT(c != rm->rm_cols);
969 969 ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
970 970 rc->rc_error == ESTALE);
971 971
972 972 for (c1 = c++; c < rm->rm_cols; c++) {
973 973 rc = &rm->rm_col[c];
974 974 if (rc->rc_error != 0)
975 975 break;
976 976 }
977 977 ASSERT(c != rm->rm_cols);
978 978 ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
979 979 rc->rc_error == ESTALE);
980 980
981 981 vdev_raidz_reconstruct_pq(rm, c1, c);
982 982
983 983 if (zio_checksum_error(zio) == 0) {
984 984 zio->io_error = 0;
985 985 atomic_inc_64(&raidz_corrected_pq);
986 986
987 987 goto done;
988 988 }
989 989 break;
990 990
991 991 default:
992 992 ASSERT(rm->rm_firstdatacol <= 2);
993 993 ASSERT(0);
994 994 }
995 995 }
996 996
997 997 /*
998 998 * This isn't a typical situation -- either we got a read error or
999 999 * a child silently returned bad data. Read every block so we can
1000 1000 * try again with as much data and parity as we can track down. If
1001 1001 * we've already been through once before, all children will be marked
1002 1002 * as tried so we'll proceed to combinatorial reconstruction.
1003 1003 */
1004 1004 unexpected_errors = 1;
1005 1005 rm->rm_missingdata = 0;
1006 1006 rm->rm_missingparity = 0;
1007 1007
1008 1008 for (c = 0; c < rm->rm_cols; c++) {
1009 1009 if (rm->rm_col[c].rc_tried)
1010 1010 continue;
1011 1011
1012 1012 zio->io_error = 0;
1013 1013 zio_vdev_io_redone(zio);
1014 1014 do {
1015 1015 rc = &rm->rm_col[c];
1016 1016 if (rc->rc_tried)
1017 1017 continue;
1018 1018 zio_nowait(zio_vdev_child_io(zio, NULL,
1019 1019 vd->vdev_child[rc->rc_devidx],
1020 1020 rc->rc_offset, rc->rc_data, rc->rc_size,
1021 1021 zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
1022 1022 vdev_raidz_child_done, rc));
1023 1023 } while (++c < rm->rm_cols);
1024 1024 dprintf("rereading\n");
1025 1025
1026 1026 return (zio_wait_for_children_done(zio));
1027 1027 }
1028 1028
1029 1029 /*
1030 1030 * At this point we've attempted to reconstruct the data given the
1031 1031 * errors we detected, and we've attempted to read all columns. There
1032 1032 * must, therefore, be one or more additional problems -- silent errors
1033 1033 * resulting in invalid data rather than explicit I/O errors resulting
1034 1034 * in absent data. Before we attempt combinatorial reconstruction make
1035 1035 * sure we have a chance of coming up with the right answer.
1036 1036 */
1037 1037 if (zio->io_numerrors >= rm->rm_firstdatacol) {
1038 1038 ASSERT(zio->io_error != 0);
1039 1039 goto done;
1040 1040 }
1041 1041
1042 1042 if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
1043 1043 /*
1044 1044 * Attempt to reconstruct the data from parity P.
1045 1045 */
1046 1046 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1047 1047 void *orig;
1048 1048 rc = &rm->rm_col[c];
1049 1049
1050 1050 orig = zio_buf_alloc(rc->rc_size);
1051 1051 bcopy(rc->rc_data, orig, rc->rc_size);
1052 1052 vdev_raidz_reconstruct_p(rm, c);
1053 1053
1054 1054 if (zio_checksum_error(zio) == 0) {
1055 1055 zio_buf_free(orig, rc->rc_size);
1056 1056 zio->io_error = 0;
1057 1057 atomic_inc_64(&raidz_corrected_p);
1058 1058
1059 1059 /*
1060 1060 * If this child didn't know that it returned
1061 1061 * bad data, inform it.
1062 1062 */
1063 1063 if (rc->rc_tried && rc->rc_error == 0)
1064 1064 raidz_checksum_error(zio, rc);
1065 1065 rc->rc_error = ECKSUM;
1066 1066 goto done;
1067 1067 }
1068 1068
1069 1069 bcopy(orig, rc->rc_data, rc->rc_size);
1070 1070 zio_buf_free(orig, rc->rc_size);
1071 1071 }
1072 1072 }
1073 1073
1074 1074 if (rm->rm_firstdatacol > 1 && rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
1075 1075 /*
1076 1076 * Attempt to reconstruct the data from parity Q.
1077 1077 */
1078 1078 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1079 1079 void *orig;
1080 1080 rc = &rm->rm_col[c];
1081 1081
1082 1082 orig = zio_buf_alloc(rc->rc_size);
1083 1083 bcopy(rc->rc_data, orig, rc->rc_size);
1084 1084 vdev_raidz_reconstruct_q(rm, c);
1085 1085
1086 1086 if (zio_checksum_error(zio) == 0) {
1087 1087 zio_buf_free(orig, rc->rc_size);
1088 1088 zio->io_error = 0;
1089 1089 atomic_inc_64(&raidz_corrected_q);
1090 1090
1091 1091 /*
1092 1092 * If this child didn't know that it returned
1093 1093 * bad data, inform it.
1094 1094 */
1095 1095 if (rc->rc_tried && rc->rc_error == 0)
1096 1096 raidz_checksum_error(zio, rc);
1097 1097 rc->rc_error = ECKSUM;
1098 1098 goto done;
1099 1099 }
1100 1100
1101 1101 bcopy(orig, rc->rc_data, rc->rc_size);
1102 1102 zio_buf_free(orig, rc->rc_size);
1103 1103 }
1104 1104 }
1105 1105
1106 1106 if (rm->rm_firstdatacol > 1 &&
1107 1107 rm->rm_col[VDEV_RAIDZ_P].rc_error == 0 &&
1108 1108 rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
1109 1109 /*
1110 1110 * Attempt to reconstruct the data from both P and Q.
1111 1111 */
1112 1112 for (c = rm->rm_firstdatacol; c < rm->rm_cols - 1; c++) {
1113 1113 void *orig, *orig1;
1114 1114 rc = &rm->rm_col[c];
1115 1115
1116 1116 orig = zio_buf_alloc(rc->rc_size);
1117 1117 bcopy(rc->rc_data, orig, rc->rc_size);
1118 1118
1119 1119 for (c1 = c + 1; c1 < rm->rm_cols; c1++) {
1120 1120 rc1 = &rm->rm_col[c1];
1121 1121
1122 1122 orig1 = zio_buf_alloc(rc1->rc_size);
1123 1123 bcopy(rc1->rc_data, orig1, rc1->rc_size);
1124 1124
1125 1125 vdev_raidz_reconstruct_pq(rm, c, c1);
1126 1126
1127 1127 if (zio_checksum_error(zio) == 0) {
1128 1128 zio_buf_free(orig, rc->rc_size);
1129 1129 zio_buf_free(orig1, rc1->rc_size);
1130 1130 zio->io_error = 0;
1131 1131 atomic_inc_64(&raidz_corrected_pq);
1132 1132
1133 1133 /*
1134 1134 * If these children didn't know they
1135 1135 * returned bad data, inform them.
1136 1136 */
1137 1137 if (rc->rc_tried && rc->rc_error == 0)
1138 1138 raidz_checksum_error(zio, rc);
1139 1139 if (rc1->rc_tried && rc1->rc_error == 0)
1140 1140 raidz_checksum_error(zio, rc1);
1141 1141
1142 1142 rc->rc_error = ECKSUM;
1143 1143 rc1->rc_error = ECKSUM;
1144 1144
1145 1145 goto done;
1146 1146 }
1147 1147
1148 1148 bcopy(orig1, rc1->rc_data, rc1->rc_size);
1149 1149 zio_buf_free(orig1, rc1->rc_size);
1150 1150 }
1151 1151
1152 1152 bcopy(orig, rc->rc_data, rc->rc_size);
1153 1153 zio_buf_free(orig, rc->rc_size);
1154 1154 }
1155 1155 }
1156 1156
1157 1157 /*
1158 1158 * All combinations failed to checksum. Generate checksum ereports for
1159 1159 * all children.
1160 1160 */
1161 1161 zio->io_error = ECKSUM;
1162 1162 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1163 1163 for (c = 0; c < rm->rm_cols; c++) {
1164 1164 rc = &rm->rm_col[c];
1165 1165 zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
1166 1166 zio->io_spa, vd->vdev_child[rc->rc_devidx], zio,
1167 1167 rc->rc_offset, rc->rc_size);
1168 1168 }
1169 1169 }
1170 1170
1171 1171 done:
1172 1172 zio_checksum_verified(zio);
1173 1173
1174 1174 if (zio->io_error == 0 && (spa_mode & FWRITE) &&
1175 1175 (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
1176 1176 zio_t *rio;
1177 1177
1178 1178 /*
1179 1179 * Use the good data we have in hand to repair damaged children.
1180 1180 *
1181 1181 * We issue all repair I/Os as children of 'rio' to arrange
1182 1182 * that vdev_raidz_map_free(zio) will be invoked after all
1183 1183 * repairs complete, but before we advance to the next stage.
1184 1184 */
1185 1185 rio = zio_null(zio, zio->io_spa,
1186 1186 vdev_raidz_repair_done, zio, ZIO_FLAG_CANFAIL);
1187 1187
1188 1188 for (c = 0; c < rm->rm_cols; c++) {
1189 1189 rc = &rm->rm_col[c];
1190 1190 cvd = vd->vdev_child[rc->rc_devidx];
1191 1191
1192 1192 if (rc->rc_error == 0)
1193 1193 continue;
1194 1194
1195 1195 dprintf("%s resilvered %s @ 0x%llx error %d\n",
1196 1196 vdev_description(vd),
1197 1197 vdev_description(cvd),
1198 1198 zio->io_offset, rc->rc_error);
1199 1199
1200 1200 zio_nowait(zio_vdev_child_io(rio, NULL, cvd,
1201 1201 rc->rc_offset, rc->rc_data, rc->rc_size,
1202 1202 ZIO_TYPE_WRITE, zio->io_priority,
1203 1203 ZIO_FLAG_IO_REPAIR | ZIO_FLAG_DONT_PROPAGATE |
1204 1204 ZIO_FLAG_CANFAIL, NULL, NULL));
1205 1205 }
1206 1206
1207 1207 zio_nowait(rio);
1208 1208
1209 1209 return (zio_wait_for_children_done(zio));
1210 1210 }
1211 1211
1212 1212 vdev_raidz_map_free(zio);
1213 1213
1214 1214 return (ZIO_PIPELINE_CONTINUE);
1215 1215 }
1216 1216
1217 1217 static void
1218 1218 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
|
↓ open down ↓ |
1218 lines elided |
↑ open up ↑ |
1219 1219 {
1220 1220 if (faulted > vd->vdev_nparity)
1221 1221 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
1222 1222 VDEV_AUX_NO_REPLICAS);
1223 1223 else if (degraded + faulted != 0)
1224 1224 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
1225 1225 else
1226 1226 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
1227 1227 }
1228 1228
1229 +static uint8_t
1230 +vdev_raidz_grid(vdev_t *vd)
1231 +{
1232 + ASSERT(vd->vdev_nparity - 1 <= 1);
1233 + return (((vd->vdev_nparity - 1) << 6) | vd->vdev_children);
1234 +}
1235 +
/*
 * Operations vector for the RAID-Z vdev type.
 *
 * The initializer is positional, so the order of the entries must match
 * the member order of vdev_ops_t (declared outside this file).
 * NOTE(review): the meaning of each slot, in particular the NULL one, is
 * defined by that declaration -- confirm against it before reordering or
 * inserting entries.
 */
vdev_ops_t vdev_raidz_ops = {
	vdev_raidz_open,
	vdev_raidz_close,
	NULL,			/* no RAID-Z handler supplied for this slot */
	vdev_raidz_asize,
	vdev_raidz_io_start,
	vdev_raidz_io_done,
	vdev_raidz_state_change,
	vdev_raidz_grid,	/* packs parity level and child count */
	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX