1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 #pragma ident   "%Z%%M% %I%     %E% SMI"
  28 
  29 #include <sys/zfs_context.h>
  30 #include <sys/spa.h>
  31 #include <sys/vdev_impl.h>
  32 #include <sys/zio.h>
  33 #include <sys/zio_checksum.h>
  34 #include <sys/fs/zfs.h>
  35 #include <sys/fm/fs/zfs.h>
  36 
  37 /*
  38  * Virtual device vector for RAID-Z.
  39  *
  40  * This vdev supports both single and double parity. For single parity, we
  41  * use a simple XOR of all the data columns. For double parity, we use both
  42  * the simple XOR as well as a technique described in "The mathematics of
  43  * RAID-6" by H. Peter Anvin. This technique defines a Galois field, GF(2^8),
 * over the integers expressible in a single byte. Briefly, the operations on
  45  * the field are defined as follows:
  46  *
  47  *   o addition (+) is represented by a bitwise XOR
  48  *   o subtraction (-) is therefore identical to addition: A + B = A - B
  49  *   o multiplication of A by 2 is defined by the following bitwise expression:
  50  *      (A * 2)_7 = A_6
  51  *      (A * 2)_6 = A_5
  52  *      (A * 2)_5 = A_4
  53  *      (A * 2)_4 = A_3 + A_7
  54  *      (A * 2)_3 = A_2 + A_7
  55  *      (A * 2)_2 = A_1 + A_7
  56  *      (A * 2)_1 = A_0
  57  *      (A * 2)_0 = A_7
  58  *
  59  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
  60  *
  61  * Observe that any number in the field (except for 0) can be expressed as a
  62  * power of 2 -- a generator for the field. We store a table of the powers of
  63  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
  64  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
  65  * than field addition). The inverse of a field element A (A^-1) is A^254.
  66  *
  67  * The two parity columns, P and Q, over several data columns, D_0, ... D_n-1,
  68  * can be expressed by field operations:
  69  *
  70  *      P = D_0 + D_1 + ... + D_n-2 + D_n-1
  71  *      Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
  72  *        = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
  73  *
 * See the reconstruction code below for how P and Q can be used individually or
  75  * in concert to recover missing data columns.
  76  */
  77 
/*
 * State for a single column of a RAID-Z I/O: which child device it maps to,
 * the region and buffer involved, and the outcome of the child I/O.
 */
typedef struct raidz_col {
        uint64_t rc_devidx;             /* child device index for I/O */
        uint64_t rc_offset;             /* byte offset on the child device */
        uint64_t rc_size;               /* I/O size in bytes */
        void *rc_data;                  /* I/O data buffer for this column */
        int rc_error;                   /* I/O error for this device */
        uint8_t rc_tried;               /* Did we attempt this I/O column? */
        uint8_t rc_skipped;             /* Did we skip this I/O column? */
} raidz_col_t;
  87 
/*
 * Map of one logical I/O onto the columns of a RAID-Z vdev. The structure is
 * allocated with room for rm_cols entries in rm_col[]; the first
 * rm_firstdatacol entries are parity columns, the rest are data columns.
 */
typedef struct raidz_map {
        uint64_t rm_cols;               /* Column count */
        uint64_t rm_bigcols;            /* Columns holding one extra unit */
        uint64_t rm_asize;              /* Actual total I/O size */
        uint64_t rm_missingdata;        /* Count of missing data devices */
        uint64_t rm_missingparity;      /* Count of missing parity devices */
        uint64_t rm_firstdatacol;       /* First data column/parity count */
        raidz_col_t rm_col[1];          /* Variable-length array of columns */
} raidz_map_t;
  97 
/* Indices of the P and Q parity columns within raidz_map_t's rm_col[]. */
#define VDEV_RAIDZ_P            0
#define VDEV_RAIDZ_Q            1

/* Largest parity count accepted by vdev_raidz_open(). */
#define VDEV_RAIDZ_MAXPARITY    2

/*
 * Multiply a single byte by 2 in the Galois field: shift left and, if the
 * high bit was set, XOR in 0x1d. Note that `a' is evaluated twice and the
 * shifted value is not masked back down to 8 bits by the macro itself.
 */
#define VDEV_RAIDZ_MUL_2(a)     (((a) << 1) ^ (((a) & 0x80) ? 0x1d : 0))
 104 
/*
 * These two tables represent powers and logs of 2 in the Galois field defined
 * above. These values were computed by repeatedly multiplying by 2 as above.
 *
 * vdev_raidz_pow2[i] is 2^i in the field. The powers cycle with period 255,
 * so the final entry (index 255) repeats the first one (0x01); this lets
 * callers index with an exponent of exactly 255 without reducing it.
 */
static const uint8_t vdev_raidz_pow2[256] = {
        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
        0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
        0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
        0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
        0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
        0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
        0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
        0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
        0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
        0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
        0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
        0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
        0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
        0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
        0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
        0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
        0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
        0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
        0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
        0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
        0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
        0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
        0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
        0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
        0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
        0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
        0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
        0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
        0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
        0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
        0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
        0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/*
 * Logs base 2 in the Galois field: for a non-zero byte a,
 * vdev_raidz_log2[a] is the exponent i (0..254) for which 2^i == a.
 * Zero has no logarithm; its slot simply holds 0, the same value stored for
 * a == 1, so callers have to special-case a == 0 themselves.
 */
static const uint8_t vdev_raidz_log2[256] = {
        0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
        0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
        0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
        0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
        0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
        0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
        0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
        0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
        0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
        0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
        0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
        0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
        0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
        0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
        0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
        0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
        0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
        0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
        0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
        0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
        0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
        0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
        0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
        0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
        0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
        0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
        0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
        0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
        0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
        0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
        0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
        0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
 177 
 178 /*
 179  * Multiply a given number by 2 raised to the given power.
 180  */
 181 static uint8_t
 182 vdev_raidz_exp2(uint_t a, int exp)
 183 {
 184         if (a == 0)
 185                 return (0);
 186 
 187         ASSERT(exp >= 0);
 188         ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
 189 
 190         exp += vdev_raidz_log2[a];
 191         if (exp > 255)
 192                 exp -= 255;
 193 
 194         return (vdev_raidz_pow2[exp]);
 195 }
 196 
/*
 * Build the column map describing how a zio is laid out across the children
 * of a RAID-Z vdev, attach it to the zio as io_vsd, and return it.
 *
 *   zio         I/O to map; io_offset, io_size and io_data are consulted
 *   unit_shift  log2 of the unit size; offsets and sizes are handled in
 *               units of (1 << unit_shift) bytes
 *   dcols       number of columns in a full row (data plus parity)
 *   nparity     number of parity columns
 *
 * The first nparity columns of the map are parity and get freshly allocated
 * buffers; the remaining columns point consecutively into zio->io_data.
 * The map and its parity buffers are released by vdev_raidz_map_free().
 */
static raidz_map_t *
vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
    uint64_t nparity)
{
        raidz_map_t *rm;
        /* Starting offset and size of the I/O, both expressed in units. */
        uint64_t b = zio->io_offset >> unit_shift;
        uint64_t s = zio->io_size >> unit_shift;
        /* Child index of the first column, and byte offset of its row. */
        uint64_t f = b % dcols;
        uint64_t o = (b / dcols) << unit_shift;
        uint64_t q, r, c, bc, col, acols, coff, devidx;

        /*
         * q is the number of units every data column receives; r is the
         * number of data units left over. The leftover units go one each to
         * the first r data columns, and every parity column must be as long
         * as the longest data column, so r + nparity columns are "big".
         */
        q = s / (dcols - nparity);
        r = s - q * (dcols - nparity);
        bc = (r == 0 ? 0 : r + nparity);

        /* An I/O shorter than one full row only touches the big columns. */
        acols = (q == 0 ? bc : dcols);

        rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP);

        rm->rm_cols = acols;
        rm->rm_bigcols = bc;
        rm->rm_asize = 0;
        rm->rm_missingdata = 0;
        rm->rm_missingparity = 0;
        rm->rm_firstdatacol = nparity;

        for (c = 0; c < acols; c++) {
                /*
                 * Columns are assigned to consecutive children starting at
                 * f; running off the last child wraps around to child 0 and
                 * moves one unit further into the device.
                 */
                col = f + c;
                coff = o;
                if (col >= dcols) {
                        col -= dcols;
                        coff += 1ULL << unit_shift;
                }
                rm->rm_col[c].rc_devidx = col;
                rm->rm_col[c].rc_offset = coff;
                /* The first bc columns carry one unit more than the rest. */
                rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift;
                rm->rm_col[c].rc_data = NULL;
                rm->rm_col[c].rc_error = 0;
                rm->rm_col[c].rc_tried = 0;
                rm->rm_col[c].rc_skipped = 0;
                rm->rm_asize += rm->rm_col[c].rc_size;
        }

        /* The total is rounded up to a multiple of (nparity + 1) units. */
        rm->rm_asize = roundup(rm->rm_asize, (nparity + 1) << unit_shift);

        /* Parity columns get their own buffers ... */
        for (c = 0; c < rm->rm_firstdatacol; c++)
                rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);

        /* ... while the data columns are carved out of the caller's buffer. */
        rm->rm_col[c].rc_data = zio->io_data;

        for (c = c + 1; c < acols; c++)
                rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
                    rm->rm_col[c - 1].rc_size;

        /*
         * If all data stored spans all columns, there's a danger that parity
         * will always be on the same device and, since parity isn't read
         * during normal operation, that that device's I/O bandwidth won't be
         * used effectively. We therefore switch the parity every 1MB.
         *
         * ... at least that was, ostensibly, the theory. As a practical
         * matter unless we juggle the parity between all devices evenly, we
         * won't see any benefit. Further, occasional writes that aren't a
         * multiple of the LCM of the number of children and the minimum
         * stripe width are sufficient to avoid pessimal behavior.
         * Unfortunately, this decision created an implicit on-disk format
         * requirement that we need to support for all eternity, but only
         * for single-parity RAID-Z.
         */
        /*
         * NOTE(review): these checks run after rm_col[] has already been
         * written above. A zero-length I/O (s == 0) would give acols == 0,
         * so the parity loop would touch entries beyond the allocation
         * before the assertion fires; presumably callers never issue
         * one -- confirm.
         */
        ASSERT(rm->rm_cols >= 2);
        ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

        /*
         * Single parity only: when bit 20 of the offset is set, exchange the
         * device and offset of columns 0 and 1 so the parity lands where the
         * first data column otherwise would.
         */
        if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
                devidx = rm->rm_col[0].rc_devidx;
                o = rm->rm_col[0].rc_offset;
                rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
                rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
                rm->rm_col[1].rc_devidx = devidx;
                rm->rm_col[1].rc_offset = o;
        }

        zio->io_vsd = rm;
        return (rm);
}
 281 
 282 static void
 283 vdev_raidz_map_free(zio_t *zio)
 284 {
 285         raidz_map_t *rm = zio->io_vsd;
 286         int c;
 287 
 288         for (c = 0; c < rm->rm_firstdatacol; c++)
 289                 zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
 290 
 291         kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols]));
 292         zio->io_vsd = NULL;
 293 }
 294 
 295 static void
 296 vdev_raidz_generate_parity_p(raidz_map_t *rm)
 297 {
 298         uint64_t *p, *src, pcount, ccount, i;
 299         int c;
 300 
 301         pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
 302 
 303         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 304                 src = rm->rm_col[c].rc_data;
 305                 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
 306                 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
 307 
 308                 if (c == rm->rm_firstdatacol) {
 309                         ASSERT(ccount == pcount);
 310                         for (i = 0; i < ccount; i++, p++, src++) {
 311                                 *p = *src;
 312                         }
 313                 } else {
 314                         ASSERT(ccount <= pcount);
 315                         for (i = 0; i < ccount; i++, p++, src++) {
 316                                 *p ^= *src;
 317                         }
 318                 }
 319         }
 320 }
 321 
 322 static void
 323 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
 324 {
 325         uint64_t *q, *p, *src, pcount, ccount, mask, i;
 326         int c;
 327 
 328         pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
 329         ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
 330             rm->rm_col[VDEV_RAIDZ_Q].rc_size);
 331 
 332         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 333                 src = rm->rm_col[c].rc_data;
 334                 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
 335                 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
 336                 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
 337 
 338                 if (c == rm->rm_firstdatacol) {
 339                         ASSERT(ccount == pcount || ccount == 0);
 340                         for (i = 0; i < ccount; i++, p++, q++, src++) {
 341                                 *q = *src;
 342                                 *p = *src;
 343                         }
 344                         for (; i < pcount; i++, p++, q++, src++) {
 345                                 *q = 0;
 346                                 *p = 0;
 347                         }
 348                 } else {
 349                         ASSERT(ccount <= pcount);
 350 
 351                         /*
 352                          * Rather than multiplying each byte individually (as
 353                          * described above), we are able to handle 8 at once
 354                          * by generating a mask based on the high bit in each
 355                          * byte and using that to conditionally XOR in 0x1d.
 356                          */
 357                         for (i = 0; i < ccount; i++, p++, q++, src++) {
 358                                 mask = *q & 0x8080808080808080ULL;
 359                                 mask = (mask << 1) - (mask >> 7);
 360                                 *q = ((*q << 1) & 0xfefefefefefefefeULL) ^
 361                                     (mask & 0x1d1d1d1d1d1d1d1dULL);
 362                                 *q ^= *src;
 363                                 *p ^= *src;
 364                         }
 365 
 366                         /*
 367                          * Treat short columns as though they are full of 0s.
 368                          */
 369                         for (; i < pcount; i++, q++) {
 370                                 mask = *q & 0x8080808080808080ULL;
 371                                 mask = (mask << 1) - (mask >> 7);
 372                                 *q = ((*q << 1) & 0xfefefefefefefefeULL) ^
 373                                     (mask & 0x1d1d1d1d1d1d1d1dULL);
 374                         }
 375                 }
 376         }
 377 }
 378 
 379 static void
 380 vdev_raidz_reconstruct_p(raidz_map_t *rm, int x)
 381 {
 382         uint64_t *dst, *src, xcount, ccount, count, i;
 383         int c;
 384 
 385         xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
 386         ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
 387         ASSERT(xcount > 0);
 388 
 389         src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
 390         dst = rm->rm_col[x].rc_data;
 391         for (i = 0; i < xcount; i++, dst++, src++) {
 392                 *dst = *src;
 393         }
 394 
 395         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 396                 src = rm->rm_col[c].rc_data;
 397                 dst = rm->rm_col[x].rc_data;
 398 
 399                 if (c == x)
 400                         continue;
 401 
 402                 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
 403                 count = MIN(ccount, xcount);
 404 
 405                 for (i = 0; i < count; i++, dst++, src++) {
 406                         *dst ^= *src;
 407                 }
 408         }
 409 }
 410 
 411 static void
 412 vdev_raidz_reconstruct_q(raidz_map_t *rm, int x)
 413 {
 414         uint64_t *dst, *src, xcount, ccount, count, mask, i;
 415         uint8_t *b;
 416         int c, j, exp;
 417 
 418         xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
 419         ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
 420 
 421         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 422                 src = rm->rm_col[c].rc_data;
 423                 dst = rm->rm_col[x].rc_data;
 424 
 425                 if (c == x)
 426                         ccount = 0;
 427                 else
 428                         ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
 429 
 430                 count = MIN(ccount, xcount);
 431 
 432                 if (c == rm->rm_firstdatacol) {
 433                         for (i = 0; i < count; i++, dst++, src++) {
 434                                 *dst = *src;
 435                         }
 436                         for (; i < xcount; i++, dst++) {
 437                                 *dst = 0;
 438                         }
 439 
 440                 } else {
 441                         /*
 442                          * For an explanation of this, see the comment in
 443                          * vdev_raidz_generate_parity_pq() above.
 444                          */
 445                         for (i = 0; i < count; i++, dst++, src++) {
 446                                 mask = *dst & 0x8080808080808080ULL;
 447                                 mask = (mask << 1) - (mask >> 7);
 448                                 *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
 449                                     (mask & 0x1d1d1d1d1d1d1d1dULL);
 450                                 *dst ^= *src;
 451                         }
 452 
 453                         for (; i < xcount; i++, dst++) {
 454                                 mask = *dst & 0x8080808080808080ULL;
 455                                 mask = (mask << 1) - (mask >> 7);
 456                                 *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
 457                                     (mask & 0x1d1d1d1d1d1d1d1dULL);
 458                         }
 459                 }
 460         }
 461 
 462         src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
 463         dst = rm->rm_col[x].rc_data;
 464         exp = 255 - (rm->rm_cols - 1 - x);
 465 
 466         for (i = 0; i < xcount; i++, dst++, src++) {
 467                 *dst ^= *src;
 468                 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
 469                         *b = vdev_raidz_exp2(*b, exp);
 470                 }
 471         }
 472 }
 473 
/*
 * Rebuild two missing data columns, x and y, using both P and Q.
 *
 * x and y are column indices into rm_col[] with x < y; both must be data
 * columns, and column x must be at least as large as column y. The real
 * parity buffers are swapped out for temporaries while this runs and are
 * put back before returning.
 */
static void
vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y)
{
        uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
        void *pdata, *qdata;
        uint64_t xsize, ysize, i;

        ASSERT(x < y);
        ASSERT(x >= rm->rm_firstdatacol);
        ASSERT(y < rm->rm_cols);

        ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);

        /*
         * Move the parity data aside -- we're going to compute parity as
         * though columns x and y were full of zeros -- Pxy and Qxy. We want to
         * reuse the parity generation mechanism without trashing the actual
         * parity so we make those columns appear to be full of zeros by
         * setting their lengths to zero.
         */
        pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
        qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
        xsize = rm->rm_col[x].rc_size;
        ysize = rm->rm_col[y].rc_size;

        rm->rm_col[VDEV_RAIDZ_P].rc_data =
            zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
        rm->rm_col[VDEV_RAIDZ_Q].rc_data =
            zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
        rm->rm_col[x].rc_size = 0;
        rm->rm_col[y].rc_size = 0;

        vdev_raidz_generate_parity_pq(rm);

        /* Put the real column sizes back now that Pxy and Qxy exist. */
        rm->rm_col[x].rc_size = xsize;
        rm->rm_col[y].rc_size = ysize;

        p = pdata;
        q = qdata;
        pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
        qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
        xd = rm->rm_col[x].rc_data;
        yd = rm->rm_col[y].rc_data;

        /*
         * We now have:
         *      Pxy = P + D_x + D_y
         *      Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
         *
         * We can then solve for D_x:
         *      D_x = A * (P + Pxy) + B * (Q + Qxy)
         * where
         *      A = 2^(x - y) * (2^(x - y) + 1)^-1
         *      B = 2^-(ndevs - 1 - x) * (2^(x - y) + 1)^-1
         *
         * With D_x in hand, we can easily solve for D_y:
         *      D_y = P + Pxy + D_x
         *
         * Negative exponents are taken modulo 255 (2^255 == 1 in the field),
         * which is why the table indices below are offset by 255.
         */

        a = vdev_raidz_pow2[255 + x - y];
        b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
        /* Exponent of (2^(x - y) + 1)^-1; field addition of 1 is XOR. */
        tmp = 255 - vdev_raidz_log2[a ^ 1];

        /* Logs of A and B, so the loop can multiply via table lookups. */
        aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
        bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

        /* Byte at a time; column y may be shorter than column x. */
        for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
                *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
                    vdev_raidz_exp2(*q ^ *qxy, bexp);

                if (i < ysize)
                        *yd = *p ^ *pxy ^ *xd;
        }

        zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
            rm->rm_col[VDEV_RAIDZ_P].rc_size);
        zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
            rm->rm_col[VDEV_RAIDZ_Q].rc_size);

        /*
         * Restore the saved parity data.
         */
        rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
        rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
}
 559 
 560 
 561 static int
 562 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
 563 {
 564         vdev_t *cvd;
 565         uint64_t nparity = vd->vdev_nparity;
 566         int c, error;
 567         int lasterror = 0;
 568         int numerrors = 0;
 569 
 570         ASSERT(nparity > 0);
 571 
 572         if (nparity > VDEV_RAIDZ_MAXPARITY ||
 573             vd->vdev_children < nparity + 1) {
 574                 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
 575                 return (EINVAL);
 576         }
 577 
 578         for (c = 0; c < vd->vdev_children; c++) {
 579                 cvd = vd->vdev_child[c];
 580 
 581                 if ((error = vdev_open(cvd)) != 0) {
 582                         lasterror = error;
 583                         numerrors++;
 584                         continue;
 585                 }
 586 
 587                 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
 588                 *ashift = MAX(*ashift, cvd->vdev_ashift);
 589         }
 590 
 591         *asize *= vd->vdev_children;
 592 
 593         if (numerrors > nparity) {
 594                 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
 595                 return (lasterror);
 596         }
 597 
 598         return (0);
 599 }
 600 
 601 static void
 602 vdev_raidz_close(vdev_t *vd)
 603 {
 604         int c;
 605 
 606         for (c = 0; c < vd->vdev_children; c++)
 607                 vdev_close(vd->vdev_child[c]);
 608 }
 609 
 610 static uint64_t
 611 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
 612 {
 613         uint64_t asize;
 614         uint64_t ashift = vd->vdev_top->vdev_ashift;
 615         uint64_t cols = vd->vdev_children;
 616         uint64_t nparity = vd->vdev_nparity;
 617 
 618         asize = ((psize - 1) >> ashift) + 1;
 619         asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
 620         asize = roundup(asize, nparity + 1) << ashift;
 621 
 622         return (asize);
 623 }
 624 
 625 static void
 626 vdev_raidz_child_done(zio_t *zio)
 627 {
 628         raidz_col_t *rc = zio->io_private;
 629 
 630         rc->rc_error = zio->io_error;
 631         rc->rc_tried = 1;
 632         rc->rc_skipped = 0;
 633 }
 634 
 635 static void
 636 vdev_raidz_repair_done(zio_t *zio)
 637 {
 638         ASSERT(zio->io_private == zio->io_parent);
 639         vdev_raidz_map_free(zio->io_private);
 640 }
 641 
/*
 * Start a RAID-Z I/O: build the column map for the zio and issue the child
 * I/Os.
 *
 * For a write, parity is generated and every column is written. For a read,
 * all readable data columns are read; parity columns are read only if some
 * data column is already known to be missing or the zio is a scrub.
 *
 * Returns whatever zio_wait_for_children_done() returns for the zio.
 */
static int
vdev_raidz_io_start(zio_t *zio)
{
        vdev_t *vd = zio->io_vd;
        vdev_t *tvd = vd->vdev_top;
        vdev_t *cvd;
        blkptr_t *bp = zio->io_bp;
        raidz_map_t *rm;
        raidz_col_t *rc;
        int c;

        rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
            vd->vdev_nparity);

        ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));

        if (zio->io_type == ZIO_TYPE_WRITE) {
                /*
                 * Generate RAID parity in the first virtual columns.
                 */
                if (rm->rm_firstdatacol == 1)
                        vdev_raidz_generate_parity_p(rm);
                else
                        vdev_raidz_generate_parity_pq(rm);

                /* Write every column, parity and data alike. */
                for (c = 0; c < rm->rm_cols; c++) {
                        rc = &rm->rm_col[c];
                        cvd = vd->vdev_child[rc->rc_devidx];
                        zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
                            rc->rc_offset, rc->rc_data, rc->rc_size,
                            zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
                            vdev_raidz_child_done, rc));
                }

                return (zio_wait_for_children_done(zio));
        }

        ASSERT(zio->io_type == ZIO_TYPE_READ);

        /*
         * Iterate over the columns in reverse order so that we hit the parity
         * last -- any errors along the way will force us to read the parity
         * data.
         */
        for (c = rm->rm_cols - 1; c >= 0; c--) {
                rc = &rm->rm_col[c];
                cvd = vd->vdev_child[rc->rc_devidx];
                /*
                 * An unreadable child is recorded as missing with ENXIO and
                 * no I/O is issued to it.
                 */
                if (!vdev_readable(cvd)) {
                        if (c >= rm->rm_firstdatacol)
                                rm->rm_missingdata++;
                        else
                                rm->rm_missingparity++;
                        rc->rc_error = ENXIO;
                        rc->rc_tried = 1;    /* don't even try */
                        rc->rc_skipped = 1;
                        continue;
                }
                /*
                 * A child whose DTL covers this block's birth txg is
                 * likewise counted as missing, with ESTALE, and skipped.
                 */
                if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) {
                        if (c >= rm->rm_firstdatacol)
                                rm->rm_missingdata++;
                        else
                                rm->rm_missingparity++;
                        rc->rc_error = ESTALE;
                        rc->rc_skipped = 1;
                        continue;
                }
                /*
                 * Data columns are always read. Parity columns are read only
                 * when a data column is already missing or during a scrub.
                 */
                if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
                    (zio->io_flags & ZIO_FLAG_SCRUB)) {
                        zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
                            rc->rc_offset, rc->rc_data, rc->rc_size,
                            zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
                            vdev_raidz_child_done, rc));
                }
        }

        return (zio_wait_for_children_done(zio));
}
 719 
 720 /*
 721  * Report a checksum error for a child of a RAID-Z device.
 722  */
 723 static void
 724 raidz_checksum_error(zio_t *zio, raidz_col_t *rc)
 725 {
 726         vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
 727         dprintf_bp(zio->io_bp, "imputed checksum error on %s: ",
 728             vdev_description(vd));
 729 
 730         if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
 731                 mutex_enter(&vd->vdev_stat_lock);
 732                 vd->vdev_stat.vs_checksum_errors++;
 733                 mutex_exit(&vd->vdev_stat_lock);
 734         }
 735 
 736         if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE))
 737                 zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
 738                     zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size);
 739 }
 740 
 741 /*
 742  * Regenerate parity from the data columns and compare it against every
 743  * parity column that was read without error. A column whose contents differ
 744  * is charged with a checksum error and marked ECKSUM. Returns the number of
 745  * such mismatches.
 746  */
 747 static int
 748 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
 749 {
 750         void *saved[VDEV_RAIDZ_MAXPARITY];
 751         raidz_col_t *pc;
 752         int i, mismatches = 0;
 753
 754         /* Keep a copy of each parity column that was read successfully. */
 755         for (i = 0; i < rm->rm_firstdatacol; i++) {
 756                 pc = &rm->rm_col[i];
 757                 if (pc->rc_tried && pc->rc_error == 0) {
 758                         saved[i] = zio_buf_alloc(pc->rc_size);
 759                         bcopy(pc->rc_data, saved[i], pc->rc_size);
 760                 }
 761         }
 762
 763         if (rm->rm_firstdatacol != 1)
 764                 vdev_raidz_generate_parity_pq(rm);
 765         else
 766                 vdev_raidz_generate_parity_p(rm);
 767
 768         for (i = 0; i < rm->rm_firstdatacol; i++) {
 769                 pc = &rm->rm_col[i];
 770                 if (pc->rc_tried && pc->rc_error == 0) {
 771                         if (bcmp(saved[i], pc->rc_data, pc->rc_size) != 0) {
 772                                 raidz_checksum_error(zio, pc);
 773                                 pc->rc_error = ECKSUM;
 774                                 mismatches++;
 775                         }
 776                         zio_buf_free(saved[i], pc->rc_size);
 777                 }
 778         }
 779         return (mismatches);
 780 }
 781 
 782 static uint64_t raidz_corrected_p;      /* reads rebuilt using P only */
 783 static uint64_t raidz_corrected_q;      /* reads rebuilt using Q only */
 784 static uint64_t raidz_corrected_pq;     /* reads rebuilt using P and Q */
 785 /*
      * I/O completion handler for a RAID-Z vdev (installed in vdev_raidz_ops).
      *
      * Tallies the per-column errors recorded in the raidz map (zio->io_vsd).
      * For writes, the error is cleared when no more columns failed than there
      * are parity columns and ZIO_FLAG_FAILFAST is not set. Reads go through up
      * to three phases (use what was read, re-read every column, combinatorial
      * reconstruction); a result is accepted only when zio_checksum_error()
      * returns 0. After a successful read with unexpected errors, or under
      * ZIO_FLAG_RESILVER, every column with rc_error set is rewritten from the
      * good data, provided spa_mode has FWRITE set.
      *
      * Returns ZIO_PIPELINE_CONTINUE after freeing the map, or the result of
      * zio_wait_for_children_done() when child I/Os were issued.
      */
 786 static int
 787 vdev_raidz_io_done(zio_t *zio)
 788 {
 789         vdev_t *vd = zio->io_vd;
 790         vdev_t *cvd;
 791         raidz_map_t *rm = zio->io_vsd;
 792         raidz_col_t *rc, *rc1;
 793         int unexpected_errors = 0;
 794         int parity_errors = 0;
 795         int parity_untried = 0;
 796         int data_errors = 0;
 797         int n, c, c1;
 798
 799         ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
 800
             /* Both fields are re-derived from the columns by the loop below. */
 801         zio->io_error = 0;
 802         zio->io_numerrors = 0;
 803
 804         ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
 805         ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
 806
 807         for (c = 0; c < rm->rm_cols; c++) {
 808                 rc = &rm->rm_col[c];
 809
 810                 /*
 811                  * We preserve any EIOs because those may be worth retrying;
 812                  * whereas ECKSUM and ENXIO are more likely to be persistent.
 813                  */
 814                 if (rc->rc_error) {
 815                         if (zio->io_error != EIO)
 816                                 zio->io_error = rc->rc_error;
 817
 818                         if (c < rm->rm_firstdatacol)
 819                                 parity_errors++;
 820                         else
 821                                 data_errors++;
 822
                             /* Errors on rc_skipped columns are not unexpected. */
 823                         if (!rc->rc_skipped)
 824                                 unexpected_errors++;
 825
 826                         zio->io_numerrors++;
 827                 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
 828                         parity_untried++;
 829                 }
 830         }
 831
 832         if (zio->io_type == ZIO_TYPE_WRITE) {
 833                 /*
 834                  * If this is not a failfast write, and we were able to
 835                  * write enough columns to reconstruct the data, good enough.
 836                  */
 837                 /* XXPOLICY */
 838                 if (zio->io_numerrors <= rm->rm_firstdatacol &&
 839                     !(zio->io_flags & ZIO_FLAG_FAILFAST))
 840                         zio->io_error = 0;
 841
 842                 vdev_raidz_map_free(zio);
 843
 844                 return (ZIO_PIPELINE_CONTINUE);
 845         }
 846
 847         ASSERT(zio->io_type == ZIO_TYPE_READ);
 848         /*
 849          * There are three potential phases for a read:
 850          *      1. produce valid data from the columns read
 851          *      2. read all disks and try again
 852          *      3. perform combinatorial reconstruction
 853          *
 854          * Each phase is progressively both more expensive and less likely to
 855          * occur. If we encounter more errors than we can repair or all phases
 856          * fail, we have no choice but to return an error.
 857          */
 858
 859         /*
 860          * If the number of errors we saw was correctable -- less than or equal
 861          * to the number of parity disks read -- attempt to produce data that
 862          * has a valid checksum. Naturally, this case applies in the absence of
 863          * any errors.
 864          */
 865         if (zio->io_numerrors <= rm->rm_firstdatacol - parity_untried) {
 866                 switch (data_errors) {
 867                 case 0:
 868                         if (zio_checksum_error(zio) == 0) {
 869                                 zio->io_error = 0;
 870
 871                                 /*
 872                                  * If we read parity information (unnecessarily
 873                                  * as it happens since no reconstruction was
 874                                  * needed) regenerate and verify the parity.
 875                                  * We also regenerate parity when resilvering
 876                                  * so we can write it out to the failed device
 877                                  * later.
 878                                  */
 879                                 if (parity_errors + parity_untried <
 880                                     rm->rm_firstdatacol ||
 881                                     (zio->io_flags & ZIO_FLAG_RESILVER)) {
 882                                         n = raidz_parity_verify(zio, rm);
 883                                         unexpected_errors += n;
 884                                         ASSERT(parity_errors + n <=
 885                                             rm->rm_firstdatacol);
 886                                 }
 887                                 goto done;
 888                         }
 889                         break;
 890
 891                 case 1:
 892                         /*
 893                          * We either attempt to read all the parity columns or
 894                          * none of them. If we didn't try to read parity, we
 895                          * wouldn't be here in the correctable case. There must
 896                          * also have been fewer parity errors than parity
 897                          * columns or, again, we wouldn't be in this code path.
 898                          */
 899                         ASSERT(parity_untried == 0);
 900                         ASSERT(parity_errors < rm->rm_firstdatacol);
 901
 902                         /*
 903                          * Find the column that reported the error.
 904                          */
 905                         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 906                                 rc = &rm->rm_col[c];
 907                                 if (rc->rc_error != 0)
 908                                         break;
 909                         }
 910                         ASSERT(c != rm->rm_cols);
 911                         ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
 912                             rc->rc_error == ESTALE);
 913
                             /* Prefer P; fall back to Q only when P itself failed. */
 914                         if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
 915                                 vdev_raidz_reconstruct_p(rm, c);
 916                         } else {
 917                                 ASSERT(rm->rm_firstdatacol > 1);
 918                                 vdev_raidz_reconstruct_q(rm, c);
 919                         }
 920
 921                         if (zio_checksum_error(zio) == 0) {
 922                                 zio->io_error = 0;
 923                                 if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0)
 924                                         atomic_inc_64(&raidz_corrected_p);
 925                                 else
 926                                         atomic_inc_64(&raidz_corrected_q);
 927
 928                                 /*
 929                                  * If there's more than one parity disk that
 930                                  * was successfully read, confirm that the
 931                                  * other parity disk produced the correct data.
 932                                  * This routine is suboptimal in that it
 933                                  * regenerates both the parity we wish to test
 934                                  * as well as the parity we just used to
 935                                  * perform the reconstruction, but this should
 936                                  * be a relatively uncommon case, and can be
 937                                  * optimized if it becomes a problem.
 938                                  * We also regenerate parity when resilvering
 939                                  * so we can write it out to the failed device
 940                                  * later.
 941                                  */
 942                                 if (parity_errors < rm->rm_firstdatacol - 1 ||
 943                                     (zio->io_flags & ZIO_FLAG_RESILVER)) {
 944                                         n = raidz_parity_verify(zio, rm);
 945                                         unexpected_errors += n;
 946                                         ASSERT(parity_errors + n <=
 947                                             rm->rm_firstdatacol);
 948                                 }
 949
 950                                 goto done;
 951                         }
 952                         break;
 953
 954                 case 2:
 955                         /*
 956                          * Two data column errors require double parity.
 957                          */
 958                         ASSERT(rm->rm_firstdatacol == 2);
 959
 960                         /*
 961                          * Find the two columns that reported errors.
 962                          */
 963                         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 964                                 rc = &rm->rm_col[c];
 965                                 if (rc->rc_error != 0)
 966                                         break;
 967                         }
 968                         ASSERT(c != rm->rm_cols);
 969                         ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
 970                             rc->rc_error == ESTALE);
 971
                             /* c1 = first bad column; keep scanning after it. */
 972                         for (c1 = c++; c < rm->rm_cols; c++) {
 973                                 rc = &rm->rm_col[c];
 974                                 if (rc->rc_error != 0)
 975                                         break;
 976                         }
 977                         ASSERT(c != rm->rm_cols);
 978                         ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
 979                             rc->rc_error == ESTALE);
 980
 981                         vdev_raidz_reconstruct_pq(rm, c1, c);
 982
 983                         if (zio_checksum_error(zio) == 0) {
 984                                 zio->io_error = 0;
 985                                 atomic_inc_64(&raidz_corrected_pq);
 986
 987                                 goto done;
 988                         }
 989                         break;
 990
 991                 default:
 992                         ASSERT(rm->rm_firstdatacol <= 2);
 993                         ASSERT(0);
 994                 }
 995         }
 996
 997         /*
 998          * This isn't a typical situation -- either we got a read error or
 999          * a child silently returned bad data. Read every block so we can
1000          * try again with as much data and parity as we can track down. If
1001          * we've already been through once before, all children will be marked
1002          * as tried so we'll proceed to combinatorial reconstruction.
1003          */
             /* Forces the repair pass at 'done' if a later phase succeeds. */
1004         unexpected_errors = 1;
1005         rm->rm_missingdata = 0;
1006         rm->rm_missingparity = 0;
1007
1008         for (c = 0; c < rm->rm_cols; c++) {
1009                 if (rm->rm_col[c].rc_tried)
1010                         continue;
1011
                     /*
                      * c is now the first untried column. The do-while below
                      * issues reads for it and every later untried column; its
                      * 'continue' jumps to the ++c test. We return right after,
                      * so the enclosing for loop never iterates again.
                      */
1012                 zio->io_error = 0;
1013                 zio_vdev_io_redone(zio);
1014                 do {
1015                         rc = &rm->rm_col[c];
1016                         if (rc->rc_tried)
1017                                 continue;
1018                         zio_nowait(zio_vdev_child_io(zio, NULL,
1019                             vd->vdev_child[rc->rc_devidx],
1020                             rc->rc_offset, rc->rc_data, rc->rc_size,
1021                             zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
1022                             vdev_raidz_child_done, rc));
1023                 } while (++c < rm->rm_cols);
1024                 dprintf("rereading\n");
1025
1026                 return (zio_wait_for_children_done(zio));
1027         }
1028
1029         /*
1030          * At this point we've attempted to reconstruct the data given the
1031          * errors we detected, and we've attempted to read all columns. There
1032          * must, therefore, be one or more additional problems -- silent errors
1033          * resulting in invalid data rather than explicit I/O errors resulting
1034          * in absent data. Before we attempt combinatorial reconstruction make
1035          * sure we have a chance of coming up with the right answer.
1036          */
1037         if (zio->io_numerrors >= rm->rm_firstdatacol) {
1038                 ASSERT(zio->io_error != 0);
1039                 goto done;
1040         }
1041
1042         if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
1043                 /*
1044                  * Attempt to reconstruct the data from parity P.
1045                  */
1046                 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1047                         void *orig;
1048                         rc = &rm->rm_col[c];
1049
                             /* Save the column so a failed guess can be undone. */
1050                         orig = zio_buf_alloc(rc->rc_size);
1051                         bcopy(rc->rc_data, orig, rc->rc_size);
1052                         vdev_raidz_reconstruct_p(rm, c);
1053
1054                         if (zio_checksum_error(zio) == 0) {
1055                                 zio_buf_free(orig, rc->rc_size);
1056                                 zio->io_error = 0;
1057                                 atomic_inc_64(&raidz_corrected_p);
1058
1059                                 /*
1060                                  * If this child didn't know that it returned
1061                                  * bad data, inform it.
1062                                  */
1063                                 if (rc->rc_tried && rc->rc_error == 0)
1064                                         raidz_checksum_error(zio, rc);
                                     /* Marks the column for rewrite at 'done'. */
1065                                 rc->rc_error = ECKSUM;
1066                                 goto done;
1067                         }
1068
1069                         bcopy(orig, rc->rc_data, rc->rc_size);
1070                         zio_buf_free(orig, rc->rc_size);
1071                 }
1072         }
1073
1074         if (rm->rm_firstdatacol > 1 && rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
1075                 /*
1076                  * Attempt to reconstruct the data from parity Q.
1077                  */
1078                 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1079                         void *orig;
1080                         rc = &rm->rm_col[c];
1081
1082                         orig = zio_buf_alloc(rc->rc_size);
1083                         bcopy(rc->rc_data, orig, rc->rc_size);
1084                         vdev_raidz_reconstruct_q(rm, c);
1085
1086                         if (zio_checksum_error(zio) == 0) {
1087                                 zio_buf_free(orig, rc->rc_size);
1088                                 zio->io_error = 0;
1089                                 atomic_inc_64(&raidz_corrected_q);
1090
1091                                 /*
1092                                  * If this child didn't know that it returned
1093                                  * bad data, inform it.
1094                                  */
1095                                 if (rc->rc_tried && rc->rc_error == 0)
1096                                         raidz_checksum_error(zio, rc);
1097                                 rc->rc_error = ECKSUM;
1098                                 goto done;
1099                         }
1100
1101                         bcopy(orig, rc->rc_data, rc->rc_size);
1102                         zio_buf_free(orig, rc->rc_size);
1103                 }
1104         }
1105
1106         if (rm->rm_firstdatacol > 1 &&
1107             rm->rm_col[VDEV_RAIDZ_P].rc_error == 0 &&
1108             rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
1109                 /*
1110                  * Attempt to reconstruct the data from both P and Q.
1111                  */
                     /* Every pair (c, c1) of data columns with c < c1 is tried. */
1112                 for (c = rm->rm_firstdatacol; c < rm->rm_cols - 1; c++) {
1113                         void *orig, *orig1;
1114                         rc = &rm->rm_col[c];
1115
1116                         orig = zio_buf_alloc(rc->rc_size);
1117                         bcopy(rc->rc_data, orig, rc->rc_size);
1118
1119                         for (c1 = c + 1; c1 < rm->rm_cols; c1++) {
1120                                 rc1 = &rm->rm_col[c1];
1121
1122                                 orig1 = zio_buf_alloc(rc1->rc_size);
1123                                 bcopy(rc1->rc_data, orig1, rc1->rc_size);
1124
1125                                 vdev_raidz_reconstruct_pq(rm, c, c1);
1126
1127                                 if (zio_checksum_error(zio) == 0) {
1128                                         zio_buf_free(orig, rc->rc_size);
1129                                         zio_buf_free(orig1, rc1->rc_size);
1130                                         zio->io_error = 0;
1131                                         atomic_inc_64(&raidz_corrected_pq);
1132
1133                                         /*
1134                                          * If these children didn't know they
1135                                          * returned bad data, inform them.
1136                                          */
1137                                         if (rc->rc_tried && rc->rc_error == 0)
1138                                                 raidz_checksum_error(zio, rc);
1139                                         if (rc1->rc_tried && rc1->rc_error == 0)
1140                                                 raidz_checksum_error(zio, rc1);
1141
1142                                         rc->rc_error = ECKSUM;
1143                                         rc1->rc_error = ECKSUM;
1144
1145                                         goto done;
1146                                 }
1147
1148                                 bcopy(orig1, rc1->rc_data, rc1->rc_size);
1149                                 zio_buf_free(orig1, rc1->rc_size);
1150                         }
1151
1152                         bcopy(orig, rc->rc_data, rc->rc_size);
1153                         zio_buf_free(orig, rc->rc_size);
1154                 }
1155         }
1156
1157         /*
1158          * All combinations failed to checksum. Generate checksum ereports for
1159          * all children.
1160          */
1161         zio->io_error = ECKSUM;
1162         if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1163                 for (c = 0; c < rm->rm_cols; c++) {
1164                         rc = &rm->rm_col[c];
1165                         zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
1166                             zio->io_spa, vd->vdev_child[rc->rc_devidx], zio,
1167                             rc->rc_offset, rc->rc_size);
1168                 }
1169         }
1170
1171 done:
1172         zio_checksum_verified(zio);
1173
1174         if (zio->io_error == 0 && (spa_mode & FWRITE) &&
1175             (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
1176                 zio_t *rio;
1177
1178                 /*
1179                  * Use the good data we have in hand to repair damaged children.
1180                  *
1181                  * We issue all repair I/Os as children of 'rio' to arrange
1182                  * that vdev_raidz_map_free(zio) will be invoked after all
1183                  * repairs complete, but before we advance to the next stage.
1184                  */
1185                 rio = zio_null(zio, zio->io_spa,
1186                     vdev_raidz_repair_done, zio, ZIO_FLAG_CANFAIL);
1187
1188                 for (c = 0; c < rm->rm_cols; c++) {
1189                         rc = &rm->rm_col[c];
1190                         cvd = vd->vdev_child[rc->rc_devidx];
1191
                             /* Only columns flagged with an error are rewritten. */
1192                         if (rc->rc_error == 0)
1193                                 continue;
1194
1195                         dprintf("%s resilvered %s @ 0x%llx error %d\n",
1196                             vdev_description(vd),
1197                             vdev_description(cvd),
1198                             zio->io_offset, rc->rc_error);
1199
1200                         zio_nowait(zio_vdev_child_io(rio, NULL, cvd,
1201                             rc->rc_offset, rc->rc_data, rc->rc_size,
1202                             ZIO_TYPE_WRITE, zio->io_priority,
1203                             ZIO_FLAG_IO_REPAIR | ZIO_FLAG_DONT_PROPAGATE |
1204                             ZIO_FLAG_CANFAIL, NULL, NULL));
1205                 }
1206
1207                 zio_nowait(rio);
1208
1209                 return (zio_wait_for_children_done(zio));
1210         }
1211
1212         vdev_raidz_map_free(zio);
1213
1214         return (ZIO_PIPELINE_CONTINUE);
1215 }
1216 
1217 /* Derive this RAID-Z vdev's state from its faulted/degraded child counts. */
1218 static void
1219 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
1220 {
1221         if (faulted > vd->vdev_nparity)
1222                 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
1223                     VDEV_AUX_NO_REPLICAS);
1224         else
1225                 vdev_set_state(vd, B_FALSE, (faulted + degraded == 0) ?
1226                     VDEV_STATE_HEALTHY : VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
1227 }
1228 /*
      * Packs (vdev_nparity - 1), shifted left by six bits, together with
      * vdev_children into the returned byte. The ASSERT requires that
      * (vdev_nparity - 1) be no greater than 1.
      */
1229 static uint8_t
1230 vdev_raidz_grid(vdev_t *vd)
1231 {
1232         ASSERT(vd->vdev_nparity - 1 <= 1);
1233         return (((vd->vdev_nparity - 1) << 6) | vd->vdev_children);
1234 }
1235 /*
      * Operations vector for the RAID-Z vdev type. The initializer is
      * positional, so entries must stay in the member order of vdev_ops_t.
      */
1236 vdev_ops_t vdev_raidz_ops = {
1237         vdev_raidz_open,
1238         vdev_raidz_close,
1239         NULL,                   /* slot left unset for RAID-Z */
1240         vdev_raidz_asize,
1241         vdev_raidz_io_start,
1242         vdev_raidz_io_done,
1243         vdev_raidz_state_change,
1244         vdev_raidz_grid,
1245         VDEV_TYPE_RAIDZ,        /* name of this vdev type */
1246         B_FALSE                 /* not a leaf vdev */
1247 };