1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 #pragma ident   "%Z%%M% %I%     %E% SMI"
  27 
  28 #include <sys/zfs_context.h>
  29 #include <sys/spa.h>
  30 #include <sys/dmu.h>
  31 #include <sys/zio.h>
  32 #include <sys/space_map.h>
  33 
  34 /*
  35  * Space map routines.
  36  * NOTE: caller is responsible for all locking.
  37  */
  38 static int
  39 space_map_seg_compare(const void *x1, const void *x2)
  40 {
  41         const space_seg_t *s1 = x1;
  42         const space_seg_t *s2 = x2;
  43 
  44         if (s1->ss_start < s2->ss_start) {
  45                 if (s1->ss_end > s2->ss_start)
  46                         return (0);
  47                 return (-1);
  48         }
  49         if (s1->ss_start > s2->ss_start) {
  50                 if (s1->ss_start < s2->ss_end)
  51                         return (0);
  52                 return (1);
  53         }
  54         return (0);
  55 }
  56 
  57 void
  58 space_map_create(space_map_t *sm, uint64_t start, uint64_t size, uint8_t shift,
  59         kmutex_t *lp)
  60 {
  61         bzero(sm, sizeof (*sm));
  62 
  63         avl_create(&sm->sm_root, space_map_seg_compare,
  64             sizeof (space_seg_t), offsetof(struct space_seg, ss_node));
  65 
  66         sm->sm_start = start;
  67         sm->sm_size = size;
  68         sm->sm_shift = shift;
  69         sm->sm_lock = lp;
  70 }
  71 
  72 void
  73 space_map_destroy(space_map_t *sm)
  74 {
  75         ASSERT(!sm->sm_loaded && !sm->sm_loading);
  76         VERIFY3U(sm->sm_space, ==, 0);
  77         avl_destroy(&sm->sm_root);
  78 }
  79 
  80 void
  81 space_map_add(space_map_t *sm, uint64_t start, uint64_t size)
  82 {
  83         avl_index_t where;
  84         space_seg_t ssearch, *ss_before, *ss_after, *ss;
  85         uint64_t end = start + size;
  86         int merge_before, merge_after;
  87 
  88         ASSERT(MUTEX_HELD(sm->sm_lock));
  89         VERIFY(size != 0);
  90         VERIFY3U(start, >=, sm->sm_start);
  91         VERIFY3U(end, <=, sm->sm_start + sm->sm_size);
  92         VERIFY(sm->sm_space + size <= sm->sm_size);
  93         VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0);
  94         VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0);
  95 
  96         ssearch.ss_start = start;
  97         ssearch.ss_end = end;
  98         ss = avl_find(&sm->sm_root, &ssearch, &where);
  99 
 100         if (ss != NULL && ss->ss_start <= start && ss->ss_end >= end) {
 101                 zfs_panic_recover("zfs: allocating allocated segment"
 102                     "(offset=%llu size=%llu)\n",
 103                     (longlong_t)start, (longlong_t)size);
 104                 return;
 105         }
 106 
 107         /* Make sure we don't overlap with either of our neighbors */
 108         VERIFY(ss == NULL);
 109 
 110         ss_before = avl_nearest(&sm->sm_root, where, AVL_BEFORE);
 111         ss_after = avl_nearest(&sm->sm_root, where, AVL_AFTER);
 112 
 113         merge_before = (ss_before != NULL && ss_before->ss_end == start);
 114         merge_after = (ss_after != NULL && ss_after->ss_start == end);
 115 
 116         if (merge_before && merge_after) {
 117                 avl_remove(&sm->sm_root, ss_before);
 118                 ss_after->ss_start = ss_before->ss_start;
 119                 kmem_free(ss_before, sizeof (*ss_before));
 120         } else if (merge_before) {
 121                 ss_before->ss_end = end;
 122         } else if (merge_after) {
 123                 ss_after->ss_start = start;
 124         } else {
 125                 ss = kmem_alloc(sizeof (*ss), KM_SLEEP);
 126                 ss->ss_start = start;
 127                 ss->ss_end = end;
 128                 avl_insert(&sm->sm_root, ss, where);
 129         }
 130 
 131         sm->sm_space += size;
 132 }
 133 
 134 void
 135 space_map_remove(space_map_t *sm, uint64_t start, uint64_t size)
 136 {
 137         avl_index_t where;
 138         space_seg_t ssearch, *ss, *newseg;
 139         uint64_t end = start + size;
 140         int left_over, right_over;
 141 
 142         ASSERT(MUTEX_HELD(sm->sm_lock));
 143         VERIFY(size != 0);
 144         VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0);
 145         VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0);
 146 
 147         ssearch.ss_start = start;
 148         ssearch.ss_end = end;
 149         ss = avl_find(&sm->sm_root, &ssearch, &where);
 150 
 151         /* Make sure we completely overlap with someone */
 152         if (ss == NULL) {
 153                 zfs_panic_recover("zfs: freeing free segment "
 154                     "(offset=%llu size=%llu)",
 155                     (longlong_t)start, (longlong_t)size);
 156                 return;
 157         }
 158         VERIFY3U(ss->ss_start, <=, start);
 159         VERIFY3U(ss->ss_end, >=, end);
 160         VERIFY(sm->sm_space - size <= sm->sm_size);
 161 
 162         left_over = (ss->ss_start != start);
 163         right_over = (ss->ss_end != end);
 164 
 165         if (left_over && right_over) {
 166                 newseg = kmem_alloc(sizeof (*newseg), KM_SLEEP);
 167                 newseg->ss_start = end;
 168                 newseg->ss_end = ss->ss_end;
 169                 ss->ss_end = start;
 170                 avl_insert_here(&sm->sm_root, newseg, ss, AVL_AFTER);
 171         } else if (left_over) {
 172                 ss->ss_end = start;
 173         } else if (right_over) {
 174                 ss->ss_start = end;
 175         } else {
 176                 avl_remove(&sm->sm_root, ss);
 177                 kmem_free(ss, sizeof (*ss));
 178         }
 179 
 180         sm->sm_space -= size;
 181 }
 182 
 183 
 184 
 185 int
 186 space_map_fold(space_map_t *sm, uint8_t oldwidth, uint8_t newwidth)
 187 {
 188         space_seg_t *ss;
 189         uint64_t d, r;
 190 
 191         ASSERT(MUTEX_HELD(sm->sm_lock));
 192 
 193         for (ss = avl_first(&sm->sm_root); ss != NULL;
 194             ss = AVL_NEXT(&sm->sm_root, ss)) {
 195                 d = ss->ss_start / oldwidth;
 196                 r = ss->ss_start % oldwidth;
 197                 ss->ss_start = d * newwidth + r;
 198 
 199                 d = ss->ss_end / oldwidth;
 200                 r = ss->ss_end % oldwidth;
 201                 ss->ss_end = d * newwidth + r;
 202         }
 203 }
 204 
 205 int
 206 space_map_contains(space_map_t *sm, uint64_t start, uint64_t size)
 207 {
 208         avl_index_t where;
 209         space_seg_t ssearch, *ss;
 210         uint64_t end = start + size;
 211 
 212         ASSERT(MUTEX_HELD(sm->sm_lock));
 213         VERIFY(size != 0);
 214         VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0);
 215         VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0);
 216 
 217         ssearch.ss_start = start;
 218         ssearch.ss_end = end;
 219         ss = avl_find(&sm->sm_root, &ssearch, &where);
 220 
 221         return (ss != NULL && ss->ss_start <= start && ss->ss_end >= end);
 222 }
 223 
 224 void
 225 space_map_vacate(space_map_t *sm, space_map_func_t *func, space_map_t *mdest)
 226 {
 227         space_seg_t *ss;
 228         void *cookie = NULL;
 229 
 230         ASSERT(MUTEX_HELD(sm->sm_lock));
 231 
 232         while ((ss = avl_destroy_nodes(&sm->sm_root, &cookie)) != NULL) {
 233                 if (func != NULL)
 234                         func(mdest, ss->ss_start, ss->ss_end - ss->ss_start);
 235                 kmem_free(ss, sizeof (*ss));
 236         }
 237         sm->sm_space = 0;
 238 }
 239 
 240 void
 241 space_map_walk(space_map_t *sm, space_map_func_t *func, space_map_t *mdest)
 242 {
 243         space_seg_t *ss;
 244 
 245         for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
 246                 func(mdest, ss->ss_start, ss->ss_end - ss->ss_start);
 247 }
 248 
 249 void
 250 space_map_excise(space_map_t *sm, uint64_t start, uint64_t size)
 251 {
 252         avl_tree_t *t = &sm->sm_root;
 253         avl_index_t where;
 254         space_seg_t *ss, search;
 255         uint64_t end = start + size;
 256         uint64_t rm_start, rm_end;
 257 
 258         ASSERT(MUTEX_HELD(sm->sm_lock));
 259 
 260         search.ss_start = start;
 261         search.ss_end = start;
 262 
 263         for (;;) {
 264                 ss = avl_find(t, &search, &where);
 265 
 266                 if (ss == NULL)
 267                         ss = avl_nearest(t, where, AVL_AFTER);
 268 
 269                 if (ss == NULL || ss->ss_start >= end)
 270                         break;
 271 
 272                 rm_start = MAX(ss->ss_start, start);
 273                 rm_end = MIN(ss->ss_end, end);
 274 
 275                 space_map_remove(sm, rm_start, rm_end - rm_start);
 276         }
 277 }
 278 
 279 /*
 280  * Replace smd with the union of smd and sms.
 281  */
 282 void
 283 space_map_union(space_map_t *smd, space_map_t *sms)
 284 {
 285         avl_tree_t *t = &sms->sm_root;
 286         space_seg_t *ss;
 287 
 288         ASSERT(MUTEX_HELD(smd->sm_lock));
 289 
 290         /*
 291          * For each source segment, remove any intersections with the
 292          * destination, then add the source segment to the destination.
 293          */
 294         for (ss = avl_first(t); ss != NULL; ss = AVL_NEXT(t, ss)) {
 295                 space_map_excise(smd, ss->ss_start, ss->ss_end - ss->ss_start);
 296                 space_map_add(smd, ss->ss_start, ss->ss_end - ss->ss_start);
 297         }
 298 }
 299 
 300 /*
 301  * Wait for any in-progress space_map_load() to complete.
 302  */
 303 void
 304 space_map_load_wait(space_map_t *sm)
 305 {
 306         ASSERT(MUTEX_HELD(sm->sm_lock));
 307 
 308         while (sm->sm_loading)
 309                 cv_wait(&sm->sm_load_cv, sm->sm_lock);
 310 }
 311 
 312 /*
 313  * Note: space_map_load() will drop sm_lock across dmu_read() calls.
 314  * The caller must be OK with this.
 315  */
 316 int
 317 space_map_load(space_map_t *sm, space_map_ops_t *ops, uint8_t maptype,
 318         space_map_obj_t *smo, objset_t *os)
 319 {
 320         uint64_t *entry, *entry_map, *entry_map_end;
 321         uint64_t bufsize, size, offset, end, space;
 322         uint64_t mapstart = sm->sm_start;
 323         int error = 0;
 324 
 325         ASSERT(MUTEX_HELD(sm->sm_lock));
 326 
 327         space_map_load_wait(sm);
 328 
 329         if (sm->sm_loaded)
 330                 return (0);
 331 
 332         sm->sm_loading = B_TRUE;
 333         end = smo->smo_objsize;
 334         space = smo->smo_alloc;
 335 
 336         ASSERT(sm->sm_ops == NULL);
 337         VERIFY3U(sm->sm_space, ==, 0);
 338 
 339         if (maptype == SM_FREE) {
 340                 space_map_add(sm, sm->sm_start, sm->sm_size);
 341                 space = sm->sm_size - space;
 342         }
 343 
 344         bufsize = 1ULL << SPACE_MAP_BLOCKSHIFT;
 345         entry_map = zio_buf_alloc(bufsize);
 346 
 347         mutex_exit(sm->sm_lock);
 348         if (end > bufsize)
 349                 dmu_prefetch(os, smo->smo_object, bufsize, end - bufsize);
 350         mutex_enter(sm->sm_lock);
 351 
 352         for (offset = 0; offset < end; offset += bufsize) {
 353                 size = MIN(end - offset, bufsize);
 354                 VERIFY(P2PHASE(size, sizeof (uint64_t)) == 0);
 355                 VERIFY(size != 0);
 356 
 357                 dprintf("object=%llu  offset=%llx  size=%llx\n",
 358                     smo->smo_object, offset, size);
 359 
 360                 mutex_exit(sm->sm_lock);
 361                 error = dmu_read(os, smo->smo_object, offset, size, entry_map);
 362                 mutex_enter(sm->sm_lock);
 363                 if (error != 0)
 364                         break;
 365 
 366                 entry_map_end = entry_map + (size / sizeof (uint64_t));
 367                 for (entry = entry_map; entry < entry_map_end; entry++) {
 368                         uint64_t e = *entry;
 369 
 370                         if (SM_IS_DEBUG(e))             /* Skip debug entries */
 371                                 continue;
 372 
 373                         if (SM_IS_SPECIAL(e)) {
 374                                 uint8_t oldwidth, newwidth;
 375                                 ASSERT(SM_SPECIAL_ACTION_DECODE(e) == SM_FOLD);
 376                                 oldwidth = BF64_ENCODE(e, 0, 8);
 377                                 newwidth = BF64_ENCODE(e, 8, 8);
 378                                 space_map_fold(sm, oldwidth, newwidth);
 379                                 continue;
 380                         }
 381 
 382                         (SM_TYPE_DECODE(e) == maptype ?
 383                             space_map_add : space_map_remove)(sm,
 384                             (SM_OFFSET_DECODE(e) << sm->sm_shift) + mapstart,
 385                             SM_RUN_DECODE(e) << sm->sm_shift);
 386                 }
 387         }
 388 
 389         if (error == 0) {
 390                 VERIFY3U(sm->sm_space, ==, space);
 391 
 392                 sm->sm_loaded = B_TRUE;
 393                 sm->sm_ops = ops;
 394                 if (ops != NULL)
 395                         ops->smop_load(sm);
 396         } else {
 397                 space_map_vacate(sm, NULL, NULL);
 398         }
 399 
 400         zio_buf_free(entry_map, bufsize);
 401 
 402         sm->sm_loading = B_FALSE;
 403 
 404         cv_broadcast(&sm->sm_load_cv);
 405 
 406         return (error);
 407 }
 408 
 409 void
 410 space_map_unload(space_map_t *sm)
 411 {
 412         ASSERT(MUTEX_HELD(sm->sm_lock));
 413 
 414         if (sm->sm_loaded && sm->sm_ops != NULL)
 415                 sm->sm_ops->smop_unload(sm);
 416 
 417         sm->sm_loaded = B_FALSE;
 418         sm->sm_ops = NULL;
 419 
 420         space_map_vacate(sm, NULL, NULL);
 421 }
 422 
 423 uint64_t
 424 space_map_alloc(space_map_t *sm, uint64_t size)
 425 {
 426         uint64_t start;
 427 
 428         start = sm->sm_ops->smop_alloc(sm, size);
 429         if (start != -1ULL)
 430                 space_map_remove(sm, start, size);
 431         return (start);
 432 }
 433 
 434 void
 435 space_map_claim(space_map_t *sm, uint64_t start, uint64_t size)
 436 {
 437         sm->sm_ops->smop_claim(sm, start, size);
 438         space_map_remove(sm, start, size);
 439 }
 440 
 441 void
 442 space_map_free(space_map_t *sm, uint64_t start, uint64_t size)
 443 {
 444         space_map_add(sm, start, size);
 445         sm->sm_ops->smop_free(sm, start, size);
 446 }
 447 
 448 /*
 449  * Note: space_map_sync() will drop sm_lock across dmu_write() calls.
 450  */
 451 void
 452 space_map_sync(space_map_t *sm, uint8_t maptype,
 453         space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx)
 454 {
 455         spa_t *spa = dmu_objset_spa(os);
 456         void *cookie = NULL;
 457         space_seg_t *ss;
 458         uint64_t bufsize, start, size, run_len;
 459         uint64_t *entry, *entry_map, *entry_map_end;
 460 
 461         ASSERT(MUTEX_HELD(sm->sm_lock));
 462 
 463         if (sm->sm_space == 0)
 464                 return;
 465 
 466         dprintf("object %4llu, txg %llu, pass %d, %c, count %lu, space %llx\n",
 467             smo->smo_object, dmu_tx_get_txg(tx), spa_sync_pass(spa),
 468             maptype == SM_ALLOC ? 'A' : 'F', avl_numnodes(&sm->sm_root),
 469             sm->sm_space);
 470 
 471         if (maptype == SM_ALLOC)
 472                 smo->smo_alloc += sm->sm_space;
 473         else
 474                 smo->smo_alloc -= sm->sm_space;
 475 
 476         bufsize = (8 + avl_numnodes(&sm->sm_root)) * sizeof (uint64_t);
 477         bufsize = MIN(bufsize, 1ULL << SPACE_MAP_BLOCKSHIFT);
 478         entry_map = zio_buf_alloc(bufsize);
 479         entry_map_end = entry_map + (bufsize / sizeof (uint64_t));
 480         entry = entry_map;
 481 
 482         *entry++ = SM_DEBUG_ENCODE(1) |
 483             SM_DEBUG_ACTION_ENCODE(maptype) |
 484             SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(spa)) |
 485             SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx));
 486 
 487         while ((ss = avl_destroy_nodes(&sm->sm_root, &cookie)) != NULL) {
 488                 size = ss->ss_end - ss->ss_start;
 489                 start = (ss->ss_start - sm->sm_start) >> sm->sm_shift;
 490 
 491                 sm->sm_space -= size;
 492                 size >>= sm->sm_shift;
 493 
 494                 while (size) {
 495                         run_len = MIN(size, SM_RUN_MAX);
 496 
 497                         if (entry == entry_map_end) {
 498                                 mutex_exit(sm->sm_lock);
 499                                 dmu_write(os, smo->smo_object, smo->smo_objsize,
 500                                     bufsize, entry_map, tx);
 501                                 mutex_enter(sm->sm_lock);
 502                                 smo->smo_objsize += bufsize;
 503                                 entry = entry_map;
 504                         }
 505 
 506                         *entry++ = SM_OFFSET_ENCODE(start) |
 507                             SM_TYPE_ENCODE(maptype) |
 508                             SM_RUN_ENCODE(run_len);
 509 
 510                         start += run_len;
 511                         size -= run_len;
 512                 }
 513                 kmem_free(ss, sizeof (*ss));
 514         }
 515 
 516         if (entry != entry_map) {
 517                 size = (entry - entry_map) * sizeof (uint64_t);
 518                 mutex_exit(sm->sm_lock);
 519                 dmu_write(os, smo->smo_object, smo->smo_objsize,
 520                     size, entry_map, tx);
 521                 mutex_enter(sm->sm_lock);
 522                 smo->smo_objsize += size;
 523         }
 524 
 525         zio_buf_free(entry_map, bufsize);
 526 
 527         VERIFY3U(sm->sm_space, ==, 0);
 528 }
 529 
 530 void
 531 space_map_truncate(space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx)
 532 {
 533         VERIFY(dmu_free_range(os, smo->smo_object, 0, -1ULL, tx) == 0);
 534 
 535         smo->smo_objsize = 0;
 536         smo->smo_alloc = 0;
 537 }