1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 #pragma ident   "%Z%%M% %I%     %E% SMI"
  27 
  28 #include <sys/zfs_context.h>
  29 #include <sys/spa.h>
  30 #include <sys/dmu.h>
  31 #include <sys/zio.h>
  32 #include <sys/space_map.h>
  33 
  34 /*
  35  * Space map routines.
  36  * NOTE: caller is responsible for all locking.
  37  */
  38 static int
  39 space_map_seg_compare(const void *x1, const void *x2)
  40 {
  41         const space_seg_t *s1 = x1;
  42         const space_seg_t *s2 = x2;
  43 
  44         if (s1->ss_start < s2->ss_start) {
  45                 if (s1->ss_end > s2->ss_start)
  46                         return (0);
  47                 return (-1);
  48         }
  49         if (s1->ss_start > s2->ss_start) {
  50                 if (s1->ss_start < s2->ss_end)
  51                         return (0);
  52                 return (1);
  53         }
  54         return (0);
  55 }
  56 
  57 void
  58 space_map_create(space_map_t *sm, uint64_t start, uint64_t size, uint8_t shift,
  59         kmutex_t *lp)
  60 {
  61         bzero(sm, sizeof (*sm));
  62 
  63         avl_create(&sm->sm_root, space_map_seg_compare,
  64             sizeof (space_seg_t), offsetof(struct space_seg, ss_node));
  65 
  66         sm->sm_start = start;
  67         sm->sm_size = size;
  68         sm->sm_shift = shift;
  69         sm->sm_lock = lp;
  70 }
  71 
  72 void
  73 space_map_destroy(space_map_t *sm)
  74 {
  75         ASSERT(!sm->sm_loaded && !sm->sm_loading);
  76         VERIFY3U(sm->sm_space, ==, 0);
  77         avl_destroy(&sm->sm_root);
  78 }
  79 
  80 void
  81 space_map_add(space_map_t *sm, uint64_t start, uint64_t size)
  82 {
  83         avl_index_t where;
  84         space_seg_t ssearch, *ss_before, *ss_after, *ss;
  85         uint64_t end = start + size;
  86         int merge_before, merge_after;
  87 
  88         ASSERT(MUTEX_HELD(sm->sm_lock));
  89         VERIFY(size != 0);
  90         VERIFY3U(start, >=, sm->sm_start);
  91         VERIFY3U(end, <=, sm->sm_start + sm->sm_size);
  92         VERIFY(sm->sm_space + size <= sm->sm_size);
  93         VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0);
  94         VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0);
  95 
  96         ssearch.ss_start = start;
  97         ssearch.ss_end = end;
  98         ss = avl_find(&sm->sm_root, &ssearch, &where);
  99 
 100         if (ss != NULL && ss->ss_start <= start && ss->ss_end >= end) {
 101                 zfs_panic_recover("zfs: allocating allocated segment"
 102                     "(offset=%llu size=%llu)\n",
 103                     (longlong_t)start, (longlong_t)size);
 104                 return;
 105         }
 106 
 107         /* Make sure we don't overlap with either of our neighbors */
 108         VERIFY(ss == NULL);
 109 
 110         ss_before = avl_nearest(&sm->sm_root, where, AVL_BEFORE);
 111         ss_after = avl_nearest(&sm->sm_root, where, AVL_AFTER);
 112 
 113         merge_before = (ss_before != NULL && ss_before->ss_end == start);
 114         merge_after = (ss_after != NULL && ss_after->ss_start == end);
 115 
 116         if (merge_before && merge_after) {
 117                 avl_remove(&sm->sm_root, ss_before);
 118                 ss_after->ss_start = ss_before->ss_start;
 119                 kmem_free(ss_before, sizeof (*ss_before));
 120         } else if (merge_before) {
 121                 ss_before->ss_end = end;
 122         } else if (merge_after) {
 123                 ss_after->ss_start = start;
 124         } else {
 125                 ss = kmem_alloc(sizeof (*ss), KM_SLEEP);
 126                 ss->ss_start = start;
 127                 ss->ss_end = end;
 128                 avl_insert(&sm->sm_root, ss, where);
 129         }
 130 
 131         sm->sm_space += size;
 132 }
 133 
 134 void
 135 space_map_remove(space_map_t *sm, uint64_t start, uint64_t size)
 136 {
 137         avl_index_t where;
 138         space_seg_t ssearch, *ss, *newseg;
 139         uint64_t end = start + size;
 140         int left_over, right_over;
 141 
 142         ASSERT(MUTEX_HELD(sm->sm_lock));
 143         VERIFY(size != 0);
 144         VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0);
 145         VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0);
 146 
 147         ssearch.ss_start = start;
 148         ssearch.ss_end = end;
 149         ss = avl_find(&sm->sm_root, &ssearch, &where);
 150 
 151         /* Make sure we completely overlap with someone */
 152         if (ss == NULL) {
 153                 zfs_panic_recover("zfs: freeing free segment "
 154                     "(offset=%llu size=%llu)",
 155                     (longlong_t)start, (longlong_t)size);
 156                 return;
 157         }
 158         VERIFY3U(ss->ss_start, <=, start);
 159         VERIFY3U(ss->ss_end, >=, end);
 160         VERIFY(sm->sm_space - size <= sm->sm_size);
 161 
 162         left_over = (ss->ss_start != start);
 163         right_over = (ss->ss_end != end);
 164 
 165         if (left_over && right_over) {
 166                 newseg = kmem_alloc(sizeof (*newseg), KM_SLEEP);
 167                 newseg->ss_start = end;
 168                 newseg->ss_end = ss->ss_end;
 169                 ss->ss_end = start;
 170                 avl_insert_here(&sm->sm_root, newseg, ss, AVL_AFTER);
 171         } else if (left_over) {
 172                 ss->ss_end = start;
 173         } else if (right_over) {
 174                 ss->ss_start = end;
 175         } else {
 176                 avl_remove(&sm->sm_root, ss);
 177                 kmem_free(ss, sizeof (*ss));
 178         }
 179 
 180         sm->sm_space -= size;
 181 }
 182 
 183 int
 184 space_map_contains(space_map_t *sm, uint64_t start, uint64_t size)
 185 {
 186         avl_index_t where;
 187         space_seg_t ssearch, *ss;
 188         uint64_t end = start + size;
 189 
 190         ASSERT(MUTEX_HELD(sm->sm_lock));
 191         VERIFY(size != 0);
 192         VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0);
 193         VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0);
 194 
 195         ssearch.ss_start = start;
 196         ssearch.ss_end = end;
 197         ss = avl_find(&sm->sm_root, &ssearch, &where);
 198 
 199         return (ss != NULL && ss->ss_start <= start && ss->ss_end >= end);
 200 }
 201 
 202 void
 203 space_map_vacate(space_map_t *sm, space_map_func_t *func, space_map_t *mdest)
 204 {
 205         space_seg_t *ss;
 206         void *cookie = NULL;
 207 
 208         ASSERT(MUTEX_HELD(sm->sm_lock));
 209 
 210         while ((ss = avl_destroy_nodes(&sm->sm_root, &cookie)) != NULL) {
 211                 if (func != NULL)
 212                         func(mdest, ss->ss_start, ss->ss_end - ss->ss_start);
 213                 kmem_free(ss, sizeof (*ss));
 214         }
 215         sm->sm_space = 0;
 216 }
 217 
 218 void
 219 space_map_walk(space_map_t *sm, space_map_func_t *func, space_map_t *mdest)
 220 {
 221         space_seg_t *ss;
 222 
 223         for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
 224                 func(mdest, ss->ss_start, ss->ss_end - ss->ss_start);
 225 }
 226 
 227 void
 228 space_map_excise(space_map_t *sm, uint64_t start, uint64_t size)
 229 {
 230         avl_tree_t *t = &sm->sm_root;
 231         avl_index_t where;
 232         space_seg_t *ss, search;
 233         uint64_t end = start + size;
 234         uint64_t rm_start, rm_end;
 235 
 236         ASSERT(MUTEX_HELD(sm->sm_lock));
 237 
 238         search.ss_start = start;
 239         search.ss_end = start;
 240 
 241         for (;;) {
 242                 ss = avl_find(t, &search, &where);
 243 
 244                 if (ss == NULL)
 245                         ss = avl_nearest(t, where, AVL_AFTER);
 246 
 247                 if (ss == NULL || ss->ss_start >= end)
 248                         break;
 249 
 250                 rm_start = MAX(ss->ss_start, start);
 251                 rm_end = MIN(ss->ss_end, end);
 252 
 253                 space_map_remove(sm, rm_start, rm_end - rm_start);
 254         }
 255 }
 256 
 257 /*
 258  * Replace smd with the union of smd and sms.
 259  */
 260 void
 261 space_map_union(space_map_t *smd, space_map_t *sms)
 262 {
 263         avl_tree_t *t = &sms->sm_root;
 264         space_seg_t *ss;
 265 
 266         ASSERT(MUTEX_HELD(smd->sm_lock));
 267 
 268         /*
 269          * For each source segment, remove any intersections with the
 270          * destination, then add the source segment to the destination.
 271          */
 272         for (ss = avl_first(t); ss != NULL; ss = AVL_NEXT(t, ss)) {
 273                 space_map_excise(smd, ss->ss_start, ss->ss_end - ss->ss_start);
 274                 space_map_add(smd, ss->ss_start, ss->ss_end - ss->ss_start);
 275         }
 276 }
 277 
 278 /*
 279  * Wait for any in-progress space_map_load() to complete.
 280  */
 281 void
 282 space_map_load_wait(space_map_t *sm)
 283 {
 284         ASSERT(MUTEX_HELD(sm->sm_lock));
 285 
 286         while (sm->sm_loading)
 287                 cv_wait(&sm->sm_load_cv, sm->sm_lock);
 288 }
 289 
 290 /*
 291  * Note: space_map_load() will drop sm_lock across dmu_read() calls.
 292  * The caller must be OK with this.
 293  */
 294 int
 295 space_map_load(space_map_t *sm, space_map_ops_t *ops, uint8_t maptype,
 296         space_map_obj_t *smo, objset_t *os)
 297 {
 298         uint64_t *entry, *entry_map, *entry_map_end;
 299         uint64_t bufsize, size, offset, end, space;
 300         uint64_t mapstart = sm->sm_start;
 301         int error = 0;
 302 
 303         ASSERT(MUTEX_HELD(sm->sm_lock));
 304 
 305         space_map_load_wait(sm);
 306 
 307         if (sm->sm_loaded)
 308                 return (0);
 309 
 310         sm->sm_loading = B_TRUE;
 311         end = smo->smo_objsize;
 312         space = smo->smo_alloc;
 313 
 314         ASSERT(sm->sm_ops == NULL);
 315         VERIFY3U(sm->sm_space, ==, 0);
 316 
 317         if (maptype == SM_FREE) {
 318                 space_map_add(sm, sm->sm_start, sm->sm_size);
 319                 space = sm->sm_size - space;
 320         }
 321 
 322         bufsize = 1ULL << SPACE_MAP_BLOCKSHIFT;
 323         entry_map = zio_buf_alloc(bufsize);
 324 
 325         mutex_exit(sm->sm_lock);
 326         if (end > bufsize)
 327                 dmu_prefetch(os, smo->smo_object, bufsize, end - bufsize);
 328         mutex_enter(sm->sm_lock);
 329 
 330         for (offset = 0; offset < end; offset += bufsize) {
 331                 size = MIN(end - offset, bufsize);
 332                 VERIFY(P2PHASE(size, sizeof (uint64_t)) == 0);
 333                 VERIFY(size != 0);
 334 
 335                 dprintf("object=%llu  offset=%llx  size=%llx\n",
 336                     smo->smo_object, offset, size);
 337 
 338                 mutex_exit(sm->sm_lock);
 339                 error = dmu_read(os, smo->smo_object, offset, size, entry_map);
 340                 mutex_enter(sm->sm_lock);
 341                 if (error != 0)
 342                         break;
 343 
 344                 entry_map_end = entry_map + (size / sizeof (uint64_t));
 345                 for (entry = entry_map; entry < entry_map_end; entry++) {
 346                         uint64_t e = *entry;
 347 
 348                         if (SM_DEBUG_DECODE(e))         /* Skip debug entries */
 349                                 continue;
 350 
 351                         (SM_TYPE_DECODE(e) == maptype ?
 352                             space_map_add : space_map_remove)(sm,
 353                             (SM_OFFSET_DECODE(e) << sm->sm_shift) + mapstart,
 354                             SM_RUN_DECODE(e) << sm->sm_shift);
 355                 }
 356         }
 357 
 358         if (error == 0) {
 359                 VERIFY3U(sm->sm_space, ==, space);
 360 
 361                 sm->sm_loaded = B_TRUE;
 362                 sm->sm_ops = ops;
 363                 if (ops != NULL)
 364                         ops->smop_load(sm);
 365         } else {
 366                 space_map_vacate(sm, NULL, NULL);
 367         }
 368 
 369         zio_buf_free(entry_map, bufsize);
 370 
 371         sm->sm_loading = B_FALSE;
 372 
 373         cv_broadcast(&sm->sm_load_cv);
 374 
 375         return (error);
 376 }
 377 
 378 void
 379 space_map_unload(space_map_t *sm)
 380 {
 381         ASSERT(MUTEX_HELD(sm->sm_lock));
 382 
 383         if (sm->sm_loaded && sm->sm_ops != NULL)
 384                 sm->sm_ops->smop_unload(sm);
 385 
 386         sm->sm_loaded = B_FALSE;
 387         sm->sm_ops = NULL;
 388 
 389         space_map_vacate(sm, NULL, NULL);
 390 }
 391 
 392 uint64_t
 393 space_map_alloc(space_map_t *sm, uint64_t size)
 394 {
 395         uint64_t start;
 396 
 397         start = sm->sm_ops->smop_alloc(sm, size);
 398         if (start != -1ULL)
 399                 space_map_remove(sm, start, size);
 400         return (start);
 401 }
 402 
 403 void
 404 space_map_claim(space_map_t *sm, uint64_t start, uint64_t size)
 405 {
 406         sm->sm_ops->smop_claim(sm, start, size);
 407         space_map_remove(sm, start, size);
 408 }
 409 
 410 void
 411 space_map_free(space_map_t *sm, uint64_t start, uint64_t size)
 412 {
 413         space_map_add(sm, start, size);
 414         sm->sm_ops->smop_free(sm, start, size);
 415 }
 416 
 417 /*
 418  * Note: space_map_sync() will drop sm_lock across dmu_write() calls.
 419  */
 420 void
 421 space_map_sync(space_map_t *sm, uint8_t maptype,
 422         space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx)
 423 {
 424         spa_t *spa = dmu_objset_spa(os);
 425         void *cookie = NULL;
 426         space_seg_t *ss;
 427         uint64_t bufsize, start, size, run_len;
 428         uint64_t *entry, *entry_map, *entry_map_end;
 429 
 430         ASSERT(MUTEX_HELD(sm->sm_lock));
 431 
 432         if (sm->sm_space == 0)
 433                 return;
 434 
 435         dprintf("object %4llu, txg %llu, pass %d, %c, count %lu, space %llx\n",
 436             smo->smo_object, dmu_tx_get_txg(tx), spa_sync_pass(spa),
 437             maptype == SM_ALLOC ? 'A' : 'F', avl_numnodes(&sm->sm_root),
 438             sm->sm_space);
 439 
 440         if (maptype == SM_ALLOC)
 441                 smo->smo_alloc += sm->sm_space;
 442         else
 443                 smo->smo_alloc -= sm->sm_space;
 444 
 445         bufsize = (8 + avl_numnodes(&sm->sm_root)) * sizeof (uint64_t);
 446         bufsize = MIN(bufsize, 1ULL << SPACE_MAP_BLOCKSHIFT);
 447         entry_map = zio_buf_alloc(bufsize);
 448         entry_map_end = entry_map + (bufsize / sizeof (uint64_t));
 449         entry = entry_map;
 450 
 451         *entry++ = SM_DEBUG_ENCODE(1) |
 452             SM_DEBUG_ACTION_ENCODE(maptype) |
 453             SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(spa)) |
 454             SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx));
 455 
 456         while ((ss = avl_destroy_nodes(&sm->sm_root, &cookie)) != NULL) {
 457                 size = ss->ss_end - ss->ss_start;
 458                 start = (ss->ss_start - sm->sm_start) >> sm->sm_shift;
 459 
 460                 sm->sm_space -= size;
 461                 size >>= sm->sm_shift;
 462 
 463                 while (size) {
 464                         run_len = MIN(size, SM_RUN_MAX);
 465 
 466                         if (entry == entry_map_end) {
 467                                 mutex_exit(sm->sm_lock);
 468                                 dmu_write(os, smo->smo_object, smo->smo_objsize,
 469                                     bufsize, entry_map, tx);
 470                                 mutex_enter(sm->sm_lock);
 471                                 smo->smo_objsize += bufsize;
 472                                 entry = entry_map;
 473                         }
 474 
 475                         *entry++ = SM_OFFSET_ENCODE(start) |
 476                             SM_TYPE_ENCODE(maptype) |
 477                             SM_RUN_ENCODE(run_len);
 478 
 479                         start += run_len;
 480                         size -= run_len;
 481                 }
 482                 kmem_free(ss, sizeof (*ss));
 483         }
 484 
 485         if (entry != entry_map) {
 486                 size = (entry - entry_map) * sizeof (uint64_t);
 487                 mutex_exit(sm->sm_lock);
 488                 dmu_write(os, smo->smo_object, smo->smo_objsize,
 489                     size, entry_map, tx);
 490                 mutex_enter(sm->sm_lock);
 491                 smo->smo_objsize += size;
 492         }
 493 
 494         zio_buf_free(entry_map, bufsize);
 495 
 496         VERIFY3U(sm->sm_space, ==, 0);
 497 }
 498 
 499 void
 500 space_map_truncate(space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx)
 501 {
 502         VERIFY(dmu_free_range(os, smo->smo_object, 0, -1ULL, tx) == 0);
 503 
 504         smo->smo_objsize = 0;
 505         smo->smo_alloc = 0;
 506 }