Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux...
author    Linus Torvalds <torvalds@linux-foundation.org>
          Tue, 18 Dec 2012 17:42:05 +0000 (09:42 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
          Tue, 18 Dec 2012 17:42:05 +0000 (09:42 -0800)
Pull btrfs update from Chris Mason:
 "A big set of fixes and features.

  In terms of line count, most of the code comes from Stefan, who added
  the ability to replace a single drive in place.  This is different
  from how btrfs normally replaces drives, and is much, much faster.

  Josef is plowing through our synchronous write performance.  This pull
  request does not include the DIO_OWN_WAITING patch that was discussed
  on the list, but it has a number of other improvements to cut down our
  latencies and CPU time during fsync/O_DIRECT writes.

  Miao Xie has a big series of fixes and is spreading out ordered
  operations over more CPUs.  This improves performance and reduces
  contention.

  I've put in fixes for error handling around hash collisions.  These
  are going back to individual stable kernels as I test against them.

  Otherwise we have a lot of fixes and cleanups, thanks everyone!
  raid5/6 is being rebased against the device replacement code.  I'll
  have it posted this Friday along with a nice series of benchmarks."
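
As context for Stefan's device replace feature described above: the operation is driven from userspace through a new ioctl added by this series (fs/btrfs/ioctl.h gains BTRFS_IOC_DEV_REPLACE and struct btrfs_ioctl_dev_replace_args; the btrfs-progs side is `btrfs replace start`). A minimal, hedged userspace sketch follows; the field and constant names are assumptions taken from that series and are not shown on this page:

#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include "ioctl.h"	/* assumed: the btrfs ioctl header from this series */

static int start_replace(const char *mnt, const char *src, const char *tgt)
{
	struct btrfs_ioctl_dev_replace_args args;
	int fd = open(mnt, O_RDONLY);

	if (fd < 0)
		return -1;
	memset(&args, 0, sizeof(args));
	args.cmd = BTRFS_IOCTL_DEV_REPLACE_CMD_START;	/* assumed constant */
	strncpy((char *)args.start.srcdev_name, src,
		sizeof(args.start.srcdev_name) - 1);
	strncpy((char *)args.start.tgtdev_name, tgt,
		sizeof(args.start.tgtdev_name) - 1);
	return ioctl(fd, BTRFS_IOC_DEV_REPLACE, &args);
}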

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs: (115 commits)
  Btrfs: fix a bug of per-file nocow
  Btrfs: fix hash overflow handling
  Btrfs: don't take inode delalloc mutex if we're a free space inode
  Btrfs: fix autodefrag and umount lockup
  Btrfs: fix permissions of empty files not affected by umask
  Btrfs: put raid properties into global table
  Btrfs: fix BUG() in scrub when first superblock reading gives EIO
  Btrfs: do not call file_update_time in aio_write
  Btrfs: only unlock and relock if we have to
  Btrfs: use tokens where we can in the tree log
  Btrfs: optimize leaf_space_used
  Btrfs: don't memset new tokens
  Btrfs: only clear dirty on the buffer if it is marked as dirty
  Btrfs: move checks in set_page_dirty under DEBUG
  Btrfs: log changed inodes based on the extent map tree
  Btrfs: add path->really_keep_locks
  Btrfs: do not mark ems as prealloc if we are writing to them
  Btrfs: keep track of the extents original block length
  Btrfs: inline csums if we're fsyncing
  Btrfs: don't bother copying if we're only logging the inode
  ...

43 files changed:
fs/btrfs/Makefile
fs/btrfs/acl.c
fs/btrfs/backref.c
fs/btrfs/btrfs_inode.h
fs/btrfs/check-integrity.c
fs/btrfs/compression.c
fs/btrfs/ctree.c
fs/btrfs/ctree.h
fs/btrfs/delayed-inode.c
fs/btrfs/dev-replace.c [new file with mode: 0644]
fs/btrfs/dev-replace.h [new file with mode: 0644]
fs/btrfs/dir-item.c
fs/btrfs/disk-io.c
fs/btrfs/disk-io.h
fs/btrfs/extent-tree.c
fs/btrfs/extent_io.c
fs/btrfs/extent_io.h
fs/btrfs/extent_map.c
fs/btrfs/extent_map.h
fs/btrfs/file-item.c
fs/btrfs/file.c
fs/btrfs/free-space-cache.c
fs/btrfs/inode-map.c
fs/btrfs/inode.c
fs/btrfs/ioctl.c
fs/btrfs/ioctl.h
fs/btrfs/math.h [new file with mode: 0644]
fs/btrfs/ordered-data.c
fs/btrfs/ordered-data.h
fs/btrfs/print-tree.c
fs/btrfs/reada.c
fs/btrfs/relocation.c
fs/btrfs/root-tree.c
fs/btrfs/scrub.c
fs/btrfs/send.c
fs/btrfs/super.c
fs/btrfs/transaction.c
fs/btrfs/transaction.h
fs/btrfs/tree-log.c
fs/btrfs/volumes.c
fs/btrfs/volumes.h
fs/btrfs/xattr.c
include/trace/events/btrfs.h

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index d7fcdba141a2f2508999003344008458c2cf48ea..7df3e0f0ee512b7d6e7082c2c288e739ed1f6505 100644
@@ -8,7 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
           extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
           export.o tree-log.o free-space-cache.o zlib.o lzo.o \
           compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
-          reada.o backref.o ulist.o qgroup.o send.o
+          reada.o backref.o ulist.o qgroup.o send.o dev-replace.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 0c16e3dbfd5681ea67d4cf2f349cb33083a46d2d..e15d2b0d8d3b20f3085c18348e9d711682fedc24 100644
@@ -121,6 +121,8 @@ static int btrfs_set_acl(struct btrfs_trans_handle *trans,
                        ret = posix_acl_equiv_mode(acl, &inode->i_mode);
                        if (ret < 0)
                                return ret;
+                       if (ret == 0)
+                               acl = NULL;
                }
                ret = 0;
                break;
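
The acl.c hunk above is the "fix permissions of empty files not affected by umask" change from the shortlog. posix_acl_equiv_mode() returns 0 when the ACL carries no information beyond the mode bits; before this fix btrfs stored such an ACL anyway, leaving new files with permissions that ignored the umask. An annotated sketch of the resulting idiom (the comments are added here, not part of the patch):

	ret = posix_acl_equiv_mode(acl, &inode->i_mode);
	if (ret < 0)
		return ret;	/* malformed ACL */
	if (ret == 0)
		acl = NULL;	/* fully expressed by i_mode: don't store it */
	/* ret > 0: extra entries present, fall through and set the xattr */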
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 208d8aa5b07e488f1f39cb0877ff46bb5d08d5a6..04edf69be87561318375716abeb1d854370c63c3 100644
@@ -461,6 +461,7 @@ static int __merge_refs(struct list_head *head, int mode)
                     pos2 = n2, n2 = pos2->next) {
                        struct __prelim_ref *ref2;
                        struct __prelim_ref *xchg;
+                       struct extent_inode_elem *eie;
 
                        ref2 = list_entry(pos2, struct __prelim_ref, list);
 
@@ -472,12 +473,20 @@ static int __merge_refs(struct list_head *head, int mode)
                                        ref1 = ref2;
                                        ref2 = xchg;
                                }
-                               ref1->count += ref2->count;
                        } else {
                                if (ref1->parent != ref2->parent)
                                        continue;
-                               ref1->count += ref2->count;
                        }
+
+                       eie = ref1->inode_list;
+                       while (eie && eie->next)
+                               eie = eie->next;
+                       if (eie)
+                               eie->next = ref2->inode_list;
+                       else
+                               ref1->inode_list = ref2->inode_list;
+                       ref1->count += ref2->count;
+
                        list_del(&ref2->list);
                        kfree(ref2);
                }
@@ -890,8 +899,7 @@ again:
        while (!list_empty(&prefs)) {
                ref = list_first_entry(&prefs, struct __prelim_ref, list);
                list_del(&ref->list);
-               if (ref->count < 0)
-                       WARN_ON(1);
+               WARN_ON(ref->count < 0);
                if (ref->count && ref->root_id && ref->parent == 0) {
                        /* no parent == root of tree */
                        ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
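
In the __merge_refs() hunk above, the count aggregation moves out of the two branches and ref2's inode_list is now spliced onto the tail of ref1's list before ref2 is freed; previously those list entries were simply lost with the kfree(). The splice is the usual singly-linked tail append (names as in the hunk, comments added here):

	eie = ref1->inode_list;
	while (eie && eie->next)		/* walk to the tail of ref1's chain */
		eie = eie->next;
	if (eie)
		eie->next = ref2->inode_list;	/* append ref2's chain */
	else
		ref1->inode_list = ref2->inode_list;	/* ref1 had no chain */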
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index ed8ca7ca5eff2d2ce0529f42d32fb394b0331e89..2a8c242bc4f5486f8173bbc0c1f41eeef92b81e7 100644
@@ -39,6 +39,7 @@
 #define BTRFS_INODE_HAS_ORPHAN_ITEM            5
 #define BTRFS_INODE_HAS_ASYNC_EXTENT           6
 #define BTRFS_INODE_NEEDS_FULL_SYNC            7
+#define BTRFS_INODE_COPY_EVERYTHING            8
 
 /* in memory btrfs inode */
 struct btrfs_inode {
@@ -90,6 +91,9 @@ struct btrfs_inode {
 
        unsigned long runtime_flags;
 
+       /* Keep track of who is O_SYNC/fsyncing currently */
+       atomic_t sync_writers;
+
        /* full 64 bit generation number, struct vfs_inode doesn't have a big
         * enough field for this.
         */
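
The new sync_writers counter pairs with the fsync/O_DIRECT latency work mentioned in the pull message: synchronous writers announce themselves so the submission path can checksum inline instead of bouncing through the async csum worker ("inline csums if we're fsyncing" in the shortlog). A hedged sketch of the pattern; the real call sites live in file.c/inode.c hunks not shown on this page, and the helper names here are hypothetical:

	/* writer side: bracket the synchronous write */
	atomic_inc(&BTRFS_I(inode)->sync_writers);
	ret = do_sync_write_and_wait(inode);	/* hypothetical stand-in */
	atomic_dec(&BTRFS_I(inode)->sync_writers);

	/* submit side: someone is waiting, so don't defer the csum work */
	if (atomic_read(&BTRFS_I(inode)->sync_writers))
		csum_inline = 1;		/* hypothetical flag */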
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 5a3e45db642a6b1c64998ba523be143d382d396e..11d47bfb62b418f6f4d5459d8c02a18b05c3731b 100644
@@ -137,7 +137,7 @@ struct btrfsic_block {
        unsigned int never_written:1;   /* block was added because it was
                                         * referenced, not because it was
                                         * written */
-       unsigned int mirror_num:2;      /* large enough to hold
+       unsigned int mirror_num;        /* large enough to hold
                                         * BTRFS_SUPER_MIRROR_MAX */
        struct btrfsic_dev_state *dev_state;
        u64 dev_bytenr;         /* key, physical byte num on disk */
@@ -723,7 +723,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
                }
 
                num_copies =
-                   btrfs_num_copies(&state->root->fs_info->mapping_tree,
+                   btrfs_num_copies(state->root->fs_info,
                                     next_bytenr, state->metablock_size);
                if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
                        printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -903,7 +903,7 @@ static int btrfsic_process_superblock_dev_mirror(
                }
 
                num_copies =
-                   btrfs_num_copies(&state->root->fs_info->mapping_tree,
+                   btrfs_num_copies(state->root->fs_info,
                                     next_bytenr, state->metablock_size);
                if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
                        printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -1287,7 +1287,7 @@ static int btrfsic_create_link_to_next_block(
        *next_blockp = NULL;
        if (0 == *num_copiesp) {
                *num_copiesp =
-                   btrfs_num_copies(&state->root->fs_info->mapping_tree,
+                   btrfs_num_copies(state->root->fs_info,
                                     next_bytenr, state->metablock_size);
                if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
                        printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -1489,7 +1489,7 @@ static int btrfsic_handle_extent_data(
                        chunk_len = num_bytes;
 
                num_copies =
-                   btrfs_num_copies(&state->root->fs_info->mapping_tree,
+                   btrfs_num_copies(state->root->fs_info,
                                     next_bytenr, state->datablock_size);
                if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
                        printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -1582,9 +1582,21 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
        struct btrfs_device *device;
 
        length = len;
-       ret = btrfs_map_block(&state->root->fs_info->mapping_tree, READ,
+       ret = btrfs_map_block(state->root->fs_info, READ,
                              bytenr, &length, &multi, mirror_num);
 
+       if (ret) {
+               block_ctx_out->start = 0;
+               block_ctx_out->dev_bytenr = 0;
+               block_ctx_out->len = 0;
+               block_ctx_out->dev = NULL;
+               block_ctx_out->datav = NULL;
+               block_ctx_out->pagev = NULL;
+               block_ctx_out->mem_to_free = NULL;
+
+               return ret;
+       }
+
        device = multi->stripes[0].dev;
        block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev);
        block_ctx_out->dev_bytenr = multi->stripes[0].physical;
@@ -1594,8 +1606,7 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
        block_ctx_out->pagev = NULL;
        block_ctx_out->mem_to_free = NULL;
 
-       if (0 == ret)
-               kfree(multi);
+       kfree(multi);
        if (NULL == block_ctx_out->dev) {
                ret = -ENXIO;
                printk(KERN_INFO "btrfsic: error, cannot lookup dev (#1)!\n");
@@ -2463,7 +2474,7 @@ static int btrfsic_process_written_superblock(
                }
 
                num_copies =
-                   btrfs_num_copies(&state->root->fs_info->mapping_tree,
+                   btrfs_num_copies(state->root->fs_info,
                                     next_bytenr, BTRFS_SUPER_INFO_SIZE);
                if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
                        printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
@@ -2960,7 +2971,7 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
        struct btrfsic_block_data_ctx block_ctx;
        int match = 0;
 
-       num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree,
+       num_copies = btrfs_num_copies(state->root->fs_info,
                                      bytenr, state->metablock_size);
 
        for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index c6467aa88bee24fb3f4fe401306aed57442a0a59..94ab2f80e7e3154c517bfae0f873db552f0d052b 100644
@@ -687,7 +687,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 
                        ret = btrfs_map_bio(root, READ, comp_bio,
                                            mirror_num, 0);
-                       BUG_ON(ret); /* -ENOMEM */
+                       if (ret)
+                               bio_endio(comp_bio, ret);
 
                        bio_put(comp_bio);
 
@@ -712,7 +713,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
        }
 
        ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
-       BUG_ON(ret); /* -ENOMEM */
+       if (ret)
+               bio_endio(comp_bio, ret);
 
        bio_put(comp_bio);
        return 0;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index cdfb4c49a806ad4ba0ebe83b5569a4121a10a220..c7b67cf24bba54f85a6eb488ea899190961bac09 100644
@@ -38,8 +38,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
                              struct extent_buffer *dst_buf,
                              struct extent_buffer *src_buf);
 static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-                   struct btrfs_path *path, int level, int slot,
-                   int tree_mod_log);
+                   struct btrfs_path *path, int level, int slot);
 static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
                                 struct extent_buffer *eb);
 struct extent_buffer *read_old_tree_block(struct btrfs_root *root, u64 bytenr,
@@ -776,8 +775,7 @@ tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
 
 static noinline void
 tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
-                         struct extent_buffer *eb,
-                         struct btrfs_disk_key *disk_key, int slot, int atomic)
+                         struct extent_buffer *eb, int slot, int atomic)
 {
        int ret;
 
@@ -1140,13 +1138,13 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
                switch (tm->op) {
                case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
                        BUG_ON(tm->slot < n);
-               case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
                case MOD_LOG_KEY_REMOVE:
+                       n++;
+               case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
                        btrfs_set_node_key(eb, &tm->key, tm->slot);
                        btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
                        btrfs_set_node_ptr_generation(eb, tm->slot,
                                                      tm->generation);
-                       n++;
                        break;
                case MOD_LOG_KEY_REPLACE:
                        BUG_ON(tm->slot >= n);
@@ -1361,19 +1359,16 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
        u64 search_start;
        int ret;
 
-       if (trans->transaction != root->fs_info->running_transaction) {
-               printk(KERN_CRIT "trans %llu running %llu\n",
+       if (trans->transaction != root->fs_info->running_transaction)
+               WARN(1, KERN_CRIT "trans %llu running %llu\n",
                       (unsigned long long)trans->transid,
                       (unsigned long long)
                       root->fs_info->running_transaction->transid);
-               WARN_ON(1);
-       }
-       if (trans->transid != root->fs_info->generation) {
-               printk(KERN_CRIT "trans %llu running %llu\n",
+
+       if (trans->transid != root->fs_info->generation)
+               WARN(1, KERN_CRIT "trans %llu running %llu\n",
                       (unsigned long long)trans->transid,
                       (unsigned long long)root->fs_info->generation);
-               WARN_ON(1);
-       }
 
        if (!should_cow_block(trans, root, buf)) {
                *cow_ret = buf;
@@ -1469,10 +1464,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
        if (cache_only && parent_level != 1)
                return 0;
 
-       if (trans->transaction != root->fs_info->running_transaction)
-               WARN_ON(1);
-       if (trans->transid != root->fs_info->generation)
-               WARN_ON(1);
+       WARN_ON(trans->transaction != root->fs_info->running_transaction);
+       WARN_ON(trans->transid != root->fs_info->generation);
 
        parent_nritems = btrfs_header_nritems(parent);
        blocksize = btrfs_level_size(root, parent_level - 1);
@@ -1827,7 +1820,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                if (btrfs_header_nritems(right) == 0) {
                        clean_tree_block(trans, root, right);
                        btrfs_tree_unlock(right);
-                       del_ptr(trans, root, path, level + 1, pslot + 1, 1);
+                       del_ptr(trans, root, path, level + 1, pslot + 1);
                        root_sub_used(root, right->len);
                        btrfs_free_tree_block(trans, root, right, 0, 1);
                        free_extent_buffer_stale(right);
@@ -1836,7 +1829,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                        struct btrfs_disk_key right_key;
                        btrfs_node_key(right, &right_key, 0);
                        tree_mod_log_set_node_key(root->fs_info, parent,
-                                                 &right_key, pslot + 1, 0);
+                                                 pslot + 1, 0);
                        btrfs_set_node_key(parent, &right_key, pslot + 1);
                        btrfs_mark_buffer_dirty(parent);
                }
@@ -1871,7 +1864,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
        if (btrfs_header_nritems(mid) == 0) {
                clean_tree_block(trans, root, mid);
                btrfs_tree_unlock(mid);
-               del_ptr(trans, root, path, level + 1, pslot, 1);
+               del_ptr(trans, root, path, level + 1, pslot);
                root_sub_used(root, mid->len);
                btrfs_free_tree_block(trans, root, mid, 0, 1);
                free_extent_buffer_stale(mid);
@@ -1880,7 +1873,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                /* update the parent key to reflect our changes */
                struct btrfs_disk_key mid_key;
                btrfs_node_key(mid, &mid_key, 0);
-               tree_mod_log_set_node_key(root->fs_info, parent, &mid_key,
+               tree_mod_log_set_node_key(root->fs_info, parent,
                                          pslot, 0);
                btrfs_set_node_key(parent, &mid_key, pslot);
                btrfs_mark_buffer_dirty(parent);
@@ -1980,7 +1973,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
                        orig_slot += left_nr;
                        btrfs_node_key(mid, &disk_key, 0);
                        tree_mod_log_set_node_key(root->fs_info, parent,
-                                                 &disk_key, pslot, 0);
+                                                 pslot, 0);
                        btrfs_set_node_key(parent, &disk_key, pslot);
                        btrfs_mark_buffer_dirty(parent);
                        if (btrfs_header_nritems(left) > orig_slot) {
@@ -2033,7 +2026,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 
                        btrfs_node_key(right, &disk_key, 0);
                        tree_mod_log_set_node_key(root->fs_info, parent,
-                                                 &disk_key, pslot + 1, 0);
+                                                 pslot + 1, 0);
                        btrfs_set_node_key(parent, &disk_key, pslot + 1);
                        btrfs_mark_buffer_dirty(parent);
 
@@ -2219,6 +2212,9 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
        int no_skips = 0;
        struct extent_buffer *t;
 
+       if (path->really_keep_locks)
+               return;
+
        for (i = level; i < BTRFS_MAX_LEVEL; i++) {
                if (!path->nodes[i])
                        break;
@@ -2266,7 +2262,7 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
 {
        int i;
 
-       if (path->keep_locks)
+       if (path->keep_locks || path->really_keep_locks)
                return;
 
        for (i = level; i < BTRFS_MAX_LEVEL; i++) {
@@ -2499,7 +2495,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
        if (!cow)
                write_lock_level = -1;
 
-       if (cow && (p->keep_locks || p->lowest_level))
+       if (cow && (p->really_keep_locks || p->keep_locks || p->lowest_level))
                write_lock_level = BTRFS_MAX_LEVEL;
 
        min_write_lock_level = write_lock_level;
@@ -2568,7 +2564,10 @@ again:
                         * must have write locks on this node and the
                         * parent
                         */
-                       if (level + 1 > write_lock_level) {
+                       if (level > write_lock_level ||
+                           (level + 1 > write_lock_level &&
+                           level + 1 < BTRFS_MAX_LEVEL &&
+                           p->nodes[level + 1])) {
                                write_lock_level = level + 1;
                                btrfs_release_path(p);
                                goto again;
@@ -2917,7 +2916,7 @@ static void fixup_low_keys(struct btrfs_trans_handle *trans,
                if (!path->nodes[i])
                        break;
                t = path->nodes[i];
-               tree_mod_log_set_node_key(root->fs_info, t, key, tslot, 1);
+               tree_mod_log_set_node_key(root->fs_info, t, tslot, 1);
                btrfs_set_node_key(t, key, tslot);
                btrfs_mark_buffer_dirty(path->nodes[i]);
                if (tslot != 0)
@@ -3302,14 +3301,21 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
  */
 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
 {
+       struct btrfs_item *start_item;
+       struct btrfs_item *end_item;
+       struct btrfs_map_token token;
        int data_len;
        int nritems = btrfs_header_nritems(l);
        int end = min(nritems, start + nr) - 1;
 
        if (!nr)
                return 0;
-       data_len = btrfs_item_end_nr(l, start);
-       data_len = data_len - btrfs_item_offset_nr(l, end);
+       btrfs_init_map_token(&token);
+       start_item = btrfs_item_nr(l, start);
+       end_item = btrfs_item_nr(l, end);
+       data_len = btrfs_token_item_offset(l, start_item, &token) +
+               btrfs_token_item_size(l, start_item, &token);
+       data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
        data_len += sizeof(struct btrfs_item) * nr;
        WARN_ON(data_len < 0);
        return data_len;
@@ -3403,8 +3409,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
        if (push_items == 0)
                goto out_unlock;
 
-       if (!empty && push_items == left_nritems)
-               WARN_ON(1);
+       WARN_ON(!empty && push_items == left_nritems);
 
        /* push left to right */
        right_nritems = btrfs_header_nritems(right);
@@ -3642,11 +3647,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
        btrfs_set_header_nritems(left, old_left_nritems + push_items);
 
        /* fixup right node */
-       if (push_items > right_nritems) {
-               printk(KERN_CRIT "push items %d nr %u\n", push_items,
+       if (push_items > right_nritems)
+               WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
                       right_nritems);
-               WARN_ON(1);
-       }
 
        if (push_items < right_nritems) {
                push_space = btrfs_item_offset_nr(right, push_items - 1) -
@@ -4602,16 +4605,21 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
  * empty a node.
  */
 static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-                   struct btrfs_path *path, int level, int slot,
-                   int tree_mod_log)
+                   struct btrfs_path *path, int level, int slot)
 {
        struct extent_buffer *parent = path->nodes[level];
        u32 nritems;
        int ret;
 
+       if (level) {
+               ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
+                                             MOD_LOG_KEY_REMOVE);
+               BUG_ON(ret < 0);
+       }
+
        nritems = btrfs_header_nritems(parent);
        if (slot != nritems - 1) {
-               if (tree_mod_log && level)
+               if (level)
                        tree_mod_log_eb_move(root->fs_info, parent, slot,
                                             slot + 1, nritems - slot - 1);
                memmove_extent_buffer(parent,
@@ -4619,10 +4627,6 @@ static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                              btrfs_node_key_ptr_offset(slot + 1),
                              sizeof(struct btrfs_key_ptr) *
                              (nritems - slot - 1));
-       } else if (tree_mod_log && level) {
-               ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
-                                             MOD_LOG_KEY_REMOVE);
-               BUG_ON(ret < 0);
        }
 
        nritems--;
@@ -4656,7 +4660,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
                                    struct extent_buffer *leaf)
 {
        WARN_ON(btrfs_header_generation(leaf) != trans->transid);
-       del_ptr(trans, root, path, 1, path->slots[1], 1);
+       del_ptr(trans, root, path, 1, path->slots[1]);
 
        /*
         * btrfs_free_extent is expensive, we want to make sure we
@@ -5123,13 +5127,13 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
        right_path->search_commit_root = 1;
        right_path->skip_locking = 1;
 
-       spin_lock(&left_root->root_times_lock);
+       spin_lock(&left_root->root_item_lock);
        left_start_ctransid = btrfs_root_ctransid(&left_root->root_item);
-       spin_unlock(&left_root->root_times_lock);
+       spin_unlock(&left_root->root_item_lock);
 
-       spin_lock(&right_root->root_times_lock);
+       spin_lock(&right_root->root_item_lock);
        right_start_ctransid = btrfs_root_ctransid(&right_root->root_item);
-       spin_unlock(&right_root->root_times_lock);
+       spin_unlock(&right_root->root_item_lock);
 
        trans = btrfs_join_transaction(left_root);
        if (IS_ERR(trans)) {
@@ -5224,15 +5228,15 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
                                goto out;
                        }
 
-                       spin_lock(&left_root->root_times_lock);
+                       spin_lock(&left_root->root_item_lock);
                        ctransid = btrfs_root_ctransid(&left_root->root_item);
-                       spin_unlock(&left_root->root_times_lock);
+                       spin_unlock(&left_root->root_item_lock);
                        if (ctransid != left_start_ctransid)
                                left_start_ctransid = 0;
 
-                       spin_lock(&right_root->root_times_lock);
+                       spin_lock(&right_root->root_item_lock);
                        ctransid = btrfs_root_ctransid(&right_root->root_item);
-                       spin_unlock(&right_root->root_times_lock);
+                       spin_unlock(&right_root->root_item_lock);
                        if (ctransid != right_start_ctransid)
                                right_start_ctransid = 0;
 
@@ -5496,6 +5500,139 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
        return btrfs_next_old_leaf(root, path, 0);
 }
 
+/* Release the path up to but not including the given level */
+static void btrfs_release_level(struct btrfs_path *path, int level)
+{
+       int i;
+
+       for (i = 0; i < level; i++) {
+               path->slots[i] = 0;
+               if (!path->nodes[i])
+                       continue;
+               if (path->locks[i]) {
+                       btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
+                       path->locks[i] = 0;
+               }
+               free_extent_buffer(path->nodes[i]);
+               path->nodes[i] = NULL;
+       }
+}
+
+/*
+ * This function assumes 2 things
+ *
+ * 1) You are using path->keep_locks
+ * 2) You are not inserting items.
+ *
+ * If either of these is not true, do not use this function. If you need a next
+ * leaf with either of these not being true then this function can be easily
+ * adapted to do that, but at the moment these are the limitations.
+ */
+int btrfs_next_leaf_write(struct btrfs_trans_handle *trans,
+                         struct btrfs_root *root, struct btrfs_path *path,
+                         int del)
+{
+       struct extent_buffer *b;
+       struct btrfs_key key;
+       u32 nritems;
+       int level = 1;
+       int slot;
+       int ret = 1;
+       int write_lock_level = BTRFS_MAX_LEVEL;
+       int ins_len = del ? -1 : 0;
+
+       WARN_ON(!(path->keep_locks || path->really_keep_locks));
+
+       nritems = btrfs_header_nritems(path->nodes[0]);
+       btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
+
+       while (path->nodes[level]) {
+               nritems = btrfs_header_nritems(path->nodes[level]);
+               if (!(path->locks[level] & BTRFS_WRITE_LOCK)) {
+search:
+                       btrfs_release_path(path);
+                       ret = btrfs_search_slot(trans, root, &key, path,
+                                               ins_len, 1);
+                       if (ret < 0)
+                               goto out;
+                       level = 1;
+                       continue;
+               }
+
+               if (path->slots[level] >= nritems - 1) {
+                       level++;
+                       continue;
+               }
+
+               btrfs_release_level(path, level);
+               break;
+       }
+
+       if (!path->nodes[level]) {
+               ret = 1;
+               goto out;
+       }
+
+       path->slots[level]++;
+       b = path->nodes[level];
+
+       while (b) {
+               level = btrfs_header_level(b);
+
+               if (!should_cow_block(trans, root, b))
+                       goto cow_done;
+
+               btrfs_set_path_blocking(path);
+               ret = btrfs_cow_block(trans, root, b,
+                                     path->nodes[level + 1],
+                                     path->slots[level + 1], &b);
+               if (ret)
+                       goto out;
+cow_done:
+               path->nodes[level] = b;
+               btrfs_clear_path_blocking(path, NULL, 0);
+               if (level != 0) {
+                       ret = setup_nodes_for_search(trans, root, path, b,
+                                                    level, ins_len,
+                                                    &write_lock_level);
+                       if (ret == -EAGAIN)
+                               goto search;
+                       if (ret)
+                               goto out;
+
+                       b = path->nodes[level];
+                       slot = path->slots[level];
+
+                       ret = read_block_for_search(trans, root, path,
+                                                   &b, level, slot, &key, 0);
+                       if (ret == -EAGAIN)
+                               goto search;
+                       if (ret)
+                               goto out;
+                       level = btrfs_header_level(b);
+                       if (!btrfs_try_tree_write_lock(b)) {
+                               btrfs_set_path_blocking(path);
+                               btrfs_tree_lock(b);
+                               btrfs_clear_path_blocking(path, b,
+                                                         BTRFS_WRITE_LOCK);
+                       }
+                       path->locks[level] = BTRFS_WRITE_LOCK;
+                       path->nodes[level] = b;
+                       path->slots[level] = 0;
+               } else {
+                       path->slots[level] = 0;
+                       ret = 0;
+                       break;
+               }
+       }
+
+out:
+       if (ret)
+               btrfs_release_path(path);
+
+       return ret;
+}
+
 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
                        u64 time_seq)
 {
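
The new btrfs_next_leaf_write() above advances to the next leaf while keeping write locks, under the two assumptions documented in its comment (keep_locks set, no insertions). A hedged usage sketch with a hypothetical caller that walks leaves to delete items:

	path->keep_locks = 1;
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	while (ret == 0) {
		/* ... delete or rewrite items in path->nodes[0] ... */
		ret = btrfs_next_leaf_write(trans, root, path, 1 /* del */);
	}
	if (ret < 0)
		goto error;		/* hypothetical error handling */
	btrfs_release_path(path);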
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 596617ecd3296e22eebb4bb08ec57cf4170a100e..547b7b05727f917dfc2bad6516f5c5b30ea12c68 100644
@@ -48,7 +48,7 @@ struct btrfs_ordered_sum;
 
 #define BTRFS_MAGIC "_BHRfS_M"
 
-#define BTRFS_MAX_MIRRORS 2
+#define BTRFS_MAX_MIRRORS 3
 
 #define BTRFS_MAX_LEVEL 8
 
@@ -142,6 +142,8 @@ struct btrfs_ordered_sum;
 
 #define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2
 
+#define BTRFS_DEV_REPLACE_DEVID 0
+
 /*
  * the max metadata block size.  This limit is somewhat artificial,
  * but the memmove costs go through the roof for larger blocks.
@@ -172,6 +174,9 @@ static int btrfs_csum_sizes[] = { 4, 0 };
 /* four bytes for CRC32 */
 #define BTRFS_EMPTY_DIR_SIZE 0
 
+/* specific to btrfs_map_block(), therefore not in include/linux/blk_types.h */
+#define REQ_GET_READ_MIRRORS   (1 << 30)
+
 #define BTRFS_FT_UNKNOWN       0
 #define BTRFS_FT_REG_FILE      1
 #define BTRFS_FT_DIR           2
@@ -571,6 +576,7 @@ struct btrfs_path {
        unsigned int skip_locking:1;
        unsigned int leave_spinning:1;
        unsigned int search_commit_root:1;
+       unsigned int really_keep_locks:1;
 };
 
 /*
@@ -885,6 +891,59 @@ struct btrfs_dev_stats_item {
        __le64 values[BTRFS_DEV_STAT_VALUES_MAX];
 } __attribute__ ((__packed__));
 
+#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS    0
+#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID     1
+#define BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED     0
+#define BTRFS_DEV_REPLACE_ITEM_STATE_STARTED           1
+#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED         2
+#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED          3
+#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED          4
+
+struct btrfs_dev_replace {
+       u64 replace_state;      /* see #define above */
+       u64 time_started;       /* seconds since 1-Jan-1970 */
+       u64 time_stopped;       /* seconds since 1-Jan-1970 */
+       atomic64_t num_write_errors;
+       atomic64_t num_uncorrectable_read_errors;
+
+       u64 cursor_left;
+       u64 committed_cursor_left;
+       u64 cursor_left_last_write_of_item;
+       u64 cursor_right;
+
+       u64 cont_reading_from_srcdev_mode;      /* see #define above */
+
+       int is_valid;
+       int item_needs_writeback;
+       struct btrfs_device *srcdev;
+       struct btrfs_device *tgtdev;
+
+       pid_t lock_owner;
+       atomic_t nesting_level;
+       struct mutex lock_finishing_cancel_unmount;
+       struct mutex lock_management_lock;
+       struct mutex lock;
+
+       struct btrfs_scrub_progress scrub_progress;
+};
+
+struct btrfs_dev_replace_item {
+       /*
+        * grow this item struct at the end for future enhancements and keep
+        * the existing values unchanged
+        */
+       __le64 src_devid;
+       __le64 cursor_left;
+       __le64 cursor_right;
+       __le64 cont_reading_from_srcdev_mode;
+
+       __le64 replace_state;
+       __le64 time_started;
+       __le64 time_stopped;
+       __le64 num_write_errors;
+       __le64 num_uncorrectable_read_errors;
+} __attribute__ ((__packed__));
+
 /* different types of block groups (and chunks) */
 #define BTRFS_BLOCK_GROUP_DATA         (1ULL << 0)
 #define BTRFS_BLOCK_GROUP_SYSTEM       (1ULL << 1)
@@ -1333,6 +1392,7 @@ struct btrfs_fs_info {
        struct btrfs_workers generic_worker;
        struct btrfs_workers workers;
        struct btrfs_workers delalloc_workers;
+       struct btrfs_workers flush_workers;
        struct btrfs_workers endio_workers;
        struct btrfs_workers endio_meta_workers;
        struct btrfs_workers endio_meta_write_workers;
@@ -1429,6 +1489,8 @@ struct btrfs_fs_info {
        struct rw_semaphore scrub_super_lock;
        int scrub_workers_refcnt;
        struct btrfs_workers scrub_workers;
+       struct btrfs_workers scrub_wr_completion_workers;
+       struct btrfs_workers scrub_nocow_workers;
 
 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
        u32 check_integrity_print_mask;
@@ -1470,6 +1532,11 @@ struct btrfs_fs_info {
        int backup_root_index;
 
        int num_tolerated_disk_barrier_failures;
+
+       /* device replace state */
+       struct btrfs_dev_replace dev_replace;
+
+       atomic_t mutually_exclusive_operation_running;
 };
 
 /*
@@ -1579,7 +1646,7 @@ struct btrfs_root {
 
        int force_cow;
 
-       spinlock_t root_times_lock;
+       spinlock_t root_item_lock;
 };
 
 struct btrfs_ioctl_defrag_range_args {
@@ -1722,6 +1789,12 @@ struct btrfs_ioctl_defrag_range_args {
  */
 #define BTRFS_DEV_STATS_KEY    249
 
+/*
+ * Persistently stores the device replace state in the device tree.
+ * The key is built like this: (0, BTRFS_DEV_REPLACE_KEY, 0).
+ */
+#define BTRFS_DEV_REPLACE_KEY  250
+
 /*
  * string items are for debugging.  They just store a short string of
  * data in the FS
@@ -1787,7 +1860,7 @@ struct btrfs_map_token {
 
 static inline void btrfs_init_map_token (struct btrfs_map_token *token)
 {
-       memset(token, 0, sizeof(*token));
+       token->kaddr = NULL;
 }
 
 /* some macros to generate set/get funcs for the struct fields.  This
@@ -2755,6 +2828,49 @@ BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item,
 BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item,
                   rsv_excl, 64);
 
+/* btrfs_dev_replace_item */
+BTRFS_SETGET_FUNCS(dev_replace_src_devid,
+                  struct btrfs_dev_replace_item, src_devid, 64);
+BTRFS_SETGET_FUNCS(dev_replace_cont_reading_from_srcdev_mode,
+                  struct btrfs_dev_replace_item, cont_reading_from_srcdev_mode,
+                  64);
+BTRFS_SETGET_FUNCS(dev_replace_replace_state, struct btrfs_dev_replace_item,
+                  replace_state, 64);
+BTRFS_SETGET_FUNCS(dev_replace_time_started, struct btrfs_dev_replace_item,
+                  time_started, 64);
+BTRFS_SETGET_FUNCS(dev_replace_time_stopped, struct btrfs_dev_replace_item,
+                  time_stopped, 64);
+BTRFS_SETGET_FUNCS(dev_replace_num_write_errors, struct btrfs_dev_replace_item,
+                  num_write_errors, 64);
+BTRFS_SETGET_FUNCS(dev_replace_num_uncorrectable_read_errors,
+                  struct btrfs_dev_replace_item, num_uncorrectable_read_errors,
+                  64);
+BTRFS_SETGET_FUNCS(dev_replace_cursor_left, struct btrfs_dev_replace_item,
+                  cursor_left, 64);
+BTRFS_SETGET_FUNCS(dev_replace_cursor_right, struct btrfs_dev_replace_item,
+                  cursor_right, 64);
+
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_src_devid,
+                        struct btrfs_dev_replace_item, src_devid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cont_reading_from_srcdev_mode,
+                        struct btrfs_dev_replace_item,
+                        cont_reading_from_srcdev_mode, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_replace_state,
+                        struct btrfs_dev_replace_item, replace_state, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_started,
+                        struct btrfs_dev_replace_item, time_started, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_stopped,
+                        struct btrfs_dev_replace_item, time_stopped, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_write_errors,
+                        struct btrfs_dev_replace_item, num_write_errors, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_uncorrectable_read_errors,
+                        struct btrfs_dev_replace_item,
+                        num_uncorrectable_read_errors, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_left,
+                        struct btrfs_dev_replace_item, cursor_left, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right,
+                        struct btrfs_dev_replace_item, cursor_right, 64);
+
 static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
 {
        return sb->s_fs_info;
@@ -2900,6 +3016,18 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
 void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
+
+enum btrfs_reserve_flush_enum {
+       /* If we are in the transaction, we can't flush anything. */
+       BTRFS_RESERVE_NO_FLUSH,
+       /*
+        * Flushing delalloc may cause a deadlock somewhere; in that
+        * case, use FLUSH_LIMIT.
+        */
+       BTRFS_RESERVE_FLUSH_LIMIT,
+       BTRFS_RESERVE_FLUSH_ALL,
+};
+
 int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
@@ -2919,19 +3047,13 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
 void btrfs_free_block_rsv(struct btrfs_root *root,
                          struct btrfs_block_rsv *rsv);
 int btrfs_block_rsv_add(struct btrfs_root *root,
-                       struct btrfs_block_rsv *block_rsv,
-                       u64 num_bytes);
-int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
-                               struct btrfs_block_rsv *block_rsv,
-                               u64 num_bytes);
+                       struct btrfs_block_rsv *block_rsv, u64 num_bytes,
+                       enum btrfs_reserve_flush_enum flush);
 int btrfs_block_rsv_check(struct btrfs_root *root,
                          struct btrfs_block_rsv *block_rsv, int min_factor);
 int btrfs_block_rsv_refill(struct btrfs_root *root,
-                         struct btrfs_block_rsv *block_rsv,
-                         u64 min_reserved);
-int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
-                                  struct btrfs_block_rsv *block_rsv,
-                                  u64 min_reserved);
+                          struct btrfs_block_rsv *block_rsv, u64 min_reserved,
+                          enum btrfs_reserve_flush_enum flush);
 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
                            struct btrfs_block_rsv *dst_rsv,
                            u64 num_bytes);
@@ -2955,6 +3077,7 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range);
 int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
 int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
                                         struct btrfs_fs_info *fs_info);
+int __get_raid_index(u64 flags);
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
                     int level, int *slot);
@@ -3065,6 +3188,9 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
 }
 
 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
+int btrfs_next_leaf_write(struct btrfs_trans_handle *trans,
+                         struct btrfs_root *root, struct btrfs_path *path,
+                         int del);
 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
                        u64 time_seq);
 static inline int btrfs_next_old_item(struct btrfs_root *root,
@@ -3157,6 +3283,8 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root);
 
 /* dir-item.c */
+int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
+                         const char *name, int name_len);
 int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root, const char *name,
                          int name_len, struct inode *dir,
@@ -3256,6 +3384,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root,
                             struct btrfs_path *path, u64 objectid,
                             u64 bytenr, int mod);
+u64 btrfs_file_extent_length(struct btrfs_path *path);
 int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root,
                           struct btrfs_ordered_sum *sums);
@@ -3271,6 +3400,19 @@ int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
 int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
                             struct list_head *list, int search_commit);
 /* inode.c */
+struct btrfs_delalloc_work {
+       struct inode *inode;
+       int wait;
+       int delay_iput;
+       struct completion completion;
+       struct list_head list;
+       struct btrfs_work work;
+};
+
+struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
+                                                   int wait, int delay_iput);
+void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work);
+
 struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
                                           size_t pg_offset, u64 start, u64 len,
                                           int create);
@@ -3370,9 +3512,12 @@ void btrfs_get_block_group_info(struct list_head *groups_list,
                                struct btrfs_ioctl_space_info *space);
 
 /* file.c */
+int btrfs_auto_defrag_init(void);
+void btrfs_auto_defrag_exit(void);
 int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
                           struct inode *inode);
 int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
+void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info);
 int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
 void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
                             int skip_pinned);
@@ -3519,15 +3664,16 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
                              struct btrfs_pending_snapshot *pending);
 
 /* scrub.c */
-int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
-                   struct btrfs_scrub_progress *progress, int readonly);
+int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
+                   u64 end, struct btrfs_scrub_progress *progress,
+                   int readonly, int is_dev_replace);
 void btrfs_scrub_pause(struct btrfs_root *root);
 void btrfs_scrub_pause_super(struct btrfs_root *root);
 void btrfs_scrub_continue(struct btrfs_root *root);
 void btrfs_scrub_continue_super(struct btrfs_root *root);
-int __btrfs_scrub_cancel(struct btrfs_fs_info *info);
-int btrfs_scrub_cancel(struct btrfs_root *root);
-int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev);
+int btrfs_scrub_cancel(struct btrfs_fs_info *info);
+int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info,
+                          struct btrfs_device *dev);
 int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid);
 int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
                         struct btrfs_scrub_progress *progress);
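
Tying two of the ctree.h additions together: struct btrfs_delalloc_work and its helpers btrfs_alloc_delalloc_work()/btrfs_wait_and_free_delalloc_work() are what the new flush_workers pool in btrfs_fs_info runs, spreading delalloc flushing over more CPUs as the pull message describes. A plausible call pattern, sketched here since the inode.c call sites are not on this page:

	struct btrfs_delalloc_work *work;

	work = btrfs_alloc_delalloc_work(inode, 1 /* wait */, 0 /* delay_iput */);
	if (!work)
		return -ENOMEM;
	btrfs_queue_worker(&root->fs_info->flush_workers, &work->work);
	/* blocks on work->completion, then frees the allocation */
	btrfs_wait_and_free_delalloc_work(work);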
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 478f66bdc57b958445365baf739122e4eec263af..34836036f01bc5bec6fff8768e1365cf69d500c0 100644
@@ -651,7 +651,8 @@ static int btrfs_delayed_inode_reserve_metadata(
         */
        if (!src_rsv || (!trans->bytes_reserved &&
                         src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
-               ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
+               ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
+                                         BTRFS_RESERVE_NO_FLUSH);
                /*
                 * Since we're under a transaction reserve_metadata_bytes could
                 * try to commit the transaction which will make it return
@@ -686,7 +687,8 @@ static int btrfs_delayed_inode_reserve_metadata(
                 * reserve something strictly for us.  If not be a pain and try
                 * to steal from the delalloc block rsv.
                 */
-               ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
+               ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
+                                         BTRFS_RESERVE_NO_FLUSH);
                if (!ret)
                        goto out;
 
@@ -1255,7 +1257,6 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
        struct btrfs_delayed_node *delayed_node = NULL;
        struct btrfs_root *root;
        struct btrfs_block_rsv *block_rsv;
-       unsigned long nr = 0;
        int need_requeue = 0;
        int ret;
 
@@ -1316,11 +1317,9 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
                                           delayed_node);
        mutex_unlock(&delayed_node->mutex);
 
-       nr = trans->blocks_used;
-
        trans->block_rsv = block_rsv;
        btrfs_end_transaction_dmeta(trans, root);
-       __btrfs_btree_balance_dirty(root, nr);
+       btrfs_btree_balance_dirty_nodelay(root);
 free_path:
        btrfs_free_path(path);
 out:
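
The delayed-inode hunks above show the mechanical side of the reservation API change: btrfs_block_rsv_add_noflush() and btrfs_block_rsv_refill_noflush() are gone, and callers now pick one of the three btrfs_reserve_flush_enum modes from ctree.h explicitly. Roughly (the call sites below are illustrative):

	/* inside a running transaction: flushing anything could deadlock */
	ret = btrfs_block_rsv_add(root, rsv, num_bytes, BTRFS_RESERVE_NO_FLUSH);

	/* flushing delalloc might deadlock here: limited flushing only */
	ret = btrfs_block_rsv_refill(root, rsv, min_reserved,
				     BTRFS_RESERVE_FLUSH_LIMIT);

	/* safe context: flush as much as needed to satisfy the reservation */
	ret = btrfs_block_rsv_add(root, rsv, num_bytes, BTRFS_RESERVE_FLUSH_ALL);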
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
new file mode 100644
index 0000000..66dbc8d
--- /dev/null
+++ b/fs/btrfs/dev-replace.c
@@ -0,0 +1,856 @@
+/*
+ * Copyright (C) STRATO AG 2012.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#include <linux/sched.h>
+#include <linux/bio.h>
+#include <linux/slab.h>
+#include <linux/buffer_head.h>
+#include <linux/blkdev.h>
+#include <linux/random.h>
+#include <linux/iocontext.h>
+#include <linux/capability.h>
+#include <linux/kthread.h>
+#include <linux/math64.h>
+#include <asm/div64.h>
+#include "compat.h"
+#include "ctree.h"
+#include "extent_map.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "print-tree.h"
+#include "volumes.h"
+#include "async-thread.h"
+#include "check-integrity.h"
+#include "rcu-string.h"
+#include "dev-replace.h"
+
+static u64 btrfs_get_seconds_since_1970(void);
+static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
+                                      int scrub_ret);
+static void btrfs_dev_replace_update_device_in_mapping_tree(
+                                               struct btrfs_fs_info *fs_info,
+                                               struct btrfs_device *srcdev,
+                                               struct btrfs_device *tgtdev);
+static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid,
+                                        char *srcdev_name,
+                                        struct btrfs_device **device);
+static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info);
+static int btrfs_dev_replace_kthread(void *data);
+static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info);
+
+
+int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
+{
+       struct btrfs_key key;
+       struct btrfs_root *dev_root = fs_info->dev_root;
+       struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+       struct extent_buffer *eb;
+       int slot;
+       int ret = 0;
+       struct btrfs_path *path = NULL;
+       int item_size;
+       struct btrfs_dev_replace_item *ptr;
+       u64 src_devid;
+
+       path = btrfs_alloc_path();
+       if (!path) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       key.objectid = 0;
+       key.type = BTRFS_DEV_REPLACE_KEY;
+       key.offset = 0;
+       ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
+       if (ret) {
+no_valid_dev_replace_entry_found:
+               ret = 0;
+               dev_replace->replace_state =
+                       BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED;
+               dev_replace->cont_reading_from_srcdev_mode =
+                   BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS;
+               dev_replace->replace_state = 0;
+               dev_replace->time_started = 0;
+               dev_replace->time_stopped = 0;
+               atomic64_set(&dev_replace->num_write_errors, 0);
+               atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
+               dev_replace->cursor_left = 0;
+               dev_replace->committed_cursor_left = 0;
+               dev_replace->cursor_left_last_write_of_item = 0;
+               dev_replace->cursor_right = 0;
+               dev_replace->srcdev = NULL;
+               dev_replace->tgtdev = NULL;
+               dev_replace->is_valid = 0;
+               dev_replace->item_needs_writeback = 0;
+               goto out;
+       }
+       slot = path->slots[0];
+       eb = path->nodes[0];
+       item_size = btrfs_item_size_nr(eb, slot);
+       ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item);
+
+       if (item_size != sizeof(struct btrfs_dev_replace_item)) {
+               pr_warn("btrfs: dev_replace entry found has unexpected size, ignore entry\n");
+               goto no_valid_dev_replace_entry_found;
+       }
+
+       src_devid = btrfs_dev_replace_src_devid(eb, ptr);
+       dev_replace->cont_reading_from_srcdev_mode =
+               btrfs_dev_replace_cont_reading_from_srcdev_mode(eb, ptr);
+       dev_replace->replace_state = btrfs_dev_replace_replace_state(eb, ptr);
+       dev_replace->time_started = btrfs_dev_replace_time_started(eb, ptr);
+       dev_replace->time_stopped =
+               btrfs_dev_replace_time_stopped(eb, ptr);
+       atomic64_set(&dev_replace->num_write_errors,
+                    btrfs_dev_replace_num_write_errors(eb, ptr));
+       atomic64_set(&dev_replace->num_uncorrectable_read_errors,
+                    btrfs_dev_replace_num_uncorrectable_read_errors(eb, ptr));
+       dev_replace->cursor_left = btrfs_dev_replace_cursor_left(eb, ptr);
+       dev_replace->committed_cursor_left = dev_replace->cursor_left;
+       dev_replace->cursor_left_last_write_of_item = dev_replace->cursor_left;
+       dev_replace->cursor_right = btrfs_dev_replace_cursor_right(eb, ptr);
+       dev_replace->is_valid = 1;
+
+       dev_replace->item_needs_writeback = 0;
+       switch (dev_replace->replace_state) {
+       case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+       case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+       case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+               dev_replace->srcdev = NULL;
+               dev_replace->tgtdev = NULL;
+               break;
+       case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+       case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+               dev_replace->srcdev = btrfs_find_device(fs_info, src_devid,
+                                                       NULL, NULL);
+               dev_replace->tgtdev = btrfs_find_device(fs_info,
+                                                       BTRFS_DEV_REPLACE_DEVID,
+                                                       NULL, NULL);
+               /*
+                * allow 'btrfs dev replace_cancel' if src/tgt device is
+                * missing
+                */
+               if (!dev_replace->srcdev &&
+                   !btrfs_test_opt(dev_root, DEGRADED)) {
+                       ret = -EIO;
+                       pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?\n",
+                               (unsigned long long)src_devid);
+               }
+               if (!dev_replace->tgtdev &&
+                   !btrfs_test_opt(dev_root, DEGRADED)) {
+                       ret = -EIO;
+               pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "tgtdev (devid %llu) is missing, need to run 'btrfs dev scan'?\n",
+                               (unsigned long long)BTRFS_DEV_REPLACE_DEVID);
+               }
+               if (dev_replace->tgtdev) {
+                       if (dev_replace->srcdev) {
+                               dev_replace->tgtdev->total_bytes =
+                                       dev_replace->srcdev->total_bytes;
+                               dev_replace->tgtdev->disk_total_bytes =
+                                       dev_replace->srcdev->disk_total_bytes;
+                               dev_replace->tgtdev->bytes_used =
+                                       dev_replace->srcdev->bytes_used;
+                       }
+                       dev_replace->tgtdev->is_tgtdev_for_dev_replace = 1;
+                       btrfs_init_dev_replace_tgtdev_for_resume(fs_info,
+                               dev_replace->tgtdev);
+               }
+               break;
+       }
+
+out:
+       if (path)
+               btrfs_free_path(path);
+       return ret;
+}
+
+/*
+ * called from commit_transaction. Writes changed device replace state to
+ * disk.
+ */
+int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
+                         struct btrfs_fs_info *fs_info)
+{
+       int ret;
+       struct btrfs_root *dev_root = fs_info->dev_root;
+       struct btrfs_path *path;
+       struct btrfs_key key;
+       struct extent_buffer *eb;
+       struct btrfs_dev_replace_item *ptr;
+       struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+
+       btrfs_dev_replace_lock(dev_replace);
+       if (!dev_replace->is_valid ||
+           !dev_replace->item_needs_writeback) {
+               btrfs_dev_replace_unlock(dev_replace);
+               return 0;
+       }
+       btrfs_dev_replace_unlock(dev_replace);
+
+       key.objectid = 0;
+       key.type = BTRFS_DEV_REPLACE_KEY;
+       key.offset = 0;
+
+       path = btrfs_alloc_path();
+       if (!path) {
+               ret = -ENOMEM;
+               goto out;
+       }
+       ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
+       if (ret < 0) {
+               pr_warn("btrfs: error %d while searching for dev_replace item!\n",
+                       ret);
+               goto out;
+       }
+
+       if (ret == 0 &&
+           btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
+               /*
+                * need to delete old one and insert a new one.
+                * Since no attempt is made to recover any old state, if the
+                * dev_replace state is 'running', the data on the target
+                * drive is lost.
+                * It would be possible to recover the state: just make sure
+                * that the beginning of the item is never changed and always
+                * contains all the essential information. Then read this
+                * minimal set of information and use it as a base for the
+                * new state.
+                */
+               ret = btrfs_del_item(trans, dev_root, path);
+               if (ret != 0) {
+                       pr_warn("btrfs: deleting too small dev_replace item failed %d!\n",
+                               ret);
+                       goto out;
+               }
+               ret = 1;
+       }
+
+       if (ret == 1) {
+               /* need to insert a new item */
+               btrfs_release_path(path);
+               ret = btrfs_insert_empty_item(trans, dev_root, path,
+                                             &key, sizeof(*ptr));
+               if (ret < 0) {
+                       pr_warn("btrfs: insert dev_replace item failed %d!\n",
+                               ret);
+                       goto out;
+               }
+       }
+
+       eb = path->nodes[0];
+       ptr = btrfs_item_ptr(eb, path->slots[0],
+                            struct btrfs_dev_replace_item);
+
+       btrfs_dev_replace_lock(dev_replace);
+       if (dev_replace->srcdev)
+               btrfs_set_dev_replace_src_devid(eb, ptr,
+                       dev_replace->srcdev->devid);
+       else
+               btrfs_set_dev_replace_src_devid(eb, ptr, (u64)-1);
+       btrfs_set_dev_replace_cont_reading_from_srcdev_mode(eb, ptr,
+               dev_replace->cont_reading_from_srcdev_mode);
+       btrfs_set_dev_replace_replace_state(eb, ptr,
+               dev_replace->replace_state);
+       btrfs_set_dev_replace_time_started(eb, ptr, dev_replace->time_started);
+       btrfs_set_dev_replace_time_stopped(eb, ptr, dev_replace->time_stopped);
+       btrfs_set_dev_replace_num_write_errors(eb, ptr,
+               atomic64_read(&dev_replace->num_write_errors));
+       btrfs_set_dev_replace_num_uncorrectable_read_errors(eb, ptr,
+               atomic64_read(&dev_replace->num_uncorrectable_read_errors));
+       dev_replace->cursor_left_last_write_of_item =
+               dev_replace->cursor_left;
+       btrfs_set_dev_replace_cursor_left(eb, ptr,
+               dev_replace->cursor_left_last_write_of_item);
+       btrfs_set_dev_replace_cursor_right(eb, ptr,
+               dev_replace->cursor_right);
+       dev_replace->item_needs_writeback = 0;
+       btrfs_dev_replace_unlock(dev_replace);
+
+       btrfs_mark_buffer_dirty(eb);
+
+out:
+       btrfs_free_path(path);
+
+       return ret;
+}
+
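+/*
+ * Called once the transaction carrying the dev_replace item has been
+ * committed. The copy position travels through three fields: cursor_left
+ * is advanced by the copy operation, cursor_left_last_write_of_item
+ * records the value last written into the on-disk item (see
+ * btrfs_run_dev_replace above), and committed_cursor_left, updated here,
+ * is the last position known to be persistent. A resume after umount
+ * restarts the copy from committed_cursor_left.
+ */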
+void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info)
+{
+       struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+
+       dev_replace->committed_cursor_left =
+               dev_replace->cursor_left_last_write_of_item;
+}
+
+static u64 btrfs_get_seconds_since_1970(void)
+{
+       struct timespec t = CURRENT_TIME_SEC;
+
+       return t.tv_sec;
+}
+
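+/*
+ * Entered via the BTRFS_IOC_DEV_REPLACE ioctl. A minimal userspace
+ * sketch (illustrative only; the cmd constant, args layout and
+ * BTRFS_DEVICE_PATH_NAME_MAX are assumed from ioctl.h):
+ *
+ *	struct btrfs_ioctl_dev_replace_args args = {0};
+ *
+ *	args.cmd = BTRFS_IOCTL_DEV_REPLACE_CMD_START;
+ *	args.start.cont_reading_from_srcdev_mode =
+ *		BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID;
+ *	strncpy(args.start.srcdev_name, "/dev/sdb",
+ *		BTRFS_DEVICE_PATH_NAME_MAX);
+ *	strncpy(args.start.tgtdev_name, "/dev/sdc",
+ *		BTRFS_DEVICE_PATH_NAME_MAX);
+ *	ioctl(fs_fd, BTRFS_IOC_DEV_REPLACE, &args);
+ */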
+int btrfs_dev_replace_start(struct btrfs_root *root,
+                           struct btrfs_ioctl_dev_replace_args *args)
+{
+       struct btrfs_trans_handle *trans;
+       struct btrfs_fs_info *fs_info = root->fs_info;
+       struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+       int ret;
+       struct btrfs_device *tgt_device = NULL;
+       struct btrfs_device *src_device = NULL;
+
+       switch (args->start.cont_reading_from_srcdev_mode) {
+       case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
+       case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') ||
+           args->start.tgtdev_name[0] == '\0')
+               return -EINVAL;
+
+       mutex_lock(&fs_info->volume_mutex);
+       ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name,
+                                           &tgt_device);
+       if (ret) {
+               pr_err("btrfs: target device %s is invalid!\n",
+                      args->start.tgtdev_name);
+               mutex_unlock(&fs_info->volume_mutex);
+               return -EINVAL;
+       }
+
+       ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid,
+                                           args->start.srcdev_name,
+                                           &src_device);
+       mutex_unlock(&fs_info->volume_mutex);
+       if (ret) {
+               ret = -EINVAL;
+               goto leave_no_lock;
+       }
+
+       if (tgt_device->total_bytes < src_device->total_bytes) {
+               pr_err("btrfs: target device is smaller than source device!\n");
+               ret = -EINVAL;
+               goto leave_no_lock;
+       }
+
+       btrfs_dev_replace_lock(dev_replace);
+       switch (dev_replace->replace_state) {
+       case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+       case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+       case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+               break;
+       case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+       case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+               args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
+               goto leave;
+       }
+
+       dev_replace->cont_reading_from_srcdev_mode =
+               args->start.cont_reading_from_srcdev_mode;
+       WARN_ON(!src_device);
+       dev_replace->srcdev = src_device;
+       WARN_ON(!tgt_device);
+       dev_replace->tgtdev = tgt_device;
+
+       printk_in_rcu(KERN_INFO
+                     "btrfs: dev_replace from %s (devid %llu) to %s started\n",
+                     src_device->missing ? "<missing disk>" :
+                       rcu_str_deref(src_device->name),
+                     src_device->devid,
+                     rcu_str_deref(tgt_device->name));
+
+       tgt_device->total_bytes = src_device->total_bytes;
+       tgt_device->disk_total_bytes = src_device->disk_total_bytes;
+       tgt_device->bytes_used = src_device->bytes_used;
+
+       /*
+        * from now on, the writes to the srcdev are all duplicated to
+        * go to the tgtdev as well (refer to btrfs_map_block()).
+        */
+       dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
+       dev_replace->time_started = btrfs_get_seconds_since_1970();
+       dev_replace->cursor_left = 0;
+       dev_replace->committed_cursor_left = 0;
+       dev_replace->cursor_left_last_write_of_item = 0;
+       dev_replace->cursor_right = 0;
+       dev_replace->is_valid = 1;
+       dev_replace->item_needs_writeback = 1;
+       args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
+       btrfs_dev_replace_unlock(dev_replace);
+
+       btrfs_wait_ordered_extents(root, 0);
+
+       /* force writing the updated state information to disk */
+       trans = btrfs_start_transaction(root, 0);
+       if (IS_ERR(trans)) {
+               ret = PTR_ERR(trans);
+               btrfs_dev_replace_lock(dev_replace);
+               goto leave;
+       }
+
+       ret = btrfs_commit_transaction(trans, root);
+       WARN_ON(ret);
+
+       /* the disk copy procedure reuses the scrub code */
+       ret = btrfs_scrub_dev(fs_info, src_device->devid, 0,
+                             src_device->total_bytes,
+                             &dev_replace->scrub_progress, 0, 1);
+
+       ret = btrfs_dev_replace_finishing(root->fs_info, ret);
+       WARN_ON(ret);
+
+       return 0;
+
+leave:
+       dev_replace->srcdev = NULL;
+       dev_replace->tgtdev = NULL;
+       btrfs_dev_replace_unlock(dev_replace);
+leave_no_lock:
+       if (tgt_device)
+               btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
+       return ret;
+}
+
+static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
+                                      int scrub_ret)
+{
+       struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+       struct btrfs_device *tgt_device;
+       struct btrfs_device *src_device;
+       struct btrfs_root *root = fs_info->tree_root;
+       u8 uuid_tmp[BTRFS_UUID_SIZE];
+       struct btrfs_trans_handle *trans;
+       int ret = 0;
+
+       /* don't allow cancel or unmount to disturb the finishing procedure */
+       mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
+
+       btrfs_dev_replace_lock(dev_replace);
+       /* was the operation canceled, or is it finished? */
+       if (dev_replace->replace_state !=
+           BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
+               btrfs_dev_replace_unlock(dev_replace);
+               mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+               return 0;
+       }
+
+       tgt_device = dev_replace->tgtdev;
+       src_device = dev_replace->srcdev;
+       btrfs_dev_replace_unlock(dev_replace);
+
+       /* replace old device with new one in mapping tree */
+       if (!scrub_ret)
+               btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
+                                                               src_device,
+                                                               tgt_device);
+
+       /*
+        * flush all outstanding I/O and inode extent mappings before the
+        * copy operation is declared as being finished
+        */
+       btrfs_start_delalloc_inodes(root, 0);
+       btrfs_wait_ordered_extents(root, 0);
+
+       trans = btrfs_start_transaction(root, 0);
+       if (IS_ERR(trans)) {
+               mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+               return PTR_ERR(trans);
+       }
+       ret = btrfs_commit_transaction(trans, root);
+       WARN_ON(ret);
+
+       /* keep away write_all_supers() during the finishing procedure */
+       mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+       btrfs_dev_replace_lock(dev_replace);
+       dev_replace->replace_state =
+               scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
+                         : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
+       dev_replace->tgtdev = NULL;
+       dev_replace->srcdev = NULL;
+       dev_replace->time_stopped = btrfs_get_seconds_since_1970();
+       dev_replace->item_needs_writeback = 1;
+
+       if (scrub_ret) {
+               printk_in_rcu(KERN_ERR
+                             "btrfs: btrfs_scrub_dev(%s, %llu, %s) failed %d\n",
+                             src_device->missing ? "<missing disk>" :
+                               rcu_str_deref(src_device->name),
+                             src_device->devid,
+                             rcu_str_deref(tgt_device->name), scrub_ret);
+               btrfs_dev_replace_unlock(dev_replace);
+               mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+               if (tgt_device)
+                       btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
+               mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+
+               return 0;
+       }
+
+       printk_in_rcu(KERN_INFO
+                     "btrfs: dev_replace from %s (devid %llu) to %s finished\n",
+                     src_device->missing ? "<missing disk>" :
+                       rcu_str_deref(src_device->name),
+                     src_device->devid,
+                     rcu_str_deref(tgt_device->name));
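+       /*
+        * The target device assumes the source device's devid and uuid,
+        * so existing references to the replaced device stay valid; the
+        * source device is parked on BTRFS_DEV_REPLACE_DEVID and removed
+        * just below (btrfs_rm_dev_replace_srcdev).
+        */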
+       tgt_device->is_tgtdev_for_dev_replace = 0;
+       tgt_device->devid = src_device->devid;
+       src_device->devid = BTRFS_DEV_REPLACE_DEVID;
+       tgt_device->bytes_used = src_device->bytes_used;
+       memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp));
+       memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid));
+       memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid));
+       tgt_device->total_bytes = src_device->total_bytes;
+       tgt_device->disk_total_bytes = src_device->disk_total_bytes;
+       tgt_device->bytes_used = src_device->bytes_used;
+       if (fs_info->sb->s_bdev == src_device->bdev)
+               fs_info->sb->s_bdev = tgt_device->bdev;
+       if (fs_info->fs_devices->latest_bdev == src_device->bdev)
+               fs_info->fs_devices->latest_bdev = tgt_device->bdev;
+       list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
+
+       btrfs_rm_dev_replace_srcdev(fs_info, src_device);
+       if (src_device->bdev) {
+               /* zero out the old super */
+               btrfs_scratch_superblock(src_device);
+       }
+       /*
+        * this is again a consistent state where no dev_replace procedure
+        * is running, the target device is part of the filesystem, the
+        * source device is not part of the filesystem anymore and its 1st
+        * superblock is scratched out so that it is no longer marked to
+        * belong to this filesystem.
+        */
+       btrfs_dev_replace_unlock(dev_replace);
+       mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+
+       /* write back the superblocks */
+       trans = btrfs_start_transaction(root, 0);
+       if (!IS_ERR(trans))
+               btrfs_commit_transaction(trans, root);
+
+       mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+
+       return 0;
+}
+
+static void btrfs_dev_replace_update_device_in_mapping_tree(
+                                               struct btrfs_fs_info *fs_info,
+                                               struct btrfs_device *srcdev,
+                                               struct btrfs_device *tgtdev)
+{
+       struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
+       struct extent_map *em;
+       struct map_lookup *map;
+       u64 start = 0;
+       int i;
+
+       write_lock(&em_tree->lock);
+       do {
+               em = lookup_extent_mapping(em_tree, start, (u64)-1);
+               if (!em)
+                       break;
+               map = (struct map_lookup *)em->bdev;
+               for (i = 0; i < map->num_stripes; i++)
+                       if (srcdev == map->stripes[i].dev)
+                               map->stripes[i].dev = tgtdev;
+               start = em->start + em->len;
+               free_extent_map(em);
+       } while (start);
+       write_unlock(&em_tree->lock);
+}
+
+static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid,
+                                        char *srcdev_name,
+                                        struct btrfs_device **device)
+{
+       int ret;
+
+       if (srcdevid) {
+               ret = 0;
+               *device = btrfs_find_device(root->fs_info, srcdevid, NULL,
+                                           NULL);
+               if (!*device)
+                       ret = -ENOENT;
+       } else {
+               ret = btrfs_find_device_missing_or_by_path(root, srcdev_name,
+                                                          device);
+       }
+       return ret;
+}
+
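+/*
+ * The status ioctl reports progress in tenths of a percent. For example,
+ * with srcdev->total_bytes = 100 GiB and cursor_left = 25 GiB:
+ *
+ *	progress_1000 = 25 GiB / (100 GiB / 1000) = 250, i.e. 25.0%
+ *
+ * (the resume kthread later divides this value by 10 to print whole
+ * percent).
+ */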
+void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
+                             struct btrfs_ioctl_dev_replace_args *args)
+{
+       struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+
+       btrfs_dev_replace_lock(dev_replace);
+       /*
+        * even if !dev_replace->is_valid, the values are good enough for
+        * the replace_status ioctl
+        */
+       args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
+       args->status.replace_state = dev_replace->replace_state;
+       args->status.time_started = dev_replace->time_started;
+       args->status.time_stopped = dev_replace->time_stopped;
+       args->status.num_write_errors =
+               atomic64_read(&dev_replace->num_write_errors);
+       args->status.num_uncorrectable_read_errors =
+               atomic64_read(&dev_replace->num_uncorrectable_read_errors);
+       switch (dev_replace->replace_state) {
+       case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+       case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+               args->status.progress_1000 = 0;
+               break;
+       case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+               args->status.progress_1000 = 1000;
+               break;
+       case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+       case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+               args->status.progress_1000 = div64_u64(dev_replace->cursor_left,
+                       div64_u64(dev_replace->srcdev->total_bytes, 1000));
+               break;
+       }
+       btrfs_dev_replace_unlock(dev_replace);
+}
+
+int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
+                            struct btrfs_ioctl_dev_replace_args *args)
+{
+       args->result = __btrfs_dev_replace_cancel(fs_info);
+       return 0;
+}
+
+static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
+{
+       struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+       struct btrfs_device *tgt_device = NULL;
+       struct btrfs_trans_handle *trans;
+       struct btrfs_root *root = fs_info->tree_root;
+       u64 result;
+       int ret;
+
+       mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
+       btrfs_dev_replace_lock(dev_replace);
+       switch (dev_replace->replace_state) {
+       case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+       case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+       case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+               result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
+               btrfs_dev_replace_unlock(dev_replace);
+               goto leave;
+       case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+       case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+               result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
+               tgt_device = dev_replace->tgtdev;
+               dev_replace->tgtdev = NULL;
+               dev_replace->srcdev = NULL;
+               break;
+       }
+       dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
+       dev_replace->time_stopped = btrfs_get_seconds_since_1970();
+       dev_replace->item_needs_writeback = 1;
+       btrfs_dev_replace_unlock(dev_replace);
+       btrfs_scrub_cancel(fs_info);
+
+       trans = btrfs_start_transaction(root, 0);
+       if (IS_ERR(trans)) {
+               mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+               return PTR_ERR(trans);
+       }
+       ret = btrfs_commit_transaction(trans, root);
+       WARN_ON(ret);
+       if (tgt_device)
+               btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
+
+leave:
+       mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+       return result;
+}
+
+void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
+{
+       struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+
+       mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
+       btrfs_dev_replace_lock(dev_replace);
+       switch (dev_replace->replace_state) {
+       case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+       case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+       case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+       case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+               break;
+       case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+               dev_replace->replace_state =
+                       BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
+               dev_replace->time_stopped = btrfs_get_seconds_since_1970();
+               dev_replace->item_needs_writeback = 1;
+               pr_info("btrfs: suspending dev_replace for unmount\n");
+               break;
+       }
+
+       btrfs_dev_replace_unlock(dev_replace);
+       mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+}
+
+/* resume dev_replace procedure that was interrupted by unmount */
+int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
+{
+       struct task_struct *task;
+       struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+
+       btrfs_dev_replace_lock(dev_replace);
+       switch (dev_replace->replace_state) {
+       case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+       case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+       case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+               btrfs_dev_replace_unlock(dev_replace);
+               return 0;
+       case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+               break;
+       case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+               dev_replace->replace_state =
+                       BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
+               break;
+       }
+       if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) {
+               pr_info("btrfs: cannot continue dev_replace, tgtdev is missing\n"
+                       "btrfs: you may cancel the operation after 'mount -o degraded'\n");
+               btrfs_dev_replace_unlock(dev_replace);
+               return 0;
+       }
+       btrfs_dev_replace_unlock(dev_replace);
+
+       WARN_ON(atomic_xchg(
+               &fs_info->mutually_exclusive_operation_running, 1));
+       task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl");
+       return PTR_RET(task);
+}
+
+static int btrfs_dev_replace_kthread(void *data)
+{
+       struct btrfs_fs_info *fs_info = data;
+       struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+       struct btrfs_ioctl_dev_replace_args *status_args;
+       u64 progress;
+
+       status_args = kzalloc(sizeof(*status_args), GFP_NOFS);
+       if (status_args) {
+               btrfs_dev_replace_status(fs_info, status_args);
+               progress = status_args->status.progress_1000;
+               kfree(status_args);
+               do_div(progress, 10);
+               printk_in_rcu(KERN_INFO
+                             "btrfs: continuing dev_replace from %s (devid %llu) to %s @%u%%\n",
+                             dev_replace->srcdev->missing ? "<missing disk>" :
+                               rcu_str_deref(dev_replace->srcdev->name),
+                             dev_replace->srcdev->devid,
+                             dev_replace->tgtdev ?
+                               rcu_str_deref(dev_replace->tgtdev->name) :
+                               "<missing target disk>",
+                             (unsigned int)progress);
+       }
+       btrfs_dev_replace_continue_on_mount(fs_info);
+       atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+
+       return 0;
+}
+
+static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info)
+{
+       struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+       int ret;
+
+       ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid,
+                             dev_replace->committed_cursor_left,
+                             dev_replace->srcdev->total_bytes,
+                             &dev_replace->scrub_progress, 0, 1);
+       ret = btrfs_dev_replace_finishing(fs_info, ret);
+       WARN_ON(ret);
+       return 0;
+}
+
+int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
+{
+       if (!dev_replace->is_valid)
+               return 0;
+
+       switch (dev_replace->replace_state) {
+       case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
+       case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
+       case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
+               return 0;
+       case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
+       case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+               /*
+                * Return true even if the tgtdev is missing. This can
+                * happen if the dev_replace procedure is suspended by an
+                * umount, "btrfs dev scan" is not run afterwards, and the
+                * filesystem is then remounted in degraded state. This
+                * does not stop the dev_replace procedure. It has to be
+                * canceled manually if cancellation is wanted.
+                */
+               break;
+       }
+       return 1;
+}
+
+void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace)
+{
+       /* the beginning is just an optimization for the typical case */
+       if (atomic_read(&dev_replace->nesting_level) == 0) {
+acquire_lock:
+               /*
+                * this is not a nested case where the same thread
+                * is trying to acquire the same lock twice
+                */
+               mutex_lock(&dev_replace->lock);
+               mutex_lock(&dev_replace->lock_management_lock);
+               dev_replace->lock_owner = current->pid;
+               atomic_inc(&dev_replace->nesting_level);
+               mutex_unlock(&dev_replace->lock_management_lock);
+               return;
+       }
+
+       mutex_lock(&dev_replace->lock_management_lock);
+       if (atomic_read(&dev_replace->nesting_level) > 0 &&
+           dev_replace->lock_owner == current->pid) {
+               WARN_ON(!mutex_is_locked(&dev_replace->lock));
+               atomic_inc(&dev_replace->nesting_level);
+               mutex_unlock(&dev_replace->lock_management_lock);
+               return;
+       }
+
+       mutex_unlock(&dev_replace->lock_management_lock);
+       goto acquire_lock;
+}
+
+void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace)
+{
+       WARN_ON(!mutex_is_locked(&dev_replace->lock));
+       mutex_lock(&dev_replace->lock_management_lock);
+       WARN_ON(atomic_read(&dev_replace->nesting_level) < 1);
+       WARN_ON(dev_replace->lock_owner != current->pid);
+       atomic_dec(&dev_replace->nesting_level);
+       if (atomic_read(&dev_replace->nesting_level) == 0) {
+               dev_replace->lock_owner = 0;
+               mutex_unlock(&dev_replace->lock_management_lock);
+               mutex_unlock(&dev_replace->lock);
+       } else {
+               mutex_unlock(&dev_replace->lock_management_lock);
+       }
+}
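+
+/*
+ * A minimal usage sketch (illustrative only): the lock is recursive for
+ * the owning thread, so code that already holds it may call a helper
+ * that takes it again without deadlocking:
+ *
+ *	btrfs_dev_replace_lock(dev_replace);	 outer call takes the mutex
+ *	btrfs_dev_replace_lock(dev_replace);	 nested call bumps nesting_level
+ *	btrfs_dev_replace_unlock(dev_replace);	 nesting_level drops back to 1
+ *	btrfs_dev_replace_unlock(dev_replace);	 last call releases the mutex
+ */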
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
new file mode 100644 (file)
index 0000000..20035cb
--- /dev/null
@@ -0,0 +1,44 @@
+/*
+ * Copyright (C) STRATO AG 2012.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#if !defined(__BTRFS_DEV_REPLACE__)
+#define __BTRFS_DEV_REPLACE__
+
+struct btrfs_ioctl_dev_replace_args;
+
+int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info);
+int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
+                         struct btrfs_fs_info *fs_info);
+void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info);
+int btrfs_dev_replace_start(struct btrfs_root *root,
+                           struct btrfs_ioctl_dev_replace_args *args);
+void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
+                             struct btrfs_ioctl_dev_replace_args *args);
+int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
+                            struct btrfs_ioctl_dev_replace_args *args);
+void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info);
+int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info);
+int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace);
+void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace);
+void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace);
+
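+/*
+ * Used by the dev_replace write path (e.g. the scrub-based copy) to bump
+ * the shared error counters; a typical call (illustrative):
+ *
+ *	btrfs_dev_replace_stats_inc(
+ *		&fs_info->dev_replace.num_write_errors);
+ */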
+static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value)
+{
+       atomic64_inc(stat_value);
+}
+#endif
index c1a074d0696ff897258c576c127af7fd8d513846..502c2158167c8fb95578f8166a9802a34e762def 100644 (file)
@@ -213,6 +213,65 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
        return btrfs_match_dir_item_name(root, path, name, name_len);
 }
 
+int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
+                                  const char *name, int name_len)
+{
+       int ret;
+       struct btrfs_key key;
+       struct btrfs_dir_item *di;
+       int data_size;
+       struct extent_buffer *leaf;
+       int slot;
+       struct btrfs_path *path;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       key.objectid = dir;
+       btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
+       key.offset = btrfs_name_hash(name, name_len);
+
+       ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+
+       /* return any errors */
+       if (ret < 0)
+               goto out;
+
+       /* nothing found, we're safe */
+       if (ret > 0) {
+               ret = 0;
+               goto out;
+       }
+
+       /* we found an item, look for our name in the item */
+       di = btrfs_match_dir_item_name(root, path, name, name_len);
+       if (di) {
+               /* our exact name was found */
+               ret = -EEXIST;
+               goto out;
+       }
+
+       /*
+        * see if there is room in the item to insert this
+        * name
+        */
+       data_size = sizeof(*di) + name_len + sizeof(struct btrfs_item);
+       leaf = path->nodes[0];
+       slot = path->slots[0];
+       if (data_size + btrfs_item_size_nr(leaf, slot) +
+           sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root)) {
+               ret = -EOVERFLOW;
+       } else {
+               /* plenty of insertion room */
+               ret = 0;
+       }
+out:
+       btrfs_free_path(path);
+       return ret;
+}
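+
+/*
+ * A hypothetical caller sketch (illustrative only): check before linking
+ * a new name into a directory, so a full hash bucket is reported as
+ * -EOVERFLOW up front:
+ *
+ *	ret = btrfs_check_dir_item_collision(root, btrfs_ino(dir),
+ *					     name, name_len);
+ *	if (ret)
+ *		return ret;	 -EEXIST, -EOVERFLOW or a search error
+ *	... safe to insert the new dir item ...
+ */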
+
 /*
  * lookup a directory item based on index.  'dir' is the objectid
  * we're searching in, and 'mod' tells us if you plan on deleting the
index 22a0439e5a86316196f07c26896e823f5cf756a1..a8f652dc940bd85148dad48c11d2d893aefe32ba 100644 (file)
@@ -45,6 +45,7 @@
 #include "inode-map.h"
 #include "check-integrity.h"
 #include "rcu-string.h"
+#include "dev-replace.h"
 
 #ifdef CONFIG_X86
 #include <asm/cpufeature.h>
@@ -387,7 +388,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
                if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
                        break;
 
-               num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
+               num_copies = btrfs_num_copies(root->fs_info,
                                              eb->start, eb->len);
                if (num_copies == 1)
                        break;
@@ -852,11 +853,16 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
                                 int mirror_num, unsigned long bio_flags,
                                 u64 bio_offset)
 {
+       int ret;
+
        /*
         * when we're called for a write, we're already in the async
         * submission context.  Just jump into btrfs_map_bio
         */
-       return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
+       ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
+       if (ret)
+               bio_endio(bio, ret);
+       return ret;
 }
 
 static int check_async_write(struct inode *inode, unsigned long bio_flags)
@@ -878,7 +884,6 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
        int ret;
 
        if (!(rw & REQ_WRITE)) {
-
                /*
                 * called for a read, do the setup so that checksum validation
                 * can happen in the async kernel threads
@@ -886,26 +891,32 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
                ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
                                          bio, 1);
                if (ret)
-                       return ret;
-               return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
-                                    mirror_num, 0);
+                       goto out_w_error;
+               ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
+                                   mirror_num, 0);
        } else if (!async) {
                ret = btree_csum_one_bio(bio);
                if (ret)
-                       return ret;
-               return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
-                                    mirror_num, 0);
+                       goto out_w_error;
+               ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
+                                   mirror_num, 0);
+       } else {
+               /*
+                * kthread helpers are used to submit writes so that
+                * checksumming can happen in parallel across all CPUs
+                */
+               ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
+                                         inode, rw, bio, mirror_num, 0,
+                                         bio_offset,
+                                         __btree_submit_bio_start,
+                                         __btree_submit_bio_done);
        }
 
-       /*
-        * kthread helpers are used to submit writes so that checksumming
-        * can happen in parallel across all CPUs
-        */
-       return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
-                                  inode, rw, bio, mirror_num, 0,
-                                  bio_offset,
-                                  __btree_submit_bio_start,
-                                  __btree_submit_bio_done);
+       if (ret) {
+out_w_error:
+               bio_endio(bio, ret);
+       }
+       return ret;
 }
 
 #ifdef CONFIG_MIGRATION
@@ -990,6 +1001,7 @@ static void btree_invalidatepage(struct page *page, unsigned long offset)
 
 static int btree_set_page_dirty(struct page *page)
 {
+#ifdef DEBUG
        struct extent_buffer *eb;
 
        BUG_ON(!PagePrivate(page));
@@ -998,6 +1010,7 @@ static int btree_set_page_dirty(struct page *page)
        BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
        BUG_ON(!atomic_read(&eb->refs));
        btrfs_assert_tree_locked(eb);
+#endif
        return __set_page_dirty_nobuffers(page);
 }
 
@@ -1129,11 +1142,11 @@ void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                                          root->fs_info->dirty_metadata_bytes);
                        }
                        spin_unlock(&root->fs_info->delalloc_lock);
-               }
 
-               /* ugh, clear_extent_buffer_dirty needs to lock the page */
-               btrfs_set_lock_blocking(buf);
-               clear_extent_buffer_dirty(buf);
+                       /* ugh, clear_extent_buffer_dirty needs to lock the page */
+                       btrfs_set_lock_blocking(buf);
+                       clear_extent_buffer_dirty(buf);
+               }
        }
 }
 
@@ -1193,7 +1206,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
        root->root_key.objectid = objectid;
        root->anon_dev = 0;
 
-       spin_lock_init(&root->root_times_lock);
+       spin_lock_init(&root->root_item_lock);
 }
 
 static int __must_check find_and_setup_root(struct btrfs_root *tree_root,
@@ -2131,6 +2144,11 @@ int open_ctree(struct super_block *sb,
        init_rwsem(&fs_info->extent_commit_sem);
        init_rwsem(&fs_info->cleanup_work_sem);
        init_rwsem(&fs_info->subvol_sem);
+       fs_info->dev_replace.lock_owner = 0;
+       atomic_set(&fs_info->dev_replace.nesting_level, 0);
+       mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
+       mutex_init(&fs_info->dev_replace.lock_management_lock);
+       mutex_init(&fs_info->dev_replace.lock);
 
        spin_lock_init(&fs_info->qgroup_lock);
        fs_info->qgroup_tree = RB_ROOT;
@@ -2279,6 +2297,10 @@ int open_ctree(struct super_block *sb,
                           fs_info->thread_pool_size,
                           &fs_info->generic_worker);
 
+       btrfs_init_workers(&fs_info->flush_workers, "flush_delalloc",
+                          fs_info->thread_pool_size,
+                          &fs_info->generic_worker);
+
        btrfs_init_workers(&fs_info->submit_workers, "submit",
                           min_t(u64, fs_devices->num_devices,
                           fs_info->thread_pool_size),
@@ -2350,6 +2372,7 @@ int open_ctree(struct super_block *sb,
        ret |= btrfs_start_workers(&fs_info->delayed_workers);
        ret |= btrfs_start_workers(&fs_info->caching_workers);
        ret |= btrfs_start_workers(&fs_info->readahead_workers);
+       ret |= btrfs_start_workers(&fs_info->flush_workers);
        if (ret) {
                err = -ENOMEM;
                goto fail_sb_buffer;
@@ -2418,7 +2441,11 @@ int open_ctree(struct super_block *sb,
                goto fail_tree_roots;
        }
 
-       btrfs_close_extra_devices(fs_devices);
+       /*
+        * keep the device that is marked to be the target device for the
+        * dev_replace procedure
+        */
+       btrfs_close_extra_devices(fs_info, fs_devices, 0);
 
        if (!fs_devices->latest_bdev) {
                printk(KERN_CRIT "btrfs: failed to read devices on %s\n",
@@ -2490,6 +2517,14 @@ retry_root_backup:
                goto fail_block_groups;
        }
 
+       ret = btrfs_init_dev_replace(fs_info);
+       if (ret) {
+               pr_err("btrfs: failed to init dev_replace: %d\n", ret);
+               goto fail_block_groups;
+       }
+
+       btrfs_close_extra_devices(fs_info, fs_devices, 1);
+
        ret = btrfs_init_space_info(fs_info);
        if (ret) {
                printk(KERN_ERR "Failed to initialize space info: %d\n", ret);
@@ -2503,6 +2538,13 @@ retry_root_backup:
        }
        fs_info->num_tolerated_disk_barrier_failures =
                btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
+       if (fs_info->fs_devices->missing_devices >
+            fs_info->num_tolerated_disk_barrier_failures &&
+           !(sb->s_flags & MS_RDONLY)) {
+               printk(KERN_WARNING
+                      "Btrfs: too many missing devices, writeable mount is not allowed\n");
+               goto fail_block_groups;
+       }
 
        fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
                                               "btrfs-cleaner");
@@ -2631,6 +2673,13 @@ retry_root_backup:
                return ret;
        }
 
+       ret = btrfs_resume_dev_replace_async(fs_info);
+       if (ret) {
+               pr_warn("btrfs: failed to resume dev_replace\n");
+               close_ctree(tree_root);
+               return ret;
+       }
+
        return 0;
 
 fail_qgroup:
@@ -2667,6 +2716,7 @@ fail_sb_buffer:
        btrfs_stop_workers(&fs_info->submit_workers);
        btrfs_stop_workers(&fs_info->delayed_workers);
        btrfs_stop_workers(&fs_info->caching_workers);
+       btrfs_stop_workers(&fs_info->flush_workers);
 fail_alloc:
 fail_iput:
        btrfs_mapping_tree_free(&fs_info->mapping_tree);
@@ -3270,16 +3320,18 @@ int close_ctree(struct btrfs_root *root)
        smp_mb();
 
        /* pause restriper - we want to resume on mount */
-       btrfs_pause_balance(root->fs_info);
+       btrfs_pause_balance(fs_info);
 
-       btrfs_scrub_cancel(root);
+       btrfs_dev_replace_suspend_for_unmount(fs_info);
+
+       btrfs_scrub_cancel(fs_info);
 
        /* wait for any defraggers to finish */
        wait_event(fs_info->transaction_wait,
                   (atomic_read(&fs_info->defrag_running) == 0));
 
        /* clear out the rbtree of defraggable inodes */
-       btrfs_run_defrag_inodes(fs_info);
+       btrfs_cleanup_defrag_inodes(fs_info);
 
        if (!(fs_info->sb->s_flags & MS_RDONLY)) {
                ret = btrfs_commit_super(root);
@@ -3339,6 +3391,7 @@ int close_ctree(struct btrfs_root *root)
        btrfs_stop_workers(&fs_info->delayed_workers);
        btrfs_stop_workers(&fs_info->caching_workers);
        btrfs_stop_workers(&fs_info->readahead_workers);
+       btrfs_stop_workers(&fs_info->flush_workers);
 
 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
        if (btrfs_test_opt(root, CHECK_INTEGRITY))
@@ -3383,14 +3436,12 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
        int was_dirty;
 
        btrfs_assert_tree_locked(buf);
-       if (transid != root->fs_info->generation) {
-               printk(KERN_CRIT "btrfs transid mismatch buffer %llu, "
+       if (transid != root->fs_info->generation)
+               WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, "
                       "found %llu running %llu\n",
                        (unsigned long long)buf->start,
                        (unsigned long long)transid,
                        (unsigned long long)root->fs_info->generation);
-               WARN_ON(1);
-       }
        was_dirty = set_extent_buffer_dirty(buf);
        if (!was_dirty) {
                spin_lock(&root->fs_info->delalloc_lock);
@@ -3399,7 +3450,8 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
        }
 }
 
-void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
+static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
+                                       int flush_delayed)
 {
        /*
         * looks as though older kernels can get into trouble with
@@ -3411,7 +3463,8 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
        if (current->flags & PF_MEMALLOC)
                return;
 
-       btrfs_balance_delayed_items(root);
+       if (flush_delayed)
+               btrfs_balance_delayed_items(root);
 
        num_dirty = root->fs_info->dirty_metadata_bytes;
 
@@ -3422,25 +3475,14 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
        return;
 }
 
-void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
+void btrfs_btree_balance_dirty(struct btrfs_root *root)
 {
-       /*
-        * looks as though older kernels can get into trouble with
-        * this code, they end up stuck in balance_dirty_pages forever
-        */
-       u64 num_dirty;
-       unsigned long thresh = 32 * 1024 * 1024;
-
-       if (current->flags & PF_MEMALLOC)
-               return;
-
-       num_dirty = root->fs_info->dirty_metadata_bytes;
+       __btrfs_btree_balance_dirty(root, 1);
+}
 
-       if (num_dirty > thresh) {
-               balance_dirty_pages_ratelimited(
-                                  root->fs_info->btree_inode->i_mapping);
-       }
-       return;
+void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root)
+{
+       __btrfs_btree_balance_dirty(root, 0);
 }
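+
+/*
+ * The two wrappers differ only in whether delayed items are balanced
+ * first; the _nodelay variant is presumably for paths (such as the
+ * delayed-inode code itself) where pushing delayed items from here
+ * could recurse. Illustrative call sites:
+ *
+ *	btrfs_btree_balance_dirty(root);		 most callers
+ *	btrfs_btree_balance_dirty_nodelay(root);	 delayed-item paths
+ */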
 
 int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
index 2025a9132c16119c5795b2614b7d114ada047643..305c33efb0e322224e0358a249103529ae78994b 100644 (file)
@@ -62,8 +62,8 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
 struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
                                              struct btrfs_key *location);
 int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
-void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
-void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
+void btrfs_btree_balance_dirty(struct btrfs_root *root);
+void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root);
 void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
 void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
 int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
index 06b2635073f37dbd15969803f4387020344f26ec..521e9d4424f64f81d69d98709031a342d5ee808f 100644 (file)
@@ -33,6 +33,7 @@
 #include "volumes.h"
 #include "locking.h"
 #include "free-space-cache.h"
+#include "math.h"
 
 #undef SCRAMBLE_DELAYED_REFS
 
@@ -649,24 +650,6 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
        rcu_read_unlock();
 }
 
-static u64 div_factor(u64 num, int factor)
-{
-       if (factor == 10)
-               return num;
-       num *= factor;
-       do_div(num, 10);
-       return num;
-}
-
-static u64 div_factor_fine(u64 num, int factor)
-{
-       if (factor == 100)
-               return num;
-       num *= factor;
-       do_div(num, 100);
-       return num;
-}
-
 u64 btrfs_find_block_group(struct btrfs_root *root,
                           u64 search_start, u64 search_hint, int owner)
 {
@@ -1835,7 +1818,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
 
 
        /* Tell the block device(s) that the sectors can be discarded */
-       ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD,
+       ret = btrfs_map_block(root->fs_info, REQ_DISCARD,
                              bytenr, &num_bytes, &bbio, 0);
        /* Error condition is -ENOMEM */
        if (!ret) {
@@ -2314,6 +2297,9 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
                                kfree(extent_op);
 
                                if (ret) {
+                                       list_del_init(&locked_ref->cluster);
+                                       mutex_unlock(&locked_ref->mutex);
+
                                        printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret);
                                        spin_lock(&delayed_refs->lock);
                                        return ret;
@@ -2356,6 +2342,10 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
                count++;
 
                if (ret) {
+                       if (locked_ref) {
+                               list_del_init(&locked_ref->cluster);
+                               mutex_unlock(&locked_ref->mutex);
+                       }
                        printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret);
                        spin_lock(&delayed_refs->lock);
                        return ret;
@@ -3661,7 +3651,7 @@ out:
 
 static int can_overcommit(struct btrfs_root *root,
                          struct btrfs_space_info *space_info, u64 bytes,
-                         int flush)
+                         enum btrfs_reserve_flush_enum flush)
 {
        u64 profile = btrfs_get_alloc_profile(root, 0);
        u64 avail;
@@ -3685,11 +3675,11 @@ static int can_overcommit(struct btrfs_root *root,
                avail >>= 1;
 
        /*
-        * If we aren't flushing don't let us overcommit too much, say
-        * 1/8th of the space.  If we can flush, let it overcommit up to
-        * 1/2 of the space.
+        * If we aren't flushing all things, let us overcommit up to
+        * 1/2 of the space. If we can flush everything, don't let us
+        * overcommit too much; allow up to 1/8 of the space.
         */
-       if (flush)
+       if (flush == BTRFS_RESERVE_FLUSH_ALL)
                avail >>= 3;
        else
                avail >>= 1;
@@ -3699,6 +3689,20 @@ static int can_overcommit(struct btrfs_root *root,
        return 0;
 }
 
+static int writeback_inodes_sb_nr_if_idle_safe(struct super_block *sb,
+                                              unsigned long nr_pages,
+                                              enum wb_reason reason)
+{
+       if (!writeback_in_progress(sb->s_bdi) &&
+           down_read_trylock(&sb->s_umount)) {
+               writeback_inodes_sb_nr(sb, nr_pages, reason);
+               up_read(&sb->s_umount);
+               return 1;
+       }
+
+       return 0;
+}
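+
+/*
+ * Note: unlike writeback_inodes_sb_nr_if_idle(), this variant takes
+ * s_umount with down_read_trylock(), so a task racing with umount (or
+ * any other s_umount holder) skips the writeback instead of blocking;
+ * the caller below treats it as best-effort.
+ */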
+
 /*
  * shrink metadata reservation for delalloc
  */
@@ -3713,6 +3717,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
        long time_left;
        unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
        int loops = 0;
+       enum btrfs_reserve_flush_enum flush;
 
        trans = (struct btrfs_trans_handle *)current->journal_info;
        block_rsv = &root->fs_info->delalloc_block_rsv;
@@ -3730,8 +3735,9 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
        while (delalloc_bytes && loops < 3) {
                max_reclaim = min(delalloc_bytes, to_reclaim);
                nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
-               writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages,
-                                              WB_REASON_FS_FREE_SPACE);
+               writeback_inodes_sb_nr_if_idle_safe(root->fs_info->sb,
+                                                   nr_pages,
+                                                   WB_REASON_FS_FREE_SPACE);
 
                /*
                 * We need to wait for the async pages to actually start before
@@ -3740,8 +3746,12 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
                wait_event(root->fs_info->async_submit_wait,
                           !atomic_read(&root->fs_info->async_delalloc_pages));
 
+               if (!trans)
+                       flush = BTRFS_RESERVE_FLUSH_ALL;
+               else
+                       flush = BTRFS_RESERVE_NO_FLUSH;
                spin_lock(&space_info->lock);
-               if (can_overcommit(root, space_info, orig, !trans)) {
+               if (can_overcommit(root, space_info, orig, flush)) {
                        spin_unlock(&space_info->lock);
                        break;
                }
@@ -3899,7 +3909,8 @@ static int flush_space(struct btrfs_root *root,
  */
 static int reserve_metadata_bytes(struct btrfs_root *root,
                                  struct btrfs_block_rsv *block_rsv,
-                                 u64 orig_bytes, int flush)
+                                 u64 orig_bytes,
+                                 enum btrfs_reserve_flush_enum flush)
 {
        struct btrfs_space_info *space_info = block_rsv->space_info;
        u64 used;
@@ -3912,10 +3923,11 @@ again:
        ret = 0;
        spin_lock(&space_info->lock);
        /*
-        * We only want to wait if somebody other than us is flushing and we are
-        * actually alloed to flush.
+        * We only want to wait if somebody other than us is flushing and we
+        * are actually allowed to flush all things.
         */
-       while (flush && !flushing && space_info->flush) {
+       while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing &&
+              space_info->flush) {
                spin_unlock(&space_info->lock);
                /*
                 * If we have a trans handle we can't wait because the flusher
@@ -3981,23 +3993,40 @@ again:
         * Couldn't make our reservation, save our place so while we're trying
         * to reclaim space we can actually use it instead of somebody else
         * stealing it from us.
+        *
+        * We make the other tasks wait for the flush only when we can flush
+        * all things.
         */
-       if (ret && flush) {
+       if (ret && flush == BTRFS_RESERVE_FLUSH_ALL) {
                flushing = true;
                space_info->flush = 1;
        }
 
        spin_unlock(&space_info->lock);
 
-       if (!ret || !flush)
+       if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
                goto out;
 
        ret = flush_space(root, space_info, num_bytes, orig_bytes,
                          flush_state);
        flush_state++;
+
+       /*
+        * If we are FLUSH_LIMIT, we cannot flush delalloc, or a deadlock
+        * would occur, so skip the delalloc flush states.
+        */
+       if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
+           (flush_state == FLUSH_DELALLOC ||
+            flush_state == FLUSH_DELALLOC_WAIT))
+               flush_state = ALLOC_CHUNK;
+
        if (!ret)
                goto again;
-       else if (flush_state <= COMMIT_TRANS)
+       else if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
+                flush_state < COMMIT_TRANS)
+               goto again;
+       else if (flush == BTRFS_RESERVE_FLUSH_ALL &&
+                flush_state <= COMMIT_TRANS)
                goto again;
 
 out:
@@ -4148,9 +4177,9 @@ void btrfs_free_block_rsv(struct btrfs_root *root,
        kfree(rsv);
 }
 
-static inline int __block_rsv_add(struct btrfs_root *root,
-                                 struct btrfs_block_rsv *block_rsv,
-                                 u64 num_bytes, int flush)
+int btrfs_block_rsv_add(struct btrfs_root *root,
+                       struct btrfs_block_rsv *block_rsv, u64 num_bytes,
+                       enum btrfs_reserve_flush_enum flush)
 {
        int ret;
 
@@ -4166,20 +4195,6 @@ static inline int __block_rsv_add(struct btrfs_root *root,
        return ret;
 }
 
-int btrfs_block_rsv_add(struct btrfs_root *root,
-                       struct btrfs_block_rsv *block_rsv,
-                       u64 num_bytes)
-{
-       return __block_rsv_add(root, block_rsv, num_bytes, 1);
-}
-
-int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
-                               struct btrfs_block_rsv *block_rsv,
-                               u64 num_bytes)
-{
-       return __block_rsv_add(root, block_rsv, num_bytes, 0);
-}
-
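+
+/*
+ * The _noflush wrappers above are folded into the main helpers; callers
+ * now pass the flush mode explicitly (illustrative):
+ *
+ *	btrfs_block_rsv_add_noflush(root, rsv, num_bytes);
+ * becomes
+ *	btrfs_block_rsv_add(root, rsv, num_bytes, BTRFS_RESERVE_NO_FLUSH);
+ */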
 int btrfs_block_rsv_check(struct btrfs_root *root,
                          struct btrfs_block_rsv *block_rsv, int min_factor)
 {
@@ -4198,9 +4213,9 @@ int btrfs_block_rsv_check(struct btrfs_root *root,
        return ret;
 }
 
-static inline int __btrfs_block_rsv_refill(struct btrfs_root *root,
-                                          struct btrfs_block_rsv *block_rsv,
-                                          u64 min_reserved, int flush)
+int btrfs_block_rsv_refill(struct btrfs_root *root,
+                          struct btrfs_block_rsv *block_rsv, u64 min_reserved,
+                          enum btrfs_reserve_flush_enum flush)
 {
        u64 num_bytes = 0;
        int ret = -ENOSPC;
@@ -4228,20 +4243,6 @@ static inline int __btrfs_block_rsv_refill(struct btrfs_root *root,
        return ret;
 }
 
-int btrfs_block_rsv_refill(struct btrfs_root *root,
-                          struct btrfs_block_rsv *block_rsv,
-                          u64 min_reserved)
-{
-       return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1);
-}
-
-int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
-                                  struct btrfs_block_rsv *block_rsv,
-                                  u64 min_reserved)
-{
-       return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0);
-}
-
 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
                            struct btrfs_block_rsv *dst_rsv,
                            u64 num_bytes)
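
With the *_noflush wrappers removed above, btrfs_block_rsv_add() and btrfs_block_rsv_refill() take the flush policy directly, so every call site names it (the inode-map.c hunk later in this diff shows a converted caller). A user-space sketch of the same bool-to-enum refactor, with purely illustrative names:

#include <stdio.h>

/* Only the shape of the change mirrors the hunks above: two wrappers
 * over a boolean flag become one function taking the policy enum. */
enum reserve_flush { RESERVE_NO_FLUSH, RESERVE_FLUSH_LIMIT, RESERVE_FLUSH_ALL };

static int reserve(unsigned long long bytes, enum reserve_flush flush)
{
        printf("reserve %llu bytes, policy %d\n", bytes, flush);
        return 0;
}

int main(void)
{
        /* was: reserve_noflush(4096) and reserve_flushing(4096) */
        reserve(4096, RESERVE_NO_FLUSH);
        reserve(4096, RESERVE_FLUSH_ALL);
        return 0;
}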
@@ -4532,17 +4533,27 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
        u64 csum_bytes;
        unsigned nr_extents = 0;
        int extra_reserve = 0;
-       int flush = 1;
+       enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
        int ret;
+       bool delalloc_lock = true;
 
-       /* Need to be holding the i_mutex here if we aren't free space cache */
-       if (btrfs_is_free_space_inode(inode))
-               flush = 0;
+       /*
+        * If we are a free space inode we must not flush, since we will be in
+        * the middle of a transaction commit.  We also don't need the delalloc
+        * mutex since we won't race with anybody.  We need this mostly to make
+        * lockdep shut its filthy mouth.
+        */
+       if (btrfs_is_free_space_inode(inode)) {
+               flush = BTRFS_RESERVE_NO_FLUSH;
+               delalloc_lock = false;
+       }
 
-       if (flush && btrfs_transaction_in_commit(root->fs_info))
+       if (flush != BTRFS_RESERVE_NO_FLUSH &&
+           btrfs_transaction_in_commit(root->fs_info))
                schedule_timeout(1);
 
-       mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
+       if (delalloc_lock)
+               mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
+
        num_bytes = ALIGN(num_bytes, root->sectorsize);
 
        spin_lock(&BTRFS_I(inode)->lock);
@@ -4572,7 +4583,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
                ret = btrfs_qgroup_reserve(root, num_bytes +
                                           nr_extents * root->leafsize);
                if (ret) {
-                       mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
+                       spin_lock(&BTRFS_I(inode)->lock);
+                       calc_csum_metadata_size(inode, num_bytes, 0);
+                       spin_unlock(&BTRFS_I(inode)->lock);
+                       if (delalloc_lock)
+                               mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
                        return ret;
                }
        }
@@ -4607,7 +4622,12 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
                                                      btrfs_ino(inode),
                                                      to_free, 0);
                }
-               mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
+               if (root->fs_info->quota_enabled) {
+                       btrfs_qgroup_free(root, num_bytes +
+                                               nr_extents * root->leafsize);
+               }
+               if (delalloc_lock)
+                       mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
                return ret;
        }
 
@@ -4619,7 +4639,9 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
        }
        BTRFS_I(inode)->reserved_extents += nr_extents;
        spin_unlock(&BTRFS_I(inode)->lock);
-       mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
+
+       if (delalloc_lock)
+               mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
 
        if (to_reserve)
                trace_btrfs_space_reservation(root->fs_info, "delalloc",
@@ -4969,9 +4991,13 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
 {
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_block_group_cache *cache = NULL;
+       struct btrfs_space_info *space_info;
+       struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
        u64 len;
+       bool readonly;
 
        while (start <= end) {
+               readonly = false;
                if (!cache ||
                    start >= cache->key.objectid + cache->key.offset) {
                        if (cache)
@@ -4989,15 +5015,30 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
                }
 
                start += len;
+               space_info = cache->space_info;
 
-               spin_lock(&cache->space_info->lock);
+               spin_lock(&space_info->lock);
                spin_lock(&cache->lock);
                cache->pinned -= len;
-               cache->space_info->bytes_pinned -= len;
-               if (cache->ro)
-                       cache->space_info->bytes_readonly += len;
+               space_info->bytes_pinned -= len;
+               if (cache->ro) {
+                       space_info->bytes_readonly += len;
+                       readonly = true;
+               }
                spin_unlock(&cache->lock);
-               spin_unlock(&cache->space_info->lock);
+               if (!readonly && global_rsv->space_info == space_info) {
+                       spin_lock(&global_rsv->lock);
+                       if (!global_rsv->full) {
+                               len = min(len, global_rsv->size -
+                                         global_rsv->reserved);
+                               global_rsv->reserved += len;
+                               space_info->bytes_may_use += len;
+                               if (global_rsv->reserved >= global_rsv->size)
+                                       global_rsv->full = 1;
+                       }
+                       spin_unlock(&global_rsv->lock);
+               }
+               spin_unlock(&space_info->lock);
        }
 
        if (cache)
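
Freed pinned bytes are now offered back to the global block reserve before the space_info lock is dropped: the reserve absorbs min(len, size - reserved) and the same amount is added to bytes_may_use. A standalone sketch of that saturating top-up (names illustrative):

#include <stdio.h>

struct rsv {
        unsigned long long size;
        unsigned long long reserved;
        int full;
};

/* Absorb up to 'len' freed bytes, never past 'size'; returns the amount
 * actually taken (the hunk adds the same amount to bytes_may_use). */
static unsigned long long refill(struct rsv *r, unsigned long long len)
{
        if (r->full)
                return 0;
        if (len > r->size - r->reserved)
                len = r->size - r->reserved;    /* min(len, size - reserved) */
        r->reserved += len;
        if (r->reserved >= r->size)
                r->full = 1;
        return len;
}

int main(void)
{
        struct rsv global = { .size = 1024, .reserved = 1000, .full = 0 };
        unsigned long long got = refill(&global, 64);

        printf("absorbed %llu, full=%d\n", got, global.full);  /* 24, 1 */
        return 0;
}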
@@ -5466,7 +5507,7 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
        return 0;
 }
 
-static int __get_block_group_index(u64 flags)
+int __get_raid_index(u64 flags)
 {
        int index;
 
@@ -5486,7 +5527,7 @@ static int __get_block_group_index(u64 flags)
 
 static int get_block_group_index(struct btrfs_block_group_cache *cache)
 {
-       return __get_block_group_index(cache->flags);
+       return __get_raid_index(cache->flags);
 }
 
 enum btrfs_loop_type {
@@ -6269,7 +6310,8 @@ use_block_rsv(struct btrfs_trans_handle *trans,
        block_rsv = get_block_rsv(trans, root);
 
        if (block_rsv->size == 0) {
-               ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
+               ret = reserve_metadata_bytes(root, block_rsv, blocksize,
+                                            BTRFS_RESERVE_NO_FLUSH);
                /*
                 * If we couldn't reserve metadata bytes try and use some from
                 * the global reserve.
@@ -6292,11 +6334,11 @@ use_block_rsv(struct btrfs_trans_handle *trans,
                static DEFINE_RATELIMIT_STATE(_rs,
                                DEFAULT_RATELIMIT_INTERVAL,
                                /*DEFAULT_RATELIMIT_BURST*/ 2);
-               if (__ratelimit(&_rs)) {
-                       printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret);
-                       WARN_ON(1);
-               }
-               ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
+               if (__ratelimit(&_rs))
+                       WARN(1, KERN_DEBUG "btrfs: block rsv returned %d\n",
+                            ret);
+               ret = reserve_metadata_bytes(root, block_rsv, blocksize,
+                                            BTRFS_RESERVE_NO_FLUSH);
                if (!ret) {
                        return block_rsv;
                } else if (ret && block_rsv != global_rsv) {
@@ -7427,7 +7469,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
         */
        target = get_restripe_target(root->fs_info, block_group->flags);
        if (target) {
-               index = __get_block_group_index(extended_to_chunk(target));
+               index = __get_raid_index(extended_to_chunk(target));
        } else {
                /*
                 * this is just a balance, so if we were marked as full
@@ -7461,7 +7503,8 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
                 * check to make sure we can actually find a chunk with enough
                 * space to fit our block group in.
                 */
-               if (device->total_bytes > device->bytes_used + min_free) {
+               if (device->total_bytes > device->bytes_used + min_free &&
+                   !device->is_tgtdev_for_dev_replace) {
                        ret = find_free_dev_extent(device, min_free,
                                                   &dev_offset, NULL);
                        if (!ret)
index 472873a94d969a86967e832eac2d452f274f11f9..1b319df29eeee30904bdaa165c1a2fc2b5a49103 100644 (file)
@@ -341,12 +341,10 @@ static int insert_state(struct extent_io_tree *tree,
 {
        struct rb_node *node;
 
-       if (end < start) {
-               printk(KERN_ERR "btrfs end < start %llu %llu\n",
+       if (end < start)
+               WARN(1, KERN_ERR "btrfs end < start %llu %llu\n",
                       (unsigned long long)end,
                       (unsigned long long)start);
-               WARN_ON(1);
-       }
        state->start = start;
        state->end = end;
 
@@ -1919,12 +1917,12 @@ static void repair_io_failure_callback(struct bio *bio, int err)
  * the standard behavior is to write all copies in a raid setup. here we only
  * want to write the one bad copy. so we do the mapping for ourselves and issue
  * submit_bio directly.
- * to avoid any synchonization issues, wait for the data after writing, which
+ * to avoid any synchronization issues, wait for the data after writing, which
  * actually prevents the read that triggered the error from finishing.
  * currently, there can be no more than two copies of every data bit. thus,
  * exactly one rewrite is required.
  */
-int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
+int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
                        u64 length, u64 logical, struct page *page,
                        int mirror_num)
 {
@@ -1946,7 +1944,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
        bio->bi_size = 0;
        map_length = length;
 
-       ret = btrfs_map_block(map_tree, WRITE, logical,
+       ret = btrfs_map_block(fs_info, WRITE, logical,
                              &map_length, &bbio, mirror_num);
        if (ret) {
                bio_put(bio);
@@ -1984,14 +1982,13 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
 int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
                         int mirror_num)
 {
-       struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
        u64 start = eb->start;
        unsigned long i, num_pages = num_extent_pages(eb->start, eb->len);
        int ret = 0;
 
        for (i = 0; i < num_pages; i++) {
                struct page *p = extent_buffer_page(eb, i);
-               ret = repair_io_failure(map_tree, start, PAGE_CACHE_SIZE,
+               ret = repair_io_failure(root->fs_info, start, PAGE_CACHE_SIZE,
                                        start, p, mirror_num);
                if (ret)
                        break;
@@ -2010,7 +2007,7 @@ static int clean_io_failure(u64 start, struct page *page)
        u64 private;
        u64 private_failure;
        struct io_failure_record *failrec;
-       struct btrfs_mapping_tree *map_tree;
+       struct btrfs_fs_info *fs_info;
        struct extent_state *state;
        int num_copies;
        int did_repair = 0;
@@ -2046,11 +2043,11 @@ static int clean_io_failure(u64 start, struct page *page)
        spin_unlock(&BTRFS_I(inode)->io_tree.lock);
 
        if (state && state->start == failrec->start) {
-               map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
-               num_copies = btrfs_num_copies(map_tree, failrec->logical,
-                                               failrec->len);
+               fs_info = BTRFS_I(inode)->root->fs_info;
+               num_copies = btrfs_num_copies(fs_info, failrec->logical,
+                                             failrec->len);
                if (num_copies > 1)  {
-                       ret = repair_io_failure(map_tree, start, failrec->len,
+                       ret = repair_io_failure(fs_info, start, failrec->len,
                                                failrec->logical, page,
                                                failrec->failed_mirror);
                        did_repair = !ret;
@@ -2159,9 +2156,8 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page,
                 * clean_io_failure() clean all those errors at once.
                 */
        }
-       num_copies = btrfs_num_copies(
-                             &BTRFS_I(inode)->root->fs_info->mapping_tree,
-                             failrec->logical, failrec->len);
+       num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info,
+                                     failrec->logical, failrec->len);
        if (num_copies == 1) {
                /*
                 * we only have a single copy of the data, so don't bother with
@@ -2466,10 +2462,6 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
        return bio;
 }
 
-/*
- * Since writes are async, they will only return -ENOMEM.
- * Reads can return the full range of I/O error conditions.
- */
 static int __must_check submit_one_bio(int rw, struct bio *bio,
                                       int mirror_num, unsigned long bio_flags)
 {
@@ -4721,10 +4713,9 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
        }
 
        if (start + min_len > eb->len) {
-               printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
+               WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
                       "wanted %lu %lu\n", (unsigned long long)eb->start,
                       eb->len, start, min_len);
-               WARN_ON(1);
                return -EINVAL;
        }
 
index 711d12b80028b701033d7a326b28ed18b43e37ff..2eacfabd32632e90056e76cc5a678ecce6c6504a 100644 (file)
@@ -337,9 +337,9 @@ struct bio *
 btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
                gfp_t gfp_flags);
 
-struct btrfs_mapping_tree;
+struct btrfs_fs_info;
 
-int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
+int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
                        u64 length, u64 logical, struct page *page,
                        int mirror_num);
 int end_extent_writepage(struct page *page, int err, u64 start, u64 end);
index ce9f79216723fbfcfc563d17e5e8974cbd2e9b00..f169d6b11d7f6a093158428352a44f35d4bae952 100644 (file)
@@ -49,7 +49,7 @@ void extent_map_tree_init(struct extent_map_tree *tree)
 struct extent_map *alloc_extent_map(void)
 {
        struct extent_map *em;
-       em = kmem_cache_alloc(extent_map_cache, GFP_NOFS);
+       em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS);
        if (!em)
                return NULL;
        em->in_tree = 0;
@@ -198,16 +198,15 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
                        merge = rb_entry(rb, struct extent_map, rb_node);
                if (rb && mergable_maps(merge, em)) {
                        em->start = merge->start;
+                       em->orig_start = merge->orig_start;
                        em->len += merge->len;
                        em->block_len += merge->block_len;
                        em->block_start = merge->block_start;
                        merge->in_tree = 0;
-                       if (merge->generation > em->generation) {
-                               em->mod_start = em->start;
-                               em->mod_len = em->len;
-                               em->generation = merge->generation;
-                               list_move(&em->list, &tree->modified_extents);
-                       }
+                       em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
+                       em->mod_start = merge->mod_start;
+                       em->generation = max(em->generation, merge->generation);
+                       list_move(&em->list, &tree->modified_extents);
 
                        list_del_init(&merge->list);
                        rb_erase(&merge->rb_node, &tree->map);
@@ -223,11 +222,8 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
                em->block_len += merge->len;
                rb_erase(&merge->rb_node, &tree->map);
                merge->in_tree = 0;
-               if (merge->generation > em->generation) {
-                       em->mod_len = em->len;
-                       em->generation = merge->generation;
-                       list_move(&em->list, &tree->modified_extents);
-               }
+               em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
+               em->generation = max(em->generation, merge->generation);
                list_del_init(&merge->list);
                free_extent_map(merge);
        }
@@ -265,9 +261,9 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
        em->mod_start = em->start;
        em->mod_len = em->len;
 
-       if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+       if (test_bit(EXTENT_FLAG_FILLING, &em->flags)) {
                prealloc = true;
-               clear_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+               clear_bit(EXTENT_FLAG_FILLING, &em->flags);
        }
 
        try_merge_map(tree, em);
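
try_merge_map() now always folds the neighbor's modified range into the surviving map, rather than only when the neighbor's generation is newer. For a left-hand neighbor the arithmetic is mod_len = (mod_start + mod_len) - merge->mod_start followed by mod_start = merge->mod_start; a small standalone check of that math, assuming the two ranges are adjacent:

#include <assert.h>
#include <stdio.h>

struct em {
        unsigned long long mod_start;
        unsigned long long mod_len;
};

/* Fold a left-hand neighbor's modified range into em, as the hunk does:
 * the result covers [merge->mod_start, em->mod_start + em->mod_len). */
static void merge_left(struct em *em, const struct em *merge)
{
        em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
        em->mod_start = merge->mod_start;
}

int main(void)
{
        struct em cur = { .mod_start = 8192, .mod_len = 4096 };
        struct em left = { .mod_start = 4096, .mod_len = 4096 };

        merge_left(&cur, &left);
        assert(cur.mod_start == 4096 && cur.mod_len == 8192);
        printf("merged mod range: [%llu, %llu)\n",
               cur.mod_start, cur.mod_start + cur.mod_len);
        return 0;
}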
index 679225555f7b597b91012e5a47434a72e1774c91..922943ce29e8caebe0a69feda2a7e481bed0d556 100644 (file)
@@ -14,6 +14,7 @@
 #define EXTENT_FLAG_VACANCY 2 /* no file extent item found */
 #define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */
 #define EXTENT_FLAG_LOGGING 4 /* Logging this extent */
+#define EXTENT_FLAG_FILLING 5 /* Filling in a preallocated extent */
 
 struct extent_map {
        struct rb_node rb_node;
@@ -24,6 +25,7 @@ struct extent_map {
        u64 mod_start;
        u64 mod_len;
        u64 orig_start;
+       u64 orig_block_len;
        u64 block_start;
        u64 block_len;
        u64 generation;
index 1ad08e4e4a15fa25ea7f65397cbc1e154fa1f4b9..bd38cef4235882425c08eb6a5d10306736a14237 100644 (file)
@@ -133,7 +133,6 @@ fail:
        return ERR_PTR(ret);
 }
 
-
 int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root,
                             struct btrfs_path *path, u64 objectid,
@@ -151,6 +150,26 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
        return ret;
 }
 
+u64 btrfs_file_extent_length(struct btrfs_path *path)
+{
+       int extent_type;
+       struct btrfs_file_extent_item *fi;
+       u64 len;
+
+       fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
+                           struct btrfs_file_extent_item);
+       extent_type = btrfs_file_extent_type(path->nodes[0], fi);
+
+       if (extent_type == BTRFS_FILE_EXTENT_REG ||
+           extent_type == BTRFS_FILE_EXTENT_PREALLOC)
+               len = btrfs_file_extent_num_bytes(path->nodes[0], fi);
+       else if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+               len = btrfs_file_extent_inline_len(path->nodes[0], fi);
+       else
+               BUG();
+
+       return len;
+}
 
 static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
                                   struct inode *inode, struct bio *bio,
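
The new btrfs_file_extent_length() helper returns num_bytes for regular and preallocated extents, the inline length for inline extents, and BUG()s on anything else. A user-space analog of that dispatch; the types here are illustrative stand-ins for the on-disk file extent item:

#include <stdio.h>
#include <stdlib.h>

enum ext_type { EXT_REG, EXT_PREALLOC, EXT_INLINE };

struct file_extent {
        enum ext_type type;
        unsigned long long num_bytes;   /* REG/PREALLOC logical length */
        unsigned long long inline_len;  /* INLINE payload length */
};

static unsigned long long extent_length(const struct file_extent *fi)
{
        switch (fi->type) {
        case EXT_REG:
        case EXT_PREALLOC:
                return fi->num_bytes;
        case EXT_INLINE:
                return fi->inline_len;
        }
        abort();        /* mirrors the BUG() on an unknown type */
}

int main(void)
{
        struct file_extent fi = { .type = EXT_INLINE, .inline_len = 123 };

        printf("length %llu\n", extent_length(&fi));
        return 0;
}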
index 9c6673a9231fad34c6387f08822aaada78fe5665..77061bf43edbae995bf55ce73dcc89bf73012400 100644 (file)
@@ -41,6 +41,7 @@
 #include "compat.h"
 #include "volumes.h"
 
+static struct kmem_cache *btrfs_inode_defrag_cachep;
 /*
  * when auto defrag is enabled we
  * queue up these defrag structs to remember which
@@ -90,7 +91,7 @@ static int __compare_inode_defrag(struct inode_defrag *defrag1,
  * If an existing record is found the defrag item you
  * pass in is freed
  */
-static void __btrfs_add_inode_defrag(struct inode *inode,
+static int __btrfs_add_inode_defrag(struct inode *inode,
                                    struct inode_defrag *defrag)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -118,18 +119,24 @@ static void __btrfs_add_inode_defrag(struct inode *inode,
                                entry->transid = defrag->transid;
                        if (defrag->last_offset > entry->last_offset)
                                entry->last_offset = defrag->last_offset;
-                       goto exists;
+                       return -EEXIST;
                }
        }
        set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
        rb_link_node(&defrag->rb_node, parent, p);
        rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
-       return;
+       return 0;
+}
 
-exists:
-       kfree(defrag);
-       return;
+static inline int __need_auto_defrag(struct btrfs_root *root)
+{
+       if (!btrfs_test_opt(root, AUTO_DEFRAG))
+               return 0;
+
+       if (btrfs_fs_closing(root->fs_info))
+               return 0;
 
+       return 1;
 }
 
 /*
@@ -142,11 +149,9 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct inode_defrag *defrag;
        u64 transid;
+       int ret;
 
-       if (!btrfs_test_opt(root, AUTO_DEFRAG))
-               return 0;
-
-       if (btrfs_fs_closing(root->fs_info))
+       if (!__need_auto_defrag(root))
                return 0;
 
        if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
@@ -157,7 +162,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
        else
                transid = BTRFS_I(inode)->root->last_trans;
 
-       defrag = kzalloc(sizeof(*defrag), GFP_NOFS);
+       defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
        if (!defrag)
                return -ENOMEM;
 
@@ -166,20 +171,56 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
        defrag->root = root->root_key.objectid;
 
        spin_lock(&root->fs_info->defrag_inodes_lock);
-       if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
-               __btrfs_add_inode_defrag(inode, defrag);
-       else
-               kfree(defrag);
+       if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) {
+               /*
+                * If we set the IN_DEFRAG flag and then evict the inode from
+                * memory and re-read it, the new in-memory inode won't have
+                * IN_DEFRAG set. In that case we may find an existing defrag
+                * record for it in the tree.
+                */
+               ret = __btrfs_add_inode_defrag(inode, defrag);
+               if (ret)
+                       kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+       } else {
+               kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+       }
        spin_unlock(&root->fs_info->defrag_inodes_lock);
        return 0;
 }
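
__btrfs_add_inode_defrag() now returns -EEXIST after merging into a duplicate instead of freeing the argument itself, so each caller decides the losing record's fate. A standalone analog of that insert-or-merge contract, using a flat table in place of the rbtree (names illustrative):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct rec {
        unsigned long long ino;
        unsigned long long last_offset;
};

static struct rec *table[16];
static int nrecs;

static int insert_or_merge(struct rec *r)
{
        int i;

        for (i = 0; i < nrecs; i++) {
                if (table[i]->ino == r->ino) {
                        /* merge the useful fields, then report the dup */
                        if (r->last_offset > table[i]->last_offset)
                                table[i]->last_offset = r->last_offset;
                        return -EEXIST;
                }
        }
        table[nrecs++] = r;
        return 0;
}

int main(void)
{
        struct rec *a = calloc(1, sizeof(*a));
        struct rec *b = calloc(1, sizeof(*b));

        a->ino = b->ino = 42;
        b->last_offset = 4096;
        insert_or_merge(a);
        if (insert_or_merge(b) == -EEXIST)
                free(b);        /* merged into the existing record */
        printf("ino 42 last_offset %llu\n", table[0]->last_offset);
        return 0;
}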
 
 /*
- * must be called with the defrag_inodes lock held
+ * Requeue the defrag object. If there is a defrag object that points to
+ * the same inode in the tree, we will merge them together (by
+ * __btrfs_add_inode_defrag()) and free the one that we want to requeue.
  */
-struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info,
-                                            u64 root, u64 ino,
-                                            struct rb_node **next)
+void btrfs_requeue_inode_defrag(struct inode *inode,
+                               struct inode_defrag *defrag)
+{
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       int ret;
+
+       if (!__need_auto_defrag(root))
+               goto out;
+
+       /*
+        * Here we don't check the IN_DEFRAG flag, because we need to merge
+        * the records together.
+        */
+       spin_lock(&root->fs_info->defrag_inodes_lock);
+       ret = __btrfs_add_inode_defrag(inode, defrag);
+       spin_unlock(&root->fs_info->defrag_inodes_lock);
+       if (ret)
+               goto out;
+       return;
+out:
+       kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+}
+
+/*
+ * pick the defragable inode that we want; if it doesn't exist, we will get
+ * the next one.
+ */
+static struct inode_defrag *
+btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino)
 {
        struct inode_defrag *entry = NULL;
        struct inode_defrag tmp;
@@ -190,7 +231,8 @@ struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info,
        tmp.ino = ino;
        tmp.root = root;
 
-       p = info->defrag_inodes.rb_node;
+       spin_lock(&fs_info->defrag_inodes_lock);
+       p = fs_info->defrag_inodes.rb_node;
        while (p) {
                parent = p;
                entry = rb_entry(parent, struct inode_defrag, rb_node);
@@ -201,52 +243,131 @@ struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info,
                else if (ret > 0)
                        p = parent->rb_right;
                else
-                       return entry;
+                       goto out;
        }
 
-       if (next) {
-               while (parent && __compare_inode_defrag(&tmp, entry) > 0) {
-                       parent = rb_next(parent);
+       if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
+               parent = rb_next(parent);
+               if (parent)
                        entry = rb_entry(parent, struct inode_defrag, rb_node);
-               }
-               *next = parent;
+               else
+                       entry = NULL;
        }
-       return NULL;
+out:
+       if (entry)
+               rb_erase(parent, &fs_info->defrag_inodes);
+       spin_unlock(&fs_info->defrag_inodes_lock);
+       return entry;
 }
 
-/*
- * run through the list of inodes in the FS that need
- * defragging
- */
-int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
+void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
 {
        struct inode_defrag *defrag;
+       struct rb_node *node;
+
+       spin_lock(&fs_info->defrag_inodes_lock);
+       node = rb_first(&fs_info->defrag_inodes);
+       while (node) {
+               rb_erase(node, &fs_info->defrag_inodes);
+               defrag = rb_entry(node, struct inode_defrag, rb_node);
+               kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+
+               if (need_resched()) {
+                       spin_unlock(&fs_info->defrag_inodes_lock);
+                       cond_resched();
+                       spin_lock(&fs_info->defrag_inodes_lock);
+               }
+
+               node = rb_first(&fs_info->defrag_inodes);
+       }
+       spin_unlock(&fs_info->defrag_inodes_lock);
+}
+
+#define BTRFS_DEFRAG_BATCH     1024
+
+static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
+                                   struct inode_defrag *defrag)
+{
        struct btrfs_root *inode_root;
        struct inode *inode;
-       struct rb_node *n;
        struct btrfs_key key;
        struct btrfs_ioctl_defrag_range_args range;
-       u64 first_ino = 0;
-       u64 root_objectid = 0;
        int num_defrag;
-       int defrag_batch = 1024;
 
+       /* get the inode */
+       key.objectid = defrag->root;
+       btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+       key.offset = (u64)-1;
+       inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
+       if (IS_ERR(inode_root)) {
+               kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+               return PTR_ERR(inode_root);
+       }
+
+       key.objectid = defrag->ino;
+       btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+       key.offset = 0;
+       inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
+       if (IS_ERR(inode)) {
+               kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+               return PTR_ERR(inode);
+       }
+
+       /* do a chunk of defrag */
+       clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
        memset(&range, 0, sizeof(range));
        range.len = (u64)-1;
+       range.start = defrag->last_offset;
+
+       sb_start_write(fs_info->sb);
+       num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
+                                      BTRFS_DEFRAG_BATCH);
+       sb_end_write(fs_info->sb);
+       /*
+        * if we filled the whole defrag batch, there
+        * must be more work to do.  Queue this defrag
+        * again
+        */
+       if (num_defrag == BTRFS_DEFRAG_BATCH) {
+               defrag->last_offset = range.start;
+               btrfs_requeue_inode_defrag(inode, defrag);
+       } else if (defrag->last_offset && !defrag->cycled) {
+               /*
+                * we didn't fill our defrag batch, but
+                * we didn't start at zero.  Make sure we loop
+                * around to the start of the file.
+                */
+               defrag->last_offset = 0;
+               defrag->cycled = 1;
+               btrfs_requeue_inode_defrag(inode, defrag);
+       } else {
+               kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+       }
+
+       iput(inode);
+       return 0;
+}
+
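__btrfs_run_defrag_inode() works in BTRFS_DEFRAG_BATCH-extent chunks: a full batch means more work remains, so the record is requeued at the saved offset, while a partial batch that did not start at offset zero wraps around to the front exactly once. A user-space sketch of that resume-and-wrap control flow (the chunk function is a stand-in):

#include <stdio.h>

#define BATCH 4         /* stands in for BTRFS_DEFRAG_BATCH (1024 above) */

struct job {
        unsigned long long last_offset;
        int cycled;
};

/* Pretend to defrag up to BATCH extents from *start; purely illustrative. */
static int defrag_chunk(unsigned long long *start, unsigned long long end)
{
        int done = 0;

        while (done < BATCH && *start < end) {
                *start += 4096;
                done++;
        }
        return done;
}

int main(void)
{
        struct job j = { .last_offset = 8192, .cycled = 0 };
        unsigned long long end = 65536;

        for (;;) {
                unsigned long long start = j.last_offset;
                int n = defrag_chunk(&start, end);

                if (n == BATCH) {
                        j.last_offset = start;  /* full batch: requeue here */
                } else if (j.last_offset && !j.cycled) {
                        j.last_offset = 0;      /* partial, didn't start at 0 */
                        j.cycled = 1;           /* wrap to the front once */
                } else {
                        break;                  /* this inode is done */
                }
                printf("requeue at %llu\n", j.last_offset);
        }
        return 0;
}
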
+/*
+ * run through the list of inodes in the FS that need
+ * defragging
+ */
+int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
+{
+       struct inode_defrag *defrag;
+       u64 first_ino = 0;
+       u64 root_objectid = 0;
 
        atomic_inc(&fs_info->defrag_running);
-       spin_lock(&fs_info->defrag_inodes_lock);
        while(1) {
-               n = NULL;
+               if (!__need_auto_defrag(fs_info->tree_root))
+                       break;
 
                /* find an inode to defrag */
-               defrag = btrfs_find_defrag_inode(fs_info, root_objectid,
-                                                first_ino, &n);
+               defrag = btrfs_pick_defrag_inode(fs_info, root_objectid,
+                                                first_ino);
                if (!defrag) {
-                       if (n) {
-                               defrag = rb_entry(n, struct inode_defrag,
-                                                 rb_node);
-                       } else if (root_objectid || first_ino) {
+                       if (root_objectid || first_ino) {
                                root_objectid = 0;
                                first_ino = 0;
                                continue;
@@ -255,70 +376,11 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
                        }
                }
 
-               /* remove it from the rbtree */
                first_ino = defrag->ino + 1;
                root_objectid = defrag->root;
-               rb_erase(&defrag->rb_node, &fs_info->defrag_inodes);
-
-               if (btrfs_fs_closing(fs_info))
-                       goto next_free;
-
-               spin_unlock(&fs_info->defrag_inodes_lock);
-
-               /* get the inode */
-               key.objectid = defrag->root;
-               btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
-               key.offset = (u64)-1;
-               inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
-               if (IS_ERR(inode_root))
-                       goto next;
-
-               key.objectid = defrag->ino;
-               btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
-               key.offset = 0;
-
-               inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
-               if (IS_ERR(inode))
-                       goto next;
 
-               /* do a chunk of defrag */
-               clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
-               range.start = defrag->last_offset;
-               num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
-                                              defrag_batch);
-               /*
-                * if we filled the whole defrag batch, there
-                * must be more work to do.  Queue this defrag
-                * again
-                */
-               if (num_defrag == defrag_batch) {
-                       defrag->last_offset = range.start;
-                       __btrfs_add_inode_defrag(inode, defrag);
-                       /*
-                        * we don't want to kfree defrag, we added it back to
-                        * the rbtree
-                        */
-                       defrag = NULL;
-               } else if (defrag->last_offset && !defrag->cycled) {
-                       /*
-                        * we didn't fill our defrag batch, but
-                        * we didn't start at zero.  Make sure we loop
-                        * around to the start of the file.
-                        */
-                       defrag->last_offset = 0;
-                       defrag->cycled = 1;
-                       __btrfs_add_inode_defrag(inode, defrag);
-                       defrag = NULL;
-               }
-
-               iput(inode);
-next:
-               spin_lock(&fs_info->defrag_inodes_lock);
-next_free:
-               kfree(defrag);
+               __btrfs_run_defrag_inode(fs_info, defrag);
        }
-       spin_unlock(&fs_info->defrag_inodes_lock);
-
        atomic_dec(&fs_info->defrag_running);
 
        /*
@@ -526,6 +588,8 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
                                split->block_len = em->block_len;
                        else
                                split->block_len = split->len;
+                       split->orig_block_len = max(split->block_len,
+                                                   em->orig_block_len);
                        split->generation = gen;
                        split->bdev = em->bdev;
                        split->flags = flags;
@@ -547,6 +611,8 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
                        split->flags = flags;
                        split->compress_type = em->compress_type;
                        split->generation = gen;
+                       split->orig_block_len = max(em->block_len,
+                                                   em->orig_block_len);
 
                        if (compressed) {
                                split->block_len = em->block_len;
@@ -555,7 +621,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
                        } else {
                                split->block_len = split->len;
                                split->block_start = em->block_start + diff;
-                               split->orig_start = split->start;
+                               split->orig_start = em->orig_start;
                        }
 
                        ret = add_extent_mapping(em_tree, split);
@@ -1348,7 +1414,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 
                balance_dirty_pages_ratelimited(inode->i_mapping);
                if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
-                       btrfs_btree_balance_dirty(root, 1);
+                       btrfs_btree_balance_dirty(root);
 
                pos += copied;
                num_written += copied;
@@ -1397,6 +1463,24 @@ out:
        return written ? written : err;
 }
 
+static void update_time_for_write(struct inode *inode)
+{
+       struct timespec now;
+
+       if (IS_NOCMTIME(inode))
+               return;
+
+       now = current_fs_time(inode->i_sb);
+       if (!timespec_equal(&inode->i_mtime, &now))
+               inode->i_mtime = now;
+
+       if (!timespec_equal(&inode->i_ctime, &now))
+               inode->i_ctime = now;
+
+       if (IS_I_VERSION(inode))
+               inode_inc_iversion(inode);
+}
+
 static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
                                    const struct iovec *iov,
                                    unsigned long nr_segs, loff_t pos)
@@ -1409,6 +1493,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
        ssize_t num_written = 0;
        ssize_t err = 0;
        size_t count, ocount;
+       bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host);
 
        sb_start_write(inode->i_sb);
 
@@ -1451,11 +1536,13 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
                goto out;
        }
 
-       err = file_update_time(file);
-       if (err) {
-               mutex_unlock(&inode->i_mutex);
-               goto out;
-       }
+       /*
+        * We reserve space for updating the inode when we reserve space for
+        * the extent we are going to write, so any ENOSPC is returned there.
+        * We don't need to start yet another transaction to update the inode,
+        * as we will update it when we finish writing whatever data we write.
+        */
+       update_time_for_write(inode);
 
        start_pos = round_down(pos, root->sectorsize);
        if (start_pos > i_size_read(inode)) {
@@ -1466,6 +1553,9 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
                }
        }
 
+       if (sync)
+               atomic_inc(&BTRFS_I(inode)->sync_writers);
+
        if (unlikely(file->f_flags & O_DIRECT)) {
                num_written = __btrfs_direct_write(iocb, iov, nr_segs,
                                                   pos, ppos, count, ocount);
@@ -1492,14 +1582,21 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
         * this will either be one more than the running transaction
         * or the generation used for the next transaction if there isn't
         * one running right now.
+        *
+        * We also have to set last_sub_trans to the current log transid,
+        * otherwise subsequent syncs to a file that's been synced in this
+        * transaction will appear to have already occurred.
         */
        BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
+       BTRFS_I(inode)->last_sub_trans = root->log_transid;
        if (num_written > 0 || num_written == -EIOCBQUEUED) {
                err = generic_write_sync(file, pos, num_written);
                if (err < 0 && num_written > 0)
                        num_written = err;
        }
 out:
+       if (sync)
+               atomic_dec(&BTRFS_I(inode)->sync_writers);
        sb_end_write(inode->i_sb);
        current->backing_dev_info = NULL;
        return num_written ? num_written : err;
@@ -1550,7 +1647,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
         * out of the ->i_mutex. If so, we can flush the dirty pages with
         * multiple tasks and improve performance.
         */
+       atomic_inc(&BTRFS_I(inode)->sync_writers);
        ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+       atomic_dec(&BTRFS_I(inode)->sync_writers);
        if (ret)
                return ret;
 
@@ -1561,7 +1660,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
         * range being left.
         */
        atomic_inc(&root->log_batch);
-       btrfs_wait_ordered_range(inode, start, end);
+       btrfs_wait_ordered_range(inode, start, end - start + 1);
        atomic_inc(&root->log_batch);
 
        /*
@@ -1767,6 +1866,7 @@ out:
 
                hole_em->block_start = EXTENT_MAP_HOLE;
                hole_em->block_len = 0;
+               hole_em->orig_block_len = 0;
                hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
                hole_em->compress_type = BTRFS_COMPRESS_NONE;
                hole_em->generation = trans->transid;
@@ -1796,48 +1896,51 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
        struct btrfs_path *path;
        struct btrfs_block_rsv *rsv;
        struct btrfs_trans_handle *trans;
-       u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
-       u64 lockstart = (offset + mask) & ~mask;
-       u64 lockend = ((offset + len) & ~mask) - 1;
+       u64 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize);
+       u64 lockend = round_down(offset + len,
+                                BTRFS_I(inode)->root->sectorsize) - 1;
        u64 cur_offset = lockstart;
        u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
        u64 drop_end;
-       unsigned long nr;
        int ret = 0;
        int err = 0;
-       bool same_page = (offset >> PAGE_CACHE_SHIFT) ==
-               ((offset + len) >> PAGE_CACHE_SHIFT);
+       bool same_page = ((offset >> PAGE_CACHE_SHIFT) ==
+                         ((offset + len - 1) >> PAGE_CACHE_SHIFT));
 
        btrfs_wait_ordered_range(inode, offset, len);
 
        mutex_lock(&inode->i_mutex);
-       if (offset >= inode->i_size) {
-               mutex_unlock(&inode->i_mutex);
-               return 0;
-       }
-
+       /*
+        * We don't need to truncate any page which is beyond the end of
+        * the file, because we are sure there is no data there.
+        */
        /*
         * Only do this if we are in the same page and we aren't doing the
         * entire page.
         */
        if (same_page && len < PAGE_CACHE_SIZE) {
-               ret = btrfs_truncate_page(inode, offset, len, 0);
+               if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE))
+                       ret = btrfs_truncate_page(inode, offset, len, 0);
                mutex_unlock(&inode->i_mutex);
                return ret;
        }
 
        /* zero back part of the first page */
-       ret = btrfs_truncate_page(inode, offset, 0, 0);
-       if (ret) {
-               mutex_unlock(&inode->i_mutex);
-               return ret;
+       if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) {
+               ret = btrfs_truncate_page(inode, offset, 0, 0);
+               if (ret) {
+                       mutex_unlock(&inode->i_mutex);
+                       return ret;
+               }
        }
 
        /* zero the front end of the last page */
-       ret = btrfs_truncate_page(inode, offset + len, 0, 1);
-       if (ret) {
-               mutex_unlock(&inode->i_mutex);
-               return ret;
+       if (offset + len < round_up(inode->i_size, PAGE_CACHE_SIZE)) {
+               ret = btrfs_truncate_page(inode, offset + len, 0, 1);
+               if (ret) {
+                       mutex_unlock(&inode->i_mutex);
+                       return ret;
+               }
        }
 
        if (lockend < lockstart) {
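
btrfs_punch_hole() now derives its locked range with round_up()/round_down(): only sectors wholly inside [offset, offset + len) are dropped, and lockend can land below lockstart when no whole sector is covered, in which case page zeroing alone suffices. A standalone check of the boundary math, with the sector size assumed to be 4096:

#include <stdio.h>

#define SECTOR 4096ULL  /* assumed sector size for the example */

static unsigned long long round_up_u64(unsigned long long x,
                                       unsigned long long a)
{
        return (x + a - 1) / a * a;
}

static unsigned long long round_down_u64(unsigned long long x,
                                         unsigned long long a)
{
        return x / a * a;
}

int main(void)
{
        /* Whole sectors inside [1000, 11000): lock [4096, 8191]. */
        unsigned long long lockstart = round_up_u64(1000, SECTOR);
        unsigned long long lockend = round_down_u64(1000 + 10000, SECTOR) - 1;

        printf("lock range [%llu, %llu]\n", lockstart, lockend);

        /* [4000, 8000) covers no whole sector: lockend (4095) falls below
         * lockstart (4096), the case handled with page zeroing alone. */
        lockstart = round_up_u64(4000, SECTOR);
        lockend = round_down_u64(4000 + 4000, SECTOR) - 1;
        printf("degenerate: start=%llu end=%llu\n", lockstart, lockend);
        return 0;
}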
@@ -1930,9 +2033,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
                        break;
                }
 
-               nr = trans->blocks_used;
                btrfs_end_transaction(trans, root);
-               btrfs_btree_balance_dirty(root, nr);
+               btrfs_btree_balance_dirty(root);
 
                trans = btrfs_start_transaction(root, 3);
                if (IS_ERR(trans)) {
@@ -1963,11 +2065,13 @@ out_trans:
        if (!trans)
                goto out_free;
 
+       inode_inc_iversion(inode);
+       inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+
        trans->block_rsv = &root->fs_info->trans_block_rsv;
        ret = btrfs_update_inode(trans, root, inode);
-       nr = trans->blocks_used;
        btrfs_end_transaction(trans, root);
-       btrfs_btree_balance_dirty(root, nr);
+       btrfs_btree_balance_dirty(root);
 out_free:
        btrfs_free_path(path);
        btrfs_free_block_rsv(root, rsv);
@@ -1991,12 +2095,12 @@ static long btrfs_fallocate(struct file *file, int mode,
        u64 alloc_end;
        u64 alloc_hint = 0;
        u64 locked_end;
-       u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
        struct extent_map *em;
+       int blocksize = BTRFS_I(inode)->root->sectorsize;
        int ret;
 
-       alloc_start = offset & ~mask;
-       alloc_end =  (offset + len + mask) & ~mask;
+       alloc_start = round_down(offset, blocksize);
+       alloc_end = round_up(offset + len, blocksize);
 
        /* Make sure we aren't being given some crap mode */
        if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
@@ -2009,7 +2113,7 @@ static long btrfs_fallocate(struct file *file, int mode,
         * Make sure we have enough space before we do the
         * allocation.
         */
-       ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start + 1);
+       ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
        if (ret)
                return ret;
 
@@ -2077,7 +2181,7 @@ static long btrfs_fallocate(struct file *file, int mode,
                }
                last_byte = min(extent_map_end(em), alloc_end);
                actual_end = min_t(u64, extent_map_end(em), offset + len);
-               last_byte = (last_byte + mask) & ~mask;
+               last_byte = ALIGN(last_byte, blocksize);
 
                if (em->block_start == EXTENT_MAP_HOLE ||
                    (cur_offset >= inode->i_size &&
@@ -2116,7 +2220,7 @@ static long btrfs_fallocate(struct file *file, int mode,
 out:
        mutex_unlock(&inode->i_mutex);
        /* Let go of our reservation. */
-       btrfs_free_reserved_data_space(inode, alloc_end - alloc_start + 1);
+       btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
        return ret;
 }
 
@@ -2292,3 +2396,21 @@ const struct file_operations btrfs_file_operations = {
        .compat_ioctl   = btrfs_ioctl,
 #endif
 };
+
+void btrfs_auto_defrag_exit(void)
+{
+       if (btrfs_inode_defrag_cachep)
+               kmem_cache_destroy(btrfs_inode_defrag_cachep);
+}
+
+int btrfs_auto_defrag_init(void)
+{
+       btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
+                                       sizeof(struct inode_defrag), 0,
+                                       SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+                                       NULL);
+       if (!btrfs_inode_defrag_cachep)
+               return -ENOMEM;
+
+       return 0;
+}
index 1027b854b90cec02b9d2328804bc12f23f9bc00b..59ea2e4349c9cdbecb105a3a384376f52b0a5da6 100644 (file)
@@ -307,7 +307,6 @@ static void io_ctl_unmap_page(struct io_ctl *io_ctl)
 
 static void io_ctl_map_page(struct io_ctl *io_ctl, int clear)
 {
-       WARN_ON(io_ctl->cur);
        BUG_ON(io_ctl->index >= io_ctl->num_pages);
        io_ctl->page = io_ctl->pages[io_ctl->index++];
        io_ctl->cur = kmap(io_ctl->page);
@@ -1250,18 +1249,13 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl,
                         * if previous extent entry covers the offset,
                         * we should return it instead of the bitmap entry
                         */
-                       n = &entry->offset_index;
-                       while (1) {
-                               n = rb_prev(n);
-                               if (!n)
-                                       break;
+                       n = rb_prev(&entry->offset_index);
+                       if (n) {
                                prev = rb_entry(n, struct btrfs_free_space,
                                                offset_index);
-                               if (!prev->bitmap) {
-                                       if (prev->offset + prev->bytes > offset)
-                                               entry = prev;
-                                       break;
-                               }
+                               if (!prev->bitmap &&
+                                   prev->offset + prev->bytes > offset)
+                                       entry = prev;
                        }
                }
                return entry;
@@ -1287,18 +1281,13 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl,
        }
 
        if (entry->bitmap) {
-               n = &entry->offset_index;
-               while (1) {
-                       n = rb_prev(n);
-                       if (!n)
-                               break;
+               n = rb_prev(&entry->offset_index);
+               if (n) {
                        prev = rb_entry(n, struct btrfs_free_space,
                                        offset_index);
-                       if (!prev->bitmap) {
-                               if (prev->offset + prev->bytes > offset)
-                                       return prev;
-                               break;
-                       }
+                       if (!prev->bitmap &&
+                           prev->offset + prev->bytes > offset)
+                               return prev;
                }
                if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
                        return entry;
@@ -1364,7 +1353,7 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
        u64 bitmap_bytes;
        u64 extent_bytes;
        u64 size = block_group->key.offset;
-       u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize;
+       u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
        int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
 
        BUG_ON(ctl->total_bitmaps > max_bitmaps);
@@ -1650,8 +1639,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
         * some block groups are so tiny they can't be enveloped by a bitmap, so
         * don't even bother to create a bitmap for this
         */
-       if (BITS_PER_BITMAP * block_group->sectorsize >
-           block_group->key.offset)
+       if (BITS_PER_BITMAP * ctl->unit > block_group->key.offset)
                return false;
 
        return true;
@@ -2298,10 +2286,10 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
        unsigned long total_found = 0;
        int ret;
 
-       i = offset_to_bit(entry->offset, block_group->sectorsize,
+       i = offset_to_bit(entry->offset, ctl->unit,
                          max_t(u64, offset, entry->offset));
-       want_bits = bytes_to_bits(bytes, block_group->sectorsize);
-       min_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
+       want_bits = bytes_to_bits(bytes, ctl->unit);
+       min_bits = bytes_to_bits(min_bytes, ctl->unit);
 
 again:
        found_bits = 0;
@@ -2325,23 +2313,22 @@ again:
 
        total_found += found_bits;
 
-       if (cluster->max_size < found_bits * block_group->sectorsize)
-               cluster->max_size = found_bits * block_group->sectorsize;
+       if (cluster->max_size < found_bits * ctl->unit)
+               cluster->max_size = found_bits * ctl->unit;
 
        if (total_found < want_bits || cluster->max_size < cont1_bytes) {
                i = next_zero + 1;
                goto again;
        }
 
-       cluster->window_start = start * block_group->sectorsize +
-               entry->offset;
+       cluster->window_start = start * ctl->unit + entry->offset;
        rb_erase(&entry->offset_index, &ctl->free_space_offset);
        ret = tree_insert_offset(&cluster->root, entry->offset,
                                 &entry->offset_index, 1);
        BUG_ON(ret); /* -EEXIST; Logic error */
 
        trace_btrfs_setup_cluster(block_group, cluster,
-                                 total_found * block_group->sectorsize, 1);
+                                 total_found * ctl->unit, 1);
        return 0;
 }
 
index b1a1c929ba8047553aa36773cafaf692b75eb2f7..d26f67a59e36f9caf3e91832fa1d4c943c8c9040 100644 (file)
@@ -434,8 +434,9 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
         * 3 items for pre-allocation
         */
        trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8);
-       ret = btrfs_block_rsv_add_noflush(root, trans->block_rsv,
-                                         trans->bytes_reserved);
+       ret = btrfs_block_rsv_add(root, trans->block_rsv,
+                                 trans->bytes_reserved,
+                                 BTRFS_RESERVE_NO_FLUSH);
        if (ret)
                goto out;
        trace_btrfs_space_reservation(root->fs_info, "ino_cache",
index 95542a1b3dfc99632219310f0108788789247fc9..67ed24ae86bbc1c475517d2593eeca72e56880ac 100644 (file)
@@ -71,6 +71,7 @@ static const struct file_operations btrfs_dir_file_operations;
 static struct extent_io_ops btrfs_extent_io_ops;
 
 static struct kmem_cache *btrfs_inode_cachep;
+static struct kmem_cache *btrfs_delalloc_work_cachep;
 struct kmem_cache *btrfs_trans_handle_cachep;
 struct kmem_cache *btrfs_transaction_cachep;
 struct kmem_cache *btrfs_path_cachep;
@@ -94,6 +95,10 @@ static noinline int cow_file_range(struct inode *inode,
                                   struct page *locked_page,
                                   u64 start, u64 end, int *page_started,
                                   unsigned long *nr_written, int unlock);
+static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
+                                          u64 len, u64 orig_start,
+                                          u64 block_start, u64 block_len,
+                                          u64 orig_block_len, int type);
 
 static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
                                     struct inode *inode,  struct inode *dir,
@@ -698,14 +703,19 @@ retry:
 
                em->block_start = ins.objectid;
                em->block_len = ins.offset;
+               em->orig_block_len = ins.offset;
                em->bdev = root->fs_info->fs_devices->latest_bdev;
                em->compress_type = async_extent->compress_type;
                set_bit(EXTENT_FLAG_PINNED, &em->flags);
                set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+               em->generation = -1;
 
                while (1) {
                        write_lock(&em_tree->lock);
                        ret = add_extent_mapping(em_tree, em);
+                       if (!ret)
+                               list_move(&em->list,
+                                         &em_tree->modified_extents);
                        write_unlock(&em_tree->lock);
                        if (ret != -EEXIST) {
                                free_extent_map(em);
@@ -803,14 +813,14 @@ static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
  * required to start IO on it.  It may be clean and already done with
  * IO when we return.
  */
-static noinline int cow_file_range(struct inode *inode,
-                                  struct page *locked_page,
-                                  u64 start, u64 end, int *page_started,
-                                  unsigned long *nr_written,
-                                  int unlock)
+static noinline int __cow_file_range(struct btrfs_trans_handle *trans,
+                                    struct inode *inode,
+                                    struct btrfs_root *root,
+                                    struct page *locked_page,
+                                    u64 start, u64 end, int *page_started,
+                                    unsigned long *nr_written,
+                                    int unlock)
 {
-       struct btrfs_root *root = BTRFS_I(inode)->root;
-       struct btrfs_trans_handle *trans;
        u64 alloc_hint = 0;
        u64 num_bytes;
        unsigned long ram_size;
@@ -823,25 +833,10 @@ static noinline int cow_file_range(struct inode *inode,
        int ret = 0;
 
        BUG_ON(btrfs_is_free_space_inode(inode));
-       trans = btrfs_join_transaction(root);
-       if (IS_ERR(trans)) {
-               extent_clear_unlock_delalloc(inode,
-                            &BTRFS_I(inode)->io_tree,
-                            start, end, locked_page,
-                            EXTENT_CLEAR_UNLOCK_PAGE |
-                            EXTENT_CLEAR_UNLOCK |
-                            EXTENT_CLEAR_DELALLOC |
-                            EXTENT_CLEAR_DIRTY |
-                            EXTENT_SET_WRITEBACK |
-                            EXTENT_END_WRITEBACK);
-               return PTR_ERR(trans);
-       }
-       trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 
        num_bytes = (end - start + blocksize) & ~(blocksize - 1);
        num_bytes = max(blocksize,  num_bytes);
        disk_num_bytes = num_bytes;
-       ret = 0;
 
        /* if this is a small write inside eof, kick off defrag */
        if (num_bytes < 64 * 1024 &&
@@ -900,12 +895,17 @@ static noinline int cow_file_range(struct inode *inode,
 
                em->block_start = ins.objectid;
                em->block_len = ins.offset;
+               em->orig_block_len = ins.offset;
                em->bdev = root->fs_info->fs_devices->latest_bdev;
                set_bit(EXTENT_FLAG_PINNED, &em->flags);
+               em->generation = -1;
 
                while (1) {
                        write_lock(&em_tree->lock);
                        ret = add_extent_mapping(em_tree, em);
+                       if (!ret)
+                               list_move(&em->list,
+                                         &em_tree->modified_extents);
                        write_unlock(&em_tree->lock);
                        if (ret != -EEXIST) {
                                free_extent_map(em);
@@ -952,11 +952,9 @@ static noinline int cow_file_range(struct inode *inode,
                alloc_hint = ins.objectid + ins.offset;
                start += cur_alloc_size;
        }
-       ret = 0;
 out:
-       btrfs_end_transaction(trans, root);
-
        return ret;
+
 out_unlock:
        extent_clear_unlock_delalloc(inode,
                     &BTRFS_I(inode)->io_tree,
@@ -971,6 +969,39 @@ out_unlock:
        goto out;
 }
 
+static noinline int cow_file_range(struct inode *inode,
+                                  struct page *locked_page,
+                                  u64 start, u64 end, int *page_started,
+                                  unsigned long *nr_written,
+                                  int unlock)
+{
+       struct btrfs_trans_handle *trans;
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       int ret;
+
+       trans = btrfs_join_transaction(root);
+       if (IS_ERR(trans)) {
+               extent_clear_unlock_delalloc(inode,
+                            &BTRFS_I(inode)->io_tree,
+                            start, end, locked_page,
+                            EXTENT_CLEAR_UNLOCK_PAGE |
+                            EXTENT_CLEAR_UNLOCK |
+                            EXTENT_CLEAR_DELALLOC |
+                            EXTENT_CLEAR_DIRTY |
+                            EXTENT_SET_WRITEBACK |
+                            EXTENT_END_WRITEBACK);
+               return PTR_ERR(trans);
+       }
+       trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+
+       ret = __cow_file_range(trans, inode, root, locked_page, start, end,
+                              page_started, nr_written, unlock);
+
+       btrfs_end_transaction(trans, root);
+
+       return ret;
+}
+
 /*
  * work queue callback to start compression on a file and its pages
  */
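The hunk above completes the split of cow_file_range(): the allocation loop now lives in __cow_file_range(), which runs entirely inside a caller-supplied transaction handle, while the old entry point is reduced to a wrapper that owns the join/end pairing. Callers that already hold a handle, such as run_delalloc_nocow() below, call the double-underscore version directly and stop paying a transaction join/end per COW range. In sketch form, using only the signatures from this diff:

	/* Wrapper path (plain delalloc writeback): owns the transaction. */
	trans = btrfs_join_transaction(root);
	ret = __cow_file_range(trans, inode, root, locked_page,
			       start, end, page_started, nr_written, unlock);
	btrfs_end_transaction(trans, root);

	/* Nocow fallback path: reuses the handle it already holds. */
	ret = __cow_file_range(trans, inode, root, locked_page,
			       cow_start, end, page_started, nr_written, 1);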
@@ -1126,6 +1157,7 @@ static noinline int run_delalloc_nocow(struct inode *inode,
        u64 extent_offset;
        u64 disk_bytenr;
        u64 num_bytes;
+       u64 disk_num_bytes;
        int extent_type;
        int ret, err;
        int type;
@@ -1228,6 +1260,8 @@ next_slot:
                        extent_offset = btrfs_file_extent_offset(leaf, fi);
                        extent_end = found_key.offset +
                                btrfs_file_extent_num_bytes(leaf, fi);
+                       disk_num_bytes =
+                               btrfs_file_extent_disk_num_bytes(leaf, fi);
                        if (extent_end <= start) {
                                path->slots[0]++;
                                goto next_slot;
@@ -1281,9 +1315,9 @@ out_check:
 
                btrfs_release_path(path);
                if (cow_start != (u64)-1) {
-                       ret = cow_file_range(inode, locked_page, cow_start,
-                                       found_key.offset - 1, page_started,
-                                       nr_written, 1);
+                       ret = __cow_file_range(trans, inode, root, locked_page,
+                                              cow_start, found_key.offset - 1,
+                                              page_started, nr_written, 1);
                        if (ret) {
                                btrfs_abort_transaction(trans, root, ret);
                                goto error;
@@ -1298,16 +1332,21 @@ out_check:
                        em = alloc_extent_map();
                        BUG_ON(!em); /* -ENOMEM */
                        em->start = cur_offset;
-                       em->orig_start = em->start;
+                       em->orig_start = found_key.offset - extent_offset;
                        em->len = num_bytes;
                        em->block_len = num_bytes;
                        em->block_start = disk_bytenr;
+                       em->orig_block_len = disk_num_bytes;
                        em->bdev = root->fs_info->fs_devices->latest_bdev;
                        set_bit(EXTENT_FLAG_PINNED, &em->flags);
-                       set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+                       set_bit(EXTENT_FLAG_FILLING, &em->flags);
+                       em->generation = -1;
                        while (1) {
                                write_lock(&em_tree->lock);
                                ret = add_extent_mapping(em_tree, em);
+                               if (!ret)
+                                       list_move(&em->list,
+                                                 &em_tree->modified_extents);
                                write_unlock(&em_tree->lock);
                                if (ret != -EEXIST) {
                                        free_extent_map(em);
@@ -1352,8 +1391,9 @@ out_check:
        }
 
        if (cow_start != (u64)-1) {
-               ret = cow_file_range(inode, locked_page, cow_start, end,
-                                    page_started, nr_written, 1);
+               ret = __cow_file_range(trans, inode, root, locked_page,
+                                      cow_start, end,
+                                      page_started, nr_written, 1);
                if (ret) {
                        btrfs_abort_transaction(trans, root, ret);
                        goto error;
@@ -1531,7 +1571,6 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
                         unsigned long bio_flags)
 {
        struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
-       struct btrfs_mapping_tree *map_tree;
        u64 logical = (u64)bio->bi_sector << 9;
        u64 length = 0;
        u64 map_length;
@@ -1541,11 +1580,10 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
                return 0;
 
        length = bio->bi_size;
-       map_tree = &root->fs_info->mapping_tree;
        map_length = length;
-       ret = btrfs_map_block(map_tree, READ, logical,
+       ret = btrfs_map_block(root->fs_info, READ, logical,
                              &map_length, NULL, 0);
-       /* Will always return 0 or 1 with map_multi == NULL */
+       /* Will always return 0 with map_multi == NULL */
        BUG_ON(ret < 0);
        if (map_length < length + size)
                return 1;
@@ -1586,7 +1624,12 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
                          u64 bio_offset)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
-       return btrfs_map_bio(root, rw, bio, mirror_num, 1);
+       int ret;
+
+       ret = btrfs_map_bio(root, rw, bio, mirror_num, 1);
+       if (ret)
+               bio_endio(bio, ret);
+       return ret;
 }
 
 /*
@@ -1601,6 +1644,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
        int ret = 0;
        int skip_sum;
        int metadata = 0;
+       int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
 
        skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
 
@@ -1610,31 +1654,43 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
        if (!(rw & REQ_WRITE)) {
                ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
                if (ret)
-                       return ret;
+                       goto out;
 
                if (bio_flags & EXTENT_BIO_COMPRESSED) {
-                       return btrfs_submit_compressed_read(inode, bio,
-                                                   mirror_num, bio_flags);
+                       ret = btrfs_submit_compressed_read(inode, bio,
+                                                          mirror_num,
+                                                          bio_flags);
+                       goto out;
                } else if (!skip_sum) {
                        ret = btrfs_lookup_bio_sums(root, inode, bio, NULL);
                        if (ret)
-                               return ret;
+                               goto out;
                }
                goto mapit;
-       } else if (!skip_sum) {
+       } else if (async && !skip_sum) {
                /* csum items have already been cloned */
                if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
                        goto mapit;
                /* we're doing a write, do the async checksumming */
-               return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
+               ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
                                   inode, rw, bio, mirror_num,
                                   bio_flags, bio_offset,
                                   __btrfs_submit_bio_start,
                                   __btrfs_submit_bio_done);
+               goto out;
+       } else if (!skip_sum) {
+               ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
+               if (ret)
+                       goto out;
        }
 
 mapit:
-       return btrfs_map_bio(root, rw, bio, mirror_num, 0);
+       ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
+
+out:
+       if (ret < 0)
+               bio_endio(bio, ret);
+       return ret;
 }
 
 /*
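Two things change in btrfs_submit_bio_hook() above, both tied to the fsync/O_DIRECT latency work mentioned in the pull message. First, writes only take the async-checksumming worker path while BTRFS_I(inode)->sync_writers is zero; a synchronous writer checksums inline via btrfs_csum_one_bio() and skips a queue/wakeup round trip. Second, every error now funnels through the out: label, which completes the bio with bio_endio() instead of handing a partially submitted bio back to the caller. The core decision, reduced to a sketch:

	int async = !atomic_read(&BTRFS_I(inode)->sync_writers);

	if (async && !skip_sum) {
		/* hand the csum work to a helper thread */
		ret = btrfs_wq_submit_bio(root->fs_info, inode, rw, bio,
					  mirror_num, bio_flags, bio_offset,
					  __btrfs_submit_bio_start,
					  __btrfs_submit_bio_done);
	} else if (!skip_sum) {
		/* a sync writer is active: checksum inline to cut latency */
		ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
	}
	if (ret < 0)
		bio_endio(bio, ret);	/* fail the bio here, exactly once */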
@@ -1657,8 +1713,7 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
                              struct extent_state **cached_state)
 {
-       if ((end & (PAGE_CACHE_SIZE - 1)) == 0)
-               WARN_ON(1);
+       WARN_ON((end & (PAGE_CACHE_SIZE - 1)) == 0);
        return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
                                   cached_state, GFP_NOFS);
 }
@@ -1867,22 +1922,20 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
 
        if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
                BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
-               ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
-               if (!ret) {
-                       if (nolock)
-                               trans = btrfs_join_transaction_nolock(root);
-                       else
-                               trans = btrfs_join_transaction(root);
-                       if (IS_ERR(trans)) {
-                               ret = PTR_ERR(trans);
-                               trans = NULL;
-                               goto out;
-                       }
-                       trans->block_rsv = &root->fs_info->delalloc_block_rsv;
-                       ret = btrfs_update_inode_fallback(trans, root, inode);
-                       if (ret) /* -ENOMEM or corruption */
-                               btrfs_abort_transaction(trans, root, ret);
+               btrfs_ordered_update_i_size(inode, 0, ordered_extent);
+               if (nolock)
+                       trans = btrfs_join_transaction_nolock(root);
+               else
+                       trans = btrfs_join_transaction(root);
+               if (IS_ERR(trans)) {
+                       ret = PTR_ERR(trans);
+                       trans = NULL;
+                       goto out;
                }
+               trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+               ret = btrfs_update_inode_fallback(trans, root, inode);
+               if (ret) /* -ENOMEM or corruption */
+                       btrfs_abort_transaction(trans, root, ret);
                goto out;
        }
 
@@ -1931,15 +1984,11 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
        add_pending_csums(trans, inode, ordered_extent->file_offset,
                          &ordered_extent->list);
 
-       ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
-       if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
-               ret = btrfs_update_inode_fallback(trans, root, inode);
-               if (ret) { /* -ENOMEM or corruption */
-                       btrfs_abort_transaction(trans, root, ret);
-                       goto out_unlock;
-               }
-       } else {
-               btrfs_set_inode_last_trans(trans, inode);
+       btrfs_ordered_update_i_size(inode, 0, ordered_extent);
+       ret = btrfs_update_inode_fallback(trans, root, inode);
+       if (ret) { /* -ENOMEM or corruption */
+               btrfs_abort_transaction(trans, root, ret);
+               goto out_unlock;
        }
        ret = 0;
 out_unlock:
@@ -3074,7 +3123,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
        struct btrfs_trans_handle *trans;
        struct inode *inode = dentry->d_inode;
        int ret;
-       unsigned long nr = 0;
 
        trans = __unlink_start_trans(dir, dentry);
        if (IS_ERR(trans))
@@ -3094,9 +3142,8 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
        }
 
 out:
-       nr = trans->blocks_used;
        __unlink_end_trans(trans, root);
-       btrfs_btree_balance_dirty(root, nr);
+       btrfs_btree_balance_dirty(root);
        return ret;
 }
 
@@ -3186,7 +3233,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
        int err = 0;
        struct btrfs_root *root = BTRFS_I(dir)->root;
        struct btrfs_trans_handle *trans;
-       unsigned long nr = 0;
 
        if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
                return -ENOTEMPTY;
@@ -3215,9 +3261,8 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
        if (!err)
                btrfs_i_size_write(inode, 0);
 out:
-       nr = trans->blocks_used;
        __unlink_end_trans(trans, root);
-       btrfs_btree_balance_dirty(root, nr);
+       btrfs_btree_balance_dirty(root);
 
        return err;
 }
@@ -3497,11 +3542,11 @@ int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
        if (ret)
                goto out;
 
-       ret = -ENOMEM;
 again:
        page = find_or_create_page(mapping, index, mask);
        if (!page) {
                btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+               ret = -ENOMEM;
                goto out;
        }
 
@@ -3550,7 +3595,6 @@ again:
                goto out_unlock;
        }
 
-       ret = 0;
        if (offset != PAGE_CACHE_SIZE) {
                if (!len)
                        len = PAGE_CACHE_SIZE - offset;
@@ -3668,6 +3712,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 
                        hole_em->block_start = EXTENT_MAP_HOLE;
                        hole_em->block_len = 0;
+                       hole_em->orig_block_len = 0;
                        hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
                        hole_em->compress_type = BTRFS_COMPRESS_NONE;
                        hole_em->generation = trans->transid;
@@ -3783,7 +3828,6 @@ void btrfs_evict_inode(struct inode *inode)
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_block_rsv *rsv, *global_rsv;
        u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
-       unsigned long nr;
        int ret;
 
        trace_btrfs_inode_evict(inode);
@@ -3829,7 +3873,8 @@ void btrfs_evict_inode(struct inode *inode)
         * inode item when doing the truncate.
         */
        while (1) {
-               ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size);
+               ret = btrfs_block_rsv_refill(root, rsv, min_size,
+                                            BTRFS_RESERVE_FLUSH_LIMIT);
 
                /*
                 * Try and steal from the global reserve since we will
@@ -3847,7 +3892,7 @@ void btrfs_evict_inode(struct inode *inode)
                        goto no_delete;
                }
 
-               trans = btrfs_start_transaction_noflush(root, 1);
+               trans = btrfs_start_transaction_lflush(root, 1);
                if (IS_ERR(trans)) {
                        btrfs_orphan_del(NULL, inode);
                        btrfs_free_block_rsv(root, rsv);
@@ -3864,10 +3909,9 @@ void btrfs_evict_inode(struct inode *inode)
                ret = btrfs_update_inode(trans, root, inode);
                BUG_ON(ret);
 
-               nr = trans->blocks_used;
                btrfs_end_transaction(trans, root);
                trans = NULL;
-               btrfs_btree_balance_dirty(root, nr);
+               btrfs_btree_balance_dirty(root);
        }
 
        btrfs_free_block_rsv(root, rsv);
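The eviction loop above moves from the removed _noflush helpers to the new explicit flush-mode API: btrfs_block_rsv_refill() takes a BTRFS_RESERVE_FLUSH_LIMIT argument, and btrfs_start_transaction_lflush() starts the matching limited-flush transaction. A sketch of the refill-or-steal step, assuming btrfs_block_rsv_migrate()'s (src, dst, bytes) signature from this kernel:

	ret = btrfs_block_rsv_refill(root, rsv, min_size,
				     BTRFS_RESERVE_FLUSH_LIMIT);
	if (ret)
		/* refill failed: steal the space from the global
		 * reserve, as the comment earlier in the hunk says */
		ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size);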
@@ -3883,9 +3927,8 @@ void btrfs_evict_inode(struct inode *inode)
              root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
                btrfs_return_ino(root, btrfs_ino(inode));
 
-       nr = trans->blocks_used;
        btrfs_end_transaction(trans, root);
-       btrfs_btree_balance_dirty(root, nr);
+       btrfs_btree_balance_dirty(root);
 no_delete:
        clear_inode(inode);
        return;
@@ -4775,8 +4818,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
        if (S_ISREG(mode)) {
                if (btrfs_test_opt(root, NODATASUM))
                        BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
-               if (btrfs_test_opt(root, NODATACOW) ||
-                   (BTRFS_I(dir)->flags & BTRFS_INODE_NODATACOW))
+               if (btrfs_test_opt(root, NODATACOW))
                        BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
        }
 
@@ -4842,7 +4884,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
        ret = btrfs_insert_dir_item(trans, root, name, name_len,
                                    parent_inode, &key,
                                    btrfs_inode_type(inode), index);
-       if (ret == -EEXIST)
+       if (ret == -EEXIST || ret == -EOVERFLOW)
                goto fail_dir_item;
        else if (ret) {
                btrfs_abort_transaction(trans, root, ret);
@@ -4897,7 +4939,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
        int err;
        int drop_inode = 0;
        u64 objectid;
-       unsigned long nr = 0;
        u64 index = 0;
 
        if (!new_valid_dev(rdev))
@@ -4930,6 +4971,12 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
                goto out_unlock;
        }
 
+       err = btrfs_update_inode(trans, root, inode);
+       if (err) {
+               drop_inode = 1;
+               goto out_unlock;
+       }
+
        /*
        * If the active LSM wants to access the inode during
        * d_instantiate it needs these. Smack checks to see
@@ -4947,9 +4994,8 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
                d_instantiate(dentry, inode);
        }
 out_unlock:
-       nr = trans->blocks_used;
        btrfs_end_transaction(trans, root);
-       btrfs_btree_balance_dirty(root, nr);
+       btrfs_btree_balance_dirty(root);
        if (drop_inode) {
                inode_dec_link_count(inode);
                iput(inode);
@@ -4963,9 +5009,8 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
        struct btrfs_trans_handle *trans;
        struct btrfs_root *root = BTRFS_I(dir)->root;
        struct inode *inode = NULL;
-       int drop_inode = 0;
+       int drop_inode_on_err = 0;
        int err;
-       unsigned long nr = 0;
        u64 objectid;
        u64 index = 0;
 
@@ -4989,12 +5034,15 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
                err = PTR_ERR(inode);
                goto out_unlock;
        }
+       drop_inode_on_err = 1;
 
        err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
-       if (err) {
-               drop_inode = 1;
+       if (err)
+               goto out_unlock;
+
+       err = btrfs_update_inode(trans, root, inode);
+       if (err)
                goto out_unlock;
-       }
 
        /*
        * If the active LSM wants to access the inode during
@@ -5007,21 +5055,20 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 
        err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
        if (err)
-               drop_inode = 1;
-       else {
-               inode->i_mapping->a_ops = &btrfs_aops;
-               inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
-               BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
-               d_instantiate(dentry, inode);
-       }
+               goto out_unlock;
+
+       inode->i_mapping->a_ops = &btrfs_aops;
+       inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
+       BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+       d_instantiate(dentry, inode);
+
 out_unlock:
-       nr = trans->blocks_used;
        btrfs_end_transaction(trans, root);
-       if (drop_inode) {
+       if (err && drop_inode_on_err) {
                inode_dec_link_count(inode);
                iput(inode);
        }
-       btrfs_btree_balance_dirty(root, nr);
+       btrfs_btree_balance_dirty(root);
        return err;
 }
 
@@ -5032,7 +5079,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
        struct btrfs_root *root = BTRFS_I(dir)->root;
        struct inode *inode = old_dentry->d_inode;
        u64 index;
-       unsigned long nr = 0;
        int err;
        int drop_inode = 0;
 
@@ -5062,6 +5108,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
        inode_inc_iversion(inode);
        inode->i_ctime = CURRENT_TIME;
        ihold(inode);
+       set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
 
        err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index);
 
@@ -5076,14 +5123,13 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
                btrfs_log_new_name(trans, inode, NULL, parent);
        }
 
-       nr = trans->blocks_used;
        btrfs_end_transaction(trans, root);
 fail:
        if (drop_inode) {
                inode_dec_link_count(inode);
                iput(inode);
        }
-       btrfs_btree_balance_dirty(root, nr);
+       btrfs_btree_balance_dirty(root);
        return err;
 }
 
@@ -5096,7 +5142,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
        int drop_on_err = 0;
        u64 objectid = 0;
        u64 index = 0;
-       unsigned long nr = 1;
 
        /*
         * 2 items for inode and ref
@@ -5142,11 +5187,10 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
        drop_on_err = 0;
 
 out_fail:
-       nr = trans->blocks_used;
        btrfs_end_transaction(trans, root);
        if (drop_on_err)
                iput(inode);
-       btrfs_btree_balance_dirty(root, nr);
+       btrfs_btree_balance_dirty(root);
        return err;
 }
 
@@ -5340,6 +5384,7 @@ again:
                if (start + len <= found_key.offset)
                        goto not_found;
                em->start = start;
+               em->orig_start = start;
                em->len = found_key.offset - start;
                goto not_found_em;
        }
@@ -5350,6 +5395,8 @@ again:
                em->len = extent_end - extent_start;
                em->orig_start = extent_start -
                                 btrfs_file_extent_offset(leaf, item);
+               em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf,
+                                                                     item);
                bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
                if (bytenr == 0) {
                        em->block_start = EXTENT_MAP_HOLE;
@@ -5359,8 +5406,7 @@ again:
                        set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
                        em->compress_type = compress_type;
                        em->block_start = bytenr;
-                       em->block_len = btrfs_file_extent_disk_num_bytes(leaf,
-                                                                        item);
+                       em->block_len = em->orig_block_len;
                } else {
                        bytenr += btrfs_file_extent_offset(leaf, item);
                        em->block_start = bytenr;
@@ -5390,7 +5436,8 @@ again:
                em->start = extent_start + extent_offset;
                em->len = (copy_size + root->sectorsize - 1) &
                        ~((u64)root->sectorsize - 1);
-               em->orig_start = EXTENT_MAP_INLINE;
+               em->orig_block_len = em->len;
+               em->orig_start = em->start;
                if (compress_type) {
                        set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
                        em->compress_type = compress_type;
@@ -5439,11 +5486,11 @@ again:
                                    extent_map_end(em) - 1, NULL, GFP_NOFS);
                goto insert;
        } else {
-               printk(KERN_ERR "btrfs unknown found_type %d\n", found_type);
-               WARN_ON(1);
+               WARN(1, KERN_ERR "btrfs unknown found_type %d\n", found_type);
        }
 not_found:
        em->start = start;
+       em->orig_start = start;
        em->len = len;
 not_found_em:
        em->block_start = EXTENT_MAP_HOLE;
@@ -5645,38 +5692,19 @@ out:
 }
 
 static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
-                                                 struct extent_map *em,
                                                  u64 start, u64 len)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_trans_handle *trans;
-       struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+       struct extent_map *em;
        struct btrfs_key ins;
        u64 alloc_hint;
        int ret;
-       bool insert = false;
-
-       /*
-        * Ok if the extent map we looked up is a hole and is for the exact
-        * range we want, there is no reason to allocate a new one, however if
-        * it is not right then we need to free this one and drop the cache for
-        * our range.
-        */
-       if (em->block_start != EXTENT_MAP_HOLE || em->start != start ||
-           em->len != len) {
-               free_extent_map(em);
-               em = NULL;
-               insert = true;
-               btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
-       }
 
        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans))
                return ERR_CAST(trans);
 
-       if (start <= BTRFS_I(inode)->disk_i_size && len < 64 * 1024)
-               btrfs_add_inode_defrag(trans, inode);
-
        trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 
        alloc_hint = get_extent_allocation_hint(inode, start, len);
@@ -5687,37 +5715,10 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
                goto out;
        }
 
-       if (!em) {
-               em = alloc_extent_map();
-               if (!em) {
-                       em = ERR_PTR(-ENOMEM);
-                       goto out;
-               }
-       }
-
-       em->start = start;
-       em->orig_start = em->start;
-       em->len = ins.offset;
-
-       em->block_start = ins.objectid;
-       em->block_len = ins.offset;
-       em->bdev = root->fs_info->fs_devices->latest_bdev;
-
-       /*
-        * We need to do this because if we're using the original em we searched
-        * for, we could have EXTENT_FLAG_VACANCY set, and we don't want that.
-        */
-       em->flags = 0;
-       set_bit(EXTENT_FLAG_PINNED, &em->flags);
-
-       while (insert) {
-               write_lock(&em_tree->lock);
-               ret = add_extent_mapping(em_tree, em);
-               write_unlock(&em_tree->lock);
-               if (ret != -EEXIST)
-                       break;
-               btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0);
-       }
+       em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
+                             ins.offset, ins.offset, 0);
+       if (IS_ERR(em))
+               goto out;
 
        ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
                                           ins.offset, ins.offset, 0);
@@ -5894,7 +5895,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
 static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
                                           u64 len, u64 orig_start,
                                           u64 block_start, u64 block_len,
-                                          int type)
+                                          u64 orig_block_len, int type)
 {
        struct extent_map_tree *em_tree;
        struct extent_map *em;
@@ -5912,15 +5913,20 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
        em->block_len = block_len;
        em->block_start = block_start;
        em->bdev = root->fs_info->fs_devices->latest_bdev;
+       em->orig_block_len = orig_block_len;
+       em->generation = -1;
        set_bit(EXTENT_FLAG_PINNED, &em->flags);
        if (type == BTRFS_ORDERED_PREALLOC)
-               set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+               set_bit(EXTENT_FLAG_FILLING, &em->flags);
 
        do {
                btrfs_drop_extent_cache(inode, em->start,
                                em->start + em->len - 1, 0);
                write_lock(&em_tree->lock);
                ret = add_extent_mapping(em_tree, em);
+               if (!ret)
+                       list_move(&em->list,
+                                 &em_tree->modified_extents);
                write_unlock(&em_tree->lock);
        } while (ret == -EEXIST);
 
@@ -6047,13 +6053,15 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
                        goto must_cow;
 
                if (can_nocow_odirect(trans, inode, start, len) == 1) {
-                       u64 orig_start = em->start;
+                       u64 orig_start = em->orig_start;
+                       u64 orig_block_len = em->orig_block_len;
 
                        if (type == BTRFS_ORDERED_PREALLOC) {
                                free_extent_map(em);
                                em = create_pinned_em(inode, start, len,
                                                       orig_start,
-                                                      block_start, len, type);
+                                                      block_start, len,
+                                                      orig_block_len, type);
                                if (IS_ERR(em)) {
                                        btrfs_end_transaction(trans, root);
                                        goto unlock_err;
@@ -6077,7 +6085,8 @@ must_cow:
         * it above
         */
        len = bh_result->b_size;
-       em = btrfs_new_extent_direct(inode, em, start, len);
+       free_extent_map(em);
+       em = btrfs_new_extent_direct(inode, start, len);
        if (IS_ERR(em)) {
                ret = PTR_ERR(em);
                goto unlock_err;
@@ -6318,6 +6327,9 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
        struct btrfs_root *root = BTRFS_I(inode)->root;
        int ret;
 
+       if (async_submit)
+               async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers);
+
        bio_get(bio);
 
        if (!write) {
@@ -6362,7 +6374,6 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
 {
        struct inode *inode = dip->inode;
        struct btrfs_root *root = BTRFS_I(inode)->root;
-       struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
        struct bio *bio;
        struct bio *orig_bio = dip->orig_bio;
        struct bio_vec *bvec = orig_bio->bi_io_vec;
@@ -6375,7 +6386,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
        int async_submit = 0;
 
        map_length = orig_bio->bi_size;
-       ret = btrfs_map_block(map_tree, READ, start_sector << 9,
+       ret = btrfs_map_block(root->fs_info, READ, start_sector << 9,
                              &map_length, NULL, 0);
        if (ret) {
                bio_put(orig_bio);
@@ -6429,7 +6440,8 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
                        bio->bi_end_io = btrfs_end_dio_bio;
 
                        map_length = orig_bio->bi_size;
-                       ret = btrfs_map_block(map_tree, READ, start_sector << 9,
+                       ret = btrfs_map_block(root->fs_info, READ,
+                                             start_sector << 9,
                                              &map_length, NULL, 0);
                        if (ret) {
                                bio_put(bio);
@@ -6582,9 +6594,17 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
                   btrfs_submit_direct, 0);
 }
 
+#define BTRFS_FIEMAP_FLAGS     (FIEMAP_FLAG_SYNC)
+
 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                __u64 start, __u64 len)
 {
+       int     ret;
+
+       ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS);
+       if (ret)
+               return ret;
+
        return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap);
 }
 
@@ -6855,7 +6875,6 @@ static int btrfs_truncate(struct inode *inode)
        int ret;
        int err = 0;
        struct btrfs_trans_handle *trans;
-       unsigned long nr;
        u64 mask = root->sectorsize - 1;
        u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
 
@@ -6978,9 +6997,8 @@ static int btrfs_truncate(struct inode *inode)
                        break;
                }
 
-               nr = trans->blocks_used;
                btrfs_end_transaction(trans, root);
-               btrfs_btree_balance_dirty(root, nr);
+               btrfs_btree_balance_dirty(root);
 
                trans = btrfs_start_transaction(root, 2);
                if (IS_ERR(trans)) {
@@ -7014,9 +7032,8 @@ static int btrfs_truncate(struct inode *inode)
                if (ret && !err)
                        err = ret;
 
-               nr = trans->blocks_used;
                ret = btrfs_end_transaction(trans, root);
-               btrfs_btree_balance_dirty(root, nr);
+               btrfs_btree_balance_dirty(root);
        }
 
 out:
@@ -7093,6 +7110,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
        extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);
        ei->io_tree.track_uptodate = 1;
        ei->io_failure_tree.track_uptodate = 1;
+       atomic_set(&ei->sync_writers, 0);
        mutex_init(&ei->log_mutex);
        mutex_init(&ei->delalloc_mutex);
        btrfs_ordered_inode_tree_init(&ei->ordered_tree);
@@ -7203,6 +7221,8 @@ void btrfs_destroy_cachep(void)
                kmem_cache_destroy(btrfs_path_cachep);
        if (btrfs_free_space_cachep)
                kmem_cache_destroy(btrfs_free_space_cachep);
+       if (btrfs_delalloc_work_cachep)
+               kmem_cache_destroy(btrfs_delalloc_work_cachep);
 }
 
 int btrfs_init_cachep(void)
@@ -7237,6 +7257,13 @@ int btrfs_init_cachep(void)
        if (!btrfs_free_space_cachep)
                goto fail;
 
+       btrfs_delalloc_work_cachep = kmem_cache_create("btrfs_delalloc_work",
+                       sizeof(struct btrfs_delalloc_work), 0,
+                       SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+                       NULL);
+       if (!btrfs_delalloc_work_cachep)
+               goto fail;
+
        return 0;
 fail:
        btrfs_destroy_cachep();
@@ -7308,6 +7335,28 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        if (S_ISDIR(old_inode->i_mode) && new_inode &&
            new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
                return -ENOTEMPTY;
+
+       /* check for collisions, even if the name isn't there */
+       ret = btrfs_check_dir_item_collision(root, new_dir->i_ino,
+                            new_dentry->d_name.name,
+                            new_dentry->d_name.len);
+
+       if (ret) {
+               if (ret == -EEXIST) {
+                       /* we shouldn't get -EEXIST without a new_inode */
+                       if (!new_inode) {
+                               WARN_ON(1);
+                               return ret;
+                       }
+               } else {
+                       /* maybe -EOVERFLOW */
+                       return ret;
+               }
+       }
+       ret = 0;
+
        /*
         * we're using rename to replace one file with another.
         * and the replacement file is large.  Start IO on it now so
@@ -7447,6 +7496,49 @@ out_notrans:
        return ret;
 }
 
+static void btrfs_run_delalloc_work(struct btrfs_work *work)
+{
+       struct btrfs_delalloc_work *delalloc_work;
+
+       delalloc_work = container_of(work, struct btrfs_delalloc_work,
+                                    work);
+       if (delalloc_work->wait)
+               btrfs_wait_ordered_range(delalloc_work->inode, 0, (u64)-1);
+       else
+               filemap_flush(delalloc_work->inode->i_mapping);
+
+       if (delalloc_work->delay_iput)
+               btrfs_add_delayed_iput(delalloc_work->inode);
+       else
+               iput(delalloc_work->inode);
+       complete(&delalloc_work->completion);
+}
+
+struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
+                                                   int wait, int delay_iput)
+{
+       struct btrfs_delalloc_work *work;
+
+       work = kmem_cache_zalloc(btrfs_delalloc_work_cachep, GFP_NOFS);
+       if (!work)
+               return NULL;
+
+       init_completion(&work->completion);
+       INIT_LIST_HEAD(&work->list);
+       work->inode = inode;
+       work->wait = wait;
+       work->delay_iput = delay_iput;
+       work->work.func = btrfs_run_delalloc_work;
+
+       return work;
+}
+
+void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
+{
+       wait_for_completion(&work->completion);
+       kmem_cache_free(btrfs_delalloc_work_cachep, work);
+}
+
 /*
  * some fairly slow code that needs optimization. This walks the list
  * of all the inodes with pending delalloc and forces them to disk.
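btrfs_run_delalloc_work() is the worker side of the two helpers above: the flush (or ordered-range wait) plus the deferred iput now run on the flush_workers queue instead of inline. The intended lifecycle, condensed from the loop in btrfs_start_delalloc_inodes() below (error unwinding elided):

	struct btrfs_delalloc_work *work, *next;
	LIST_HEAD(works);

	/* producer: one work item per delalloc inode */
	work = btrfs_alloc_delalloc_work(inode, 0 /* wait */, delay_iput);
	if (work) {
		list_add_tail(&work->list, &works);
		btrfs_queue_worker(&root->fs_info->flush_workers,
				   &work->work);
	}

	/* consumer: block until every queued flush has completed */
	list_for_each_entry_safe(work, next, &works, list) {
		list_del_init(&work->list);
		btrfs_wait_and_free_delalloc_work(work);
	}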
@@ -7456,10 +7548,15 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
        struct list_head *head = &root->fs_info->delalloc_inodes;
        struct btrfs_inode *binode;
        struct inode *inode;
+       struct btrfs_delalloc_work *work, *next;
+       struct list_head works;
+       int ret = 0;
 
        if (root->fs_info->sb->s_flags & MS_RDONLY)
                return -EROFS;
 
+       INIT_LIST_HEAD(&works);
+
        spin_lock(&root->fs_info->delalloc_lock);
        while (!list_empty(head)) {
                binode = list_entry(head->next, struct btrfs_inode,
@@ -7469,11 +7566,14 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
                        list_del_init(&binode->delalloc_inodes);
                spin_unlock(&root->fs_info->delalloc_lock);
                if (inode) {
-                       filemap_flush(inode->i_mapping);
-                       if (delay_iput)
-                               btrfs_add_delayed_iput(inode);
-                       else
-                               iput(inode);
+                       work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
+                       if (!work) {
+                               ret = -ENOMEM;
+                               goto out;
+                       }
+                       list_add_tail(&work->list, &works);
+                       btrfs_queue_worker(&root->fs_info->flush_workers,
+                                          &work->work);
                }
                cond_resched();
                spin_lock(&root->fs_info->delalloc_lock);
@@ -7492,7 +7592,12 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
                    atomic_read(&root->fs_info->async_delalloc_pages) == 0));
        }
        atomic_dec(&root->fs_info->async_submit_draining);
-       return 0;
+out:
+       list_for_each_entry_safe(work, next, &works, list) {
+               list_del_init(&work->list);
+               btrfs_wait_and_free_delalloc_work(work);
+       }
+       return ret;
 }
 
 static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
@@ -7512,7 +7617,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
        unsigned long ptr;
        struct btrfs_file_extent_item *ei;
        struct extent_buffer *leaf;
-       unsigned long nr = 0;
 
        name_len = strlen(symname) + 1;
        if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
@@ -7610,13 +7714,12 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 out_unlock:
        if (!err)
                d_instantiate(dentry, inode);
-       nr = trans->blocks_used;
        btrfs_end_transaction(trans, root);
        if (drop_inode) {
                inode_dec_link_count(inode);
                iput(inode);
        }
-       btrfs_btree_balance_dirty(root, nr);
+       btrfs_btree_balance_dirty(root);
        return err;
 }
 
@@ -7679,6 +7782,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
                em->len = ins.offset;
                em->block_start = ins.objectid;
                em->block_len = ins.offset;
+               em->orig_block_len = ins.offset;
                em->bdev = root->fs_info->fs_devices->latest_bdev;
                set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
                em->generation = trans->transid;
index 5b3429ab8ec1d79f1157b51fa33910b3ebaa57d7..4b4516770f055432964da1e82fe591e1e2128ef8 100644 (file)
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -55,6 +55,7 @@
 #include "backref.h"
 #include "rcu-string.h"
 #include "send.h"
+#include "dev-replace.h"
 
 /* Mask out flags that are inappropriate for the given type of inode. */
 static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
@@ -140,8 +141,11 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
                BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
        }
 
-       if (flags & BTRFS_INODE_NODATACOW)
+       if (flags & BTRFS_INODE_NODATACOW) {
                BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
+               if (S_ISREG(inode->i_mode))
+                       BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
+       }
 
        btrfs_update_iflags(inode);
 }
@@ -571,8 +575,12 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
                ret = btrfs_commit_transaction(trans,
                                               root->fs_info->extent_root);
        }
-       if (ret)
+       if (ret) {
+               /* cleanup_transaction has freed this for us */
+               if (trans->aborted)
+                       pending_snapshot = NULL;
                goto fail;
+       }
 
        ret = pending_snapshot->error;
        if (ret)
@@ -705,6 +713,16 @@ static noinline int btrfs_mksubvol(struct path *parent,
        if (error)
                goto out_dput;
 
+       /*
+        * even if this name doesn't exist, we may get hash collisions.
+        * check for them now when we can safely fail
+        */
+       error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root,
+                                              dir->i_ino, name,
+                                              namelen);
+       if (error)
+               goto out_dput;
+
        down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
 
        if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0)
@@ -1293,12 +1311,13 @@ out_ra:
        return ret;
 }
 
-static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
+static noinline int btrfs_ioctl_resize(struct file *file,
                                        void __user *arg)
 {
        u64 new_size;
        u64 old_size;
        u64 devid = 1;
+       struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
        struct btrfs_ioctl_vol_args *vol_args;
        struct btrfs_trans_handle *trans;
        struct btrfs_device *device = NULL;
@@ -1313,13 +1332,17 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
 
-       mutex_lock(&root->fs_info->volume_mutex);
-       if (root->fs_info->balance_ctl) {
-               printk(KERN_INFO "btrfs: balance in progress\n");
-               ret = -EINVAL;
-               goto out;
+       ret = mnt_want_write_file(file);
+       if (ret)
+               return ret;
+
+       if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
+                       1)) {
+               pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
+               return -EINPROGRESS;
        }
 
+       mutex_lock(&root->fs_info->volume_mutex);
        vol_args = memdup_user(arg, sizeof(*vol_args));
        if (IS_ERR(vol_args)) {
                ret = PTR_ERR(vol_args);
@@ -1339,7 +1362,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
                printk(KERN_INFO "btrfs: resizing devid %llu\n",
                       (unsigned long long)devid);
        }
-       device = btrfs_find_device(root, devid, NULL, NULL);
+       device = btrfs_find_device(root->fs_info, devid, NULL, NULL);
        if (!device) {
                printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
                       (unsigned long long)devid);
@@ -1371,6 +1394,11 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
                }
        }
 
+       if (device->is_tgtdev_for_dev_replace) {
+               ret = -EINVAL;
+               goto out_free;
+       }
+
        old_size = device->total_bytes;
 
        if (mod < 0) {
@@ -1409,12 +1437,14 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
                btrfs_commit_transaction(trans, root);
        } else if (new_size < old_size) {
                ret = btrfs_shrink_device(device, new_size);
-       }
+       } /* equal, nothing needs to be done */
 
 out_free:
        kfree(vol_args);
 out:
        mutex_unlock(&root->fs_info->volume_mutex);
+       mnt_drop_write_file(file);
+       atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
        return ret;
 }
 
@@ -2156,9 +2186,17 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
        if (btrfs_root_readonly(root))
                return -EROFS;
 
+       if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
+                       1)) {
+               pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
+               return -EINPROGRESS;
+       }
        ret = mnt_want_write_file(file);
-       if (ret)
+       if (ret) {
+               atomic_set(&root->fs_info->mutually_exclusive_operation_running,
+                          0);
                return ret;
+       }
 
        switch (inode->i_mode & S_IFMT) {
        case S_IFDIR:
@@ -2210,6 +2248,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
        }
 out:
        mnt_drop_write_file(file);
+       atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
        return ret;
 }
 
@@ -2221,13 +2260,13 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
 
-       mutex_lock(&root->fs_info->volume_mutex);
-       if (root->fs_info->balance_ctl) {
-               printk(KERN_INFO "btrfs: balance in progress\n");
-               ret = -EINVAL;
-               goto out;
+       if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
+                       1)) {
+               pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
+               return -EINPROGRESS;
        }
 
+       mutex_lock(&root->fs_info->volume_mutex);
        vol_args = memdup_user(arg, sizeof(*vol_args));
        if (IS_ERR(vol_args)) {
                ret = PTR_ERR(vol_args);
@@ -2240,27 +2279,31 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
        kfree(vol_args);
 out:
        mutex_unlock(&root->fs_info->volume_mutex);
+       atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
        return ret;
 }
 
-static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
+static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
 {
+       struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
        struct btrfs_ioctl_vol_args *vol_args;
        int ret;
 
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
 
-       if (root->fs_info->sb->s_flags & MS_RDONLY)
-               return -EROFS;
+       ret = mnt_want_write_file(file);
+       if (ret)
+               return ret;
 
-       mutex_lock(&root->fs_info->volume_mutex);
-       if (root->fs_info->balance_ctl) {
-               printk(KERN_INFO "btrfs: balance in progress\n");
-               ret = -EINVAL;
-               goto out;
+       if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
+                       1)) {
+               pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
+               mnt_drop_write_file(file);
+               return -EINPROGRESS;
        }
 
+       mutex_lock(&root->fs_info->volume_mutex);
        vol_args = memdup_user(arg, sizeof(*vol_args));
        if (IS_ERR(vol_args)) {
                ret = PTR_ERR(vol_args);
@@ -2273,6 +2316,8 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
        kfree(vol_args);
 out:
        mutex_unlock(&root->fs_info->volume_mutex);
+       mnt_drop_write_file(file);
+       atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
        return ret;
 }
 
@@ -2328,7 +2373,7 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)
                s_uuid = di_args->uuid;
 
        mutex_lock(&fs_devices->device_list_mutex);
-       dev = btrfs_find_device(root, di_args->devid, s_uuid, NULL);
+       dev = btrfs_find_device(root->fs_info, di_args->devid, s_uuid, NULL);
        mutex_unlock(&fs_devices->device_list_mutex);
 
        if (!dev) {
@@ -2821,12 +2866,19 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
        struct btrfs_disk_key disk_key;
        u64 objectid = 0;
        u64 dir_id;
+       int ret;
 
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
 
-       if (copy_from_user(&objectid, argp, sizeof(objectid)))
-               return -EFAULT;
+       ret = mnt_want_write_file(file);
+       if (ret)
+               return ret;
+
+       if (copy_from_user(&objectid, argp, sizeof(objectid))) {
+               ret = -EFAULT;
+               goto out;
+       }
 
        if (!objectid)
                objectid = root->root_key.objectid;
@@ -2836,21 +2888,28 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
        location.offset = (u64)-1;
 
        new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
-       if (IS_ERR(new_root))
-               return PTR_ERR(new_root);
+       if (IS_ERR(new_root)) {
+               ret = PTR_ERR(new_root);
+               goto out;
+       }
 
-       if (btrfs_root_refs(&new_root->root_item) == 0)
-               return -ENOENT;
+       if (btrfs_root_refs(&new_root->root_item) == 0) {
+               ret = -ENOENT;
+               goto out;
+       }
 
        path = btrfs_alloc_path();
-       if (!path)
-               return -ENOMEM;
+       if (!path) {
+               ret = -ENOMEM;
+               goto out;
+       }
        path->leave_spinning = 1;
 
        trans = btrfs_start_transaction(root, 1);
        if (IS_ERR(trans)) {
                btrfs_free_path(path);
-               return PTR_ERR(trans);
+               ret = PTR_ERR(trans);
+               goto out;
        }
 
        dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
@@ -2861,7 +2920,8 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
                btrfs_end_transaction(trans, root);
                printk(KERN_ERR "Umm, you don't have the default dir item, "
                       "this isn't going to work\n");
-               return -ENOENT;
+               ret = -ENOENT;
+               goto out;
        }
 
        btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key);
@@ -2871,8 +2931,9 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
 
        btrfs_set_fs_incompat(root->fs_info, DEFAULT_SUBVOL);
        btrfs_end_transaction(trans, root);
-
-       return 0;
+out:
+       mnt_drop_write_file(file);
+       return ret;
 }
 
 void btrfs_get_block_group_info(struct list_head *groups_list,
@@ -3036,32 +3097,38 @@ long btrfs_ioctl_trans_end(struct file *file)
        return 0;
 }
 
-static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp)
+static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
+                                           void __user *argp)
 {
-       struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
        struct btrfs_trans_handle *trans;
        u64 transid;
        int ret;
 
-       trans = btrfs_start_transaction(root, 0);
-       if (IS_ERR(trans))
-               return PTR_ERR(trans);
+       trans = btrfs_attach_transaction(root);
+       if (IS_ERR(trans)) {
+               if (PTR_ERR(trans) != -ENOENT)
+                       return PTR_ERR(trans);
+
+               /* No running transaction, don't bother */
+               transid = root->fs_info->last_trans_committed;
+               goto out;
+       }
        transid = trans->transid;
        ret = btrfs_commit_transaction_async(trans, root, 0);
        if (ret) {
                btrfs_end_transaction(trans, root);
                return ret;
        }
-
+out:
        if (argp)
                if (copy_to_user(argp, &transid, sizeof(transid)))
                        return -EFAULT;
        return 0;
 }
 
-static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp)
+static noinline long btrfs_ioctl_wait_sync(struct btrfs_root *root,
+                                          void __user *argp)
 {
-       struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
        u64 transid;
 
        if (argp) {
@@ -3073,10 +3140,11 @@ static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp)
        return btrfs_wait_for_commit(root, transid);
 }
 
-static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg)
+static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
 {
-       int ret;
+       struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
        struct btrfs_ioctl_scrub_args *sa;
+       int ret;
 
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
@@ -3085,12 +3153,22 @@ static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg)
        if (IS_ERR(sa))
                return PTR_ERR(sa);
 
-       ret = btrfs_scrub_dev(root, sa->devid, sa->start, sa->end,
-                             &sa->progress, sa->flags & BTRFS_SCRUB_READONLY);
+       if (!(sa->flags & BTRFS_SCRUB_READONLY)) {
+               ret = mnt_want_write_file(file);
+               if (ret)
+                       goto out;
+       }
+
+       ret = btrfs_scrub_dev(root->fs_info, sa->devid, sa->start, sa->end,
+                             &sa->progress, sa->flags & BTRFS_SCRUB_READONLY,
+                             0);
 
        if (copy_to_user(arg, sa, sizeof(*sa)))
                ret = -EFAULT;
 
+       if (!(sa->flags & BTRFS_SCRUB_READONLY))
+               mnt_drop_write_file(file);
+out:
        kfree(sa);
        return ret;
 }
@@ -3100,7 +3178,7 @@ static long btrfs_ioctl_scrub_cancel(struct btrfs_root *root, void __user *arg)
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
 
-       return btrfs_scrub_cancel(root);
+       return btrfs_scrub_cancel(root->fs_info);
 }
 
 static long btrfs_ioctl_scrub_progress(struct btrfs_root *root,
@@ -3149,6 +3227,51 @@ static long btrfs_ioctl_get_dev_stats(struct btrfs_root *root,
        return ret;
 }
 
+static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg)
+{
+       struct btrfs_ioctl_dev_replace_args *p;
+       int ret;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       p = memdup_user(arg, sizeof(*p));
+       if (IS_ERR(p))
+               return PTR_ERR(p);
+
+       switch (p->cmd) {
+       case BTRFS_IOCTL_DEV_REPLACE_CMD_START:
+               if (atomic_xchg(
+                       &root->fs_info->mutually_exclusive_operation_running,
+                       1)) {
+                       pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
+                       ret = -EINPROGRESS;
+               } else {
+                       ret = btrfs_dev_replace_start(root, p);
+                       atomic_set(
+                        &root->fs_info->mutually_exclusive_operation_running,
+                        0);
+               }
+               break;
+       case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS:
+               btrfs_dev_replace_status(root->fs_info, p);
+               ret = 0;
+               break;
+       case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL:
+               ret = btrfs_dev_replace_cancel(root->fs_info, p);
+               break;
+       default:
+               ret = -EINVAL;
+               break;
+       }
+
+       if (copy_to_user(arg, p, sizeof(*p)))
+               ret = -EFAULT;
+
+       kfree(p);
+       return ret;
+}
+
 static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
 {
        int ret = 0;
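
The atomic_xchg() in the START case above carries the whole mutual-exclusion scheme for dev add/delete/balance/replace/resize: the caller that swaps mutually_exclusive_operation_running from 0 to 1 owns the slot, and every concurrent caller sees 1 and backs off. A rough userspace analogue using C11 atomics (names are illustrative, not the kernel API):

#include <stdatomic.h>
#include <stdio.h>

/* plays the role of fs_info->mutually_exclusive_operation_running */
static atomic_int op_running;

static int start_exclusive_op(void)
{
        /* atomic_exchange() returns the old value: only one caller sees 0 */
        if (atomic_exchange(&op_running, 1)) {
                fprintf(stderr, "operation already in progress\n");
                return -1;      /* the ioctl returns -EINPROGRESS here */
        }
        /* ... dev add/delete/balance/replace/resize work ... */
        atomic_store(&op_running, 0);
        return 0;
}

int main(void)
{
        return start_exclusive_op() ? 1 : 0;
}
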
@@ -3315,6 +3438,7 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
        struct btrfs_ioctl_balance_args *bargs;
        struct btrfs_balance_control *bctl;
        int ret;
+       int need_to_clear_lock = 0;
 
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
@@ -3350,10 +3474,13 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
                bargs = NULL;
        }
 
-       if (fs_info->balance_ctl) {
+       if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
+                       1)) {
+               pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
                ret = -EINPROGRESS;
                goto out_bargs;
        }
+       need_to_clear_lock = 1;
 
        bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
        if (!bctl) {
@@ -3387,6 +3514,9 @@ do_balance:
 out_bargs:
        kfree(bargs);
 out:
+       if (need_to_clear_lock)
+               atomic_set(&root->fs_info->mutually_exclusive_operation_running,
+                          0);
        mutex_unlock(&fs_info->balance_mutex);
        mutex_unlock(&fs_info->volume_mutex);
        mnt_drop_write_file(file);
@@ -3441,8 +3571,9 @@ out:
        return ret;
 }
 
-static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg)
+static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
 {
+       struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
        struct btrfs_ioctl_quota_ctl_args *sa;
        struct btrfs_trans_handle *trans = NULL;
        int ret;
@@ -3451,12 +3582,15 @@ static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg)
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
 
-       if (root->fs_info->sb->s_flags & MS_RDONLY)
-               return -EROFS;
+       ret = mnt_want_write_file(file);
+       if (ret)
+               return ret;
 
        sa = memdup_user(arg, sizeof(*sa));
-       if (IS_ERR(sa))
-               return PTR_ERR(sa);
+       if (IS_ERR(sa)) {
+               ret = PTR_ERR(sa);
+               goto drop_write;
+       }
 
        if (sa->cmd != BTRFS_QUOTA_CTL_RESCAN) {
                trans = btrfs_start_transaction(root, 2);
@@ -3489,14 +3623,16 @@ static long btrfs_ioctl_quota_ctl(struct btrfs_root *root, void __user *arg)
                if (err && !ret)
                        ret = err;
        }
-
 out:
        kfree(sa);
+drop_write:
+       mnt_drop_write_file(file);
        return ret;
 }
 
-static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg)
+static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
 {
+       struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
        struct btrfs_ioctl_qgroup_assign_args *sa;
        struct btrfs_trans_handle *trans;
        int ret;
@@ -3505,12 +3641,15 @@ static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg)
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
 
-       if (root->fs_info->sb->s_flags & MS_RDONLY)
-               return -EROFS;
+       ret = mnt_want_write_file(file);
+       if (ret)
+               return ret;
 
        sa = memdup_user(arg, sizeof(*sa));
-       if (IS_ERR(sa))
-               return PTR_ERR(sa);
+       if (IS_ERR(sa)) {
+               ret = PTR_ERR(sa);
+               goto drop_write;
+       }
 
        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans)) {
@@ -3533,11 +3672,14 @@ static long btrfs_ioctl_qgroup_assign(struct btrfs_root *root, void __user *arg)
 
 out:
        kfree(sa);
+drop_write:
+       mnt_drop_write_file(file);
        return ret;
 }
 
-static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg)
+static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
 {
+       struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
        struct btrfs_ioctl_qgroup_create_args *sa;
        struct btrfs_trans_handle *trans;
        int ret;
@@ -3546,12 +3688,15 @@ static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg)
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
 
-       if (root->fs_info->sb->s_flags & MS_RDONLY)
-               return -EROFS;
+       ret = mnt_want_write_file(file);
+       if (ret)
+               return ret;
 
        sa = memdup_user(arg, sizeof(*sa));
-       if (IS_ERR(sa))
-               return PTR_ERR(sa);
+       if (IS_ERR(sa)) {
+               ret = PTR_ERR(sa);
+               goto drop_write;
+       }
 
        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans)) {
@@ -3573,11 +3718,14 @@ static long btrfs_ioctl_qgroup_create(struct btrfs_root *root, void __user *arg)
 
 out:
        kfree(sa);
+drop_write:
+       mnt_drop_write_file(file);
        return ret;
 }
 
-static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg)
+static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg)
 {
+       struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
        struct btrfs_ioctl_qgroup_limit_args *sa;
        struct btrfs_trans_handle *trans;
        int ret;
@@ -3587,12 +3735,15 @@ static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg)
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
 
-       if (root->fs_info->sb->s_flags & MS_RDONLY)
-               return -EROFS;
+       ret = mnt_want_write_file(file);
+       if (ret)
+               return ret;
 
        sa = memdup_user(arg, sizeof(*sa));
-       if (IS_ERR(sa))
-               return PTR_ERR(sa);
+       if (IS_ERR(sa)) {
+               ret = PTR_ERR(sa);
+               goto drop_write;
+       }
 
        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans)) {
@@ -3615,6 +3766,8 @@ static long btrfs_ioctl_qgroup_limit(struct btrfs_root *root, void __user *arg)
 
 out:
        kfree(sa);
+drop_write:
+       mnt_drop_write_file(file);
        return ret;
 }
 
@@ -3735,11 +3888,11 @@ long btrfs_ioctl(struct file *file, unsigned int
        case BTRFS_IOC_DEFRAG_RANGE:
                return btrfs_ioctl_defrag(file, argp);
        case BTRFS_IOC_RESIZE:
-               return btrfs_ioctl_resize(root, argp);
+               return btrfs_ioctl_resize(file, argp);
        case BTRFS_IOC_ADD_DEV:
                return btrfs_ioctl_add_dev(root, argp);
        case BTRFS_IOC_RM_DEV:
-               return btrfs_ioctl_rm_dev(root, argp);
+               return btrfs_ioctl_rm_dev(file, argp);
        case BTRFS_IOC_FS_INFO:
                return btrfs_ioctl_fs_info(root, argp);
        case BTRFS_IOC_DEV_INFO:
@@ -3768,11 +3921,11 @@ long btrfs_ioctl(struct file *file, unsigned int
                btrfs_sync_fs(file->f_dentry->d_sb, 1);
                return 0;
        case BTRFS_IOC_START_SYNC:
-               return btrfs_ioctl_start_sync(file, argp);
+               return btrfs_ioctl_start_sync(root, argp);
        case BTRFS_IOC_WAIT_SYNC:
-               return btrfs_ioctl_wait_sync(file, argp);
+               return btrfs_ioctl_wait_sync(root, argp);
        case BTRFS_IOC_SCRUB:
-               return btrfs_ioctl_scrub(root, argp);
+               return btrfs_ioctl_scrub(file, argp);
        case BTRFS_IOC_SCRUB_CANCEL:
                return btrfs_ioctl_scrub_cancel(root, argp);
        case BTRFS_IOC_SCRUB_PROGRESS:
@@ -3790,13 +3943,15 @@ long btrfs_ioctl(struct file *file, unsigned int
        case BTRFS_IOC_GET_DEV_STATS:
                return btrfs_ioctl_get_dev_stats(root, argp);
        case BTRFS_IOC_QUOTA_CTL:
-               return btrfs_ioctl_quota_ctl(root, argp);
+               return btrfs_ioctl_quota_ctl(file, argp);
        case BTRFS_IOC_QGROUP_ASSIGN:
-               return btrfs_ioctl_qgroup_assign(root, argp);
+               return btrfs_ioctl_qgroup_assign(file, argp);
        case BTRFS_IOC_QGROUP_CREATE:
-               return btrfs_ioctl_qgroup_create(root, argp);
+               return btrfs_ioctl_qgroup_create(file, argp);
        case BTRFS_IOC_QGROUP_LIMIT:
-               return btrfs_ioctl_qgroup_limit(root, argp);
+               return btrfs_ioctl_qgroup_limit(file, argp);
+       case BTRFS_IOC_DEV_REPLACE:
+               return btrfs_ioctl_dev_replace(root, argp);
        }
 
        return -ENOTTY;
index 731e2875ab93900b0f623797642f371b2f042e2f..dabca9cc8c2ebe72a7c440d2147e985bcc6b3c2c 100644 (file)
@@ -30,6 +30,8 @@ struct btrfs_ioctl_vol_args {
        char name[BTRFS_PATH_NAME_MAX + 1];
 };
 
+#define BTRFS_DEVICE_PATH_NAME_MAX 1024
+
 #define BTRFS_SUBVOL_CREATE_ASYNC      (1ULL << 0)
 #define BTRFS_SUBVOL_RDONLY            (1ULL << 1)
 #define BTRFS_SUBVOL_QGROUP_INHERIT    (1ULL << 2)
@@ -123,7 +125,48 @@ struct btrfs_ioctl_scrub_args {
        __u64 unused[(1024-32-sizeof(struct btrfs_scrub_progress))/8];
 };
 
-#define BTRFS_DEVICE_PATH_NAME_MAX 1024
+#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS   0
+#define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID    1
+struct btrfs_ioctl_dev_replace_start_params {
+       __u64 srcdevid; /* in, if 0, use srcdev_name instead */
+       __u64 cont_reading_from_srcdev_mode;    /* in, see #define
+                                                * above */
+       __u8 srcdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1];       /* in */
+       __u8 tgtdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1];       /* in */
+};
+
+#define BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED    0
+#define BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED          1
+#define BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED         2
+#define BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED         3
+#define BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED                4
+struct btrfs_ioctl_dev_replace_status_params {
+       __u64 replace_state;    /* out, see #define above */
+       __u64 progress_1000;    /* out, 0 <= x <= 1000 */
+       __u64 time_started;     /* out, seconds since 1-Jan-1970 */
+       __u64 time_stopped;     /* out, seconds since 1-Jan-1970 */
+       __u64 num_write_errors; /* out */
+       __u64 num_uncorrectable_read_errors;    /* out */
+};
+
+#define BTRFS_IOCTL_DEV_REPLACE_CMD_START                      0
+#define BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS                     1
+#define BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL                     2
+#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR                        0
+#define BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED             1
+#define BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED         2
+struct btrfs_ioctl_dev_replace_args {
+       __u64 cmd;      /* in */
+       __u64 result;   /* out */
+
+       union {
+               struct btrfs_ioctl_dev_replace_start_params start;
+               struct btrfs_ioctl_dev_replace_status_params status;
+       };      /* in/out */
+
+       __u64 spare[64];
+};
+
 struct btrfs_ioctl_dev_info_args {
        __u64 devid;                            /* in/out */
        __u8 uuid[BTRFS_UUID_SIZE];             /* in/out */
@@ -453,4 +496,7 @@ struct btrfs_ioctl_send_args {
                               struct btrfs_ioctl_qgroup_limit_args)
 #define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \
                                      struct btrfs_ioctl_get_dev_stats)
+#define BTRFS_IOC_DEV_REPLACE _IOWR(BTRFS_IOCTL_MAGIC, 53, \
+                                   struct btrfs_ioctl_dev_replace_args)
+
 #endif
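
A hedged sketch of how userspace might drive the new BTRFS_IOC_DEV_REPLACE ioctl defined above, here querying status. It assumes the definitions are available via a header such as btrfs-progs' <btrfs/ioctl.h> and that the filesystem is mounted at /mnt; note that progress_1000 is per-mille, so dividing by 10 yields a percentage:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <btrfs/ioctl.h>        /* assumed to carry the definitions above */

int main(void)
{
        struct btrfs_ioctl_dev_replace_args args;
        int fd = open("/mnt", O_RDONLY);  /* any fd on the mounted fs */

        if (fd < 0)
                return 1;
        memset(&args, 0, sizeof(args));
        args.cmd = BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS;
        if (ioctl(fd, BTRFS_IOC_DEV_REPLACE, &args) == 0)
                printf("state %llu, progress %llu.%llu%%\n",
                       (unsigned long long)args.status.replace_state,
                       (unsigned long long)(args.status.progress_1000 / 10),
                       (unsigned long long)(args.status.progress_1000 % 10));
        close(fd);
        return 0;
}
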
diff --git a/fs/btrfs/math.h b/fs/btrfs/math.h
new file mode 100644 (file)
index 0000000..b7816ce
--- /dev/null
@@ -0,0 +1,44 @@
+
+/*
+ * Copyright (C) 2012 Fujitsu.  All rights reserved.
+ * Written by Miao Xie <miaox@cn.fujitsu.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef __BTRFS_MATH_H
+#define __BTRFS_MATH_H
+
+#include <asm/div64.h>
+
+static inline u64 div_factor(u64 num, int factor)
+{
+       if (factor == 10)
+               return num;
+       num *= factor;
+       do_div(num, 10);
+       return num;
+}
+
+static inline u64 div_factor_fine(u64 num, int factor)
+{
+       if (factor == 100)
+               return num;
+       num *= factor;
+       do_div(num, 100);
+       return num;
+}
+
+#endif
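
The two helpers above return a factor/10 (respectively factor/100) fraction of num; do_div() is the kernel's 64-bit-by-32-bit division primitive, needed because plain 64-bit division is unavailable on some 32-bit architectures. A userspace spot check of the same arithmetic, with plain division standing in for do_div():

#include <assert.h>
#include <stdint.h>

static uint64_t div_factor(uint64_t num, int factor)
{
        if (factor == 10)
                return num;
        return num * factor / 10;       /* kernel version wraps the same way */
}

static uint64_t div_factor_fine(uint64_t num, int factor)
{
        if (factor == 100)
                return num;
        return num * factor / 100;
}

int main(void)
{
        assert(div_factor(1000, 7) == 700);        /* 7/10 of 1000 */
        assert(div_factor_fine(1000, 95) == 950);  /* 95/100 of 1000 */
        return 0;
}
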
index 7772f02ba28e6966826c0d897475961ad628c2f5..f107312970405da1e3218118a8d0555894955c59 100644 (file)
@@ -211,6 +211,8 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
        init_waitqueue_head(&entry->wait);
        INIT_LIST_HEAD(&entry->list);
        INIT_LIST_HEAD(&entry->root_extent_list);
+       INIT_LIST_HEAD(&entry->work_list);
+       init_completion(&entry->completion);
 
        trace_btrfs_ordered_extent_add(inode, entry);
 
@@ -464,18 +466,28 @@ void btrfs_remove_ordered_extent(struct inode *inode,
        wake_up(&entry->wait);
 }
 
+static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
+{
+       struct btrfs_ordered_extent *ordered;
+
+       ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
+       btrfs_start_ordered_extent(ordered->inode, ordered, 1);
+       complete(&ordered->completion);
+}
+
 /*
  * wait for all the ordered extents in a root.  This is done when balancing
  * space between drives.
  */
 void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
 {
-       struct list_head splice;
+       struct list_head splice, works;
        struct list_head *cur;
-       struct btrfs_ordered_extent *ordered;
+       struct btrfs_ordered_extent *ordered, *next;
        struct inode *inode;
 
        INIT_LIST_HEAD(&splice);
+       INIT_LIST_HEAD(&works);
 
        spin_lock(&root->fs_info->ordered_extent_lock);
        list_splice_init(&root->fs_info->ordered_extents, &splice);
@@ -494,19 +506,32 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
                spin_unlock(&root->fs_info->ordered_extent_lock);
 
                if (inode) {
-                       btrfs_start_ordered_extent(inode, ordered, 1);
-                       btrfs_put_ordered_extent(ordered);
-                       if (delay_iput)
-                               btrfs_add_delayed_iput(inode);
-                       else
-                               iput(inode);
+                       ordered->flush_work.func = btrfs_run_ordered_extent_work;
+                       list_add_tail(&ordered->work_list, &works);
+                       btrfs_queue_worker(&root->fs_info->flush_workers,
+                                          &ordered->flush_work);
                } else {
                        btrfs_put_ordered_extent(ordered);
                }
 
+               cond_resched();
                spin_lock(&root->fs_info->ordered_extent_lock);
        }
        spin_unlock(&root->fs_info->ordered_extent_lock);
+
+       list_for_each_entry_safe(ordered, next, &works, work_list) {
+               list_del_init(&ordered->work_list);
+               wait_for_completion(&ordered->completion);
+
+               inode = ordered->inode;
+               btrfs_put_ordered_extent(ordered);
+               if (delay_iput)
+                       btrfs_add_delayed_iput(inode);
+               else
+                       iput(inode);
+
+               cond_resched();
+       }
 }
 
 /*
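
The rewrite above stops flushing each ordered extent synchronously inside the loop; every extent is queued on flush_workers first, and only afterwards are the completions collected, so the flushes run in parallel. A rough userspace analogue of that queue-everything-then-wait shape, with pthreads standing in for btrfs_queue_worker() and wait_for_completion() (illustrative only):

#include <pthread.h>
#include <stdio.h>

#define NR_EXTENTS 4

static void *flush_one(void *arg)
{
        int idx = *(int *)arg;

        printf("flushing ordered extent %d\n", idx);  /* the per-extent work */
        return NULL;
}

int main(void)
{
        pthread_t workers[NR_EXTENTS];
        int ids[NR_EXTENTS];
        int i;

        for (i = 0; i < NR_EXTENTS; i++) {      /* queue everything first */
                ids[i] = i;
                pthread_create(&workers[i], NULL, flush_one, &ids[i]);
        }
        for (i = 0; i < NR_EXTENTS; i++)        /* then wait for completions */
                pthread_join(workers[i], NULL);
        return 0;
}
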
@@ -519,13 +544,17 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
  * extra check to make sure the ordered operation list really is empty
  * before we return
  */
-void btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
+int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
 {
        struct btrfs_inode *btrfs_inode;
        struct inode *inode;
        struct list_head splice;
+       struct list_head works;
+       struct btrfs_delalloc_work *work, *next;
+       int ret = 0;
 
        INIT_LIST_HEAD(&splice);
+       INIT_LIST_HEAD(&works);
 
        mutex_lock(&root->fs_info->ordered_operations_mutex);
        spin_lock(&root->fs_info->ordered_extent_lock);
@@ -533,6 +562,7 @@ again:
        list_splice_init(&root->fs_info->ordered_operations, &splice);
 
        while (!list_empty(&splice)) {
+
                btrfs_inode = list_entry(splice.next, struct btrfs_inode,
                                   ordered_operations);
 
@@ -549,15 +579,26 @@ again:
                        list_add_tail(&BTRFS_I(inode)->ordered_operations,
                              &root->fs_info->ordered_operations);
                }
+
+               if (!inode)
+                       continue;
                spin_unlock(&root->fs_info->ordered_extent_lock);
 
-               if (inode) {
-                       if (wait)
-                               btrfs_wait_ordered_range(inode, 0, (u64)-1);
-                       else
-                               filemap_flush(inode->i_mapping);
-                       btrfs_add_delayed_iput(inode);
+               work = btrfs_alloc_delalloc_work(inode, wait, 1);
+               if (!work) {
+                       if (list_empty(&BTRFS_I(inode)->ordered_operations))
+                               list_add_tail(&btrfs_inode->ordered_operations,
+                                             &splice);
+                       spin_lock(&root->fs_info->ordered_extent_lock);
+                       list_splice_tail(&splice,
+                                        &root->fs_info->ordered_operations);
+                       spin_unlock(&root->fs_info->ordered_extent_lock);
+                       ret = -ENOMEM;
+                       goto out;
                }
+               list_add_tail(&work->list, &works);
+               btrfs_queue_worker(&root->fs_info->flush_workers,
+                                  &work->work);
 
                cond_resched();
                spin_lock(&root->fs_info->ordered_extent_lock);
@@ -566,7 +607,13 @@ again:
                goto again;
 
        spin_unlock(&root->fs_info->ordered_extent_lock);
+out:
+       list_for_each_entry_safe(work, next, &works, list) {
+               list_del_init(&work->list);
+               btrfs_wait_and_free_delalloc_work(work);
+       }
        mutex_unlock(&root->fs_info->ordered_operations_mutex);
+       return ret;
 }
 
 /*
@@ -606,7 +653,6 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
        u64 end;
        u64 orig_end;
        struct btrfs_ordered_extent *ordered;
-       int found;
 
        if (start + len < start) {
                orig_end = INT_LIMIT(loff_t);
@@ -642,7 +688,6 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
        filemap_fdatawait_range(inode->i_mapping, start, orig_end);
 
        end = orig_end;
-       found = 0;
        while (1) {
                ordered = btrfs_lookup_first_ordered_extent(inode, end);
                if (!ordered)
@@ -655,7 +700,6 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
                        btrfs_put_ordered_extent(ordered);
                        break;
                }
-               found++;
                btrfs_start_ordered_extent(inode, ordered, 1);
                end = ordered->file_offset;
                btrfs_put_ordered_extent(ordered);
@@ -934,15 +978,6 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
        if (last_mod < root->fs_info->last_trans_committed)
                return;
 
-       /*
-        * the transaction is already committing.  Just start the IO and
-        * don't bother with all of this list nonsense
-        */
-       if (trans && root->fs_info->running_transaction->blocked) {
-               btrfs_wait_ordered_range(inode, 0, (u64)-1);
-               return;
-       }
-
        spin_lock(&root->fs_info->ordered_extent_lock);
        if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
                list_add_tail(&BTRFS_I(inode)->ordered_operations,
@@ -959,6 +994,7 @@ int __init ordered_data_init(void)
                                     NULL);
        if (!btrfs_ordered_extent_cache)
                return -ENOMEM;
+
        return 0;
 }
 
index 853fc7beedfaae7fd7ed03750e786693839b3f47..f29d4bf5fbe70dee3990874be8d9975c937f3477 100644 (file)
@@ -128,8 +128,11 @@ struct btrfs_ordered_extent {
        struct list_head root_extent_list;
 
        struct btrfs_work work;
-};
 
+       struct completion completion;
+       struct btrfs_work flush_work;
+       struct list_head work_list;
+};
 
 /*
  * calculates the total size you need to allocate for an ordered sum
@@ -186,7 +189,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
 int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
                                struct btrfs_ordered_extent *ordered);
 int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
-void btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
+int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
 void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root,
                                 struct inode *inode);
index 5e23684887eb8eb401594af69b1be7372f7188aa..50d95fd190a5932e165e9331313ca3e302635b5d 100644 (file)
@@ -297,6 +297,9 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
                case BTRFS_DEV_STATS_KEY:
                        printk(KERN_INFO "\t\tdevice stats\n");
                        break;
+               case BTRFS_DEV_REPLACE_KEY:
+                       printk(KERN_INFO "\t\tdev replace\n");
+                       break;
                };
        }
 }
index a955669519a265bbfb5f13de4723c87932f8047e..96b93daa0bbb484440e5d51b605ea15884cb9148 100644 (file)
@@ -27,6 +27,7 @@
 #include "volumes.h"
 #include "disk-io.h"
 #include "transaction.h"
+#include "dev-replace.h"
 
 #undef DEBUG
 
@@ -323,7 +324,6 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
        struct reada_extent *re = NULL;
        struct reada_extent *re_exist = NULL;
        struct btrfs_fs_info *fs_info = root->fs_info;
-       struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
        struct btrfs_bio *bbio = NULL;
        struct btrfs_device *dev;
        struct btrfs_device *prev_dev;
@@ -332,6 +332,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
        int nzones = 0;
        int i;
        unsigned long index = logical >> PAGE_CACHE_SHIFT;
+       int dev_replace_is_ongoing;
 
        spin_lock(&fs_info->reada_lock);
        re = radix_tree_lookup(&fs_info->reada_tree, index);
@@ -358,7 +359,8 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
         * map block
         */
        length = blocksize;
-       ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, &bbio, 0);
+       ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical, &length,
+                             &bbio, 0);
        if (ret || !bbio || length < blocksize)
                goto error;
 
@@ -393,6 +395,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
        }
 
        /* insert extent in reada_tree + all per-device trees, all or nothing */
+       btrfs_dev_replace_lock(&fs_info->dev_replace);
        spin_lock(&fs_info->reada_lock);
        ret = radix_tree_insert(&fs_info->reada_tree, index, re);
        if (ret == -EEXIST) {
@@ -400,13 +403,17 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
                BUG_ON(!re_exist);
                re_exist->refcnt++;
                spin_unlock(&fs_info->reada_lock);
+               btrfs_dev_replace_unlock(&fs_info->dev_replace);
                goto error;
        }
        if (ret) {
                spin_unlock(&fs_info->reada_lock);
+               btrfs_dev_replace_unlock(&fs_info->dev_replace);
                goto error;
        }
        prev_dev = NULL;
+       dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(
+                       &fs_info->dev_replace);
        for (i = 0; i < nzones; ++i) {
                dev = bbio->stripes[i].dev;
                if (dev == prev_dev) {
@@ -419,21 +426,36 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
                         */
                        continue;
                }
+               if (!dev->bdev) {
+                       /* cannot read ahead on missing device */
+                       continue;
+               }
+               if (dev_replace_is_ongoing &&
+                   dev == fs_info->dev_replace.tgtdev) {
+                       /*
+                        * as this device is selected for reading only as
+                        * a last resort, skip it for read ahead.
+                        */
+                       continue;
+               }
                prev_dev = dev;
                ret = radix_tree_insert(&dev->reada_extents, index, re);
                if (ret) {
                        while (--i >= 0) {
                                dev = bbio->stripes[i].dev;
                                BUG_ON(dev == NULL);
+                               /* ignore whether the entry was inserted */
                                radix_tree_delete(&dev->reada_extents, index);
                        }
                        BUG_ON(fs_info == NULL);
                        radix_tree_delete(&fs_info->reada_tree, index);
                        spin_unlock(&fs_info->reada_lock);
+                       btrfs_dev_replace_unlock(&fs_info->dev_replace);
                        goto error;
                }
        }
        spin_unlock(&fs_info->reada_lock);
+       btrfs_dev_replace_unlock(&fs_info->dev_replace);
 
        kfree(bbio);
        return re;
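
The while (--i >= 0) error path above makes the insertion all-or-nothing: when adding the readahead extent to the i-th per-device radix tree fails, the entries already inserted are removed again so no partial state is left behind. A stripped-down sketch of the pattern (illustrative, not kernel code):

#include <stdio.h>

#define NZONES 4

/* simulate radix_tree_insert(): fail when inserting entry 'fail_at' */
static int insert(int table[], int i, int fail_at)
{
        if (i == fail_at)
                return -1;
        table[i] = 1;
        return 0;
}

int main(void)
{
        int table[NZONES] = { 0 };
        int i;

        for (i = 0; i < NZONES; i++) {
                if (insert(table, i, 2)) {
                        while (--i >= 0)        /* undo prior insertions */
                                table[i] = 0;
                        fprintf(stderr, "insert failed, rolled back\n");
                        return 1;
                }
        }
        return 0;
}
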
@@ -915,7 +937,10 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
        generation = btrfs_header_generation(node);
        free_extent_buffer(node);
 
-       reada_add_block(rc, start, &max_key, level, generation);
+       if (reada_add_block(rc, start, &max_key, level, generation)) {
+               kfree(rc);
+               return ERR_PTR(-ENOMEM);
+       }
 
        reada_start_machine(root->fs_info);
 
index 776f0aa128fc56294dbed997d6a60768f05a0ee9..300e09ac36599ae8b412284b43e677792005fe9e 100644 (file)
@@ -2025,7 +2025,6 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
        struct btrfs_root_item *root_item;
        struct btrfs_path *path;
        struct extent_buffer *leaf;
-       unsigned long nr;
        int level;
        int max_level;
        int replaced = 0;
@@ -2074,7 +2073,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
                BUG_ON(IS_ERR(trans));
                trans->block_rsv = rc->block_rsv;
 
-               ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved);
+               ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved,
+                                            BTRFS_RESERVE_FLUSH_ALL);
                if (ret) {
                        BUG_ON(ret != -EAGAIN);
                        ret = btrfs_commit_transaction(trans, root);
@@ -2125,10 +2125,9 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
                               path->slots[level]);
                root_item->drop_level = level;
 
-               nr = trans->blocks_used;
                btrfs_end_transaction_throttle(trans, root);
 
-               btrfs_btree_balance_dirty(root, nr);
+               btrfs_btree_balance_dirty(root);
 
                if (replaced && rc->stage == UPDATE_DATA_PTRS)
                        invalidate_extent_cache(root, &key, &next_key);
@@ -2155,10 +2154,9 @@ out:
                btrfs_update_reloc_root(trans, root);
        }
 
-       nr = trans->blocks_used;
        btrfs_end_transaction_throttle(trans, root);
 
-       btrfs_btree_balance_dirty(root, nr);
+       btrfs_btree_balance_dirty(root);
 
        if (replaced && rc->stage == UPDATE_DATA_PTRS)
                invalidate_extent_cache(root, &key, &next_key);
@@ -2184,7 +2182,8 @@ int prepare_to_merge(struct reloc_control *rc, int err)
 again:
        if (!err) {
                num_bytes = rc->merging_rsv_size;
-               ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes);
+               ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
+                                         BTRFS_RESERVE_FLUSH_ALL);
                if (ret)
                        err = ret;
        }
@@ -2459,7 +2458,8 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
        num_bytes = calcu_metadata_size(rc, node, 1) * 2;
 
        trans->block_rsv = rc->block_rsv;
-       ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes);
+       ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
+                                 BTRFS_RESERVE_FLUSH_ALL);
        if (ret) {
                if (ret == -EAGAIN)
                        rc->commit_transaction = 1;
@@ -3259,7 +3259,6 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
        struct btrfs_path *path;
        struct btrfs_root *root = fs_info->tree_root;
        struct btrfs_trans_handle *trans;
-       unsigned long nr;
        int ret = 0;
 
        if (inode)
@@ -3293,9 +3292,8 @@ truncate:
        ret = btrfs_truncate_free_space_cache(root, trans, path, inode);
 
        btrfs_free_path(path);
-       nr = trans->blocks_used;
        btrfs_end_transaction(trans, root);
-       btrfs_btree_balance_dirty(root, nr);
+       btrfs_btree_balance_dirty(root);
 out:
        iput(inode);
        return ret;
@@ -3685,7 +3683,8 @@ int prepare_to_relocate(struct reloc_control *rc)
         * is no reservation in transaction handle.
         */
        ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv,
-                                 rc->extent_root->nodesize * 256);
+                                 rc->extent_root->nodesize * 256,
+                                 BTRFS_RESERVE_FLUSH_ALL);
        if (ret)
                return ret;
 
@@ -3711,7 +3710,6 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
        struct btrfs_trans_handle *trans = NULL;
        struct btrfs_path *path;
        struct btrfs_extent_item *ei;
-       unsigned long nr;
        u64 flags;
        u32 item_size;
        int ret;
@@ -3828,9 +3826,8 @@ restart:
                        ret = btrfs_commit_transaction(trans, rc->extent_root);
                        BUG_ON(ret);
                } else {
-                       nr = trans->blocks_used;
                        btrfs_end_transaction_throttle(trans, rc->extent_root);
-                       btrfs_btree_balance_dirty(rc->extent_root, nr);
+                       btrfs_btree_balance_dirty(rc->extent_root);
                }
                trans = NULL;
 
@@ -3860,9 +3857,8 @@ restart:
                          GFP_NOFS);
 
        if (trans) {
-               nr = trans->blocks_used;
                btrfs_end_transaction_throttle(trans, rc->extent_root);
-               btrfs_btree_balance_dirty(rc->extent_root, nr);
+               btrfs_btree_balance_dirty(rc->extent_root);
        }
 
        if (!err) {
@@ -3941,7 +3937,6 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
        struct btrfs_trans_handle *trans;
        struct btrfs_root *root;
        struct btrfs_key key;
-       unsigned long nr;
        u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
        int err = 0;
 
@@ -3969,9 +3964,8 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
 
        err = btrfs_orphan_add(trans, inode);
 out:
-       nr = trans->blocks_used;
        btrfs_end_transaction(trans, root);
-       btrfs_btree_balance_dirty(root, nr);
+       btrfs_btree_balance_dirty(root);
        if (err) {
                if (inode)
                        iput(inode);
@@ -4057,7 +4051,11 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
               (unsigned long long)rc->block_group->key.objectid,
               (unsigned long long)rc->block_group->flags);
 
-       btrfs_start_delalloc_inodes(fs_info->tree_root, 0);
+       ret = btrfs_start_delalloc_inodes(fs_info->tree_root, 0);
+       if (ret < 0) {
+               err = ret;
+               goto out;
+       }
        btrfs_wait_ordered_extents(fs_info->tree_root, 0);
 
        while (1) {
index eb923d087da7848d445820213d53ff3b39d7d279..668af537a3ea2f38abeb7935bfe1d2f4b17132f1 100644 (file)
@@ -548,9 +548,9 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans,
        struct btrfs_root_item *item = &root->root_item;
        struct timespec ct = CURRENT_TIME;
 
-       spin_lock(&root->root_times_lock);
+       spin_lock(&root->root_item_lock);
        item->ctransid = cpu_to_le64(trans->transid);
        item->ctime.sec = cpu_to_le64(ct.tv_sec);
        item->ctime.nsec = cpu_to_le32(ct.tv_nsec);
-       spin_unlock(&root->root_times_lock);
+       spin_unlock(&root->root_item_lock);
 }
index 27892f67e69b216694299720caf31b6e19919555..bdbb94f245c9070802c65acb6eba392ba1a4c932 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2011 STRATO.  All rights reserved.
+ * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public
@@ -25,6 +25,7 @@
 #include "transaction.h"
 #include "backref.h"
 #include "extent_io.h"
+#include "dev-replace.h"
 #include "check-integrity.h"
 #include "rcu-string.h"
 
  */
 
 struct scrub_block;
-struct scrub_dev;
+struct scrub_ctx;
 
-#define SCRUB_PAGES_PER_BIO    16      /* 64k per bio */
-#define SCRUB_BIOS_PER_DEV     16      /* 1 MB per device in flight */
+/*
+ * the following three values only influence performance.
+ * The last one configures the number of parallel and outstanding I/O
+ * operations. The first two values configure an upper limit for the number
+ * of (dynamically allocated) pages that are added to a bio.
+ */
+#define SCRUB_PAGES_PER_RD_BIO 32      /* 128k per bio */
+#define SCRUB_PAGES_PER_WR_BIO 32      /* 128k per bio */
+#define SCRUB_BIOS_PER_SCTX    64      /* 8MB per device in flight */
+
+/*
+ * the following value times PAGE_SIZE needs to be large enough to match the
+ * largest node/leaf/sector size that shall be supported.
+ * Values larger than BTRFS_STRIPE_LEN are not supported.
+ */
 #define SCRUB_MAX_PAGES_PER_BLOCK      16      /* 64k per node/leaf/sector */
 
 struct scrub_page {
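
Worked out with 4 KiB pages, the sizing in the comments above is: 32 pages per bio × 4 KiB = 128 KiB per read or write bio, and 64 bios × 128 KiB = 8 MiB outstanding per scrub context, which is where the "128k per bio" and "8MB per device in flight" figures come from.
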
@@ -56,6 +70,8 @@ struct scrub_page {
        u64                     generation;
        u64                     logical;
        u64                     physical;
+       u64                     physical_for_dev_replace;
+       atomic_t                ref_count;
        struct {
                unsigned int    mirror_num:8;
                unsigned int    have_csum:1;
@@ -66,23 +82,28 @@ struct scrub_page {
 
 struct scrub_bio {
        int                     index;
-       struct scrub_dev        *sdev;
+       struct scrub_ctx        *sctx;
+       struct btrfs_device     *dev;
        struct bio              *bio;
        int                     err;
        u64                     logical;
        u64                     physical;
-       struct scrub_page       *pagev[SCRUB_PAGES_PER_BIO];
+#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
+       struct scrub_page       *pagev[SCRUB_PAGES_PER_WR_BIO];
+#else
+       struct scrub_page       *pagev[SCRUB_PAGES_PER_RD_BIO];
+#endif
        int                     page_count;
        int                     next_free;
        struct btrfs_work       work;
 };
 
 struct scrub_block {
-       struct scrub_page       pagev[SCRUB_MAX_PAGES_PER_BLOCK];
+       struct scrub_page       *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
        int                     page_count;
        atomic_t                outstanding_pages;
        atomic_t                ref_count; /* free mem on transition to zero */
-       struct scrub_dev        *sdev;
+       struct scrub_ctx        *sctx;
        struct {
                unsigned int    header_error:1;
                unsigned int    checksum_error:1;
@@ -91,23 +112,35 @@ struct scrub_block {
        };
 };
 
-struct scrub_dev {
-       struct scrub_bio        *bios[SCRUB_BIOS_PER_DEV];
-       struct btrfs_device     *dev;
+struct scrub_wr_ctx {
+       struct scrub_bio *wr_curr_bio;
+       struct btrfs_device *tgtdev;
+       int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
+       atomic_t flush_all_writes;
+       struct mutex wr_lock;
+};
+
+struct scrub_ctx {
+       struct scrub_bio        *bios[SCRUB_BIOS_PER_SCTX];
+       struct btrfs_root       *dev_root;
        int                     first_free;
        int                     curr;
-       atomic_t                in_flight;
-       atomic_t                fixup_cnt;
+       atomic_t                bios_in_flight;
+       atomic_t                workers_pending;
        spinlock_t              list_lock;
        wait_queue_head_t       list_wait;
        u16                     csum_size;
        struct list_head        csum_list;
        atomic_t                cancel_req;
        int                     readonly;
-       int                     pages_per_bio; /* <= SCRUB_PAGES_PER_BIO */
+       int                     pages_per_rd_bio;
        u32                     sectorsize;
        u32                     nodesize;
        u32                     leafsize;
+
+       int                     is_dev_replace;
+       struct scrub_wr_ctx     wr_ctx;
+
        /*
         * statistics
         */
@@ -116,13 +149,23 @@ struct scrub_dev {
 };
 
 struct scrub_fixup_nodatasum {
-       struct scrub_dev        *sdev;
+       struct scrub_ctx        *sctx;
+       struct btrfs_device     *dev;
        u64                     logical;
        struct btrfs_root       *root;
        struct btrfs_work       work;
        int                     mirror_num;
 };
 
+struct scrub_copy_nocow_ctx {
+       struct scrub_ctx        *sctx;
+       u64                     logical;
+       u64                     len;
+       int                     mirror_num;
+       u64                     physical_for_dev_replace;
+       struct btrfs_work       work;
+};
+
 struct scrub_warning {
        struct btrfs_path       *path;
        u64                     extent_item_size;
@@ -137,15 +180,20 @@ struct scrub_warning {
 };
 
 
+static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
+static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
+static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
+static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
-static int scrub_setup_recheck_block(struct scrub_dev *sdev,
-                                    struct btrfs_mapping_tree *map_tree,
+static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
+                                    struct btrfs_fs_info *fs_info,
+                                    struct scrub_block *original_sblock,
                                     u64 length, u64 logical,
-                                    struct scrub_block *sblock);
-static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
-                              struct scrub_block *sblock, int is_metadata,
-                              int have_csum, u8 *csum, u64 generation,
-                              u16 csum_size);
+                                    struct scrub_block *sblocks_for_recheck);
+static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
+                               struct scrub_block *sblock, int is_metadata,
+                               int have_csum, u8 *csum, u64 generation,
+                               u16 csum_size);
 static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
                                         struct scrub_block *sblock,
                                         int is_metadata, int have_csum,
@@ -158,118 +206,221 @@ static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
                                            struct scrub_block *sblock_good,
                                            int page_num, int force_write);
+static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
+static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
+                                          int page_num);
 static int scrub_checksum_data(struct scrub_block *sblock);
 static int scrub_checksum_tree_block(struct scrub_block *sblock);
 static int scrub_checksum_super(struct scrub_block *sblock);
 static void scrub_block_get(struct scrub_block *sblock);
 static void scrub_block_put(struct scrub_block *sblock);
-static int scrub_add_page_to_bio(struct scrub_dev *sdev,
-                                struct scrub_page *spage);
-static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
-                      u64 physical, u64 flags, u64 gen, int mirror_num,
-                      u8 *csum, int force);
+static void scrub_page_get(struct scrub_page *spage);
+static void scrub_page_put(struct scrub_page *spage);
+static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
+                                   struct scrub_page *spage);
+static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
+                      u64 physical, struct btrfs_device *dev, u64 flags,
+                      u64 gen, int mirror_num, u8 *csum, int force,
+                      u64 physical_for_dev_replace);
 static void scrub_bio_end_io(struct bio *bio, int err);
 static void scrub_bio_end_io_worker(struct btrfs_work *work);
 static void scrub_block_complete(struct scrub_block *sblock);
+static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
+                              u64 extent_logical, u64 extent_len,
+                              u64 *extent_physical,
+                              struct btrfs_device **extent_dev,
+                              int *extent_mirror_num);
+static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
+                             struct scrub_wr_ctx *wr_ctx,
+                             struct btrfs_fs_info *fs_info,
+                             struct btrfs_device *dev,
+                             int is_dev_replace);
+static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx);
+static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
+                                   struct scrub_page *spage);
+static void scrub_wr_submit(struct scrub_ctx *sctx);
+static void scrub_wr_bio_end_io(struct bio *bio, int err);
+static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
+static int write_page_nocow(struct scrub_ctx *sctx,
+                           u64 physical_for_dev_replace, struct page *page);
+static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
+                                     void *ctx);
+static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
+                           int mirror_num, u64 physical_for_dev_replace);
+static void copy_nocow_pages_worker(struct btrfs_work *work);
+
+
+static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
+{
+       atomic_inc(&sctx->bios_in_flight);
+}
+
+static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
+{
+       atomic_dec(&sctx->bios_in_flight);
+       wake_up(&sctx->list_wait);
+}
+
+/*
+ * used for workers that require transaction commits (i.e., for the
+ * NOCOW case)
+ */
+static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
+{
+       struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
+
+       /*
+        * increment scrubs_running to prevent cancel requests from
+        * completing as long as a worker is running. we must also
+        * increment scrubs_paused to prevent deadlocking on pause
+        * requests used for transaction commits (as the worker uses a
+        * transaction context). it is safe to regard the worker
+        * as paused for all practical matters. effectively, we only
+        * avoid cancellation requests from completing.
+        */
+       mutex_lock(&fs_info->scrub_lock);
+       atomic_inc(&fs_info->scrubs_running);
+       atomic_inc(&fs_info->scrubs_paused);
+       mutex_unlock(&fs_info->scrub_lock);
+       atomic_inc(&sctx->workers_pending);
+}
 
+/* used for workers that require transaction commits */
+static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
+{
+       struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
 
-static void scrub_free_csums(struct scrub_dev *sdev)
+       /*
+        * see scrub_pending_trans_workers_inc() for why we're pretending
+        * to be paused in the scrub counters
+        */
+       mutex_lock(&fs_info->scrub_lock);
+       atomic_dec(&fs_info->scrubs_running);
+       atomic_dec(&fs_info->scrubs_paused);
+       mutex_unlock(&fs_info->scrub_lock);
+       atomic_dec(&sctx->workers_pending);
+       wake_up(&fs_info->scrub_pause_wait);
+       wake_up(&sctx->list_wait);
+}
+
+static void scrub_free_csums(struct scrub_ctx *sctx)
 {
-       while (!list_empty(&sdev->csum_list)) {
+       while (!list_empty(&sctx->csum_list)) {
                struct btrfs_ordered_sum *sum;
-               sum = list_first_entry(&sdev->csum_list,
+               sum = list_first_entry(&sctx->csum_list,
                                       struct btrfs_ordered_sum, list);
                list_del(&sum->list);
                kfree(sum);
        }
 }
 
-static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev)
+static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
 {
        int i;
 
-       if (!sdev)
+       if (!sctx)
                return;
 
+       scrub_free_wr_ctx(&sctx->wr_ctx);
+
        /* this can happen when scrub is cancelled */
-       if (sdev->curr != -1) {
-               struct scrub_bio *sbio = sdev->bios[sdev->curr];
+       if (sctx->curr != -1) {
+               struct scrub_bio *sbio = sctx->bios[sctx->curr];
 
                for (i = 0; i < sbio->page_count; i++) {
-                       BUG_ON(!sbio->pagev[i]);
-                       BUG_ON(!sbio->pagev[i]->page);
+                       WARN_ON(!sbio->pagev[i]->page);
                        scrub_block_put(sbio->pagev[i]->sblock);
                }
                bio_put(sbio->bio);
        }
 
-       for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
-               struct scrub_bio *sbio = sdev->bios[i];
+       for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
+               struct scrub_bio *sbio = sctx->bios[i];
 
                if (!sbio)
                        break;
                kfree(sbio);
        }
 
-       scrub_free_csums(sdev);
-       kfree(sdev);
+       scrub_free_csums(sctx);
+       kfree(sctx);
 }
 
 static noinline_for_stack
-struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
+struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
 {
-       struct scrub_dev *sdev;
+       struct scrub_ctx *sctx;
        int             i;
        struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
-       int pages_per_bio;
+       int pages_per_rd_bio;
+       int ret;
 
-       pages_per_bio = min_t(int, SCRUB_PAGES_PER_BIO,
-                             bio_get_nr_vecs(dev->bdev));
-       sdev = kzalloc(sizeof(*sdev), GFP_NOFS);
-       if (!sdev)
+       /*
+        * the setting of pages_per_rd_bio is correct for scrub but might
+        * be wrong for the dev_replace code where we might read from
+        * different devices in the initial huge bios. However, that
+        * code is able to correctly handle the case when adding a page
+        * to a bio fails.
+        */
+       if (dev->bdev)
+               pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO,
+                                        bio_get_nr_vecs(dev->bdev));
+       else
+               pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
+       sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
+       if (!sctx)
                goto nomem;
-       sdev->dev = dev;
-       sdev->pages_per_bio = pages_per_bio;
-       sdev->curr = -1;
-       for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
+       sctx->is_dev_replace = is_dev_replace;
+       sctx->pages_per_rd_bio = pages_per_rd_bio;
+       sctx->curr = -1;
+       sctx->dev_root = dev->dev_root;
+       for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
                struct scrub_bio *sbio;
 
                sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
                if (!sbio)
                        goto nomem;
-               sdev->bios[i] = sbio;
+               sctx->bios[i] = sbio;
 
                sbio->index = i;
-               sbio->sdev = sdev;
+               sbio->sctx = sctx;
                sbio->page_count = 0;
                sbio->work.func = scrub_bio_end_io_worker;
 
-               if (i != SCRUB_BIOS_PER_DEV-1)
-                       sdev->bios[i]->next_free = i + 1;
+               if (i != SCRUB_BIOS_PER_SCTX - 1)
+                       sctx->bios[i]->next_free = i + 1;
                else
-                       sdev->bios[i]->next_free = -1;
-       }
-       sdev->first_free = 0;
-       sdev->nodesize = dev->dev_root->nodesize;
-       sdev->leafsize = dev->dev_root->leafsize;
-       sdev->sectorsize = dev->dev_root->sectorsize;
-       atomic_set(&sdev->in_flight, 0);
-       atomic_set(&sdev->fixup_cnt, 0);
-       atomic_set(&sdev->cancel_req, 0);
-       sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy);
-       INIT_LIST_HEAD(&sdev->csum_list);
-
-       spin_lock_init(&sdev->list_lock);
-       spin_lock_init(&sdev->stat_lock);
-       init_waitqueue_head(&sdev->list_wait);
-       return sdev;
+                       sctx->bios[i]->next_free = -1;
+       }
+       sctx->first_free = 0;
+       sctx->nodesize = dev->dev_root->nodesize;
+       sctx->leafsize = dev->dev_root->leafsize;
+       sctx->sectorsize = dev->dev_root->sectorsize;
+       atomic_set(&sctx->bios_in_flight, 0);
+       atomic_set(&sctx->workers_pending, 0);
+       atomic_set(&sctx->cancel_req, 0);
+       sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
+       INIT_LIST_HEAD(&sctx->csum_list);
+
+       spin_lock_init(&sctx->list_lock);
+       spin_lock_init(&sctx->stat_lock);
+       init_waitqueue_head(&sctx->list_wait);
+
+       ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info,
+                                fs_info->dev_replace.tgtdev, is_dev_replace);
+       if (ret) {
+               scrub_free_ctx(sctx);
+               return ERR_PTR(ret);
+       }
+       return sctx;
 
 nomem:
-       scrub_free_dev(sdev);
+       scrub_free_ctx(sctx);
        return ERR_PTR(-ENOMEM);
 }
 
-static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
+static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
+                                    void *warn_ctx)
 {
        u64 isize;
        u32 nlink;
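
The scrub_pending_bio_inc()/scrub_pending_bio_dec() helpers introduced above pair an in-flight counter with sctx->list_wait so a waiter can sleep until the count drains to zero. A rough userspace analogue, where a mutex/condvar pair stands in for the atomic counter plus wait queue (illustrative only, not kernel code):

#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t list_wait = PTHREAD_COND_INITIALIZER;
static int bios_in_flight;

static void pending_inc(void)
{
        pthread_mutex_lock(&lock);
        bios_in_flight++;
        pthread_mutex_unlock(&lock);
}

static void pending_dec(void)
{
        pthread_mutex_lock(&lock);
        bios_in_flight--;
        pthread_cond_broadcast(&list_wait);  /* ~ wake_up(&sctx->list_wait) */
        pthread_mutex_unlock(&lock);
}

static void *bio_done(void *arg)
{
        (void)arg;
        pending_dec();
        return NULL;
}

int main(void)
{
        pthread_t t;

        pending_inc();                  /* a bio was submitted */
        pthread_create(&t, NULL, bio_done, NULL);

        pthread_mutex_lock(&lock);      /* wait until everything drained */
        while (bios_in_flight > 0)
                pthread_cond_wait(&list_wait, &lock);
        pthread_mutex_unlock(&lock);

        pthread_join(t, NULL);
        return 0;
}
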
@@ -277,7 +428,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
        int i;
        struct extent_buffer *eb;
        struct btrfs_inode_item *inode_item;
-       struct scrub_warning *swarn = ctx;
+       struct scrub_warning *swarn = warn_ctx;
        struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
        struct inode_fs_paths *ipath = NULL;
        struct btrfs_root *local_root;
@@ -345,8 +496,8 @@ err:
 
 static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
 {
-       struct btrfs_device *dev = sblock->sdev->dev;
-       struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
+       struct btrfs_device *dev;
+       struct btrfs_fs_info *fs_info;
        struct btrfs_path *path;
        struct btrfs_key found_key;
        struct extent_buffer *eb;
@@ -361,15 +512,18 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
        const int bufsize = 4096;
        int ret;
 
+       WARN_ON(sblock->page_count < 1);
+       dev = sblock->pagev[0]->dev;
+       fs_info = sblock->sctx->dev_root->fs_info;
+
        path = btrfs_alloc_path();
 
        swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
        swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
-       BUG_ON(sblock->page_count < 1);
-       swarn.sector = (sblock->pagev[0].physical) >> 9;
-       swarn.logical = sblock->pagev[0].logical;
+       swarn.sector = (sblock->pagev[0]->physical) >> 9;
+       swarn.logical = sblock->pagev[0]->logical;
        swarn.errstr = errstr;
-       swarn.dev = dev;
+       swarn.dev = NULL;
        swarn.msg_bufsize = bufsize;
        swarn.scratch_bufsize = bufsize;
 
@@ -405,6 +559,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
                } while (ret != 1);
        } else {
                swarn.path = path;
+               swarn.dev = dev;
                iterate_extent_inodes(fs_info, found_key.objectid,
                                        extent_item_pos, 1,
                                        scrub_print_warning_inode, &swarn);
@@ -416,11 +571,11 @@ out:
        kfree(swarn.msg_buf);
 }
 
-static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
+static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
 {
        struct page *page = NULL;
        unsigned long index;
-       struct scrub_fixup_nodatasum *fixup = ctx;
+       struct scrub_fixup_nodatasum *fixup = fixup_ctx;
        int ret;
        int corrected = 0;
        struct btrfs_key key;
@@ -451,7 +606,7 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
        }
 
        if (PageUptodate(page)) {
-               struct btrfs_mapping_tree *map_tree;
+               struct btrfs_fs_info *fs_info;
                if (PageDirty(page)) {
                        /*
                         * we need to write the data to the defect sector. the
@@ -472,8 +627,8 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
                        ret = -EIO;
                        goto out;
                }
-               map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
-               ret = repair_io_failure(map_tree, offset, PAGE_SIZE,
+               fs_info = BTRFS_I(inode)->root->fs_info;
+               ret = repair_io_failure(fs_info, offset, PAGE_SIZE,
                                        fixup->logical, page,
                                        fixup->mirror_num);
                unlock_page(page);
@@ -530,21 +685,21 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work)
 {
        int ret;
        struct scrub_fixup_nodatasum *fixup;
-       struct scrub_dev *sdev;
+       struct scrub_ctx *sctx;
        struct btrfs_trans_handle *trans = NULL;
        struct btrfs_fs_info *fs_info;
        struct btrfs_path *path;
        int uncorrectable = 0;
 
        fixup = container_of(work, struct scrub_fixup_nodatasum, work);
-       sdev = fixup->sdev;
+       sctx = fixup->sctx;
        fs_info = fixup->root->fs_info;
 
        path = btrfs_alloc_path();
        if (!path) {
-               spin_lock(&sdev->stat_lock);
-               ++sdev->stat.malloc_errors;
-               spin_unlock(&sdev->stat_lock);
+               spin_lock(&sctx->stat_lock);
+               ++sctx->stat.malloc_errors;
+               spin_unlock(&sctx->stat_lock);
                uncorrectable = 1;
                goto out;
        }
@@ -573,35 +728,30 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work)
        }
        WARN_ON(ret != 1);
 
-       spin_lock(&sdev->stat_lock);
-       ++sdev->stat.corrected_errors;
-       spin_unlock(&sdev->stat_lock);
+       spin_lock(&sctx->stat_lock);
+       ++sctx->stat.corrected_errors;
+       spin_unlock(&sctx->stat_lock);
 
 out:
        if (trans && !IS_ERR(trans))
                btrfs_end_transaction(trans, fixup->root);
        if (uncorrectable) {
-               spin_lock(&sdev->stat_lock);
-               ++sdev->stat.uncorrectable_errors;
-               spin_unlock(&sdev->stat_lock);
-
+               spin_lock(&sctx->stat_lock);
+               ++sctx->stat.uncorrectable_errors;
+               spin_unlock(&sctx->stat_lock);
+               btrfs_dev_replace_stats_inc(
+                       &sctx->dev_root->fs_info->dev_replace.
+                       num_uncorrectable_read_errors);
                printk_ratelimited_in_rcu(KERN_ERR
                        "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n",
                        (unsigned long long)fixup->logical,
-                       rcu_str_deref(sdev->dev->name));
+                       rcu_str_deref(fixup->dev->name));
        }
 
        btrfs_free_path(path);
        kfree(fixup);
 
-       /* see caller why we're pretending to be paused in the scrub counters */
-       mutex_lock(&fs_info->scrub_lock);
-       atomic_dec(&fs_info->scrubs_running);
-       atomic_dec(&fs_info->scrubs_paused);
-       mutex_unlock(&fs_info->scrub_lock);
-       atomic_dec(&sdev->fixup_cnt);
-       wake_up(&fs_info->scrub_pause_wait);
-       wake_up(&sdev->list_wait);
+       scrub_pending_trans_workers_dec(sctx);
 }
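
/*
 * The counter juggling removed above is folded into the
 * scrub_pending_trans_workers_inc()/_dec() helpers. Only the helper
 * names appear in this diff; the bodies below are a sketch
 * reconstructed from the removed lines, and the workers_pending
 * member name is an assumption.
 */
static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
{
        struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;

        /*
         * Pretend the fixup worker is a paused scrubber: this keeps
         * cancel requests from completing without deadlocking on the
         * pause requests used for transaction commits.
         */
        mutex_lock(&fs_info->scrub_lock);
        atomic_inc(&fs_info->scrubs_running);
        atomic_inc(&fs_info->scrubs_paused);
        mutex_unlock(&fs_info->scrub_lock);
        atomic_inc(&sctx->workers_pending);
}

static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
{
        struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;

        mutex_lock(&fs_info->scrub_lock);
        atomic_dec(&fs_info->scrubs_running);
        atomic_dec(&fs_info->scrubs_paused);
        mutex_unlock(&fs_info->scrub_lock);
        atomic_dec(&sctx->workers_pending);
        wake_up(&fs_info->scrub_pause_wait);
        wake_up(&sctx->list_wait);
}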
 
 /*
@@ -614,7 +764,8 @@ out:
  */
 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 {
-       struct scrub_dev *sdev = sblock_to_check->sdev;
+       struct scrub_ctx *sctx = sblock_to_check->sctx;
+       struct btrfs_device *dev;
        struct btrfs_fs_info *fs_info;
        u64 length;
        u64 logical;
@@ -633,16 +784,33 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
                                      DEFAULT_RATELIMIT_BURST);
 
        BUG_ON(sblock_to_check->page_count < 1);
-       fs_info = sdev->dev->dev_root->fs_info;
+       fs_info = sctx->dev_root->fs_info;
+       if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
+               /*
+                * If we find an error in a super block, we just report it.
+                * Super blocks will get rewritten with the next transaction
+                * commit anyway.
+                */
+               spin_lock(&sctx->stat_lock);
+               ++sctx->stat.super_errors;
+               spin_unlock(&sctx->stat_lock);
+               return 0;
+       }
        length = sblock_to_check->page_count * PAGE_SIZE;
-       logical = sblock_to_check->pagev[0].logical;
-       generation = sblock_to_check->pagev[0].generation;
-       BUG_ON(sblock_to_check->pagev[0].mirror_num < 1);
-       failed_mirror_index = sblock_to_check->pagev[0].mirror_num - 1;
-       is_metadata = !(sblock_to_check->pagev[0].flags &
+       logical = sblock_to_check->pagev[0]->logical;
+       generation = sblock_to_check->pagev[0]->generation;
+       BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
+       failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
+       is_metadata = !(sblock_to_check->pagev[0]->flags &
                        BTRFS_EXTENT_FLAG_DATA);
-       have_csum = sblock_to_check->pagev[0].have_csum;
-       csum = sblock_to_check->pagev[0].csum;
+       have_csum = sblock_to_check->pagev[0]->have_csum;
+       csum = sblock_to_check->pagev[0]->csum;
+       dev = sblock_to_check->pagev[0]->dev;
+
+       if (sctx->is_dev_replace && !is_metadata && !have_csum) {
+               sblocks_for_recheck = NULL;
+               goto nodatasum_case;
+       }
 
        /*
         * read all mirrors one after the other. This includes to
@@ -677,43 +845,32 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
                                     sizeof(*sblocks_for_recheck),
                                     GFP_NOFS);
        if (!sblocks_for_recheck) {
-               spin_lock(&sdev->stat_lock);
-               sdev->stat.malloc_errors++;
-               sdev->stat.read_errors++;
-               sdev->stat.uncorrectable_errors++;
-               spin_unlock(&sdev->stat_lock);
-               btrfs_dev_stat_inc_and_print(sdev->dev,
-                                            BTRFS_DEV_STAT_READ_ERRS);
+               spin_lock(&sctx->stat_lock);
+               sctx->stat.malloc_errors++;
+               sctx->stat.read_errors++;
+               sctx->stat.uncorrectable_errors++;
+               spin_unlock(&sctx->stat_lock);
+               btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
                goto out;
        }
 
        /* setup the context, map the logical blocks and alloc the pages */
-       ret = scrub_setup_recheck_block(sdev, &fs_info->mapping_tree, length,
+       ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length,
                                        logical, sblocks_for_recheck);
        if (ret) {
-               spin_lock(&sdev->stat_lock);
-               sdev->stat.read_errors++;
-               sdev->stat.uncorrectable_errors++;
-               spin_unlock(&sdev->stat_lock);
-               btrfs_dev_stat_inc_and_print(sdev->dev,
-                                            BTRFS_DEV_STAT_READ_ERRS);
+               spin_lock(&sctx->stat_lock);
+               sctx->stat.read_errors++;
+               sctx->stat.uncorrectable_errors++;
+               spin_unlock(&sctx->stat_lock);
+               btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
                goto out;
        }
        BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
        sblock_bad = sblocks_for_recheck + failed_mirror_index;
 
        /* build and submit the bios for the failed mirror, check checksums */
-       ret = scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
-                                 csum, generation, sdev->csum_size);
-       if (ret) {
-               spin_lock(&sdev->stat_lock);
-               sdev->stat.read_errors++;
-               sdev->stat.uncorrectable_errors++;
-               spin_unlock(&sdev->stat_lock);
-               btrfs_dev_stat_inc_and_print(sdev->dev,
-                                            BTRFS_DEV_STAT_READ_ERRS);
-               goto out;
-       }
+       scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
+                           csum, generation, sctx->csum_size);
 
        if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
            sblock_bad->no_io_error_seen) {
@@ -725,50 +882,54 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
                 * different bio (usually one of the two latter cases is
                 * the cause)
                 */
-               spin_lock(&sdev->stat_lock);
-               sdev->stat.unverified_errors++;
-               spin_unlock(&sdev->stat_lock);
+               spin_lock(&sctx->stat_lock);
+               sctx->stat.unverified_errors++;
+               spin_unlock(&sctx->stat_lock);
 
+               if (sctx->is_dev_replace)
+                       scrub_write_block_to_dev_replace(sblock_bad);
                goto out;
        }
 
        if (!sblock_bad->no_io_error_seen) {
-               spin_lock(&sdev->stat_lock);
-               sdev->stat.read_errors++;
-               spin_unlock(&sdev->stat_lock);
+               spin_lock(&sctx->stat_lock);
+               sctx->stat.read_errors++;
+               spin_unlock(&sctx->stat_lock);
                if (__ratelimit(&_rs))
                        scrub_print_warning("i/o error", sblock_to_check);
-               btrfs_dev_stat_inc_and_print(sdev->dev,
-                                            BTRFS_DEV_STAT_READ_ERRS);
+               btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
        } else if (sblock_bad->checksum_error) {
-               spin_lock(&sdev->stat_lock);
-               sdev->stat.csum_errors++;
-               spin_unlock(&sdev->stat_lock);
+               spin_lock(&sctx->stat_lock);
+               sctx->stat.csum_errors++;
+               spin_unlock(&sctx->stat_lock);
                if (__ratelimit(&_rs))
                        scrub_print_warning("checksum error", sblock_to_check);
-               btrfs_dev_stat_inc_and_print(sdev->dev,
+               btrfs_dev_stat_inc_and_print(dev,
                                             BTRFS_DEV_STAT_CORRUPTION_ERRS);
        } else if (sblock_bad->header_error) {
-               spin_lock(&sdev->stat_lock);
-               sdev->stat.verify_errors++;
-               spin_unlock(&sdev->stat_lock);
+               spin_lock(&sctx->stat_lock);
+               sctx->stat.verify_errors++;
+               spin_unlock(&sctx->stat_lock);
                if (__ratelimit(&_rs))
                        scrub_print_warning("checksum/header error",
                                            sblock_to_check);
                if (sblock_bad->generation_error)
-                       btrfs_dev_stat_inc_and_print(sdev->dev,
+                       btrfs_dev_stat_inc_and_print(dev,
                                BTRFS_DEV_STAT_GENERATION_ERRS);
                else
-                       btrfs_dev_stat_inc_and_print(sdev->dev,
+                       btrfs_dev_stat_inc_and_print(dev,
                                BTRFS_DEV_STAT_CORRUPTION_ERRS);
        }
 
-       if (sdev->readonly)
+       if (sctx->readonly && !sctx->is_dev_replace)
                goto did_not_correct_error;
 
        if (!is_metadata && !have_csum) {
                struct scrub_fixup_nodatasum *fixup_nodatasum;
 
+nodatasum_case:
+               WARN_ON(sctx->is_dev_replace);
+
                /*
                 * !is_metadata and !have_csum, this means that the data
                 * might not be COW'ed, that it might be modified
@@ -779,24 +940,12 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
                fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
                if (!fixup_nodatasum)
                        goto did_not_correct_error;
-               fixup_nodatasum->sdev = sdev;
+               fixup_nodatasum->sctx = sctx;
+               fixup_nodatasum->dev = dev;
                fixup_nodatasum->logical = logical;
                fixup_nodatasum->root = fs_info->extent_root;
                fixup_nodatasum->mirror_num = failed_mirror_index + 1;
-               /*
-                * increment scrubs_running to prevent cancel requests from
-                * completing as long as a fixup worker is running. we must also
-                * increment scrubs_paused to prevent deadlocking on pause
-                * requests used for transactions commits (as the worker uses a
-                * transaction context). it is safe to regard the fixup worker
-                * as paused for all matters practical. effectively, we only
-                * avoid cancellation requests from completing.
-                */
-               mutex_lock(&fs_info->scrub_lock);
-               atomic_inc(&fs_info->scrubs_running);
-               atomic_inc(&fs_info->scrubs_paused);
-               mutex_unlock(&fs_info->scrub_lock);
-               atomic_inc(&sdev->fixup_cnt);
+               scrub_pending_trans_workers_inc(sctx);
                fixup_nodatasum->work.func = scrub_fixup_nodatasum;
                btrfs_queue_worker(&fs_info->scrub_workers,
                                   &fixup_nodatasum->work);
@@ -805,26 +954,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 
        /*
         * now build and submit the bios for the other mirrors, check
-        * checksums
-        */
-       for (mirror_index = 0;
-            mirror_index < BTRFS_MAX_MIRRORS &&
-            sblocks_for_recheck[mirror_index].page_count > 0;
-            mirror_index++) {
-               if (mirror_index == failed_mirror_index)
-                       continue;
-
-               /* build and submit the bios, check checksums */
-               ret = scrub_recheck_block(fs_info,
-                                         sblocks_for_recheck + mirror_index,
-                                         is_metadata, have_csum, csum,
-                                         generation, sdev->csum_size);
-               if (ret)
-                       goto did_not_correct_error;
-       }
-
-       /*
-        * first try to pick the mirror which is completely without I/O
+        * checksums.
+        * First try to pick the mirror which is completely without I/O
         * errors and also does not have a checksum error.
         * If one is found, and if a checksum is present, the full block
         * that is known to contain an error is rewritten. Afterwards
@@ -840,24 +971,93 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
             mirror_index < BTRFS_MAX_MIRRORS &&
             sblocks_for_recheck[mirror_index].page_count > 0;
             mirror_index++) {
-               struct scrub_block *sblock_other = sblocks_for_recheck +
-                                                  mirror_index;
+               struct scrub_block *sblock_other;
+
+               if (mirror_index == failed_mirror_index)
+                       continue;
+               sblock_other = sblocks_for_recheck + mirror_index;
+
+               /* build and submit the bios, check checksums */
+               scrub_recheck_block(fs_info, sblock_other, is_metadata,
+                                   have_csum, csum, generation,
+                                   sctx->csum_size);
 
                if (!sblock_other->header_error &&
                    !sblock_other->checksum_error &&
                    sblock_other->no_io_error_seen) {
-                       int force_write = is_metadata || have_csum;
-
-                       ret = scrub_repair_block_from_good_copy(sblock_bad,
-                                                               sblock_other,
-                                                               force_write);
+                       if (sctx->is_dev_replace) {
+                               scrub_write_block_to_dev_replace(sblock_other);
+                       } else {
+                               int force_write = is_metadata || have_csum;
+
+                               ret = scrub_repair_block_from_good_copy(
+                                               sblock_bad, sblock_other,
+                                               force_write);
+                       }
                        if (0 == ret)
                                goto corrected_error;
                }
        }
 
        /*
-        * in case of I/O errors in the area that is supposed to be
+        * for dev_replace, pick good pages and write to the target device.
+        */
+       if (sctx->is_dev_replace) {
+               success = 1;
+               for (page_num = 0; page_num < sblock_bad->page_count;
+                    page_num++) {
+                       int sub_success;
+
+                       sub_success = 0;
+                       for (mirror_index = 0;
+                            mirror_index < BTRFS_MAX_MIRRORS &&
+                            sblocks_for_recheck[mirror_index].page_count > 0;
+                            mirror_index++) {
+                               struct scrub_block *sblock_other =
+                                       sblocks_for_recheck + mirror_index;
+                               struct scrub_page *page_other =
+                                       sblock_other->pagev[page_num];
+
+                               if (!page_other->io_error) {
+                                       ret = scrub_write_page_to_dev_replace(
+                                                       sblock_other, page_num);
+                                       if (ret == 0) {
+                                               /* succeeded for this page */
+                                               sub_success = 1;
+                                               break;
+                                       } else {
+                                               btrfs_dev_replace_stats_inc(
+                                                       &sctx->dev_root->
+                                                       fs_info->dev_replace.
+                                                       num_write_errors);
+                                       }
+                               }
+                       }
+
+                       if (!sub_success) {
+                               /*
+                                * Did not find a mirror to fetch the page
+                                * from. scrub_write_page_to_dev_replace()
+                                * handles this case (page->io_error) by
+                                * filling the block with zeros before
+                                * submitting the write request.
+                                */
+                               success = 0;
+                               ret = scrub_write_page_to_dev_replace(
+                                               sblock_bad, page_num);
+                               if (ret)
+                                       btrfs_dev_replace_stats_inc(
+                                               &sctx->dev_root->fs_info->
+                                               dev_replace.num_write_errors);
+                       }
+               }
+
+               goto out;
+       }
+
+       /*
+        * for regular scrub, repair those pages that are errored.
+        * In case of I/O errors in the area that is supposed to be
         * repaired, continue by picking good copies of those pages.
         * Select the good pages from mirrors to rewrite bad pages from
         * the area to fix. Afterwards verify the checksum of the block
@@ -887,7 +1087,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 
        success = 1;
        for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
-               struct scrub_page *page_bad = sblock_bad->pagev + page_num;
+               struct scrub_page *page_bad = sblock_bad->pagev[page_num];
 
                if (!page_bad->io_error)
                        continue;
@@ -898,8 +1098,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
                     mirror_index++) {
                        struct scrub_block *sblock_other = sblocks_for_recheck +
                                                           mirror_index;
-                       struct scrub_page *page_other = sblock_other->pagev +
-                                                       page_num;
+                       struct scrub_page *page_other = sblock_other->pagev[
+                                                       page_num];
 
                        if (!page_other->io_error) {
                                ret = scrub_repair_page_from_good_copy(
@@ -928,10 +1128,10 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
                         * is verified, but most likely the data comes out
                         * of the page cache.
                         */
-                       ret = scrub_recheck_block(fs_info, sblock_bad,
-                                                 is_metadata, have_csum, csum,
-                                                 generation, sdev->csum_size);
-                       if (!ret && !sblock_bad->header_error &&
+                       scrub_recheck_block(fs_info, sblock_bad,
+                                           is_metadata, have_csum, csum,
+                                           generation, sctx->csum_size);
+                       if (!sblock_bad->header_error &&
                            !sblock_bad->checksum_error &&
                            sblock_bad->no_io_error_seen)
                                goto corrected_error;
@@ -939,23 +1139,23 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
                                goto did_not_correct_error;
                } else {
 corrected_error:
-                       spin_lock(&sdev->stat_lock);
-                       sdev->stat.corrected_errors++;
-                       spin_unlock(&sdev->stat_lock);
+                       spin_lock(&sctx->stat_lock);
+                       sctx->stat.corrected_errors++;
+                       spin_unlock(&sctx->stat_lock);
                        printk_ratelimited_in_rcu(KERN_ERR
                                "btrfs: fixed up error at logical %llu on dev %s\n",
                                (unsigned long long)logical,
-                               rcu_str_deref(sdev->dev->name));
+                               rcu_str_deref(dev->name));
                }
        } else {
 did_not_correct_error:
-               spin_lock(&sdev->stat_lock);
-               sdev->stat.uncorrectable_errors++;
-               spin_unlock(&sdev->stat_lock);
+               spin_lock(&sctx->stat_lock);
+               sctx->stat.uncorrectable_errors++;
+               spin_unlock(&sctx->stat_lock);
                printk_ratelimited_in_rcu(KERN_ERR
                        "btrfs: unable to fixup (regular) error at logical %llu on dev %s\n",
                        (unsigned long long)logical,
-                       rcu_str_deref(sdev->dev->name));
+                       rcu_str_deref(dev->name));
        }
 
 out:
@@ -966,11 +1166,11 @@ out:
                                                     mirror_index;
                        int page_index;
 
-                       for (page_index = 0; page_index < SCRUB_PAGES_PER_BIO;
-                            page_index++)
-                               if (sblock->pagev[page_index].page)
-                                       __free_page(
-                                               sblock->pagev[page_index].page);
+                       for (page_index = 0; page_index < sblock->page_count;
+                            page_index++) {
+                               sblock->pagev[page_index]->sblock = NULL;
+                               scrub_page_put(sblock->pagev[page_index]);
+                       }
                }
                kfree(sblocks_for_recheck);
        }
@@ -978,8 +1178,9 @@ out:
        return 0;
 }
 
-static int scrub_setup_recheck_block(struct scrub_dev *sdev,
-                                    struct btrfs_mapping_tree *map_tree,
+static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
+                                    struct btrfs_fs_info *fs_info,
+                                    struct scrub_block *original_sblock,
                                     u64 length, u64 logical,
                                     struct scrub_block *sblocks_for_recheck)
 {
@@ -988,7 +1189,7 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
        int ret;
 
        /*
-        * note: the three members sdev, ref_count and outstanding_pages
+        * note: the two members ref_count and outstanding_pages
         * are not used (and not set) in the blocks that are used for
         * the recheck procedure
         */
@@ -1003,14 +1204,14 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
                 * with a length of PAGE_SIZE, each returned stripe
                 * represents one mirror
                 */
-               ret = btrfs_map_block(map_tree, WRITE, logical, &mapped_length,
-                                     &bbio, 0);
+               ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical,
+                                     &mapped_length, &bbio, 0);
                if (ret || !bbio || mapped_length < sublen) {
                        kfree(bbio);
                        return -EIO;
                }
 
-               BUG_ON(page_index >= SCRUB_PAGES_PER_BIO);
+               BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
                for (mirror_index = 0; mirror_index < (int)bbio->num_stripes;
                     mirror_index++) {
                        struct scrub_block *sblock;
@@ -1020,21 +1221,31 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
                                continue;
 
                        sblock = sblocks_for_recheck + mirror_index;
-                       page = sblock->pagev + page_index;
+                       sblock->sctx = sctx;
+                       page = kzalloc(sizeof(*page), GFP_NOFS);
+                       if (!page) {
+leave_nomem:
+                               spin_lock(&sctx->stat_lock);
+                               sctx->stat.malloc_errors++;
+                               spin_unlock(&sctx->stat_lock);
+                               kfree(bbio);
+                               return -ENOMEM;
+                       }
+                       scrub_page_get(page);
+                       sblock->pagev[page_index] = page;
                        page->logical = logical;
                        page->physical = bbio->stripes[mirror_index].physical;
+                       BUG_ON(page_index >= original_sblock->page_count);
+                       page->physical_for_dev_replace =
+                               original_sblock->pagev[page_index]->
+                               physical_for_dev_replace;
                        /* for missing devices, dev->bdev is NULL */
                        page->dev = bbio->stripes[mirror_index].dev;
                        page->mirror_num = mirror_index + 1;
-                       page->page = alloc_page(GFP_NOFS);
-                       if (!page->page) {
-                               spin_lock(&sdev->stat_lock);
-                               sdev->stat.malloc_errors++;
-                               spin_unlock(&sdev->stat_lock);
-                               kfree(bbio);
-                               return -ENOMEM;
-                       }
                        sblock->page_count++;
+                       page->page = alloc_page(GFP_NOFS);
+                       if (!page->page)
+                               goto leave_nomem;
                }
                kfree(bbio);
                length -= sublen;
@@ -1052,10 +1263,10 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
  * to take those pages that are not errored from all the mirrors so that
  * the pages that are errored in the just handled mirror can be repaired.
  */
-static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
-                              struct scrub_block *sblock, int is_metadata,
-                              int have_csum, u8 *csum, u64 generation,
-                              u16 csum_size)
+static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
+                               struct scrub_block *sblock, int is_metadata,
+                               int have_csum, u8 *csum, u64 generation,
+                               u16 csum_size)
 {
        int page_num;
 
@@ -1065,8 +1276,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
 
        for (page_num = 0; page_num < sblock->page_count; page_num++) {
                struct bio *bio;
-               int ret;
-               struct scrub_page *page = sblock->pagev + page_num;
+               struct scrub_page *page = sblock->pagev[page_num];
                DECLARE_COMPLETION_ONSTACK(complete);
 
                if (page->dev->bdev == NULL) {
@@ -1075,20 +1285,19 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
                        continue;
                }
 
-               BUG_ON(!page->page);
+               WARN_ON(!page->page);
                bio = bio_alloc(GFP_NOFS, 1);
-               if (!bio)
-                       return -EIO;
+               if (!bio) {
+                       page->io_error = 1;
+                       sblock->no_io_error_seen = 0;
+                       continue;
+               }
                bio->bi_bdev = page->dev->bdev;
                bio->bi_sector = page->physical >> 9;
                bio->bi_end_io = scrub_complete_bio_end_io;
                bio->bi_private = &complete;
 
-               ret = bio_add_page(bio, page->page, PAGE_SIZE, 0);
-               if (PAGE_SIZE != ret) {
-                       bio_put(bio);
-                       return -EIO;
-               }
+               bio_add_page(bio, page->page, PAGE_SIZE, 0);
                btrfsic_submit_bio(READ, bio);
 
                /* this will also unplug the queue */
@@ -1105,7 +1314,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
                                             have_csum, csum, generation,
                                             csum_size);
 
-       return 0;
+       return;
 }
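
/*
 * The contract change matters to callers: scrub_recheck_block() used to
 * return -EIO on the first failed page and abort the recheck; it now
 * returns void and records failures per page, so the readable pages of
 * a partly broken mirror remain usable. A hypothetical caller (sketch;
 * the checked fields are the ones this diff uses):
 */
static int example_recheck_usage(struct btrfs_fs_info *fs_info,
                                 struct scrub_block *sblock, u16 csum_size)
{
        /* no return value to check anymore ... */
        scrub_recheck_block(fs_info, sblock, 0, 0, NULL, 0, csum_size);

        /* ... the verdict is carried in the block itself */
        if (!sblock->header_error && !sblock->checksum_error &&
            sblock->no_io_error_seen)
                return 0;       /* this mirror is fully usable */
        return -EIO;            /* fall back to another mirror */
}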
 
 static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
@@ -1120,14 +1329,14 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
        struct btrfs_root *root = fs_info->extent_root;
        void *mapped_buffer;
 
-       BUG_ON(!sblock->pagev[0].page);
+       WARN_ON(!sblock->pagev[0]->page);
        if (is_metadata) {
                struct btrfs_header *h;
 
-               mapped_buffer = kmap_atomic(sblock->pagev[0].page);
+               mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
                h = (struct btrfs_header *)mapped_buffer;
 
-               if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr) ||
+               if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr) ||
                    memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) ||
                    memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
                           BTRFS_UUID_SIZE)) {
@@ -1141,7 +1350,7 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
                if (!have_csum)
                        return;
 
-               mapped_buffer = kmap_atomic(sblock->pagev[0].page);
+               mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
        }
 
        for (page_num = 0;;) {
@@ -1157,9 +1366,9 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
                page_num++;
                if (page_num >= sblock->page_count)
                        break;
-               BUG_ON(!sblock->pagev[page_num].page);
+               WARN_ON(!sblock->pagev[page_num]->page);
 
-               mapped_buffer = kmap_atomic(sblock->pagev[page_num].page);
+               mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page);
        }
 
        btrfs_csum_final(crc, calculated_csum);
@@ -1197,17 +1406,23 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
                                            struct scrub_block *sblock_good,
                                            int page_num, int force_write)
 {
-       struct scrub_page *page_bad = sblock_bad->pagev + page_num;
-       struct scrub_page *page_good = sblock_good->pagev + page_num;
+       struct scrub_page *page_bad = sblock_bad->pagev[page_num];
+       struct scrub_page *page_good = sblock_good->pagev[page_num];
 
-       BUG_ON(sblock_bad->pagev[page_num].page == NULL);
-       BUG_ON(sblock_good->pagev[page_num].page == NULL);
+       BUG_ON(page_bad->page == NULL);
+       BUG_ON(page_good->page == NULL);
        if (force_write || sblock_bad->header_error ||
            sblock_bad->checksum_error || page_bad->io_error) {
                struct bio *bio;
                int ret;
                DECLARE_COMPLETION_ONSTACK(complete);
 
+               if (!page_bad->dev->bdev) {
+                       printk_ratelimited(KERN_WARNING
+                               "btrfs: scrub_repair_page_from_good_copy(bdev == NULL) is unexpected!\n");
+                       return -EIO;
+               }
+
                bio = bio_alloc(GFP_NOFS, 1);
                if (!bio)
                        return -EIO;
@@ -1228,6 +1443,9 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
                if (!bio_flagged(bio, BIO_UPTODATE)) {
                        btrfs_dev_stat_inc_and_print(page_bad->dev,
                                BTRFS_DEV_STAT_WRITE_ERRS);
+                       btrfs_dev_replace_stats_inc(
+                               &sblock_bad->sctx->dev_root->fs_info->
+                               dev_replace.num_write_errors);
                        bio_put(bio);
                        return -EIO;
                }
@@ -1237,13 +1455,174 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
        return 0;
 }
 
-static void scrub_checksum(struct scrub_block *sblock)
+static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
+{
+       int page_num;
+
+       for (page_num = 0; page_num < sblock->page_count; page_num++) {
+               int ret;
+
+               ret = scrub_write_page_to_dev_replace(sblock, page_num);
+               if (ret)
+                       btrfs_dev_replace_stats_inc(
+                               &sblock->sctx->dev_root->fs_info->dev_replace.
+                               num_write_errors);
+       }
+}
+
+static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
+                                          int page_num)
+{
+       struct scrub_page *spage = sblock->pagev[page_num];
+
+       BUG_ON(spage->page == NULL);
+       if (spage->io_error) {
+               void *mapped_buffer = kmap_atomic(spage->page);
+
+               memset(mapped_buffer, 0, PAGE_CACHE_SIZE);
+               flush_dcache_page(spage->page);
+               kunmap_atomic(mapped_buffer);
+       }
+       return scrub_add_page_to_wr_bio(sblock->sctx, spage);
+}
+
+static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
+                                   struct scrub_page *spage)
+{
+       struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
+       struct scrub_bio *sbio;
+       int ret;
+
+       mutex_lock(&wr_ctx->wr_lock);
+again:
+       if (!wr_ctx->wr_curr_bio) {
+               wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio),
+                                             GFP_NOFS);
+               if (!wr_ctx->wr_curr_bio) {
+                       mutex_unlock(&wr_ctx->wr_lock);
+                       return -ENOMEM;
+               }
+               wr_ctx->wr_curr_bio->sctx = sctx;
+               wr_ctx->wr_curr_bio->page_count = 0;
+       }
+       sbio = wr_ctx->wr_curr_bio;
+       if (sbio->page_count == 0) {
+               struct bio *bio;
+
+               sbio->physical = spage->physical_for_dev_replace;
+               sbio->logical = spage->logical;
+               sbio->dev = wr_ctx->tgtdev;
+               bio = sbio->bio;
+               if (!bio) {
+                       bio = bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio);
+                       if (!bio) {
+                               mutex_unlock(&wr_ctx->wr_lock);
+                               return -ENOMEM;
+                       }
+                       sbio->bio = bio;
+               }
+
+               bio->bi_private = sbio;
+               bio->bi_end_io = scrub_wr_bio_end_io;
+               bio->bi_bdev = sbio->dev->bdev;
+               bio->bi_sector = sbio->physical >> 9;
+               sbio->err = 0;
+       } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
+                  spage->physical_for_dev_replace ||
+                  sbio->logical + sbio->page_count * PAGE_SIZE !=
+                  spage->logical) {
+               scrub_wr_submit(sctx);
+               goto again;
+       }
+
+       ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
+       if (ret != PAGE_SIZE) {
+               if (sbio->page_count < 1) {
+                       bio_put(sbio->bio);
+                       sbio->bio = NULL;
+                       mutex_unlock(&wr_ctx->wr_lock);
+                       return -EIO;
+               }
+               scrub_wr_submit(sctx);
+               goto again;
+       }
+
+       sbio->pagev[sbio->page_count] = spage;
+       scrub_page_get(spage);
+       sbio->page_count++;
+       if (sbio->page_count == wr_ctx->pages_per_wr_bio)
+               scrub_wr_submit(sctx);
+       mutex_unlock(&wr_ctx->wr_lock);
+
+       return 0;
+}
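
/*
 * The batching rule above, condensed: a page may join the current write
 * bio only while it extends it contiguously in both the physical (target
 * device) and the logical address space; anything else submits the bio
 * and retries with a fresh one ("goto again"). A sketch of the predicate
 * (the helper name is hypothetical, the fields are from this diff):
 */
static bool scrub_page_fits_wr_bio(const struct scrub_bio *sbio,
                                   const struct scrub_page *spage)
{
        u64 next_physical = sbio->physical + sbio->page_count * PAGE_SIZE;
        u64 next_logical = sbio->logical + sbio->page_count * PAGE_SIZE;

        return next_physical == spage->physical_for_dev_replace &&
               next_logical == spage->logical;
}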
+
+static void scrub_wr_submit(struct scrub_ctx *sctx)
+{
+       struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
+       struct scrub_bio *sbio;
+
+       if (!wr_ctx->wr_curr_bio)
+               return;
+
+       sbio = wr_ctx->wr_curr_bio;
+       wr_ctx->wr_curr_bio = NULL;
+       WARN_ON(!sbio->bio->bi_bdev);
+       scrub_pending_bio_inc(sctx);
+       /* Process all writes in a single worker thread so that the block
+        * layer can order the requests before they reach the driver; this
+        * doubled the write performance on spinning disks when measured
+        * with Linux 3.5. */
+       btrfsic_submit_bio(WRITE, sbio->bio);
+}
+
+static void scrub_wr_bio_end_io(struct bio *bio, int err)
+{
+       struct scrub_bio *sbio = bio->bi_private;
+       struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
+
+       sbio->err = err;
+       sbio->bio = bio;
+
+       sbio->work.func = scrub_wr_bio_end_io_worker;
+       btrfs_queue_worker(&fs_info->scrub_wr_completion_workers, &sbio->work);
+}
+
+static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
+{
+       struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
+       struct scrub_ctx *sctx = sbio->sctx;
+       int i;
+
+       WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
+       if (sbio->err) {
+               struct btrfs_dev_replace *dev_replace =
+                       &sbio->sctx->dev_root->fs_info->dev_replace;
+
+               for (i = 0; i < sbio->page_count; i++) {
+                       struct scrub_page *spage = sbio->pagev[i];
+
+                       spage->io_error = 1;
+                       btrfs_dev_replace_stats_inc(&dev_replace->
+                                                   num_write_errors);
+               }
+       }
+
+       for (i = 0; i < sbio->page_count; i++)
+               scrub_page_put(sbio->pagev[i]);
+
+       bio_put(sbio->bio);
+       kfree(sbio);
+       scrub_pending_bio_dec(sctx);
+}
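
/*
 * End-to-end lifecycle of the new write path (all names from this diff):
 *
 *      scrub_add_page_to_wr_bio()      takes a page ref (scrub_page_get)
 *      scrub_wr_submit()               scrub_pending_bio_inc(), submits
 *      scrub_wr_bio_end_io()           defers into the worker thread
 *      scrub_wr_bio_end_io_worker()    marks pages io_error on failure,
 *                                      drops the page refs, frees the
 *                                      sbio, scrub_pending_bio_dec()
 *
 * Every page headed for the target device thus stays pinned from
 * enqueue until its bio has completed.
 */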
+
+static int scrub_checksum(struct scrub_block *sblock)
 {
        u64 flags;
        int ret;
 
-       BUG_ON(sblock->page_count < 1);
-       flags = sblock->pagev[0].flags;
+       WARN_ON(sblock->page_count < 1);
+       flags = sblock->pagev[0]->flags;
        ret = 0;
        if (flags & BTRFS_EXTENT_FLAG_DATA)
                ret = scrub_checksum_data(sblock);
@@ -1255,30 +1634,32 @@ static void scrub_checksum(struct scrub_block *sblock)
                WARN_ON(1);
        if (ret)
                scrub_handle_errored_block(sblock);
+
+       return ret;
 }
 
 static int scrub_checksum_data(struct scrub_block *sblock)
 {
-       struct scrub_dev *sdev = sblock->sdev;
+       struct scrub_ctx *sctx = sblock->sctx;
        u8 csum[BTRFS_CSUM_SIZE];
        u8 *on_disk_csum;
        struct page *page;
        void *buffer;
        u32 crc = ~(u32)0;
        int fail = 0;
-       struct btrfs_root *root = sdev->dev->dev_root;
+       struct btrfs_root *root = sctx->dev_root;
        u64 len;
        int index;
 
        BUG_ON(sblock->page_count < 1);
-       if (!sblock->pagev[0].have_csum)
+       if (!sblock->pagev[0]->have_csum)
                return 0;
 
-       on_disk_csum = sblock->pagev[0].csum;
-       page = sblock->pagev[0].page;
+       on_disk_csum = sblock->pagev[0]->csum;
+       page = sblock->pagev[0]->page;
        buffer = kmap_atomic(page);
 
-       len = sdev->sectorsize;
+       len = sctx->sectorsize;
        index = 0;
        for (;;) {
                u64 l = min_t(u64, len, PAGE_SIZE);
@@ -1290,13 +1671,13 @@ static int scrub_checksum_data(struct scrub_block *sblock)
                        break;
                index++;
                BUG_ON(index >= sblock->page_count);
-               BUG_ON(!sblock->pagev[index].page);
-               page = sblock->pagev[index].page;
+               BUG_ON(!sblock->pagev[index]->page);
+               page = sblock->pagev[index]->page;
                buffer = kmap_atomic(page);
        }
 
        btrfs_csum_final(crc, csum);
-       if (memcmp(csum, on_disk_csum, sdev->csum_size))
+       if (memcmp(csum, on_disk_csum, sctx->csum_size))
                fail = 1;
 
        return fail;
@@ -1304,9 +1685,9 @@ static int scrub_checksum_data(struct scrub_block *sblock)
 
 static int scrub_checksum_tree_block(struct scrub_block *sblock)
 {
-       struct scrub_dev *sdev = sblock->sdev;
+       struct scrub_ctx *sctx = sblock->sctx;
        struct btrfs_header *h;
-       struct btrfs_root *root = sdev->dev->dev_root;
+       struct btrfs_root *root = sctx->dev_root;
        struct btrfs_fs_info *fs_info = root->fs_info;
        u8 calculated_csum[BTRFS_CSUM_SIZE];
        u8 on_disk_csum[BTRFS_CSUM_SIZE];
@@ -1321,10 +1702,10 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
        int index;
 
        BUG_ON(sblock->page_count < 1);
-       page = sblock->pagev[0].page;
+       page = sblock->pagev[0]->page;
        mapped_buffer = kmap_atomic(page);
        h = (struct btrfs_header *)mapped_buffer;
-       memcpy(on_disk_csum, h->csum, sdev->csum_size);
+       memcpy(on_disk_csum, h->csum, sctx->csum_size);
 
        /*
         * we don't use the getter functions here, as we
@@ -1332,10 +1713,10 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
         * b) the page is already kmapped
         */
 
-       if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr))
+       if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr))
                ++fail;
 
-       if (sblock->pagev[0].generation != le64_to_cpu(h->generation))
+       if (sblock->pagev[0]->generation != le64_to_cpu(h->generation))
                ++fail;
 
        if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
@@ -1345,8 +1726,8 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
                   BTRFS_UUID_SIZE))
                ++fail;
 
-       BUG_ON(sdev->nodesize != sdev->leafsize);
-       len = sdev->nodesize - BTRFS_CSUM_SIZE;
+       WARN_ON(sctx->nodesize != sctx->leafsize);
+       len = sctx->nodesize - BTRFS_CSUM_SIZE;
        mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
        p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
        index = 0;
@@ -1360,15 +1741,15 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
                        break;
                index++;
                BUG_ON(index >= sblock->page_count);
-               BUG_ON(!sblock->pagev[index].page);
-               page = sblock->pagev[index].page;
+               BUG_ON(!sblock->pagev[index]->page);
+               page = sblock->pagev[index]->page;
                mapped_buffer = kmap_atomic(page);
                mapped_size = PAGE_SIZE;
                p = mapped_buffer;
        }
 
        btrfs_csum_final(crc, calculated_csum);
-       if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size))
+       if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
                ++crc_fail;
 
        return fail || crc_fail;
@@ -1377,8 +1758,8 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
 static int scrub_checksum_super(struct scrub_block *sblock)
 {
        struct btrfs_super_block *s;
-       struct scrub_dev *sdev = sblock->sdev;
-       struct btrfs_root *root = sdev->dev->dev_root;
+       struct scrub_ctx *sctx = sblock->sctx;
+       struct btrfs_root *root = sctx->dev_root;
        struct btrfs_fs_info *fs_info = root->fs_info;
        u8 calculated_csum[BTRFS_CSUM_SIZE];
        u8 on_disk_csum[BTRFS_CSUM_SIZE];
@@ -1393,15 +1774,15 @@ static int scrub_checksum_super(struct scrub_block *sblock)
        int index;
 
        BUG_ON(sblock->page_count < 1);
-       page = sblock->pagev[0].page;
+       page = sblock->pagev[0]->page;
        mapped_buffer = kmap_atomic(page);
        s = (struct btrfs_super_block *)mapped_buffer;
-       memcpy(on_disk_csum, s->csum, sdev->csum_size);
+       memcpy(on_disk_csum, s->csum, sctx->csum_size);
 
-       if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr))
+       if (sblock->pagev[0]->logical != le64_to_cpu(s->bytenr))
                ++fail_cor;
 
-       if (sblock->pagev[0].generation != le64_to_cpu(s->generation))
+       if (sblock->pagev[0]->generation != le64_to_cpu(s->generation))
                ++fail_gen;
 
        if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
@@ -1421,15 +1802,15 @@ static int scrub_checksum_super(struct scrub_block *sblock)
                        break;
                index++;
                BUG_ON(index >= sblock->page_count);
-               BUG_ON(!sblock->pagev[index].page);
-               page = sblock->pagev[index].page;
+               BUG_ON(!sblock->pagev[index]->page);
+               page = sblock->pagev[index]->page;
                mapped_buffer = kmap_atomic(page);
                mapped_size = PAGE_SIZE;
                p = mapped_buffer;
        }
 
        btrfs_csum_final(crc, calculated_csum);
-       if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size))
+       if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
                ++fail_cor;
 
        if (fail_cor + fail_gen) {
@@ -1438,14 +1819,14 @@ static int scrub_checksum_super(struct scrub_block *sblock)
                 * They will get written with the next transaction commit
                 * anyway
                 */
-               spin_lock(&sdev->stat_lock);
-               ++sdev->stat.super_errors;
-               spin_unlock(&sdev->stat_lock);
+               spin_lock(&sctx->stat_lock);
+               ++sctx->stat.super_errors;
+               spin_unlock(&sctx->stat_lock);
                if (fail_cor)
-                       btrfs_dev_stat_inc_and_print(sdev->dev,
+                       btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
                                BTRFS_DEV_STAT_CORRUPTION_ERRS);
                else
-                       btrfs_dev_stat_inc_and_print(sdev->dev,
+                       btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
                                BTRFS_DEV_STAT_GENERATION_ERRS);
        }
 
@@ -1463,28 +1844,54 @@ static void scrub_block_put(struct scrub_block *sblock)
                int i;
 
                for (i = 0; i < sblock->page_count; i++)
-                       if (sblock->pagev[i].page)
-                               __free_page(sblock->pagev[i].page);
+                       scrub_page_put(sblock->pagev[i]);
                kfree(sblock);
        }
 }
 
-static void scrub_submit(struct scrub_dev *sdev)
+static void scrub_page_get(struct scrub_page *spage)
+{
+       atomic_inc(&spage->ref_count);
+}
+
+static void scrub_page_put(struct scrub_page *spage)
+{
+       if (atomic_dec_and_test(&spage->ref_count)) {
+               if (spage->page)
+                       __free_page(spage->page);
+               kfree(spage);
+       }
+}
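
/*
 * Ownership rules for the now individually refcounted scrub_page, as
 * set up in scrub_setup_recheck_block() and scrub_pages(): kzalloc()
 * leaves ref_count at 0 and the first scrub_page_get() creates the
 * owning reference.
 *
 *      spage = kzalloc(sizeof(*spage), GFP_NOFS);
 *      scrub_page_get(spage);          held via sblock->pagev[index]
 *      ...
 *      scrub_page_get(spage);          extra ref while queued in a bio
 *      ...
 *      scrub_page_put(spage);          bio completion drops its ref
 *      scrub_page_put(spage);          last put frees page and struct
 */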
+
+static void scrub_submit(struct scrub_ctx *sctx)
 {
        struct scrub_bio *sbio;
 
-       if (sdev->curr == -1)
+       if (sctx->curr == -1)
                return;
 
-       sbio = sdev->bios[sdev->curr];
-       sdev->curr = -1;
-       atomic_inc(&sdev->in_flight);
+       sbio = sctx->bios[sctx->curr];
+       sctx->curr = -1;
+       scrub_pending_bio_inc(sctx);
 
-       btrfsic_submit_bio(READ, sbio->bio);
+       if (!sbio->bio->bi_bdev) {
+               /*
+                * this case should not happen. If btrfs_map_block() is
+                * wrong, it could happen for dev-replace operations on
+                * missing devices when no mirrors are available, but in
+                * that case the mount should already have failed.
+                * The case is handled correctly here (but _very_ slowly).
+                */
+               printk_ratelimited(KERN_WARNING
+                       "btrfs: scrub_submit(bio bdev == NULL) is unexpected!\n");
+               bio_endio(sbio->bio, -EIO);
+       } else {
+               btrfsic_submit_bio(READ, sbio->bio);
+       }
 }
 
-static int scrub_add_page_to_bio(struct scrub_dev *sdev,
-                                struct scrub_page *spage)
+static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
+                                   struct scrub_page *spage)
 {
        struct scrub_block *sblock = spage->sblock;
        struct scrub_bio *sbio;
@@ -1494,28 +1901,29 @@ again:
        /*
         * grab a fresh bio or wait for one to become available
         */
-       while (sdev->curr == -1) {
-               spin_lock(&sdev->list_lock);
-               sdev->curr = sdev->first_free;
-               if (sdev->curr != -1) {
-                       sdev->first_free = sdev->bios[sdev->curr]->next_free;
-                       sdev->bios[sdev->curr]->next_free = -1;
-                       sdev->bios[sdev->curr]->page_count = 0;
-                       spin_unlock(&sdev->list_lock);
+       while (sctx->curr == -1) {
+               spin_lock(&sctx->list_lock);
+               sctx->curr = sctx->first_free;
+               if (sctx->curr != -1) {
+                       sctx->first_free = sctx->bios[sctx->curr]->next_free;
+                       sctx->bios[sctx->curr]->next_free = -1;
+                       sctx->bios[sctx->curr]->page_count = 0;
+                       spin_unlock(&sctx->list_lock);
                } else {
-                       spin_unlock(&sdev->list_lock);
-                       wait_event(sdev->list_wait, sdev->first_free != -1);
+                       spin_unlock(&sctx->list_lock);
+                       wait_event(sctx->list_wait, sctx->first_free != -1);
                }
        }
-       sbio = sdev->bios[sdev->curr];
+       sbio = sctx->bios[sctx->curr];
        if (sbio->page_count == 0) {
                struct bio *bio;
 
                sbio->physical = spage->physical;
                sbio->logical = spage->logical;
+               sbio->dev = spage->dev;
                bio = sbio->bio;
                if (!bio) {
-                       bio = bio_alloc(GFP_NOFS, sdev->pages_per_bio);
+                       bio = bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio);
                        if (!bio)
                                return -ENOMEM;
                        sbio->bio = bio;
@@ -1523,14 +1931,15 @@ again:
 
                bio->bi_private = sbio;
                bio->bi_end_io = scrub_bio_end_io;
-               bio->bi_bdev = sdev->dev->bdev;
-               bio->bi_sector = spage->physical >> 9;
+               bio->bi_bdev = sbio->dev->bdev;
+               bio->bi_sector = sbio->physical >> 9;
                sbio->err = 0;
        } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
                   spage->physical ||
                   sbio->logical + sbio->page_count * PAGE_SIZE !=
-                  spage->logical) {
-               scrub_submit(sdev);
+                  spage->logical ||
+                  sbio->dev != spage->dev) {
+               scrub_submit(sctx);
                goto again;
        }
 
@@ -1542,81 +1951,87 @@ again:
                        sbio->bio = NULL;
                        return -EIO;
                }
-               scrub_submit(sdev);
+               scrub_submit(sctx);
                goto again;
        }
 
-       scrub_block_get(sblock); /* one for the added page */
+       scrub_block_get(sblock); /* one for the page added to the bio */
        atomic_inc(&sblock->outstanding_pages);
        sbio->page_count++;
-       if (sbio->page_count == sdev->pages_per_bio)
-               scrub_submit(sdev);
+       if (sbio->page_count == sctx->pages_per_rd_bio)
+               scrub_submit(sctx);
 
        return 0;
 }
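
/*
 * A read bio now records its own device (sbio->dev) and is flushed when
 * the next page belongs to a different one (the added
 * "sbio->dev != spage->dev" test above). This follows from dropping the
 * single per-scrub device: pages carry their device individually, so
 * one bio must not mix pages from several devices.
 */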
 
-static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
-                      u64 physical, u64 flags, u64 gen, int mirror_num,
-                      u8 *csum, int force)
+static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
+                      u64 physical, struct btrfs_device *dev, u64 flags,
+                      u64 gen, int mirror_num, u8 *csum, int force,
+                      u64 physical_for_dev_replace)
 {
        struct scrub_block *sblock;
        int index;
 
        sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
        if (!sblock) {
-               spin_lock(&sdev->stat_lock);
-               sdev->stat.malloc_errors++;
-               spin_unlock(&sdev->stat_lock);
+               spin_lock(&sctx->stat_lock);
+               sctx->stat.malloc_errors++;
+               spin_unlock(&sctx->stat_lock);
                return -ENOMEM;
        }
 
-       /* one ref inside this function, plus one for each page later on */
+       /* one ref inside this function, plus one for each page added to
+        * a bio later on */
        atomic_set(&sblock->ref_count, 1);
-       sblock->sdev = sdev;
+       sblock->sctx = sctx;
        sblock->no_io_error_seen = 1;
 
        for (index = 0; len > 0; index++) {
-               struct scrub_page *spage = sblock->pagev + index;
+               struct scrub_page *spage;
                u64 l = min_t(u64, len, PAGE_SIZE);
 
-               BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
-               spage->page = alloc_page(GFP_NOFS);
-               if (!spage->page) {
-                       spin_lock(&sdev->stat_lock);
-                       sdev->stat.malloc_errors++;
-                       spin_unlock(&sdev->stat_lock);
-                       while (index > 0) {
-                               index--;
-                               __free_page(sblock->pagev[index].page);
-                       }
-                       kfree(sblock);
+               spage = kzalloc(sizeof(*spage), GFP_NOFS);
+               if (!spage) {
+leave_nomem:
+                       spin_lock(&sctx->stat_lock);
+                       sctx->stat.malloc_errors++;
+                       spin_unlock(&sctx->stat_lock);
+                       scrub_block_put(sblock);
                        return -ENOMEM;
                }
+               BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
+               scrub_page_get(spage);
+               sblock->pagev[index] = spage;
                spage->sblock = sblock;
-               spage->dev = sdev->dev;
+               spage->dev = dev;
                spage->flags = flags;
                spage->generation = gen;
                spage->logical = logical;
                spage->physical = physical;
+               spage->physical_for_dev_replace = physical_for_dev_replace;
                spage->mirror_num = mirror_num;
                if (csum) {
                        spage->have_csum = 1;
-                       memcpy(spage->csum, csum, sdev->csum_size);
+                       memcpy(spage->csum, csum, sctx->csum_size);
                } else {
                        spage->have_csum = 0;
                }
                sblock->page_count++;
+               spage->page = alloc_page(GFP_NOFS);
+               if (!spage->page)
+                       goto leave_nomem;
                len -= l;
                logical += l;
                physical += l;
+               physical_for_dev_replace += l;
        }
 
-       BUG_ON(sblock->page_count == 0);
+       WARN_ON(sblock->page_count == 0);
        for (index = 0; index < sblock->page_count; index++) {
-               struct scrub_page *spage = sblock->pagev + index;
+               struct scrub_page *spage = sblock->pagev[index];
                int ret;
 
-               ret = scrub_add_page_to_bio(sdev, spage);
+               ret = scrub_add_page_to_rd_bio(sctx, spage);
                if (ret) {
                        scrub_block_put(sblock);
                        return ret;
@@ -1624,7 +2039,7 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
        }
 
        if (force)
-               scrub_submit(sdev);
+               scrub_submit(sctx);
 
        /* last one frees, either here or in bio completion for last page */
        scrub_block_put(sblock);
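
The lifetime comment above describes straightforward reference counting: scrub_pages() starts with one local reference, each page handed to a bio takes another, and whichever put drops the count to zero frees the block. A small C11 sketch of that scheme, assuming a bare atomic counter rather than the kernel's primitives:

    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct block {
            atomic_int ref_count;
    };

    /* mirror of scrub_block_put(): last put frees */
    static void block_put(struct block *b)
    {
            if (atomic_fetch_sub(&b->ref_count, 1) == 1) {
                    printf("last put frees the block\n");
                    free(b);
            }
    }

    int main(void)
    {
            struct block *b = malloc(sizeof(*b));
            int pages = 3;

            if (!b)
                    return 1;
            atomic_init(&b->ref_count, 1);          /* the local reference */
            for (int i = 0; i < pages; i++)         /* one per page in a bio */
                    atomic_fetch_add(&b->ref_count, 1);

            for (int i = 0; i < pages; i++)         /* bio completion puts */
                    block_put(b);
            block_put(b);                           /* the function's own put */
            return 0;
    }
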
@@ -1634,8 +2049,7 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
 static void scrub_bio_end_io(struct bio *bio, int err)
 {
        struct scrub_bio *sbio = bio->bi_private;
-       struct scrub_dev *sdev = sbio->sdev;
-       struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
+       struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
 
        sbio->err = err;
        sbio->bio = bio;
@@ -1646,10 +2060,10 @@ static void scrub_bio_end_io(struct bio *bio, int err)
 static void scrub_bio_end_io_worker(struct btrfs_work *work)
 {
        struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
-       struct scrub_dev *sdev = sbio->sdev;
+       struct scrub_ctx *sctx = sbio->sctx;
        int i;
 
-       BUG_ON(sbio->page_count > SCRUB_PAGES_PER_BIO);
+       BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
        if (sbio->err) {
                for (i = 0; i < sbio->page_count; i++) {
                        struct scrub_page *spage = sbio->pagev[i];
@@ -1671,23 +2085,37 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work)
 
        bio_put(sbio->bio);
        sbio->bio = NULL;
-       spin_lock(&sdev->list_lock);
-       sbio->next_free = sdev->first_free;
-       sdev->first_free = sbio->index;
-       spin_unlock(&sdev->list_lock);
-       atomic_dec(&sdev->in_flight);
-       wake_up(&sdev->list_wait);
+       spin_lock(&sctx->list_lock);
+       sbio->next_free = sctx->first_free;
+       sctx->first_free = sbio->index;
+       spin_unlock(&sctx->list_lock);
+
+       if (sctx->is_dev_replace &&
+           atomic_read(&sctx->wr_ctx.flush_all_writes)) {
+               mutex_lock(&sctx->wr_ctx.wr_lock);
+               scrub_wr_submit(sctx);
+               mutex_unlock(&sctx->wr_ctx.wr_lock);
+       }
+
+       scrub_pending_bio_dec(sctx);
 }
 
 static void scrub_block_complete(struct scrub_block *sblock)
 {
-       if (!sblock->no_io_error_seen)
+       if (!sblock->no_io_error_seen) {
                scrub_handle_errored_block(sblock);
-       else
-               scrub_checksum(sblock);
+       } else {
+               /*
+                * In the dev replace case: if the block has a checksum
+                * error, it is rewritten via the repair mechanism;
+                * otherwise it is written to the replacement target
+                * right here.
+                */
+               if (!scrub_checksum(sblock) && sblock->sctx->is_dev_replace)
+                       scrub_write_block_to_dev_replace(sblock);
+       }
 }
 
-static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
+static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
                           u8 *csum)
 {
        struct btrfs_ordered_sum *sum = NULL;
@@ -1695,15 +2123,15 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
        unsigned long i;
        unsigned long num_sectors;
 
-       while (!list_empty(&sdev->csum_list)) {
-               sum = list_first_entry(&sdev->csum_list,
+       while (!list_empty(&sctx->csum_list)) {
+               sum = list_first_entry(&sctx->csum_list,
                                       struct btrfs_ordered_sum, list);
                if (sum->bytenr > logical)
                        return 0;
                if (sum->bytenr + sum->len > logical)
                        break;
 
-               ++sdev->stat.csum_discards;
+               ++sctx->stat.csum_discards;
                list_del(&sum->list);
                kfree(sum);
                sum = NULL;
@@ -1711,10 +2139,10 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
        if (!sum)
                return 0;
 
-       num_sectors = sum->len / sdev->sectorsize;
+       num_sectors = sum->len / sctx->sectorsize;
        for (i = 0; i < num_sectors; ++i) {
                if (sum->sums[i].bytenr == logical) {
-                       memcpy(csum, &sum->sums[i].sum, sdev->csum_size);
+                       memcpy(csum, &sum->sums[i].sum, sctx->csum_size);
                        ret = 1;
                        break;
                }
@@ -1727,29 +2155,30 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
 }
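
scrub_find_csum() walks the ordered-sum list: entries entirely below `logical` are discarded (counted as csum_discards), and the first entry whose range covers `logical` is scanned sector by sector. For a contiguous run the matching sector is just an index computation; a sketch of that arithmetic (the kernel scans the array linearly, which is equivalent for a contiguous run):

    #include <stdint.h>
    #include <stdio.h>

    /* Given a checksum run [bytenr, bytenr + len) split into sectors,
     * report which sector (if any) holds the csum for `logical`.
     * Values are illustrative, not taken from a real filesystem. */
    static int find_sector(uint64_t bytenr, uint64_t len,
                           uint32_t sectorsize, uint64_t logical)
    {
            if (logical < bytenr || logical >= bytenr + len)
                    return -1;                    /* no csum for this block */
            return (int)((logical - bytenr) / sectorsize);
    }

    int main(void)
    {
            /* a run of 16 csums starting at byte 1 MiB, 4 KiB sectors */
            uint64_t bytenr = 1ull << 20, len = 16 * 4096;

            printf("%d\n", find_sector(bytenr, len, 4096,
                                       bytenr + 5 * 4096));  /* 5 */
            printf("%d\n", find_sector(bytenr, len, 4096,
                                       bytenr + len));       /* -1 */
            return 0;
    }
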
 
 /* scrub extent tries to collect up to 64 kB for each bio */
-static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
-                       u64 physical, u64 flags, u64 gen, int mirror_num)
+static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
+                       u64 physical, struct btrfs_device *dev, u64 flags,
+                       u64 gen, int mirror_num, u64 physical_for_dev_replace)
 {
        int ret;
        u8 csum[BTRFS_CSUM_SIZE];
        u32 blocksize;
 
        if (flags & BTRFS_EXTENT_FLAG_DATA) {
-               blocksize = sdev->sectorsize;
-               spin_lock(&sdev->stat_lock);
-               sdev->stat.data_extents_scrubbed++;
-               sdev->stat.data_bytes_scrubbed += len;
-               spin_unlock(&sdev->stat_lock);
+               blocksize = sctx->sectorsize;
+               spin_lock(&sctx->stat_lock);
+               sctx->stat.data_extents_scrubbed++;
+               sctx->stat.data_bytes_scrubbed += len;
+               spin_unlock(&sctx->stat_lock);
        } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
-               BUG_ON(sdev->nodesize != sdev->leafsize);
-               blocksize = sdev->nodesize;
-               spin_lock(&sdev->stat_lock);
-               sdev->stat.tree_extents_scrubbed++;
-               sdev->stat.tree_bytes_scrubbed += len;
-               spin_unlock(&sdev->stat_lock);
+               WARN_ON(sctx->nodesize != sctx->leafsize);
+               blocksize = sctx->nodesize;
+               spin_lock(&sctx->stat_lock);
+               sctx->stat.tree_extents_scrubbed++;
+               sctx->stat.tree_bytes_scrubbed += len;
+               spin_unlock(&sctx->stat_lock);
        } else {
-               blocksize = sdev->sectorsize;
-               BUG_ON(1);
+               blocksize = sctx->sectorsize;
+               WARN_ON(1);
        }
 
        while (len) {
@@ -1758,26 +2187,38 @@ static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
 
                if (flags & BTRFS_EXTENT_FLAG_DATA) {
                        /* push csums to sbio */
-                       have_csum = scrub_find_csum(sdev, logical, l, csum);
+                       have_csum = scrub_find_csum(sctx, logical, l, csum);
                        if (have_csum == 0)
-                               ++sdev->stat.no_csum;
+                               ++sctx->stat.no_csum;
+                       if (sctx->is_dev_replace && !have_csum) {
+                               ret = copy_nocow_pages(sctx, logical, l,
+                                                      mirror_num,
+                                                     physical_for_dev_replace);
+                               goto behind_scrub_pages;
+                       }
                }
-               ret = scrub_pages(sdev, logical, l, physical, flags, gen,
-                                 mirror_num, have_csum ? csum : NULL, 0);
+               ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
+                                 mirror_num, have_csum ? csum : NULL, 0,
+                                 physical_for_dev_replace);
+behind_scrub_pages:
                if (ret)
                        return ret;
                len -= l;
                logical += l;
                physical += l;
+               physical_for_dev_replace += l;
        }
        return 0;
 }
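
scrub_extent() chops the extent into blocksize pieces with min_t and advances the logical, physical and dev-replace offsets in lockstep. A worked example of that loop for a 10 KiB data extent (the addresses are made up):

    #include <stdint.h>
    #include <stdio.h>

    #define min(a, b) ((a) < (b) ? (a) : (b))

    int main(void)
    {
            /* a 10 KiB extent scrubbed in 4 KiB data blocks yields
             * chunks of 4K, 4K, 2K, each advancing all offsets */
            uint64_t logical = 0x100000, physical = 0x500000;
            uint64_t len = 10 * 1024;
            uint32_t blocksize = 4096;

            while (len) {
                    uint64_t l = min((uint64_t)blocksize, len);

                    printf("chunk: logical=%llx physical=%llx len=%llu\n",
                           (unsigned long long)logical,
                           (unsigned long long)physical,
                           (unsigned long long)l);
                    len -= l;
                    logical += l;
                    physical += l;
            }
            return 0;
    }
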
 
-static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
-       struct map_lookup *map, int num, u64 base, u64 length)
+static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
+                                          struct map_lookup *map,
+                                          struct btrfs_device *scrub_dev,
+                                          int num, u64 base, u64 length,
+                                          int is_dev_replace)
 {
        struct btrfs_path *path;
-       struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
+       struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
        struct btrfs_root *root = fs_info->extent_root;
        struct btrfs_root *csum_root = fs_info->csum_root;
        struct btrfs_extent_item *extent;
@@ -1797,9 +2238,13 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
        struct reada_control *reada2;
        struct btrfs_key key_start;
        struct btrfs_key key_end;
-
        u64 increment = map->stripe_len;
        u64 offset;
+       u64 extent_logical;
+       u64 extent_physical;
+       u64 extent_len;
+       struct btrfs_device *extent_dev;
+       int extent_mirror_num;
 
        nstripes = length;
        offset = 0;
@@ -1843,8 +2288,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
         */
        logical = base + offset;
 
-       wait_event(sdev->list_wait,
-                  atomic_read(&sdev->in_flight) == 0);
+       wait_event(sctx->list_wait,
+                  atomic_read(&sctx->bios_in_flight) == 0);
        atomic_inc(&fs_info->scrubs_paused);
        wake_up(&fs_info->scrub_pause_wait);
 
@@ -1898,7 +2343,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
                 * canceled?
                 */
                if (atomic_read(&fs_info->scrub_cancel_req) ||
-                   atomic_read(&sdev->cancel_req)) {
+                   atomic_read(&sctx->cancel_req)) {
                        ret = -ECANCELED;
                        goto out;
                }
@@ -1907,9 +2352,14 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
                 */
                if (atomic_read(&fs_info->scrub_pause_req)) {
                        /* push queued extents */
-                       scrub_submit(sdev);
-                       wait_event(sdev->list_wait,
-                                  atomic_read(&sdev->in_flight) == 0);
+                       atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
+                       scrub_submit(sctx);
+                       mutex_lock(&sctx->wr_ctx.wr_lock);
+                       scrub_wr_submit(sctx);
+                       mutex_unlock(&sctx->wr_ctx.wr_lock);
+                       wait_event(sctx->list_wait,
+                                  atomic_read(&sctx->bios_in_flight) == 0);
+                       atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
                        atomic_inc(&fs_info->scrubs_paused);
                        wake_up(&fs_info->scrub_pause_wait);
                        mutex_lock(&fs_info->scrub_lock);
@@ -1926,7 +2376,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
 
                ret = btrfs_lookup_csums_range(csum_root, logical,
                                               logical + map->stripe_len - 1,
-                                              &sdev->csum_list, 1);
+                                              &sctx->csum_list, 1);
                if (ret)
                        goto out;
 
@@ -2004,9 +2454,20 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
                                             key.objectid;
                        }
 
-                       ret = scrub_extent(sdev, key.objectid, key.offset,
-                                          key.objectid - logical + physical,
-                                          flags, generation, mirror_num);
+                       extent_logical = key.objectid;
+                       extent_physical = key.objectid - logical + physical;
+                       extent_len = key.offset;
+                       extent_dev = scrub_dev;
+                       extent_mirror_num = mirror_num;
+                       if (is_dev_replace)
+                               scrub_remap_extent(fs_info, extent_logical,
+                                                  extent_len, &extent_physical,
+                                                  &extent_dev,
+                                                  &extent_mirror_num);
+                       ret = scrub_extent(sctx, extent_logical, extent_len,
+                                          extent_physical, extent_dev, flags,
+                                          generation, extent_mirror_num,
+                                          key.objectid - logical + physical);
                        if (ret)
                                goto out;
 
@@ -2016,29 +2477,34 @@ next:
                btrfs_release_path(path);
                logical += increment;
                physical += map->stripe_len;
-               spin_lock(&sdev->stat_lock);
-               sdev->stat.last_physical = physical;
-               spin_unlock(&sdev->stat_lock);
+               spin_lock(&sctx->stat_lock);
+               sctx->stat.last_physical = physical;
+               spin_unlock(&sctx->stat_lock);
        }
+out:
        /* push queued extents */
-       scrub_submit(sdev);
+       scrub_submit(sctx);
+       mutex_lock(&sctx->wr_ctx.wr_lock);
+       scrub_wr_submit(sctx);
+       mutex_unlock(&sctx->wr_ctx.wr_lock);
 
-out:
        blk_finish_plug(&plug);
        btrfs_free_path(path);
        return ret < 0 ? ret : 0;
 }
 
-static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev,
-       u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length,
-       u64 dev_offset)
+static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
+                                         struct btrfs_device *scrub_dev,
+                                         u64 chunk_tree, u64 chunk_objectid,
+                                         u64 chunk_offset, u64 length,
+                                         u64 dev_offset, int is_dev_replace)
 {
        struct btrfs_mapping_tree *map_tree =
-               &sdev->dev->dev_root->fs_info->mapping_tree;
+               &sctx->dev_root->fs_info->mapping_tree;
        struct map_lookup *map;
        struct extent_map *em;
        int i;
-       int ret = -EINVAL;
+       int ret = 0;
 
        read_lock(&map_tree->map_tree.lock);
        em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
@@ -2055,9 +2521,11 @@ static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev,
                goto out;
 
        for (i = 0; i < map->num_stripes; ++i) {
-               if (map->stripes[i].dev == sdev->dev &&
+               if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
                    map->stripes[i].physical == dev_offset) {
-                       ret = scrub_stripe(sdev, map, i, chunk_offset, length);
+                       ret = scrub_stripe(sctx, map, scrub_dev, i,
+                                          chunk_offset, length,
+                                          is_dev_replace);
                        if (ret)
                                goto out;
                }
@@ -2069,11 +2537,13 @@ out:
 }
 
 static noinline_for_stack
-int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
+int scrub_enumerate_chunks(struct scrub_ctx *sctx,
+                          struct btrfs_device *scrub_dev, u64 start, u64 end,
+                          int is_dev_replace)
 {
        struct btrfs_dev_extent *dev_extent = NULL;
        struct btrfs_path *path;
-       struct btrfs_root *root = sdev->dev->dev_root;
+       struct btrfs_root *root = sctx->dev_root;
        struct btrfs_fs_info *fs_info = root->fs_info;
        u64 length;
        u64 chunk_tree;
@@ -2085,6 +2555,7 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
        struct btrfs_key key;
        struct btrfs_key found_key;
        struct btrfs_block_group_cache *cache;
+       struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 
        path = btrfs_alloc_path();
        if (!path)
@@ -2094,11 +2565,10 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
        path->search_commit_root = 1;
        path->skip_locking = 1;
 
-       key.objectid = sdev->dev->devid;
+       key.objectid = scrub_dev->devid;
        key.offset = 0ull;
        key.type = BTRFS_DEV_EXTENT_KEY;
 
-
        while (1) {
                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
                if (ret < 0)
@@ -2117,7 +2587,7 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
 
                btrfs_item_key_to_cpu(l, &found_key, slot);
 
-               if (found_key.objectid != sdev->dev->devid)
+               if (found_key.objectid != scrub_dev->devid)
                        break;
 
                if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY)
@@ -2151,11 +2621,62 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
                        ret = -ENOENT;
                        break;
                }
-               ret = scrub_chunk(sdev, chunk_tree, chunk_objectid,
-                                 chunk_offset, length, found_key.offset);
+               dev_replace->cursor_right = found_key.offset + length;
+               dev_replace->cursor_left = found_key.offset;
+               dev_replace->item_needs_writeback = 1;
+               ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid,
+                                 chunk_offset, length, found_key.offset,
+                                 is_dev_replace);
+
+               /*
+                * Flush and submit all pending read and write bios,
+                * then wait for them to complete.
+                * Note that in the dev replace case, a read request
+                * causes write requests that are submitted in the read
+                * completion worker. Therefore all write requests must
+                * be flushed as well, so that all read and write
+                * requests have really completed once bios_in_flight
+                * drops to 0.
+                */
+               atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
+               scrub_submit(sctx);
+               mutex_lock(&sctx->wr_ctx.wr_lock);
+               scrub_wr_submit(sctx);
+               mutex_unlock(&sctx->wr_ctx.wr_lock);
+
+               wait_event(sctx->list_wait,
+                          atomic_read(&sctx->bios_in_flight) == 0);
+               atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
+               atomic_inc(&fs_info->scrubs_paused);
+               wake_up(&fs_info->scrub_pause_wait);
+               wait_event(sctx->list_wait,
+                          atomic_read(&sctx->workers_pending) == 0);
+
+               mutex_lock(&fs_info->scrub_lock);
+               while (atomic_read(&fs_info->scrub_pause_req)) {
+                       mutex_unlock(&fs_info->scrub_lock);
+                       wait_event(fs_info->scrub_pause_wait,
+                          atomic_read(&fs_info->scrub_pause_req) == 0);
+                       mutex_lock(&fs_info->scrub_lock);
+               }
+               atomic_dec(&fs_info->scrubs_paused);
+               mutex_unlock(&fs_info->scrub_lock);
+               wake_up(&fs_info->scrub_pause_wait);
+
+               dev_replace->cursor_left = dev_replace->cursor_right;
+               dev_replace->item_needs_writeback = 1;
                btrfs_put_block_group(cache);
                if (ret)
                        break;
+               if (is_dev_replace &&
+                   atomic64_read(&dev_replace->num_write_errors) > 0) {
+                       ret = -EIO;
+                       break;
+               }
+               if (sctx->stat.malloc_errors > 0) {
+                       ret = -ENOMEM;
+                       break;
+               }
 
                key.offset = found_key.offset + length;
                btrfs_release_path(path);
@@ -2170,14 +2691,14 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
        return ret < 0 ? ret : 0;
 }
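
The comment in the chunk loop above describes a two-phase quiesce: set flush_all_writes so every read completion immediately pushes its induced write, submit what is queued, then wait for bios_in_flight to hit zero, which now covers the writes too. Reduced to counters, the ordering looks like this (a toy single-threaded sketch, not the locking-accurate kernel flow):

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int flush_all_writes;
    static atomic_int bios_in_flight;

    /* a read completion; with flush_all_writes set, the induced write
     * is submitted (and, in this toy, completes) before the count drops */
    static void read_completes(void)
    {
            if (atomic_load(&flush_all_writes))
                    printf("write flushed with its read\n");
            atomic_fetch_sub(&bios_in_flight, 1);
    }

    int main(void)
    {
            atomic_store(&bios_in_flight, 3);    /* three reads in flight */
            atomic_store(&flush_all_writes, 1);  /* 1. force write flushing */
            while (atomic_load(&bios_in_flight)) /* 2. drain reads + writes */
                    read_completes();
            atomic_store(&flush_all_writes, 0);  /* 3. back to batching */
            printf("quiesced\n");
            return 0;
    }
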
 
-static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
+static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
+                                          struct btrfs_device *scrub_dev)
 {
        int     i;
        u64     bytenr;
        u64     gen;
        int     ret;
-       struct btrfs_device *device = sdev->dev;
-       struct btrfs_root *root = device->dev_root;
+       struct btrfs_root *root = sctx->dev_root;
 
        if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
                return -EIO;
@@ -2186,15 +2707,16 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
 
        for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
                bytenr = btrfs_sb_offset(i);
-               if (bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes)
+               if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->total_bytes)
                        break;
 
-               ret = scrub_pages(sdev, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
-                                    BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1);
+               ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
+                                 scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
+                                 NULL, 1, bytenr);
                if (ret)
                        return ret;
        }
-       wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);
+       wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
 
        return 0;
 }
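
scrub_supers() visits up to BTRFS_SUPER_MIRROR_MAX superblock copies and skips any that fall beyond the device. A sketch of the offsets involved, assuming btrfs_sb_offset() of this era (64 KiB for the primary, then 16 KiB << (12 * mirror) for the higher mirrors, i.e. 64 MiB and 256 GiB):

    #include <stdint.h>
    #include <stdio.h>

    #define SUPER_MIRROR_MAX 3
    #define SUPER_INFO_SIZE  4096

    /* assumed to match btrfs_sb_offset() in this kernel */
    static uint64_t sb_offset(int mirror)
    {
            if (mirror)
                    return (uint64_t)16384 << (12 * mirror);
            return 64 * 1024;
    }

    int main(void)
    {
            uint64_t total_bytes = 1ull << 31;   /* a 2 GiB device */

            for (int i = 0; i < SUPER_MIRROR_MAX; i++) {
                    uint64_t bytenr = sb_offset(i);

                    if (bytenr + SUPER_INFO_SIZE > total_bytes)
                            break;   /* mirror 2 at 256 GiB is skipped */
                    printf("scrub super %d at %llu\n", i,
                           (unsigned long long)bytenr);
            }
            return 0;
    }
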
@@ -2202,19 +2724,38 @@ static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
 /*
  * get a reference count on fs_info->scrub_workers. start worker if necessary
  */
-static noinline_for_stack int scrub_workers_get(struct btrfs_root *root)
+static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
+                                               int is_dev_replace)
 {
-       struct btrfs_fs_info *fs_info = root->fs_info;
        int ret = 0;
 
        mutex_lock(&fs_info->scrub_lock);
        if (fs_info->scrub_workers_refcnt == 0) {
-               btrfs_init_workers(&fs_info->scrub_workers, "scrub",
-                          fs_info->thread_pool_size, &fs_info->generic_worker);
+               if (is_dev_replace)
+                       btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1,
+                                       &fs_info->generic_worker);
+               else
+                       btrfs_init_workers(&fs_info->scrub_workers, "scrub",
+                                       fs_info->thread_pool_size,
+                                       &fs_info->generic_worker);
                fs_info->scrub_workers.idle_thresh = 4;
                ret = btrfs_start_workers(&fs_info->scrub_workers);
                if (ret)
                        goto out;
+               btrfs_init_workers(&fs_info->scrub_wr_completion_workers,
+                                  "scrubwrc",
+                                  fs_info->thread_pool_size,
+                                  &fs_info->generic_worker);
+               fs_info->scrub_wr_completion_workers.idle_thresh = 2;
+               ret = btrfs_start_workers(
+                               &fs_info->scrub_wr_completion_workers);
+               if (ret)
+                       goto out;
+               btrfs_init_workers(&fs_info->scrub_nocow_workers, "scrubnc", 1,
+                                  &fs_info->generic_worker);
+               ret = btrfs_start_workers(&fs_info->scrub_nocow_workers);
+               if (ret)
+                       goto out;
        }
        ++fs_info->scrub_workers_refcnt;
 out:
@@ -2223,40 +2764,41 @@ out:
        return ret;
 }
 
-static noinline_for_stack void scrub_workers_put(struct btrfs_root *root)
+static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
 {
-       struct btrfs_fs_info *fs_info = root->fs_info;
-
        mutex_lock(&fs_info->scrub_lock);
-       if (--fs_info->scrub_workers_refcnt == 0)
+       if (--fs_info->scrub_workers_refcnt == 0) {
                btrfs_stop_workers(&fs_info->scrub_workers);
+               btrfs_stop_workers(&fs_info->scrub_wr_completion_workers);
+               btrfs_stop_workers(&fs_info->scrub_nocow_workers);
+       }
        WARN_ON(fs_info->scrub_workers_refcnt < 0);
        mutex_unlock(&fs_info->scrub_lock);
 }
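
scrub_workers_get()/scrub_workers_put() form a refcounted start/stop under scrub_lock: the first user brings the worker pools up (now three of them), the last one tears them down. The pattern in miniature, with a plain counter standing in for the mutex-protected refcnt:

    #include <stdio.h>

    static int refcnt;

    static void workers_get(void)
    {
            if (refcnt++ == 0)
                    printf("start worker pools\n");
    }

    static void workers_put(void)
    {
            if (--refcnt == 0)
                    printf("stop worker pools\n");
    }

    int main(void)
    {
            workers_get();   /* starts the pools */
            workers_get();   /* reuses them */
            workers_put();
            workers_put();   /* last put stops them */
            return 0;
    }
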
 
-
-int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
-                   struct btrfs_scrub_progress *progress, int readonly)
+int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
+                   u64 end, struct btrfs_scrub_progress *progress,
+                   int readonly, int is_dev_replace)
 {
-       struct scrub_dev *sdev;
-       struct btrfs_fs_info *fs_info = root->fs_info;
+       struct scrub_ctx *sctx;
        int ret;
        struct btrfs_device *dev;
 
-       if (btrfs_fs_closing(root->fs_info))
+       if (btrfs_fs_closing(fs_info))
                return -EINVAL;
 
        /*
         * check some assumptions
         */
-       if (root->nodesize != root->leafsize) {
+       if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) {
                printk(KERN_ERR
                       "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n",
-                      root->nodesize, root->leafsize);
+                      fs_info->chunk_root->nodesize,
+                      fs_info->chunk_root->leafsize);
                return -EINVAL;
        }
 
-       if (root->nodesize > BTRFS_STRIPE_LEN) {
+       if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) {
                /*
                 * in this case scrub is unable to calculate the checksum
                 * the way scrub is implemented. Do not handle this
@@ -2264,80 +2806,105 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
                 */
                printk(KERN_ERR
                       "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n",
-                      root->nodesize, BTRFS_STRIPE_LEN);
+                      fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN);
                return -EINVAL;
        }
 
-       if (root->sectorsize != PAGE_SIZE) {
+       if (fs_info->chunk_root->sectorsize != PAGE_SIZE) {
                /* not supported for data w/o checksums */
                printk(KERN_ERR
                       "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lld) fails\n",
-                      root->sectorsize, (unsigned long long)PAGE_SIZE);
+                      fs_info->chunk_root->sectorsize,
+                      (unsigned long long)PAGE_SIZE);
                return -EINVAL;
        }
 
-       ret = scrub_workers_get(root);
+       if (fs_info->chunk_root->nodesize >
+           PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
+           fs_info->chunk_root->sectorsize >
+           PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
+               /*
+                * would exhaust the array bounds of pagev member in
+                * struct scrub_block
+                */
+               pr_err("btrfs_scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails\n",
+                      fs_info->chunk_root->nodesize,
+                      SCRUB_MAX_PAGES_PER_BLOCK,
+                      fs_info->chunk_root->sectorsize,
+                      SCRUB_MAX_PAGES_PER_BLOCK);
+               return -EINVAL;
+       }
+
+       ret = scrub_workers_get(fs_info, is_dev_replace);
        if (ret)
                return ret;
 
-       mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
-       dev = btrfs_find_device(root, devid, NULL, NULL);
-       if (!dev || dev->missing) {
-               mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
-               scrub_workers_put(root);
+       mutex_lock(&fs_info->fs_devices->device_list_mutex);
+       dev = btrfs_find_device(fs_info, devid, NULL, NULL);
+       if (!dev || (dev->missing && !is_dev_replace)) {
+               mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+               scrub_workers_put(fs_info);
                return -ENODEV;
        }
        mutex_lock(&fs_info->scrub_lock);
 
-       if (!dev->in_fs_metadata) {
+       if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {
                mutex_unlock(&fs_info->scrub_lock);
-               mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
-               scrub_workers_put(root);
-               return -ENODEV;
+               mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+               scrub_workers_put(fs_info);
+               return -EIO;
        }
 
-       if (dev->scrub_device) {
+       btrfs_dev_replace_lock(&fs_info->dev_replace);
+       if (dev->scrub_device ||
+           (!is_dev_replace &&
+            btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
+               btrfs_dev_replace_unlock(&fs_info->dev_replace);
                mutex_unlock(&fs_info->scrub_lock);
-               mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
-               scrub_workers_put(root);
+               mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+               scrub_workers_put(fs_info);
                return -EINPROGRESS;
        }
-       sdev = scrub_setup_dev(dev);
-       if (IS_ERR(sdev)) {
+       btrfs_dev_replace_unlock(&fs_info->dev_replace);
+       sctx = scrub_setup_ctx(dev, is_dev_replace);
+       if (IS_ERR(sctx)) {
                mutex_unlock(&fs_info->scrub_lock);
-               mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
-               scrub_workers_put(root);
-               return PTR_ERR(sdev);
+               mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+               scrub_workers_put(fs_info);
+               return PTR_ERR(sctx);
        }
-       sdev->readonly = readonly;
-       dev->scrub_device = sdev;
+       sctx->readonly = readonly;
+       dev->scrub_device = sctx;
 
        atomic_inc(&fs_info->scrubs_running);
        mutex_unlock(&fs_info->scrub_lock);
-       mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+       mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 
-       down_read(&fs_info->scrub_super_lock);
-       ret = scrub_supers(sdev);
-       up_read(&fs_info->scrub_super_lock);
+       if (!is_dev_replace) {
+               down_read(&fs_info->scrub_super_lock);
+               ret = scrub_supers(sctx, dev);
+               up_read(&fs_info->scrub_super_lock);
+       }
 
        if (!ret)
-               ret = scrub_enumerate_chunks(sdev, start, end);
+               ret = scrub_enumerate_chunks(sctx, dev, start, end,
+                                            is_dev_replace);
 
-       wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);
+       wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
        atomic_dec(&fs_info->scrubs_running);
        wake_up(&fs_info->scrub_pause_wait);
 
-       wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0);
+       wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
 
        if (progress)
-               memcpy(progress, &sdev->stat, sizeof(*progress));
+               memcpy(progress, &sctx->stat, sizeof(*progress));
 
        mutex_lock(&fs_info->scrub_lock);
        dev->scrub_device = NULL;
        mutex_unlock(&fs_info->scrub_lock);
 
-       scrub_free_dev(sdev);
-       scrub_workers_put(root);
+       scrub_free_ctx(sctx);
+       scrub_workers_put(fs_info);
 
        return ret;
 }
@@ -2377,9 +2944,8 @@ void btrfs_scrub_continue_super(struct btrfs_root *root)
        up_write(&root->fs_info->scrub_super_lock);
 }
 
-int __btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
+int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
 {
-
        mutex_lock(&fs_info->scrub_lock);
        if (!atomic_read(&fs_info->scrubs_running)) {
                mutex_unlock(&fs_info->scrub_lock);
@@ -2399,23 +2965,18 @@ int __btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
        return 0;
 }
 
-int btrfs_scrub_cancel(struct btrfs_root *root)
+int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
+                          struct btrfs_device *dev)
 {
-       return __btrfs_scrub_cancel(root->fs_info);
-}
-
-int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev)
-{
-       struct btrfs_fs_info *fs_info = root->fs_info;
-       struct scrub_dev *sdev;
+       struct scrub_ctx *sctx;
 
        mutex_lock(&fs_info->scrub_lock);
-       sdev = dev->scrub_device;
-       if (!sdev) {
+       sctx = dev->scrub_device;
+       if (!sctx) {
                mutex_unlock(&fs_info->scrub_lock);
                return -ENOTCONN;
        }
-       atomic_inc(&sdev->cancel_req);
+       atomic_inc(&sctx->cancel_req);
        while (dev->scrub_device) {
                mutex_unlock(&fs_info->scrub_lock);
                wait_event(fs_info->scrub_pause_wait,
@@ -2438,12 +2999,12 @@ int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid)
         * does not go away in cancel_dev. FIXME: find a better solution
         */
        mutex_lock(&fs_info->fs_devices->device_list_mutex);
-       dev = btrfs_find_device(root, devid, NULL, NULL);
+       dev = btrfs_find_device(fs_info, devid, NULL, NULL);
        if (!dev) {
                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
                return -ENODEV;
        }
-       ret = btrfs_scrub_cancel_dev(root, dev);
+       ret = btrfs_scrub_cancel_dev(fs_info, dev);
        mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 
        return ret;
@@ -2453,15 +3014,284 @@ int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
                         struct btrfs_scrub_progress *progress)
 {
        struct btrfs_device *dev;
-       struct scrub_dev *sdev = NULL;
+       struct scrub_ctx *sctx = NULL;
 
        mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
-       dev = btrfs_find_device(root, devid, NULL, NULL);
+       dev = btrfs_find_device(root->fs_info, devid, NULL, NULL);
        if (dev)
-               sdev = dev->scrub_device;
-       if (sdev)
-               memcpy(progress, &sdev->stat, sizeof(*progress));
+               sctx = dev->scrub_device;
+       if (sctx)
+               memcpy(progress, &sctx->stat, sizeof(*progress));
        mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
 
-       return dev ? (sdev ? 0 : -ENOTCONN) : -ENODEV;
+       return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
+}
+
+static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
+                              u64 extent_logical, u64 extent_len,
+                              u64 *extent_physical,
+                              struct btrfs_device **extent_dev,
+                              int *extent_mirror_num)
+{
+       u64 mapped_length;
+       struct btrfs_bio *bbio = NULL;
+       int ret;
+
+       mapped_length = extent_len;
+       ret = btrfs_map_block(fs_info, READ, extent_logical,
+                             &mapped_length, &bbio, 0);
+       if (ret || !bbio || mapped_length < extent_len ||
+           !bbio->stripes[0].dev->bdev) {
+               kfree(bbio);
+               return;
+       }
+
+       *extent_physical = bbio->stripes[0].physical;
+       *extent_mirror_num = bbio->mirror_num;
+       *extent_dev = bbio->stripes[0].dev;
+       kfree(bbio);
+}
+
+static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
+                             struct scrub_wr_ctx *wr_ctx,
+                             struct btrfs_fs_info *fs_info,
+                             struct btrfs_device *dev,
+                             int is_dev_replace)
+{
+       WARN_ON(wr_ctx->wr_curr_bio != NULL);
+
+       mutex_init(&wr_ctx->wr_lock);
+       wr_ctx->wr_curr_bio = NULL;
+       if (!is_dev_replace)
+               return 0;
+
+       WARN_ON(!dev->bdev);
+       wr_ctx->pages_per_wr_bio = min_t(int, SCRUB_PAGES_PER_WR_BIO,
+                                        bio_get_nr_vecs(dev->bdev));
+       wr_ctx->tgtdev = dev;
+       atomic_set(&wr_ctx->flush_all_writes, 0);
+       return 0;
+}
+
+static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx)
+{
+       mutex_lock(&wr_ctx->wr_lock);
+       kfree(wr_ctx->wr_curr_bio);
+       wr_ctx->wr_curr_bio = NULL;
+       mutex_unlock(&wr_ctx->wr_lock);
+}
+
+static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
+                           int mirror_num, u64 physical_for_dev_replace)
+{
+       struct scrub_copy_nocow_ctx *nocow_ctx;
+       struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
+
+       nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS);
+       if (!nocow_ctx) {
+               spin_lock(&sctx->stat_lock);
+               sctx->stat.malloc_errors++;
+               spin_unlock(&sctx->stat_lock);
+               return -ENOMEM;
+       }
+
+       scrub_pending_trans_workers_inc(sctx);
+
+       nocow_ctx->sctx = sctx;
+       nocow_ctx->logical = logical;
+       nocow_ctx->len = len;
+       nocow_ctx->mirror_num = mirror_num;
+       nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
+       nocow_ctx->work.func = copy_nocow_pages_worker;
+       btrfs_queue_worker(&fs_info->scrub_nocow_workers,
+                          &nocow_ctx->work);
+
+       return 0;
+}
+
+static void copy_nocow_pages_worker(struct btrfs_work *work)
+{
+       struct scrub_copy_nocow_ctx *nocow_ctx =
+               container_of(work, struct scrub_copy_nocow_ctx, work);
+       struct scrub_ctx *sctx = nocow_ctx->sctx;
+       u64 logical = nocow_ctx->logical;
+       u64 len = nocow_ctx->len;
+       int mirror_num = nocow_ctx->mirror_num;
+       u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
+       int ret;
+       struct btrfs_trans_handle *trans = NULL;
+       struct btrfs_fs_info *fs_info;
+       struct btrfs_path *path;
+       struct btrfs_root *root;
+       int not_written = 0;
+
+       fs_info = sctx->dev_root->fs_info;
+       root = fs_info->extent_root;
+
+       path = btrfs_alloc_path();
+       if (!path) {
+               spin_lock(&sctx->stat_lock);
+               sctx->stat.malloc_errors++;
+               spin_unlock(&sctx->stat_lock);
+               not_written = 1;
+               goto out;
+       }
+
+       trans = btrfs_join_transaction(root);
+       if (IS_ERR(trans)) {
+               not_written = 1;
+               goto out;
+       }
+
+       ret = iterate_inodes_from_logical(logical, fs_info, path,
+                                         copy_nocow_pages_for_inode,
+                                         nocow_ctx);
+       if (ret != 0 && ret != -ENOENT) {
+               pr_warn("iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %llu, ret %d\n",
+                       (unsigned long long)logical,
+                       (unsigned long long)physical_for_dev_replace,
+                       (unsigned long long)len,
+                       (unsigned long long)mirror_num, ret);
+               not_written = 1;
+               goto out;
+       }
+
+out:
+       if (trans && !IS_ERR(trans))
+               btrfs_end_transaction(trans, root);
+       if (not_written)
+               btrfs_dev_replace_stats_inc(&fs_info->dev_replace.
+                                           num_uncorrectable_read_errors);
+
+       btrfs_free_path(path);
+       kfree(nocow_ctx);
+
+       scrub_pending_trans_workers_dec(sctx);
+}
+
+static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
+{
+       unsigned long index;
+       struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
+       int ret = 0;
+       struct btrfs_key key;
+       struct inode *inode = NULL;
+       struct btrfs_root *local_root;
+       u64 physical_for_dev_replace;
+       u64 len;
+       struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
+
+       key.objectid = root;
+       key.type = BTRFS_ROOT_ITEM_KEY;
+       key.offset = (u64)-1;
+       local_root = btrfs_read_fs_root_no_name(fs_info, &key);
+       if (IS_ERR(local_root))
+               return PTR_ERR(local_root);
+
+       key.type = BTRFS_INODE_ITEM_KEY;
+       key.objectid = inum;
+       key.offset = 0;
+       inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
+       if (IS_ERR(inode))
+               return PTR_ERR(inode);
+
+       physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
+       len = nocow_ctx->len;
+       while (len >= PAGE_CACHE_SIZE) {
+               struct page *page = NULL;
+               int ret_sub;
+
+               index = offset >> PAGE_CACHE_SHIFT;
+
+               page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+               if (!page) {
+                       pr_err("find_or_create_page() failed\n");
+                       ret = -ENOMEM;
+                       goto next_page;
+               }
+
+               if (PageUptodate(page)) {
+                       if (PageDirty(page))
+                               goto next_page;
+               } else {
+                       ClearPageError(page);
+                       ret_sub = extent_read_full_page(&BTRFS_I(inode)->
+                                                        io_tree,
+                                                       page, btrfs_get_extent,
+                                                       nocow_ctx->mirror_num);
+                       if (ret_sub) {
+                               ret = ret_sub;
+                               goto next_page;
+                       }
+                       wait_on_page_locked(page);
+                       if (!PageUptodate(page)) {
+                               ret = -EIO;
+                               goto next_page;
+                       }
+               }
+               ret_sub = write_page_nocow(nocow_ctx->sctx,
+                                          physical_for_dev_replace, page);
+               if (ret_sub) {
+                       ret = ret_sub;
+                       goto next_page;
+               }
+
+next_page:
+               if (page) {
+                       unlock_page(page);
+                       put_page(page);
+               }
+               offset += PAGE_CACHE_SIZE;
+               physical_for_dev_replace += PAGE_CACHE_SIZE;
+               len -= PAGE_CACHE_SIZE;
+       }
+
+       if (inode)
+               iput(inode);
+       return ret;
+}
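
The loop above is a page-granular copy: for each PAGE_CACHE_SIZE chunk, pull the page through the source inode's page cache (reading it if it is not uptodate) and write it to the matching physical offset on the replacement target. Stripped of the page cache and bio machinery, the shape is just this (buffers stand in for the devices):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define PAGE_SZ 4096

    static unsigned char src[8 * PAGE_SZ];   /* "page cache" */
    static unsigned char tgt[8 * PAGE_SZ];   /* replacement target */

    int main(void)
    {
            uint64_t offset = 0, physical = 0, len = 3 * PAGE_SZ;

            memset(src, 0xab, sizeof(src));
            while (len >= PAGE_SZ) {
                    /* "find_or_create_page" + "write_page_nocow" */
                    memcpy(tgt + physical, src + offset, PAGE_SZ);
                    offset += PAGE_SZ;
                    physical += PAGE_SZ;
                    len -= PAGE_SZ;
            }
            printf("copied up to byte %llu\n",
                   (unsigned long long)offset);
            return 0;
    }
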
+
+static int write_page_nocow(struct scrub_ctx *sctx,
+                           u64 physical_for_dev_replace, struct page *page)
+{
+       struct bio *bio;
+       struct btrfs_device *dev;
+       int ret;
+       DECLARE_COMPLETION_ONSTACK(compl);
+
+       dev = sctx->wr_ctx.tgtdev;
+       if (!dev)
+               return -EIO;
+       if (!dev->bdev) {
+               printk_ratelimited(KERN_WARNING
+                       "btrfs: scrub write_page_nocow(bdev == NULL) is unexpected!\n");
+               return -EIO;
+       }
+       bio = bio_alloc(GFP_NOFS, 1);
+       if (!bio) {
+               spin_lock(&sctx->stat_lock);
+               sctx->stat.malloc_errors++;
+               spin_unlock(&sctx->stat_lock);
+               return -ENOMEM;
+       }
+       bio->bi_private = &compl;
+       bio->bi_end_io = scrub_complete_bio_end_io;
+       bio->bi_size = 0;
+       bio->bi_sector = physical_for_dev_replace >> 9;
+       bio->bi_bdev = dev->bdev;
+       ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
+       if (ret != PAGE_CACHE_SIZE) {
+leave_with_eio:
+               bio_put(bio);
+               btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
+               return -EIO;
+       }
+       btrfsic_submit_bio(WRITE_SYNC, bio);
+       wait_for_completion(&compl);
+
+       if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+               goto leave_with_eio;
+
+       bio_put(bio);
+       return 0;
 }
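
One detail worth calling out in write_page_nocow(): bio->bi_sector is measured in 512-byte sectors, hence the `>> 9` when converting the byte offset. A short worked example of the conversion:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            /* one 4096-byte page spans 8 sectors of 512 bytes each */
            uint64_t physical = 0x100000;        /* byte offset: 1 MiB */
            uint64_t sector = physical >> 9;     /* 2048 */

            printf("byte %llu -> sector %llu (%llu sectors/page)\n",
                   (unsigned long long)physical,
                   (unsigned long long)sector,
                   (unsigned long long)(4096 >> 9));
            return 0;
    }
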
index e78b297b0b00cc990e8eb7a3f1f34619638e5ef4..54454542ad4073352ddcabf27fd9e0bc528e2136 100644 (file)
@@ -4397,9 +4397,9 @@ static int full_send_tree(struct send_ctx *sctx)
        if (!path)
                return -ENOMEM;
 
-       spin_lock(&send_root->root_times_lock);
+       spin_lock(&send_root->root_item_lock);
        start_ctransid = btrfs_root_ctransid(&send_root->root_item);
-       spin_unlock(&send_root->root_times_lock);
+       spin_unlock(&send_root->root_item_lock);
 
        key.objectid = BTRFS_FIRST_FREE_OBJECTID;
        key.type = BTRFS_INODE_ITEM_KEY;
@@ -4422,9 +4422,9 @@ join_trans:
         * Make sure the tree has not changed after re-joining. We detect this
         * by comparing start_ctransid and ctransid. They should always match.
         */
-       spin_lock(&send_root->root_times_lock);
+       spin_lock(&send_root->root_item_lock);
        ctransid = btrfs_root_ctransid(&send_root->root_item);
-       spin_unlock(&send_root->root_times_lock);
+       spin_unlock(&send_root->root_item_lock);
 
        if (ctransid != start_ctransid) {
                WARN(1, KERN_WARNING "btrfs: the root that you're trying to "
index 915ac14c20642ec619ec159d90d0fbed2a02b0da..99545df1b86c18071fed0cbf68dcbb8e745d5524 100644 (file)
@@ -55,6 +55,7 @@
 #include "export.h"
 #include "compression.h"
 #include "rcu-string.h"
+#include "dev-replace.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/btrfs.h>
@@ -116,7 +117,16 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
        if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
                sb->s_flags |= MS_RDONLY;
                printk(KERN_INFO "btrfs is forced readonly\n");
-               __btrfs_scrub_cancel(fs_info);
+               /*
+                * Note that a running device replace operation is not
+                * canceled here although there is no way to update
+                * the progress. Canceling would add the risk of a
+                * deadlock, therefore it is omitted. The only penalty
+                * is that some I/O remains active until the procedure
+                * completes. The next time the filesystem is mounted
+                * writable again, the device replace operation
+                * continues.
+                */
 //             WARN_ON(1);
        }
 }
@@ -1186,7 +1196,8 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
        btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size);
        btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size);
        btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size);
-       btrfs_set_max_workers(&fs_info->scrub_workers, new_pool_size);
+       btrfs_set_max_workers(&fs_info->scrub_wr_completion_workers,
+                             new_pool_size);
 }
 
 static int btrfs_remount(struct super_block *sb, int *flags, char *data)
@@ -1215,8 +1226,15 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
                return 0;
 
        if (*flags & MS_RDONLY) {
+               /*
+                * this also happens on 'umount -rf' or on shutdown, when
+                * the filesystem is busy.
+                */
                sb->s_flags |= MS_RDONLY;
 
+               btrfs_dev_replace_suspend_for_unmount(fs_info);
+               btrfs_scrub_cancel(fs_info);
+
                ret = btrfs_commit_super(root);
                if (ret)
                        goto restore;
@@ -1226,6 +1244,15 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
                        goto restore;
                }
 
+               if (fs_info->fs_devices->missing_devices >
+                    fs_info->num_tolerated_disk_barrier_failures &&
+                   !(*flags & MS_RDONLY)) {
+                       printk(KERN_WARNING
+                              "Btrfs: too many missing devices, writeable remount is not allowed\n");
+                       ret = -EACCES;
+                       goto restore;
+               }
+
                if (btrfs_super_log_root(fs_info->super_copy) != 0) {
                        ret = -EINVAL;
                        goto restore;
@@ -1244,6 +1271,11 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
                if (ret)
                        goto restore;
 
+               ret = btrfs_resume_dev_replace_async(fs_info);
+               if (ret) {
+                       pr_warn("btrfs: failed to resume dev_replace\n");
+                       goto restore;
+               }
                sb->s_flags &= ~MS_RDONLY;
        }
 
@@ -1336,7 +1368,8 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
                min_stripe_size = BTRFS_STRIPE_LEN;
 
        list_for_each_entry(device, &fs_devices->devices, dev_list) {
-               if (!device->in_fs_metadata || !device->bdev)
+               if (!device->in_fs_metadata || !device->bdev ||
+                   device->is_tgtdev_for_dev_replace)
                        continue;
 
                avail_space = device->total_bytes - device->bytes_used;
@@ -1647,10 +1680,14 @@ static int __init init_btrfs_fs(void)
        if (err)
                goto free_ordered_data;
 
-       err = btrfs_interface_init();
+       err = btrfs_auto_defrag_init();
        if (err)
                goto free_delayed_inode;
 
+       err = btrfs_interface_init();
+       if (err)
+               goto free_auto_defrag;
+
        err = register_filesystem(&btrfs_fs_type);
        if (err)
                goto unregister_ioctl;
@@ -1662,6 +1699,8 @@ static int __init init_btrfs_fs(void)
 
 unregister_ioctl:
        btrfs_interface_exit();
+free_auto_defrag:
+       btrfs_auto_defrag_exit();
 free_delayed_inode:
        btrfs_delayed_inode_exit();
 free_ordered_data:
@@ -1681,6 +1720,7 @@ free_compress:
 static void __exit exit_btrfs_fs(void)
 {
        btrfs_destroy_cachep();
+       btrfs_auto_defrag_exit();
        btrfs_delayed_inode_exit();
        ordered_data_exit();
        extent_map_exit();
index 04bbfb1052ebfee9db25427d5542e795cac351cd..87fac9a21ea56578625536ac1229678e854ec5f7 100644 (file)
@@ -30,6 +30,7 @@
 #include "tree-log.h"
 #include "inode-map.h"
 #include "volumes.h"
+#include "dev-replace.h"
 
 #define BTRFS_ROOT_TRANS_TAG 0
 
@@ -145,16 +146,12 @@ loop:
         * the log must never go across transaction boundaries.
         */
        smp_mb();
-       if (!list_empty(&fs_info->tree_mod_seq_list)) {
-               printk(KERN_ERR "btrfs: tree_mod_seq_list not empty when "
+       if (!list_empty(&fs_info->tree_mod_seq_list))
+               WARN(1, KERN_ERR "btrfs: tree_mod_seq_list not empty when "
                        "creating a fresh transaction\n");
-               WARN_ON(1);
-       }
-       if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) {
-               printk(KERN_ERR "btrfs: tree_mod_log rb tree not empty when "
+       if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log))
+               WARN(1, KERN_ERR "btrfs: tree_mod_log rb tree not empty when "
                        "creating a fresh transaction\n");
-               WARN_ON(1);
-       }
        atomic_set(&fs_info->tree_mod_seq, 0);
 
        spin_lock_init(&cur_trans->commit_lock);
@@ -295,9 +292,9 @@ static int may_wait_transaction(struct btrfs_root *root, int type)
        return 0;
 }
 
-static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
-                                                   u64 num_items, int type,
-                                                   int noflush)
+static struct btrfs_trans_handle *
+start_transaction(struct btrfs_root *root, u64 num_items, int type,
+                 enum btrfs_reserve_flush_enum flush)
 {
        struct btrfs_trans_handle *h;
        struct btrfs_transaction *cur_trans;
@@ -312,6 +309,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
                WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK);
                h = current->journal_info;
                h->use_count++;
+               WARN_ON(h->use_count > 2);
                h->orig_rsv = h->block_rsv;
                h->block_rsv = NULL;
                goto got_it;
@@ -331,14 +329,9 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
                }
 
                num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
-               if (noflush)
-                       ret = btrfs_block_rsv_add_noflush(root,
-                                               &root->fs_info->trans_block_rsv,
-                                               num_bytes);
-               else
-                       ret = btrfs_block_rsv_add(root,
-                                               &root->fs_info->trans_block_rsv,
-                                               num_bytes);
+               ret = btrfs_block_rsv_add(root,
+                                         &root->fs_info->trans_block_rsv,
+                                         num_bytes, flush);
                if (ret)
                        return ERR_PTR(ret);
        }
@@ -422,13 +415,15 @@ got_it:
 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
                                                   int num_items)
 {
-       return start_transaction(root, num_items, TRANS_START, 0);
+       return start_transaction(root, num_items, TRANS_START,
+                                BTRFS_RESERVE_FLUSH_ALL);
 }
 
-struct btrfs_trans_handle *btrfs_start_transaction_noflush(
+struct btrfs_trans_handle *btrfs_start_transaction_lflush(
                                        struct btrfs_root *root, int num_items)
 {
-       return start_transaction(root, num_items, TRANS_START, 1);
+       return start_transaction(root, num_items, TRANS_START,
+                                BTRFS_RESERVE_FLUSH_LIMIT);
 }
 
 struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
@@ -461,28 +456,31 @@ static noinline void wait_for_commit(struct btrfs_root *root,
 int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
 {
        struct btrfs_transaction *cur_trans = NULL, *t;
-       int ret;
+       int ret = 0;
 
-       ret = 0;
        if (transid) {
                if (transid <= root->fs_info->last_trans_committed)
                        goto out;
 
+               ret = -EINVAL;
                /* find specified transaction */
                spin_lock(&root->fs_info->trans_lock);
                list_for_each_entry(t, &root->fs_info->trans_list, list) {
                        if (t->transid == transid) {
                                cur_trans = t;
                                atomic_inc(&cur_trans->use_count);
+                               ret = 0;
                                break;
                        }
-                       if (t->transid > transid)
+                       if (t->transid > transid) {
+                               ret = 0;
                                break;
+                       }
                }
                spin_unlock(&root->fs_info->trans_lock);
-               ret = -EINVAL;
+               /* The specified transaction doesn't exist */
                if (!cur_trans)
-                       goto out;  /* bad transid */
+                       goto out;
        } else {
                /* find newest transaction that is committing | committed */
                spin_lock(&root->fs_info->trans_lock);
@@ -502,9 +500,7 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
        }
 
        wait_for_commit(root, cur_trans);
-
        put_transaction(cur_trans);
-       ret = 0;
 out:
        return ret;
 }
@@ -851,7 +847,9 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
                return ret;
 
        ret = btrfs_run_dev_stats(trans, root->fs_info);
-       BUG_ON(ret);
+       WARN_ON(ret);
+       ret = btrfs_run_dev_replace(trans, root->fs_info);
+       WARN_ON(ret);
 
        ret = btrfs_run_qgroups(trans, root->fs_info);
        BUG_ON(ret);
@@ -874,6 +872,8 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
        switch_commit_root(fs_info->extent_root);
        up_write(&fs_info->extent_commit_sem);
 
+       btrfs_after_dev_replace_commit(fs_info);
+
        return 0;
 }
 
@@ -958,7 +958,6 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
        struct btrfs_fs_info *info = root->fs_info;
        struct btrfs_trans_handle *trans;
        int ret;
-       unsigned long nr;
 
        if (xchg(&root->defrag_running, 1))
                return 0;
@@ -970,9 +969,8 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
 
                ret = btrfs_defrag_leaves(trans, root, cacheonly);
 
-               nr = trans->blocks_used;
                btrfs_end_transaction(trans, root);
-               btrfs_btree_balance_dirty(info->tree_root, nr);
+               btrfs_btree_balance_dirty(info->tree_root);
                cond_resched();
 
                if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN)
@@ -1032,8 +1030,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
        btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
 
        if (to_reserve > 0) {
-               ret = btrfs_block_rsv_add_noflush(root, &pending->block_rsv,
-                                                 to_reserve);
+               ret = btrfs_block_rsv_add(root, &pending->block_rsv,
+                                         to_reserve,
+                                         BTRFS_RESERVE_NO_FLUSH);
                if (ret) {
                        pending->error = ret;
                        goto no_free_objectid;
@@ -1191,7 +1190,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
                                    parent_inode, &key,
                                    BTRFS_FT_DIR, index);
        /* We have checked the name at the beginning, so it is impossible. */
-       BUG_ON(ret == -EEXIST);
+       BUG_ON(ret == -EEXIST || ret == -EOVERFLOW);
        if (ret) {
                btrfs_abort_transaction(trans, root, ret);
                goto fail;
@@ -1309,9 +1308,10 @@ static void do_async_commit(struct work_struct *work)
         * We've got freeze protection passed with the transaction.
         * Tell lockdep about it.
         */
-       rwsem_acquire_read(
-               &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
-               0, 1, _THIS_IP_);
+       if (ac->newtrans->type < TRANS_JOIN_NOLOCK)
+               rwsem_acquire_read(
+                    &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
+                    0, 1, _THIS_IP_);
 
        current->journal_info = ac->newtrans;
 
@@ -1349,8 +1349,10 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
         * Tell lockdep we've released the freeze rwsem, since the
         * async commit thread will be the one to unlock it.
         */
-       rwsem_release(&root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
-                     1, _THIS_IP_);
+       if (trans->type < TRANS_JOIN_NOLOCK)
+               rwsem_release(
+                       &root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
+                       1, _THIS_IP_);
 
        schedule_delayed_work(&ac->work, 0);
 
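
Both halves of the async-commit handoff now skip the freeze-protection lockdep annotations for TRANS_JOIN_NOLOCK and above, so the release in the submitting task and the acquire in the worker stay strictly paired. Condensed for comparison (both fragments appear in the hunks above; sb is shorthand for root->fs_info->sb):

        /* Submitter: tell lockdep the worker now owns freeze protection. */
        if (trans->type < TRANS_JOIN_NOLOCK)
                rwsem_release(&sb->s_writers.lock_map[SB_FREEZE_FS - 1],
                              1, _THIS_IP_);

        /* Worker (do_async_commit): reacquire it before committing. */
        if (ac->newtrans->type < TRANS_JOIN_NOLOCK)
                rwsem_acquire_read(&sb->s_writers.lock_map[SB_FREEZE_FS - 1],
                                   0, 1, _THIS_IP_);
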
@@ -1400,6 +1402,48 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
        kmem_cache_free(btrfs_trans_handle_cachep, trans);
 }
 
+static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
+                                         struct btrfs_root *root)
+{
+       int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
+       int snap_pending = 0;
+       int ret;
+
+       if (!flush_on_commit) {
+               spin_lock(&root->fs_info->trans_lock);
+               if (!list_empty(&trans->transaction->pending_snapshots))
+                       snap_pending = 1;
+               spin_unlock(&root->fs_info->trans_lock);
+       }
+
+       if (flush_on_commit || snap_pending) {
+               btrfs_start_delalloc_inodes(root, 1);
+               btrfs_wait_ordered_extents(root, 1);
+       }
+
+       ret = btrfs_run_delayed_items(trans, root);
+       if (ret)
+               return ret;
+
+       /*
+        * Running the delayed items may have added new refs.  Account
+        * them now so that they hinder processing of more delayed refs
+        * as little as possible.
+        */
+       btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
+
+       /*
+        * rename doesn't use btrfs_join_transaction, so once we
+        * set the transaction to blocked above, we aren't going
+        * to get any new ordered operations.  We can safely run
+        * it here and know for sure that nothing new will be added
+        * to the list.
+        */
+       btrfs_run_ordered_operations(root, 1);
+
+       return 0;
+}
+
 /*
  * btrfs_transaction state sequence:
  *    in_commit = 0, blocked = 0  (initial)
@@ -1414,15 +1458,20 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        struct btrfs_transaction *cur_trans = trans->transaction;
        struct btrfs_transaction *prev_trans = NULL;
        DEFINE_WAIT(wait);
-       int ret = -EIO;
+       int ret;
        int should_grow = 0;
        unsigned long now = get_seconds();
-       int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
 
-       btrfs_run_ordered_operations(root, 0);
+       ret = btrfs_run_ordered_operations(root, 0);
+       if (ret) {
+               btrfs_abort_transaction(trans, root, ret);
+               goto cleanup_transaction;
+       }
 
-       if (cur_trans->aborted)
+       if (cur_trans->aborted) {
+               ret = cur_trans->aborted;
                goto cleanup_transaction;
+       }
 
        /* make a pass through all the delayed refs we have so far
         * any runnings procs may add more while we are here
@@ -1490,39 +1539,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                should_grow = 1;
 
        do {
-               int snap_pending = 0;
-
                joined = cur_trans->num_joined;
-               if (!list_empty(&trans->transaction->pending_snapshots))
-                       snap_pending = 1;
 
                WARN_ON(cur_trans != trans->transaction);
 
-               if (flush_on_commit || snap_pending) {
-                       btrfs_start_delalloc_inodes(root, 1);
-                       btrfs_wait_ordered_extents(root, 1);
-               }
-
-               ret = btrfs_run_delayed_items(trans, root);
+               ret = btrfs_flush_all_pending_stuffs(trans, root);
                if (ret)
                        goto cleanup_transaction;
 
-               /*
-                * running the delayed items may have added new refs. account
-                * them now so that they hinder processing of more delayed refs
-                * as little as possible.
-                */
-               btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
-
-               /*
-                * rename don't use btrfs_join_transaction, so, once we
-                * set the transaction to blocked above, we aren't going
-                * to get any new ordered operations.  We can safely run
-                * it here and no for sure that nothing new will be added
-                * to the list
-                */
-               btrfs_run_ordered_operations(root, 1);
-
                prepare_to_wait(&cur_trans->writer_wait, &wait,
                                TASK_UNINTERRUPTIBLE);
 
@@ -1535,6 +1559,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        } while (atomic_read(&cur_trans->num_writers) > 1 ||
                 (should_grow && cur_trans->num_joined != joined));
 
+       ret = btrfs_flush_all_pending_stuffs(trans, root);
+       if (ret)
+               goto cleanup_transaction;
+
        /*
         * Ok now we need to make sure to block out any other joins while we
         * commit the transaction.  We could have started a join before setting
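
Net effect of the commit-path hunks: the per-iteration flush work moves into btrfs_flush_all_pending_stuffs(), and one more call is made after the last writer drains, so work queued between the final loop check and the blocked state still gets flushed. The shape of the resulting loop, heavily condensed from the hunks above:

        do {
                joined = cur_trans->num_joined;
                ret = btrfs_flush_all_pending_stuffs(trans, root);
                if (ret)
                        goto cleanup_transaction;
                prepare_to_wait(&cur_trans->writer_wait, &wait,
                                TASK_UNINTERRUPTIBLE);
                /* ... wait for other writers, loop if more joined ... */
        } while (atomic_read(&cur_trans->num_writers) > 1 ||
                 (should_grow && cur_trans->num_joined != joined));

        /* One last pass: catch work queued after the final loop check. */
        ret = btrfs_flush_all_pending_stuffs(trans, root);
        if (ret)
                goto cleanup_transaction;
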
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 80961947a6b27df59273f080f9f19316c8d98ffc..0e8aa1e6c2870274bc4d623e020bb2a668b238f8 100644 (file)
@@ -105,7 +105,7 @@ int btrfs_end_transaction(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
                                                   int num_items);
-struct btrfs_trans_handle *btrfs_start_transaction_noflush(
+struct btrfs_trans_handle *btrfs_start_transaction_lflush(
                                        struct btrfs_root *root, int num_items);
 struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 81e407d9677a73bc5329a9807492a5c580d17ec0..83186c7e45d40db89192abad9da56628874cf5d1 100644 (file)
@@ -2952,33 +2952,9 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
                            struct btrfs_inode_item *item,
                            struct inode *inode, int log_inode_only)
 {
-       btrfs_set_inode_uid(leaf, item, i_uid_read(inode));
-       btrfs_set_inode_gid(leaf, item, i_gid_read(inode));
-       btrfs_set_inode_mode(leaf, item, inode->i_mode);
-       btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
-
-       btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item),
-                              inode->i_atime.tv_sec);
-       btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item),
-                               inode->i_atime.tv_nsec);
-
-       btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item),
-                              inode->i_mtime.tv_sec);
-       btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item),
-                               inode->i_mtime.tv_nsec);
-
-       btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item),
-                              inode->i_ctime.tv_sec);
-       btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item),
-                               inode->i_ctime.tv_nsec);
-
-       btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
-
-       btrfs_set_inode_sequence(leaf, item, inode->i_version);
-       btrfs_set_inode_transid(leaf, item, trans->transid);
-       btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
-       btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
-       btrfs_set_inode_block_group(leaf, item, 0);
+       struct btrfs_map_token token;
+
+       btrfs_init_map_token(&token);
 
        if (log_inode_only) {
                /* set the generation to zero so the recover code
@@ -2986,14 +2962,63 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
                 * just to say 'this inode exists' and a logging
                 * to say 'update this inode with these values'
                 */
-               btrfs_set_inode_generation(leaf, item, 0);
-               btrfs_set_inode_size(leaf, item, 0);
+               btrfs_set_token_inode_generation(leaf, item, 0, &token);
+               btrfs_set_token_inode_size(leaf, item, 0, &token);
        } else {
-               btrfs_set_inode_generation(leaf, item,
-                                          BTRFS_I(inode)->generation);
-               btrfs_set_inode_size(leaf, item, inode->i_size);
-       }
+               btrfs_set_token_inode_generation(leaf, item,
+                                                BTRFS_I(inode)->generation,
+                                                &token);
+               btrfs_set_token_inode_size(leaf, item, inode->i_size, &token);
+       }
+
+       btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
+       btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
+       btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
+       btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
+
+       btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item),
+                                    inode->i_atime.tv_sec, &token);
+       btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item),
+                                     inode->i_atime.tv_nsec, &token);
+
+       btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item),
+                                    inode->i_mtime.tv_sec, &token);
+       btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item),
+                                     inode->i_mtime.tv_nsec, &token);
+
+       btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item),
+                                    inode->i_ctime.tv_sec, &token);
+       btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item),
+                                     inode->i_ctime.tv_nsec, &token);
+
+       btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
+                                    &token);
+
+       btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
+       btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
+       btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
+       btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
+       btrfs_set_token_inode_block_group(leaf, item, 0, &token);
+}
 
+static int log_inode_item(struct btrfs_trans_handle *trans,
+                         struct btrfs_root *log, struct btrfs_path *path,
+                         struct inode *inode)
+{
+       struct btrfs_inode_item *inode_item;
+       struct btrfs_key key;
+       int ret;
+
+       memcpy(&key, &BTRFS_I(inode)->location, sizeof(key));
+       ret = btrfs_insert_empty_item(trans, log, path, &key,
+                                     sizeof(*inode_item));
+       if (ret && ret != -EEXIST)
+               return ret;
+       inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+                                   struct btrfs_inode_item);
+       fill_inode_item(trans, path->nodes[0], inode_item, inode, 0);
+       btrfs_release_path(path);
+       return 0;
 }
 
 static noinline int copy_items(struct btrfs_trans_handle *trans,
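
The new log_inode_item() relies on the usual btrfs insert-or-overwrite idiom: -EEXIST from btrfs_insert_empty_item() is not an error, it just means the path already points at an existing copy of the item, which is then overwritten in place. The idiom in isolation:

        ret = btrfs_insert_empty_item(trans, log, path, &key,
                                      sizeof(struct btrfs_inode_item));
        if (ret && ret != -EEXIST)
                return ret;     /* real error: bail out */
        /* On 0 or -EEXIST, path points at the (new or old) item. */
        inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
                                    struct btrfs_inode_item);
        fill_inode_item(trans, path->nodes[0], inode_item, inode, 0);
        btrfs_release_path(path);
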
@@ -3130,151 +3155,234 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
        return 0;
 }
 
-struct log_args {
-       struct extent_buffer *src;
-       u64 next_offset;
-       int start_slot;
-       int nr;
-};
+static int drop_adjacent_extents(struct btrfs_trans_handle *trans,
+                                struct btrfs_root *root, struct inode *inode,
+                                struct extent_map *em,
+                                struct btrfs_path *path)
+{
+       struct btrfs_file_extent_item *fi;
+       struct extent_buffer *leaf;
+       struct btrfs_key key, new_key;
+       struct btrfs_map_token token;
+       u64 extent_end;
+       u64 extent_offset = 0;
+       int extent_type;
+       int del_slot = 0;
+       int del_nr = 0;
+       int ret = 0;
+
+       while (1) {
+               btrfs_init_map_token(&token);
+               leaf = path->nodes[0];
+               path->slots[0]++;
+               if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+                       if (del_nr) {
+                               ret = btrfs_del_items(trans, root, path,
+                                                     del_slot, del_nr);
+                               if (ret)
+                                       return ret;
+                               del_nr = 0;
+                       }
+
+                       ret = btrfs_next_leaf_write(trans, root, path, 1);
+                       if (ret < 0)
+                               return ret;
+                       if (ret > 0)
+                               return 0;
+                       leaf = path->nodes[0];
+               }
+
+               btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+               if (key.objectid != btrfs_ino(inode) ||
+                   key.type != BTRFS_EXTENT_DATA_KEY ||
+                   key.offset >= em->start + em->len)
+                       break;
+
+               fi = btrfs_item_ptr(leaf, path->slots[0],
+                                   struct btrfs_file_extent_item);
+               extent_type = btrfs_token_file_extent_type(leaf, fi, &token);
+               if (extent_type == BTRFS_FILE_EXTENT_REG ||
+                   extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
+                       extent_offset = btrfs_token_file_extent_offset(leaf,
+                                                               fi, &token);
+                       extent_end = key.offset +
+                               btrfs_token_file_extent_num_bytes(leaf, fi,
+                                                                 &token);
+               } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+                       extent_end = key.offset +
+                               btrfs_file_extent_inline_len(leaf, fi);
+               } else {
+                       BUG();
+               }
+
+               if (extent_end <= em->len + em->start) {
+                       if (!del_nr) {
+                               del_slot = path->slots[0];
+                       }
+                       del_nr++;
+                       continue;
+               }
+
+               /*
+                * We ignore previous items if we log a new extent, which can
+                * lead to overlapping extents.  So if we have an existing
+                * extent we want to adjust, we _have_ to check the next item
+                * to make sure we even need this extent anymore; this keeps
+                * us from panicking in set_item_key_safe.
+                */
+               if (path->slots[0] < btrfs_header_nritems(leaf) - 1) {
+                       struct btrfs_key tmp_key;
+
+                       btrfs_item_key_to_cpu(leaf, &tmp_key,
+                                             path->slots[0] + 1);
+                       if (tmp_key.objectid == btrfs_ino(inode) &&
+                           tmp_key.type == BTRFS_EXTENT_DATA_KEY &&
+                           tmp_key.offset <= em->start + em->len) {
+                               if (!del_nr)
+                                       del_slot = path->slots[0];
+                               del_nr++;
+                               continue;
+                       }
+               }
+
+               BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
+               memcpy(&new_key, &key, sizeof(new_key));
+               new_key.offset = em->start + em->len;
+               btrfs_set_item_key_safe(trans, root, path, &new_key);
+               extent_offset += em->start + em->len - key.offset;
+               btrfs_set_token_file_extent_offset(leaf, fi, extent_offset,
+                                                  &token);
+               btrfs_set_token_file_extent_num_bytes(leaf, fi, extent_end -
+                                                     (em->start + em->len),
+                                                     &token);
+               btrfs_mark_buffer_dirty(leaf);
+       }
+
+       if (del_nr)
+               ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
+
+       return ret;
+}
 
 static int log_one_extent(struct btrfs_trans_handle *trans,
                          struct inode *inode, struct btrfs_root *root,
-                         struct extent_map *em, struct btrfs_path *path,
-                         struct btrfs_path *dst_path, struct log_args *args)
+                         struct extent_map *em, struct btrfs_path *path)
 {
        struct btrfs_root *log = root->log_root;
        struct btrfs_file_extent_item *fi;
+       struct extent_buffer *leaf;
+       struct list_head ordered_sums;
+       struct btrfs_map_token token;
        struct btrfs_key key;
-       u64 start = em->mod_start;
-       u64 search_start = start;
-       u64 len = em->mod_len;
-       u64 num_bytes;
-       int nritems;
+       u64 csum_offset = em->mod_start - em->start;
+       u64 csum_len = em->mod_len;
+       u64 extent_offset = em->start - em->orig_start;
+       u64 block_len;
        int ret;
+       bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
 
-       if (BTRFS_I(inode)->logged_trans == trans->transid) {
-               ret = __btrfs_drop_extents(trans, log, inode, dst_path, start,
-                                          start + len, NULL, 0);
-               if (ret)
-                       return ret;
+       INIT_LIST_HEAD(&ordered_sums);
+       btrfs_init_map_token(&token);
+       key.objectid = btrfs_ino(inode);
+       key.type = BTRFS_EXTENT_DATA_KEY;
+       key.offset = em->start;
+       path->really_keep_locks = 1;
+
+       ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*fi));
+       if (ret && ret != -EEXIST) {
+               path->really_keep_locks = 0;
+               return ret;
        }
+       leaf = path->nodes[0];
+       fi = btrfs_item_ptr(leaf, path->slots[0],
+                           struct btrfs_file_extent_item);
+       btrfs_set_token_file_extent_generation(leaf, fi, em->generation,
+                                              &token);
+       if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+               skip_csum = true;
+               btrfs_set_token_file_extent_type(leaf, fi,
+                                                BTRFS_FILE_EXTENT_PREALLOC,
+                                                &token);
+       } else {
+               btrfs_set_token_file_extent_type(leaf, fi,
+                                                BTRFS_FILE_EXTENT_REG,
+                                                &token);
+               if (em->block_start == 0)
+                       skip_csum = true;
+       }
+
+       block_len = max(em->block_len, em->orig_block_len);
+       if (em->compress_type != BTRFS_COMPRESS_NONE) {
+               btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
+                                                       em->block_start,
+                                                       &token);
+               btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
+                                                          &token);
+       } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
+               btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
+                                                       em->block_start -
+                                                       extent_offset, &token);
+               btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
+                                                          &token);
+       } else {
+               btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token);
+               btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0,
+                                                          &token);
+       }
+
+       btrfs_set_token_file_extent_offset(leaf, fi,
+                                          em->start - em->orig_start,
+                                          &token);
+       btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
+       btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->len, &token);
+       btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
+                                               &token);
+       btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
+       btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
+       btrfs_mark_buffer_dirty(leaf);
 
-       while (len) {
-               if (args->nr)
-                       goto next_slot;
-again:
-               key.objectid = btrfs_ino(inode);
-               key.type = BTRFS_EXTENT_DATA_KEY;
-               key.offset = search_start;
-
-               ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-               if (ret < 0)
-                       return ret;
-
-               if (ret) {
-                       /*
-                        * A rare case were we can have an em for a section of a
-                        * larger extent so we need to make sure that this em
-                        * falls within the extent we've found.  If not we just
-                        * bail and go back to ye-olde way of doing things but
-                        * it happens often enough in testing that we need to do
-                        * this dance to make sure.
-                        */
-                       do {
-                               if (path->slots[0] == 0) {
-                                       btrfs_release_path(path);
-                                       if (search_start == 0)
-                                               return -ENOENT;
-                                       search_start--;
-                                       goto again;
-                               }
-
-                               path->slots[0]--;
-                               btrfs_item_key_to_cpu(path->nodes[0], &key,
-                                                     path->slots[0]);
-                               if (key.objectid != btrfs_ino(inode) ||
-                                   key.type != BTRFS_EXTENT_DATA_KEY) {
-                                       btrfs_release_path(path);
-                                       return -ENOENT;
-                               }
-                       } while (key.offset > start);
+       /*
+        * We have to check the extent to the right of us to make sure it
+        * doesn't fall in our current range.  We're ok if the previous extent
+        * is in our range, since log recovery replays items in key order and
+        * thus just drops the part we overwrote.
+        */
+       ret = drop_adjacent_extents(trans, log, inode, em, path);
+       btrfs_release_path(path);
+       path->really_keep_locks = 0;
+       if (ret) {
+               return ret;
+       }
 
-                       fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
-                                           struct btrfs_file_extent_item);
-                       num_bytes = btrfs_file_extent_num_bytes(path->nodes[0],
-                                                               fi);
-                       if (key.offset + num_bytes <= start) {
-                               btrfs_release_path(path);
-                               return -ENOENT;
-                       }
-               }
-               args->src = path->nodes[0];
-next_slot:
-               btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
-               fi = btrfs_item_ptr(args->src, path->slots[0],
-                                   struct btrfs_file_extent_item);
-               if (args->nr &&
-                   args->start_slot + args->nr == path->slots[0]) {
-                       args->nr++;
-               } else if (args->nr) {
-                       ret = copy_items(trans, inode, dst_path, args->src,
-                                        args->start_slot, args->nr,
-                                        LOG_INODE_ALL);
-                       if (ret)
-                               return ret;
-                       args->nr = 1;
-                       args->start_slot = path->slots[0];
-               } else if (!args->nr) {
-                       args->nr = 1;
-                       args->start_slot = path->slots[0];
-               }
-               nritems = btrfs_header_nritems(path->nodes[0]);
-               path->slots[0]++;
-               num_bytes = btrfs_file_extent_num_bytes(args->src, fi);
-               if (len < num_bytes) {
-                       /* I _think_ this is ok, envision we write to a
-                        * preallocated space that is adjacent to a previously
-                        * written preallocated space that gets merged when we
-                        * mark this preallocated space written.  If we do not
-                        * have the adjacent extent in cache then when we copy
-                        * this extent it could end up being larger than our EM
-                        * thinks it is, which is a-ok, so just set len to 0.
-                        */
-                       len = 0;
-               } else {
-                       len -= num_bytes;
-               }
-               start = key.offset + num_bytes;
-               args->next_offset = start;
-               search_start = start;
+       if (skip_csum)
+               return 0;
 
-               if (path->slots[0] < nritems) {
-                       if (len)
-                               goto next_slot;
-                       break;
-               }
+       /* block start is already adjusted for the file extent offset. */
+       ret = btrfs_lookup_csums_range(log->fs_info->csum_root,
+                                      em->block_start + csum_offset,
+                                      em->block_start + csum_offset +
+                                      csum_len - 1, &ordered_sums, 0);
+       if (ret)
+               return ret;
 
-               if (args->nr) {
-                       ret = copy_items(trans, inode, dst_path, args->src,
-                                        args->start_slot, args->nr,
-                                        LOG_INODE_ALL);
-                       if (ret)
-                               return ret;
-                       args->nr = 0;
-                       btrfs_release_path(path);
-               }
+       while (!list_empty(&ordered_sums)) {
+               struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
+                                                  struct btrfs_ordered_sum,
+                                                  list);
+               if (!ret)
+                       ret = btrfs_csum_file_blocks(trans, log, sums);
+               list_del(&sums->list);
+               kfree(sums);
        }
 
-       return 0;
+       return ret;
 }
 
 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root,
                                     struct inode *inode,
-                                    struct btrfs_path *path,
-                                    struct btrfs_path *dst_path)
+                                    struct btrfs_path *path)
 {
-       struct log_args args;
        struct extent_map *em, *n;
        struct list_head extents;
        struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
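
The rewritten fill_inode_item() and log_one_extent() funnel every field update through a btrfs_map_token, which caches the mapping of the last extent_buffer page touched so that a run of setters against one item does not redo the page lookup per field. The pattern, reduced to its core:

        struct btrfs_map_token token;

        btrfs_init_map_token(&token);
        /* Consecutive setters on one item reuse the cached page mapping. */
        btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
        btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
        btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
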
@@ -3283,8 +3391,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
 
        INIT_LIST_HEAD(&extents);
 
-       memset(&args, 0, sizeof(args));
-
        write_lock(&tree->lock);
        test_gen = root->fs_info->last_trans_committed;
 
@@ -3317,34 +3423,13 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
 
                write_unlock(&tree->lock);
 
-               /*
-                * If the previous EM and the last extent we left off on aren't
-                * sequential then we need to copy the items we have and redo
-                * our search
-                */
-               if (args.nr && em->mod_start != args.next_offset) {
-                       ret = copy_items(trans, inode, dst_path, args.src,
-                                        args.start_slot, args.nr,
-                                        LOG_INODE_ALL);
-                       if (ret) {
-                               free_extent_map(em);
-                               write_lock(&tree->lock);
-                               continue;
-                       }
-                       btrfs_release_path(path);
-                       args.nr = 0;
-               }
-
-               ret = log_one_extent(trans, inode, root, em, path, dst_path, &args);
+               ret = log_one_extent(trans, inode, root, em, path);
                free_extent_map(em);
                write_lock(&tree->lock);
        }
        WARN_ON(!list_empty(&extents));
        write_unlock(&tree->lock);
 
-       if (!ret && args.nr)
-               ret = copy_items(trans, inode, dst_path, args.src,
-                                args.start_slot, args.nr, LOG_INODE_ALL);
        btrfs_release_path(path);
        return ret;
 }
@@ -3400,7 +3485,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 
 
        /* today the code can only do partial logging of directories */
-       if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
+       if (S_ISDIR(inode->i_mode) ||
+           (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+                      &BTRFS_I(inode)->runtime_flags) &&
+            inode_only == LOG_INODE_EXISTS))
                max_key.type = BTRFS_XATTR_ITEM_KEY;
        else
                max_key.type = (u8)-1;
@@ -3432,14 +3520,28 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
        } else {
                if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
                                       &BTRFS_I(inode)->runtime_flags)) {
+                       clear_bit(BTRFS_INODE_COPY_EVERYTHING,
+                                 &BTRFS_I(inode)->runtime_flags);
                        ret = btrfs_truncate_inode_items(trans, log,
                                                         inode, 0, 0);
-               } else {
-                       fast_search = true;
+               } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
+                                             &BTRFS_I(inode)->runtime_flags)) {
+                       if (inode_only == LOG_INODE_ALL)
+                               fast_search = true;
                        max_key.type = BTRFS_XATTR_ITEM_KEY;
                        ret = drop_objectid_items(trans, log, path, ino,
-                                                 BTRFS_XATTR_ITEM_KEY);
+                                                 max_key.type);
+               } else {
+                       if (inode_only == LOG_INODE_ALL)
+                               fast_search = true;
+                       ret = log_inode_item(trans, log, dst_path, inode);
+                       if (ret) {
+                               err = ret;
+                               goto out_unlock;
+                       }
+                       goto log_extents;
                }
+
        }
        if (ret) {
                err = ret;
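
The logging decision is now a three-way ladder rather than full-sync-or-fast. Condensed, with the flag tests as in the hunk above and the branch bodies summarized in comments:

        if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
                               &BTRFS_I(inode)->runtime_flags)) {
                /* slow path: truncate the logged copy, re-log everything */
        } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
                                      &BTRFS_I(inode)->runtime_flags)) {
                /* re-copy items up to the xattrs, fast-log the extents */
        } else {
                /* cheapest: refresh the inode item, jump to log_extents */
        }
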
@@ -3518,11 +3620,10 @@ next_slot:
                ins_nr = 0;
        }
 
+log_extents:
        if (fast_search) {
-               btrfs_release_path(path);
                btrfs_release_path(dst_path);
-               ret = btrfs_log_changed_extents(trans, root, inode, path,
-                                               dst_path);
+               ret = btrfs_log_changed_extents(trans, root, inode, dst_path);
                if (ret) {
                        err = ret;
                        goto out_unlock;
@@ -3531,8 +3632,10 @@ next_slot:
                struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
                struct extent_map *em, *n;
 
+               write_lock(&tree->lock);
                list_for_each_entry_safe(em, n, &tree->modified_extents, list)
                        list_del_init(&em->list);
+               write_unlock(&tree->lock);
        }
 
        if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index e3c6ee3cc2ba3904cd623433a136f78fe2f3ada6..5cce6aa7401287322c31673cf039f0e32918c6c4 100644 (file)
@@ -25,7 +25,6 @@
 #include <linux/capability.h>
 #include <linux/ratelimit.h>
 #include <linux/kthread.h>
-#include <asm/div64.h>
 #include "compat.h"
 #include "ctree.h"
 #include "extent_map.h"
@@ -36,6 +35,8 @@
 #include "async-thread.h"
 #include "check-integrity.h"
 #include "rcu-string.h"
+#include "math.h"
+#include "dev-replace.h"
 
 static int init_first_rw_device(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
@@ -71,6 +72,19 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
        kfree(fs_devices);
 }
 
+static void btrfs_kobject_uevent(struct block_device *bdev,
+                                enum kobject_action action)
+{
+       int ret;
+
+       ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
+       if (ret)
+               pr_warn("Sending event '%d' to kobject: '%s' (%p): failed\n",
+                       action,
+                       kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
+                       &disk_to_dev(bdev->bd_disk)->kobj);
+}
+
 void btrfs_cleanup_fs_uuids(void)
 {
        struct btrfs_fs_devices *fs_devices;
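
btrfs_kobject_uevent() wraps kobject_uevent() so device-topology changes reach udev; a failure is only logged, never propagated. Device removal further down uses it like this, once the superblock magic has been scratched:

        /* Best-effort notification; an error is printed, not returned. */
        btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
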
@@ -108,6 +122,44 @@ static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
        return NULL;
 }
 
+static int
+btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
+                     int flush, struct block_device **bdev,
+                     struct buffer_head **bh)
+{
+       int ret;
+
+       *bdev = blkdev_get_by_path(device_path, flags, holder);
+
+       if (IS_ERR(*bdev)) {
+               ret = PTR_ERR(*bdev);
+               printk(KERN_INFO "btrfs: open %s failed\n", device_path);
+               goto error;
+       }
+
+       if (flush)
+               filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
+       ret = set_blocksize(*bdev, 4096);
+       if (ret) {
+               blkdev_put(*bdev, flags);
+               goto error;
+       }
+       invalidate_bdev(*bdev);
+       *bh = btrfs_read_dev_super(*bdev);
+       if (!*bh) {
+               ret = -EINVAL;
+               blkdev_put(*bdev, flags);
+               goto error;
+       }
+
+       return 0;
+
+error:
+       *bdev = NULL;
+       *bh = NULL;
+       return ret;
+}
+
 static void requeue_list(struct btrfs_pending_bios *pending_bios,
                        struct bio *head, struct bio *tail)
 {
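
btrfs_get_bdev_and_sb() collapses the open / optional flush / set_blocksize / read-super sequence that was previously open-coded in __btrfs_open_devices(), btrfs_scan_one_device() and btrfs_rm_device(). A sketch of a caller, matching the converted sites below (both out-parameters are NULL on failure):

        struct block_device *bdev;
        struct buffer_head *bh;
        int ret;

        ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ | FMODE_EXCL,
                                    holder, 0 /* don't flush pages */,
                                    &bdev, &bh);
        if (ret)
                return ret;
        disk_super = (struct btrfs_super_block *)bh->b_data;
        /* ... inspect the super block ... */
        brelse(bh);
        blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
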
@@ -467,7 +519,8 @@ error:
        return ERR_PTR(-ENOMEM);
 }
 
-void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
+void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
+                              struct btrfs_fs_devices *fs_devices, int step)
 {
        struct btrfs_device *device, *next;
 
@@ -480,8 +533,9 @@ again:
        /* This is the initialized path, it is safe to release the devices. */
        list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
                if (device->in_fs_metadata) {
-                       if (!latest_transid ||
-                           device->generation > latest_transid) {
+                       if (!device->is_tgtdev_for_dev_replace &&
+                           (!latest_transid ||
+                            device->generation > latest_transid)) {
                                latest_devid = device->devid;
                                latest_transid = device->generation;
                                latest_bdev = device->bdev;
@@ -489,6 +543,21 @@ again:
                        continue;
                }
 
+               if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
+                       /*
+                        * In the first step, keep the device which has
+                        * the correct fsid and the devid that is used
+                        * for the dev_replace procedure.
+                        * In the second step, the dev_replace state is
+                        * read from the device tree and it is known
+                        * whether the procedure is really active or
+                        * not, which means whether this device is
+                        * used or whether it should be removed.
+                        */
+                       if (step == 0 || device->is_tgtdev_for_dev_replace) {
+                               continue;
+                       }
+               }
                if (device->bdev) {
                        blkdev_put(device->bdev, device->mode);
                        device->bdev = NULL;
@@ -497,7 +566,8 @@ again:
                if (device->writeable) {
                        list_del_init(&device->dev_alloc_list);
                        device->writeable = 0;
-                       fs_devices->rw_devices--;
+                       if (!device->is_tgtdev_for_dev_replace)
+                               fs_devices->rw_devices--;
                }
                list_del_init(&device->dev_list);
                fs_devices->num_devices--;
@@ -555,7 +625,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
                if (device->bdev)
                        fs_devices->open_devices--;
 
-               if (device->writeable) {
+               if (device->writeable && !device->is_tgtdev_for_dev_replace) {
                        list_del_init(&device->dev_alloc_list);
                        fs_devices->rw_devices--;
                }
@@ -637,18 +707,10 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
                if (!device->name)
                        continue;
 
-               bdev = blkdev_get_by_path(device->name->str, flags, holder);
-               if (IS_ERR(bdev)) {
-                       printk(KERN_INFO "btrfs: open %s failed\n", device->name->str);
-                       goto error;
-               }
-               filemap_write_and_wait(bdev->bd_inode->i_mapping);
-               invalidate_bdev(bdev);
-               set_blocksize(bdev, 4096);
-
-               bh = btrfs_read_dev_super(bdev);
-               if (!bh)
-                       goto error_close;
+               ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
+                                           &bdev, &bh);
+               if (ret)
+                       continue;
 
                disk_super = (struct btrfs_super_block *)bh->b_data;
                devid = btrfs_stack_device_id(&disk_super->dev_item);
@@ -687,7 +749,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
                        fs_devices->rotating = 1;
 
                fs_devices->open_devices++;
-               if (device->writeable) {
+               if (device->writeable && !device->is_tgtdev_for_dev_replace) {
                        fs_devices->rw_devices++;
                        list_add(&device->dev_alloc_list,
                                 &fs_devices->alloc_list);
@@ -697,9 +759,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 
 error_brelse:
                brelse(bh);
-error_close:
                blkdev_put(bdev, flags);
-error:
                continue;
        }
        if (fs_devices->open_devices == 0) {
@@ -744,40 +804,30 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
        u64 total_devices;
 
        flags |= FMODE_EXCL;
-       bdev = blkdev_get_by_path(path, flags, holder);
-
-       if (IS_ERR(bdev)) {
-               ret = PTR_ERR(bdev);
-               goto error;
-       }
-
        mutex_lock(&uuid_mutex);
-       ret = set_blocksize(bdev, 4096);
+       ret = btrfs_get_bdev_and_sb(path, flags, holder, 0, &bdev, &bh);
        if (ret)
-               goto error_close;
-       bh = btrfs_read_dev_super(bdev);
-       if (!bh) {
-               ret = -EINVAL;
-               goto error_close;
-       }
+               goto error;
        disk_super = (struct btrfs_super_block *)bh->b_data;
        devid = btrfs_stack_device_id(&disk_super->dev_item);
        transid = btrfs_super_generation(disk_super);
        total_devices = btrfs_super_num_devices(disk_super);
-       if (disk_super->label[0])
+       if (disk_super->label[0]) {
+               if (disk_super->label[BTRFS_LABEL_SIZE - 1])
+                       disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';
                printk(KERN_INFO "device label %s ", disk_super->label);
-       else
+       } else {
                printk(KERN_INFO "device fsid %pU ", disk_super->fsid);
+       }
        printk(KERN_CONT "devid %llu transid %llu %s\n",
               (unsigned long long)devid, (unsigned long long)transid, path);
        ret = device_list_add(path, disk_super, devid, fs_devices_ret);
        if (!ret && fs_devices_ret)
                (*fs_devices_ret)->total_devices = total_devices;
        brelse(bh);
-error_close:
-       mutex_unlock(&uuid_mutex);
        blkdev_put(bdev, flags);
 error:
+       mutex_unlock(&uuid_mutex);
        return ret;
 }
 
@@ -796,7 +846,7 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
 
        *length = 0;
 
-       if (start >= device->total_bytes)
+       if (start >= device->total_bytes || device->is_tgtdev_for_dev_replace)
                return 0;
 
        path = btrfs_alloc_path();
@@ -913,7 +963,7 @@ int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
        max_hole_size = 0;
        hole_size = 0;
 
-       if (search_start >= search_end) {
+       if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {
                ret = -ENOSPC;
                goto error;
        }
@@ -1096,6 +1146,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
        struct btrfs_key key;
 
        WARN_ON(!device->in_fs_metadata);
+       WARN_ON(device->is_tgtdev_for_dev_replace);
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
@@ -1330,16 +1381,22 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
                root->fs_info->avail_system_alloc_bits |
                root->fs_info->avail_metadata_alloc_bits;
 
-       if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
-           root->fs_info->fs_devices->num_devices <= 4) {
+       num_devices = root->fs_info->fs_devices->num_devices;
+       btrfs_dev_replace_lock(&root->fs_info->dev_replace);
+       if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) {
+               WARN_ON(num_devices < 1);
+               num_devices--;
+       }
+       btrfs_dev_replace_unlock(&root->fs_info->dev_replace);
+
+       if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {
                printk(KERN_ERR "btrfs: unable to go below four devices "
                       "on raid10\n");
                ret = -EINVAL;
                goto out;
        }
 
-       if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
-           root->fs_info->fs_devices->num_devices <= 2) {
+       if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) {
                printk(KERN_ERR "btrfs: unable to go below two "
                       "devices on raid1\n");
                ret = -EINVAL;
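
While a replace is ongoing, the target device is counted in num_devices but only mirrors an existing device, so it adds no redundancy; the RAID1/RAID10 minimums therefore have to be checked against the count minus one. The adjusted counting rule in isolation (fs_devices and dev_replace are shorthand for the fs_info fields used above):

        num_devices = fs_devices->num_devices;
        btrfs_dev_replace_lock(dev_replace);
        if (btrfs_dev_replace_is_ongoing(dev_replace))
                num_devices--;  /* the target mirrors, it adds no copy */
        btrfs_dev_replace_unlock(dev_replace);
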
@@ -1357,7 +1414,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
                 * is held.
                 */
                list_for_each_entry(tmp, devices, dev_list) {
-                       if (tmp->in_fs_metadata && !tmp->bdev) {
+                       if (tmp->in_fs_metadata &&
+                           !tmp->is_tgtdev_for_dev_replace &&
+                           !tmp->bdev) {
                                device = tmp;
                                break;
                        }
@@ -1371,24 +1430,16 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
                        goto out;
                }
        } else {
-               bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL,
-                                         root->fs_info->bdev_holder);
-               if (IS_ERR(bdev)) {
-                       ret = PTR_ERR(bdev);
+               ret = btrfs_get_bdev_and_sb(device_path,
+                                           FMODE_READ | FMODE_EXCL,
+                                           root->fs_info->bdev_holder, 0,
+                                           &bdev, &bh);
+               if (ret)
                        goto out;
-               }
-
-               set_blocksize(bdev, 4096);
-               invalidate_bdev(bdev);
-               bh = btrfs_read_dev_super(bdev);
-               if (!bh) {
-                       ret = -EINVAL;
-                       goto error_close;
-               }
                disk_super = (struct btrfs_super_block *)bh->b_data;
                devid = btrfs_stack_device_id(&disk_super->dev_item);
                dev_uuid = disk_super->dev_item.uuid;
-               device = btrfs_find_device(root, devid, dev_uuid,
+               device = btrfs_find_device(root->fs_info, devid, dev_uuid,
                                           disk_super->fsid);
                if (!device) {
                        ret = -ENOENT;
@@ -1396,6 +1447,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
                }
        }
 
+       if (device->is_tgtdev_for_dev_replace) {
+               pr_err("btrfs: unable to remove the dev_replace target dev\n");
+               ret = -EINVAL;
+               goto error_brelse;
+       }
+
        if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
                printk(KERN_ERR "btrfs: unable to remove the only writeable "
                       "device\n");
@@ -1415,6 +1472,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
        if (ret)
                goto error_undo;
 
+       /*
+        * TODO: the superblock still includes this device in its num_devices
+        * counter although write_all_supers() is not locked out. This
+        * could give a filesystem state which requires a degraded mount.
+        */
        ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
        if (ret)
                goto error_undo;
@@ -1425,7 +1487,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
        spin_unlock(&root->fs_info->free_chunk_lock);
 
        device->in_fs_metadata = 0;
-       btrfs_scrub_cancel_dev(root, device);
+       btrfs_scrub_cancel_dev(root->fs_info, device);
 
        /*
         * the device list mutex makes sure that we don't change
@@ -1482,7 +1544,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
         * at this point, the device is zero sized.  We want to
         * remove it from the devices list and zero out the old super
         */
-       if (clear_super) {
+       if (clear_super && disk_super) {
                /* make sure this device isn't detected as part of
                 * the FS anymore
                 */
@@ -1493,9 +1555,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 
        ret = 0;
 
+       /* Notify udev that device has changed */
+       btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
+
 error_brelse:
        brelse(bh);
-error_close:
        if (bdev)
                blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
 out:
@@ -1512,6 +1576,112 @@ error_undo:
        goto error_brelse;
 }
 
+void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
+                                struct btrfs_device *srcdev)
+{
+       WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex));
+       list_del_rcu(&srcdev->dev_list);
+       list_del_rcu(&srcdev->dev_alloc_list);
+       fs_info->fs_devices->num_devices--;
+       if (srcdev->missing) {
+               fs_info->fs_devices->missing_devices--;
+               fs_info->fs_devices->rw_devices++;
+       }
+       if (srcdev->can_discard)
+               fs_info->fs_devices->num_can_discard--;
+       if (srcdev->bdev)
+               fs_info->fs_devices->open_devices--;
+
+       call_rcu(&srcdev->rcu, free_device);
+}
+
+void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
+                                     struct btrfs_device *tgtdev)
+{
+       struct btrfs_device *next_device;
+
+       WARN_ON(!tgtdev);
+       mutex_lock(&fs_info->fs_devices->device_list_mutex);
+       if (tgtdev->bdev) {
+               btrfs_scratch_superblock(tgtdev);
+               fs_info->fs_devices->open_devices--;
+       }
+       fs_info->fs_devices->num_devices--;
+       if (tgtdev->can_discard)
+               fs_info->fs_devices->num_can_discard++;
+
+       next_device = list_entry(fs_info->fs_devices->devices.next,
+                                struct btrfs_device, dev_list);
+       if (tgtdev->bdev == fs_info->sb->s_bdev)
+               fs_info->sb->s_bdev = next_device->bdev;
+       if (tgtdev->bdev == fs_info->fs_devices->latest_bdev)
+               fs_info->fs_devices->latest_bdev = next_device->bdev;
+       list_del_rcu(&tgtdev->dev_list);
+
+       call_rcu(&tgtdev->rcu, free_device);
+
+       mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+}
+
+int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
+                             struct btrfs_device **device)
+{
+       int ret = 0;
+       struct btrfs_super_block *disk_super;
+       u64 devid;
+       u8 *dev_uuid;
+       struct block_device *bdev;
+       struct buffer_head *bh;
+
+       *device = NULL;
+       ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
+                                   root->fs_info->bdev_holder, 0, &bdev, &bh);
+       if (ret)
+               return ret;
+       disk_super = (struct btrfs_super_block *)bh->b_data;
+       devid = btrfs_stack_device_id(&disk_super->dev_item);
+       dev_uuid = disk_super->dev_item.uuid;
+       *device = btrfs_find_device(root->fs_info, devid, dev_uuid,
+                                   disk_super->fsid);
+       brelse(bh);
+       if (!*device)
+               ret = -ENOENT;
+       blkdev_put(bdev, FMODE_READ);
+       return ret;
+}
+
+int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
+                                        char *device_path,
+                                        struct btrfs_device **device)
+{
+       *device = NULL;
+       if (strcmp(device_path, "missing") == 0) {
+               struct list_head *devices;
+               struct btrfs_device *tmp;
+
+               devices = &root->fs_info->fs_devices->devices;
+               /*
+                * It is safe to read the devices since the volume_mutex
+                * is held by the caller.
+                */
+               list_for_each_entry(tmp, devices, dev_list) {
+                       if (tmp->in_fs_metadata && !tmp->bdev) {
+                               *device = tmp;
+                               break;
+                       }
+               }
+
+               if (!*device) {
+                       pr_err("btrfs: no missing device found\n");
+                       return -ENOENT;
+               }
+
+               return 0;
+       } else {
+               return btrfs_find_device_by_path(root, device_path, device);
+       }
+}
+
 /*
  * does all the dirty work required for changing file system's UUID.
  */
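
btrfs_find_device_missing_or_by_path() gives the dev-replace code one entry point for both spellings of a source device: the literal string "missing" selects the first device that is in the metadata but has no bdev, and anything else is resolved by reading the superblock from the given path. A sketch of a caller; the srcdev_name field is illustrative, not from this patch:

        struct btrfs_device *src_device;

        ret = btrfs_find_device_missing_or_by_path(root, args->srcdev_name,
                                                   &src_device);
        if (ret)
                return ret;     /* -ENOENT: no such device / none missing */
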
@@ -1630,7 +1800,8 @@ next_slot:
                read_extent_buffer(leaf, fs_uuid,
                                   (unsigned long)btrfs_device_fsid(dev_item),
                                   BTRFS_UUID_SIZE);
-               device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
+               device = btrfs_find_device(root->fs_info, devid, dev_uuid,
+                                          fs_uuid);
                BUG_ON(!device); /* Logic error */
 
                if (device->fs_devices->seeding) {
@@ -1678,16 +1849,17 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
        filemap_write_and_wait(bdev->bd_inode->i_mapping);
 
        devices = &root->fs_info->fs_devices->devices;
-       /*
-        * we have the volume lock, so we don't need the extra
-        * device list mutex while reading the list here.
-        */
+
+       mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
        list_for_each_entry(device, devices, dev_list) {
                if (device->bdev == bdev) {
                        ret = -EEXIST;
+                       mutex_unlock(
+                               &root->fs_info->fs_devices->device_list_mutex);
                        goto error;
                }
        }
+       mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
 
        device = kzalloc(sizeof(*device), GFP_NOFS);
        if (!device) {
@@ -1737,6 +1909,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
        device->dev_root = root->fs_info->dev_root;
        device->bdev = bdev;
        device->in_fs_metadata = 1;
+       device->is_tgtdev_for_dev_replace = 0;
        device->mode = FMODE_EXCL;
        set_blocksize(device->bdev, 4096);
 
@@ -1844,6 +2017,98 @@ error:
        return ret;
 }
 
+int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
+                                 struct btrfs_device **device_out)
+{
+       struct request_queue *q;
+       struct btrfs_device *device;
+       struct block_device *bdev;
+       struct btrfs_fs_info *fs_info = root->fs_info;
+       struct list_head *devices;
+       struct rcu_string *name;
+       int ret = 0;
+
+       *device_out = NULL;
+       if (fs_info->fs_devices->seeding)
+               return -EINVAL;
+
+       bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
+                                 fs_info->bdev_holder);
+       if (IS_ERR(bdev))
+               return PTR_ERR(bdev);
+
+       filemap_write_and_wait(bdev->bd_inode->i_mapping);
+
+       devices = &fs_info->fs_devices->devices;
+       list_for_each_entry(device, devices, dev_list) {
+               if (device->bdev == bdev) {
+                       ret = -EEXIST;
+                       goto error;
+               }
+       }
+
+       device = kzalloc(sizeof(*device), GFP_NOFS);
+       if (!device) {
+               ret = -ENOMEM;
+               goto error;
+       }
+
+       name = rcu_string_strdup(device_path, GFP_NOFS);
+       if (!name) {
+               kfree(device);
+               ret = -ENOMEM;
+               goto error;
+       }
+       rcu_assign_pointer(device->name, name);
+
+       q = bdev_get_queue(bdev);
+       if (blk_queue_discard(q))
+               device->can_discard = 1;
+       mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+       device->writeable = 1;
+       device->work.func = pending_bios_fn;
+       generate_random_uuid(device->uuid);
+       device->devid = BTRFS_DEV_REPLACE_DEVID;
+       spin_lock_init(&device->io_lock);
+       device->generation = 0;
+       device->io_width = root->sectorsize;
+       device->io_align = root->sectorsize;
+       device->sector_size = root->sectorsize;
+       device->total_bytes = i_size_read(bdev->bd_inode);
+       device->disk_total_bytes = device->total_bytes;
+       device->dev_root = fs_info->dev_root;
+       device->bdev = bdev;
+       device->in_fs_metadata = 1;
+       device->is_tgtdev_for_dev_replace = 1;
+       device->mode = FMODE_EXCL;
+       set_blocksize(device->bdev, 4096);
+       device->fs_devices = fs_info->fs_devices;
+       list_add(&device->dev_list, &fs_info->fs_devices->devices);
+       fs_info->fs_devices->num_devices++;
+       fs_info->fs_devices->open_devices++;
+       if (device->can_discard)
+               fs_info->fs_devices->num_can_discard++;
+       mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+
+       *device_out = device;
+       return ret;
+
+error:
+       blkdev_put(bdev, FMODE_EXCL);
+       return ret;
+}
+
+void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
+                                             struct btrfs_device *tgtdev)
+{
+       WARN_ON(fs_info->fs_devices->rw_devices == 0);
+       tgtdev->io_width = fs_info->dev_root->sectorsize;
+       tgtdev->io_align = fs_info->dev_root->sectorsize;
+       tgtdev->sector_size = fs_info->dev_root->sectorsize;
+       tgtdev->dev_root = fs_info->dev_root;
+       tgtdev->in_fs_metadata = 1;
+}
+
 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
                                        struct btrfs_device *device)
 {
@@ -1900,7 +2165,8 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
 
        if (!device->writeable)
                return -EACCES;
-       if (new_size <= device->total_bytes)
+       if (new_size <= device->total_bytes ||
+           device->is_tgtdev_for_dev_replace)
                return -EINVAL;
 
        btrfs_set_super_total_bytes(super_copy, old_total + diff);
@@ -2338,18 +2604,6 @@ static int chunk_profiles_filter(u64 chunk_type,
        return 1;
 }
 
-static u64 div_factor_fine(u64 num, int factor)
-{
-       if (factor <= 0)
-               return 0;
-       if (factor >= 100)
-               return num;
-
-       num *= factor;
-       do_div(num, 100);
-       return num;
-}
-
 static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
                              struct btrfs_balance_args *bargs)
 {
@@ -2514,15 +2768,6 @@ static int should_balance_chunk(struct btrfs_root *root,
        return 1;
 }
 
-static u64 div_factor(u64 num, int factor)
-{
-       if (factor == 10)
-               return num;
-       num *= factor;
-       do_div(num, 10);
-       return num;
-}
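
Both helpers are deleted here rather than changed; this series also adds
fs/btrfs/math.h, and the removed bodies fit there as shared inlines. A
sketch of what that header presumably provides, reconstructed verbatim
from the deleted lines (their exact placement in math.h is an assumption):

	/* scale num by factor tenths; factor == 10 leaves num unchanged */
	static inline u64 div_factor(u64 num, int factor)
	{
		if (factor == 10)
			return num;
		num *= factor;
		do_div(num, 10);
		return num;
	}

	/* scale num by factor percent, clamping out-of-range factors */
	static inline u64 div_factor_fine(u64 num, int factor)
	{
		if (factor <= 0)
			return 0;
		if (factor >= 100)
			return num;
		num *= factor;
		do_div(num, 100);
		return num;
	}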
-
 static int __btrfs_balance(struct btrfs_fs_info *fs_info)
 {
        struct btrfs_balance_control *bctl = fs_info->balance_ctl;
@@ -2550,7 +2795,8 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
                size_to_free = div_factor(old_size, 1);
                size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
                if (!device->writeable ||
-                   device->total_bytes - device->bytes_used > size_to_free)
+                   device->total_bytes - device->bytes_used > size_to_free ||
+                   device->is_tgtdev_for_dev_replace)
                        continue;
 
                ret = btrfs_shrink_device(device, old_size - size_to_free);
@@ -2728,6 +2974,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
        u64 allowed;
        int mixed = 0;
        int ret;
+       u64 num_devices;
 
        if (btrfs_fs_closing(fs_info) ||
            atomic_read(&fs_info->balance_pause_req) ||
@@ -2756,10 +3003,17 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
                }
        }
 
+       num_devices = fs_info->fs_devices->num_devices;
+       btrfs_dev_replace_lock(&fs_info->dev_replace);
+       if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
+               BUG_ON(num_devices < 1);
+               num_devices--;
+       }
+       btrfs_dev_replace_unlock(&fs_info->dev_replace);
        allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
-       if (fs_info->fs_devices->num_devices == 1)
+       if (num_devices == 1)
                allowed |= BTRFS_BLOCK_GROUP_DUP;
-       else if (fs_info->fs_devices->num_devices < 4)
+       else if (num_devices < 4)
                allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
        else
                allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
@@ -2902,6 +3156,7 @@ static int balance_kthread(void *data)
                ret = btrfs_balance(fs_info->balance_ctl, NULL);
        }
 
+       atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
        mutex_unlock(&fs_info->balance_mutex);
        mutex_unlock(&fs_info->volume_mutex);
 
@@ -2924,6 +3179,7 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
                return 0;
        }
 
+       WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1));
        tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
        if (IS_ERR(tsk))
                return PTR_ERR(tsk);
@@ -3080,7 +3336,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
        u64 old_size = device->total_bytes;
        u64 diff = device->total_bytes - new_size;
 
-       if (new_size >= device->total_bytes)
+       if (device->is_tgtdev_for_dev_replace)
                return -EINVAL;
 
        path = btrfs_alloc_path();
@@ -3235,6 +3491,14 @@ static int btrfs_cmp_device_info(const void *a, const void *b)
        return 0;
 }
 
+struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
+       { 2, 1, 0, 4, 2, 2 /* raid10 */ },
+       { 1, 1, 2, 2, 2, 2 /* raid1 */ },
+       { 1, 2, 1, 1, 1, 2 /* dup */ },
+       { 1, 1, 0, 2, 1, 1 /* raid0 */ },
+       { 1, 1, 0, 1, 1, 1 /* single */ },
+};
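
For readability, the same table with the fields named; the order follows
struct btrfs_raid_attr as added to volumes.h later in this diff, and the
_annotated name exists only for this illustration:

	struct btrfs_raid_attr btrfs_raid_array_annotated[BTRFS_NR_RAID_TYPES] = {
		/* raid10 */ { .sub_stripes = 2, .dev_stripes = 1, .devs_max = 0,
			       .devs_min = 4, .devs_increment = 2, .ncopies = 2 },
		/* raid1  */ { .sub_stripes = 1, .dev_stripes = 1, .devs_max = 2,
			       .devs_min = 2, .devs_increment = 2, .ncopies = 2 },
		/* dup    */ { .sub_stripes = 1, .dev_stripes = 2, .devs_max = 1,
			       .devs_min = 1, .devs_increment = 1, .ncopies = 2 },
		/* raid0  */ { .sub_stripes = 1, .dev_stripes = 1, .devs_max = 0,
			       .devs_min = 2, .devs_increment = 1, .ncopies = 1 },
		/* single */ { .sub_stripes = 1, .dev_stripes = 1, .devs_max = 0,
			       .devs_min = 1, .devs_increment = 1, .ncopies = 1 },
	};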
+
 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
                               struct btrfs_root *extent_root,
                               struct map_lookup **map_ret,
@@ -3264,43 +3528,21 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
        int ndevs;
        int i;
        int j;
+       int index;
 
        BUG_ON(!alloc_profile_is_valid(type, 0));
 
        if (list_empty(&fs_devices->alloc_list))
                return -ENOSPC;
 
-       sub_stripes = 1;
-       dev_stripes = 1;
-       devs_increment = 1;
-       ncopies = 1;
-       devs_max = 0;   /* 0 == as many as possible */
-       devs_min = 1;
+       index = __get_raid_index(type);
 
-       /*
-        * define the properties of each RAID type.
-        * FIXME: move this to a global table and use it in all RAID
-        * calculation code
-        */
-       if (type & (BTRFS_BLOCK_GROUP_DUP)) {
-               dev_stripes = 2;
-               ncopies = 2;
-               devs_max = 1;
-       } else if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
-               devs_min = 2;
-       } else if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
-               devs_increment = 2;
-               ncopies = 2;
-               devs_max = 2;
-               devs_min = 2;
-       } else if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
-               sub_stripes = 2;
-               devs_increment = 2;
-               ncopies = 2;
-               devs_min = 4;
-       } else {
-               devs_max = 1;
-       }
+       sub_stripes = btrfs_raid_array[index].sub_stripes;
+       dev_stripes = btrfs_raid_array[index].dev_stripes;
+       devs_max = btrfs_raid_array[index].devs_max;
+       devs_min = btrfs_raid_array[index].devs_min;
+       devs_increment = btrfs_raid_array[index].devs_increment;
+       ncopies = btrfs_raid_array[index].ncopies;
 
        if (type & BTRFS_BLOCK_GROUP_DATA) {
                max_stripe_size = 1024 * 1024 * 1024;
@@ -3347,13 +3589,13 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
                cur = cur->next;
 
                if (!device->writeable) {
-                       printk(KERN_ERR
+                       WARN(1, KERN_ERR
                               "btrfs: read-only device in alloc_list\n");
-                       WARN_ON(1);
                        continue;
                }
 
-               if (!device->in_fs_metadata)
+               if (!device->in_fs_metadata ||
+                   device->is_tgtdev_for_dev_replace)
                        continue;
 
                if (device->total_bytes > device->bytes_used)
@@ -3382,6 +3624,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
                devices_info[ndevs].total_avail = total_avail;
                devices_info[ndevs].dev = device;
                ++ndevs;
+               WARN_ON(ndevs > fs_devices->rw_devices);
        }
 
        /*
@@ -3740,8 +3983,9 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
        }
 }
 
-int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
+int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
 {
+       struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
        struct extent_map *em;
        struct map_lookup *map;
        struct extent_map_tree *em_tree = &map_tree->map_tree;
@@ -3761,32 +4005,60 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
        else
                ret = 1;
        free_extent_map(em);
+
+       btrfs_dev_replace_lock(&fs_info->dev_replace);
+       if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))
+               ret++;
+       btrfs_dev_replace_unlock(&fs_info->dev_replace);
+
        return ret;
 }
 
-static int find_live_mirror(struct map_lookup *map, int first, int num,
-                           int optimal)
+static int find_live_mirror(struct btrfs_fs_info *fs_info,
+                           struct map_lookup *map, int first, int num,
+                           int optimal, int dev_replace_is_ongoing)
 {
        int i;
-       if (map->stripes[optimal].dev->bdev)
-               return optimal;
-       for (i = first; i < first + num; i++) {
-               if (map->stripes[i].dev->bdev)
-                       return i;
+       int tolerance;
+       struct btrfs_device *srcdev;
+
+       if (dev_replace_is_ongoing &&
+           fs_info->dev_replace.cont_reading_from_srcdev_mode ==
+            BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
+               srcdev = fs_info->dev_replace.srcdev;
+       else
+               srcdev = NULL;
+
+       /*
+        * Try to avoid the drive that is the source drive for a
+        * dev-replace procedure; only choose it if no other non-missing
+        * mirror is available.
+        */
+       for (tolerance = 0; tolerance < 2; tolerance++) {
+               if (map->stripes[optimal].dev->bdev &&
+                   (tolerance || map->stripes[optimal].dev != srcdev))
+                       return optimal;
+               for (i = first; i < first + num; i++) {
+                       if (map->stripes[i].dev->bdev &&
+                           (tolerance || map->stripes[i].dev != srcdev))
+                               return i;
+               }
        }
+
        /* we couldn't find one that doesn't fail.  Just return something
         * and the io error handling code will clean up eventually
         */
        return optimal;
 }
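
A worked trace of the tolerance loop: take a two-stripe RAID1 map where
stripe 0 sits on the dev-replace source and stripe 1 has lost its bdev,
with optimal == 0. The tolerance == 0 pass rejects stripe 0 (it is srcdev)
and stripe 1 (no bdev); the tolerance == 1 pass drops the srcdev
restriction and returns stripe 0. The source drive is thus read only as a
last resort, presumably to keep its bandwidth free for the copy operation.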
 
-static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
+static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
                             u64 logical, u64 *length,
                             struct btrfs_bio **bbio_ret,
                             int mirror_num)
 {
        struct extent_map *em;
        struct map_lookup *map;
+       struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
        struct extent_map_tree *em_tree = &map_tree->map_tree;
        u64 offset;
        u64 stripe_offset;
@@ -3800,6 +4072,11 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
        int num_stripes;
        int max_errors = 0;
        struct btrfs_bio *bbio = NULL;
+       struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+       int dev_replace_is_ongoing = 0;
+       int num_alloc_stripes;
+       int patch_the_first_stripe_for_dev_replace = 0;
+       u64 physical_to_patch_in_first_stripe = 0;
 
        read_lock(&em_tree->lock);
        em = lookup_extent_mapping(em_tree, logical, *length);
@@ -3816,9 +4093,6 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
        map = (struct map_lookup *)em->bdev;
        offset = logical - em->start;
 
-       if (mirror_num > map->num_stripes)
-               mirror_num = 0;
-
        stripe_nr = offset;
        /*
         * stripe_nr counts the total number of stripes we have to stride
@@ -3845,6 +4119,93 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
        if (!bbio_ret)
                goto out;
 
+       btrfs_dev_replace_lock(dev_replace);
+       dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
+       if (!dev_replace_is_ongoing)
+               btrfs_dev_replace_unlock(dev_replace);
+
+       if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
+           !(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) &&
+           dev_replace->tgtdev != NULL) {
+               /*
+                * In the dev-replace case, for the repair case (the only
+                * case where the mirror is selected explicitly when
+                * calling btrfs_map_block), blocks left of the left cursor
+                * can also be read from the target drive.
+                * For REQ_GET_READ_MIRRORS, the target drive is added as
+                * the last one to the array of stripes. For READ, it also
+                * needs to be supported using the same mirror number.
+                * If the requested block is not left of the left cursor,
+                * EIO is returned. This can happen because btrfs_num_copies()
+                * returns one more in the dev-replace case.
+                */
+               u64 tmp_length = *length;
+               struct btrfs_bio *tmp_bbio = NULL;
+               int tmp_num_stripes;
+               u64 srcdev_devid = dev_replace->srcdev->devid;
+               int index_srcdev = 0;
+               int found = 0;
+               u64 physical_of_found = 0;
+
+               ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS,
+                            logical, &tmp_length, &tmp_bbio, 0);
+               if (ret) {
+                       WARN_ON(tmp_bbio != NULL);
+                       goto out;
+               }
+
+               tmp_num_stripes = tmp_bbio->num_stripes;
+               if (mirror_num > tmp_num_stripes) {
+                       /*
+                        * REQ_GET_READ_MIRRORS does not contain this
+                        * mirror, which means that the requested area
+                        * is not left of the left cursor.
+                        */
+                       ret = -EIO;
+                       kfree(tmp_bbio);
+                       goto out;
+               }
+
+               /*
+                * process the rest of the function using the mirror_num
+                * of the source drive. Therefore look it up first.
+                * At the end, patch the device pointer to the one of the
+                * target drive.
+                */
+               for (i = 0; i < tmp_num_stripes; i++) {
+                       if (tmp_bbio->stripes[i].dev->devid == srcdev_devid) {
+                               /*
+                                * In case of DUP, in order to keep it
+                                * simple, only add the mirror with the
+                                * lowest physical address
+                                */
+                               if (found &&
+                                   physical_of_found <=
+                                    tmp_bbio->stripes[i].physical)
+                                       continue;
+                               index_srcdev = i;
+                               found = 1;
+                               physical_of_found =
+                                       tmp_bbio->stripes[i].physical;
+                       }
+               }
+
+               if (found) {
+                       mirror_num = index_srcdev + 1;
+                       patch_the_first_stripe_for_dev_replace = 1;
+                       physical_to_patch_in_first_stripe = physical_of_found;
+               } else {
+                       WARN_ON(1);
+                       ret = -EIO;
+                       kfree(tmp_bbio);
+                       goto out;
+               }
+
+               kfree(tmp_bbio);
+       } else if (mirror_num > map->num_stripes) {
+               mirror_num = 0;
+       }
+
        num_stripes = 1;
        stripe_index = 0;
        stripe_nr_orig = stripe_nr;
@@ -3859,19 +4220,20 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
                                            stripe_nr_end - stripe_nr_orig);
                stripe_index = do_div(stripe_nr, map->num_stripes);
        } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
-               if (rw & (REQ_WRITE | REQ_DISCARD))
+               if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS))
                        num_stripes = map->num_stripes;
                else if (mirror_num)
                        stripe_index = mirror_num - 1;
                else {
-                       stripe_index = find_live_mirror(map, 0,
+                       stripe_index = find_live_mirror(fs_info, map, 0,
                                            map->num_stripes,
-                                           current->pid % map->num_stripes);
+                                           current->pid % map->num_stripes,
+                                           dev_replace_is_ongoing);
                        mirror_num = stripe_index + 1;
                }
 
        } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
-               if (rw & (REQ_WRITE | REQ_DISCARD)) {
+               if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) {
                        num_stripes = map->num_stripes;
                } else if (mirror_num) {
                        stripe_index = mirror_num - 1;
@@ -3885,7 +4247,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
                stripe_index = do_div(stripe_nr, factor);
                stripe_index *= map->sub_stripes;
 
-               if (rw & REQ_WRITE)
+               if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
                        num_stripes = map->sub_stripes;
                else if (rw & REQ_DISCARD)
                        num_stripes = min_t(u64, map->sub_stripes *
@@ -3895,9 +4257,11 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
                        stripe_index += mirror_num - 1;
                else {
                        int old_stripe_index = stripe_index;
-                       stripe_index = find_live_mirror(map, stripe_index,
+                       stripe_index = find_live_mirror(fs_info, map,
+                                             stripe_index,
                                              map->sub_stripes, stripe_index +
-                                             current->pid % map->sub_stripes);
+                                             current->pid % map->sub_stripes,
+                                             dev_replace_is_ongoing);
                        mirror_num = stripe_index - old_stripe_index + 1;
                }
        } else {
@@ -3911,7 +4275,14 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
        }
        BUG_ON(stripe_index >= map->num_stripes);
 
-       bbio = kzalloc(btrfs_bio_size(num_stripes), GFP_NOFS);
+       num_alloc_stripes = num_stripes;
+       if (dev_replace_is_ongoing) {
+               if (rw & (REQ_WRITE | REQ_DISCARD))
+                       num_alloc_stripes <<= 1;
+               if (rw & REQ_GET_READ_MIRRORS)
+                       num_alloc_stripes++;
+       }
+       bbio = kzalloc(btrfs_bio_size(num_alloc_stripes), GFP_NOFS);
        if (!bbio) {
                ret = -ENOMEM;
                goto out;
@@ -3998,7 +4369,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
                }
        }
 
-       if (rw & REQ_WRITE) {
+       if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) {
                if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
                                 BTRFS_BLOCK_GROUP_RAID10 |
                                 BTRFS_BLOCK_GROUP_DUP)) {
@@ -4006,20 +4377,115 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
                }
        }
 
+       if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) &&
+           dev_replace->tgtdev != NULL) {
+               int index_where_to_add;
+               u64 srcdev_devid = dev_replace->srcdev->devid;
+
+               /*
+                * duplicate the write operations while the dev replace
+                * procedure is running. Since the copying of the old disk
+                * to the new disk takes place at run time while the
+                * filesystem is mounted writable, the regular write
+                * operations to the old disk have to be duplicated to go
+                * to the new disk as well.
+                * Note that device->missing is handled by the caller, and
+                * that the write to the old disk is already set up in the
+                * stripes array.
+                */
+               index_where_to_add = num_stripes;
+               for (i = 0; i < num_stripes; i++) {
+                       if (bbio->stripes[i].dev->devid == srcdev_devid) {
+                               /* write to new disk, too */
+                               struct btrfs_bio_stripe *new =
+                                       bbio->stripes + index_where_to_add;
+                               struct btrfs_bio_stripe *old =
+                                       bbio->stripes + i;
+
+                               new->physical = old->physical;
+                               new->length = old->length;
+                               new->dev = dev_replace->tgtdev;
+                               index_where_to_add++;
+                               max_errors++;
+                       }
+               }
+               num_stripes = index_where_to_add;
+       } else if (dev_replace_is_ongoing && (rw & REQ_GET_READ_MIRRORS) &&
+                  dev_replace->tgtdev != NULL) {
+               u64 srcdev_devid = dev_replace->srcdev->devid;
+               int index_srcdev = 0;
+               int found = 0;
+               u64 physical_of_found = 0;
+
+               /*
+                * During the dev-replace procedure, the target drive can
+                * also be used to read data in case it is needed to repair
+                * a corrupt block elsewhere. This is possible if the
+                * requested area is left of the left cursor. In this area,
+                * the target drive is a full copy of the source drive.
+                */
+               for (i = 0; i < num_stripes; i++) {
+                       if (bbio->stripes[i].dev->devid == srcdev_devid) {
+                               /*
+                                * In case of DUP, in order to keep it
+                                * simple, only add the mirror with the
+                                * lowest physical address
+                                */
+                               if (found &&
+                                   physical_of_found <=
+                                    bbio->stripes[i].physical)
+                                       continue;
+                               index_srcdev = i;
+                               found = 1;
+                               physical_of_found = bbio->stripes[i].physical;
+                       }
+               }
+               if (found) {
+                       u64 length = map->stripe_len;
+
+                       if (physical_of_found + length <=
+                           dev_replace->cursor_left) {
+                               struct btrfs_bio_stripe *tgtdev_stripe =
+                                       bbio->stripes + num_stripes;
+
+                               tgtdev_stripe->physical = physical_of_found;
+                               tgtdev_stripe->length =
+                                       bbio->stripes[index_srcdev].length;
+                               tgtdev_stripe->dev = dev_replace->tgtdev;
+
+                               num_stripes++;
+                       }
+               }
+       }
+
        *bbio_ret = bbio;
        bbio->num_stripes = num_stripes;
        bbio->max_errors = max_errors;
        bbio->mirror_num = mirror_num;
+
+       /*
+        * This is the case where REQ_READ && dev_replace_is_ongoing &&
+        * mirror_num == num_stripes + 1 && the dev_replace target drive
+        * is available as a mirror.
+        */
+       if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
+               WARN_ON(num_stripes > 1);
+               bbio->stripes[0].dev = dev_replace->tgtdev;
+               bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
+               bbio->mirror_num = map->num_stripes + 1;
+       }
 out:
+       if (dev_replace_is_ongoing)
+               btrfs_dev_replace_unlock(dev_replace);
        free_extent_map(em);
        return ret;
 }
 
-int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
+int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
                      u64 logical, u64 *length,
                      struct btrfs_bio **bbio_ret, int mirror_num)
 {
-       return __btrfs_map_block(map_tree, rw, logical, length, bbio_ret,
+       return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
                                 mirror_num);
 }
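
Callers now hand in fs_info rather than the mapping tree. A minimal
read-mapping sketch against the new signature (example_map_read and the
4KB length are illustrative; error handling is trimmed):

	static int example_map_read(struct btrfs_fs_info *fs_info, u64 logical)
	{
		struct btrfs_bio *bbio = NULL;
		u64 length = 4096;
		int ret;

		ret = btrfs_map_block(fs_info, READ, logical, &length, &bbio, 0);
		if (ret)
			return ret;
		/*
		 * bbio->stripes[0].dev and .physical say where to read;
		 * length now reflects how far this mapping is valid.
		 */
		kfree(bbio);
		return 0;
	}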
 
@@ -4238,10 +4704,116 @@ static noinline void schedule_bio(struct btrfs_root *root,
                                   &device->work);
 }
 
+static int bio_size_ok(struct block_device *bdev, struct bio *bio,
+                      sector_t sector)
+{
+       struct bio_vec *prev;
+       struct request_queue *q = bdev_get_queue(bdev);
+       unsigned short max_sectors = queue_max_sectors(q);
+       struct bvec_merge_data bvm = {
+               .bi_bdev = bdev,
+               .bi_sector = sector,
+               .bi_rw = bio->bi_rw,
+       };
+
+       if (bio->bi_vcnt == 0) {
+               WARN_ON(1);
+               return 1;
+       }
+
+       prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
+       if ((bio->bi_size >> 9) > max_sectors)
+               return 0;
+
+       if (!q->merge_bvec_fn)
+               return 1;
+
+       bvm.bi_size = bio->bi_size - prev->bv_len;
+       if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len)
+               return 0;
+       return 1;
+}
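
bio_size_ok() is the gate used by btrfs_map_bio() further down: when a
stripe's queue would reject first_bio because of max_sectors or its
merge_bvec_fn, the submit loop falls back to breakup_stripe_bio() instead
of sending the bio whole.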
+
+static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
+                             struct bio *bio, u64 physical, int dev_nr,
+                             int rw, int async)
+{
+       struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
+
+       bio->bi_private = bbio;
+       bio->bi_private = merge_stripe_index_into_bio_private(
+                       bio->bi_private, (unsigned int)dev_nr);
+       bio->bi_end_io = btrfs_end_bio;
+       bio->bi_sector = physical >> 9;
+#ifdef DEBUG
+       {
+               struct rcu_string *name;
+
+               rcu_read_lock();
+               name = rcu_dereference(dev->name);
+               pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
+                        "(%s id %llu), size=%u\n", rw,
+                        (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
+                        name->str, dev->devid, bio->bi_size);
+               rcu_read_unlock();
+       }
+#endif
+       bio->bi_bdev = dev->bdev;
+       if (async)
+               schedule_bio(root, dev, rw, bio);
+       else
+               btrfsic_submit_bio(rw, bio);
+}
+
+static int breakup_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
+                             struct bio *first_bio, struct btrfs_device *dev,
+                             int dev_nr, int rw, int async)
+{
+       struct bio_vec *bvec = first_bio->bi_io_vec;
+       struct bio *bio;
+       int nr_vecs = bio_get_nr_vecs(dev->bdev);
+       u64 physical = bbio->stripes[dev_nr].physical;
+
+again:
+       bio = btrfs_bio_alloc(dev->bdev, physical >> 9, nr_vecs, GFP_NOFS);
+       if (!bio)
+               return -ENOMEM;
+
+       while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) {
+               if (bio_add_page(bio, bvec->bv_page, bvec->bv_len,
+                                bvec->bv_offset) < bvec->bv_len) {
+                       u64 len = bio->bi_size;
+
+                       atomic_inc(&bbio->stripes_pending);
+                       submit_stripe_bio(root, bbio, bio, physical, dev_nr,
+                                         rw, async);
+                       physical += len;
+                       goto again;
+               }
+               bvec++;
+       }
+
+       submit_stripe_bio(root, bbio, bio, physical, dev_nr, rw, async);
+       return 0;
+}
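
To illustrate the split (sizes illustrative): if first_bio carries 1MB of
pages but the device accepts at most 512KB per bio, bio_add_page()
eventually refuses a page; the partially filled bio is accounted in
stripes_pending and submitted at physical, and the loop restarts at
physical + len with a fresh bio for the remaining bvecs, so one oversized
stripe write becomes several acceptable bios.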
+
+static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
+{
+       atomic_inc(&bbio->error);
+       if (atomic_dec_and_test(&bbio->stripes_pending)) {
+               bio->bi_private = bbio->private;
+               bio->bi_end_io = bbio->end_io;
+               bio->bi_bdev = (struct block_device *)
+                       (unsigned long)bbio->mirror_num;
+               bio->bi_sector = logical >> 9;
+               kfree(bbio);
+               bio_endio(bio, -EIO);
+       }
+}
+
 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
                  int mirror_num, int async_submit)
 {
-       struct btrfs_mapping_tree *map_tree;
        struct btrfs_device *dev;
        struct bio *first_bio = bio;
        u64 logical = (u64)bio->bi_sector << 9;
@@ -4253,12 +4825,11 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
        struct btrfs_bio *bbio = NULL;
 
        length = bio->bi_size;
-       map_tree = &root->fs_info->mapping_tree;
        map_length = length;
 
-       ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio,
+       ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
                              mirror_num);
-       if (ret) /* -ENOMEM */
+       if (ret)
                return ret;
 
        total_devs = bbio->num_stripes;
@@ -4276,52 +4847,48 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
        atomic_set(&bbio->stripes_pending, bbio->num_stripes);
 
        while (dev_nr < total_devs) {
+               dev = bbio->stripes[dev_nr].dev;
+               if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) {
+                       bbio_error(bbio, first_bio, logical);
+                       dev_nr++;
+                       continue;
+               }
+
+               /*
+                * Check and see if we're ok with this bio based on its size
+                * and offset with the given device.
+                */
+               if (!bio_size_ok(dev->bdev, first_bio,
+                                bbio->stripes[dev_nr].physical >> 9)) {
+                       ret = breakup_stripe_bio(root, bbio, first_bio, dev,
+                                                dev_nr, rw, async_submit);
+                       BUG_ON(ret);
+                       dev_nr++;
+                       continue;
+               }
+
                if (dev_nr < total_devs - 1) {
                        bio = bio_clone(first_bio, GFP_NOFS);
                        BUG_ON(!bio); /* -ENOMEM */
                } else {
                        bio = first_bio;
                }
-               bio->bi_private = bbio;
-               bio->bi_private = merge_stripe_index_into_bio_private(
-                               bio->bi_private, (unsigned int)dev_nr);
-               bio->bi_end_io = btrfs_end_bio;
-               bio->bi_sector = bbio->stripes[dev_nr].physical >> 9;
-               dev = bbio->stripes[dev_nr].dev;
-               if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
-#ifdef DEBUG
-                       struct rcu_string *name;
-
-                       rcu_read_lock();
-                       name = rcu_dereference(dev->name);
-                       pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
-                                "(%s id %llu), size=%u\n", rw,
-                                (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
-                                name->str, dev->devid, bio->bi_size);
-                       rcu_read_unlock();
-#endif
-                       bio->bi_bdev = dev->bdev;
-                       if (async_submit)
-                               schedule_bio(root, dev, rw, bio);
-                       else
-                               btrfsic_submit_bio(rw, bio);
-               } else {
-                       bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
-                       bio->bi_sector = logical >> 9;
-                       bio_endio(bio, -EIO);
-               }
+
+               submit_stripe_bio(root, bbio, bio,
+                                 bbio->stripes[dev_nr].physical, dev_nr, rw,
+                                 async_submit);
                dev_nr++;
        }
        return 0;
 }
 
-struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
+struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
                                       u8 *uuid, u8 *fsid)
 {
        struct btrfs_device *device;
        struct btrfs_fs_devices *cur_devices;
 
-       cur_devices = root->fs_info->fs_devices;
+       cur_devices = fs_info->fs_devices;
        while (cur_devices) {
                if (!fsid ||
                    !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
@@ -4402,6 +4969,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
        em->bdev = (struct block_device *)map;
        em->start = logical;
        em->len = length;
+       em->orig_start = 0;
        em->block_start = 0;
        em->block_len = em->len;
 
@@ -4419,8 +4987,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
                read_extent_buffer(leaf, uuid, (unsigned long)
                                   btrfs_stripe_dev_uuid_nr(chunk, i),
                                   BTRFS_UUID_SIZE);
-               map->stripes[i].dev = btrfs_find_device(root, devid, uuid,
-                                                       NULL);
+               map->stripes[i].dev = btrfs_find_device(root->fs_info, devid,
+                                                       uuid, NULL);
                if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
                        kfree(map);
                        free_extent_map(em);
@@ -4461,6 +5029,8 @@ static void fill_device_from_item(struct extent_buffer *leaf,
        device->io_align = btrfs_device_io_align(leaf, dev_item);
        device->io_width = btrfs_device_io_width(leaf, dev_item);
        device->sector_size = btrfs_device_sector_size(leaf, dev_item);
+       WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
+       device->is_tgtdev_for_dev_replace = 0;
 
        ptr = (unsigned long)btrfs_device_uuid(dev_item);
        read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
@@ -4538,7 +5108,7 @@ static int read_one_dev(struct btrfs_root *root,
                        return ret;
        }
 
-       device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
+       device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid);
        if (!device || !device->bdev) {
                if (!btrfs_test_opt(root, DEGRADED))
                        return -EIO;
@@ -4571,7 +5141,7 @@ static int read_one_dev(struct btrfs_root *root,
        fill_device_from_item(leaf, dev_item, device);
        device->dev_root = root->fs_info->dev_root;
        device->in_fs_metadata = 1;
-       if (device->writeable) {
+       if (device->writeable && !device->is_tgtdev_for_dev_replace) {
                device->fs_devices->total_rw_bytes += device->total_bytes;
                spin_lock(&root->fs_info->free_chunk_lock);
                root->fs_info->free_chunk_space += device->total_bytes -
@@ -4930,7 +5500,7 @@ int btrfs_get_dev_stats(struct btrfs_root *root,
        int i;
 
        mutex_lock(&fs_devices->device_list_mutex);
-       dev = btrfs_find_device(root, stats->devid, NULL, NULL);
+       dev = btrfs_find_device(root->fs_info, stats->devid, NULL, NULL);
        mutex_unlock(&fs_devices->device_list_mutex);
 
        if (!dev) {
@@ -4958,3 +5528,21 @@ int btrfs_get_dev_stats(struct btrfs_root *root,
                stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
        return 0;
 }
+
+int btrfs_scratch_superblock(struct btrfs_device *device)
+{
+       struct buffer_head *bh;
+       struct btrfs_super_block *disk_super;
+
+       bh = btrfs_read_dev_super(device->bdev);
+       if (!bh)
+               return -EINVAL;
+       disk_super = (struct btrfs_super_block *)bh->b_data;
+
+       memset(&disk_super->magic, 0, sizeof(disk_super->magic));
+       set_buffer_dirty(bh);
+       sync_dirty_buffer(bh);
+       brelse(bh);
+
+       return 0;
+}
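
Zeroing disk_super->magic is enough to make future device scans ignore the
disk, since superblock detection keys on the magic. The natural consumer is
the dev-replace teardown (btrfs_rm_dev_replace_srcdev() is declared below),
which presumably scratches the replaced source device so a later scan or
mount cannot mistake it for the live filesystem.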
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 53c06af92e8da94270dab4f24906b937441d8a8f..d3c3939ac7512e405995793e47b4ca2c3ce640dc 100644
@@ -50,6 +50,7 @@ struct btrfs_device {
        int in_fs_metadata;
        int missing;
        int can_discard;
+       int is_tgtdev_for_dev_replace;
 
        spinlock_t io_lock;
 
@@ -88,7 +89,7 @@ struct btrfs_device {
        u8 uuid[BTRFS_UUID_SIZE];
 
        /* per-device scrub information */
-       struct scrub_dev *scrub_device;
+       struct scrub_ctx *scrub_device;
 
        struct btrfs_work work;
        struct rcu_head rcu;
@@ -179,6 +180,15 @@ struct btrfs_device_info {
        u64 total_avail;
 };
 
+struct btrfs_raid_attr {
+       int sub_stripes;        /* sub_stripes info for map */
+       int dev_stripes;        /* stripes per dev */
+       int devs_max;           /* max devs to use */
+       int devs_min;           /* min devs needed */
+       int devs_increment;     /* ndevs has to be a multiple of this */
+       int ncopies;            /* how many copies of the data there are */
+};
+
 struct map_lookup {
        u64 type;
        int io_align;
@@ -248,7 +258,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
                           struct btrfs_device *device,
                           u64 chunk_tree, u64 chunk_objectid,
                           u64 chunk_offset, u64 start, u64 num_bytes);
-int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
+int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
                    u64 logical, u64 *length,
                    struct btrfs_bio **bbio_ret, int mirror_num);
 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
@@ -267,19 +277,27 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
                          struct btrfs_fs_devices **fs_devices_ret);
 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
-void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices);
+void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
+                              struct btrfs_fs_devices *fs_devices, int step);
+int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
+                                        char *device_path,
+                                        struct btrfs_device **device);
+int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
+                             struct btrfs_device **device);
 int btrfs_add_device(struct btrfs_trans_handle *trans,
                     struct btrfs_root *root,
                     struct btrfs_device *device);
 int btrfs_rm_device(struct btrfs_root *root, char *device_path);
 void btrfs_cleanup_fs_uuids(void);
-int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len);
+int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
 int btrfs_grow_device(struct btrfs_trans_handle *trans,
                      struct btrfs_device *device, u64 new_size);
-struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
+struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
                                       u8 *uuid, u8 *fsid);
 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
 int btrfs_init_new_device(struct btrfs_root *root, char *path);
+int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
+                                 struct btrfs_device **device_out);
 int btrfs_balance(struct btrfs_balance_control *bctl,
                  struct btrfs_ioctl_balance_args *bargs);
 int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info);
@@ -296,6 +314,13 @@ int btrfs_get_dev_stats(struct btrfs_root *root,
 int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info);
 int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
                        struct btrfs_fs_info *fs_info);
+void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
+                                struct btrfs_device *srcdev);
+void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
+                                     struct btrfs_device *tgtdev);
+void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
+                                             struct btrfs_device *tgtdev);
+int btrfs_scratch_superblock(struct btrfs_device *device);
 
 static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
                                      int index)
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 3f4e2d69e83a13cb66f3f3a56024f53f5299f5c4..446a6848c5548f2f8f03f2b532eac9a174bd56c7 100644
@@ -122,6 +122,16 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
                 */
                if (!value)
                        goto out;
+       } else {
+               di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode),
+                                       name, name_len, 0);
+               if (IS_ERR(di)) {
+                       ret = PTR_ERR(di);
+                       goto out;
+               }
+               if (!di && !value)
+                       goto out;
+               btrfs_release_path(path);
        }
 
 again:
@@ -198,6 +208,7 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
 
        inode_inc_iversion(inode);
        inode->i_ctime = CURRENT_TIME;
+       set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
        ret = btrfs_update_inode(trans, root, inode);
        BUG_ON(ret);
 out:
@@ -265,7 +276,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
 
                di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
                if (verify_dir_item(root, leaf, di))
-                       continue;
+                       goto next;
 
                name_len = btrfs_dir_name_len(leaf, di);
                total_size += name_len + 1;
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 54fab041b22ab7a5102879a1b82ac5b4f97966df..ea546a4e9609aa0f9c9b56a6f307e06b10602fe6 100644
@@ -45,7 +45,8 @@ struct extent_buffer;
 
 #define show_root_type(obj)                                            \
        obj, ((obj >= BTRFS_DATA_RELOC_TREE_OBJECTID) ||                \
-             (obj <= BTRFS_CSUM_TREE_OBJECTID )) ? __show_root_type(obj) : "-"
+             (obj >= BTRFS_ROOT_TREE_OBJECTID &&                       \
+              obj <= BTRFS_CSUM_TREE_OBJECTID)) ? __show_root_type(obj) : "-"
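
The added lower bound matters because BTRFS_DATA_RELOC_TREE_OBJECTID is
defined as -8ULL (a huge u64) while the named trees run from
BTRFS_ROOT_TREE_OBJECTID (1) to BTRFS_CSUM_TREE_OBJECTID (7): the old
obj <= BTRFS_CSUM_TREE_OBJECTID arm alone also matched objectid 0, handing
__show_root_type() a value it has no name for; with the extra
obj >= BTRFS_ROOT_TREE_OBJECTID check such ids now fall through to the
plain "-".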
 
 #define BTRFS_GROUP_FLAGS      \
        { BTRFS_BLOCK_GROUP_DATA,       "DATA"}, \