Btrfs: make the scrub page array dynamically allocated
With the modified design (in order to support the devive replace
procedure) it is necessary to alloc the page array dynamically.
The reason is that pages are reused. At first a page is used for
the bio to read the data from the filesystem, then the same page
is reused for the bio that writes the data to the target disk.
Since the read process and the write process are completely
decoupled, this requires a new concept of refcounts and get/put
functions for pages, and it requires to use newly created pages
for each read bio which are freed after the write operation
is finished.
Signed-off-by: Stefan Behrens <sbehrens@giantdisaster.de>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 822c08a..15ac82a 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -46,6 +46,12 @@
#define SCRUB_PAGES_PER_BIO 16 /* 64k per bio */
#define SCRUB_BIOS_PER_CTX 16 /* 1 MB per device in flight */
+
+/*
+ * the following value times PAGE_SIZE needs to be large enough to match the
+ * largest node/leaf/sector size that shall be supported.
+ * Values larger than BTRFS_STRIPE_LEN are not supported.
+ */
#define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */
struct scrub_page {
@@ -56,6 +62,7 @@
u64 generation;
u64 logical;
u64 physical;
+ atomic_t ref_count;
struct {
unsigned int mirror_num:8;
unsigned int have_csum:1;
@@ -79,7 +86,7 @@
};
struct scrub_block {
- struct scrub_page pagev[SCRUB_MAX_PAGES_PER_BLOCK];
+ struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
int page_count;
atomic_t outstanding_pages;
atomic_t ref_count; /* free mem on transition to zero */
@@ -165,6 +172,8 @@
static int scrub_checksum_super(struct scrub_block *sblock);
static void scrub_block_get(struct scrub_block *sblock);
static void scrub_block_put(struct scrub_block *sblock);
+static void scrub_page_get(struct scrub_page *spage);
+static void scrub_page_put(struct scrub_page *spage);
static int scrub_add_page_to_bio(struct scrub_ctx *sctx,
struct scrub_page *spage);
static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
@@ -364,15 +373,15 @@
int ret;
WARN_ON(sblock->page_count < 1);
- dev = sblock->pagev[0].dev;
+ dev = sblock->pagev[0]->dev;
fs_info = sblock->sctx->dev_root->fs_info;
path = btrfs_alloc_path();
swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
- swarn.sector = (sblock->pagev[0].physical) >> 9;
- swarn.logical = sblock->pagev[0].logical;
+ swarn.sector = (sblock->pagev[0]->physical) >> 9;
+ swarn.logical = sblock->pagev[0]->logical;
swarn.errstr = errstr;
swarn.dev = NULL;
swarn.msg_bufsize = bufsize;
@@ -642,15 +651,15 @@
BUG_ON(sblock_to_check->page_count < 1);
fs_info = sctx->dev_root->fs_info;
length = sblock_to_check->page_count * PAGE_SIZE;
- logical = sblock_to_check->pagev[0].logical;
- generation = sblock_to_check->pagev[0].generation;
- BUG_ON(sblock_to_check->pagev[0].mirror_num < 1);
- failed_mirror_index = sblock_to_check->pagev[0].mirror_num - 1;
- is_metadata = !(sblock_to_check->pagev[0].flags &
+ logical = sblock_to_check->pagev[0]->logical;
+ generation = sblock_to_check->pagev[0]->generation;
+ BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
+ failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
+ is_metadata = !(sblock_to_check->pagev[0]->flags &
BTRFS_EXTENT_FLAG_DATA);
- have_csum = sblock_to_check->pagev[0].have_csum;
- csum = sblock_to_check->pagev[0].csum;
- dev = sblock_to_check->pagev[0].dev;
+ have_csum = sblock_to_check->pagev[0]->have_csum;
+ csum = sblock_to_check->pagev[0]->csum;
+ dev = sblock_to_check->pagev[0]->dev;
/*
* read all mirrors one after the other. This includes to
@@ -892,7 +901,7 @@
success = 1;
for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
- struct scrub_page *page_bad = sblock_bad->pagev + page_num;
+ struct scrub_page *page_bad = sblock_bad->pagev[page_num];
if (!page_bad->io_error)
continue;
@@ -903,8 +912,8 @@
mirror_index++) {
struct scrub_block *sblock_other = sblocks_for_recheck +
mirror_index;
- struct scrub_page *page_other = sblock_other->pagev +
- page_num;
+ struct scrub_page *page_other = sblock_other->pagev[
+ page_num];
if (!page_other->io_error) {
ret = scrub_repair_page_from_good_copy(
@@ -971,11 +980,11 @@
mirror_index;
int page_index;
- for (page_index = 0; page_index < SCRUB_PAGES_PER_BIO;
- page_index++)
- if (sblock->pagev[page_index].page)
- __free_page(
- sblock->pagev[page_index].page);
+ for (page_index = 0; page_index < sblock->page_count;
+ page_index++) {
+ sblock->pagev[page_index]->sblock = NULL;
+ scrub_page_put(sblock->pagev[page_index]);
+ }
}
kfree(sblocks_for_recheck);
}
@@ -993,7 +1002,7 @@
int ret;
/*
- * note: the three members sctx, ref_count and outstanding_pages
+ * note: the two members ref_count and outstanding_pages
* are not used (and not set) in the blocks that are used for
* the recheck procedure
*/
@@ -1025,21 +1034,27 @@
continue;
sblock = sblocks_for_recheck + mirror_index;
- page = sblock->pagev + page_index;
- page->logical = logical;
- page->physical = bbio->stripes[mirror_index].physical;
- /* for missing devices, dev->bdev is NULL */
- page->dev = bbio->stripes[mirror_index].dev;
- page->mirror_num = mirror_index + 1;
- page->page = alloc_page(GFP_NOFS);
- if (!page->page) {
+ sblock->sctx = sctx;
+ page = kzalloc(sizeof(*page), GFP_NOFS);
+ if (!page) {
+leave_nomem:
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
spin_unlock(&sctx->stat_lock);
kfree(bbio);
return -ENOMEM;
}
+ scrub_page_get(page);
+ sblock->pagev[page_index] = page;
+ page->logical = logical;
+ page->physical = bbio->stripes[mirror_index].physical;
+ /* for missing devices, dev->bdev is NULL */
+ page->dev = bbio->stripes[mirror_index].dev;
+ page->mirror_num = mirror_index + 1;
sblock->page_count++;
+ page->page = alloc_page(GFP_NOFS);
+ if (!page->page)
+ goto leave_nomem;
}
kfree(bbio);
length -= sublen;
@@ -1071,7 +1086,7 @@
for (page_num = 0; page_num < sblock->page_count; page_num++) {
struct bio *bio;
int ret;
- struct scrub_page *page = sblock->pagev + page_num;
+ struct scrub_page *page = sblock->pagev[page_num];
DECLARE_COMPLETION_ONSTACK(complete);
if (page->dev->bdev == NULL) {
@@ -1080,7 +1095,7 @@
continue;
}
- BUG_ON(!page->page);
+ WARN_ON(!page->page);
bio = bio_alloc(GFP_NOFS, 1);
if (!bio)
return -EIO;
@@ -1125,14 +1140,14 @@
struct btrfs_root *root = fs_info->extent_root;
void *mapped_buffer;
- BUG_ON(!sblock->pagev[0].page);
+ WARN_ON(!sblock->pagev[0]->page);
if (is_metadata) {
struct btrfs_header *h;
- mapped_buffer = kmap_atomic(sblock->pagev[0].page);
+ mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
h = (struct btrfs_header *)mapped_buffer;
- if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr) ||
+ if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr) ||
memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) ||
memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
BTRFS_UUID_SIZE)) {
@@ -1146,7 +1161,7 @@
if (!have_csum)
return;
- mapped_buffer = kmap_atomic(sblock->pagev[0].page);
+ mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
}
for (page_num = 0;;) {
@@ -1162,9 +1177,9 @@
page_num++;
if (page_num >= sblock->page_count)
break;
- BUG_ON(!sblock->pagev[page_num].page);
+ WARN_ON(!sblock->pagev[page_num]->page);
- mapped_buffer = kmap_atomic(sblock->pagev[page_num].page);
+ mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page);
}
btrfs_csum_final(crc, calculated_csum);
@@ -1202,11 +1217,11 @@
struct scrub_block *sblock_good,
int page_num, int force_write)
{
- struct scrub_page *page_bad = sblock_bad->pagev + page_num;
- struct scrub_page *page_good = sblock_good->pagev + page_num;
+ struct scrub_page *page_bad = sblock_bad->pagev[page_num];
+ struct scrub_page *page_good = sblock_good->pagev[page_num];
- BUG_ON(sblock_bad->pagev[page_num].page == NULL);
- BUG_ON(sblock_good->pagev[page_num].page == NULL);
+ BUG_ON(page_bad->page == NULL);
+ BUG_ON(page_good->page == NULL);
if (force_write || sblock_bad->header_error ||
sblock_bad->checksum_error || page_bad->io_error) {
struct bio *bio;
@@ -1247,8 +1262,8 @@
u64 flags;
int ret;
- BUG_ON(sblock->page_count < 1);
- flags = sblock->pagev[0].flags;
+ WARN_ON(sblock->page_count < 1);
+ flags = sblock->pagev[0]->flags;
ret = 0;
if (flags & BTRFS_EXTENT_FLAG_DATA)
ret = scrub_checksum_data(sblock);
@@ -1276,11 +1291,11 @@
int index;
BUG_ON(sblock->page_count < 1);
- if (!sblock->pagev[0].have_csum)
+ if (!sblock->pagev[0]->have_csum)
return 0;
- on_disk_csum = sblock->pagev[0].csum;
- page = sblock->pagev[0].page;
+ on_disk_csum = sblock->pagev[0]->csum;
+ page = sblock->pagev[0]->page;
buffer = kmap_atomic(page);
len = sctx->sectorsize;
@@ -1295,8 +1310,8 @@
break;
index++;
BUG_ON(index >= sblock->page_count);
- BUG_ON(!sblock->pagev[index].page);
- page = sblock->pagev[index].page;
+ BUG_ON(!sblock->pagev[index]->page);
+ page = sblock->pagev[index]->page;
buffer = kmap_atomic(page);
}
@@ -1326,7 +1341,7 @@
int index;
BUG_ON(sblock->page_count < 1);
- page = sblock->pagev[0].page;
+ page = sblock->pagev[0]->page;
mapped_buffer = kmap_atomic(page);
h = (struct btrfs_header *)mapped_buffer;
memcpy(on_disk_csum, h->csum, sctx->csum_size);
@@ -1337,10 +1352,10 @@
* b) the page is already kmapped
*/
- if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr))
+ if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr))
++fail;
- if (sblock->pagev[0].generation != le64_to_cpu(h->generation))
+ if (sblock->pagev[0]->generation != le64_to_cpu(h->generation))
++fail;
if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
@@ -1365,8 +1380,8 @@
break;
index++;
BUG_ON(index >= sblock->page_count);
- BUG_ON(!sblock->pagev[index].page);
- page = sblock->pagev[index].page;
+ BUG_ON(!sblock->pagev[index]->page);
+ page = sblock->pagev[index]->page;
mapped_buffer = kmap_atomic(page);
mapped_size = PAGE_SIZE;
p = mapped_buffer;
@@ -1398,15 +1413,15 @@
int index;
BUG_ON(sblock->page_count < 1);
- page = sblock->pagev[0].page;
+ page = sblock->pagev[0]->page;
mapped_buffer = kmap_atomic(page);
s = (struct btrfs_super_block *)mapped_buffer;
memcpy(on_disk_csum, s->csum, sctx->csum_size);
- if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr))
+ if (sblock->pagev[0]->logical != le64_to_cpu(s->bytenr))
++fail_cor;
- if (sblock->pagev[0].generation != le64_to_cpu(s->generation))
+ if (sblock->pagev[0]->generation != le64_to_cpu(s->generation))
++fail_gen;
if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
@@ -1426,8 +1441,8 @@
break;
index++;
BUG_ON(index >= sblock->page_count);
- BUG_ON(!sblock->pagev[index].page);
- page = sblock->pagev[index].page;
+ BUG_ON(!sblock->pagev[index]->page);
+ page = sblock->pagev[index]->page;
mapped_buffer = kmap_atomic(page);
mapped_size = PAGE_SIZE;
p = mapped_buffer;
@@ -1447,10 +1462,10 @@
++sctx->stat.super_errors;
spin_unlock(&sctx->stat_lock);
if (fail_cor)
- btrfs_dev_stat_inc_and_print(sblock->pagev[0].dev,
+ btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
BTRFS_DEV_STAT_CORRUPTION_ERRS);
else
- btrfs_dev_stat_inc_and_print(sblock->pagev[0].dev,
+ btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
BTRFS_DEV_STAT_GENERATION_ERRS);
}
@@ -1468,12 +1483,25 @@
int i;
for (i = 0; i < sblock->page_count; i++)
- if (sblock->pagev[i].page)
- __free_page(sblock->pagev[i].page);
+ scrub_page_put(sblock->pagev[i]);
kfree(sblock);
}
}
+static void scrub_page_get(struct scrub_page *spage)
+{
+ atomic_inc(&spage->ref_count);
+}
+
+static void scrub_page_put(struct scrub_page *spage)
+{
+ if (atomic_dec_and_test(&spage->ref_count)) {
+ if (spage->page)
+ __free_page(spage->page);
+ kfree(spage);
+ }
+}
+
static void scrub_submit(struct scrub_ctx *sctx)
{
struct scrub_bio *sbio;
@@ -1577,28 +1605,28 @@
return -ENOMEM;
}
- /* one ref inside this function, plus one for each page later on */
+ /* one ref inside this function, plus one for each page added to
+ * a bio later on */
atomic_set(&sblock->ref_count, 1);
sblock->sctx = sctx;
sblock->no_io_error_seen = 1;
for (index = 0; len > 0; index++) {
- struct scrub_page *spage = sblock->pagev + index;
+ struct scrub_page *spage;
u64 l = min_t(u64, len, PAGE_SIZE);
- BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
- spage->page = alloc_page(GFP_NOFS);
- if (!spage->page) {
+ spage = kzalloc(sizeof(*spage), GFP_NOFS);
+ if (!spage) {
+leave_nomem:
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
spin_unlock(&sctx->stat_lock);
- while (index > 0) {
- index--;
- __free_page(sblock->pagev[index].page);
- }
- kfree(sblock);
+ scrub_block_put(sblock);
return -ENOMEM;
}
+ BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
+ scrub_page_get(spage);
+ sblock->pagev[index] = spage;
spage->sblock = sblock;
spage->dev = dev;
spage->flags = flags;
@@ -1613,14 +1641,17 @@
spage->have_csum = 0;
}
sblock->page_count++;
+ spage->page = alloc_page(GFP_NOFS);
+ if (!spage->page)
+ goto leave_nomem;
len -= l;
logical += l;
physical += l;
}
- BUG_ON(sblock->page_count == 0);
+ WARN_ON(sblock->page_count == 0);
for (index = 0; index < sblock->page_count; index++) {
- struct scrub_page *spage = sblock->pagev + index;
+ struct scrub_page *spage = sblock->pagev[index];
int ret;
ret = scrub_add_page_to_bio(sctx, spage);
@@ -2289,6 +2320,22 @@
return -EINVAL;
}
+ if (fs_info->chunk_root->nodesize >
+ PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
+ fs_info->chunk_root->sectorsize >
+ PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
+ /*
+ * would exhaust the array bounds of pagev member in
+ * struct scrub_block
+ */
+ pr_err("btrfs_scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails\n",
+ fs_info->chunk_root->nodesize,
+ SCRUB_MAX_PAGES_PER_BLOCK,
+ fs_info->chunk_root->sectorsize,
+ SCRUB_MAX_PAGES_PER_BLOCK);
+ return -EINVAL;
+ }
+
ret = scrub_workers_get(root);
if (ret)
return ret;