テストステ論

高テス協会会長が, テストステロンに関する情報をお届けします.

(writeboost report) ライトバックのソーティングつづき

http://akiradeveloper.hatenadiary.com/entry/2014/03/31/182626

dm-develで, RBツリーを利用したI/Oのソーティングが流行っていて, ライトブーストも便乗しようかなみたいな話をしました.

http://www.redhat.com/archives/dm-devel/2014-April/msg00000.html

「やるぜアッピール」をすると効果的なので, 「おれこれ好きだわ. ライトブーストでもやろうと思ってる. ソーティングすると速くなるわー」って言ったら, マイクからOKレス来ました. よーし実装するゾー(白目)


終了. ライトバックのコードがきれいになったと思います.

diff --git a/src/dm-writeboost-daemon.c b/src/dm-writeboost-daemon.c
index 64b4491..9890977 100644
--- a/src/dm-writeboost-daemon.c
+++ b/src/dm-writeboost-daemon.c
@@ -8,6 +8,8 @@
 #include "dm-writeboost-metadata.h"
 #include "dm-writeboost-daemon.h"
 
+#include <linux/rbtree.h>
+
 /*----------------------------------------------------------------*/
 
 static void update_barrier_deadline(struct wb_device *wb)
@@ -46,8 +48,7 @@ void flush_barrier_ios(struct work_struct *work)
 
 /*----------------------------------------------------------------*/
 
-static void
-process_deferred_barriers(struct wb_device *wb, struct flush_job *job)
+static void process_deferred_barriers(struct wb_device *wb, struct flush_job *job)
 {
    int r = 0;
    bool has_barrier = !bio_list_empty(&job->barrier_ios);
@@ -146,79 +147,121 @@ static void migrate_endio(unsigned long error, void *context)
        wake_up(&wb->migrate_io_wait_queue);
 }
 
-/*
- * asynchronously submit the segment data at position k in the migrate buffer.
- * batched migration first collects all the segments to migrate into a migrate buffer.
- * so, there are a number of segment data in the migrate buffer.
- * this function submits the one in position k.
- */
-static void submit_migrate_io(struct wb_device *wb, struct segment_header *seg,
-                size_t k)
+static void submit_migrate_io(struct wb_device *wb, struct migrate_io *mio)
 {
-  size_t a = wb->nr_caches_inseg * k;
-  void *p = wb->migrate_buffer + (wb->nr_caches_inseg << 12) * k;
-
-  u8 i;
-  for (i = 0; i < seg->length; i++) {
-      unsigned long offset = i << 12;
-      void *base = p + offset;
-
-      struct metablock *mb = seg->mb_array + i;
-      u8 dirty_bits = *(wb->memorized_dirtiness + (a + i));
-      if (!dirty_bits)
-          continue;
+   if (!mio->memorized_dirtiness)
+       return;
 
-      if (dirty_bits == 255) {
-          void *addr = base;
-          struct dm_io_request io_req_w = {
+   if (mio->memorized_dirtiness == 255) {
+       struct dm_io_request io_req_w = {
+           .client = wb_io_client,
+           .bi_rw = WRITE,
+           .notify.fn = migrate_endio,
+           .notify.context = wb,
+           .mem.type = DM_IO_VMA,
+           .mem.ptr.vma = mio->data,
+       };
+       struct dm_io_region region_w = {
+           .bdev = wb->backing_dev->bdev,
+           .sector = mio->sector,
+           .count = 1 << 3,
+       };
+       dm_safe_io(&io_req_w, 1, &region_w, NULL, false);
+   } else {
+       u8 i;
+       for (i = 0; i < 8; i++) {
+           struct dm_io_request io_req_w;
+           struct dm_io_region region_w;
+
+           bool bit_on = mio->memorized_dirtiness & (1 << i);
+           if (!bit_on)
+               continue;
+
+           io_req_w = (struct dm_io_request) {
                .client = wb_io_client,
                .bi_rw = WRITE,
                .notify.fn = migrate_endio,
                .notify.context = wb,
                .mem.type = DM_IO_VMA,
-              .mem.ptr.vma = addr,
+               .mem.ptr.vma = mio->data + (i << SECTOR_SHIFT),
            };
-          struct dm_io_region region_w = {
+           region_w = (struct dm_io_region) {
                .bdev = wb->backing_dev->bdev,
-              .sector = mb->sector,
-              .count = 1 << 3,
+               .sector = mio->sector + i,
+               .count = 1,
            };
            dm_safe_io(&io_req_w, 1, &region_w, NULL, false);
-      } else {
-          u8 j;
-          for (j = 0; j < 8; j++) {
-              struct dm_io_request io_req_w;
-              struct dm_io_region region_w;
-
-              void *addr = base + (j << SECTOR_SHIFT);
-              bool bit_on = dirty_bits & (1 << j);
-              if (!bit_on)
-                  continue;
-
-              io_req_w = (struct dm_io_request) {
-                  .client = wb_io_client,
-                  .bi_rw = WRITE,
-                  .notify.fn = migrate_endio,
-                  .notify.context = wb,
-                  .mem.type = DM_IO_VMA,
-                  .mem.ptr.vma = addr,
-              };
-              region_w = (struct dm_io_region) {
-                  .bdev = wb->backing_dev->bdev,
-                  .sector = mb->sector + j,
-                  .count = 1,
-              };
-              dm_safe_io(&io_req_w, 1, &region_w, NULL, false);
-          }
        }
    }
+
+}
+
+static void submit_migrate_ios(struct wb_device *wb)
+{
+   struct blk_plug plug;
+   struct rb_root mt = wb->migrate_tree;
+   blk_start_plug(&plug);
+   do {
+       struct migrate_io *mio = migrate_io_from_node(rb_first(&mt));
+       rb_erase(&mio->rb_node, &mt);
+       submit_migrate_io(wb, mio);
+   } while (!RB_EMPTY_ROOT(&mt));
+   blk_finish_plug(&plug);
+}
+
+bool compare_migrate_io(struct migrate_io *mio, struct migrate_io *pmio)
+{
+   BUG_ON(!mio);
+   BUG_ON(!pmio);
+   if (mio->sector < pmio->sector)
+       return true;
+   if (mio->id < pmio->id)
+       return true;
+   return false;
+}
+
+static void inc_migrate_io_count(u8 dirty_bits, size_t *migrate_io_count)
+{
+   u8 i;
+   if (!dirty_bits)
+       return;
+
+   if (dirty_bits == 255) {
+       (*migrate_io_count)++;
+   } else {
+       for (i = 0; i < 8; i++) {
+           if (dirty_bits & (1 << i))
+               (*migrate_io_count)++;
+       }
+   }
+}
+
+static void add_migrate_io(struct wb_device *wb, struct migrate_io *mio)
+{
+   struct rb_node **rbp, *parent;
+   rbp = &wb->migrate_tree.rb_node;
+   parent = NULL;
+   while (*rbp) {
+       struct migrate_io *pmio;
+       parent = *rbp;
+       pmio = migrate_io_from_node(parent);
+
+       if (compare_migrate_io(mio, pmio))
+           rbp = &(*rbp)->rb_left;
+       else
+           rbp = &(*rbp)->rb_right;
+   }
+   rb_link_node(&mio->rb_node, parent, rbp);
+   rb_insert_color(&mio->rb_node, &wb->migrate_tree);
 }
 
-static void memorize_data_to_migrate(struct wb_device *wb,
-                   struct segment_header *seg, size_t k)
+static void prepare_migrate_ios(struct wb_device *wb, struct segment_header *seg,
+               size_t k, size_t *migrate_io_count)
 {
    int r = 0;
 
+   u8 i;
+
    void *p = wb->migrate_buffer + (wb->nr_caches_inseg << 12) * k;
    struct dm_io_request io_req_r = {
        .client = wb_io_client,
@@ -233,60 +276,21 @@ static void memorize_data_to_migrate(struct wb_device *wb,
        .count = seg->length << 3,
    };
    IO(dm_safe_io(&io_req_r, 1, &region_r, NULL, false));
-}
-
-/*
- * we first memorize the dirtiness in the segments.
- * the memorized dirtiness is dirtier than that of any future moment
- * because it is only monotonously decreasing after flushed.
- * therefore, we will migrate the possible dirtiest state of the
- * segments which won't lose any dirty data.
- */
-static void memorize_metadata_to_migrate(struct wb_device *wb, struct segment_header *seg,
-                   size_t k, size_t *migrate_io_count)
-{
-  u8 i, j;
 
-  struct metablock *mb;
-  size_t a = wb->nr_caches_inseg * k;
-
-  /*
-   * we first memorize the dirtiness of the metablocks.
-   * dirtiness may decrease while we run through the migration code
-   * and it may cause corruption.
-   */
    for (i = 0; i < seg->length; i++) {
-      mb = seg->mb_array + i;
-      *(wb->memorized_dirtiness + (a + i)) = read_mb_dirtiness(wb, seg, mb);
-  }
-
-  for (i = 0; i < seg->length; i++) {
-      u8 dirty_bits = *(wb->memorized_dirtiness + (a + i));
+       struct metablock *mb = seg->mb_array + i;
 
-      if (!dirty_bits)
-          continue;
+       struct migrate_io *mio = wb->migrate_ios + (wb->nr_caches_inseg * k + i);
+       mio->memorized_dirtiness = read_mb_dirtiness(wb, seg, mb);
+       inc_migrate_io_count(mio->memorized_dirtiness, migrate_io_count);
+       mio->sector = mb->sector;
+       mio->data = p + (i << 12);
+       mio->id = seg->id;
 
-      if (dirty_bits == 255) {
-          (*migrate_io_count)++;
-      } else {
-          for (j = 0; j < 8; j++) {
-              if (dirty_bits & (1 << j))
-                  (*migrate_io_count)++;
-          }
-      }
+       add_migrate_io(wb, mio);
    }
 }
 
-/*
- * memorize the dirtiness and count up the number of io to migrate.
- */
-static void memorize_dirty_state(struct wb_device *wb, struct segment_header *seg,
-               size_t k, size_t *migrate_io_count)
-{
-  memorize_data_to_migrate(wb, seg, k);
-  memorize_metadata_to_migrate(wb, seg, k, migrate_io_count);
-}
-
 static void cleanup_segment(struct wb_device *wb, struct segment_header *seg)
 {
    u8 i;
@@ -299,48 +303,28 @@ static void cleanup_segment(struct wb_device *wb, struct segment_header *seg)
 static void transport_emigrates(struct wb_device *wb)
 {
    int r;
-  struct segment_header *seg;
    size_t k, migrate_io_count = 0;
 
-  for (k = 0; k < wb->num_emigrates; k++) {
-      seg = *(wb->emigrates + k);
-      memorize_dirty_state(wb, seg, k, &migrate_io_count);
-  }
+   wb->migrate_tree = RB_ROOT;
+
+   for (k = 0; k < wb->num_emigrates; k++)
+       prepare_migrate_ios(wb, *(wb->emigrates + k), k, &migrate_io_count);
 
-migrate_write:
    atomic_set(&wb->migrate_io_count, migrate_io_count);
    atomic_set(&wb->migrate_fail_count, 0);
 
-  for (k = 0; k < wb->num_emigrates; k++) {
-      seg = *(wb->emigrates + k);
-      submit_migrate_io(wb, seg, k);
-  }
+   submit_migrate_ios(wb);
 
-  LIVE_DEAD(
-      wait_event(wb->migrate_io_wait_queue,
-          !atomic_read(&wb->migrate_io_count))
-      ,
-      atomic_set(&wb->migrate_io_count, 0)
-  );
-
-  /*
-   * if one or more migrates are failed, retry
-   */
-  if (atomic_read(&wb->migrate_fail_count)) {
-      WBWARN("%u writebacks failed. retry",
-             atomic_read(&wb->migrate_fail_count));
-      goto migrate_write;
-  }
-  BUG_ON(atomic_read(&wb->migrate_io_count));
+   wait_event(wb->migrate_io_wait_queue, !atomic_read(&wb->migrate_io_count));
+   if (atomic_read(&wb->migrate_fail_count))
+       set_bit(WB_DEAD, &wb->flags);
 
    /*
     * we clean up the metablocks because there is no reason
     * to leave the them dirty.
     */
-  for (k = 0; k < wb->num_emigrates; k++) {
-      seg = *(wb->emigrates + k);
-      cleanup_segment(wb, seg);
-  }
+   for (k = 0; k < wb->num_emigrates; k++)
+       cleanup_segment(wb, *(wb->emigrates + k));
 
    /*
     * we must write back a segments if it was written persistently.
@@ -364,7 +348,7 @@ static u32 calc_nr_mig(struct wb_device *wb)
 
    nr_max_batch = ACCESS_ONCE(wb->nr_max_batched_migration);
    if (wb->nr_cur_batched_migration != nr_max_batch)
-      try_alloc_migration_buffer(wb, nr_max_batch);
+       try_alloc_migrate_ios(wb, nr_max_batch);
    return min(nr_mig_candidates, wb->nr_cur_batched_migration);
 }
 
diff --git a/src/dm-writeboost-metadata.c b/src/dm-writeboost-metadata.c
index 9a6fbb4..6885311 100644
--- a/src/dm-writeboost-metadata.c
+++ b/src/dm-writeboost-metadata.c
@@ -1503,13 +1503,13 @@ static int recover_cache(struct wb_device *wb)
  * bad User may set # of batches that can hardly allocate.
  * This function is robust in that case.
  */
-int try_alloc_migration_buffer(struct wb_device *wb, size_t nr_batch)
+int try_alloc_migrate_ios(struct wb_device *wb, size_t nr_batch)
 {
    int r = 0;
 
    struct segment_header **emigrates;
    void *buf;
-  void *memorized_dirtiness;
+   struct migrate_io *migrate_ios;
 
    emigrates = kmalloc(nr_batch * sizeof(struct segment_header *), GFP_KERNEL);
    if (!emigrates) {
@@ -1525,11 +1525,11 @@ int try_alloc_migration_buffer(struct wb_device *wb, size_t nr_batch)
        goto bad_alloc_buffer;
    }
 
-  memorized_dirtiness = kmalloc(nr_batch * wb->nr_caches_inseg, GFP_KERNEL);
-  if (!memorized_dirtiness) {
+   migrate_ios = kmalloc(nr_batch * wb->nr_caches_inseg * sizeof(struct migrate_io), GFP_KERNEL);
+   if (!migrate_ios) {
        WBERR("failed to allocate memorized dirtiness");
        r = -ENOMEM;
-      goto bad_alloc_memorized_dirtiness;
+       goto bad_alloc_migrate_ios;
    }
 
    /*
@@ -1538,21 +1538,21 @@ int try_alloc_migration_buffer(struct wb_device *wb, size_t nr_batch)
    kfree(wb->emigrates); /* kfree(NULL) is safe */
    if (wb->migrate_buffer)
        vfree(wb->migrate_buffer);
-  kfree(wb->memorized_dirtiness);
+   kfree(wb->migrate_ios);
 
    /*
     * swap by new values
     */
    wb->emigrates = emigrates;
    wb->migrate_buffer = buf;
-  wb->memorized_dirtiness = memorized_dirtiness;
+   wb->migrate_ios = migrate_ios;
    wb->nr_cur_batched_migration = nr_batch;
 
    return r;
 
 bad_alloc_buffer:
    kfree(wb->emigrates);
-bad_alloc_memorized_dirtiness:
+bad_alloc_migrate_ios:
    vfree(wb->migrate_buffer);
 
    return r;
@@ -1562,7 +1562,7 @@ static void free_migration_buffer(struct wb_device *wb)
 {
    kfree(wb->emigrates);
    vfree(wb->migrate_buffer);
-  kfree(wb->memorized_dirtiness);
+   kfree(wb->migrate_ios);
 }
 
 /*----------------------------------------------------------------*/
@@ -1632,7 +1632,7 @@ static int init_migrate_daemon(struct wb_device *wb)
     */
    nr_batch = 1 << (11 - wb->segment_size_order);
    wb->nr_max_batched_migration = nr_batch;
-  if (try_alloc_migration_buffer(wb, nr_batch))
+   if (try_alloc_migrate_ios(wb, nr_batch))
        return -ENOMEM;
 
    init_waitqueue_head(&wb->migrate_wait_queue);
diff --git a/src/dm-writeboost-metadata.h b/src/dm-writeboost-metadata.h
index 1053d54..3abbc63 100644
--- a/src/dm-writeboost-metadata.h
+++ b/src/dm-writeboost-metadata.h
@@ -40,7 +40,7 @@ u32 calc_checksum(void *rambuffer, u8 length);
 
 /*----------------------------------------------------------------*/
 
-int try_alloc_migration_buffer(struct wb_device *, size_t nr_batch);
+int try_alloc_migrate_ios(struct wb_device *, size_t nr_batch);
 
 /*----------------------------------------------------------------*/
 
diff --git a/src/dm-writeboost-target.c b/src/dm-writeboost-target.c
index 2db1477..9039054 100644
--- a/src/dm-writeboost-target.c
+++ b/src/dm-writeboost-target.c
@@ -149,8 +149,6 @@ static void plog_write_endio(unsigned long error, void *context)
 static void do_append_plog_t1(struct wb_device *wb, struct bio *bio,
                  struct write_job *job)
 {
-  int r = 0;
-
    struct dm_io_request io_req = {
        .client = wb_io_client,
        .bi_rw = WRITE,
@@ -230,8 +228,6 @@ static void wait_plog_writes_complete(struct wb_device *wb)
  */
 static void barrier_plog_writes(struct wb_device *wb)
 {
-  int r = 0;
-
    wait_plog_writes_complete(wb);
 
    /*
diff --git a/src/dm-writeboost.h b/src/dm-writeboost.h
index a625026..654459a 100644
--- a/src/dm-writeboost.h
+++ b/src/dm-writeboost.h
@@ -238,6 +238,19 @@ struct plog_meta_device {
 
 /*----------------------------------------------------------------*/
 
+struct migrate_io {
+   struct rb_node rb_node;
+
+   sector_t sector; /* key */
+   u64 id; /* key */
+
+   void *data;
+   u8 memorized_dirtiness;
+};
+#define migrate_io_from_node(node) rb_entry((node), struct migrate_io, rb_node)
+
+/*----------------------------------------------------------------*/
+
 enum STATFLAG {
    STAT_WRITE = 0, /* write or read */
    STAT_HIT, /* hit or miss */
@@ -401,10 +414,12 @@ struct wb_device {
    u32 nr_cur_batched_migration;
    u32 nr_max_batched_migration; /* tunable */
 
+   struct rb_root migrate_tree;
+
    u32 num_emigrates; /* number of emigrates */
-  struct segment_header **emigrates; /* Segments to be migrated */
-  void *migrate_buffer; /* memorizes the data blocks of the emigrates */
-  u8 *memorized_dirtiness; /* memorize the dirtiness of the metablocks to be migrated */
+   struct segment_header **emigrates; /* segments to be migrated */
+   void *migrate_buffer; /* the data blocks of the emigrates */
+   struct migrate_io *migrate_ios;
 
    /*---------------------------------------------*/
 

ソーティングされていることの検証. 実際にライトバックのアドレスをプリントして検証します.

コードは大体, 以下のような感じ

plug()
while (ios):
  submit_io(io)
unplug()

plugの境界でアドレスが連続しているのは, シーケンシャルライトだったけどセグメントが別になってしまったというパターンです. 注目するのは, 45312と270464の間です. 全く関係ないライトですが, ソートされてアドレス順に発行されています(もっとも, このケースであればどの道この順序だったと思いますが). sorted writebackが効果を発揮するのは, nr_max_batched_migrationが大きい時です. 出来るだけたくさんのセグメントを一気にライトバックすることにより, ソートの効果が上がります. ただし, あまり一気にライトバックするとbacking deviceが参ってしまい全体の性能が悪化する可能性があります(具体的にはリードが死にます). ここはチューニングのポイントです.

[  768.227708] device-mapper: writeboost: info@submit_migrate_ios() sector: 45184
[  768.228156] device-mapper: writeboost: info@submit_migrate_ios() sector: 45192
[  768.228548] device-mapper: writeboost: info@submit_migrate_ios() sector: 45200
[  768.228939] device-mapper: writeboost: info@submit_migrate_ios() sector: 45208
[  768.307586] device-mapper: writeboost: info@submit_migrate_ios() plug
[  768.308234] device-mapper: writeboost: info@submit_migrate_ios() sector: 45216
[  768.309091] device-mapper: writeboost: info@submit_migrate_ios() sector: 45224
[  768.309999] device-mapper: writeboost: info@submit_migrate_ios() sector: 45232
[  768.310886] device-mapper: writeboost: info@submit_migrate_ios() sector: 45240
[  768.311790] device-mapper: writeboost: info@submit_migrate_ios() sector: 45248
[  768.312749] device-mapper: writeboost: info@submit_migrate_ios() sector: 45256
[  768.313600] device-mapper: writeboost: info@submit_migrate_ios() sector: 45264
[  768.314505] device-mapper: writeboost: info@submit_migrate_ios() sector: 45272
[  768.315396] device-mapper: writeboost: info@submit_migrate_ios() sector: 45280
[  768.316324] device-mapper: writeboost: info@submit_migrate_ios() sector: 45288
[  768.317053] device-mapper: writeboost: info@submit_migrate_ios() sector: 45296
[  768.317726] device-mapper: writeboost: info@submit_migrate_ios() sector: 45304
[  768.318297] device-mapper: writeboost: info@submit_migrate_ios() sector: 45312
[  768.318678] device-mapper: writeboost: info@submit_migrate_ios() sector: 270464
[  768.319062] device-mapper: writeboost: info@submit_migrate_ios() sector: 270472
[  768.319447] device-mapper: writeboost: info@submit_migrate_ios() sector: 270480
[  768.319827] device-mapper: writeboost: info@submit_migrate_ios() sector: 270488
[  768.320261] device-mapper: writeboost: info@submit_migrate_ios() sector: 270496