summaryrefslogtreecommitdiffstats
path: root/fs/logfs/gc.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/logfs/gc.c')
-rw-r--r--fs/logfs/gc.c732
1 files changed, 0 insertions, 732 deletions
diff --git a/fs/logfs/gc.c b/fs/logfs/gc.c
deleted file mode 100644
index d4efb06..0000000
--- a/fs/logfs/gc.c
+++ /dev/null
@@ -1,732 +0,0 @@
-/*
- * fs/logfs/gc.c - garbage collection code
- *
- * As should be obvious for Linux kernel code, license is GPLv2
- *
- * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
- */
-#include "logfs.h"
-#include <linux/sched.h>
-#include <linux/slab.h>
-
-/*
- * Wear leveling needs to kick in when the difference between low erase
- * counts and high erase counts gets too big. A good value for "too big"
- * may be somewhat below 10% of maximum erase count for the device.
- * Why not 397, to pick a nice round number with no specific meaning? :)
- *
- * WL_RATELIMIT is the minimum time between two wear level events. A huge
- * number of segments may fulfil the requirements for wear leveling at the
- * same time. If that happens we don't want to cause a latency from hell,
- * but just gently pick one segment every so often and minimize overhead.
- */
-#define WL_DELTA 397
-#define WL_RATELIMIT 100
-#define MAX_OBJ_ALIASES 2600
-#define SCAN_RATIO 512 /* number of scanned segments per gc'd segment */
-#define LIST_SIZE 64 /* base size of candidate lists */
-#define SCAN_ROUNDS 128 /* maximum number of complete medium scans */
-#define SCAN_ROUNDS_HIGH 4 /* maximum number of higher-level scans */
-
-static int no_free_segments(struct super_block *sb)
-{
- struct logfs_super *super = logfs_super(sb);
-
- return super->s_free_list.count;
-}
-
-/* journal has distance -1, top-most ifile layer distance 0 */
-static u8 root_distance(struct super_block *sb, gc_level_t __gc_level)
-{
- struct logfs_super *super = logfs_super(sb);
- u8 gc_level = (__force u8)__gc_level;
-
- switch (gc_level) {
- case 0: /* fall through */
- case 1: /* fall through */
- case 2: /* fall through */
- case 3:
- /* file data or indirect blocks */
- return super->s_ifile_levels + super->s_iblock_levels - gc_level;
- case 6: /* fall through */
- case 7: /* fall through */
- case 8: /* fall through */
- case 9:
- /* inode file data or indirect blocks */
- return super->s_ifile_levels - (gc_level - 6);
- default:
- printk(KERN_ERR"LOGFS: segment of unknown level %x found\n",
- gc_level);
- WARN_ON(1);
- return super->s_ifile_levels + super->s_iblock_levels;
- }
-}
-
-static int segment_is_reserved(struct super_block *sb, u32 segno)
-{
- struct logfs_super *super = logfs_super(sb);
- struct logfs_area *area;
- void *reserved;
- int i;
-
- /* Some segments are reserved. Just pretend they were all valid */
- reserved = btree_lookup32(&super->s_reserved_segments, segno);
- if (reserved)
- return 1;
-
- /* Currently open segments */
- for_each_area(i) {
- area = super->s_area[i];
- if (area->a_is_open && area->a_segno == segno)
- return 1;
- }
-
- return 0;
-}
-
-static void logfs_mark_segment_bad(struct super_block *sb, u32 segno)
-{
- BUG();
-}
-
-/*
- * Returns the bytes consumed by valid objects in this segment. Object headers
- * are counted, the segment header is not.
- */
-static u32 logfs_valid_bytes(struct super_block *sb, u32 segno, u32 *ec,
- gc_level_t *gc_level)
-{
- struct logfs_segment_entry se;
- u32 ec_level;
-
- logfs_get_segment_entry(sb, segno, &se);
- if (se.ec_level == cpu_to_be32(BADSEG) ||
- se.valid == cpu_to_be32(RESERVED))
- return RESERVED;
-
- ec_level = be32_to_cpu(se.ec_level);
- *ec = ec_level >> 4;
- *gc_level = GC_LEVEL(ec_level & 0xf);
- return be32_to_cpu(se.valid);
-}
-
-static void logfs_cleanse_block(struct super_block *sb, u64 ofs, u64 ino,
- u64 bix, gc_level_t gc_level)
-{
- struct inode *inode;
- int err, cookie;
-
- inode = logfs_safe_iget(sb, ino, &cookie);
- err = logfs_rewrite_block(inode, bix, ofs, gc_level, 0);
- BUG_ON(err);
- logfs_safe_iput(inode, cookie);
-}
-
-static u32 logfs_gc_segment(struct super_block *sb, u32 segno)
-{
- struct logfs_super *super = logfs_super(sb);
- struct logfs_segment_header sh;
- struct logfs_object_header oh;
- u64 ofs, ino, bix;
- u32 seg_ofs, logical_segno, cleaned = 0;
- int err, len, valid;
- gc_level_t gc_level;
-
- LOGFS_BUG_ON(segment_is_reserved(sb, segno), sb);
-
- btree_insert32(&super->s_reserved_segments, segno, (void *)1, GFP_NOFS);
- err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh);
- BUG_ON(err);
- gc_level = GC_LEVEL(sh.level);
- logical_segno = be32_to_cpu(sh.segno);
- if (sh.crc != logfs_crc32(&sh, sizeof(sh), 4)) {
- logfs_mark_segment_bad(sb, segno);
- cleaned = -1;
- goto out;
- }
-
- for (seg_ofs = LOGFS_SEGMENT_HEADERSIZE;
- seg_ofs + sizeof(oh) < super->s_segsize; ) {
- ofs = dev_ofs(sb, logical_segno, seg_ofs);
- err = wbuf_read(sb, dev_ofs(sb, segno, seg_ofs), sizeof(oh),
- &oh);
- BUG_ON(err);
-
- if (!memchr_inv(&oh, 0xff, sizeof(oh)))
- break;
-
- if (oh.crc != logfs_crc32(&oh, sizeof(oh) - 4, 4)) {
- logfs_mark_segment_bad(sb, segno);
- cleaned = super->s_segsize - 1;
- goto out;
- }
-
- ino = be64_to_cpu(oh.ino);
- bix = be64_to_cpu(oh.bix);
- len = sizeof(oh) + be16_to_cpu(oh.len);
- valid = logfs_is_valid_block(sb, ofs, ino, bix, gc_level);
- if (valid == 1) {
- logfs_cleanse_block(sb, ofs, ino, bix, gc_level);
- cleaned += len;
- } else if (valid == 2) {
- /* Will be invalid upon journal commit */
- cleaned += len;
- }
- seg_ofs += len;
- }
-out:
- btree_remove32(&super->s_reserved_segments, segno);
- return cleaned;
-}
-
-static struct gc_candidate *add_list(struct gc_candidate *cand,
- struct candidate_list *list)
-{
- struct rb_node **p = &list->rb_tree.rb_node;
- struct rb_node *parent = NULL;
- struct gc_candidate *cur;
- int comp;
-
- cand->list = list;
- while (*p) {
- parent = *p;
- cur = rb_entry(parent, struct gc_candidate, rb_node);
-
- if (list->sort_by_ec)
- comp = cand->erase_count < cur->erase_count;
- else
- comp = cand->valid < cur->valid;
-
- if (comp)
- p = &parent->rb_left;
- else
- p = &parent->rb_right;
- }
- rb_link_node(&cand->rb_node, parent, p);
- rb_insert_color(&cand->rb_node, &list->rb_tree);
-
- if (list->count <= list->maxcount) {
- list->count++;
- return NULL;
- }
- cand = rb_entry(rb_last(&list->rb_tree), struct gc_candidate, rb_node);
- rb_erase(&cand->rb_node, &list->rb_tree);
- cand->list = NULL;
- return cand;
-}
-
-static void remove_from_list(struct gc_candidate *cand)
-{
- struct candidate_list *list = cand->list;
-
- rb_erase(&cand->rb_node, &list->rb_tree);
- list->count--;
-}
-
-static void free_candidate(struct super_block *sb, struct gc_candidate *cand)
-{
- struct logfs_super *super = logfs_super(sb);
-
- btree_remove32(&super->s_cand_tree, cand->segno);
- kfree(cand);
-}
-
-u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec)
-{
- struct gc_candidate *cand;
- u32 segno;
-
- BUG_ON(list->count == 0);
-
- cand = rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node);
- remove_from_list(cand);
- segno = cand->segno;
- if (ec)
- *ec = cand->erase_count;
- free_candidate(sb, cand);
- return segno;
-}
-
-/*
- * We have several lists to manage segments with. The reserve_list is used to
- * deal with bad blocks. We try to keep the best (lowest ec) segments on this
- * list.
- * The free_list contains free segments for normal usage. It usually gets the
- * second pick after the reserve_list. But when the free_list is running short
- * it is more important to keep the free_list full than to keep a reserve.
- *
- * Segments that are not free are put onto a per-level low_list. If we have
- * to run garbage collection, we pick a candidate from there. All segments on
- * those lists should have at least some free space so GC will make progress.
- *
- * And last we have the ec_list, which is used to pick segments for wear
- * leveling.
- *
- * If all appropriate lists are full, we simply free the candidate and forget
- * about that segment for a while. We have better candidates for each purpose.
- */
-static void __add_candidate(struct super_block *sb, struct gc_candidate *cand)
-{
- struct logfs_super *super = logfs_super(sb);
- u32 full = super->s_segsize - LOGFS_SEGMENT_RESERVE;
-
- if (cand->valid == 0) {
- /* 100% free segments */
- log_gc_noisy("add reserve segment %x (ec %x) at %llx\n",
- cand->segno, cand->erase_count,
- dev_ofs(sb, cand->segno, 0));
- cand = add_list(cand, &super->s_reserve_list);
- if (cand) {
- log_gc_noisy("add free segment %x (ec %x) at %llx\n",
- cand->segno, cand->erase_count,
- dev_ofs(sb, cand->segno, 0));
- cand = add_list(cand, &super->s_free_list);
- }
- } else {
- /* good candidates for Garbage Collection */
- if (cand->valid < full)
- cand = add_list(cand, &super->s_low_list[cand->dist]);
- /* good candidates for wear leveling,
- * segments that were recently written get ignored */
- if (cand)
- cand = add_list(cand, &super->s_ec_list);
- }
- if (cand)
- free_candidate(sb, cand);
-}
-
-static int add_candidate(struct super_block *sb, u32 segno, u32 valid, u32 ec,
- u8 dist)
-{
- struct logfs_super *super = logfs_super(sb);
- struct gc_candidate *cand;
-
- cand = kmalloc(sizeof(*cand), GFP_NOFS);
- if (!cand)
- return -ENOMEM;
-
- cand->segno = segno;
- cand->valid = valid;
- cand->erase_count = ec;
- cand->dist = dist;
-
- btree_insert32(&super->s_cand_tree, segno, cand, GFP_NOFS);
- __add_candidate(sb, cand);
- return 0;
-}
-
-static void remove_segment_from_lists(struct super_block *sb, u32 segno)
-{
- struct logfs_super *super = logfs_super(sb);
- struct gc_candidate *cand;
-
- cand = btree_lookup32(&super->s_cand_tree, segno);
- if (cand) {
- remove_from_list(cand);
- free_candidate(sb, cand);
- }
-}
-
-static void scan_segment(struct super_block *sb, u32 segno)
-{
- u32 valid, ec = 0;
- gc_level_t gc_level = 0;
- u8 dist;
-
- if (segment_is_reserved(sb, segno))
- return;
-
- remove_segment_from_lists(sb, segno);
- valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
- if (valid == RESERVED)
- return;
-
- dist = root_distance(sb, gc_level);
- add_candidate(sb, segno, valid, ec, dist);
-}
-
-static struct gc_candidate *first_in_list(struct candidate_list *list)
-{
- if (list->count == 0)
- return NULL;
- return rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node);
-}
-
-/*
- * Find the best segment for garbage collection. Main criterion is
- * the segment requiring the least effort to clean. Secondary
- * criterion is to GC on the lowest level available.
- *
- * So we search the least effort segment on the lowest level first,
- * then move up and pick another segment iff is requires significantly
- * less effort. Hence the LOGFS_MAX_OBJECTSIZE in the comparison.
- */
-static struct gc_candidate *get_candidate(struct super_block *sb)
-{
- struct logfs_super *super = logfs_super(sb);
- int i, max_dist;
- struct gc_candidate *cand = NULL, *this;
-
- max_dist = min(no_free_segments(sb), LOGFS_NO_AREAS - 1);
-
- for (i = max_dist; i >= 0; i--) {
- this = first_in_list(&super->s_low_list[i]);
- if (!this)
- continue;
- if (!cand)
- cand = this;
- if (this->valid + LOGFS_MAX_OBJECTSIZE <= cand->valid)
- cand = this;
- }
- return cand;
-}
-
-static int __logfs_gc_once(struct super_block *sb, struct gc_candidate *cand)
-{
- struct logfs_super *super = logfs_super(sb);
- gc_level_t gc_level;
- u32 cleaned, valid, segno, ec;
- u8 dist;
-
- if (!cand) {
- log_gc("GC attempted, but no candidate found\n");
- return 0;
- }
-
- segno = cand->segno;
- dist = cand->dist;
- valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
- free_candidate(sb, cand);
- log_gc("GC segment #%02x at %llx, %x required, %x free, %x valid, %llx free\n",
- segno, (u64)segno << super->s_segshift,
- dist, no_free_segments(sb), valid,
- super->s_free_bytes);
- cleaned = logfs_gc_segment(sb, segno);
- log_gc("GC segment #%02x complete - now %x valid\n", segno,
- valid - cleaned);
- BUG_ON(cleaned != valid);
- return 1;
-}
-
-static int logfs_gc_once(struct super_block *sb)
-{
- struct gc_candidate *cand;
-
- cand = get_candidate(sb);
- if (cand)
- remove_from_list(cand);
- return __logfs_gc_once(sb, cand);
-}
-
-/* returns 1 if a wrap occurs, 0 otherwise */
-static int logfs_scan_some(struct super_block *sb)
-{
- struct logfs_super *super = logfs_super(sb);
- u32 segno;
- int i, ret = 0;
-
- segno = super->s_sweeper;
- for (i = SCAN_RATIO; i > 0; i--) {
- segno++;
- if (segno >= super->s_no_segs) {
- segno = 0;
- ret = 1;
- /* Break out of the loop. We want to read a single
- * block from the segment size on next invocation if
- * SCAN_RATIO is set to match block size
- */
- break;
- }
-
- scan_segment(sb, segno);
- }
- super->s_sweeper = segno;
- return ret;
-}
-
-/*
- * In principle, this function should loop forever, looking for GC candidates
- * and moving data. LogFS is designed in such a way that this loop is
- * guaranteed to terminate.
- *
- * Limiting the loop to some iterations serves purely to catch cases when
- * these guarantees have failed. An actual endless loop is an obvious bug
- * and should be reported as such.
- */
-static void __logfs_gc_pass(struct super_block *sb, int target)
-{
- struct logfs_super *super = logfs_super(sb);
- struct logfs_block *block;
- int round, progress, last_progress = 0;
-
- /*
- * Doing too many changes to the segfile at once would result
- * in a large number of aliases. Write the journal before
- * things get out of hand.
- */
- if (super->s_shadow_tree.no_shadowed_segments >= MAX_OBJ_ALIASES)
- logfs_write_anchor(sb);
-
- if (no_free_segments(sb) >= target &&
- super->s_no_object_aliases < MAX_OBJ_ALIASES)
- return;
-
- log_gc("__logfs_gc_pass(%x)\n", target);
- for (round = 0; round < SCAN_ROUNDS; ) {
- if (no_free_segments(sb) >= target)
- goto write_alias;
-
- /* Sync in-memory state with on-medium state in case they
- * diverged */
- logfs_write_anchor(sb);
- round += logfs_scan_some(sb);
- if (no_free_segments(sb) >= target)
- goto write_alias;
- progress = logfs_gc_once(sb);
- if (progress)
- last_progress = round;
- else if (round - last_progress > 2)
- break;
- continue;
-
- /*
- * The goto logic is nasty, I just don't know a better way to
- * code it. GC is supposed to ensure two things:
- * 1. Enough free segments are available.
- * 2. The number of aliases is bounded.
- * When 1. is achieved, we take a look at 2. and write back
- * some alias-containing blocks, if necessary. However, after
- * each such write we need to go back to 1., as writes can
- * consume free segments.
- */
-write_alias:
- if (super->s_no_object_aliases < MAX_OBJ_ALIASES)
- return;
- if (list_empty(&super->s_object_alias)) {
- /* All aliases are still in btree */
- return;
- }
- log_gc("Write back one alias\n");
- block = list_entry(super->s_object_alias.next,
- struct logfs_block, alias_list);
- block->ops->write_block(block);
- /*
- * To round off the nasty goto logic, we reset round here. It
- * is a safety-net for GC not making any progress and limited
- * to something reasonably small. If incremented it for every
- * single alias, the loop could terminate rather quickly.
- */
- round = 0;
- }
- LOGFS_BUG(sb);
-}
-
-static int wl_ratelimit(struct super_block *sb, u64 *next_event)
-{
- struct logfs_super *super = logfs_super(sb);
-
- if (*next_event < super->s_gec) {
- *next_event = super->s_gec + WL_RATELIMIT;
- return 0;
- }
- return 1;
-}
-
-static void logfs_wl_pass(struct super_block *sb)
-{
- struct logfs_super *super = logfs_super(sb);
- struct gc_candidate *wl_cand, *free_cand;
-
- if (wl_ratelimit(sb, &super->s_wl_gec_ostore))
- return;
-
- wl_cand = first_in_list(&super->s_ec_list);
- if (!wl_cand)
- return;
- free_cand = first_in_list(&super->s_free_list);
- if (!free_cand)
- return;
-
- if (wl_cand->erase_count < free_cand->erase_count + WL_DELTA) {
- remove_from_list(wl_cand);
- __logfs_gc_once(sb, wl_cand);
- }
-}
-
-/*
- * The journal needs wear leveling as well. But moving the journal is an
- * expensive operation so we try to avoid it as much as possible. And if we
- * have to do it, we move the whole journal, not individual segments.
- *
- * Ratelimiting is not strictly necessary here, it mainly serves to avoid the
- * calculations. First we check whether moving the journal would be a
- * significant improvement. That means that a) the current journal segments
- * have more wear than the future journal segments and b) the current journal
- * segments have more wear than normal ostore segments.
- * Rationale for b) is that we don't have to move the journal if it is aging
- * less than the ostore, even if the reserve segments age even less (they are
- * excluded from wear leveling, after all).
- * Next we check that the superblocks have less wear than the journal. Since
- * moving the journal requires writing the superblocks, we have to protect the
- * superblocks even more than the journal.
- *
- * Also we double the acceptable wear difference, compared to ostore wear
- * leveling. Journal data is read and rewritten rapidly, comparatively. So
- * soft errors have much less time to accumulate and we allow the journal to
- * be a bit worse than the ostore.
- */
-static void logfs_journal_wl_pass(struct super_block *sb)
-{
- struct logfs_super *super = logfs_super(sb);
- struct gc_candidate *cand;
- u32 min_journal_ec = -1, max_reserve_ec = 0;
- int i;
-
- if (wl_ratelimit(sb, &super->s_wl_gec_journal))
- return;
-
- if (super->s_reserve_list.count < super->s_no_journal_segs) {
- /* Reserve is not full enough to move complete journal */
- return;
- }
-
- journal_for_each(i)
- if (super->s_journal_seg[i])
- min_journal_ec = min(min_journal_ec,
- super->s_journal_ec[i]);
- cand = rb_entry(rb_first(&super->s_free_list.rb_tree),
- struct gc_candidate, rb_node);
- max_reserve_ec = cand->erase_count;
- for (i = 0; i < 2; i++) {
- struct logfs_segment_entry se;
- u32 segno = seg_no(sb, super->s_sb_ofs[i]);
- u32 ec;
-
- logfs_get_segment_entry(sb, segno, &se);
- ec = be32_to_cpu(se.ec_level) >> 4;
- max_reserve_ec = max(max_reserve_ec, ec);
- }
-
- if (min_journal_ec > max_reserve_ec + 2 * WL_DELTA) {
- do_logfs_journal_wl_pass(sb);
- }
-}
-
-void logfs_gc_pass(struct super_block *sb)
-{
- struct logfs_super *super = logfs_super(sb);
-
- //BUG_ON(mutex_trylock(&logfs_super(sb)->s_w_mutex));
- /* Write journal before free space is getting saturated with dirty
- * objects.
- */
- if (super->s_dirty_used_bytes + super->s_dirty_free_bytes
- + LOGFS_MAX_OBJECTSIZE >= super->s_free_bytes)
- logfs_write_anchor(sb);
- __logfs_gc_pass(sb, super->s_total_levels);
- logfs_wl_pass(sb);
- logfs_journal_wl_pass(sb);
-}
-
-static int check_area(struct super_block *sb, int i)
-{
- struct logfs_super *super = logfs_super(sb);
- struct logfs_area *area = super->s_area[i];
- gc_level_t gc_level;
- u32 cleaned, valid, ec;
- u32 segno = area->a_segno;
- u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);
-
- if (!area->a_is_open)
- return 0;
-
- if (super->s_devops->can_write_buf(sb, ofs) == 0)
- return 0;
-
- printk(KERN_INFO"LogFS: Possibly incomplete write at %llx\n", ofs);
- /*
- * The device cannot write back the write buffer. Most likely the
- * wbuf was already written out and the system crashed at some point
- * before the journal commit happened. In that case we wouldn't have
- * to do anything. But if the crash happened before the wbuf was
- * written out correctly, we must GC this segment. So assume the
- * worst and always do the GC run.
- */
- area->a_is_open = 0;
- valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
- cleaned = logfs_gc_segment(sb, segno);
- if (cleaned != valid)
- return -EIO;
- return 0;
-}
-
-int logfs_check_areas(struct super_block *sb)
-{
- int i, err;
-
- for_each_area(i) {
- err = check_area(sb, i);
- if (err)
- return err;
- }
- return 0;
-}
-
-static void logfs_init_candlist(struct candidate_list *list, int maxcount,
- int sort_by_ec)
-{
- list->count = 0;
- list->maxcount = maxcount;
- list->sort_by_ec = sort_by_ec;
- list->rb_tree = RB_ROOT;
-}
-
-int logfs_init_gc(struct super_block *sb)
-{
- struct logfs_super *super = logfs_super(sb);
- int i;
-
- btree_init_mempool32(&super->s_cand_tree, super->s_btree_pool);
- logfs_init_candlist(&super->s_free_list, LIST_SIZE + SCAN_RATIO, 1);
- logfs_init_candlist(&super->s_reserve_list,
- super->s_bad_seg_reserve, 1);
- for_each_area(i)
- logfs_init_candlist(&super->s_low_list[i], LIST_SIZE, 0);
- logfs_init_candlist(&super->s_ec_list, LIST_SIZE, 1);
- return 0;
-}
-
-static void logfs_cleanup_list(struct super_block *sb,
- struct candidate_list *list)
-{
- struct gc_candidate *cand;
-
- while (list->count) {
- cand = rb_entry(list->rb_tree.rb_node, struct gc_candidate,
- rb_node);
- remove_from_list(cand);
- free_candidate(sb, cand);
- }
- BUG_ON(list->rb_tree.rb_node);
-}
-
-void logfs_cleanup_gc(struct super_block *sb)
-{
- struct logfs_super *super = logfs_super(sb);
- int i;
-
- if (!super->s_free_list.count)
- return;
-
- /*
- * FIXME: The btree may still contain a single empty node. So we
- * call the grim visitor to clean up that mess. Btree code should
- * do it for us, really.
- */
- btree_grim_visitor32(&super->s_cand_tree, 0, NULL);
- logfs_cleanup_list(sb, &super->s_free_list);
- logfs_cleanup_list(sb, &super->s_reserve_list);
- for_each_area(i)
- logfs_cleanup_list(sb, &super->s_low_list[i]);
- logfs_cleanup_list(sb, &super->s_ec_list);
-}
OpenPOWER on IntegriCloud