Nothing has used the EboFS object storage in years.
Fix usage string of dupstore.
FileJournal and Journal are now part of OSD proper, not
EboFS.
The old scripts in src/jobs are probably broken already, but this
commit does not touch them further.
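For reference, a dupstore invocation with the corrected usage string
looks like this (a sketch only; the store paths are illustrative and
not taken from this change):

    dupstore filestore /srv/ceph/osd.0 filestore /srv/ceph/osd.0.copy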
Signed-off-by: Tommi Virtanen <tommi.virtanen@dreamhost.com>
OPTION(debug_objectcacher, OPT_INT, 0),
OPTION(debug_client, OPT_INT, 0),
OPTION(debug_osd, OPT_INT, 0),
- OPTION(debug_ebofs, OPT_INT, 1),
OPTION(debug_filestore, OPT_INT, 1),
OPTION(debug_journal, OPT_INT, 1),
OPTION(debug_bdev, OPT_INT, 1), // block device
OPTION(filestore_queue_committing_max_bytes, OPT_INT, 100 << 20), // "
OPTION(filestore_op_threads, OPT_INT, 2),
OPTION(filestore_commit_timeout, OPT_FLOAT, 600),
- OPTION(ebofs, OPT_BOOL, false),
- OPTION(ebofs_cloneable, OPT_BOOL, true),
- OPTION(ebofs_verify, OPT_BOOL, false),
- OPTION(ebofs_commit_ms, OPT_INT, 200), // 0 = no forced commit timeout (for debugging/tracing)
- OPTION(ebofs_oc_size, OPT_INT, 10000), // onode cache
- OPTION(ebofs_cc_size, OPT_INT, 10000), // cnode cache
- OPTION(ebofs_bc_size, OPT_U64, 50*256), // 4k blocks, *256 for MB
- OPTION(ebofs_bc_max_dirty, OPT_U64, 30*256), // before write() will block
- OPTION(ebofs_max_prefetch, OPT_INT, 1000), // 4k blocks
- OPTION(ebofs_realloc, OPT_BOOL, false), // hrm, this can cause bad fragmentation, don't use!
- OPTION(ebofs_verify_csum_on_read, OPT_BOOL, true),
OPTION(journal_dio, OPT_BOOL, true),
OPTION(journal_block_align, OPT_BOOL, true),
OPTION(journal_max_write_bytes, OPT_INT, 10 << 20),
int debug_objectcacher;
int debug_client;
int debug_osd;
- int debug_ebofs;
int debug_filestore;
int debug_journal;
int debug_bdev;
int filestore_op_threads;
float filestore_commit_timeout;
- // ebofs
- bool ebofs;
- bool ebofs_cloneable;
- bool ebofs_verify;
- int ebofs_commit_ms;
- int ebofs_oc_size;
- int ebofs_cc_size;
- uint64_t ebofs_bc_size;
- uint64_t ebofs_bc_max_dirty;
- int ebofs_max_prefetch;
- bool ebofs_realloc;
- bool ebofs_verify_csum_on_read;
-
// journal
bool journal_dio;
bool journal_block_align;
*/
#include <iostream>
-//#include "ebofs/Ebofs.h"
#include "os/FileStore.h"
#include "common/ceph_argparse.h"
#include "common/common_init.h"
void usage()
{
- cerr << "usage: dup.ebofs (ebofs|fakestore) src (ebofs|fakestore) dst" << std::endl;
+ cerr << "usage: dupstore filestore SRC filestore DST" << std::endl;
exit(0);
}
ObjectStore *src = 0, *dst = 0;
- //if (strcmp(args[0], "ebofs") == 0)
- //src = new Ebofs(args[1]);
- //else
if (strcmp(args[0], "filestore") == 0)
src = new FileStore(args[1], NULL);
else usage();
- //if (strcmp(args[2], "ebofs") == 0)
- //dst = new Ebofs(args[3]);
- //else
if (strcmp(args[2], "filestore") == 0)
dst = new FileStore(args[3], NULL);
else usage();
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-
-
-#include "common/config.h"
-#include "Allocator.h"
-#include "Ebofs.h"
-
-
-#define DOUT_SUBSYS ebofs
-#undef dout_prefix
-#define dout_prefix *_dout << "ebofs(" << fs->dev.get_device_name() << ").allocator."
-
-
-void Allocator::dump_freelist()
-{
- if (1) {
- interval_set<block_t> free; // validate too
-
- block_t n = 0;
- for (int b=0; b<=EBOFS_NUM_FREE_BUCKETS; b++) {
- Table<block_t,block_t> *tab;
- if (b < EBOFS_NUM_FREE_BUCKETS) {
- tab = fs->free_tab[b];
- dout(0) << "dump bucket " << b << " " << tab->get_num_keys() << dendl;
- } else {
- tab = fs->limbo_tab;
- dout(0) << "dump limbo " << tab->get_num_keys() << dendl;;
- }
-
- if (tab->get_num_keys() > 0) {
- Table<block_t,block_t>::Cursor cursor(tab);
- assert(tab->find(0, cursor) >= 0);
- while (1) {
- dout(0) << "dump ex " << cursor.current().key << "~" << cursor.current().value << dendl;
- assert(cursor.current().value > 0);
-
- if (b < EBOFS_NUM_FREE_BUCKETS)
- n += cursor.current().value;
-
- if (free.contains( cursor.current().key, cursor.current().value ))
- dout(0) << "dump bad " << cursor.current().key << "~" << cursor.current().value << dendl;
- assert(!free.contains( cursor.current().key, cursor.current().value ));
- free.insert( cursor.current().key, cursor.current().value );
- if (cursor.move_right() <= 0) break;
- }
- } else {
- //dout(0) << " empty" << dendl;
- }
- }
-
- assert(n == fs->free_blocks);
- dout(0) << "dump combined freelist is " << free << dendl;
-
-
- // alloc_tab
- if (fs->alloc_tab->get_num_keys() > 0) {
- Table<block_t,pair<block_t,int> >::Cursor cursor(fs->alloc_tab);
- assert(fs->alloc_tab->find(0, cursor) >= 0);
- while (1) {
- dout(0) << "alloc ex " << cursor.current().key << "~" << cursor.current().value.first << " ref "
- << cursor.current().value.second
- << dendl;
- assert(cursor.current().value.first > 0);
-
- if (cursor.move_right() <= 0) break;
- }
- }
- }
-}
-
-
-int Allocator::find(extent_t& ex, int bucket, block_t num, block_t near, int dir)
-{
- Table<block_t,block_t>::Cursor cursor(fs->free_tab[bucket]);
- bool found = false;
-
- if ((dir == DIR_ANY || dir == DIR_FWD) &&
- fs->free_tab[bucket]->find( near, cursor ) >= 0) {
- // look to the right
- do {
- if (cursor.current().value >= num)
- found = true;
- } while (!found && cursor.move_right() > 0);
- }
-
- if ((dir == DIR_ANY || dir == DIR_BACK) &&
- !found) {
- // look to the left
- fs->free_tab[bucket]->find( near, cursor );
-
- while (!found && cursor.move_left() >= 0)
- if (cursor.current().value >= num)
- found = true;
- }
-
- if (found) {
- ex.start = cursor.current().key;
- ex.length = cursor.current().value;
- return 0;
- }
-
- return -1;
-}
-
-int Allocator::allocate(extent_t& ex, block_t num, block_t near)
-{
- //dump_freelist();
-
- int dir = DIR_ANY; // no dir
- if (near == NEAR_LAST_FWD) {
- near = last_pos;
- dir = DIR_FWD; // fwd
- }
- else if (near == NEAR_LAST)
- near = last_pos;
-
- int bucket;
-
- while (1) { // try twice, if fwd = true
-
- // look for contiguous extent
- for (bucket = pick_bucket(num); bucket < EBOFS_NUM_FREE_BUCKETS; bucket++) {
- if (find(ex, bucket, num, near, dir) >= 0) {
- // yay!
-
- // remove original
- fs->free_tab[bucket]->remove( ex.start );
- fs->free_blocks -= ex.length;
-
- if (ex.length > num) {
- if (ex.start < near) {
- // to the left
- if (ex.start + ex.length - num <= near) {
- // by a lot. take right-most portion.
- extent_t left;
- left.start = ex.start;
- left.length = ex.length - num;
- ex.start += left.length;
- ex.length -= left.length;
- assert(ex.length == num);
- _release_loner(left);
- } else {
- // take middle part.
- extent_t left,right;
- left.start = ex.start;
- left.length = near - ex.start;
- ex.start = near;
- right.start = ex.start + num;
- right.length = ex.length - left.length - num;
- ex.length = num;
- _release_loner(left);
- _release_loner(right);
- }
- }
- else {
- // to the right. take left-most part.
- extent_t right;
- right.start = ex.start + num;
- right.length = ex.length - num;
- ex.length = num;
- _release_loner(right);
- }
- }
-
- dout(20) << "allocate " << ex << " near " << near << dendl;
- last_pos = ex.end();
- //dump_freelist();
- if (g_conf.ebofs_cloneable)
- alloc_inc(ex);
- return num;
- }
- }
-
- if (dir == DIR_BACK || dir == DIR_ANY) break;
- dir = DIR_BACK;
- }
-
- // ok, find partial extent instead.
- for (block_t trysize = num/2; trysize >= 1; trysize /= 2) {
- int bucket = pick_bucket(trysize);
- if (find(ex, bucket, trysize, near) >= 0) {
- // yay!
- assert(ex.length < num);
-
- fs->free_tab[bucket]->remove(ex.start);
- fs->free_blocks -= ex.length;
- last_pos = ex.end();
- dout(20) << "allocate partial " << ex << " (wanted " << num << ") near " << near << dendl;
- //dump_freelist();
- if (g_conf.ebofs_cloneable)
- alloc_inc(ex);
- return ex.length;
- }
- }
-
- dout(1) << "allocate failed, fs completely full! " << fs->free_blocks << dendl;
- assert(0);
- //dump_freelist();
- return -1;
-}
-
-int Allocator::_release_into_limbo(extent_t& ex)
-{
- dout(10) << "_release_into_limbo " << ex << dendl;
- dout(10) << "limbo is " << limbo << dendl;
- assert(ex.length > 0);
- limbo.insert(ex.start, ex.length);
- fs->limbo_blocks += ex.length;
- return 0;
-}
-
-int Allocator::release(extent_t& ex)
-{
- if (g_conf.ebofs_cloneable)
- return alloc_dec(ex);
-
- _release_into_limbo(ex);
- return 0;
-}
-
-int Allocator::commit_limbo()
-{
- dout(20) << "commit_limbo" << dendl;
- for (map<block_t,block_t>::iterator i = limbo.m.begin();
- i != limbo.m.end();
- i++) {
- fs->limbo_tab->insert(i->first, i->second);
- //fs->free_blocks += i->second;
- }
- limbo.clear();
- //fs->limbo_blocks = 0;
- //dump_freelist();
- return 0;
-}
-
-int Allocator::release_limbo()
-{
- //dump_freelist();
- if (fs->limbo_tab->get_num_keys() > 0) {
- Table<block_t,block_t>::Cursor cursor(fs->limbo_tab);
- fs->limbo_tab->find(0, cursor);
- while (1) {
- extent_t ex = {cursor.current().key, cursor.current().value};
- dout(20) << "release_limbo ex " << ex << dendl;
-
- fs->limbo_blocks -= ex.length;
- _release_merge(ex);
-
- if (cursor.move_right() <= 0) break;
- }
- }
- fs->limbo_tab->clear();
- //dump_freelist();
- return 0;
-}
-
-
-
-/*
-int Allocator::_alloc_loner_inc(extent_t& ex)
-{
- Table<block_t,pair<block_t,int> >::Cursor cursor(fs->alloc_tab);
-
- if (fs->alloc_tab->find( ex.start, cursor )
- == Table<block_t,pair<block_t,int> >::Cursor::MATCH) {
- assert(cursor.current().value.first == ex.length);
- pair<block_t,int>& v = cursor.dirty_current_value();
- v.second++;
- dout(10) << "_alloc_loner_inc " << ex << " "
- << (v.second-1) << " -> " << v.second
- << dendl;
- } else {
- // insert it, @1
- fs->alloc_tab->insert(ex.start, pair<block_t,int>(ex.length,1));
- dout(10) << "_alloc_loner_inc " << ex << " 0 -> 1" << dendl;
- }
- return 0;
-}
-
-int Allocator::_alloc_loner_dec(extent_t& ex)
-{
- Table<block_t,pair<block_t,int> >::Cursor cursor(fs->alloc_tab);
-
- if (fs->alloc_tab->find( ex.start, cursor )
- == Table<block_t,pair<block_t,int> >::Cursor::MATCH) {
- assert(cursor.current().value.first == ex.length);
- if (cursor.current().value.second == 1) {
- dout(10) << "_alloc_loner_dec " << ex << " 1 -> 0" << dendl;
- fs->alloc_tab->remove( cursor.current().key );
- } else {
- pair<block_t,int>& v = cursor.dirty_current_value();
- --v.second;
- dout(10) << "_alloc_loner_dec " << ex << " "
- << (v.second+1) << " -> " << v.second
- << dendl;
- }
- } else {
- assert(0);
- }
- return 0;
-}
-*/
-
-
-int Allocator::alloc_inc(extent_t ex)
-{
- dout(10) << "alloc_inc " << ex << dendl;
-
- // empty table?
- if (fs->alloc_tab->get_num_keys() == 0) {
- // easy.
- fs->alloc_tab->insert(ex.start, pair<block_t,int>(ex.length,1));
- dout(10) << "alloc_inc + " << ex << " 0 -> 1 (first entry)" << dendl;
- return 0;
- }
-
- Table<block_t,pair<block_t,int> >::Cursor cursor(fs->alloc_tab);
-
- // try to move to left (to check for overlap)
- int r = fs->alloc_tab->find( ex.start, cursor );
- if (r == Table<block_t,pair<block_t,int> >::Cursor::OOB ||
- cursor.current().key > ex.start) {
- r = cursor.move_left();
- dout(10) << "alloc_inc move_left r = " << r << dendl;
- }
-
- while (1) {
- dout(10) << "alloc_inc loop at " << cursor.current().key
- << "~" << cursor.current().value.first
- << " ref " << cursor.current().value.second
- << dendl;
-
- // too far left?
- if (cursor.current().key < ex.start &&
- cursor.current().key + cursor.current().value.first <= ex.start) {
- // adjacent?
- bool adjacent = false;
- if (cursor.current().key + cursor.current().value.first == ex.start &&
- cursor.current().value.second == 1)
- adjacent = true;
-
- // no overlap.
- r = cursor.move_right();
- dout(10) << "alloc_inc move_right r = " << r << dendl;
-
- // at end?
- if (r <= 0) {
- // hmm!
- if (adjacent) {
- // adjust previous entry
- cursor.move_left();
- pair<block_t,int> &v = cursor.dirty_current_value();
- v.first += ex.length; // yay!
- dout(10) << "alloc_inc + " << ex << " 0 -> 1 (adjust at end)" << dendl;
- } else {
- // insert at end, finish.
- int r = fs->alloc_tab->insert(ex.start, pair<block_t,int>(ex.length,1));
- dout(10) << "alloc_inc + " << ex << " 0 -> 1 (at end) .. r = " << r << dendl;
- //dump_freelist();
- }
- return 0;
- }
- }
- dout(10) << "alloc_inc at " << cursor.current().key
- << "~" << cursor.current().value.first
- << " ref " << cursor.current().value.second << dendl;
- if (cursor.current().key > ex.start) {
- // gap.
- // oooooo
- // nnnnn.....
- block_t l = MIN(ex.length, cursor.current().key - ex.start);
-
- fs->alloc_tab->insert(ex.start, pair<block_t,int>(l,1));
- dout(10) << "alloc_inc + " << ex.start << "~" << l << " 0 -> 1" << dendl;
- ex.start += l;
- ex.length -= l;
- if (ex.length == 0) break;
- fs->alloc_tab->find( ex.start, cursor );
- }
- else if (cursor.current().key < ex.start) {
- block_t end = cursor.current().value.first + cursor.current().key;
-
- if (end <= ex.end()) {
- // single split
- // oooooo
- // nnnnn
- pair<block_t,int>& v = cursor.dirty_current_value();
- v.first = ex.start - cursor.current().key;
- int ref = v.second;
-
- block_t l = end - ex.start;
- fs->alloc_tab->insert(ex.start, pair<block_t,int>(l, 1+ref));
-
- dout(10) << "alloc_inc " << ex.start << "~" << l
- << " " << ref << " -> " << ref+1
- << " (right split)" << dendl;
-
- ex.start += l;
- ex.length -= l;
- if (ex.length == 0) break;
- fs->alloc_tab->find( ex.start, cursor );
-
- } else {
- // double split, finish.
- // -------------
- // ------
- pair<block_t,int>& v = cursor.dirty_current_value();
- v.first = ex.start - cursor.current().key;
- int ref = v.second;
-
- fs->alloc_tab->insert(ex.start, pair<block_t,int>(ex.length, 1+ref));
-
- int rl = end - ex.end();
- fs->alloc_tab->insert(ex.end(), pair<block_t,int>(rl, ref));
-
- dout(10) << "alloc_inc " << ex
- << " " << ref << " -> " << ref+1
- << " (double split finish)"
- << dendl;
-
- break;
- }
- }
- else {
- assert(cursor.current().key == ex.start);
-
- if (cursor.current().value.first <= ex.length) {
- // inc.
- // oooooo
- // nnnnnnnn
- pair<block_t,int>& v = cursor.dirty_current_value();
- v.second++;
- dout(10) << "alloc_inc " << ex.start << "~" << cursor.current().value.first
- << " " << cursor.current().value.second-1 << " -> "
- << cursor.current().value.second
- << " (left split)" << dendl;
- ex.start += v.first;
- ex.length -= v.first;
- if (ex.length == 0) break;
- cursor.move_right();
- } else {
- // single split, finish.
- // oooooo
- // nnn
- block_t l = cursor.current().value.first - ex.length;
- int ref = cursor.current().value.second;
-
- pair<block_t,int>& v = cursor.dirty_current_value();
- v.first = ex.length;
- v.second++;
-
- fs->alloc_tab->insert(ex.end(), pair<block_t,int>(l, ref));
-
- dout(10) << "alloc_inc " << ex
- << " " << ref << " -> " << ref+1
- << " (left split finish)"
- << dendl;
-
- break;
- }
- }
- }
-
- return 0;
-}
-
-
-int Allocator::alloc_dec(extent_t ex)
-{
- dout(10) << "alloc_dec " << ex << dendl;
-
- assert(fs->alloc_tab->get_num_keys() >= 0);
-
- Table<block_t,pair<block_t,int> >::Cursor cursor(fs->alloc_tab);
-
- // try to move to left (to check for overlap)
- int r = fs->alloc_tab->find( ex.start, cursor );
- dout(10) << "alloc_dec find r = " << r << dendl;
-
- if (r == Table<block_t,pair<block_t,int> >::Cursor::OOB ||
- cursor.current().key > ex.start) {
- r = cursor.move_left();
- dout(10) << "alloc_dec move_left r = " << r << dendl;
-
- // too far left?
- if (cursor.current().key < ex.start &&
- cursor.current().key + cursor.current().value.first <= ex.start) {
- // no overlap.
- dout(10) << "alloc_dec no overlap " << cursor.current().key
- << "~" << cursor.current().value.first
- << " " << cursor.current().value.second
- << " with " << ex << dendl;
- dump_freelist();
- assert(0);
- }
- }
-
- while (1) {
- dout(10) << "alloc_dec ? " << cursor.current().key
- << "~" << cursor.current().value.first
- << " " << cursor.current().value.second
- << ", ex is " << ex
- << dendl;
-
- assert(cursor.current().key <= ex.start); // no gap allowed.
-
- if (cursor.current().key < ex.start) {
- block_t end = cursor.current().value.first + cursor.current().key;
-
- if (end <= ex.end()) {
- // single split
- // oooooo
- // -----
- pair<block_t,int>& v = cursor.dirty_current_value();
- v.first = ex.start - cursor.current().key;
- int ref = v.second;
- dout(10) << "alloc_dec s " << cursor.current().key << "~" << cursor.current().value.first
- << " " << ref
- << " shortened left bit of single" << dendl;
-
- block_t l = end - ex.start;
- if (ref > 1) {
- fs->alloc_tab->insert(ex.start, pair<block_t,int>(l, ref-1));
- dout(10) << "alloc_dec . " << ex.start << "~" << l
- << " " << ref << " -> " << ref-1
- << dendl;
- } else {
- extent_t r = {ex.start, l};
- _release_into_limbo(r);
- }
-
- ex.start += l;
- ex.length -= l;
- if (ex.length == 0) break;
- fs->alloc_tab->find( ex.start, cursor );
-
- } else {
- // double split, finish.
- // ooooooooooooo
- // ------
- pair<block_t,int>& v = cursor.dirty_current_value();
- v.first = ex.start - cursor.current().key;
- int ref = v.second;
- dout(10) << "alloc_dec s " << cursor.current().key << "~" << cursor.current().value.first
- << " " << ref
- << " shorted left bit of double split" << dendl;
-
- if (ref > 1) {
- fs->alloc_tab->insert(ex.start, pair<block_t,int>(ex.length, ref-1));
- dout(10) << "alloc_inc s " << ex
- << " " << ref << " -> " << ref-1
- << " reinserted middle bit of double split"
- << dendl;
- } else {
- _release_into_limbo(ex);
- }
-
- int rl = end - ex.end();
- fs->alloc_tab->insert(ex.end(), pair<block_t,int>(rl, ref));
- dout(10) << "alloc_dec s " << ex.end() << "~" << rl
- << " " << ref
- << " reinserted right bit of double split" << dendl;
- break;
- }
- }
- else {
- assert(cursor.current().key == ex.start);
-
- if (cursor.current().value.first <= ex.length) {
- // inc.
- // oooooo
- // nnnnnnnn
- if (cursor.current().value.second > 1) {
- pair<block_t,int>& v = cursor.dirty_current_value();
- v.second--;
- dout(10) << "alloc_dec s " << ex.start << "~" << cursor.current().value.first
- << " " << cursor.current().value.second+1 << " -> " << cursor.current().value.second
- << dendl;
- ex.start += v.first;
- ex.length -= v.first;
- if (ex.length == 0) break;
- cursor.move_right();
- } else {
- extent_t r = {cursor.current().key, cursor.current().value.first};
- _release_into_limbo(r);
-
- ex.start += cursor.current().value.first;
- ex.length -= cursor.current().value.first;
- fs->alloc_tab->remove(cursor.current().key);
-
- if (ex.length == 0) break;
- fs->alloc_tab->find( ex.start, cursor );
- }
- } else {
- // single split, finish.
- // oooooo
- // nnn
- block_t l = cursor.current().value.first - ex.length;
- int ref = cursor.current().value.second;
-
- if (ref > 1) {
- pair<block_t,int>& v = cursor.dirty_current_value();
- v.first = ex.length;
- v.second--;
- dout(10) << "alloc_inc . " << ex
- << " " << ref << " -> " << ref-1
- << dendl;
- } else {
- fs->alloc_tab->remove(cursor.current().key);
- _release_into_limbo(ex);
- }
-
- dout(10) << "alloc_dec s " << ex.end() << "~" << l
- << " " << ref
- << " reinserted right bit of single split" << dendl;
- fs->alloc_tab->insert(ex.end(), pair<block_t,int>(l, ref));
- break;
- }
- }
-
-
- }
-
- return 0;
-}
-
-
-/*
- * release extent into freelist
- * WARNING: *ONLY* use this if you _know_ there are no adjacent free extents
- */
-int Allocator::_release_loner(extent_t& ex)
-{
- assert(ex.length > 0);
- int b = pick_bucket(ex.length);
- fs->free_tab[b]->insert(ex.start, ex.length);
- fs->free_blocks += ex.length;
- return 0;
-}
-
-/*
- * release extent into freelist
- * look for any adjacent extents and merge with them!
- */
-int Allocator::_release_merge(extent_t& orig)
-{
- dout(15) << "_release_merge " << orig << dendl;
- assert(orig.length > 0);
-
- extent_t newex = orig;
-
- // one after us?
- for (int b=0; b<EBOFS_NUM_FREE_BUCKETS; b++) {
- Table<block_t,block_t>::Cursor cursor(fs->free_tab[b]);
-
- if (fs->free_tab[b]->find( newex.start+newex.length, cursor )
- == Table<block_t,block_t>::Cursor::MATCH) {
- // add following extent to ours
- newex.length += cursor.current().value;
-
- // remove it
- fs->free_blocks -= cursor.current().value;
- fs->free_tab[b]->remove( cursor.current().key );
- break;
- }
- }
-
- // one before us?
- for (int b=0; b<EBOFS_NUM_FREE_BUCKETS; b++) {
- Table<block_t,block_t>::Cursor cursor(fs->free_tab[b]);
- fs->free_tab[b]->find( newex.start+newex.length, cursor );
- if (cursor.move_left() >= 0 &&
- (cursor.current().key + cursor.current().value == newex.start)) {
- // merge
- newex.start = cursor.current().key;
- newex.length += cursor.current().value;
-
- // remove it
- fs->free_blocks -= cursor.current().value;
- fs->free_tab[b]->remove( cursor.current().key );
- break;
- }
- }
-
- // ok, insert newex
- _release_loner(newex);
- return 0;
-}
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-
-#ifndef CEPH_EBOFS_ALLOCATOR_H
-#define CEPH_EBOFS_ALLOCATOR_H
-
-#include "types.h"
-
-#include "include/interval_set.h"
-
-class Ebofs;
-
-class Allocator {
-public:
- const static block_t NEAR_LAST = 0;
- const static block_t NEAR_LAST_FWD = 1;
-
- const static int DIR_ANY = 0;
- const static int DIR_FWD = 2;
- const static int DIR_BACK = 1;
-
-protected:
- Ebofs *fs;
- block_t last_pos;
-
-
- interval_set<block_t> limbo;
-
- static int pick_bucket(block_t num) {
- int b = 0;
- while (num > 1) {
- b++;
- num = num >> EBOFS_FREE_BUCKET_BITS;
- }
- if (b >= EBOFS_NUM_FREE_BUCKETS)
- b = EBOFS_NUM_FREE_BUCKETS-1;
- return b;
- }
-
- int find(extent_t& ex, int bucket, block_t num, block_t near, int dir = DIR_ANY);
-
- void dump_freelist();
-
- public:
- int _release_into_limbo(extent_t& ex);
-
- int _release_loner(extent_t& ex); // release loner extent
- int _release_merge(extent_t& ex); // release any extent (searches for adjacent)
-
- //int _alloc_loner_inc(extent_t& ex);
- //int _alloc_loner_dec(extent_t& ex);
-
-
- public:
- Allocator(Ebofs *f) : fs(f), last_pos(0) {}
-
- int allocate(extent_t& ex, block_t num, block_t near=NEAR_LAST);
- int release(extent_t& ex); // alias for alloc_dec
-
- int alloc_inc(extent_t ex);
- int alloc_dec(extent_t ex);
-
- int unallocate(extent_t& ex) { // skip limbo
- return _release_merge(ex);
- }
-
- int commit_limbo(); // limbo -> fs->limbo_tab
- int release_limbo(); // fs->limbo_tab -> free_tabs
-
-};
-
-#endif
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#include "common/config.h"
-#include "BlockDevice.h"
-
-
-#include <unistd.h>
-#include <stdlib.h>
-//#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <sys/file.h>
-#include <iostream>
-#include <errno.h>
-
-#include <sys/uio.h>
-#include <limits.h>
-
-#include <sys/ioctl.h>
-
-#ifndef __CYGWIN__
-#ifndef DARWIN
-#include <linux/fs.h>
-#else
-/* lseek works on 64-bit offsets on OS/X */
-#define lseek64 lseek
-#endif
-#endif
-
-#define DOUT_SUBSYS bdev
-
-
-/*******************************************
- * biovec
- */
-
-inline ostream& operator<<(ostream& out, BlockDevice::biovec &bio)
-{
- out << "bio(";
- if (bio.type == BlockDevice::biovec::IO_READ) out << "rd ";
- if (bio.type == BlockDevice::biovec::IO_WRITE) out << "wr ";
- out << bio.start << "~" << bio.length;
- if (bio.note) out << " " << bio.note;
- out << " " << &bio;
- out << ")";
- return out;
-}
-
-
-
-/*******************************************
- * ElevatorQueue
- */
-
-#undef dout_prefix
-#define dout_prefix *_dout << "bdev(" << dev << ").elevatorq."
-
-
-int BlockDevice::ElevatorQueue::dequeue_io(list<biovec*>& biols,
- block_t& start, block_t& length,
- interval_set<block_t>& block_lock)
-{
- // queue empty?
- assert(!io_map.empty());
-
- dout(20) << "dequeue_io el_pos " << el_pos << " dir " << el_dir_forward << dendl;
-
- // find our position: i >= pos
- map<block_t,biovec*>::iterator i;
-
- int tries = 2;
- while (tries > 0) {
- if (el_dir_forward) {
- i = io_map.lower_bound(el_pos);
- if (i != io_map.end()) {
- break; // not at end. good.
- }
- } else {
- i = io_map.upper_bound(el_pos);
- if (i != io_map.begin()) {
- i--; // and back down one (to get i <= pos). good.
- break;
- }
- }
-
- // reverse (or initial startup)?
- if (g_conf.bdev_el_bidir || !el_dir_forward) {
- // dout(20) << "restart reversing" << dendl;
- el_dir_forward = !el_dir_forward;
- }
-
- if (el_dir_forward) {
- // forward
- el_pos = 0;
-
- if (g_conf.bdev_el_fw_max_ms) {
- el_stop = g_clock.now();
- utime_t max(0, 1000*g_conf.bdev_el_fw_max_ms); // (s,us), convert ms -> us!
- el_stop += max;
- // dout(20) << "restart forward sweep for " << max << dendl;
- } else {
- // dout(20) << "restart fowrard sweep" << dendl;
- }
- } else {
- // reverse
- el_pos = bdev->get_num_blocks();
-
- if (g_conf.bdev_el_bw_max_ms) {
- el_stop = g_clock.now();
- utime_t max(0, 1000*g_conf.bdev_el_bw_max_ms); // (s,us), convert ms -> us!
- el_stop += max;
- // dout(20) << "restart reverse sweep for " << max << dendl;
- } else {
- // dout(20) << "restart reverse sweep" << dendl;
- }
- }
-
- tries--;
- }
-
- assert(tries > 0); // this shouldn't happen if the queue is non-empty.
-
- // get some biovecs
- int num_bio = 0;
-
- dout(20) << "dequeue_io starting with " << i->first << " " << *i->second << dendl;
-
- // merge contiguous ops
- char type = i->second->type; // read or write
- int num_iovs = 0; // count eventual iov's for readv/writev
-
- start = i->first;
- length = 0;
-
- if (el_dir_forward)
- el_pos = start;
- else
- el_pos = i->first + i->second->length;
-
- // while (contiguous)
- while ((( el_dir_forward && el_pos == i->first) ||
- (!el_dir_forward && el_pos == i->first + i->second->length)) &&
- type == i->second->type) {
- biovec *bio = i->second;
-
- // allowed? (not already submitted to kernel?)
- if (block_lock.intersects(bio->start, bio->length)) {
- dout(20) << "dequeue_io " << bio->start << "~" << bio->length
- << " intersects block_lock " << block_lock << dendl;
- break; // stop, or go with what we've got so far
- }
-
- // add to biols
- int nv = bio->bl.buffers().size(); // how many iov's in this bio's bufferlist?
- if (num_bio &&
- num_iovs + nv >= IOV_MAX) break; // to many //g_conf.bdev_iov_max) break; // too many!
- num_iovs += nv;
-
- start = MIN(start, bio->start);
- length += bio->length;
-
- if (el_dir_forward) {
- dout(20) << "dequeue_io fw dequeue io at " << el_pos << " " << *i->second << dendl;
- biols.push_back(bio); // add at back
- } else {
- dout(20) << "dequeue_io bw dequeue io at " << el_pos << " " << *i->second << dendl;
- biols.push_front(bio); // add at front
- }
- num_bio++;
-
- // move elevator pointer
- bool at_end = false;
- map<block_t,biovec*>::iterator prev = i;
- if (el_dir_forward) {
- el_pos += bio->length; // cont. next would start right after us
- i++;
- if (i == io_map.end()) {
- at_end = true;
- }
- } else {
- el_pos -= bio->length;
- if (i == io_map.begin()) {
- at_end = true;
- } else {
- i--;
- }
- }
-
- // dequeue
- io_map.erase(prev);
- bio->in_queue = 0;
-
- if (at_end) break;
- }
-
- return num_bio;
-}
-
-
-
-/*******************************************
- * BarrierQueue
- */
-#undef dout_prefix
-#define dout_prefix *_dout << "bdev(" << dev << ").barrierq."
-
-void BlockDevice::BarrierQueue::barrier()
-{
- if (!qls.empty() && qls.front()->empty()) {
- assert(qls.size() == 1);
- dout(10) << "barrier not adding new queue, front is empty" << dendl;
- } else {
- qls.push_back(new ElevatorQueue(bdev, dev));
- dout(10) << "barrier adding new elevator queue (now " << qls.size() << "), front queue has "
- << qls.front()->size() << " ios left" << dendl;
- }
-}
-
-bool BlockDevice::BarrierQueue::bump()
-{
- assert(!qls.empty());
-
- // is the front queue(s) empty?
- bool did = false;
- while (!qls.empty() &&
- qls.front()->empty() &&
- qls.front() != qls.back()) {
- delete qls.front();
- qls.pop_front();
- dout(10) << "dequeue_io front empty, moving to next queue (" << qls.front()->size() << ")" << dendl;
- did = true;
- }
- return did;
-}
-
-int BlockDevice::BarrierQueue::dequeue_io(list<biovec*>& biols,
- block_t& start, block_t& length,
- interval_set<block_t>& locked)
-{
- assert(!qls.empty());
- int n = qls.front()->dequeue_io(biols, start, length, locked);
- bump(); // in case we emptied the front queue
- return n;
-}
-
-
-
-
-/*******************************************
- * BlockDevice
- */
-
-#undef dout_prefix
-#define dout_prefix *_dout << "bdev(" << dev << ")."
-
-block_t BlockDevice::get_num_blocks()
-{
- if (!num_blocks) {
- assert(fd > 0);
-
- int r;
- uint64_t bytes = 0;
-#ifdef BLKGETSIZE64
- // ioctl block device
- r = ioctl(fd, BLKGETSIZE64, &bytes);
- num_blocks = bytes / (uint64_t)EBOFS_BLOCK_SIZE;
- if (r == 0) {
- dout(10) << "get_num_blocks ioctl BLKGETSIZE64 reports "
- << num_blocks << " 4k blocks, "
- << bytes << " bytes"
- << dendl;
-#else
-# ifdef BLKGETSIZE
- // hrm, try the 32 bit ioctl?
- unsigned long sectors = 0;
- r = ioctl(fd, BLKGETSIZE, &sectors);
- num_blocks = sectors/8ULL;
- bytes = sectors*512ULL;
- if (r == 0) {
- dout(10) << "get_num_blocks ioctl BLKGETSIZE reports " << sectors << " sectors, "
- << num_blocks << " 4k blocks, " << bytes << " bytes" << dendl;
-# else
- // probably CYGWIN or similar lame plaform...
- unsigned long sectors = 0;
- r = sectors; // shut up compiler
- if (0) {
-# endif
-#endif
- } else {
- // hmm, try stat!
- char buf[80];
- dout(10) << "get_num_blocks ioctl(2) failed with " << errno << " " << strerror_r(errno, buf, sizeof(buf)) << ", using stat(2)" << dendl;
- struct stat st;
- fstat(fd, &st);
- uint64_t bytes = st.st_size;
- num_blocks = bytes / EBOFS_BLOCK_SIZE;
- dout(10) << "get_num_blocks stat reports " << num_blocks << " 4k blocks, " << bytes << " bytes" << dendl;
- }
-
- if (g_conf.bdev_fake_mb) {
- num_blocks = g_conf.bdev_fake_mb * 256;
- dout(0) << "faking dev size " << g_conf.bdev_fake_mb << " mb" << dendl;
- }
- if (g_conf.bdev_fake_max_mb &&
- num_blocks > (block_t)g_conf.bdev_fake_max_mb * 256ULL) {
- dout(0) << "faking dev size " << g_conf.bdev_fake_max_mb << " mb" << dendl;
- num_blocks = g_conf.bdev_fake_max_mb * 256;
- }
-
- }
- return num_blocks;
-}
-
-
-
-/** io thread
- * each worker thread dequeues ios from the root_queue and submits them to the kernel.
- */
-void* BlockDevice::io_thread_entry()
-{
- lock.Lock();
-
- int whoami = io_threads_started++;
- io_threads_running++;
- assert(io_threads_running <= g_conf.bdev_iothreads);
- dout(10) << "io_thread" << whoami << " start, " << io_threads_running << " now running" << dendl;
-
- // get my own fd (and file position pointer)
- int fd = open_fd();
- assert(fd > 0);
-
- while (!io_stop) {
- if (!root_queue.empty()) {
- dout(20) << "io_thread" << whoami << "/" << io_threads_running << " going" << dendl;
-
- block_t start, length;
- list<biovec*> biols;
- int n = root_queue.dequeue_io(biols, start, length, io_block_lock);
-
- if (n == 0) {
- // failed to dequeue a do-able op, sleep for now
- dout(20) << "io_thread" << whoami << "/" << io_threads_running << " couldn't dequeue doable op, sleeping" << dendl;
- assert(io_threads_running > 1); // there must be someone else, if we couldn't dequeue something doable.
- }
- else {
- // lock blocks
- assert(start == biols.front()->start);
- io_block_lock.insert(start, length);
-
- // drop lock to do the io
- lock.Unlock();
- do_io(fd, biols);
- lock.Lock();
-
- // unlock blocks
- io_block_lock.erase(start, length);
-
- // someone might have blocked on our block_lock?
- if (io_threads_running < g_conf.bdev_iothreads &&
- (int)root_queue.size() > io_threads_running)
- io_wakeup.SignalAll();
-
- // loop again (don't sleep)
- continue;
- }
- }
-
- // sleep
- io_threads_running--;
- dout(20) << "io_thread" << whoami << " sleeping, "
- << io_threads_running << " threads now running,"
- << " queue has " << root_queue.size()
- << dendl;
-
- // first wait for signal | timeout?
- if (g_conf.bdev_idle_kick_after_ms > 0 &&
- idle_kicker &&
- io_threads_running == 0 && !is_idle_waiting) { // only the last thread asleep needs to kick.
- // sleep, but just briefly.
- dout(20) << "io_thread" << whoami << " doing short wait, to see if i stay idle" << dendl;
- is_idle_waiting = true;
- int r = io_wakeup.WaitInterval(lock, utime_t(0, g_conf.bdev_idle_kick_after_ms*1000));
- is_idle_waiting = false;
-
- if (io_stop)
- break;
-
- if (r == ETIMEDOUT) {
- dout(20) << "io_thread" << whoami << " timeout expired, kicking ebofs" << dendl;
- kicker_cond.Signal(); // signal kicker thread
- } else {
- dout(20) << "io_thread" << whoami << " signaled during short sleep, waking up" << dendl;
- goto wake_up;
- }
- }
-
- // sleeeep
- io_wakeup.Wait(lock); // and wait (if condition still holds)
-
- wake_up:
- io_threads_running++;
- assert(io_threads_running <= g_conf.bdev_iothreads);
- dout(20) << "io_thread" << whoami << "/" << io_threads_running << " woke up, " << io_threads_running << " threads now running" << dendl;
- }
-
- // clean up
- ::close(fd);
- io_threads_running--;
-
- lock.Unlock();
-
- dout(10) << "io_thread" << whoami << " finish" << dendl;
- return 0;
-}
-
-
-
-/** do_io
- * do a single io operation
- * (lock is NOT held, but we own the *biovec)
- */
-void BlockDevice::do_io(int fd, list<biovec*>& biols)
-{
- int r;
- assert(!biols.empty());
-
- // get full range, type, bl
- bufferlist bl;
- bl.claim(biols.front()->bl);
- block_t start = biols.front()->start;
- block_t length = biols.front()->length;
- char type = biols.front()->type;
-
- list<biovec*>::iterator p = biols.begin();
- int numbio = 1;
- for (p++; p != biols.end(); p++) {
- length += (*p)->length;
- bl.claim_append((*p)->bl);
- numbio++;
- }
-
- // do it
- dout(20) << "do_io start " << (type==biovec::IO_WRITE?"write":"read")
- << " " << start << "~" << length
- << " " << numbio << " bits" << dendl;
- if (type == biovec::IO_WRITE) {
- r = _write(fd, start, length, bl);
- } else if (type == biovec::IO_READ) {
- r = _read(fd, start, length, bl);
- } else assert(0);
- dout(20) << "do_io finish " << (type==biovec::IO_WRITE?"write":"read")
- << " " << start << "~" << length << dendl;
-
- // set rval
- for (p = biols.begin(); p != biols.end(); p++)
- (*p)->rval = r;
-
- if (1) {
- // put in completion queue
- complete_lock.Lock();
- complete_queue.splice( complete_queue.end(), biols );
- complete_queue_len += numbio;
- complete_wakeup.Signal();
- complete_lock.Unlock();
- dout(20) << "do_io kicked completer on " << (type==biovec::IO_WRITE?"write":"read")
- << " " << start << "~" << length << dendl;
-
- } else {
- // be slow and finish synchronously
- for (p = biols.begin(); p != biols.end(); p++)
- finish_io(*p);
- }
-}
-
-
-/** finish_io
- *
- * finish an io by signaling the cond or performing a callback.
- * called by completion thread, unless that's disabled above.
- */
-void BlockDevice::finish_io(biovec *bio)
-{
- bio->done = true;
- if (bio->cond) {
- lock.Lock(); // hmm?
- bio->cond->Signal();
- lock.Unlock();
- }
- else if (bio->cb) {
- bio->cb->finish((ioh_t)bio, bio->rval);
- delete bio->cb;
- delete bio;
- }
-}
-
-/*** completion_thread
- * handle Cond signals or callbacks for completed ios
- */
-void* BlockDevice::complete_thread_entry()
-{
- complete_lock.Lock();
- dout(10) << "complete_thread start" << dendl;
-
- while (!io_stop) {
-
- while (!complete_queue.empty()) {
- list<biovec*> ls;
- ls.swap(complete_queue);
- dout(10) << "complete_thread grabbed " << complete_queue_len << " biovecs" << dendl;
- complete_queue_len = 0;
-
- complete_lock.Unlock();
-
- // finish
- for (list<biovec*>::iterator p = ls.begin();
- p != ls.end();
- p++) {
- biovec *bio = *p;
- dout(20) << "complete_thread finishing " << *bio << dendl;
- finish_io(bio);
- }
-
- complete_lock.Lock();
- }
- if (io_stop) break;
-
- dout(25) << "complete_thread sleeping" << dendl;
- complete_wakeup.Wait(complete_lock);
- }
-
- dout(10) << "complete_thread finish" << dendl;
- complete_lock.Unlock();
- return 0;
-}
-
-
-/*** idle kicker thread
- * kick ebofs when we're idle. we're a separate thread (yuck)
- * because ebofs may be holding it's lock _and_ waiting for us
- * to do useful work. that rules out io_thread and complete_thread!
- */
-void* BlockDevice::kicker_thread_entry()
-{
- lock.Lock();
- dout(10) << "kicker_thread start" << dendl;
-
- while (!io_stop) {
-
- if (io_threads_running == 0 && idle_kicker) {
- dout(25) << "kicker_thread kicking ebofs" << dendl;
- lock.Unlock();
- idle_kicker->kick();
- lock.Lock();
- dout(25) << "kicker_thread done kicking ebofs" << dendl;
- }
- if (io_stop) break;
-
- dout(25) << "kicker_thread sleeping" << dendl;
- kicker_cond.Wait(lock);
- }
-
- dout(10) << "kicker_thread finish" << dendl;
- lock.Unlock();
- return 0;
-}
-
-
-
-
-// io queue
-
-void BlockDevice::_submit_io(biovec *b)
-{
- // NOTE: lock must be held
- dout(15) << "_submit_io " << *b << dendl;
-
- // wake up io_thread(s)?
- if ((int)root_queue.size() == io_threads_running)
- io_wakeup.SignalOne();
- else if ((int)root_queue.size() > io_threads_running)
- io_wakeup.SignalAll();
-
- // queue
- root_queue.submit_io(b);
-
- /*
- // [DEBUG] check for overlapping ios
- // BUG: this doesn't detect all overlaps w/ the next queue thing.
- if (g_conf.bdev_debug_check_io_overlap) {
- // BUG: this doesn't catch everything! eg 1~10000000 will be missed....
- multimap<block_t, biovec*>::iterator p = io_queue.lower_bound(b->start);
- if ((p != io_queue.end() &&
- p->first < b->start+b->length) ||
- (p != io_queue.begin() &&
- (p--, p->second->start + p->second->length > b->start))) {
- dout(1) << "_submit_io new io " << *b
- << " overlaps with existing " << *p->second << dendl;
- cerr << "_submit_io new io " << *b
- << " overlaps with existing " << *p->second << dendl;
- }
- }
- */
-
-}
-
-int BlockDevice::_cancel_io(biovec *bio)
-{
- // NOTE: lock must be held
-
- if (bio->in_queue == 0) {
- dout(15) << "_cancel_io " << *bio << " FAILED" << dendl;
- return -1;
- } else {
- dout(15) << "_cancel_io " << *bio << dendl;
- bio->in_queue->cancel_io(bio);
- if (root_queue.bump())
- io_wakeup.SignalAll(); // something happened!
- return 0;
- }
-}
-
-
-
-// low level io
-
-int BlockDevice::_read(int fd, block_t bno, unsigned num, bufferlist& bl)
-{
- dout(10) << "_read " << bno << "~" << num << dendl;
-
- assert(fd > 0);
-
- uint64_t offset = bno * EBOFS_BLOCK_SIZE;
- uint64_t actual = ::lseek64(fd, offset, SEEK_SET);
- assert(actual == offset);
-
- size_t len = num*EBOFS_BLOCK_SIZE;
- assert(bl.length() >= len);
-
- struct iovec iov[ bl.buffers().size() ];
- int n = 0;
- size_t left = len;
- for (list<bufferptr>::const_iterator i = bl.buffers().begin();
- i != bl.buffers().end();
- i++) {
- assert(i->length() % EBOFS_BLOCK_SIZE == 0);
-
- iov[n].iov_base = (void*)i->c_str();
- iov[n].iov_len = MIN(left, i->length());
-
- left -= iov[n].iov_len;
- n++;
- if (left == 0) break;
- }
-
- int got = ::readv(fd, iov, n);
- assert(got <= (int)len);
-
- return 0;
-}
-
-int BlockDevice::_write(int fd, unsigned bno, unsigned num, bufferlist& bl)
-{
- dout(10) << "_write " << bno << "~" << num << dendl;
-
- assert(fd > 0);
-
- while (1) {
- uint64_t offset = (uint64_t)bno << EBOFS_BLOCK_BITS;
- assert((uint64_t)bno * (uint64_t)EBOFS_BLOCK_SIZE == offset);
- uint64_t actual = ::lseek64(fd, offset, SEEK_SET);
- assert(actual == offset);
-
- // write buffers
- size_t len = num*EBOFS_BLOCK_SIZE;
-
- struct iovec iov[ bl.buffers().size() ];
-
- int n = 0;
- size_t left = len;
- for (list<bufferptr>::const_iterator i = bl.buffers().begin();
- i != bl.buffers().end();
- i++) {
- assert(i->length() % EBOFS_BLOCK_SIZE == 0);
-
- iov[n].iov_base = (void*)i->c_str();
- iov[n].iov_len = MIN(left, i->length());
-
- /*
- dout(10) << "_write " << (bno+(len-left))
- << "~" << (iov[n].iov_len / 4096)
- << " " << *i << dendl;
- */
-
- assert((((intptr_t)iov[n].iov_base) & ((intptr_t)4095ULL)) == 0);
- assert((iov[n].iov_len & 4095) == 0);
-
- left -= iov[n].iov_len;
- n++;
- if (left == 0 ||
- n == IOV_MAX) break;
- }
-
- int r = ::writev(fd, iov, n);
-
- if (r < 0) {
- char buf[80];
- dout(1) << "couldn't write bno " << bno << " num " << num
- << " (" << len << " bytes) in " << n << " iovs, r=" << r
- << " errno " << errno << " " << strerror_r(errno, buf, sizeof(buf)) << dendl;
- dout(1) << "bl is " << bl << dendl;
- assert(0);
- } else if (r < (int)len) {
- // hrm, we didn't write _all_ of our data. WTF kind of FS is this?
- dout(0) << "bloody hell, writev only wrote " << r << " of " << len << " bytes, looping" << dendl;
- assert(r % 4096 == 0);
- int wrote = r / 4096;
- bno += wrote;
- num -= wrote;
- bufferlist tail;
- tail.substr_of(bl, r, len-r);
- bl.claim(tail);
- continue;
- } else {
- // yay
- assert(r == (int)len);
- break;
- }
- }
- return 0;
-}
-
-
-
-// open/close
-
-int BlockDevice::open_fd()
-{
-#ifdef DARWIN
- int fd = ::open(dev.c_str(), O_RDWR|O_SYNC, 0);
- ::fcntl(fd, F_NOCACHE);
- return fd;
-#else
- return ::open(dev.c_str(), O_RDWR|O_SYNC|O_DIRECT, 0);
-#endif
-}
-
-int BlockDevice::open(kicker *idle)
-{
- assert(fd == 0);
-
- // open?
- fd = open_fd();
- if (fd < 0) {
- char buf[80];
- dout(1) << "open failed, r = " << fd << " " << strerror_r(errno, buf, sizeof(buf)) << dendl;
- fd = 0;
- return -1;
- }
-
- // lock
- if (g_conf.bdev_lock) {
- int r = ::flock(fd, LOCK_EX|LOCK_NB);
- if (r < 0) {
- derr(1) << "open " << dev << " failed to get LOCK_EX" << dendl;
- return -1;
- }
- }
-
- // figure size
- block_t b = get_num_blocks();
- if (!b) {
- dout(0) << "open can't determine size of device" << dendl;
- assert(0);
- }
- dout(2) << "open " << b << " blocks, " << b*4096 << " bytes" << dendl;
-
- // start thread
- io_threads_started = 0;
- io_threads.clear();
- for (int i=0; i<g_conf.bdev_iothreads; i++) {
- io_threads.push_back(new IOThread(this));
- io_threads.back()->create();
- }
- complete_thread.create();
- kicker_thread.create();
-
- // idle kicker?
- idle_kicker = idle;
-
- return fd;
-}
-
-
-/*
- * warning: ebofs shoudl drop it's lock before calling close(),
- * or else deadlock against the idle kicker
- */
-int BlockDevice::close()
-{
- assert(fd>0);
-
- idle_kicker = 0;
-
- // shut down io thread
- dout(10) << "close stopping io+complete threads" << dendl;
- lock.Lock();
- complete_lock.Lock();
- io_stop = true;
- io_wakeup.SignalAll();
- complete_wakeup.SignalAll();
- kicker_cond.Signal();
- complete_lock.Unlock();
- lock.Unlock();
-
- for (int i=0; i<g_conf.bdev_iothreads; i++) {
- io_threads[i]->join();
- delete io_threads[i];
- }
- io_threads.clear();
-
- complete_thread.join();
- kicker_thread.join();
-
- io_stop = false; // in case we start again
-
- dout(2) << "close " << dendl;
-
- if (g_conf.bdev_lock)
- ::flock(fd, LOCK_UN);
-
- ::close(fd);
- fd = 0;
-
- return 0;
-}
-
-int BlockDevice::cancel_io(ioh_t ioh)
-{
- biovec *pbio = (biovec*)ioh;
-
- lock.Lock();
- int r = _cancel_io(pbio);
- lock.Unlock();
-
- // FIXME?
- if (r == 0 && pbio->cb) {
- //pbio->cb->finish(ioh, 0);
- delete pbio->cb;
- delete pbio;
- }
-
- return r;
-}
-
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-
-#ifndef CEPH_EBOFS_BLOCKDEVICE_H
-#define CEPH_EBOFS_BLOCKDEVICE_H
-
-using namespace std;
-#include "include/buffer.h"
-#include "include/interval_set.h"
-#include "include/Context.h"
-#include "common/Mutex.h"
-#include "common/Cond.h"
-#include "common/Thread.h"
-
-#include "types.h"
-
-
-typedef void *ioh_t; // opaque handle to an io request. (in actuality, a biovec*)
-
-
-class BlockDevice {
- public:
- // callback type for io completion notification
- class callback {
- public:
- virtual ~callback() {}
- virtual void finish(ioh_t ioh, int rval) = 0;
- };
-
- // kicker for idle notification
- class kicker {
- public:
- virtual ~kicker() {}
- virtual void kick() = 0;
- };
-
-
- /********************************************************/
-
- class Queue;
-
- // io item
- // two variants: one with Cond*, one with callback*.
- class biovec {
- public:
- static const char IO_WRITE = 1;
- static const char IO_READ = 2;
-
- char type;
- block_t start, length;
- bufferlist bl;
- callback *cb;
- Cond *cond;
- int rval;
- const char *note;
- bool done;
-
- Queue *in_queue;
-
- biovec(char t, block_t s, block_t l, bufferlist& b, callback *c, const char *n=0) :
- type(t), start(s), length(l), bl(b), cb(c), cond(0), rval(0), note(n), done(false), in_queue(0) {}
- biovec(char t, block_t s, block_t l, bufferlist& b, Cond *c, const char *n=0) :
- type(t), start(s), length(l), bl(b), cb(0), cond(c), rval(0), note(n), done(false), in_queue(0) {}
- };
- friend ostream& operator<<(ostream& out, biovec &bio);
-
-
- /********************************************************/
-
- /*
- * Queue -- abstract IO queue interface
- */
- class Queue {
- public:
- virtual ~Queue() {}
- virtual void submit_io(biovec *b) = 0;
- virtual void cancel_io(biovec *b) = 0;
- virtual int dequeue_io(list<biovec*>& biols,
- block_t& start, block_t& length,
- interval_set<block_t>& locked) = 0;
- virtual int size() = 0;
- virtual bool empty() { return size() == 0; }
- };
-
- /*
- * ElevatorQueue - simple elevator scheduler queue
- */
- class ElevatorQueue : public Queue {
- BlockDevice *bdev;
- const char *dev;
- map<block_t, biovec*> io_map;
- bool el_dir_forward;
- block_t el_pos;
- utime_t el_stop;
-
- public:
- ElevatorQueue(BlockDevice *bd, const char *d) :
- bdev(bd), dev(d),
- el_dir_forward(false),
- el_pos(0) {}
- void submit_io(biovec *b) {
- b->in_queue = this;
- assert(io_map.count(b->start) == 0);
- io_map[b->start] = b;
- }
- void cancel_io(biovec *b) {
- assert(b->in_queue == this);
- assert(io_map.count(b->start) &&
- io_map[b->start] == b);
- io_map.erase(b->start);
- b->in_queue = 0;
- }
- int dequeue_io(list<biovec*>& biols,
- block_t& start, block_t& length,
- interval_set<block_t>& locked);
- int size() {
- return io_map.size();
- }
- };
-
- /*
- * BarrierQueue - lets you specify io "barriers"
- * barrier() - force completion of all prior IOs before
- * future ios are started.
- * bump() - must be called after cancel_io to properly
- * detect empty subqueue.
- */
- class BarrierQueue : public Queue {
- BlockDevice *bdev;
- const char *dev;
- list<Queue*> qls;
- public:
- BarrierQueue(BlockDevice *bd, const char *d) : bdev(bd), dev(d) {
- barrier();
- }
- ~BarrierQueue() {
- for (list<Queue*>::iterator p = qls.begin();
- p != qls.end();
- ++p)
- delete *p;
- qls.clear();
- }
- int size() {
- // this isn't perfectly accurate.
- if (!qls.empty())
- return qls.front()->size();
- return 0;
- }
- void submit_io(biovec *b) {
- assert(!qls.empty());
- qls.back()->submit_io(b);
- }
- void cancel_io(biovec *b) {
- assert(0); // shouldn't happen.
- }
- int dequeue_io(list<biovec*>& biols,
- block_t& start, block_t& length,
- interval_set<block_t>& locked);
- void barrier();
- bool bump();
- };
-
-
- private:
- string dev; // my device file
- int fd;
- block_t num_blocks;
-
- Mutex lock;
-
- /** the root io queue.
- * i current assumeit's a barrier queue,but this can be changed
- * with some minor rearchitecting.
- */
- BarrierQueue root_queue;
-
- /* io_block_lock - block ranges current dispatched to kernel
- * once a bio is dispatched, it cannot be canceled, so an overlapping
- * io and be submitted. the overlapping io cannot be dispatched
- * to the kernel, however, until the original io finishes, or else
- * there will be a race condition.
- */
- interval_set<block_t> io_block_lock; // blocks currently dispatched to kernel
-
- // io threads
- Cond io_wakeup;
- bool io_stop;
- int io_threads_started, io_threads_running;
- bool is_idle_waiting;
-
- void *io_thread_entry();
-
- class IOThread : public Thread {
- BlockDevice *dev;
- public:
- IOThread(BlockDevice *d) : dev(d) {}
- void *entry() { return (void*)dev->io_thread_entry(); }
- } ;
-
- vector<IOThread*> io_threads;
-
- // private io interface
- int open_fd(); // get an fd (for a thread)
-
- void _submit_io(biovec *b);
- int _cancel_io(biovec *bio);
- void do_io(int fd, list<biovec*>& biols); // called by an io thread
-
- // low level io
- int _read(int fd, block_t bno, unsigned num, bufferlist& bl);
- int _write(int fd, unsigned bno, unsigned num, bufferlist& bl);
-
-
- // completion callback queue
- Mutex complete_lock;
- Cond complete_wakeup;
- list<biovec*> complete_queue;
- int complete_queue_len;
-
- void finish_io(biovec *bio);
-
- // complete thread
- void *complete_thread_entry();
- class CompleteThread : public Thread {
- BlockDevice *dev;
- public:
- CompleteThread(BlockDevice *d) : dev(d) {}
- void *entry() { return (void*)dev->complete_thread_entry(); }
- } complete_thread;
-
- // kicker
- kicker *idle_kicker; // not used..
- Mutex kicker_lock;
- Cond kicker_cond;
- void *kicker_thread_entry();
- class KickerThread : public Thread {
- BlockDevice *dev;
- public:
- KickerThread(BlockDevice *d) : dev(d) {}
- void *entry() { return (void*)dev->complete_thread_entry(); }
- } kicker_thread;
-
-
-
- public:
- BlockDevice(const char *d) :
- dev(d), fd(0), num_blocks(0),
- lock("BlockDevice::lock"),
- root_queue(this, dev.c_str()),
- io_stop(false), io_threads_started(0), io_threads_running(0), is_idle_waiting(false),
- complete_lock("BlockDevice::complete_lock"),
- complete_queue_len(0),
- complete_thread(this),
- idle_kicker(0), kicker_lock("BlockDevice::kicker_lock"), kicker_thread(this) { }
- ~BlockDevice() {
- if (fd > 0) close();
- }
-
- // get size in blocks
- block_t get_num_blocks();
- const char *get_device_name() const { return dev.c_str(); }
-
- // open/close
- int open(kicker *idle = 0);
- int close();
-
- // state stuff
- bool is_idle() {
- lock.Lock();
- bool idle = (io_threads_running == 0) && root_queue.empty();
- lock.Unlock();
- return idle;
- }
- void barrier() {
- lock.Lock();
- root_queue.barrier();
- lock.Unlock();
- }
- void _barrier() {
- root_queue.barrier();
- }
-
- // ** blocking interface **
-
- // read
- int read(block_t bno, unsigned num, bufferptr& bptr, const char *n=0) {
- bufferlist bl;
- bl.push_back(bptr);
- return read(bno, num, bl, n);
- }
- int read(block_t bno, unsigned num, bufferlist& bl, const char *n=0) {
- Cond c;
- biovec bio(biovec::IO_READ, bno, num, bl, &c, n);
-
- lock.Lock();
- _submit_io(&bio);
- _barrier(); // need this, to prevent starvation!
- while (!bio.done)
- c.Wait(lock);
- lock.Unlock();
- return bio.rval;
- }
-
- // write
- int write(unsigned bno, unsigned num, bufferptr& bptr, const char *n=0) {
- bufferlist bl;
- bl.push_back(bptr);
- return write(bno, num, bl, n);
- }
- int write(unsigned bno, unsigned num, bufferlist& bl, const char *n=0) {
- Cond c;
- biovec bio(biovec::IO_WRITE, bno, num, bl, &c, n);
-
- lock.Lock();
- _submit_io(&bio);
- _barrier(); // need this, to prevent starvation!
- while (!bio.done)
- c.Wait(lock);
- lock.Unlock();
- return bio.rval;
- }
-
- // ** non-blocking interface **
- ioh_t read(block_t bno, unsigned num, bufferlist& bl, callback *fin, const char *n=0) {
- biovec *pbio = new biovec(biovec::IO_READ, bno, num, bl, fin, n);
- lock.Lock();
- _submit_io(pbio);
- lock.Unlock();
- return (ioh_t)pbio;
- }
- ioh_t write(block_t bno, unsigned num, bufferlist& bl, callback *fin, const char *n=0) {
- biovec *pbio = new biovec(biovec::IO_WRITE, bno, num, bl, fin, n);
- lock.Lock();
- _submit_io(pbio);
- lock.Unlock();
- return (ioh_t)pbio;
- }
- int cancel_io(ioh_t ioh);
-
-};
-
-
-
-
-#endif
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-
-
-#include "BufferCache.h"
-#include "Onode.h"
-
-#define DOUT_SUBSYS ebofs
-
-void do_apply_partial(bufferlist& bl, map<uint64_t, bufferlist>& pm)
-{
- assert(bl.length() == (unsigned)EBOFS_BLOCK_SIZE);
- //assert(partial_is_complete());
- //cout << "apply_partial" << std::endl;
- for (map<uint64_t, bufferlist>::iterator i = pm.begin();
- i != pm.end();
- i++) {
- //cout << "do_apply_partial at " << i->first << "~" << i->second.length() << std::endl;
- bl.copy_in(i->first, i->second.length(), i->second);
- }
- pm.clear();
-}
-
-
-
-/*********** BufferHead **************/
-
-#undef dout_prefix
-#define dout_prefix *_dout << "ebofs." << *this << "."
-
-
-void BufferHead::add_partial(uint64_t off, bufferlist& p)
-{
- unsigned len = p.length();
- assert(len <= (unsigned)EBOFS_BLOCK_SIZE);
- assert(off >= 0);
- assert(off + len <= EBOFS_BLOCK_SIZE);
-
- // trim any existing that overlaps
- map<uint64_t, bufferlist>::iterator i = partial.begin();
- while (i != partial.end()) {
- // is [off,off+len)...
- // past i?
- if (off >= i->first + i->second.length()) {
- i++;
- continue;
- }
- // before i?
- if (i->first >= off+len) break;
-
- // does [off,off+len)...
- // overlap all of i?
- if (off <= i->first && off+len >= i->first + i->second.length()) {
- // erase it and move on.
- partial.erase(i++);
- continue;
- }
- // overlap tail of i?
- if (off > i->first && off+len >= i->first + i->second.length()) {
- // shorten i.
- unsigned taillen = off - i->first;
- bufferlist o;
- o.claim( i->second );
- i->second.substr_of(o, 0, taillen);
- i++;
- continue;
- }
- // overlap head of i?
- if (off <= i->first && off+len < i->first + i->second.length()) {
- // move i (make new tail).
- uint64_t tailoff = off+len;
- unsigned trim = tailoff - i->first;
- partial[tailoff].substr_of(i->second, trim, i->second.length()-trim);
- partial.erase(i++); // should now be at tailoff
- i++;
- continue;
- }
- // split i?
- if (off > i->first && off+len < i->first + i->second.length()) {
- bufferlist o;
- o.claim( i->second );
- // shorten head
- unsigned headlen = off - i->first;
- i->second.substr_of(o, 0, headlen);
- // new tail
- unsigned tailoff = off+len - i->first;
- unsigned taillen = o.length() - len - headlen;
- partial[off+len].substr_of(o, tailoff, taillen);
- break;
- }
- assert(0);
- }
-
- // insert and adjust csum
- partial[off] = p;
-
- dout(10) << "add_partial off " << off << "~" << p.length() << dendl;
-}
-
-void BufferHead::apply_partial()
-{
- dout(10) << "apply_partial on " << partial.size() << " substrings" << dendl;
- assert(!partial.empty());
- csum_t *p = oc->on->get_extent_csum_ptr(start(), 1);
- do_apply_partial(data, partial);
- csum_t newc = calc_csum(data.c_str(), EBOFS_BLOCK_SIZE);
- oc->on->data_csum += newc - *p;
- *p = newc;
-}
-
-
-/************ ObjectCache **************/
-
-#undef dout_prefix
-#define dout_prefix *_dout << "ebofs.oc."
-
-void ObjectCache::rx_finish(ioh_t ioh, block_t start, block_t length, bufferlist& bl)
-{
- list<Context*> waiters;
-
- dout(10) << "rx_finish " << start << "~" << length << dendl;
- map<block_t, BufferHead*>::iterator p, next;
- for (p = data.lower_bound(start); p != data.end(); p = next) {
- next = p;
- next++;
-
- BufferHead *bh = p->second;
- dout(10) << "rx_finish ?" << *bh << dendl;
- assert(p->first == bh->start());
-
- // past?
- if (p->first >= start+length) break;
- if (bh->end() > start+length) break; // past
-
- assert(p->first >= start);
- assert(bh->end() <= start+length);
-
- dout(10) << "rx_finish !" << *bh << dendl;
-
- if (bh->rx_ioh == ioh)
- bh->rx_ioh = 0;
-
- // trigger waiters
- bh->take_read_waiters(waiters);
-
- if (bh->is_rx()) {
- assert(bh->get_version() == 0);
- assert(bh->end() <= start+length);
- assert(bh->start() >= start);
-
- bh->data.substr_of(bl, (bh->start()-start)*EBOFS_BLOCK_SIZE, bh->length()*EBOFS_BLOCK_SIZE);
-
- // verify checksum
- int bad = 0;
- if (g_conf.ebofs_verify_csum_on_read) {
- csum_t *want = bh->oc->on->get_extent_csum_ptr(bh->start(), bh->length());
- csum_t got[bh->length()];
- for (unsigned i=0; i<bh->length(); i++) {
- got[i] = calc_csum(&bh->data[i*EBOFS_BLOCK_SIZE], EBOFS_BLOCK_SIZE);
- if (false && rand() % 10 == 0) {
- dout(0) << "rx_finish HACK INJECTING bad csum" << dendl;
- derr(0) << "rx_finish HACK INJECTING bad csum" << dendl;
- got[i] = 0;
- }
- if (got[i] != want[i]) {
- dout(0) << "rx_finish bad csum wanted " << hex << want[i] << " got " << got[i] << dec
- << " for object block " << (i+bh->start())
- << dendl;
- bad++;
- }
- }
- if (bad) {
- block_t ostart = bh->start();
- block_t olen = bh->length();
- for (unsigned s=0; s<olen; s++) {
- if (got[s] != want[s]) {
- unsigned e;
- for (e=s; e<olen; e++)
- if (got[e] == want[e]) break;
- dout(0) << "rx_finish bad csum in " << bh->oc->on->object_id << " over " << s << "~" << (e-s) << dendl;
- derr(0) << "rx_finish bad csum in " << bh->oc->on->object_id << " over " << s << "~" << (e-s) << dendl;
-
- if (s) {
- BufferHead *middle = bc->split(bh, ostart+s);
- dout(0) << "rx_finish rx -> clean on " << *bh << dendl;
- bc->mark_clean(bh);
- bh = middle;
- }
- BufferHead *right = bh;
- if (e < olen)
- right = bc->split(bh, ostart+e);
- dout(0) << "rx_finish rx -> corrupt on " << *bh <<dendl;
- bc->mark_corrupt(bh);
- bh = right;
- s = e;
- }
- }
- }
- }
- if (bh) {
- dout(10) << "rx_finish rx -> clean on " << *bh << dendl;
- bc->mark_clean(bh);
- }
- }
- else if (bh->is_partial()) {
- dout(10) << "rx_finish partial -> tx on " << *bh << dendl;
-
- // see what block i am
- vector<extent_t> exv;
- on->map_extents(bh->start(), 1, exv, 0);
- assert(exv.size() == 1);
- assert(exv[0].start != 0);
- block_t cur_block = exv[0].start;
-
- uint64_t off_in_bl = (bh->start() - start) * EBOFS_BLOCK_SIZE;
- assert(off_in_bl >= 0);
- uint64_t len_in_bl = bh->length() * EBOFS_BLOCK_SIZE;
-
- // verify csum
- csum_t want = *bh->oc->on->get_extent_csum_ptr(bh->start(), 1);
- csum_t got = calc_csum(bl.c_str() + off_in_bl, len_in_bl);
- if (want != got) {
- derr(0) << "rx_finish bad csum on partial readback, want " << hex << want
- << " got " << got << dec << dendl;
- dout(0) << "rx_finish bad csum on partial readback, want " << hex << want
- << " got " << got << dec << dendl;
- *bh->oc->on->get_extent_csum_ptr(bh->start(), 1) = got;
- bh->oc->on->data_csum += got - want;
-
- interval_set<uint64_t> bad;
- bad.insert(bh->start()*EBOFS_BLOCK_SIZE, EBOFS_BLOCK_SIZE);
- bh->oc->on->bad_byte_extents.union_of(bad);
-
- interval_set<uint64_t> over;
- for (map<uint64_t,bufferlist>::iterator q = bh->partial.begin();
- q != bh->partial.end();
- q++)
- over.insert(bh->start()*EBOFS_BLOCK_SIZE+q->first, q->second.length());
- interval_set<uint64_t> new_over;
- new_over.intersection_of(over, bh->oc->on->bad_byte_extents);
- bh->oc->on->bad_byte_extents.subtract(new_over);
- }
-
- // apply partial to myself
- assert(bh->data.length() == 0);
- bufferptr bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE);
- bh->data.push_back( bp );
- bufferlist sub;
- sub.substr_of(bl, off_in_bl, len_in_bl);
- bh->data.copy_in(0, EBOFS_BLOCK_SIZE, sub);
- bh->apply_partial();
-
- // write "normally"
- bc->mark_dirty(bh);
- bc->bh_write(on, bh, cur_block);
-
- assert(bh->oc->on->is_dirty());
-
- // clean up a bit
- bh->partial.clear();
- }
- else {
- dout(10) << "rx_finish ignoring status on (dirty|tx|clean) " << *bh << dendl;
- assert(bh->is_dirty() || // was overwritten
- bh->is_tx() || // was overwritten and queued
- bh->is_clean()); // was overwritten, queued, _and_ flushed to disk
- }
-
- }
-
- finish_contexts(waiters);
-}
-
-
-void ObjectCache::tx_finish(ioh_t ioh, block_t start, block_t length,
- version_t version, version_t epoch)
-{
- dout(10) << "tx_finish " << start << "~" << length << " v" << version << dendl;
- for (map<block_t, BufferHead*>::iterator p = data.lower_bound(start);
- p != data.end();
- p++) {
- BufferHead *bh = p->second;
- dout(30) << "tx_finish ?bh " << *bh << dendl;
- assert(p->first == bh->start());
-
- // past?
- if (p->first >= start+length) {
- bh->oc->try_merge_bh_right(p);
- break;
- }
-
- if (bh->tx_ioh == ioh)
- bh->tx_ioh = 0;
-
- if (!bh->is_tx()) {
- dout(10) << "tx_finish bh not marked tx, skipping" << dendl;
- continue;
- }
- assert(bh->is_tx());
-
- if (version == bh->version) {
- dout(10) << "tx_finish tx -> clean on " << *bh << dendl;
- assert(bh->end() <= start+length);
- bh->set_last_flushed(version);
- bc->mark_clean(bh);
- bh->oc->try_merge_bh_left(p);
- } else {
- dout(10) << "tx_finish leaving tx, " << bh->version << " > " << version
- << " on " << *bh << dendl;
- assert(bh->version > version);
- }
- }
-}
-
-
-
-/*
- * return any bh's that are (partially) in this range that are TX.
- */
-int ObjectCache::find_tx(block_t start, block_t len,
- list<BufferHead*>& tx)
-{
- map<block_t, BufferHead*>::iterator p = data.lower_bound(start);
-
- block_t cur = start;
- block_t left = len;
-
- /* don't care about overlap, we want things _fully_ in start~len.
- if (p != data.begin() &&
- (p == data.end() || p->first > cur)) {
- p--; // might overlap!
- if (p->first + p->second->length() <= cur)
- p++; // doesn't overlap.
- }
- */
-
- while (left > 0) {
- assert(cur+left == start+len);
-
- // at end?
- if (p == data.end())
- break;
-
- if (p->first <= cur) {
- // have it (or part of it)
- BufferHead *e = p->second;
-
- if (e->end() <= start+len &&
- e->is_tx())
- tx.push_back(e);
-
- block_t lenfromcur = MIN(e->end() - cur, left);
- cur += lenfromcur;
- left -= lenfromcur;
- p++;
- continue; // more?
- } else if (p->first > cur) {
- // gap.. miss
- block_t next = p->first;
- left -= (next-cur);
- cur = next;
- continue;
- }
- else
- assert(0);
- }
-
- return 0;
-}
-
-
-int ObjectCache::try_map_read(block_t start, block_t len)
-{
- map<block_t, BufferHead*>::iterator p = find_bh(start, len);
- block_t cur = start;
- block_t left = len;
-
- int num_missing = 0;
-
- while (left > 0) {
- // at end?
- if (p == data.end()) {
- // rest is a miss.
- vector<extent_t> exv;
- on->map_extents(cur, left, // no prefetch here!
- exv, 0);
- for (unsigned i=0; i<exv.size(); i++)
- if (exv[i].start)
- num_missing++;
- left = 0;
- cur = start+len;
- break;
- }
-
- if (p->first <= cur) {
- // have it (or part of it)
- BufferHead *e = p->second;
-
- if (e->is_clean() ||
- e->is_dirty() ||
- e->is_tx()) {
- dout(20) << "try_map_read hit " << *e << dendl;
- }
- else if (e->is_corrupt()) {
- dout(20) << "try_map_read corrupt " << *e << dendl;
- }
- else if (e->is_rx()) {
- dout(20) << "try_map_read rx " << *e << dendl;
- num_missing++;
- }
- else if (e->is_partial()) {
- dout(0) << "try_map_read partial " << *e << dendl;
- num_missing++;
- }
- else {
- dout(0) << "try_map_read got unexpected " << *e << dendl;
- assert(0);
- }
-
- block_t lenfromcur = MIN(e->end() - cur, left);
- cur += lenfromcur;
- left -= lenfromcur;
- p++;
- continue; // more?
- } else if (p->first > cur) {
- // gap.. miss
- block_t next = p->first;
- vector<extent_t> exv;
- on->map_extents(cur,
- MIN(next-cur, left), // no prefetch
- exv, 0);
- for (unsigned i=0; i<exv.size(); i++)
- if (exv[i].start) {
- dout(20) << "try_map_read gap " << exv[i] << dendl;
- num_missing++;
- }
- left -= (p->first - cur);
- cur = p->first;
- continue; // more?
- }
- else
- assert(0);
- }
-
- assert(left == 0);
- assert(cur == start+len);
- return num_missing;
-}
-
-
-
-
-
-/*
- * map a range of blocks into buffer_heads.
- * - create missing buffer_heads as necessary.
- * - fragment along disk extent boundaries
- */
-int ObjectCache::map_read(block_t start, block_t len,
- map<block_t, BufferHead*>& hits,
- map<block_t, BufferHead*>& missing,
- map<block_t, BufferHead*>& rx,
- map<block_t, BufferHead*>& partial) {
-
- map<block_t, BufferHead*>::iterator p = find_bh(start, len);
- block_t cur = start;
- block_t left = len;
-
- while (left > 0) {
- // at end?
- if (p == data.end()) {
- // rest is a miss.
- vector<extent_t> exv;
- on->map_extents(cur,
- //MIN(left + g_conf.ebofs_max_prefetch, // prefetch
- //on->object_blocks-cur),
- left, // no prefetch
- exv, 0);
- for (unsigned i=0; i<exv.size() && left > 0; i++) {
- BufferHead *n = new BufferHead(this, cur, exv[i].length);
- if (exv[i].start) {
- missing[cur] = n;
- dout(20) << "map_read miss " << left << " left, " << *n << dendl;
- } else {
- hits[cur] = n;
- n->set_state(BufferHead::STATE_CLEAN);
- dout(20) << "map_read hole " << left << " left, " << *n << dendl;
- }
- bc->add_bh(n);
- cur += MIN(left,exv[i].length);
- left -= MIN(left,exv[i].length);
- }
- assert(left == 0);
- assert(cur == start+len);
- break;
- }
-
- if (p->first <= cur) {
- // have it (or part of it)
- BufferHead *e = p->second;
-
- if (e->is_clean() ||
- e->is_dirty() ||
- e->is_tx() ||
- e->is_corrupt()) {
- hits[cur] = e; // readable!
- dout(20) << "map_read hit " << *e << dendl;
- bc->touch(e);
- }
- else if (e->is_rx()) {
- rx[cur] = e; // missing, not readable.
- dout(20) << "map_read rx " << *e << dendl;
- }
- else if (e->is_partial()) {
- partial[cur] = e;
- dout(20) << "map_read partial " << *e << dendl;
- }
- else {
- dout(0) << "map_read ??? got unexpected " << *e << dendl;
- assert(0);
- }
-
- block_t lenfromcur = MIN(e->end() - cur, left);
- cur += lenfromcur;
- left -= lenfromcur;
- p++;
- continue; // more?
- } else if (p->first > cur) {
- // gap.. miss
- block_t next = p->first;
- vector<extent_t> exv;
- on->map_extents(cur,
- //MIN(next-cur, MIN(left + g_conf.ebofs_max_prefetch, // prefetch
- // on->object_blocks-cur)),
- MIN(next-cur, left), // no prefetch
- exv, 0);
-
- for (unsigned i=0; i<exv.size() && left>0; i++) {
- BufferHead *n = new BufferHead(this, cur, exv[i].length);
- if (exv[i].start) {
- missing[cur] = n;
- dout(20) << "map_read gap " << *n << dendl;
- } else {
- n->set_state(BufferHead::STATE_CLEAN);
- hits[cur] = n;
- dout(20) << "map_read hole " << *n << dendl;
- }
- bc->add_bh(n);
- cur += MIN(left, n->length());
- left -= MIN(left, n->length());
- }
- continue; // more?
- }
- else
- assert(0);
- }
-
- assert(left == 0);
- assert(cur == start+len);
- return 0;
-}
-
-
-/*
- * map a range of pages on an object's buffer cache.
- *
- * - break up bufferheads that don't fall completely within the range
- * - cancel rx ops we contain
- * - resubmit non-contained rx ops if we split bufferheads
- * - cancel obsoleted tx ops
- * - break contained bh's over disk extent boundaries
- */
-int ObjectCache::map_write(block_t start, block_t len,
- map<block_t, BufferHead*>& hits,
- version_t super_epoch)
-{
- dout(10) << "map_write " << *on << " " << start << "~" << len << dendl;
-
- map<block_t, BufferHead*>::iterator p = find_bh(start, len); // p->first >= start
- block_t cur = start;
- block_t left = len;
-
- //dump();
-
- while (left > 0) {
- // max for this bh (bc of (re)alloc on disk)
- block_t max = left;
-
- // based on disk extent boundary ...
- vector<extent_t> exv;
- on->map_extents(cur, max, exv, 0);
- if (exv.size() > 1)
- max = exv[0].length;
- bool hole = false;
- if (exv.size() > 0 && exv[0].start == 0)
- hole = true;
-
- dout(10) << "map_write " << cur << "~" << max << dendl;
-
- // at end?
- if (p == data.end()) {
- BufferHead *n = new BufferHead(this, cur, max);
- if (hole)
- n->set_state(BufferHead::STATE_CLEAN); // hole
- bc->add_bh(n);
- hits[cur] = n;
- left -= max;
- cur += max;
- continue;
- }
-
- dout(10) << "p is " << *p->second << dendl;
-
-
- if (p->first <= cur) {
- BufferHead *bh = p->second;
- dout(10) << "map_write bh " << *bh << " intersected" << dendl;
-
- if (p->first < cur) {
- if (cur+max >= p->first+p->second->length()) {
- // we want right bit (one splice)
- if (bh->is_rx() && bc->bh_cancel_read(bh)) {
- BufferHead *right = bc->split(bh, cur);
- bc->bh_read(on, bh); // reread left bit
- bh = right;
- } else if (bh->is_tx() && bh->epoch_modified == super_epoch && bc->bh_cancel_write(bh, super_epoch)) {
- BufferHead *right = bc->split(bh, cur);
- bc->bh_write(on, bh); // rewrite left bit
- bh = right;
- } else {
- bh = bc->split(bh, cur); // just split it
- }
- p++;
- assert(p->second == bh);
- } else {
- // we want middle bit (two splices)
- if (bh->is_rx() && bc->bh_cancel_read(bh)) {
- BufferHead *middle = bc->split(bh, cur);
- bc->bh_read(on, bh); // reread left
- p++;
- assert(p->second == middle);
- BufferHead *right = bc->split(middle, cur+max);
- bc->bh_read(on, right); // reread right
- bh = middle;
- } else if (bh->is_tx() && bh->epoch_modified == super_epoch && bc->bh_cancel_write(bh, super_epoch)) {
- BufferHead *middle = bc->split(bh, cur);
- bc->bh_write(on, bh); // redo left
- p++;
- assert(p->second == middle);
- BufferHead *right = bc->split(middle, cur+max);
- bc->bh_write(on, right); // redo right
- bh = middle;
- } else {
- BufferHead *middle = bc->split(bh, cur);
- p++;
- assert(p->second == middle);
- bc->split(middle, cur+max);
- bh = middle;
- }
- }
- } else if (p->first == cur) {
- if (p->second->length() <= max) {
- // whole bufferhead, piece of cake.
- } else {
- // we want left bit (one splice)
- if (bh->is_rx() && bc->bh_cancel_read(bh)) {
- BufferHead *right = bc->split(bh, cur+max);
- bc->bh_read(on, right); // re-rx the right bit
- } else if (bh->is_tx() && bh->epoch_modified == super_epoch && bc->bh_cancel_write(bh, super_epoch)) {
- BufferHead *right = bc->split(bh, cur+max);
- bc->bh_write(on, right); // re-tx the right bit
- } else {
- bc->split(bh, cur+max); // just split
- }
- }
- }
-
- // try to cancel tx?
- if (bh->is_tx() && bh->epoch_modified == super_epoch)
- bc->bh_cancel_write(bh, super_epoch);
-
- // put in our map
- hits[cur] = bh;
-
- // keep going.
- block_t lenfromcur = bh->end() - cur;
- cur += lenfromcur;
- left -= lenfromcur;
- p++;
- continue;
- } else {
- // gap!
- block_t next = p->first;
- block_t glen = MIN(next-cur, max);
- dout(10) << "map_write gap " << cur << "~" << glen << dendl;
- BufferHead *n = new BufferHead(this, cur, glen);
- if (hole)
- n->set_state(BufferHead::STATE_CLEAN); // hole
- bc->add_bh(n);
- hits[cur] = n;
-
- cur += glen;
- left -= glen;
- continue; // more?
- }
- }
-
- assert(left == 0);
- assert(cur == start+len);
- return 0;
-}
-
-/* don't need this.
-int ObjectCache::scan_versions(block_t start, block_t len,
- version_t& low, version_t& high)
-{
- map<block_t, BufferHead*>::iterator p = data.lower_bound(start);
- // p->first >= start
-
- if (p != data.begin() && p->first > start) {
- p--; // might overlap?
- if (p->first + p->second->length() <= start)
- p++; // doesn't overlap.
- }
- if (p->first >= start+len)
- return -1; // to the right. no hits.
-
- // start
- low = high = p->second->get_version();
-
- for (p++; p != data.end(); p++) {
- // past?
- if (p->first >= start+len) break;
-
- const version_t v = p->second->get_version();
- if (low > v) low = v;
- if (high < v) high = v;
- }
-
- return 0;
-}
-*/
-
-void ObjectCache::touch_bottom(block_t bstart, block_t blast)
-{
- for (map<block_t, BufferHead*>::iterator p = data.lower_bound(bstart);
- p != data.end();
- ++p) {
- BufferHead *bh = p->second;
-
- // don't trim unless it's entirely in our range
- if (bh->start() < bstart) continue;
- if (bh->end() > blast) break;
-
- dout(12) << "moving " << *bh << " to bottom of lru" << dendl;
- bc->touch_bottom(bh); // move to bottom of lru list
- }
-}
-
-void ObjectCache::discard_bh(BufferHead *bh, version_t super_epoch)
-{
- bool uncom = on->uncommitted.contains(bh->start(), bh->length());
- dout(10) << "discard_bh " << *bh << " uncom " << uncom
- << " of " << on->uncommitted
- << dendl;
-
- // whole thing
- // cancel any pending/queued io, if possible.
- if (bh->is_rx())
- bc->bh_cancel_read(bh);
- if (bh->is_tx() && uncom && bh->epoch_modified == super_epoch)
- bc->bh_cancel_write(bh, super_epoch);
- if (bh->shadow_of) {
- dout(10) << "truncate " << *bh << " unshadowing " << *bh->shadow_of << dendl;
- // shadow
- bh->shadow_of->remove_shadow(bh);
- }
-
- // kick read waiters
- list<Context*> finished;
- bh->take_read_waiters(finished);
- finish_contexts(finished, -1);
-
- bc->remove_bh(bh);
-}
-
-void ObjectCache::truncate(block_t blocks, version_t super_epoch)
-{
- dout(7) << "truncate " << object_id
- << " " << blocks << " blocks"
- << dendl;
-
- while (!data.empty()) {
- BufferHead *bh = data.rbegin()->second;
-
- if (bh->end() <= blocks) break;
-
- if (bh->start() < blocks) {
- // we want right bit (one splice)
- if (bh->is_rx() && bc->bh_cancel_read(bh)) {
- BufferHead *right = bc->split(bh, blocks);
- bc->bh_read(on, bh); // reread left bit
- bh = right;
- } else if (bh->is_tx() && bh->epoch_modified == super_epoch && bc->bh_cancel_write(bh, super_epoch)) {
- BufferHead *right = bc->split(bh, blocks);
- bc->bh_write(on, bh); // rewrite left bit
- bh = right;
- } else {
- bh = bc->split(bh, blocks); // just split it
- }
- // no worries about partials up here, they're always 1 block (and thus never split)
- }
-
- discard_bh(bh, super_epoch);
- }
-}
-
-
-void ObjectCache::clone_to(Onode *other)
-{
- ObjectCache *ton = 0;
-
- for (map<block_t, BufferHead*>::iterator p = data.begin();
- p != data.end();
- p++) {
- BufferHead *bh = p->second;
- dout(10) << "clone_to ? " << *bh << dendl;
- if (bh->is_dirty() || bh->is_tx() || bh->is_partial()) {
- // dup dirty or tx bh's
- if (!ton)
- ton = other->get_oc(bc);
- BufferHead *nbh = new BufferHead(ton, bh->start(), bh->length());
- nbh->data = bh->data; // just copy refs to underlying buffers.
- bc->add_bh(nbh);
-
- if (bh->is_partial()) {
- dout(0) << "clone_to PARTIAL FIXME NOT FULLY IMPLEMENTED ******" << dendl;
- nbh->partial = bh->partial;
- bc->mark_partial(nbh);
- } else {
- // clean buffer will shadow
- bh->add_shadow(nbh);
- bc->mark_clean(nbh);
- }
-
- dout(10) << "clone_to dup " << *bh << " -> " << *nbh << dendl;
- }
- }
-}
-
-
-
-BufferHead *ObjectCache::merge_bh_left(BufferHead *left, BufferHead *right)
-{
- dout(10) << "merge_bh_left " << *left << " " << *right << dendl;
- assert(left->end() == right->start());
- assert(left->is_clean());
- assert(!left->is_hole());
- assert(right->is_clean());
- assert(!right->is_hole());
- assert(right->get_num_ref() == 0);
-
- // hrm, is this right?
- if (right->version > left->version) left->version = right->version;
- if (right->last_flushed > left->last_flushed) left->last_flushed = right->last_flushed;
-
- bc->stat_sub(left);
- left->reset_length(left->length() + right->length());
- bc->stat_add(left);
- left->data.claim_append(right->data);
-
- // remove right
- bc->remove_bh(right);
- dout(10) << "merge_bh_left result " << *left << dendl;
- return left;
-}
-
-/* wait until this has a user
-void ObjectCache::try_merge_bh(BufferHead *bh)
-{
- dout(0) << "try_merge_bh " << *bh << dendl;
-
- map<block_t, BufferHead*>::iterator p = data.lower_bound(bh->start());
- assert(p->second == bh);
-
- try_merge_bh_left(p);
- try_merge_bh_right(p);
-}
-*/
-
-
-void ObjectCache::try_merge_bh_left(map<block_t, BufferHead*>::iterator& p)
-{
- BufferHead *bh = p->second;
- dout(10) << "try_merge_bh_left " << *bh << dendl;
-
- // left?
- if (p != data.begin()) {
- p--;
- if (p->second->end() == bh->start() &&
- p->second->is_clean() &&
- !p->second->is_hole() &&
- bh->is_clean() &&
- !bh->is_hole() &&
- bh->get_num_ref() == 0 &&
- bh->data.buffers().size() < 8 &&
- p->second->data.buffers().size() < 8)
- bh = merge_bh_left(p->second, bh); // yay!
- else
- p++; // nope.
- }
-}
-
-void ObjectCache::try_merge_bh_right(map<block_t, BufferHead*>::iterator& p)
-{
- BufferHead *bh = p->second;
- dout(10) << "try_merge_bh_right " << *bh << dendl;
-
- // right?
- map<block_t, BufferHead*>::iterator o = p;
- p++;
- if (p != data.end() &&
- bh->end() == p->second->start() &&
- p->second->is_clean() &&
- !p->second->is_hole() &&
- bh->is_clean() &&
- !bh->is_hole() &&
- p->second->get_num_ref() == 0 &&
- bh->data.buffers().size() < 8 &&
- p->second->data.buffers().size() < 8) {
- BufferHead *right = p->second;
- p--;
- merge_bh_left(bh, right);
- } else
- p = o;
-}
-
-
-void ObjectCache::scrub_csums()
-{
- dout(10) << "scrub_csums on " << *this->on << dendl;
- int bad = 0;
- for (map<block_t, BufferHead*>::iterator p = data.begin();
- p != data.end();
- p++) {
- BufferHead *bh = p->second;
- if (bh->is_rx() || bh->is_missing()) continue; // nothing to scrub
- if (bh->is_clean() && bh->data.length() == 0) continue; // hole.
- if (bh->is_clean() || bh->is_tx()) {
- for (unsigned i=0; i<bh->length(); i++) {
- vector<extent_t> exv;
- on->map_extents(bh->start()+i, 1, exv, 0);
- assert(exv.size() == 1);
- if (exv[0].start == 0) continue; // hole.
- csum_t want = *on->get_extent_csum_ptr(bh->start()+i, 1);
- csum_t b = calc_csum(&bh->data[i*EBOFS_BLOCK_SIZE], EBOFS_BLOCK_SIZE);
- if (b != want) {
- dout(0) << "scrub_csums bad data at " << (bh->start()+i) << " have "
- << hex << b << " should be " << want << dec
- << " in bh " << *bh
- << dendl;
- bad++;
- }
- }
- }
- }
- assert(bad == 0);
-}
-
-
-/************** BufferCache ***************/
-
-#undef dout_prefix
-#define dout_prefix *_dout << "ebofs.bc."
-
-
-BufferHead *BufferCache::split(BufferHead *orig, block_t after)
-{
- dout(20) << "split " << *orig << " at " << after << dendl;
-
- // split off right
- block_t newleftlen = after - orig->start();
- BufferHead *right = new BufferHead(orig->get_oc(), after, orig->length() - newleftlen);
- right->set_version(orig->get_version());
- right->epoch_modified = orig->epoch_modified;
- right->last_flushed = orig->last_flushed;
- right->set_state(orig->get_state());
- add_bh(right);
-
- // shorten left
- stat_sub(orig);
- orig->reset_length( newleftlen );
- stat_add(orig);
-
- // adjust rx_from
- if (orig->is_rx()) {
- right->rx_from = orig->rx_from;
- orig->rx_from.length = newleftlen;
- right->rx_from.length -= newleftlen;
- right->rx_from.start += newleftlen;
- }
-
- // dup shadows
- for (set<BufferHead*>::iterator p = orig->shadows.begin();
- p != orig->shadows.end();
- ++p)
- right->add_shadow(*p);
-
- // split buffers too
- bufferlist bl;
- bl.claim(orig->data);
- if (bl.length()) {
- assert(bl.length() == (orig->length()+right->length())*EBOFS_BLOCK_SIZE);
- right->data.substr_of(bl, orig->length()*EBOFS_BLOCK_SIZE, right->length()*EBOFS_BLOCK_SIZE);
- orig->data.substr_of(bl, 0, orig->length()*EBOFS_BLOCK_SIZE);
- }
-
- // move read waiters
- if (!orig->waitfor_read.empty()) {
- map<block_t, list<Context*> >::iterator o, p = orig->waitfor_read.end();
- p--;
- while (p != orig->waitfor_read.begin()) {
- if (p->first < right->start()) break;
- dout(0) << "split moving waiters at block " << p->first << " to right bh" << dendl;
- right->waitfor_read[p->first].swap( p->second );
- o = p;
- p--;
- orig->waitfor_read.erase(o);
- }
- }
-
- dout(20) << "split left is " << *orig << dendl;
- dout(20) << "split right is " << *right << dendl;
- return right;
-}
-
-
-
-
-void BufferCache::bh_read(Onode *on, BufferHead *bh, block_t from)
-{
- dout(10) << "bh_read " << *on << " on " << *bh << dendl;
-
- if (bh->is_missing()) {
- mark_rx(bh);
- } else {
- assert(bh->is_partial());
- }
-
- // get extent. there should be only one!
- vector<extent_t> exv;
- on->map_extents(bh->start(), bh->length(), exv, 0);
- assert(exv.size() == 1);
- assert(exv[0].start != 0); // not a hole.
- extent_t ex = exv[0];
-
- if (from) { // force behavior, used for reading partials
- dout(10) << "bh_read forcing read from block " << from << " (for a partial)" << dendl;
- ex.start = from;
- ex.length = 1;
- }
-
- // this should be empty!!
- assert(bh->rx_ioh == 0);
-
- dout(20) << "bh_read " << *on << " " << *bh << " from " << ex << dendl;
-
- C_OC_RxFinish *fin = new C_OC_RxFinish(ebofs_lock, on->oc,
- bh->start(), bh->length(),
- ex.start);
-
- //bufferpool.alloc(EBOFS_BLOCK_SIZE*bh->length(), fin->bl); // new buffers!
- fin->bl.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*bh->length()) );
-
- bh->rx_ioh = dev.read(ex.start, ex.length, fin->bl,
- fin);
- bh->rx_from = ex;
- on->oc->get();
-
-}
-
-bool BufferCache::bh_cancel_read(BufferHead *bh)
-{
- if (bh->rx_ioh && dev.cancel_io(bh->rx_ioh) >= 0) {
- dout(10) << "bh_cancel_read on " << *bh << dendl;
- bh->rx_ioh = 0;
- mark_missing(bh);
- int l = bh->oc->put();
- assert(l);
- return true;
- }
- return false;
-}
-
-void BufferCache::bh_write(Onode *on, BufferHead *bh, block_t shouldbe)
-{
- dout(10) << "bh_write " << *on << " on " << *bh << " in epoch " << bh->epoch_modified << dendl;
- assert(bh->get_version() > 0);
-
- assert(bh->is_dirty());
- mark_tx(bh);
-
- // get extents
- vector<extent_t> exv;
- on->map_extents(bh->start(), bh->length(), exv, 0);
- assert(exv.size() == 1);
- assert(exv[0].start != 0);
- extent_t ex = exv[0];
-
- if (shouldbe)
- assert(ex.length == 1 && ex.start == shouldbe);
-
- dout(20) << "bh_write " << *on << " " << *bh << " to " << ex << dendl;
-
- //assert(bh->tx_ioh == 0);
-
- assert(bh->get_last_flushed() < bh->get_version());
-
- bh->tx_block = ex.start;
- bh->tx_ioh = dev.write(ex.start, ex.length, bh->data,
- new C_OC_TxFinish(ebofs_lock, on->oc,
- bh->start(), bh->length(),
- bh->get_version(),
- bh->epoch_modified),
- "bh_write");
-
- on->oc->get();
- inc_unflushed( EBOFS_BC_FLUSH_BHWRITE, bh->epoch_modified );
-}
-
-
-bool BufferCache::bh_cancel_write(BufferHead *bh, version_t cur_epoch)
-{
- assert(bh->is_tx());
- assert(bh->epoch_modified == cur_epoch);
- assert(bh->epoch_modified > 0);
- if (bh->tx_ioh && dev.cancel_io(bh->tx_ioh) >= 0) {
- dout(10) << "bh_cancel_write on " << *bh << dendl;
- bh->tx_ioh = 0;
- mark_dirty(bh);
-
- dec_unflushed( EBOFS_BC_FLUSH_BHWRITE, bh->epoch_modified ); // assert.. this should be the same epoch!
-
- int l = bh->oc->put();
- assert(l);
- return true;
- }
- return false;
-}
-
-void BufferCache::tx_finish(ObjectCache *oc,
- ioh_t ioh, block_t start, block_t length,
- version_t version, version_t epoch)
-{
- ebofs_lock.Lock();
-
- // finish oc
- if (oc->put() == 0) {
- delete oc;
- } else
- oc->tx_finish(ioh, start, length, version, epoch);
-
- // update unflushed counter
- assert(get_unflushed(EBOFS_BC_FLUSH_BHWRITE, epoch) > 0);
- dec_unflushed(EBOFS_BC_FLUSH_BHWRITE, epoch);
-
- ebofs_lock.Unlock();
-}
-
-void BufferCache::rx_finish(ObjectCache *oc,
- ioh_t ioh, block_t start, block_t length,
- block_t diskstart,
- bufferlist& bl)
-{
- ebofs_lock.Lock();
- dout(10) << "rx_finish ioh " << ioh << " on " << start << "~" << length
- << ", at device block " << diskstart << dendl;
-
- // oc
- if (oc->put() == 0)
- delete oc;
- else
- oc->rx_finish(ioh, start, length, bl);
-
- // done.
- ebofs_lock.Unlock();
-}
-
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-
-#ifndef CEPH_EBOFS_BUFFERCACHE_H
-#define CEPH_EBOFS_BUFFERCACHE_H
-
-#include "include/lru.h"
-#include "include/Context.h"
-
-#include "common/Clock.h"
-
-#include "types.h"
-#include "BlockDevice.h"
-
-#include "include/interval_set.h"
-#include "include/xlist.h"
-
-class ObjectCache;
-class BufferCache;
-class Onode;
-
-class BufferHead : public LRUObject {
- public:
- /*
- * - buffer_heads should always break across disk extent boundaries
- * - partial buffer_heads are always 1 block.
- */
- const static int STATE_MISSING = 0; // missing; data is on disk, but not loaded.
- const static int STATE_CLEAN = 1; // Rw clean
- const static int STATE_DIRTY = 2; // RW dirty
- const static int STATE_TX = 3; // Rw flushing to disk
- const static int STATE_RX = 4; // w reading from disk
- const static int STATE_PARTIAL = 5; // reading from disk, + partial content map. always 1 block.
- const static int STATE_CORRUPT = 6; // data on disk doesn't match onode checksum
-
- public:
- ObjectCache *oc;
-
- bufferlist data; // if empty, defined as zero (hole)
-
- ioh_t rx_ioh; //
- extent_t rx_from;
- ioh_t tx_ioh; //
- block_t tx_block;
-
- map<uint64_t, bufferlist> partial; // partial dirty content overlayed onto incoming data
-
- map<block_t, list<Context*> > waitfor_read;
-
- set<BufferHead*> shadows; // shadow bh's that clone()ed me.
- BufferHead* shadow_of;
-
-
- private:
- int ref;
- int state;
-
- public:
- version_t epoch_modified;
-
- version_t version; // current version in cache
- version_t last_flushed; // last version flushed to disk
-
- extent_t object_loc; // block position _in_object_
-
- utime_t dirty_stamp;
- //xlist<BufferHead*>::item xlist_dirty;
-
- bool want_to_expire; // wants to be at bottom of lru
-
- public:
- BufferHead(ObjectCache *o, block_t start, block_t len) :
- oc(o), //cancellable_ioh(0), tx_epoch(0),
- rx_ioh(0), tx_ioh(0), tx_block(0),
- shadow_of(0),
- ref(0), state(STATE_MISSING), epoch_modified(0), version(0), last_flushed(0),
- //object_loc(start, len),
- //xlist_dirty(this),
- want_to_expire(false) {
- object_loc.start = start;
- object_loc.length = len;
- }
- ~BufferHead() {
- unpin_shadows();
- }
-
- ObjectCache *get_oc() { return oc; }
-
- int get() {
- assert(ref >= 0);
- if (ref == 0) lru_pin();
- return ++ref;
- }
- int put() {
- assert(ref > 0);
- if (ref == 1) lru_unpin();
- --ref;
- return ref;
- }
- int get_num_ref() { return ref; }
-
- block_t start() { return object_loc.start; }
- //void set_start(block_t s) { object_loc.start = s; }
- block_t length() { return object_loc.length; }
- void reset_length(block_t l) { object_loc.length = l; }
- block_t end() { return start() + length(); }
- block_t last() { return end()-1; }
-
- version_t get_version() { return version; }
- void set_version(version_t v) { version = v; }
- version_t get_last_flushed() { return last_flushed; }
- void set_last_flushed(version_t v) {
- if (v <= last_flushed) cout << "last_flushed being set to " << v << ", was " << last_flushed << std::endl;
- assert(v > last_flushed);
- last_flushed = v;
- }
-
- utime_t get_dirty_stamp() { return dirty_stamp; }
- void set_dirty_stamp(utime_t t) { dirty_stamp = t; }
-
- void set_state(int s) {
- if (s == STATE_PARTIAL || s == STATE_RX || s == STATE_TX) get();
- if (state == STATE_PARTIAL || state == STATE_RX || state == STATE_TX) put();
-
- if ((state == STATE_TX && s != STATE_TX) ||
- (state == STATE_PARTIAL && s != STATE_PARTIAL))
- unpin_shadows();
-
- state = s;
- }
- int get_state() { return state; }
-
- bool is_missing() { return state == STATE_MISSING; }
- bool is_dirty() { return state == STATE_DIRTY; }
- bool is_clean() { return state == STATE_CLEAN; }
- bool is_tx() { return state == STATE_TX; }
- bool is_rx() { return state == STATE_RX; }
- bool is_partial() { return state == STATE_PARTIAL; }
- bool is_corrupt() { return state == STATE_CORRUPT; }
-
- bool is_hole() { return is_clean() && data.length() == 0; }
-
- void add_shadow(BufferHead *dup) {
- shadows.insert(dup);
- dup->shadow_of = this;
- dup->get();
- }
- void remove_shadow(BufferHead *dup) {
- shadows.erase(dup);
- dup->shadow_of = 0;
- dup->put();
- }
- void unpin_shadows() {
- for (set<BufferHead*>::iterator p = shadows.begin();
- p != shadows.end();
- ++p) {
- //cout << "unpin shadow " << *p << std::endl;
- (*p)->shadow_of = 0;
- (*p)->put();
- }
- shadows.clear();
- }
-
- void copy_partial_substr(uint64_t start, uint64_t end, bufferlist& bl) {
- map<uint64_t, bufferlist>::iterator i = partial.begin();
-
- // skip first bits (fully to left)
- while (i != partial.end() &&
-        (i->first + i->second.length() < start))
- i++;
- assert(i != partial.end());
- assert(i->first <= start);
-
- // first
- unsigned bhoff = MAX(start, i->first) - i->first;
- unsigned bhlen = MIN(end-start, i->second.length());
- bl.substr_of( i->second, bhoff, bhlen );
-
- uint64_t pos = i->first + i->second.length();
-
- // have continuous to end?
- for (i++; i != partial.end(); i++) {
- if (pos >= end) break;
- assert(pos == i->first);
-
- pos = i->first + i->second.length();
-
- if (pos <= end) { // this whole frag
- bl.append( i->second );
- } else { // partial end
- unsigned bhlen = end-start-bl.length();
- bufferlist frag;
- frag.substr_of( i->second, 0, bhlen );
- bl.claim_append(frag);
- break; // done.
- }
- }
-
- assert(pos >= end);
- assert(bl.length() == (unsigned)(end-start));
- }
-
- bool have_partial_range(uint64_t start, uint64_t end) {
- map<uint64_t, bufferlist>::iterator i = partial.begin();
-
- // skip first bits (fully to left)
- while (i != partial.end() &&
-        (i->first + i->second.length() < start))
- i++;
- if (i == partial.end()) return false;
-
- // have start?
- if (i->first > start) return false;
- uint64_t pos = i->first + i->second.length();
-
- // have continuous to end?
- for (i++; i != partial.end(); i++) {
- assert(pos <= i->first);
- if (pos < i->first) return false;
- assert(pos == i->first);
- pos = i->first + i->second.length();
- if (pos >= end) break; // gone far enough
- }
-
- if (pos >= end) return true;
- return false;
- }
-
- bool partial_is_complete(uint64_t size) {
- return have_partial_range( 0, MIN(size, EBOFS_BLOCK_SIZE) );
- }
-
- void apply_partial();
- void add_partial(uint64_t off, bufferlist& p);
-
- void take_read_waiters(list<Context*>& finished) {
- for (map<block_t,list<Context*> >::iterator p = waitfor_read.begin();
- p != waitfor_read.end();
- p++)
- finished.splice(finished.begin(), p->second);
- waitfor_read.clear();
- }
-
-};
-
-inline ostream& operator<<(ostream& out, BufferHead& bh)
-{
- out << "bufferhead(" << bh.start() << "~" << bh.length();
- out << " v" << bh.get_version() << "/" << bh.get_last_flushed();
- if (bh.is_missing()) out << " missing";
- if (bh.is_dirty()) out << " dirty";
- if (bh.is_clean()) {
- out << " clean";
- if (bh.data.length() == 0)
- out << " HOLE";
- }
- if (bh.is_rx()) out << " rx";
- if (bh.is_tx()) out << " tx";
- if (bh.is_partial()) out << " partial";
- if (bh.is_corrupt()) out << " corrupt";
-
- // include epoch modified?
- if (bh.is_dirty() || bh.is_tx() || bh.is_partial())
- out << "(e" << bh.epoch_modified << ")";
-
- //out << " " << bh.data.length();
- out << " " << &bh;
- out << ")";
- return out;
-}
-
-
-class ObjectCache {
- public:
- pobject_t object_id;
- Onode *on;
- BufferCache *bc;
-
- private:
- map<block_t, BufferHead*> data;
- int ref;
-
- public:
- version_t write_count;
-
-
- public:
- ObjectCache(pobject_t o, Onode *_on, BufferCache *b) :
- object_id(o), on(_on), bc(b), ref(0),
- write_count(0) { }
- ~ObjectCache() {
- assert(data.empty());
- assert(ref == 0);
- }
-
- int get() {
- ++ref;
- //cout << "oc.get " << object_id << " " << ref << std::endl;
- return ref;
- }
- int put() {
- assert(ref > 0);
- --ref;
- //cout << "oc.put " << object_id << " " << ref << std::endl;
- return ref;
- }
-
- pobject_t get_object_id() { return object_id; }
-
-
- /*
- * will return bh containing pos.
- * if none, then the _next_ bh.
- * if none, then data.end().
- */
- map<block_t, BufferHead*>::iterator find_bh(block_t start, block_t len=0) {
- map<block_t, BufferHead*>::iterator p;
-
- // hack to speed up common cases
- if (start == 0) {
- p = data.begin();
- } else if (len == 1 &&
- !data.empty() &&
- data.rbegin()->first <= start) {
- // append hack.
- p = data.end();
- p--;
- if (p->first < start) p++;
- } else {
- p = data.lower_bound(start);
- }
-
- if (p != data.begin() &&
- (p == data.end() || p->first > start)) {
- p--; // might overlap!
- if (p->first + p->second->length() <= start)
- p++; // doesn't overlap.
- }
- return p;
- }
-
- BufferHead *find_bh_containing(block_t b) {
- map<block_t, BufferHead*>::iterator p = find_bh(b, 1);
- if (p != data.end() &&
- p->second->start() <= b &&
- p->second->end() > b)
- return p->second;
- return 0;
- }
-
-
- void add_oc_bh(BufferHead *bh) {
- // add to my map
- assert(data.count(bh->start()) == 0);
-
- if (0) { // sanity check FIXME DEBUG
- //cout << "add_bh " << bh->start() << "~" << bh->length() << std::endl;
- map<block_t,BufferHead*>::iterator p = data.lower_bound(bh->start());
- if (p != data.end()) {
- //cout << " after " << *p->second << std::endl;
- //cout << " after starts at " << p->first << std::endl;
- assert(p->first >= bh->end());
- }
- if (p != data.begin()) {
- p--;
- //cout << " before starts at " << p->second->start()
- //<< " and ends at " << p->second->end() << std::endl;
- //cout << " before " << *p->second << std::endl;
- assert(p->second->end() <= bh->start());
- }
- }
-
- data[bh->start()] = bh;
- }
- void remove_oc_bh(BufferHead *bh) {
- assert(data.count(bh->start()));
- data.erase(bh->start());
- }
- bool is_empty() { return data.empty(); }
-
- void try_merge_bh(BufferHead *bh);
- void try_merge_bh_left(map<block_t, BufferHead*>::iterator& p);
- void try_merge_bh_right(map<block_t, BufferHead*>::iterator& p);
- BufferHead* merge_bh_left(BufferHead *left, BufferHead *right);
-
- int find_tx(block_t start, block_t len,
- list<BufferHead*>& tx);
-
- int map_read(block_t start, block_t len,
- map<block_t, BufferHead*>& hits, // hits
- map<block_t, BufferHead*>& missing, // read these from disk
- map<block_t, BufferHead*>& rx, // wait for these to finish reading from disk
- map<block_t, BufferHead*>& partial); // (maybe) wait for these to read from disk
- int try_map_read(block_t start, block_t len); // just tell us how many extents we're missing.
-
- int map_write(block_t start, block_t len,
- map<block_t, BufferHead*>& hits,
- version_t super_epoch);
-
- void touch_bottom(block_t bstart, block_t blast);
-
- BufferHead *split(BufferHead *bh, block_t off);
-
- /*int scan_versions(block_t start, block_t len,
- version_t& low, version_t& high);
- */
-
- void rx_finish(ioh_t ioh, block_t start, block_t length, bufferlist& bl);
- void tx_finish(ioh_t ioh, block_t start, block_t length, version_t v, version_t epoch);
-
- void truncate(block_t blocks, version_t super_epoch);
- void discard_bh(BufferHead *bh, version_t super_epoch);
- // void tear_down();
-
- void clone_to(Onode *other);
-
- void dump() {
- for (map<block_t,BufferHead*>::iterator i = data.begin();
- i != data.end();
- i++)
- cout << "dump: " << i->first << ": " << *i->second << std::endl;
- }
-
- void scrub_csums();
-
-};
-
-
-
-class BufferCache {
- public:
- Mutex &ebofs_lock; // hack: this is a ref to global ebofs_lock
- BlockDevice &dev;
-
- //xlist<BufferHead*> dirty_bh;
-
- LRU lru_dirty, lru_rest;
-
- bool poison_commit;
-
- private:
- Cond stat_cond;
- Cond flush_cond;
- int stat_waiter;
-
- uint64_t stat_all;
- uint64_t stat_clean, stat_corrupt;
- uint64_t stat_dirty;
- uint64_t stat_rx;
- uint64_t stat_tx;
- uint64_t stat_partial;
- uint64_t stat_missing;
-
- int partial_reads;
-
-
-#define EBOFS_BC_FLUSH_BHWRITE 0
-#define EBOFS_BC_FLUSH_PARTIAL 1
-
- map<version_t, int> epoch_unflushed[2];
-
- public:
- BufferCache(BlockDevice& d, Mutex& el) :
- ebofs_lock(el), dev(d),
- stat_waiter(0),
- stat_all(0), stat_clean(0), stat_corrupt(0), stat_dirty(0), stat_rx(0), stat_tx(0), stat_partial(0), stat_missing(0),
- partial_reads(0)
- {}
-
-
- uint64_t get_size() {
- assert(stat_clean+stat_dirty+stat_rx+stat_tx+stat_partial+stat_corrupt+stat_missing == stat_all);
- return stat_all;
- }
- uint64_t get_trimmable() {
- return stat_clean+stat_corrupt;
- }
-
-
- // bh's in cache
- void add_bh(BufferHead *bh) {
- bh->get_oc()->add_oc_bh(bh);
- if (bh->is_dirty()) {
- lru_dirty.lru_insert_mid(bh);
- //dirty_bh.push_back(&bh->xlist_dirty);
- } else
- lru_rest.lru_insert_mid(bh);
- stat_add(bh);
- }
- void touch(BufferHead *bh) {
- if (bh->is_dirty()) {
- lru_dirty.lru_touch(bh);
- } else
- lru_rest.lru_touch(bh);
- }
- void touch_bottom(BufferHead *bh) {
- if (bh->is_dirty()) {
- bh->want_to_expire = true;
- lru_dirty.lru_bottouch(bh);
- } else
- lru_rest.lru_bottouch(bh);
- }
- void remove_bh(BufferHead *bh) {
- bh->get_oc()->remove_oc_bh(bh);
- stat_sub(bh);
- if (bh->is_dirty()) {
- lru_dirty.lru_remove(bh);
- //dirty_bh.push_back(&bh->xlist_dirty);
- } else
- lru_rest.lru_remove(bh);
- delete bh;
- }
-
- // stats
- void stat_add(BufferHead *bh) {
- assert(stat_clean+stat_dirty+stat_rx+stat_tx+stat_partial+stat_corrupt+stat_missing == stat_all);
- switch (bh->get_state()) {
- case BufferHead::STATE_MISSING: stat_missing += bh->length(); break;
- case BufferHead::STATE_CLEAN: stat_clean += bh->length(); break;
- case BufferHead::STATE_CORRUPT: stat_corrupt += bh->length(); break;
- case BufferHead::STATE_DIRTY: stat_dirty += bh->length(); break;
- case BufferHead::STATE_TX: stat_tx += bh->length(); break;
- case BufferHead::STATE_RX: stat_rx += bh->length(); break;
- case BufferHead::STATE_PARTIAL:
- stat_partial += bh->length();
- inc_partial_read();
- break;
- default: assert(0);
- }
- stat_all += bh->length();
- if (stat_waiter) stat_cond.Signal();
- }
- void stat_sub(BufferHead *bh) {
- assert(stat_clean+stat_dirty+stat_rx+stat_tx+stat_partial+stat_corrupt+stat_missing == stat_all);
- switch (bh->get_state()) {
- case BufferHead::STATE_MISSING: stat_missing -= bh->length(); assert(stat_missing >= 0); break;
- case BufferHead::STATE_CLEAN: stat_clean -= bh->length(); assert(stat_clean >= 0); break;
- case BufferHead::STATE_CORRUPT: stat_corrupt -= bh->length(); assert(stat_corrupt >= 0); break;
- case BufferHead::STATE_DIRTY: stat_dirty -= bh->length(); assert(stat_dirty >= 0); break;
- case BufferHead::STATE_TX: stat_tx -= bh->length(); assert(stat_tx >= 0); break;
- case BufferHead::STATE_RX: stat_rx -= bh->length(); assert(stat_rx >= 0); break;
- case BufferHead::STATE_PARTIAL:
- stat_partial -= bh->length(); assert(stat_partial >= 0);
- dec_partial_read();
- break;
- default: assert(0);
- }
- stat_all -= bh->length();
- }
- uint64_t get_stat_tx() { return stat_tx; }
- uint64_t get_stat_rx() { return stat_rx; }
- uint64_t get_stat_dirty() { return stat_dirty; }
- uint64_t get_stat_clean() { return stat_clean; }
- uint64_t get_stat_partial() { return stat_partial; }
-
-
- map<version_t, int> &get_unflushed(int what) {
- return epoch_unflushed[what];
- }
-
- int get_unflushed(int what, version_t epoch) {
- return epoch_unflushed[what][epoch];
- }
- void inc_unflushed(int what, version_t epoch) {
- epoch_unflushed[what][epoch]++;
- //cout << "inc_unflushed " << epoch << " now " << epoch_unflushed[what][epoch] << std::endl;
- }
- void dec_unflushed(int what, version_t epoch) {
- epoch_unflushed[what][epoch]--;
- //cout << "dec_unflushed " << epoch << " now " << epoch_unflushed[what][epoch] << std::endl;
- if (epoch_unflushed[what][epoch] == 0)
- flush_cond.Signal();
- }
-
- bool get_num_partials() {
- return partial_reads;
- }
- void inc_partial_read() {
- partial_reads++;
- }
- void dec_partial_read() {
- partial_reads--;
- if (partial_reads == 0 && stat_waiter)
- stat_cond.Signal();
- }
-
- void waitfor_stat() {
- stat_waiter++;
- stat_cond.Wait(ebofs_lock);
- stat_waiter--;
- }
- void waitfor_partials() {
- stat_waiter++;
- while (partial_reads > 0)
- stat_cond.Wait(ebofs_lock);
- stat_waiter--;
-
- }
- void waitfor_flush() {
- flush_cond.Wait(ebofs_lock);
- }
-
-
- // bh state
- void set_state(BufferHead *bh, int s) {
- // move between lru lists?
- if (s == BufferHead::STATE_DIRTY && bh->get_state() != BufferHead::STATE_DIRTY) {
- lru_rest.lru_remove(bh);
- lru_dirty.lru_insert_top(bh);
- //dirty_bh.push_back(&bh->xlist_dirty);
- }
- if (s != BufferHead::STATE_DIRTY && bh->get_state() == BufferHead::STATE_DIRTY) {
- lru_dirty.lru_remove(bh);
- if (bh->want_to_expire)
- lru_rest.lru_insert_bot(bh);
- else
- lru_rest.lru_insert_mid(bh);
- //dirty_bh.remove(&bh->xlist_dirty);
- }
-
- // set state
- stat_sub(bh);
- bh->set_state(s);
- stat_add(bh);
- }
-
- void copy_state(BufferHead *bh1, BufferHead *bh2) {
- set_state(bh2, bh1->get_state());
- }
-
- void mark_missing(BufferHead *bh) { set_state(bh, BufferHead::STATE_MISSING); };
- void mark_clean(BufferHead *bh) { set_state(bh, BufferHead::STATE_CLEAN); };
- void mark_corrupt(BufferHead *bh) { set_state(bh, BufferHead::STATE_CORRUPT); };
- void mark_rx(BufferHead *bh) { set_state(bh, BufferHead::STATE_RX); };
- void mark_partial(BufferHead *bh) { set_state(bh, BufferHead::STATE_PARTIAL); };
- void mark_tx(BufferHead *bh) { set_state(bh, BufferHead::STATE_TX); };
- void mark_dirty(BufferHead *bh) {
- set_state(bh, BufferHead::STATE_DIRTY);
- bh->set_dirty_stamp(g_clock.now());
- };
-
-
- // io
- void bh_read(Onode *on, BufferHead *bh, block_t from=0);
- void bh_write(Onode *on, BufferHead *bh, block_t shouldbe=0);
-
- bool bh_cancel_read(BufferHead *bh);
- bool bh_cancel_write(BufferHead *bh, version_t cur_epoch);
-
- void rx_finish(ObjectCache *oc, ioh_t ioh, block_t start, block_t len, block_t diskstart, bufferlist& bl);
- void tx_finish(ObjectCache *oc, ioh_t ioh, block_t start, block_t len, version_t v, version_t e);
-
- friend class C_E_FlushPartial;
-
- // bh fun
- BufferHead *split(BufferHead *orig, block_t after);
-};
-
-
-class C_OC_RxFinish : public BlockDevice::callback {
- Mutex &lock;
- ObjectCache *oc;
- block_t start, length;
- block_t diskstart;
-public:
- bufferlist bl;
- C_OC_RxFinish(Mutex &m, ObjectCache *o, block_t s, block_t l, block_t ds) :
- lock(m), oc(o), start(s), length(l), diskstart(ds) {}
- void finish(ioh_t ioh, int r) {
- oc->bc->rx_finish(oc, ioh, start, length, diskstart, bl);
- }
-};
-
-class C_OC_TxFinish : public BlockDevice::callback {
- Mutex &lock;
- ObjectCache *oc;
- block_t start, length;
- version_t version;
- version_t epoch;
- public:
- C_OC_TxFinish(Mutex &m, ObjectCache *o, block_t s, block_t l, version_t v, version_t e) :
- lock(m), oc(o), start(s), length(l), version(v), epoch(e) {}
- void finish(ioh_t ioh, int r) {
- oc->bc->tx_finish(oc, ioh, start, length, version, epoch);
- }
-};
-
-
-#endif
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-
-#ifndef CEPH_EBOFS_CNODE_H
-#define CEPH_EBOFS_CNODE_H
-
-#include "Onode.h"
-
-/*
- * collection node
- *
- * holds attribute metadata for collections.
- * collection membership is stored in b+tree tables, independent of the cnode.
- */
-
-class Cnode : public LRUObject
-{
- private:
- int ref;
- bool dirty;
-
- public:
- coll_t coll_id;
- extent_t cnode_loc;
- epoch_t last_alloc_epoch;
-
- map<string,bufferptr> attr;
-
- public:
- Cnode(coll_t cid) : ref(0), dirty(false), coll_id(cid), last_alloc_epoch(0) {
- cnode_loc.length = 0;
- }
- ~Cnode() {
- }
-
- block_t get_cnode_id() { return cnode_loc.start; }
- int get_cnode_len() { return cnode_loc.length; }
-
- void get() {
- if (ref == 0) lru_pin();
- ref++;
- }
- void put() {
- ref--;
- if (ref == 0) lru_unpin();
- }
- int get_ref_count() { return ref; }
-
- void mark_dirty() {
- if (!dirty) {
- dirty = true;
- get();
- }
- }
- void mark_clean() {
- if (dirty) {
- dirty = false;
- put();
- }
- }
- bool is_dirty() { return dirty; }
-
-
- int get_attr_bytes() {
- int s = 0;
- for (map<string, bufferptr>::iterator i = attr.begin();
- i != attr.end();
- i++) {
- s += i->first.length() + 1;
- s += i->second.length() + sizeof(int);
- }
- return s;
- }
-
- //
- //???void clear();
-
-
-};
-
-inline ostream& operator<<(ostream& out, Cnode& cn)
-{
- out << "cnode(" << hex << cn.coll_id << dec;
- if (cn.is_dirty()) out << " dirty";
- //out << " " << &cn;
- out << ")";
- return out;
-}
-
-#endif
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-
-
-#include "Ebofs.h"
-
-#include "os/FileJournal.h"
-
-#include <errno.h>
-
-#ifndef DARWIN
-#include <sys/vfs.h>
-#else
-#include <sys/param.h>
-#include <sys/mount.h>
-#endif // DARWIN
-
-// *******************
-
-#define DOUT_SUBSYS ebofs
-#undef dout_prefix
-#define dout_prefix *_dout << "ebofs(" << dev.get_device_name() << ")."
-
-
-char *nice_blocks(block_t b)
-{
- static char s[20];
- float sz = b*4.0;
- if (sz > (10 << 20))
- snprintf(s, sizeof(s), "%.1f GB", sz / (1024.0*1024.0));
- else if (sz > (10 << 10))
- snprintf(s, sizeof(s), "%.1f MB", sz / (1024.0));
- else
- snprintf(s, sizeof(s), "%llu KB", b*4ULL);
- return s;
-}
-
-int Ebofs::mount()
-{
- Mutex::Locker locker(ebofs_lock);
- assert(!mounted);
-
- // open dev
- int r = dev.open(&idle_kicker);
- if (r < 0) return r;
-
- dout(2) << "mounting " << dev.get_device_name() << " " << dev.get_num_blocks() << " blocks, " << nice_blocks(dev.get_num_blocks()) << dendl;
-
- // read super
- bufferptr bp1 = buffer::create_page_aligned(EBOFS_BLOCK_SIZE);
- bufferptr bp2 = buffer::create_page_aligned(EBOFS_BLOCK_SIZE);
- dev.read(0, 1, bp1);
- dev.read(1, 1, bp2);
-
- struct ebofs_super *sb1 = (struct ebofs_super*)bp1.c_str();
- struct ebofs_super *sb2 = (struct ebofs_super*)bp2.c_str();
-
- // valid superblocks?
- if (!sb1->is_valid_magic() && !sb2->is_valid_magic()) {
- derr(0) << "mount bad magic, not a valid EBOFS file system" << dendl;
- return -EINVAL;
- }
- if (sb1->is_corrupt() && sb2->is_corrupt()) {
- derr(0) << "mount both superblocks are corrupt (bad csum)" << dendl;
- return -EINVAL;
- }
- if ((sb1->is_valid() && sb1->num_blocks > dev.get_num_blocks()) ||
- (sb2->is_valid() && sb2->num_blocks > dev.get_num_blocks())) {
- derr(0) << "mount superblock size exceeds actual device size" << dendl;
- return -EINVAL;
- }
-
- dout(3) << "mount super @0 epoch " << sb1->epoch << dendl;
- dout(3) << "mount super @1 epoch " << sb2->epoch << dendl;
-
- // pick newest super
- struct ebofs_super *sb = 0;
- if (sb1->epoch > sb2->epoch)
- sb = sb1;
- else
- sb = sb2;
- super_epoch = sb->epoch;
- op_seq = sb->op_seq;
- dout(3) << "mount epoch " << super_epoch << " op_seq " << op_seq << dendl;
-
- super_fsid = sb->fsid;
-
- free_blocks = sb->free_blocks;
- limbo_blocks = sb->limbo_blocks;
-
- // init node pools
- dout(3) << "mount nodepool" << dendl;
- nodepool.init( &sb->nodepool );
- nodepool.read_usemap_and_clean_nodes( dev, super_epoch );
-
- // open tables
- dout(3) << "mount opening tables" << dendl;
- object_tab = new Table<pobject_t, ebofs_inode_ptr>( nodepool, sb->object_tab );
- for (int i=0; i<EBOFS_NUM_FREE_BUCKETS; i++)
- free_tab[i] = new Table<block_t, block_t>( nodepool, sb->free_tab[i] );
- limbo_tab = new Table<block_t, block_t>( nodepool, sb->limbo_tab );
- alloc_tab = new Table<block_t, pair<block_t,int> >( nodepool, sb->alloc_tab );
-
- collection_tab = new Table<coll_t,ebofs_inode_ptr>( nodepool, sb->collection_tab );
- co_tab = new Table<coll_pobject_t,bool>( nodepool, sb->co_tab );
-
- verify_tables();
-
- allocator.release_limbo();
-
- // open journal
- if (journalfn) {
- journal = new FileJournal(sb->fsid, &finisher, NULL, journalfn, g_conf.journal_dio);
- int err = journal->open(op_seq+1);
- if (err < 0) {
- dout(3) << "mount journal " << journalfn << " open failed" << dendl;
- delete journal;
- journal = 0;
- if (err == -EINVAL) {
- dout(0) << "mount journal appears corrupt/invalid, stopping" << dendl;
- dev.close();
- return -1;
- }
- } else {
- // replay journal
- dout(3) << "mount journal " << journalfn << " opened, replaying" << dendl;
-
- while (1) {
- bufferlist bl;
- uint64_t seq;
- if (!journal->read_entry(bl, seq)) {
- dout(3) << "mount replay: end of journal, done." << dendl;
- break;
- }
-
- if (seq <= op_seq) {
- dout(3) << "mount replay: skipping old op seq " << seq << " <= " << op_seq << dendl;
- continue;
- }
- op_seq++;
- assert(seq == op_seq);
-
- dout(3) << "mount replay: applying op seq " << seq << dendl;
- Transaction t(bl);
- _apply_transaction(t);
- }
-
- // done reading, make writeable.
- journal->make_writeable();
- }
- }
-
- dout(3) << "mount starting commit+finisher threads" << dendl;
- commit_thread.create();
- finisher.start();
-
- dout(1) << "mounted " << dev.get_device_name() << " " << dev.get_num_blocks() << " blocks, " << nice_blocks(dev.get_num_blocks())
- << (journal ? ", with journal":", no journal")
- << dendl;
- mounted = true;
-
- return 0;
-}
-
-
-int Ebofs::mkfs()
-{
- Mutex::Locker locker(ebofs_lock);
- assert(!mounted);
-
- int r = dev.open();
- if (r < 0)
- return r;
-
- block_t num_blocks = dev.get_num_blocks();
-
- // make a super-random fsid
- srand48(time(0) ^ getpid());
- super_fsid = ((uint64_t)lrand48() << 32) ^ mrand48();
- srand(time(0) ^ getpid());
- super_fsid ^= rand();
- super_fsid ^= (uint64_t)rand() << 32;
-
- free_blocks = 0;
- limbo_blocks = 0;
-
- // create first noderegion
- extent_t nr;
- nr.start = 2;
- nr.length = 20+ (num_blocks / 1000);
- if (nr.length < 10) nr.length = 10;
- nodepool.add_region(nr);
- dout(10) << "mkfs: first node region at " << nr << dendl;
-
- // allocate two usemaps
- block_t usemap_len = nodepool.get_usemap_len();
- nodepool.usemap_even.start = nr.end();
- nodepool.usemap_even.length = usemap_len;
- nodepool.usemap_odd.start = nodepool.usemap_even.end();
- nodepool.usemap_odd.length = usemap_len;
- dout(10) << "mkfs: even usemap at " << nodepool.usemap_even << dendl;
- dout(10) << "mkfs: odd usemap at " << nodepool.usemap_odd << dendl;
- nodepool.init_usemap();
-
- // init tables
- struct ebofs_table empty;
- empty.num_keys = 0;
- empty.root.nodeid = -1;
- empty.root.csum = 0;
- empty.depth = 0;
-
- object_tab = new Table<pobject_t, ebofs_inode_ptr>( nodepool, empty );
- collection_tab = new Table<coll_t, ebofs_inode_ptr>( nodepool, empty );
-
- for (int i=0; i<EBOFS_NUM_FREE_BUCKETS; i++)
- free_tab[i] = new Table<block_t,block_t>( nodepool, empty );
- limbo_tab = new Table<block_t,block_t>( nodepool, empty );
- alloc_tab = new Table<block_t,pair<block_t,int> >( nodepool, empty );
-
- co_tab = new Table<coll_pobject_t, bool>( nodepool, empty );
-
- // add free space
- extent_t left;
- left.start = nodepool.usemap_odd.end();
- left.length = num_blocks - left.start;
- dout(10) << "mkfs: free data blocks at " << left << dendl;
- allocator._release_into_limbo( left );
- if (g_conf.ebofs_cloneable) {
- allocator.alloc_inc(nr);
- allocator.alloc_inc(nodepool.usemap_even);
- allocator.alloc_inc(nodepool.usemap_odd);
- }
- allocator.commit_limbo(); // -> limbo_tab
- allocator.release_limbo(); // -> free_tab
-
- // write nodes, super, 2x
- dout(10) << "mkfs: flushing nodepool and superblocks (2x)" << dendl;
-
- for (epoch_t e=0; e<2; e++) {
- nodepool.commit_start(dev, e);
- nodepool.commit_wait();
- bufferptr superbp;
- prepare_super(e, superbp);
- write_super(e, superbp);
- }
-
- // free memory
- dout(10) << "mkfs: cleaning up" << dendl;
- close_tables();
-
- dev.close();
-
-
- // create journal?
- if (journalfn) {
- Journal *journal = new FileJournal(super_fsid, &finisher, NULL, journalfn, g_conf.journal_dio);
- if (journal->create() < 0) {
- dout(3) << "mount journal " << journalfn << " created failed" << dendl;
- } else {
- dout(3) << "mount journal " << journalfn << " created" << dendl;
- }
- delete journal;
- journal = 0;
- }
-
- dout(2) << "mkfs: " << dev.get_device_name() << " " << dev.get_num_blocks() << " blocks, " << nice_blocks(dev.get_num_blocks()) << dendl;
- return 0;
-}
-
-void Ebofs::close_tables()
-{
- // close tables
- delete object_tab;
- for (int i=0; i<EBOFS_NUM_FREE_BUCKETS; i++)
- delete free_tab[i];
- delete limbo_tab;
- delete alloc_tab;
- delete collection_tab;
- delete co_tab;
-
- nodepool.close();
-}
-
-void Ebofs::verify_tables()
-{
- bool o = g_conf.ebofs_verify;
- g_conf.ebofs_verify = true;
-
- object_tab->verify("onmount");
- limbo_tab->verify("onmount");
- alloc_tab->verify("onmount");
- collection_tab->verify("onmount");
- co_tab->verify("onmount");
- for (int i=0; i<EBOFS_NUM_FREE_BUCKETS; i++)
- free_tab[i]->verify("onmount");
-
- g_conf.ebofs_verify = o;
-}
-
-int Ebofs::umount()
-{
- ebofs_lock.Lock();
-
- // mark unmounting
- dout(2) << "umount start" << dendl;
- readonly = true;
- unmounting = true;
-
- // kick commit thread
- dout(5) << "umount stopping commit thread" << dendl;
- commit_cond.Signal();
- ebofs_lock.Unlock();
- commit_thread.join();
- ebofs_lock.Lock();
-
- // kick finisher thread
- dout(5) << "umount stopping finisher thread" << dendl;
- finisher.stop();
-
- // close journal
- if (journal) {
- journal->close();
- delete journal;
- journal = 0;
- }
-
- trim_bc(0);
- trim_inodes(0);
-
- for (hash_map<pobject_t,Onode*>::iterator i = onode_map.begin();
- i != onode_map.end();
- i++) {
- dout(0) << "umount *** leftover: " << i->first << " " << *(i->second) << dendl;
- }
-
- // free memory
- dout(5) << "umount cleaning up" << dendl;
- close_tables();
- dev.close();
- readonly = unmounting = mounted = false;
-
- dout(2) << "umount done on " << dev.get_device_name() << dendl;
- ebofs_lock.Unlock();
- return 0;
-}
-
-
-
-void Ebofs::prepare_super(version_t epoch, bufferptr& bp)
-{
- bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE);
- bp.zero();
-
- struct ebofs_super *sb = (ebofs_super*)bp.c_str(); // this way it's aligned.
-
- dout(10) << "prepare_super v" << epoch << dendl;
-
- // fill in super
- sb->s_magic = EBOFS_MAGIC;
- sb->fsid = super_fsid;
- sb->epoch = epoch;
- sb->op_seq = op_seq;
- sb->num_blocks = dev.get_num_blocks();
-
- sb->free_blocks = free_blocks;
- sb->limbo_blocks = limbo_blocks;
-
- // tables
- sb->object_tab.num_keys = object_tab->get_num_keys();
- sb->object_tab.root = object_tab->get_root();
- sb->object_tab.depth = object_tab->get_depth();
-
- for (int i=0; i<EBOFS_NUM_FREE_BUCKETS; i++) {
- sb->free_tab[i].num_keys = free_tab[i]->get_num_keys();
- sb->free_tab[i].root = free_tab[i]->get_root();
- sb->free_tab[i].depth = free_tab[i]->get_depth();
- }
- sb->limbo_tab.num_keys = limbo_tab->get_num_keys();
- sb->limbo_tab.root = limbo_tab->get_root();
- sb->limbo_tab.depth = limbo_tab->get_depth();
-
- sb->alloc_tab.num_keys = alloc_tab->get_num_keys();
- sb->alloc_tab.root = alloc_tab->get_root();
- sb->alloc_tab.depth = alloc_tab->get_depth();
-
- sb->collection_tab.num_keys = collection_tab->get_num_keys();
- sb->collection_tab.root = collection_tab->get_root();
- sb->collection_tab.depth = collection_tab->get_depth();
-
- sb->co_tab.num_keys = co_tab->get_num_keys();
- sb->co_tab.root = co_tab->get_root();
- sb->co_tab.depth = co_tab->get_depth();
-
- // pools
- sb->nodepool.num_regions = nodepool.region_loc.size();
- for (unsigned i=0; i<nodepool.region_loc.size(); i++) {
- sb->nodepool.region_loc[i] = nodepool.region_loc[i];
- }
- sb->nodepool.node_usemap_even = nodepool.usemap_even;
- sb->nodepool.node_usemap_odd = nodepool.usemap_odd;
-
- // csum
- sb->super_csum = sb->calc_csum();
- dout(20) << "super csum is " << sb->super_csum << " " << sb->calc_csum() << dendl;
- assert(!sb->is_corrupt());
-}
-
-void Ebofs::write_super(version_t epoch, bufferptr& bp)
-{
- block_t bno = epoch & 1;
-
- dout(10) << "write_super v" << epoch << " to b" << bno << dendl;
-
- dev.write(bno, 1, bp, "write_super");
-}
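
For reference, a minimal standalone sketch of the alternating-superblock scheme write_super relies on: the committing epoch's low bit picks one of two superblock slots, so a torn write never clobbers the last good super. The slot numbers and 4 KB block size are illustrative, not taken from the removed code.

#include <cstdint>
#include <iostream>

int main() {
  // Two superblock slots at the front of the device; epoch parity selects
  // which one is overwritten, so the previous epoch's super always survives
  // a crash mid-write.
  for (uint64_t epoch = 1; epoch <= 4; epoch++) {
    uint64_t slot = epoch & 1;
    std::cout << "epoch " << epoch << " -> superblock slot " << slot << '\n';
  }
  return 0;
}
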
-
-int Ebofs::commit_thread_entry()
-{
- ebofs_lock.Lock();
- dout(10) << "commit_thread start" << dendl;
-
- assert(!commit_thread_started); // there can be only one
- commit_thread_started = true;
- sync_cond.Signal();
-
- while (mounted) {
-
- // wait for kick, or timeout
- if (g_conf.ebofs_commit_ms) {
- // normal wait+timeout
- dout(20) << "commit_thread sleeping (up to) " << g_conf.ebofs_commit_ms << " ms" << dendl;
- commit_cond.WaitInterval(ebofs_lock, utime_t(0, g_conf.ebofs_commit_ms*1000));
- } else {
- // DEBUG.. wait until kicked
- dout(10) << "commit_thread no commit_ms, waiting until kicked" << dendl;
- commit_cond.Wait(ebofs_lock);
- }
-
- if (unmounting) {
- dout(10) << "commit_thread unmounting: final commit pass" << dendl;
- assert(readonly);
- unmounting = false;
- mounted = false;
- dirty = true;
- }
-
- if (!dirty && !limbo_blocks) {
- dout(10) << "commit_thread not dirty - kicking waiters" << dendl;
- finisher.queue(commit_waiters[super_epoch]);
- }
- else {
- // --- wait for partials to finish ---
- commit_starting = true;
- if (bc.get_num_partials() > 0) {
- dout(10) << "commit_thread waiting for " << bc.get_num_partials() << " partials to complete" << dendl;
- dev.barrier();
- bc.waitfor_partials();
- dout(10) << "commit_thread partials completed" << dendl;
- }
- commit_starting = false;
-
- // --- get ready for a new epoch ---
- uint64_t last_op = op_seq;
- super_epoch++;
- dirty = false;
-
- derr(10) << "commit_thread commit start, new epoch " << super_epoch << " last_op " << last_op << dendl;
- dout(10) << "commit_thread commit start, new epoch " << super_epoch << " last_op " << last_op << dendl;
- dout(2) << "commit_thread data: "
- << 100*(dev.get_num_blocks()-get_free_blocks())/dev.get_num_blocks() << "% used, "
- << get_free_blocks() << " (" << 100*get_free_blocks()/dev.get_num_blocks()
- << "%) free in " << get_free_extents()
- << ", " << get_limbo_blocks() << " (" << 100*get_limbo_blocks()/dev.get_num_blocks()
- << "%) limbo in " << get_limbo_extents()
- << dendl;
- dout(2) << "commit_thread nodes: "
- << 100*nodepool.get_num_used()/nodepool.get_num_total() << "% used, "
- << nodepool.get_num_free() << " (" << 100*nodepool.get_num_free()/nodepool.get_num_total() << "%) free, "
- << nodepool.get_num_limbo() << " (" << 100*nodepool.get_num_limbo()/nodepool.get_num_total() << "%) limbo, "
- << nodepool.get_num_total() << " total." << dendl;
- dout(2) << "commit_thread bc: "
- << "size " << bc.get_size()
- << ", trimmable " << bc.get_trimmable()
- << ", max " << g_conf.ebofs_bc_size
- << "; dirty " << bc.get_stat_dirty()
- << ", tx " << bc.get_stat_tx()
- << ", max dirty " << g_conf.ebofs_bc_max_dirty
- << dendl;
-
- bufferptr superbp;
- int attempt = 1;
- while (1) {
- // --- queue up commit writes ---
- bc.poison_commit = false;
- commit_inodes_start(); // do this first; it currently involves inode reallocation
- allocator.commit_limbo(); // limbo -> limbo_tab
- nodepool.commit_start(dev, super_epoch);
- prepare_super(super_epoch, superbp); // prepare super (before any new changes get made!)
-
- // --- now (try to) flush everything ---
- // (partial writes may fail if read block has a bad csum)
-
- // blockdev barrier (prioritize our writes!)
- dout(30) << "commit_thread barrier. flushing inodes " << inodes_flushing << dendl;
- dev.barrier();
-
- // wait for it all to flush (drops global lock)
- commit_bc_wait(super_epoch-1);
- dout(30) << "commit_thread bc flushed" << dendl;
- commit_inodes_wait();
- dout(30) << "commit_thread inodes flushed" << dendl;
- nodepool.commit_wait();
- dout(30) << "commit_thread btree nodes flushed" << dendl;
-
- if (!bc.poison_commit)
- break; // ok!
-
- ++attempt;
- dout(1) << "commit_thread commit poisoned, retrying, attempt " << attempt << dendl;
- /* actually, poisoning isn't needed after all.
- * it's probably a bad idea, but i'll leave it in anyway,
- * in case it becomes useful later. for now, it should never trigger.
- */
- assert(0); // NO!
- }
-
- // ok, now (synchronously) write the prior super!
- dout(10) << "commit_thread commit flushed, writing super for prior epoch" << dendl;
- ebofs_lock.Unlock();
- write_super(super_epoch, superbp);
- ebofs_lock.Lock();
-
- dout(10) << "commit_thread wrote super" << dendl;
-
- // free limbo space now
- // (since we're done allocating things,
- // AND we've flushed all previous epoch data)
- allocator.release_limbo(); // limbo_tab -> free_tabs
- nodepool.commit_finish();
-
- // do we need more node space?
- if (nodepool.get_num_free() < nodepool.get_num_total() / 3) {
- dout(2) << "commit_thread running low on node space, allocating more." << dendl;
- alloc_more_node_space();
- }
-
- // trim journal
- if (journal) journal->committed_thru(last_op);
-
- // kick waiters
- dout(10) << "commit_thread queueing commit + kicking sync waiters" << dendl;
- finisher.queue(commit_waiters[super_epoch-1]);
- commit_waiters.erase(super_epoch-1);
- sync_cond.Signal();
-
- dout(10) << "commit_thread commit finish" << dendl;
- }
-
- // trim bc?
- trim_bc();
- trim_inodes();
-
- }
-
- dout(10) << "commit_thread finish" << dendl;
- commit_thread_started = false;
- ebofs_lock.Unlock();
- return 0;
-}
-
-
-void Ebofs::alloc_more_node_space()
-{
- dout(1) << "alloc_more_node_space free " << nodepool.get_num_free() << "/" << nodepool.get_num_total() << dendl;
-
- if (nodepool.num_regions() < EBOFS_MAX_NODE_REGIONS) {
- int want = nodepool.get_num_total();
-
- extent_t ex;
- allocator.allocate(ex, want, 2);
- dout(1) << "alloc_more_node_space wants " << want << " more, got " << ex << dendl;
-
- extent_t even, odd;
- unsigned ulen = nodepool.get_usemap_len(nodepool.get_num_total() + ex.length);
- allocator.allocate(even, ulen, 2);
- allocator.allocate(odd, ulen, 2);
- dout(1) << "alloc_more_node_space maps need " << ulen << " x2, got " << even << " " << odd << dendl;
-
- if (even.length == ulen && odd.length == ulen) {
- dout(1) << "alloc_more_node_space got " << ex << ", new usemaps at even " << even << " odd " << odd << dendl;
- allocator.release(nodepool.usemap_even);
- allocator.release(nodepool.usemap_odd);
- nodepool.add_region(ex);
-
- // expand usemap?
- nodepool.usemap_even = even;
- nodepool.usemap_odd = odd;
- nodepool.expand_usemap();
- } else {
- dout (1) << "alloc_more_node_space failed to get space for new usemaps" << dendl;
- allocator.release(ex);
- allocator.release(even);
- allocator.release(odd);
- //assert(0);
- }
- } else {
- dout(1) << "alloc_more_node_space already have max node regions!" << dendl;
- assert(0);
- }
-}
-
-
-
-// *** onodes ***
-
-Onode* Ebofs::new_onode(pobject_t oid)
-{
- Onode* on = new Onode(oid);
-
- assert(onode_map.count(oid) == 0);
- onode_map[oid] = on;
- onode_lru.lru_insert_top(on);
-
- on->get();
- on->onode_loc.start = 0;
- on->onode_loc.length = 0;
-
- assert(object_tab->lookup(oid) < 0);
- ebofs_inode_ptr ptr(on->onode_loc, 0);
- object_tab->insert(oid, ptr); // even tho i'm not placed yet
-
- dirty_onode(on);
-
- dout(7) << "new_onode " << *on << dendl;
- return on;
-}
-
-Onode* Ebofs::decode_onode(bufferlist& bl, unsigned& off, csum_t csum)
-{
- // verify csum
- struct ebofs_onode *eo = (struct ebofs_onode*)(bl.c_str() + off);
- if (eo->onode_bytes > bl.length() - off) {
- derr(0) << "obviously corrupt onode (bad onode_bytes)" << dendl;
- return 0;
- }
- csum_t actual = calc_csum_unaligned(bl.c_str() + off + sizeof(csum_t),
- eo->onode_bytes - sizeof(csum_t));
- if (actual != eo->onode_csum) {
- derr(0) << "corrupt onode (bad csum actual " << actual << " != onode's " << eo->onode_csum << ")" << dendl;
- return 0;
- }
- if (actual != csum) {
- derr(0) << "corrupt onode (bad csum actual " << actual << " != expected " << csum << ")" << dendl;
- return 0;
- }
-
- // build onode
- Onode *on = new Onode(eo->object_id);
- on->readonly = eo->readonly;
- on->onode_loc = eo->onode_loc;
- on->object_size = eo->object_size;
- on->alloc_blocks = eo->alloc_blocks;
- on->data_csum = eo->data_csum;
-
- // parse
- char *p = (char*)(eo + 1);
-
- // parse collection list
- for (int i=0; i<eo->num_collections; i++) {
- coll_t c = *((coll_t*)p);
- p += sizeof(c);
- on->collections.insert(c);
- }
-
- // parse attributes
- for (unsigned i=0; i<eo->num_attr; i++) {
- string key = p;
- p += key.length() + 1;
- int len = *(int*)(p);
- p += sizeof(len);
- on->attr[key] = buffer::copy(p, len);
- p += len;
- dout(15) << "decode_onode " << *on << " attr " << key << " len " << len << dendl;
- }
-
- // parse extents
- on->extent_map.clear();
- block_t n = 0;
- for (unsigned i=0; i<eo->num_extents; i++) {
- extent_t ex = *((extent_t*)p);
- p += sizeof(extent_t);
- on->extent_map[n].ex = ex;
- if (ex.start) {
- on->extent_map[n].csum.resize(ex.length);
- memcpy(&on->extent_map[n].csum[0], p, sizeof(csum_t)*ex.length);
- p += sizeof(csum_t)*ex.length;
- }
- dout(15) << "decode_onode " << *on << " ex " << i << ": " << ex << dendl;
- n += ex.length;
- }
- on->last_block = n;
-
- // parse bad byte extents
- for (unsigned i=0; i<eo->num_bad_byte_extents; i++) {
- extent_t ex = *((extent_t*)p);
- p += sizeof(ex);
- on->bad_byte_extents.insert(ex.start, ex.length);
- dout(15) << "decode_onode " << *on << " bad byte ex " << ex << dendl;
- }
-
- unsigned len = p - (char*)eo;
- assert(len == eo->onode_bytes);
- return on;
-}
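
A minimal sketch of the double checksum check that decode_onode (and decode_cnode below) performs: the bytes after the leading csum field must match both the checksum stored in the record itself and the checksum carried in the table pointer. The additive byte-sum here is an assumption standing in for the real calc_csum_unaligned.

#include <cstdint>
#include <cstring>
#include <cstddef>

typedef uint64_t csum_t;

// Illustrative stand-in for calc_csum_unaligned (a simple byte sum).
static csum_t toy_csum(const char* p, size_t len) {
  csum_t c = 0;
  for (size_t i = 0; i < len; i++)
    c += (unsigned char)p[i];
  return c;
}

// Returns true only if the record's own csum and the csum carried by the
// table pointer both match the bytes that follow the leading csum field.
bool verify_record(const char* rec, size_t rec_bytes, csum_t csum_from_table) {
  csum_t stored;
  memcpy(&stored, rec, sizeof(stored));
  csum_t actual = toy_csum(rec + sizeof(csum_t), rec_bytes - sizeof(csum_t));
  return actual == stored && actual == csum_from_table;
}
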
-
-Onode* Ebofs::get_onode(pobject_t oid)
-{
- while (1) {
- // in cache?
- if (have_onode(oid)) {
- // yay
- Onode *on = onode_map[oid];
- on->get();
- //dout(0) << "get_onode " << *on << dendl;
- return on;
- }
-
- // on disk?
- ebofs_inode_ptr ptr;
- if (object_tab->lookup(oid, ptr) < 0) {
- dout(10) << "onode lookup failed on " << oid << dendl;
- // object dne.
- return 0;
- }
-
- // already loading?
- if (waitfor_onode.count(oid)) {
- // yep, just wait.
- Cond c;
- waitfor_onode[oid].push_back(&c);
- dout(10) << "get_onode " << oid << " already loading, waiting" << dendl;
- c.Wait(ebofs_lock);
- continue;
- }
-
- dout(10) << "get_onode reading " << oid << " from " << ptr.loc << dendl;
-
- assert(waitfor_onode.count(oid) == 0);
- waitfor_onode[oid].clear(); // this should be empty initially.
-
- // read it!
- bufferlist bl;
- bl.push_back( buffer::create_page_aligned( EBOFS_BLOCK_SIZE*ptr.loc.length ) );
-
- ebofs_lock.Unlock();
- dev.read( ptr.loc.start, ptr.loc.length, bl );
- ebofs_lock.Lock();
-
- unsigned off = 0;
- Onode *on = decode_onode(bl, off, ptr.csum);
- if (!on) {
- assert(0); // corrupt!
- }
- assert(on->object_id == oid);
- onode_map[oid] = on;
- onode_lru.lru_insert_top(on);
-
- // wake up other waiters
- for (list<Cond*>::iterator i = waitfor_onode[oid].begin();
- i != waitfor_onode[oid].end();
- i++)
- (*i)->Signal();
- waitfor_onode.erase(oid); // remove Cond list
-
- on->get();
- //dout(0) << "get_onode " << *on << " (loaded)" << dendl;
- return on;
- }
-}
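
The cache-miss path above is a load-once pattern: the first caller records itself in waitfor_onode, drops the global lock for the disk read, and signals everyone else once the onode is in the map. A standalone sketch of the same pattern with std:: primitives instead of the Ceph Mutex/Cond wrappers (all names here are invented):

#include <condition_variable>
#include <map>
#include <memory>
#include <mutex>
#include <string>

struct LoadOnceCache {
  std::mutex lock;
  std::map<std::string, std::shared_ptr<std::string>> loaded;
  std::map<std::string, std::condition_variable> loading;   // key -> waiters

  std::shared_ptr<std::string> get(const std::string& key) {
    std::unique_lock<std::mutex> l(lock);
    while (true) {
      auto hit = loaded.find(key);
      if (hit != loaded.end())
        return hit->second;                        // cached: done
      auto in_flight = loading.find(key);
      if (in_flight != loading.end()) {
        in_flight->second.wait(l);                 // someone else is reading it in
        continue;                                  // re-check the cache
      }
      std::condition_variable& cv = loading[key];  // we become the loader
      l.unlock();                                  // drop the lock for "disk" I/O
      auto value = std::make_shared<std::string>("...bytes read from disk...");
      l.lock();
      loaded[key] = value;
      cv.notify_all();                             // wake the other waiters
      loading.erase(key);
      return value;
    }
  }
};
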
-
-
-class C_E_InodeFlush : public BlockDevice::callback {
- Ebofs *ebofs;
-public:
- C_E_InodeFlush(Ebofs *e) : ebofs(e) {}
- void finish(ioh_t ioh, int r) {
- ebofs->flush_inode_finish();
- }
-};
-
-
-csum_t Ebofs::encode_onode(Onode *on, bufferlist& bl, unsigned& off)
-{
- unsigned start_off = off;
-
- // onode
- struct ebofs_onode eo;
- eo.readonly = on->readonly;
- eo.onode_loc = on->onode_loc;
- eo.object_id = on->object_id;
- eo.object_size = on->object_size;
- eo.alloc_blocks = on->alloc_blocks;
- eo.data_csum = on->data_csum;
- eo.inline_bytes = 0; /* write me */
- eo.num_collections = on->collections.size();
- eo.num_attr = on->attr.size();
- eo.num_extents = on->extent_map.size();
- eo.num_bad_byte_extents = on->bad_byte_extents.m.size();
- bl.copy_in(off, sizeof(eo), (char*)&eo);
- off += sizeof(eo);
-
- // collections
- for (set<coll_t>::iterator i = on->collections.begin();
- i != on->collections.end();
- i++) {
- bl.copy_in(off, sizeof(*i), (char*)&(*i));
- off += sizeof(*i);
- }
-
- // attr
- for (map<string, bufferptr>::iterator i = on->attr.begin();
- i != on->attr.end();
- i++) {
- bl.copy_in(off, i->first.length()+1, i->first.c_str());
- off += i->first.length()+1;
- int l = i->second.length();
- bl.copy_in(off, sizeof(int), (char*)&l);
- off += sizeof(int);
- if (l)
- bl.copy_in(off, l, i->second.c_str());
- off += l;
- dout(15) << "encode_onode " << *on << " attr " << i->first << " len " << l << dendl;
- }
-
- // extents
- for (map<block_t,ExtentCsum>::iterator i = on->extent_map.begin();
- i != on->extent_map.end();
- i++) {
- ExtentCsum &o = i->second;
- bl.copy_in(off, sizeof(extent_t), (char*)&(o.ex));
- off += sizeof(extent_t);
- if (o.ex.start) {
- bl.copy_in(off, sizeof(csum_t)*o.ex.length, (char*)&o.csum[0]);
- off += sizeof(csum_t)*o.ex.length;
- }
- dout(15) << "encode_onode " << *on << " ex " << i->first << ": " << o.ex << dendl;
- }
-
- // bad byte extents
- for (map<uint64_t,uint64_t>::iterator p = on->bad_byte_extents.m.begin();
- p != on->bad_byte_extents.m.end();
- p++) {
- extent_t o = {p->first, p->second};
- bl.copy_in(off, sizeof(o), (char*)&o);
- off += sizeof(o);
- dout(15) << "encode_onode " << *on << " bad byte ex " << o << dendl;
- }
-
- eo.onode_bytes = off - start_off;
- bl.copy_in(start_off + sizeof(csum_t), sizeof(__u32), (char*)&eo.onode_bytes);
- eo.onode_csum = calc_csum_unaligned(bl.c_str() + start_off + sizeof(csum_t),
- eo.onode_bytes - sizeof(csum_t));
- bl.copy_in(start_off, sizeof(csum_t), (char*)&eo);
- dout(15) << "encode_onode len " << eo.onode_bytes << " csum " << eo.onode_csum << dendl;
-
- return eo.onode_csum;
-}
-
-void Ebofs::write_onode(Onode *on)
-{
- // buffer
- unsigned bytes = on->get_ondisk_bytes();
- unsigned blocks = DIV_ROUND_UP(bytes, EBOFS_BLOCK_SIZE);
-
- bufferlist bl;
- bl.push_back(buffer::create_page_aligned(EBOFS_BLOCK_SIZE*blocks));
-
- // relocate onode
- if (on->onode_loc.length)
- allocator.release(on->onode_loc);
- block_t first = 0;
- if (on->alloc_blocks)
- first = on->get_first_block();
- allocator.allocate(on->onode_loc, blocks, first);
-
- dout(10) << "write_onode " << *on << " to " << on->onode_loc << dendl;
-
- // encode
- unsigned off = 0;
- csum_t csum = encode_onode(on, bl, off);
- assert(off == bytes);
- if (off < bl.length())
- bl.zero(off, bl.length()-off);
-
- // update pointer
- object_tab->remove(on->object_id);
- ebofs_inode_ptr ptr(on->onode_loc, csum);
- object_tab->insert(on->object_id, ptr);
- //object_tab->verify();
-
- // write
- dev.write( on->onode_loc.start, on->onode_loc.length, bl,
- new C_E_InodeFlush(this), "write_onode" );
-}
-
-void Ebofs::remove_onode(Onode *on)
-{
- dout(8) << "remove_onode " << *on << dendl;
-
- assert(on->get_ref_count() >= 1); // caller
-
- // tear down buffer cache
- if (on->oc) {
- on->oc->truncate(0, super_epoch); // this will kick readers along the way.
- on->close_oc();
- }
-
- // remove from onode map, mark dangling/deleted
- onode_map.erase(on->object_id);
- onode_lru.lru_remove(on);
- on->deleted = true;
- on->dangling = true;
-
- // remove from object table
- //dout(0) << "remove_onode on " << *on << dendl;
- object_tab->remove(on->object_id);
-
- // free onode space
- if (on->onode_loc.length)
- allocator.release(on->onode_loc);
-
- // free data space
- for (map<block_t,ExtentCsum>::iterator i = on->extent_map.begin();
- i != on->extent_map.end();
- i++)
- if (i->second.ex.start)
- allocator.release(i->second.ex);
- on->extent_map.clear();
-
- // remove from collections
- for (set<coll_t>::iterator i = on->collections.begin();
- i != on->collections.end();
- i++) {
- co_tab->remove(coll_pobject_t(*i,on->object_id));
- }
- on->collections.clear();
-
- // dirty -> clean?
- if (on->is_dirty()) {
- on->mark_clean(); // this unpins *on
- dirty_onodes.erase(on);
- }
-
- if (on->get_ref_count() > 1) dout(10) << "remove_onode **** will survive " << *on << dendl;
- put_onode(on);
-
- dirty = true;
-}
-
-void Ebofs::put_onode(Onode *on)
-{
- on->put();
- //dout(0) << "put_onode " << *on << dendl;
-
- if (on->get_ref_count() == 0 && on->dangling) {
- //dout(0) << " *** hosing on " << *on << dendl;
- delete on;
- }
-}
-
-void Ebofs::dirty_onode(Onode *on)
-{
- if (!on->is_dirty()) {
- dout(10) << "dirty_onode " << *on << dendl;
- on->mark_dirty();
- dirty_onodes.insert(on);
- } else {
- dout(10) << "dirty_onode " << *on << " (already dirty)" << dendl;
- }
- dirty = true;
-}
-
-void Ebofs::trim_inodes(int max)
-{
- unsigned omax = onode_lru.lru_get_max();
- unsigned cmax = cnode_lru.lru_get_max();
- if (max >= 0) omax = cmax = max;
- dout(10) << "trim_inodes start " << onode_lru.lru_get_size() << " / " << omax << " onodes, "
- << cnode_lru.lru_get_size() << " / " << cmax << " cnodes" << dendl;
-
- // onodes
- while (onode_lru.lru_get_size() > omax) {
- // expire an item
- Onode *on = (Onode*)onode_lru.lru_expire();
- if (on == 0) break; // nothing to expire
-
- // expire
- dout(20) << "trim_inodes removing onode " << *on << dendl;
- onode_map.erase(on->object_id);
- on->dangling = true;
-
- if (on->get_ref_count() == 0) {
- assert(on->oc == 0); // an open oc pins the onode!
- delete on;
- } else {
- dout(0) << "trim_inodes still active: " << *on << dendl;
- assert(0); // huh?
- }
- }
-
-
- // cnodes
- while (cnode_lru.lru_get_size() > cmax) {
- // expire an item
- Cnode *cn = (Cnode*)cnode_lru.lru_expire();
- if (cn == 0) break; // nothing to expire
-
- // expire
- dout(20) << "trim_inodes removing cnode " << *cn << dendl;
- cnode_map.erase(cn->coll_id);
-
- delete cn;
- }
-
- dout(10) << "trim_inodes finish "
- << onode_lru.lru_get_size() << " / " << omax << " onodes, "
- << cnode_lru.lru_get_size() << " / " << cmax << " cnodes" << dendl;
-}
-
-
-
-// *** cnodes ****
-
-Cnode* Ebofs::new_cnode(coll_t cid)
-{
- Cnode* cn = new Cnode(cid);
-
- assert(cnode_map.count(cid) == 0);
- cnode_map[cid] = cn;
- cnode_lru.lru_insert_top(cn);
-
- cn->get();
- cn->cnode_loc.start = 0;
- cn->cnode_loc.length = 0;
-
- assert(collection_tab->lookup(cid) < 0);
- ebofs_inode_ptr ptr(cn->cnode_loc, 0);
- collection_tab->insert(cid, ptr); // even tho i'm not placed yet
-
- dirty_cnode(cn);
-
- return cn;
-}
-
-Cnode* Ebofs::decode_cnode(bufferlist& bl, unsigned& off, csum_t csum)
-{
- // verify csum
- struct ebofs_cnode *ec = (struct ebofs_cnode*)(bl.c_str() + off);
- if (ec->cnode_bytes > bl.length() - off) {
- derr(0) << "obviously corrupt cnode (bad cnode_bytes)" << dendl;
- return 0;
- }
- csum_t actual = calc_csum_unaligned(bl.c_str() + off + sizeof(csum_t),
- ec->cnode_bytes - sizeof(csum_t));
- if (actual != ec->cnode_csum) {
- derr(0) << "corrupt cnode (bad csum actual " << actual << " != cnode's " << ec->cnode_csum << ")" << dendl;
- return 0;
- }
- if (actual != csum) {
- derr(0) << "corrupt cnode (bad csum actual " << actual << " != expected " << csum << ")" << dendl;
- return 0;
- }
-
- // build cnode
- Cnode *cn = new Cnode(ec->coll_id);
- cn->cnode_loc = ec->cnode_loc;
-
- // parse attributes
- char *p = (char*)(ec + 1);
- for (unsigned i=0; i<ec->num_attr; i++) {
- string key = p;
- p += key.length() + 1;
- int len = *(int*)(p);
- p += sizeof(len);
- cn->attr[key] = buffer::copy(p, len);
- p += len;
- dout(15) << "get_cnode " << *cn << " attr " << key << " len " << len << dendl;
- }
-
- unsigned len = p - (char*)ec;
- assert(len == ec->cnode_bytes);
- return cn;
-}
-
-Cnode* Ebofs::get_cnode(coll_t cid)
-{
- while (1) {
- // in cache?
- if (cnode_map.count(cid)) {
- // yay
- Cnode *cn = cnode_map[cid];
- cn->get();
- return cn;
- }
-
- // on disk?
- ebofs_inode_ptr ptr;
- if (collection_tab->lookup(cid, ptr) < 0) {
- // object dne.
- return 0;
- }
-
- // already loading?
- if (waitfor_cnode.count(cid)) {
- // yep, just wait.
- Cond c;
- waitfor_cnode[cid].push_back(&c);
- dout(10) << "get_cnode " << cid << " already loading, waiting" << dendl;
- c.Wait(ebofs_lock);
- continue;
- }
-
- dout(10) << "get_cnode reading " << cid << " from " << ptr.loc << dendl;
-
- assert(waitfor_cnode.count(cid) == 0);
- waitfor_cnode[cid].clear(); // this should be empty initially.
-
- // read it!
- bufferlist bl;
- //bufferpool.alloc( EBOFS_BLOCK_SIZE*cnode_loc.length, bl );
- bl.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*ptr.loc.length) );
-
- ebofs_lock.Unlock();
- dev.read( ptr.loc.start, ptr.loc.length, bl );
- ebofs_lock.Lock();
-
- unsigned off = 0;
- Cnode *cn = decode_cnode(bl, off, ptr.csum);
- if (!cn) {
- assert(0); // corrupt!
- }
- assert(cn->coll_id == cid);
- cnode_map[cid] = cn;
- cnode_lru.lru_insert_top(cn);
-
- // wake up other waiters
- for (list<Cond*>::iterator i = waitfor_cnode[cid].begin();
- i != waitfor_cnode[cid].end();
- i++)
- (*i)->Signal();
- waitfor_cnode.erase(cid); // remove Cond list
-
- cn->get();
- return cn;
- }
-}
-
-csum_t Ebofs::encode_cnode(Cnode *cn, bufferlist& bl, unsigned& off)
-{
- unsigned start_off = off;
-
- // cnode
- struct ebofs_cnode ec;
- ec.cnode_loc = cn->cnode_loc;
- ec.coll_id = cn->coll_id;
- ec.num_attr = cn->attr.size();
- bl.copy_in(off, sizeof(ec), (char*)&ec);
- off += sizeof(ec);
-
- // attr
- for (map<string, bufferptr>::iterator i = cn->attr.begin();
- i != cn->attr.end();
- i++) {
- bl.copy_in(off, i->first.length()+1, i->first.c_str());
- off += i->first.length()+1;
- int len = i->second.length();
- bl.copy_in(off, sizeof(int), (char*)&len);
- off += sizeof(int);
- bl.copy_in(off, len, i->second.c_str());
- off += len;
-
- dout(15) << "encode_cnode " << *cn << " attr " << i->first << " len " << len << dendl;
- }
-
- ec.cnode_bytes = off - start_off;
- bl.copy_in(start_off + sizeof(csum_t), sizeof(__u32), (char*)&ec.cnode_bytes);
- ec.cnode_csum = calc_csum_unaligned(bl.c_str() + start_off + sizeof(csum_t),
- ec.cnode_bytes - sizeof(csum_t));
- bl.copy_in(start_off, sizeof(csum_t), (char*)&ec);
- dout(15) << "encode_cnode len " << ec.cnode_bytes << " csum " << ec.cnode_csum << dendl;
-
- return ec.cnode_csum;
-}
-
-void Ebofs::write_cnode(Cnode *cn)
-{
- // allocate buffer
- unsigned bytes = sizeof(ebofs_cnode) + cn->get_attr_bytes();
- unsigned blocks = DIV_ROUND_UP(bytes, EBOFS_BLOCK_SIZE);
-
- bufferlist bl;
- //bufferpool.alloc( EBOFS_BLOCK_SIZE*blocks, bl );
- bl.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*blocks) );
-
- // relocate cnode!
- if (cn->cnode_loc.length)
- allocator.release(cn->cnode_loc);
- allocator.allocate(cn->cnode_loc, blocks, Allocator::NEAR_LAST_FWD);
-
- dout(10) << "write_cnode " << *cn << " to " << cn->cnode_loc
- << " bufptr " << (void*)bl.c_str() << dendl;
-
- // encode
- unsigned off = 0;
- csum_t csum = encode_cnode(cn, bl, off);
- assert(off == bytes);
- if (off < bl.length())
- bl.zero(off, bl.length()-off);
-
- // update pointer
- collection_tab->remove(cn->coll_id);
- ebofs_inode_ptr ptr(cn->cnode_loc, csum);
- collection_tab->insert(cn->coll_id, ptr);
-
- // write
- dev.write( cn->cnode_loc.start, cn->cnode_loc.length, bl,
- new C_E_InodeFlush(this), "write_cnode" );
-}
-
-void Ebofs::remove_cnode(Cnode *cn)
-{
- dout(10) << "remove_cnode " << *cn << dendl;
-
- // remove from table
- collection_tab->remove(cn->coll_id);
-
- // free cnode space
- if (cn->cnode_loc.length)
- allocator.release(cn->cnode_loc);
-
- // remove from dirty list?
- if (cn->is_dirty())
- dirty_cnodes.erase(cn);
-
- // remove from map and lru
- cnode_map.erase(cn->coll_id);
- cnode_lru.lru_remove(cn);
-
- // count down refs
- cn->mark_clean();
- cn->put();
- assert(cn->get_ref_count() == 0);
-
- // hose.
- delete cn;
-
- dirty = true;
-}
-
-void Ebofs::put_cnode(Cnode *cn)
-{
- cn->put();
-}
-
-void Ebofs::dirty_cnode(Cnode *cn)
-{
- if (!cn->is_dirty()) {
- cn->mark_dirty();
- dirty_cnodes.insert(cn);
- }
- dirty = true;
-}
-
-
-
-
-
-void Ebofs::flush_inode_finish()
-{
- ebofs_lock.Lock();
- {
- inodes_flushing--;
- if (inodes_flushing < 1000)
- dout(20) << "flush_inode_finish, " << inodes_flushing << " left" << dendl;
- if (inodes_flushing == 0)
- inode_commit_cond.Signal();
- }
- ebofs_lock.Unlock();
-}
-
-void Ebofs::commit_inodes_start()
-{
- dout(10) << "commit_inodes_start" << dendl;
-
- assert(inodes_flushing == 0);
-
- // onodes
- for (set<Onode*>::iterator i = dirty_onodes.begin();
- i != dirty_onodes.end();
- i++) {
- Onode *on = *i;
- inodes_flushing++;
- write_onode(on);
- on->mark_clean();
- on->uncommitted.clear(); // commit any newly allocated blocks
- }
- dirty_onodes.clear();
-
- // cnodes
- for (set<Cnode*>::iterator i = dirty_cnodes.begin();
- i != dirty_cnodes.end();
- i++) {
- Cnode *cn = *i;
- inodes_flushing++;
- write_cnode(cn);
- cn->mark_clean();
- }
- dirty_cnodes.clear();
-
- dout(10) << "commit_inodes_start writing " << inodes_flushing << " onodes+cnodes" << dendl;
-}
-
-void Ebofs::commit_inodes_wait()
-{
- // caller must hold ebofs_lock
- while (inodes_flushing > 0) {
- dout(10) << "commit_inodes_wait waiting for " << inodes_flushing << " onodes+cnodes to flush" << dendl;
- inode_commit_cond.Wait(ebofs_lock);
- }
- dout(10) << "commit_inodes_wait all flushed" << dendl;
-}
-
-
-
-
-
-
-
-// *** buffer cache ***
-
-void Ebofs::trim_buffer_cache()
-{
- ebofs_lock.Lock();
- trim_bc(0);
- ebofs_lock.Unlock();
-}
-
-void Ebofs::trim_bc(int64_t max)
-{
- if (max < 0)
- max = g_conf.ebofs_bc_size;
- dout(10) << "trim_bc start: size " << bc.get_size() << ", trimmable " << bc.get_trimmable() << ", max " << max << dendl;
-
- while (bc.get_size() > (uint64_t)max &&
- bc.get_trimmable()) {
- BufferHead *bh = (BufferHead*) bc.lru_rest.lru_expire();
- if (!bh) break;
-
- dout(25) << "trim_bc trimming " << *bh << dendl;
- assert(bh->is_clean() || bh->is_corrupt());
-
- ObjectCache *oc = bh->oc;
- bc.remove_bh(bh);
-
- if (oc->is_empty()) {
- Onode *on = oc->on;
- dout(10) << "trim_bc closing oc on " << *on << dendl;
- on->close_oc();
- }
- }
-
- dout(10) << "trim_bc finish: size " << bc.get_size() << ", trimmable " << bc.get_trimmable() << ", max " << max << dendl;
-}
-
-
-void Ebofs::kick_idle()
-{
- dout(10) << "kick_idle" << dendl;
- //commit_cond.Signal();
-
- ebofs_lock.Lock();
- if (mounted && !unmounting && dirty) {
- dout(10) << "kick_idle dirty, doing commit" << dendl;
- commit_cond.Signal();
- } else {
- dout(10) << "kick_idle !dirty or !mounted or unmounting, doing nothing" << dendl;
- }
- ebofs_lock.Unlock();
-}
-
-void Ebofs::sync(Context *onsafe)
-{
- ebofs_lock.Lock();
- if (onsafe) {
- dirty = true;
-
- if (journal) {
- // journal empty transaction
- Transaction t;
- bufferlist bl;
- t.encode(bl);
- journal->submit_entry(++op_seq, bl, onsafe);
- } else
- queue_commit_waiter(onsafe);
- }
- ebofs_lock.Unlock();
-}
-
-void Ebofs::sync()
-{
- ebofs_lock.Lock();
- if (!dirty) {
- dout(7) << "sync in " << super_epoch << ", not dirty" << dendl;
- } else {
- epoch_t start = super_epoch;
- dout(7) << "sync start in " << start << dendl;
- while (super_epoch == start) {
- dout(7) << "sync kicking commit in " << super_epoch << dendl;
- dirty = true;
- commit_cond.Signal();
- sync_cond.Wait(ebofs_lock);
- }
- dout(10) << "sync finish in " << super_epoch << dendl;
- }
- ebofs_lock.Unlock();
-}
-
-
-
-void Ebofs::commit_bc_wait(version_t epoch)
-{
- dout(10) << "commit_bc_wait on epoch " << epoch << dendl;
-
- while (bc.get_unflushed(EBOFS_BC_FLUSH_BHWRITE,epoch) > 0 ||
- bc.get_unflushed(EBOFS_BC_FLUSH_PARTIAL,epoch) > 0) {
- //dout(10) << "commit_bc_wait " << bc.get_unflushed(epoch) << " unflushed in epoch " << epoch << dendl;
- dout(10) << "commit_bc_wait epoch " << epoch
- << ", unflushed bhwrite " << bc.get_unflushed(EBOFS_BC_FLUSH_BHWRITE)
- << ", unflushed partial " << bc.get_unflushed(EBOFS_BC_FLUSH_PARTIAL)
- << dendl;
- bc.waitfor_flush();
- }
-
- bc.get_unflushed(EBOFS_BC_FLUSH_BHWRITE).erase(epoch);
- bc.get_unflushed(EBOFS_BC_FLUSH_PARTIAL).erase(epoch);
-
- dout(10) << "commit_bc_wait all flushed for epoch " << epoch
- << "; " << bc.get_unflushed(EBOFS_BC_FLUSH_BHWRITE)
- << " " << bc.get_unflushed(EBOFS_BC_FLUSH_PARTIAL)
- << dendl;
-}
-
-
-
-int Ebofs::statfs(struct statfs *buf)
-{
- dout(7) << "statfs" << dendl;
-
- buf->f_type = EBOFS_MAGIC; /* type of filesystem */
- buf->f_bsize = 4096; /* optimal transfer block size */
- buf->f_blocks = dev.get_num_blocks(); /* total data blocks in file system */
- buf->f_bfree = get_free_blocks()
- + get_limbo_blocks(); /* free blocks in fs */
- buf->f_bavail = get_free_blocks(); /* free blocks avail to non-superuser -- actually, for writing. */
- buf->f_files = nodepool.get_num_total(); /* total file nodes in file system */
- buf->f_ffree = nodepool.get_num_free(); /* free file nodes in fs */
- //buf->f_fsid = 0; /* file system id */
-#ifndef DARWIN
- buf->f_namelen = 8; /* maximum length of filenames */
-#endif // DARWIN
-
- return 0;
-}
-
-
-
-
-/*
- * allocate a write to blocks on disk.
- * - take care not to overwrite any "safe" data blocks.
- * - allocate/map new extents on disk as necessary
- */
-void Ebofs::alloc_write(Onode *on,
- block_t start, block_t len,
- interval_set<block_t>& alloc,
- block_t& old_bfirst, block_t& old_blast,
- csum_t& old_csum_first, csum_t& old_csum_last)
-{
- // first decide what pages to (re)allocate
- alloc.insert(start, len); // start with whole range
-
- // figure out what bits are already uncommitted
- interval_set<block_t> already_uncom;
- already_uncom.intersection_of(alloc, on->uncommitted);
-
- // subtract those off, so we're left with the committed bits (that must be reallocated).
- alloc.subtract(already_uncom);
-
- dout(10) << "alloc_write must (re)alloc " << alloc << " on " << *on << dendl;
-
- // release it (into limbo)
- for (map<block_t,block_t>::iterator i = alloc.m.begin();
- i != alloc.m.end();
- i++) {
- // get old region
- vector<extent_t> old;
- on->map_extents(i->first, i->second, old, 0);
- for (unsigned o=0; o<old.size(); o++)
- if (old[o].start)
- allocator.release(old[o]);
-
- // take note if first/last blocks in write range are remapped.. in case we need to do a partial read/write thing
- // these are only for the partial read/write path, so we don't care about TX bhs or the cancel logic below.
- if (!old.empty()) {
- if (old[0].start &&
- i->first == start) { // ..if not a hole..
- old_bfirst = old[0].start;
- old_csum_first = *on->get_extent_csum_ptr(start, 1);
- dout(20) << "alloc_write old_bfirst " << old_bfirst << " of " << old[0]
- << " csum " << old_csum_first << dendl;
- }
- if (old[old.size()-1].start &&
- i->first+i->second == start+len &&
- start+len <= on->last_block) {
- old_blast = old[old.size()-1].last();
- old_csum_last = *on->get_extent_csum_ptr(start+len-1, 1);
- dout(20) << "alloc_write old_blast " << old_blast << " of " << old[old.size()-1]
- << " csum " << old_csum_last << dendl;
- }
- }
- }
-
- // reallocate uncommitted too?
- // ( --> yes. we can always make better allocation decisions later, with more information. )
- if (g_conf.ebofs_realloc) {
- list<BufferHead*> tx;
-
- ObjectCache *oc = on->get_oc(&bc);
- oc->find_tx(start, len, tx);
-
- for (list<BufferHead*>::reverse_iterator p = tx.rbegin();
- p != tx.rend();
- p++) {
- BufferHead *bh = *p;
-
- // cancelable/moveable?
- if (alloc.contains(bh->start(), bh->length())) {
- dout(10) << "alloc_write " << *bh << " already in " << alloc << dendl;
- continue;
- }
-
- vector<extent_t> old;
- on->map_extents(bh->start(), bh->length(), old, 0);
- assert(old.size() == 1);
-
- if (bh->start() >= start && bh->end() <= start+len) {
- assert(bh->epoch_modified == super_epoch);
- if (bc.bh_cancel_write(bh, super_epoch)) {
- if (bh->length() == 1)
- dout(10) << "alloc_write unallocated tx " << old[0] << ", canceled " << *bh << dendl;
- // no, this isn't compatible with clone() and extent reference counting.
- //allocator.unallocate(old[0]); // release (into free)
- allocator.release(old[0]); // **FIXME** no cloning yet, my friend!
- alloc.insert(bh->start(), bh->length());
- } else {
- if (bh->length() == 1)
- dout(10) << "alloc_write released tx " << old[0] << ", couldn't cancel " << *bh << dendl;
- allocator.release(old[0]); // release (into limbo)
- alloc.insert(bh->start(), bh->length());
- }
- } else {
- if (bh->length() == 1)
- dout(10) << "alloc_write skipped tx " << old[0] << ", not entirely within "
- << start << "~" << len
- << " bh " << *bh << dendl;
- }
- }
-
- dout(10) << "alloc_write will (re)alloc " << alloc << " on " << *on << dendl;
- }
-
- if (alloc.empty()) return; // no need to dirty the onode below!
-
-
- // merge alloc into onode uncommitted map
- //dout(10) << " union of " << on->uncommitted << " and " << alloc << dendl;
- interval_set<block_t> old = on->uncommitted;
- on->uncommitted.union_of(alloc);
-
- dout(10) << "alloc_write onode.uncommitted is now " << on->uncommitted << dendl;
-
- if (0) {
- // verify
- interval_set<block_t> ta;
- ta.intersection_of(on->uncommitted, alloc);
- dout(0) << " ta " << ta << dendl;
- assert(alloc == ta);
-
- interval_set<block_t> tb;
- tb.intersection_of(on->uncommitted, old);
- dout(0) << " tb " << tb << dendl;
- assert(old == tb);
- }
-
- dirty_onode(on);
-
- // allocate the space
- for (map<block_t,block_t>::iterator i = alloc.m.begin();
- i != alloc.m.end();
- i++) {
- dout(15) << "alloc_write alloc " << i->first << "~" << i->second << " (of " << start << "~" << len << ")" << dendl;
-
- // allocate new space
- block_t left = i->second;
- block_t cur = i->first;
- while (left > 0) {
- extent_t ex;
- allocator.allocate(ex, left, Allocator::NEAR_LAST_FWD);
- dout(10) << "alloc_write got " << ex << " for object offset " << cur << dendl;
- on->set_extent(cur, ex); // map object to new region
- left -= ex.length;
- cur += ex.length;
- }
- }
-}
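
The first half of alloc_write is set arithmetic on block ranges: start from the whole write, subtract the blocks that are still uncommitted in this epoch, and only the remainder must be reallocated. A toy illustration with a std::set of block numbers standing in for Ceph's interval_set:

#include <cstdint>
#include <iostream>
#include <set>

int main() {
  std::set<uint64_t> uncommitted = {10, 11, 12};      // blocks already rewritten this epoch
  std::set<uint64_t> write       = {11, 12, 13, 14};  // blocks touched by the new write

  std::set<uint64_t> must_realloc;                    // committed blocks: must move to new extents
  for (uint64_t b : write)
    if (!uncommitted.count(b))
      must_realloc.insert(b);

  for (uint64_t b : must_realloc)
    std::cout << b << ' ';                            // prints: 13 14
  std::cout << '\n';
  return 0;
}
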
-
-
-int Ebofs::check_partial_edges(Onode *on, uint64_t off, uint64_t len,
- bool &partial_head, bool &partial_tail)
-{
- // partial block overwrite at head or tail?
- uint64_t last_block_byte = on->last_block * EBOFS_BLOCK_SIZE;
- partial_head = (off < last_block_byte) && (off & EBOFS_BLOCK_MASK);
- partial_tail = ((off+len) < on->object_size) && ((off+len) & EBOFS_BLOCK_MASK);
- dout(10) << "check_partial_edges on " << *on << " " << off << "~" << len
- << " " << partial_head << "/" << partial_tail << dendl;
-
- if ((partial_head || partial_tail) && commit_starting) {
- ObjectCache *oc = on->get_oc(&bc);
-
- // verify that partials don't depend on unread data!
- if (partial_head) {
- block_t bstart = off / EBOFS_BLOCK_SIZE;
- BufferHead *bh = oc->find_bh_containing(bstart);
- if (!bh) {
- dout(10) << "check_partial_edges missing data for partial head, deferring" << dendl;
- return -1;
- }
- if (bh->is_missing() || bh->is_rx()) {
- dout(10) << "check_partial_edges missing data for partial head " << *bh << ", deferring" << dendl;
- return -1;
- }
- if (bh->is_partial()) {
- unsigned off_in_bh = off & EBOFS_BLOCK_MASK;
- unsigned end_in_bh = MAX(EBOFS_BLOCK_SIZE, off_in_bh+len);
- if (!(off_in_bh == 0 || bh->have_partial_range(0, off_in_bh)) ||
- !(end_in_bh == EBOFS_BLOCK_SIZE || bh->have_partial_range(end_in_bh, EBOFS_BLOCK_SIZE-end_in_bh))) {
- dout(10) << "check_partial_edges can't complete partial head " << *bh << ", deferring" << dendl;
- return -1;
- }
- }
- }
- if (partial_tail) {
- block_t blast = (len+off-1) / EBOFS_BLOCK_SIZE;
- BufferHead *bh = oc->find_bh_containing(blast);
- if (!bh) {
- dout(10) << "check_partial_edges missing data for partial tail, deferring" << dendl;
- return -1;
- }
- if (bh->is_missing() || bh->is_rx()) {
- dout(10) << "check_partial_edges missing data for partial tail " << *bh << ", deferring" << dendl;
- return -1;
- }
- if (bh->is_partial()) {
- uint64_t off_in_bh = off & EBOFS_BLOCK_MASK;
- uint64_t end_in_bh = MAX(EBOFS_BLOCK_SIZE, off_in_bh+len);
- uint64_t end = EBOFS_BLOCK_SIZE;
- if (bh->end()*EBOFS_BLOCK_SIZE > last_block_byte)
- end = last_block_byte & EBOFS_BLOCK_MASK;
- if (!(off_in_bh == 0 || bh->have_partial_range(0, off_in_bh)) ||
- !(end_in_bh >= end || bh->have_partial_range(end_in_bh, end-end_in_bh))) {
- dout(10) << "check_partial_edges can't complete partial tail " << *bh << ", deferring" << dendl;
- return -1;
- }
- }
- }
- dout(10) << "check_partial_edges commit_starting, and partial head|tail, but we can proceed." << dendl;
- }
-
- return 0;
-}
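
A worked example of the partial-edge test above, assuming a 4 KB block size with EBOFS_BLOCK_MASK taken to be block size minus one (the offsets and sizes are invented):

#include <cstdint>
#include <iostream>

int main() {
  const uint64_t block_size = 4096, block_mask = block_size - 1;

  uint64_t off = 1000, len = 5000;             // 5000-byte write at offset 1000
  uint64_t object_size = 20000;                // existing object size in bytes
  uint64_t last_block_byte = 5 * block_size;   // 5 allocated blocks -> byte 20480

  bool partial_head = (off < last_block_byte) && (off & block_mask);
  bool partial_tail = ((off + len) < object_size) && ((off + len) & block_mask);

  // 1000 and 6000 both fall mid-block and both lie inside existing data,
  // so this write has a partial head and a partial tail.
  std::cout << partial_head << ' ' << partial_tail << '\n';   // prints: 1 1
  return 0;
}
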
-
-int Ebofs::apply_write(Onode *on, uint64_t off, uint64_t len, const bufferlist& bl)
-{
- ObjectCache *oc = on->get_oc(&bc);
- //oc->scrub_csums();
-
- assert(bl.length() == len);
-
- // map into blocks
- uint64_t opos = off; // byte pos in object
- uint64_t left = len; // bytes left
- block_t bstart = off / EBOFS_BLOCK_SIZE;
- block_t blast = (len+off-1) / EBOFS_BLOCK_SIZE;
- block_t blen = blast-bstart+1;
-
- // check partial edges
- bool partial_head, partial_tail;
- if (check_partial_edges(on, off, len, partial_head, partial_tail) < 0)
- return -1;
-
- // -- starting changing stuff --
-
- // extending object?
- uint64_t old_object_size = on->object_size;
- if (off+len > on->object_size) {
- dout(10) << "apply_write extending size on " << *on << ": " << on->object_size
- << " -> " << off+len << dendl;
- on->object_size = off+len;
- }
-
- // map block range onto buffer_heads
- map<block_t, BufferHead*> hits;
- oc->map_write(bstart, blen, hits, super_epoch);
-
- // allocate write on disk.
- interval_set<block_t> alloc;
- block_t old_last_block = on->last_block;
- block_t old_bfirst = 0; // zero means not defined here (since we ultimately pass to bh_read)
- block_t old_blast = 0;
- csum_t old_csum_first = 0;
- csum_t old_csum_last = 0;
- alloc_write(on, bstart, blen, alloc, old_bfirst, old_blast, old_csum_first, old_csum_last);
- dout(20) << "apply_write old_bfirst " << old_bfirst << ", old_blast " << old_blast << dendl;
-
- if (fake_writes) {
- on->uncommitted.clear(); // worst case!
- return 0;
- }
-
- // get current versions
- version_t highv = ++oc->write_count;
-
- // copy from bl into buffer cache
- list<Context*> finished;
- unsigned blpos = 0; // byte pos in input buffer
- for (map<block_t, BufferHead*>::iterator i = hits.begin();
- i != hits.end();
- i++) {
- BufferHead *bh = i->second;
- bh->set_version(highv);
- bh->epoch_modified = super_epoch;
-
- // break bh over disk extent boundaries
- vector<extent_t> exv;
- on->map_extents(bh->start(), bh->length(), exv, 0);
- dout(10) << "apply_write bh " << *bh << " maps to " << exv << dendl;
- if (exv.size() > 1) {
- dout(10) << "apply_write breaking interior bh " << *bh << " over extent boundary "
- << exv[0] << " " << exv[1] << dendl;
- BufferHead *right = bc.split(bh, bh->start() + exv[0].length);
- hits[right->start()] = right;
- }
-
- // mark holes 'clean'
- if (bh->start() >= old_last_block) {
- assert(bh->is_missing());
- bc.mark_clean(bh);
- dout(10) << "apply_write treating appended bh as a hole " << *bh << dendl;
- } else {
- if (exv[0].start == 0) {
- assert(bh->is_missing() || bh->is_clean());
- dout(10) << "apply_write marking old hole clean " << *bh << dendl;
- bc.mark_clean(bh);
- }
- }
-
- // take read waiters
- bh->take_read_waiters(finished); // this is a bit aggressive, since we kick waiters on partials
-
- // need to split off partial? (partials can only be ONE block)
- if ((bh->is_missing() || bh->is_rx()) && bh->length() > 1) {
- if (bh->start() == bstart && partial_head) {
- BufferHead *right = bc.split(bh, bh->start()+1);
- hits[right->start()] = right;
- dout(10) << "apply_write split off left block for partial write; rest is " << *right << dendl;
- }
- if (bh->last() == blast && partial_tail) {
- BufferHead *right = bc.split(bh, bh->last());
- hits[right->start()] = right;
- dout(10) << "apply_write split off right block for upcoming partial write; rest is " << *right << dendl;
- }
- }
-
- // locate ourselves in bh
- unsigned off_in_bh = opos - bh->start()*EBOFS_BLOCK_SIZE;
- assert(off_in_bh >= 0);
-
- // partial at head or tail?
- if ((bh->start() == bstart && partial_head) ||
- (bh->last() == blast && partial_tail)) {
- unsigned len_in_bh = MIN( left,
- (bh->end()*EBOFS_BLOCK_SIZE)-opos );
-
- if (bh->is_partial() || bh->is_rx() || bh->is_missing() || bh->is_corrupt()) {
- assert(bh->length() == 1);
-
- if (bh->is_corrupt()) {
- dout(10) << "apply_write marking non-overwritten bytes bad on corrupt " << *bh << dendl;
- interval_set<uint64_t> bad;
- uint64_t bs = bh->start() * EBOFS_BLOCK_SIZE;
- if (off_in_bh) bad.insert(bs, bs+off_in_bh);
- if (off_in_bh+len_in_bh < (unsigned)EBOFS_BLOCK_SIZE)
- bad.insert(bs+off_in_bh+len_in_bh, bs+EBOFS_BLOCK_SIZE-off_in_bh-len_in_bh);
- dout(10) << "apply_write marking non-overwritten bytes " << bad << " bad on corrupt " << *bh << dendl;
- bh->oc->on->bad_byte_extents.union_of(bad);
- csum_t csum = calc_csum(bh->data.c_str(), bh->data.length());
- dout(10) << "apply_write marking corrupt bh csum " << hex << csum << dec << " clean " << *bh << dendl;
- *on->get_extent_csum_ptr(bh->start(), 1) = csum;
- on->data_csum += csum;
- bc.mark_clean(bh);
- } else {
- // newly realloc? carry old checksum over since we're only partially overwriting
- if (bh->start() == bstart && alloc.contains(bstart)) {
- dout(10) << "apply_write carrying over starting csum " << hex << old_csum_first << dec
- << " for partial " << *bh << dendl;
- *on->get_extent_csum_ptr(bh->start(), 1) = old_csum_first;
- on->data_csum += old_csum_first;
- } else if (bh->end()-1 == blast && alloc.contains(blast)) {
- dout(10) << "apply_write carrying over ending csum " << hex << old_csum_last << dec
- << " for partial " << *bh << dendl;
- *on->get_extent_csum_ptr(bh->end()-1, 1) = old_csum_last;
- on->data_csum += old_csum_last;
- }
- }
-
- // add frag to partial
- dout(10) << "apply_write writing into partial " << *bh << ":"
- << " off_in_bh " << off_in_bh
- << " len_in_bh " << len_in_bh
- << dendl;
- bufferlist sb;
- sb.substr_of(bl, blpos, len_in_bh); // substr in existing buffer
- sb.rebuild(); // recopy into properly sized buffer, so that we drop references to user buffer
- bh->add_partial(off_in_bh, sb);
- left -= len_in_bh;
- blpos += len_in_bh;
- opos += len_in_bh;
-
- if (bh->is_partial() &&
- bh->partial_is_complete(on->object_size - bh->start()*EBOFS_BLOCK_SIZE)) {
- dout(10) << "apply_write completed partial " << *bh << dendl;
- bc.bh_cancel_read(bh); // cancel old rx op, if we can.
- bh->data.clear();
- bh->data.push_back(buffer::create_page_aligned(EBOFS_BLOCK_SIZE));
- bh->apply_partial();
- bc.mark_dirty(bh);
- bc.bh_write(on, bh);
- }
- else if (bh->is_rx()) {
- dout(10) << "apply_write rx -> partial " << *bh << dendl;
- assert(bh->length() == 1);
- bc.mark_partial(bh);
- assert(!commit_starting); // otherwise, bug in check_partial_edges
- }
- else if (bh->is_missing() || bh->is_corrupt()) {
- dout(10) << "apply_write missing -> partial " << *bh << dendl;
- assert(bh->length() == 1);
- bc.mark_partial(bh);
- assert(!commit_starting); // otherwise, bug in check_partial_edges
-
- // take care to read from _old_ disk block locations!
- if (bh->start() == bstart)
- bc.bh_read(on, bh, old_bfirst);
- else if (bh->start() == blast)
- bc.bh_read(on, bh, old_blast);
- else assert(0);
- }
- else if (bh->is_partial()) {
- dout(10) << "apply_write already partial, no need to submit rx on " << *bh << dendl;
- }
-
- } else {
- assert(bh->is_clean() || bh->is_dirty() || bh->is_tx());
-
- // just write into the bh!
- dout(10) << "apply_write writing leading/tailing partial into " << *bh << ":"
- << " off_in_bh " << off_in_bh
- << " len_in_bh " << len_in_bh
- << dendl;
-
- // copy data into new buffers first (copy on write!)
- // FIXME: only do the modified pages? this might be a big bh!
- bufferlist oldbl;
- oldbl.claim(bh->data);
- bh->data.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*bh->length()) );
- if (oldbl.length()) {
- // had data
- if (off_in_bh)
- bh->data.copy_in(0, off_in_bh, oldbl);
- if (off_in_bh+len_in_bh < bh->data.length())
- bh->data.copy_in(off_in_bh+len_in_bh, bh->data.length()-off_in_bh-len_in_bh,
- oldbl.c_str()+off_in_bh+len_in_bh);
- } else {
- // was a hole
- if (off_in_bh)
- bh->data.zero(0, off_in_bh);
- if (off_in_bh+len_in_bh < bh->data.length())
- bh->data.zero(off_in_bh+len_in_bh, bh->data.length()-off_in_bh-len_in_bh);
- }
-
- // new data
- bufferlist sub;
- sub.substr_of(bl, blpos, len_in_bh);
- bh->data.copy_in(off_in_bh, len_in_bh, sub);
-
- // update csum
- block_t rbfirst = off_in_bh/EBOFS_BLOCK_SIZE;
- block_t rblast = DIV_ROUND_UP(off_in_bh+len_in_bh, EBOFS_BLOCK_SIZE);
- block_t bnum = rblast-rbfirst;
- csum_t *csum = on->get_extent_csum_ptr(bh->start()+rbfirst, bnum);
- dout(20) << "calc csum for " << rbfirst << "~" << bnum << dendl;
- for (unsigned i=0; i<bnum; i++) {
- on->data_csum -= csum[i];
- dout(30) << "old csum for " << (i+rbfirst) << " is " << hex << csum[i] << dec << dendl;
- csum[i] = calc_csum(&bh->data[i*EBOFS_BLOCK_SIZE], EBOFS_BLOCK_SIZE);
- dout(30) << "new csum for " << (i+rbfirst) << " is " << hex << csum[i] << dec << dendl;
- on->data_csum += csum[i];
- dout(30) << "new data_csum is " << hex << on->data_csum << dec << dendl;
- }
-
- blpos += len_in_bh;
- left -= len_in_bh;
- opos += len_in_bh;
-
- if (!bh->is_dirty())
- bc.mark_dirty(bh);
-
- bc.bh_write(on, bh);
- }
- continue;
- }
-
- // ok
- // we're now writing up to a block boundary, or EOF.
- assert(off_in_bh+left >= (uint64_t)(EBOFS_BLOCK_SIZE*bh->length()) ||
- (opos+left) >= on->object_size);
-
- unsigned len_in_bh = MIN((uint64_t)bh->length()*EBOFS_BLOCK_SIZE - off_in_bh,
- left);
- assert(len_in_bh <= left);
-
- dout(10) << "apply_write writing into " << *bh << ":"
- << " off_in_bh " << off_in_bh << " len_in_bh " << len_in_bh
- << dendl;
-
- // i will write:
- bufferlist sub;
- sub.substr_of(bl, blpos, len_in_bh);
-
- if (off_in_bh == 0 &&
- sub.is_page_aligned() &&
- sub.is_n_page_sized()) {
- // assume caller isn't going to modify written buffers.
- // just reference them!
- assert(sub.length() == bh->length()*EBOFS_BLOCK_SIZE);
- dout(10) << "apply_write yippee, written buffer already page aligned" << dendl;
- bh->data.claim(sub);
- } else {
- // alloc new buffer.
- bh->data.clear();
- bh->data.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*bh->length()) );
-
- // zero leader?
- if (off_in_bh &&
- opos > old_object_size) {
- uint64_t zstart = MAX(0, old_object_size-(uint64_t)bh->start()*EBOFS_BLOCK_SIZE);
- uint64_t zlen = off_in_bh - zstart;
- dout(15) << "apply_write zeroing bh lead over " << zstart << "~" << zlen << dendl;
- bh->data.zero(zstart, zlen);
- }
-
- // copy data
- bufferlist sub;
- sub.substr_of(bl, blpos, len_in_bh);
- bh->data.copy_in(off_in_bh, len_in_bh, sub);
-
- // zero the past-eof tail, too, to be tidy.
- if (len_in_bh < bh->data.length()) {
- uint64_t zstart = off_in_bh+len_in_bh;
- uint64_t zlen = bh->data.length()-(off_in_bh+len_in_bh);
- bh->data.zero(zstart, zlen);
- dout(15) << "apply_write zeroing bh tail over " << zstart << "~" << zlen << dendl;
- }
- }
-
- // fill in csums
- unsigned blocks = DIV_ROUND_UP(off_in_bh+len_in_bh, EBOFS_BLOCK_SIZE);
- csum_t *csum = on->get_extent_csum_ptr(bh->start(), blocks);
- for (unsigned i=0; i<blocks; i++) {
- on->data_csum -= csum[i];
- csum[i] = calc_csum(bh->data.c_str() + i*EBOFS_BLOCK_SIZE, EBOFS_BLOCK_SIZE);
- on->data_csum += csum[i];
- }
- on->verify_extents();
-
- blpos += len_in_bh;
- left -= len_in_bh;
- opos += len_in_bh;
-
- // old partial?
- if (bh->is_partial())
- bc.bh_cancel_read(bh); // cancel rx (if any) too.
-
- // mark dirty
- if (!bh->is_dirty())
- bc.mark_dirty(bh);
-
- bc.bh_write(on, bh);
- }
-
- assert(left == 0);
- assert(opos == off+len);
- assert(blpos == bl.length());
-
- // oc->scrub_csums();
-
- dirty_onode(on);
- finish_contexts(finished);
- return 0;
-}
-
-
-int Ebofs::apply_zero(Onode *on, uint64_t off, size_t len)
-{
- dout(10) << "apply_zero " << off << "~" << len << " on " << *on << dendl;
-
- bool partial_head, partial_tail;
- if (check_partial_edges(on, off, len, partial_head, partial_tail) < 0)
- return -1;
-
- // zero edges
- // head?
- if (off & EBOFS_BLOCK_MASK) {
- size_t l = EBOFS_BLOCK_SIZE - (off & EBOFS_BLOCK_MASK);
- if (l > len) l = len;
- if (partial_head) {
- bufferptr bp(l);
- bp.zero();
- bufferlist bl;
- bl.push_back(bp);
- int r = apply_write(on, off, bl.length(), bl);
- assert(r == 0);
- }
- off += l;
- len -= l;
- }
- if (len == 0) return 0; // done!
-
- // tail?
- if ((off+len) & EBOFS_BLOCK_MASK) {
- int l = (off+len) & EBOFS_BLOCK_MASK;
- bufferptr bp(l);
- bp.zero();
- bufferlist bl;
- bl.push_back(bp);
- int r = apply_write(on, off+len-bl.length(), bp.length(), bl);
- assert(r == 0);
- len -= l;
- }
- if (len == 0) return 0; // done!
-
- // map middle onto buffers
- assert(len > 0);
- assert((off & EBOFS_BLOCK_MASK) == 0);
- assert((len & EBOFS_BLOCK_MASK) == 0);
- block_t bstart = off / EBOFS_BLOCK_SIZE;
- block_t blen = len / EBOFS_BLOCK_SIZE;
- assert(blen > 0);
-
- map<block_t,BufferHead*> hits;
- ObjectCache *oc = on->get_oc(&bc);
- oc->map_write(bstart, blen, hits, super_epoch);
-
- map<block_t,BufferHead*>::iterator p = hits.begin();
- while (p != hits.end()) {
- map<block_t,BufferHead*>::iterator next = p;
- next++;
- BufferHead *bh = p->second;
- oc->discard_bh(bh, super_epoch);
- p = next;
- }
-
- // free old blocks
- vector<extent_t> old;
- on->map_extents(bstart, blen, old, 0);
- for (unsigned i=0; i<old.size(); i++)
- if (old[i].start)
- allocator.release(old[i]);
- extent_t hole = {0, blen};
- on->set_extent(bstart, hole);
-
- // adjust uncom
- interval_set<block_t> zeroed;
- zeroed.insert(bstart, blen);
- interval_set<block_t> olduncom;
- olduncom.intersection_of(zeroed, on->uncommitted);
- dout(10) << "_zeroed old uncom " << on->uncommitted << " zeroed " << zeroed
- << " subtracting " << olduncom << dendl;
- on->uncommitted.subtract(olduncom);
- dout(10) << "_zeroed new uncom " << on->uncommitted << dendl;
-
- dirty_onode(on);
- return 0;
-}
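
apply_zero splits the byte range into a partial head (zeroed via apply_write), whole blocks (turned into a hole), and a partial tail. A worked example of that split, again assuming 4 KB blocks and invented offsets:

#include <algorithm>
#include <cstdint>
#include <iostream>

int main() {
  const uint64_t block_size = 4096, block_mask = block_size - 1;

  uint64_t off = 1000, len = 10000;                 // zero 10000 bytes at offset 1000

  uint64_t head = 0;
  if (off & block_mask)                             // unaligned start
    head = std::min(block_size - (off & block_mask), len);
  off += head;                                      // 4096
  len -= head;                                      // 6904

  uint64_t tail = (off + len) & block_mask;         // 2808 bytes past the last boundary
  len -= tail;                                      // 4096: one whole block becomes a hole

  std::cout << "head " << head << ", whole blocks " << len / block_size
            << ", tail " << tail << '\n';           // prints: head 3096, whole blocks 1, tail 2808
  return 0;
}
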
-
-
-
-// *** file i/o ***
-
-int Ebofs::attempt_read(Onode *on, uint64_t off, size_t len, bufferlist& bl,
- Cond *will_wait_on, bool *will_wait_on_bool)
-{
- dout(10) << "attempt_read " << *on << " " << off << "~" << len << dendl;
- ObjectCache *oc = on->get_oc(&bc);
-
- // overlapping bad byte extents?
- if (!on->bad_byte_extents.empty()) {
- if (on->bad_byte_extents.contains(off)) {
- dout(10) << "attempt_read corrupt (bad byte extent) at off " << off << ", returning -EIO" << dendl;
- return -EIO;
- }
- if (on->bad_byte_extents.end() > off) {
- uint64_t bad = on->bad_byte_extents.start_after(off);
- if (bad < off+(uint64_t)len) {
- len = bad-off;
- dout(10) << "attempt_read corrupt (bad byte extent) at " << bad << ", shortening read to " << len << dendl;
- }
- }
- }
-
- // map
- block_t bstart = off / EBOFS_BLOCK_SIZE;
- block_t blast = (len+off-1) / EBOFS_BLOCK_SIZE;
- block_t blen = blast-bstart+1;
-
- map<block_t, BufferHead*> hits;
- map<block_t, BufferHead*> missing; // read these
- map<block_t, BufferHead*> rx; // wait for these
- map<block_t, BufferHead*> partials; // ??
- oc->map_read(bstart, blen, hits, missing, rx, partials);
-
- // missing buffers?
- if (!missing.empty()) {
- for (map<block_t,BufferHead*>::iterator i = missing.begin();
- i != missing.end();
- i++) {
- dout(10) << "attempt_read missing buffer " << *(i->second) << dendl;
- bc.bh_read(on, i->second);
- }
- BufferHead *wait_on = missing.begin()->second;
- block_t b = MAX(wait_on->start(), bstart);
- wait_on->waitfor_read[b].push_back(new C_Cond(will_wait_on, will_wait_on_bool));
- return 0;
- }
-
- // wait on rx?
- if (!rx.empty()) {
- BufferHead *wait_on = rx.begin()->second;
- Context *c = new C_Cond(will_wait_on, will_wait_on_bool);
- dout(20) << "attempt_read waiting for read to finish on " << *wait_on << " c " << c << dendl;
- block_t b = MAX(wait_on->start(), bstart);
- wait_on->waitfor_read[b].push_back(c);
- return 0;
- }
-
- // are partials sufficient?
- for (map<block_t,BufferHead*>::iterator i = partials.begin();
- i != partials.end();
- i++) {
- BufferHead *bh = i->second;
- uint64_t bhstart = (uint64_t)(bh->start()*EBOFS_BLOCK_SIZE);
- uint64_t bhend = (uint64_t)(bh->end()*EBOFS_BLOCK_SIZE);
- uint64_t start = MAX( off, bhstart );
- uint64_t end = MIN( off+(uint64_t)len, bhend );
-
- if (!i->second->have_partial_range(start-bhstart, end-bhstart)) {
- // wait on this one
- Context *c = new C_Cond(will_wait_on, will_wait_on_bool);
- dout(10) << "attempt_read insufficient partial buffer " << *(i->second) << " c " << c << dendl;
- i->second->waitfor_read[i->second->start()].push_back(c);
- return 0;
- }
- dout(10) << "attempt_read have partial range " << (start-bhstart) << "~" << (end-bhstart) << " on " << *bh << dendl;
- }
-
- // yay, we have it all!
- // concurrently walk thru hits, partials, corrupt.
- map<block_t,BufferHead*>::iterator h = hits.begin();
- map<block_t,BufferHead*>::iterator p = partials.begin();
-
- bl.clear();
- uint64_t pos = off;
- block_t curblock = bstart;
- while (curblock <= blast) {
- BufferHead *bh = 0;
- if (h != hits.end() && h->first == curblock) {
- bh = h->second;
- h++;
- } else if (p != partials.end() && p->first == curblock) {
- bh = p->second;
- p++;
- } else assert(0);
-
- uint64_t bhstart = (uint64_t)(bh->start()*EBOFS_BLOCK_SIZE);
- uint64_t bhend = (uint64_t)(bh->end()*EBOFS_BLOCK_SIZE);
- uint64_t start = MAX( pos, bhstart );
- uint64_t end = MIN( off+(uint64_t)len, bhend );
-
- if (bh->is_corrupt()) {
- if (bl.length()) {
- dout(10) << "attempt_read corrupt at " << *bh << ", returning short result" << dendl;
- return 1;
- } else {
- dout(10) << "attempt_read corrupt at " << *bh << ", returning -EIO" << dendl;
- return -EIO;
- }
- } else if (bh->is_partial()) {
- // copy from a partial block. yuck!
- bufferlist frag;
- dout(10) << "attempt_read copying partial range " << (start-bhstart) << "~" << (end-bhstart) << " on " << *bh << dendl;
- bh->copy_partial_substr( start-bhstart, end-bhstart, frag );
- bl.claim_append( frag );
- pos += frag.length();
- } else {
- // copy from a full block.
- if (bhstart == start && bhend == end) {
- if (bh->data.length()) {
- dout(10) << "aligned " << (start-bhstart) << "~" << (end-start) << " of " << bh->data.length() << " in " << *bh << dendl;
- bl.append( bh->data );
- pos += bh->data.length();
- } else {
- dout(10) << "aligned " << (start-bhstart) << "~" << (end-start) << " of hole in " << *bh << dendl;
- bl.append_zero(end-start);
- pos += end-start;
- }
- } else {
- if (bh->data.length()) {
- dout(10) << "substr " << (start-bhstart) << "~" << (end-start) << " of " << bh->data.length() << " in " << *bh << dendl;
- bufferlist frag;
- frag.substr_of(bh->data, start-bhstart, end-start);
- pos += frag.length();
- bl.claim_append( frag );
- } else {
- dout(10) << "substr " << (start-bhstart) << "~" << (end-start) << " of hole in " << *bh << dendl;
- bl.append_zero(end-start);
- pos += end-start;
- }
- }
- }
-
- curblock = bh->end();
- }
-
- assert(bl.length() == len);
- return 1;
-}
-
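
For the record, the read path being deleted here follows a wait-or-complete contract: attempt_read either assembles the whole result from cached buffers, or it parks a waiter on the first buffer it cannot satisfy, returns 0, and the caller retries after sleeping on a condition variable. A minimal standalone sketch of that contract, using hypothetical names (ToyCache, blocking_read) rather than the EboFS types:

#include <condition_variable>
#include <functional>
#include <map>
#include <mutex>
#include <string>
#include <vector>

// Toy block cache: present blocks map to data; absent blocks arrive later via
// finish_read(). Both cache methods assume the caller holds 'lk'.
struct ToyCache {
  std::mutex lk;
  std::map<int, std::string> blocks;                          // cached data
  std::map<int, std::vector<std::function<void()>>> waiters;  // per-block read waiters

  // Return 1 and fill 'out' if blocks [first,last] are all present; otherwise
  // register 'wakeup' on the first missing block and return 0 (caller retries).
  int attempt_read(int first, int last, std::string& out, std::function<void()> wakeup) {
    for (int b = first; b <= last; ++b)
      if (!blocks.count(b)) {
        waiters[b].push_back(std::move(wakeup));
        return 0;
      }
    out.clear();
    for (int b = first; b <= last; ++b)
      out += blocks[b];
    return 1;
  }

  // Data for block 'b' arrived; wake anyone parked on it.
  void finish_read(int b, std::string data) {
    blocks[b] = std::move(data);
    for (auto& w : waiters[b])
      w();
    waiters.erase(b);
  }
};

// Caller side: loop until attempt_read succeeds, sleeping on a condvar in between.
std::string blocking_read(ToyCache& c, int first, int last) {
  std::unique_lock<std::mutex> l(c.lk);
  std::condition_variable cond;
  bool done = false;
  std::string out;
  while (c.attempt_read(first, last, out, [&] { done = true; cond.notify_one(); }) == 0) {
    cond.wait(l, [&] { return done; });
    done = false;   // another buffer may still be missing; try again
  }
  return out;
}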
-
-/*
- * is_cached -- query whether a object extent is in our cache
- * return value of -1 if onode isn't loaded. otherwise, the number
- * of extents that need to be read (i.e. # of seeks)
- */
-int Ebofs::is_cached(coll_t cid, pobject_t oid, uint64_t off, size_t len)
-{
- ebofs_lock.Lock();
- int r = _is_cached(oid, off, len);
- ebofs_lock.Unlock();
- return r;
-}
-
-int Ebofs::_is_cached(pobject_t oid, uint64_t off, size_t len)
-{
- if (!have_onode(oid)) {
- dout(7) << "_is_cached " << oid << " " << off << "~" << len << " ... onode " << dendl;
- return -1; // object dne?
- }
- Onode *on = get_onode(oid);
-
- if (!on->have_oc()) {
- // nothing is cached. return # of extents in file.
- dout(10) << "_is_cached have onode but no object cache, returning extent count" << dendl;
- return on->extent_map.size();
- }
-
- // map
- block_t bstart = off / EBOFS_BLOCK_SIZE;
- block_t blast = (len+off-1) / EBOFS_BLOCK_SIZE;
- block_t blen = blast-bstart+1;
-
- map<block_t, BufferHead*> hits;
- map<block_t, BufferHead*> missing; // read these
- map<block_t, BufferHead*> rx; // wait for these
- map<block_t, BufferHead*> partials; // ??
-
- int num_missing = on->get_oc(&bc)->try_map_read(bstart, blen);
- dout(7) << "_is_cached try_map_read reports " << num_missing << " missing extents" << dendl;
- return num_missing;
-
- // FIXME: actually, we should calculate if these extents are contiguous.
- // and not using map_read, probably...
- /* hrmpf
- block_t dpos = 0;
- block_t opos = bstart;
- while (opos < blen) {
- if (hits.begin()->first == opos) {
- } else {
- block_t d;
- if (missing.begin()->first == opos) d = missing.begin()->second.
-
- }
- */
-}
-
-void Ebofs::trim_from_cache(coll_t cid, pobject_t oid, uint64_t off, size_t len)
-{
- ebofs_lock.Lock();
- _trim_from_cache(oid, off, len);
- ebofs_lock.Unlock();
-}
-
-void Ebofs::_trim_from_cache(pobject_t oid, uint64_t off, size_t len)
-{
- // be careful not to load it if we don't have it
- if (!have_onode(oid)) {
- dout(7) << "_trim_from_cache " << oid << " " << off << "~" << len << " ... onode not in cache " << dendl;
- return;
- }
-
- // ok, we have it, get a pointer.
- Onode *on = get_onode(oid);
-
- if (!on->have_oc())
- return; // nothing is cached.
-
- // map to blocks
- block_t bstart = off / EBOFS_BLOCK_SIZE;
- block_t blast = (len+off-1) / EBOFS_BLOCK_SIZE;
-
- ObjectCache *oc = on->get_oc(&bc);
- oc->touch_bottom(bstart, blast);
-
- return;
-}
-
-
-int Ebofs::read(coll_t cid, pobject_t oid,
- uint64_t off, size_t len,
- bufferlist& bl)
-{
- ebofs_lock.Lock();
- int r = _read(oid, off, len, bl);
- ebofs_lock.Unlock();
- return r;
-}
-
-int Ebofs::_read(pobject_t oid, uint64_t off, size_t len, bufferlist& bl)
-{
- dout(7) << "_read " << oid << " " << off << "~" << len << dendl;
-
- Onode *on = get_onode(oid);
- if (!on) {
- dout(7) << "_read " << oid << " " << off << "~" << len << " ... dne " << dendl;
- return -ENOENT; // object dne?
- }
-
- // read data into bl. block as necessary.
- Cond cond;
-
- int r = 0;
- while (1) {
- // check size bound
- if (off >= on->object_size) {
- dout(7) << "_read " << oid << " " << off << "~" << len << " ... off past eof " << on->object_size << dendl;
- r = 0;
- break;
- }
-
- size_t try_len = len ? len:on->object_size;
- size_t will_read = MIN(off+(uint64_t)try_len, on->object_size) - off;
-
- bool done;
- r = attempt_read(on, off, will_read, bl, &cond, &done);
- if (r != 0)
- break;
-
- // wait
- while (!done)
- cond.Wait(ebofs_lock);
-
- if (on->deleted) {
- dout(7) << "_read " << oid << " " << off << "~" << len << " ... object deleted" << dendl;
- r = -ENOENT;
- break;
- }
- }
-
- put_onode(on);
-
- trim_bc();
-
- if (r < 0) return r; // return error,
- dout(7) << "_read " << oid << " " << off << "~" << len << " ... got " << bl.length() << dendl;
- return bl.length(); // or bytes read.
-}
-
-
-bool Ebofs::_write_will_block()
-{
- return (bc.get_stat_dirty()+bc.get_stat_tx() > g_conf.ebofs_bc_max_dirty);
-}
-
-bool Ebofs::write_will_block()
-{
- ebofs_lock.Lock();
- bool b = _write_will_block();
- ebofs_lock.Unlock();
- return b;
-}
-
-
-unsigned Ebofs::apply_transaction(Transaction& t, Context *onjournal, Context *ondisk)
-{
- ebofs_lock.Lock();
- dout(7) << "apply_transaction start (" << t.get_num_ops() << " ops)" << dendl;
-
- bufferlist bl;
- if (journal)
- t.encode(bl);
-
- unsigned r = _apply_transaction(t);
-
- // journal, wait for commit
- if (r != 0) {
- if (onjournal) {
- delete onjournal; // kill callback, but still journal below (in case transaction had side effects)
- onjournal = 0;
- }
- if (ondisk) {
- delete ondisk;
- ondisk = 0;
- }
- }
-
- if (journal) {
- journal->submit_entry(++op_seq, bl, onjournal);
- } else if (onjournal)
- queue_commit_waiter(onjournal);
-
- if (ondisk)
- queue_commit_waiter(ondisk);
-
- ebofs_lock.Unlock();
- return r;
-}
-
-unsigned Ebofs::_apply_transaction(Transaction& t)
-{
- // verify we have enough space
- if (t.disk_space_required() > get_free_blocks()*EBOFS_BLOCK_SIZE) {
- derr(0) << "apply_transaction needs " << t.disk_space_required() << " bytes > "
- << (get_free_blocks()*EBOFS_BLOCK_SIZE) << " free" << dendl;
- return -ENOSPC;
- }
-
- // do ops
- unsigned r = 0; // bit fields indicate which ops failed.
- int bit = 1;
- while (t.have_op()) {
- int op = t.get_op();
- switch (op) {
-
- case Transaction::OP_STARTSYNC:
- dirty = true;
- commit_cond.Signal();
- break;
-
- case Transaction::OP_TOUCH:
- {
- coll_t cid = t.get_cid();
- pobject_t oid = t.get_oid();
- if (_touch(cid, oid) < 0) {
- dout(7) << "apply_transaction fail on _touch" << dendl;
- r &= bit;
- }
- }
- break;
-
- case Transaction::OP_WRITE:
- {
- coll_t cid = t.get_cid();
- pobject_t oid = t.get_oid();
- uint64_t offset = t.get_length();
- uint64_t len = t.get_length();
- bufferlist& bl = t.get_bl();
- if (_write(cid, oid, offset, len, bl) < 0) {
- dout(7) << "apply_transaction fail on _write" << dendl;
- r &= bit;
- }
- }
- break;
-
- case Transaction::OP_ZERO:
- {
- coll_t cid = t.get_cid();
- pobject_t oid = t.get_oid();
- uint64_t offset = t.get_length();
- uint64_t len = t.get_length();
- if (_zero(cid, oid, offset, len) < 0) {
- dout(7) << "apply_transaction fail on _zero" << dendl;
- r &= bit;
- }
- }
- break;
-
- case Transaction::OP_TRIMCACHE:
- {
- coll_t cid = t.get_cid();
- pobject_t oid = t.get_oid();
- uint64_t offset = t.get_length();
- uint64_t len = t.get_length();
- _trim_from_cache(oid, offset, len);
- }
- break;
-
- case Transaction::OP_TRUNCATE:
- {
- coll_t cid = t.get_cid();
- pobject_t oid = t.get_oid();
- uint64_t offset = t.get_length();
- if (_truncate(cid, oid, offset) < 0) {
- dout(7) << "apply_transaction fail on _truncate" << dendl;
- r &= bit;
- }
- }
- break;
-
- case Transaction::OP_REMOVE:
- {
- coll_t cid = t.get_cid();
- pobject_t oid = t.get_oid();
- if (_remove(cid, oid) < 0) {
- dout(7) << "apply_transaction fail on _remove" << dendl;
- r &= bit;
- }
- }
- break;
-
- case Transaction::OP_SETATTR:
- {
- coll_t cid = t.get_cid();
- pobject_t oid = t.get_oid();
- const char *attrname = t.get_attrname();
- bufferlist& bl = t.get_bl();
- if (_setattr(oid, attrname, bl.c_str(), bl.length()) < 0) {
- dout(7) << "apply_transaction fail on _setattr" << dendl;
- r &= bit;
- }
- }
- break;
-
- case Transaction::OP_SETATTRS:
- {
- coll_t cid = t.get_cid();
- pobject_t oid = t.get_oid();
- map<string,bufferptr>& attrset = t.get_attrset();
- if (_setattrs(oid, attrset) < 0) {
- dout(7) << "apply_transaction fail on _setattrs" << dendl;
- r &= bit;
- }
- }
- break;
-
- case Transaction::OP_RMATTR:
- {
- coll_t cid = t.get_cid();
- pobject_t oid = t.get_oid();
- const char *attrname = t.get_attrname();
- if (_rmattr(oid, attrname) < 0) {
- dout(7) << "apply_transaction fail on _rmattr" << dendl;
- r &= bit;
- }
- }
- break;
-
- case Transaction::OP_CLONE:
- {
- coll_t cid = t.get_cid();
- pobject_t oid = t.get_oid();
- pobject_t noid = t.get_oid();
- if (_clone(cid, oid, noid) < 0) {
- dout(7) << "apply_transaction fail on _clone" << dendl;
- r &= bit;
- }
- }
- break;
-
- case Transaction::OP_CLONERANGE:
- {
- coll_t cid = t.get_cid();
- pobject_t oid = t.get_oid();
- pobject_t noid = t.get_oid();
- uint64_t off = t.get_length();
- uint64_t len = t.get_length();
- if (_clone_range(cid, oid, noid, off, len) < 0) {
- dout(7) << "apply_transaction fail on _clone_range" << dendl;
- r &= bit;
- }
- }
- break;
-
- case Transaction::OP_MKCOLL:
- {
- coll_t cid = t.get_cid();
- if (_create_collection(cid) < 0) {
- dout(7) << "apply_transaction fail on _create_collection" << dendl;
- r &= bit;
- }
- }
- break;
-
- case Transaction::OP_RMCOLL:
- {
- coll_t cid = t.get_cid();
- if (_destroy_collection(cid) < 0) {
- dout(7) << "apply_transaction fail on _destroy_collection" << dendl;
- r &= bit;
- }
- }
- break;
-
- case Transaction::OP_COLL_ADD:
- {
- coll_t cid = t.get_cid();
- pobject_t oid = t.get_oid();
- if (_collection_add(cid, oid) < 0) {
- //dout(7) << "apply_transaction fail on _collection_add" << dendl;
- //r &= bit;
- }
- }
- break;
-
- case Transaction::OP_COLL_REMOVE:
- {
- coll_t cid = t.get_cid();
- pobject_t oid = t.get_oid();
- if (_collection_remove(cid, oid) < 0) {
- dout(7) << "apply_transaction fail on _collection_remove" << dendl;
- r &= bit;
- }
- }
- break;
-
- case Transaction::OP_COLL_SETATTR:
- {
- coll_t cid = t.get_cid();
- const char *attrname = t.get_attrname();
- bufferlist& bl = t.get_bl();
- if (_collection_setattr(cid, attrname, bl.c_str(), bl.length()) < 0) {
- //if (_collection_setattr(cid, attrname, attrval.first, attrval.second) < 0) {
- dout(7) << "apply_transaction fail on _collection_setattr" << dendl;
- r &= bit;
- }
- }
- break;
-
- case Transaction::OP_COLL_RMATTR:
- {
- coll_t cid = t.get_cid();
- const char *attrname = t.get_attrname();
- if (_collection_rmattr(cid, attrname) < 0) {
- dout(7) << "apply_transaction fail on _collection_rmattr" << dendl;
- r &= bit;
- }
- }
- break;
-
- default:
- dout(0) << "bad op " << op << dendl;
- assert(0);
- }
-
- bit = bit << 1;
- }
-
- dout(7) << "_apply_transaction finish (r = " << r << ")" << dendl;
- return r;
-}
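
The dispatch loop above decodes one op at a time off the transaction and notes failures per op. Reduced to its shape, with hypothetical op and closure types, the idiom is a failure bitmask built up as the ops run (a mask like this is normally accumulated with |=; starting from r = 0, the &= written above can only ever leave it zero):

#include <cstdint>
#include <functional>
#include <vector>

enum class Op { Touch, Write, Remove };   // a real dispatcher switches on this

struct ToyOp {
  Op type;
  std::function<int()> run;   // returns < 0 on failure
};

// Apply ops in order; bit i of the result is set if op i failed.
uint32_t apply_ops(const std::vector<ToyOp>& ops) {
  uint32_t failed = 0;
  uint32_t bit = 1;
  for (const ToyOp& op : ops) {
    if (op.run() < 0)
      failed |= bit;          // remember which op failed
    bit <<= 1;
  }
  return failed;              // 0 == everything succeeded
}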
-
-int Ebofs::_touch(coll_t cid, pobject_t oid)
-{
- dout(7) << "_touch " << oid << dendl;
-
- // get|create inode
- Onode *on = get_onode(oid);
- if (!on) {
- on = new_onode(oid); // new inode!
- _collection_add(cid, oid);
- dirty_onode(on);
- }
- put_onode(on);
- return 0;
-}
-
-
-int Ebofs::_write(coll_t cid, pobject_t oid, uint64_t offset, size_t length, const bufferlist& bl)
-{
- dout(7) << "_write " << cid << " " << oid << " " << offset << "~" << length << dendl;
- assert(bl.length() == length);
-
- // get|create inode
- Onode *on = get_onode(oid);
- if (!on) {
- on = new_onode(oid); // new inode!
- _collection_add(cid, oid);
- }
-
- while (1) {
- // too much unflushed dirty data? (if so, block!)
- if (_write_will_block()) {
- dout(10) << "_write blocking "
- << oid << " " << offset << "~" << length
- << " bc: "
- << "size " << bc.get_size()
- << ", trimmable " << bc.get_trimmable()
- << ", max " << g_conf.ebofs_bc_size
- << "; dirty " << bc.get_stat_dirty()
- << ", tx " << bc.get_stat_tx()
- << ", max dirty " << g_conf.ebofs_bc_max_dirty
- << dendl;
-
- while (_write_will_block())
- bc.waitfor_stat(); // waits on ebofs_lock
-
- dout(10) << "_write unblocked "
- << oid << " " << offset << "~" << length
- << " bc: "
- << "size " << bc.get_size()
- << ", trimmable " << bc.get_trimmable()
- << ", max " << g_conf.ebofs_bc_size
- << "; dirty " << bc.get_stat_dirty()
- << ", tx " << bc.get_stat_tx()
- << ", max dirty " << g_conf.ebofs_bc_max_dirty
- << dendl;
- }
-
- // out of space?
- unsigned max = (length+offset) / EBOFS_BLOCK_SIZE + 10; // very conservative; assumes we have to rewrite
- max += dirty_onodes.size() + dirty_cnodes.size();
- if (max >= free_blocks) {
- dout(1) << "write failing, only " << free_blocks << " blocks free, may need up to " << max << dendl;
- return -ENOSPC;
- }
-
- if (on->readonly) {
- put_onode(on);
- return -EACCES;
- }
-
- // apply write to buffer cache
- if (length == 0) {
- dirty_onode(on);
- break;
- } else {
- int r = apply_write(on, offset, length, bl);
- if (r == 0)
- break; // yay!
- assert(r < 0);
- dout(1) << "write waiting for commit to finish" << dendl;
- sync_cond.Wait(ebofs_lock);
- if (on->deleted) {
- put_onode(on);
- return -ENOENT;
- }
- }
- }
-
- // done.
- put_onode(on);
- trim_bc();
-
- return length;
-}
-
-int Ebofs::_zero(coll_t cid, pobject_t oid, uint64_t offset, size_t length)
-{
- dout(7) << "_zero " << oid << " " << offset << "~" << length << dendl;
-
- // get|create inode
- Onode *on = get_onode(oid);
- if (!on) {
- on = new_onode(oid); // new inode!
- _collection_add(cid, oid);
- }
- if (on->readonly) {
- put_onode(on);
- return -EACCES;
- }
-
- if (length > 0 &&
- offset < on->object_size) {
- if (offset + (uint64_t)length >= on->object_size) {
- _truncate(cid, oid, offset);
- } else {
- while (1) {
- int r = apply_zero(on, offset, length);
- if (r == 0) break;
- assert(r < 0);
- dout(10) << "_zero waiting for commit to finish" << dendl;
- sync_cond.Wait(ebofs_lock);
- if (on->deleted) {
- put_onode(on);
- return -ENOENT;
- }
- }
- }
- }
-
- // done.
- put_onode(on);
- trim_bc();
-
- return length;
-}
-
-
-int Ebofs::write(coll_t cid, pobject_t oid,
- uint64_t off, size_t len,
- const bufferlist& bl, Context *onsafe)
-{
- ebofs_lock.Lock();
-
- // go
- int r = _write(cid, oid, off, len, bl);
-
- // commit waiter
- if (r > 0) {
- assert((size_t)r == len);
- if (journal) {
- Transaction t;
- t.write(cid, oid, off, len, bl);
- bufferlist tbl;
- t.encode(tbl);
- journal->submit_entry(++op_seq, tbl, onsafe);
- } else
- queue_commit_waiter(onsafe);
- } else {
- if (onsafe) delete onsafe;
- }
-
- ebofs_lock.Unlock();
- return r;
-}
-
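
write(), and every public mutator below it, ends with the same completion branch: when a journal is present, encode the op and hand the onsafe callback to journal->submit_entry(); otherwise queue it as a commit waiter for the next sync. A hedged sketch of that branch with hypothetical ToyJournal/ToyStore types:

#include <cstdint>
#include <functional>
#include <string>
#include <vector>

using Callback = std::function<void()>;

struct ToyJournal {
  // Pretend to persist 'entry'; call 'onsafe' once it is durable.
  void submit_entry(uint64_t seq, const std::string& entry, Callback onsafe) {
    (void)seq; (void)entry;
    if (onsafe) onsafe();     // a real journal fires this after the disk write
  }
};

struct ToyStore {
  ToyJournal* journal = nullptr;          // optional
  uint64_t op_seq = 0;
  std::vector<Callback> commit_waiters;   // flushed on the next full commit

  void complete_op(const std::string& encoded_op, Callback onsafe) {
    if (journal)
      journal->submit_entry(++op_seq, encoded_op, std::move(onsafe));
    else if (onsafe)
      commit_waiters.push_back(std::move(onsafe)); // wait for the periodic commit
  }
};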
-int Ebofs::zero(coll_t cid, pobject_t oid, uint64_t off, size_t len, Context *onsafe)
-{
- ebofs_lock.Lock();
-
- // go
- int r = _zero(cid, oid, off, len);
-
- // commit waiter
- if (r > 0) {
- assert((size_t)r == len);
- if (journal) {
- Transaction t;
- t.zero(cid, oid, off, len);
- bufferlist tbl;
- t.encode(tbl);
- journal->submit_entry(++op_seq, tbl, onsafe);
- } else
- queue_commit_waiter(onsafe);
- } else {
- if (onsafe) delete onsafe;
- }
-
- ebofs_lock.Unlock();
- return r;
-}
-
-
-int Ebofs::_remove(coll_t cid, pobject_t oid)
-{
- dout(7) << "_remove " << oid << dendl;
-
- // get inode
- Onode *on = get_onode(oid);
- if (!on) return -ENOENT;
-
- // ok remove it!
- remove_onode(on);
-
- return 0;
-}
-
-
-int Ebofs::remove(coll_t cid, pobject_t oid, Context *onsafe)
-{
- ebofs_lock.Lock();
-
- // do it
- int r = _remove(cid, oid);
-
- // journal, wait for commit
- if (r >= 0) {
- if (journal) {
- Transaction t;
- t.remove(cid, oid);
- bufferlist bl;
- t.encode(bl);
- journal->submit_entry(++op_seq, bl, onsafe);
- } else
- queue_commit_waiter(onsafe);
- } else {
- if (onsafe) delete onsafe;
- }
-
- ebofs_lock.Unlock();
- return r;
-}
-
-int Ebofs::_truncate(coll_t cid, pobject_t oid, uint64_t size)
-{
- dout(7) << "_truncate " << oid << " size " << size << dendl;
-
- Onode *on = get_onode(oid);
- if (!on)
- return -ENOENT;
- if (on->readonly) {
- put_onode(on);
- return -EACCES;
- }
-
- int r = 0;
- if (size > on->object_size) {
- r = -EINVAL; // whatever
- }
- else if (size < on->object_size) {
- // change size
- on->object_size = size;
- dirty_onode(on);
-
- // free blocks
- block_t nblocks = 0;
- if (size) nblocks = 1 + (size-1) / EBOFS_BLOCK_SIZE;
- if (on->last_block > nblocks) {
- vector<extent_t> extra;
- on->truncate_extents(nblocks, extra);
- for (unsigned i=0; i<extra.size(); i++)
- if (extra[i].start)
- allocator.release(extra[i]);
- }
-
- // truncate buffer cache
- if (on->oc) {
- on->oc->truncate(on->last_block, super_epoch);
- if (on->oc->is_empty())
- on->close_oc();
- }
-
- // update uncommitted
- interval_set<block_t> uncom;
- if (nblocks > 0) {
- interval_set<block_t> left;
- left.insert(0, nblocks);
- uncom.intersection_of(left, on->uncommitted);
- }
- dout(10) << "uncommitted was " << on->uncommitted << " now " << uncom << dendl;
- on->uncommitted = uncom;
-
- }
- else {
- assert(size == on->object_size);
- }
-
- put_onode(on);
- return r;
-}
-
-
-int Ebofs::truncate(coll_t cid, pobject_t oid, uint64_t size, Context *onsafe)
-{
- ebofs_lock.Lock();
-
- int r = _truncate(cid, oid, size);
-
- // journal, wait for commit
- if (r >= 0) {
- if (journal) {
- Transaction t;
- t.truncate(cid, oid, size);
- bufferlist bl;
- t.encode(bl);
- journal->submit_entry(++op_seq, bl, onsafe);
- } else
- queue_commit_waiter(onsafe);
- } else {
- if (onsafe) delete onsafe;
- }
-
- ebofs_lock.Unlock();
- return r;
-}
-
-
-
-int Ebofs::clone(coll_t cid, pobject_t from, pobject_t to, Context *onsafe)
-{
- ebofs_lock.Lock();
-
- int r = _clone(cid, from, to);
-
- // journal, wait for commit
- if (r >= 0) {
- if (journal) {
- Transaction t;
- t.clone(cid, from, to);
- bufferlist bl;
- t.encode(bl);
- journal->submit_entry(++op_seq, bl, onsafe);
- } else
- queue_commit_waiter(onsafe);
- } else {
- if (onsafe) delete onsafe;
- }
-
- ebofs_lock.Unlock();
- return r;
-}
-
-int Ebofs::_clone(coll_t cid, pobject_t from, pobject_t to)
-{
- dout(7) << "_clone " << from << " -> " << to << dendl;
-
- assert(g_conf.ebofs_cloneable);
- if (!g_conf.ebofs_cloneable)
- return -1; // no!
-
- Onode *fon = get_onode(from);
- if (!fon) return -ENOENT;
- Onode *ton = get_onode(to);
- if (ton) {
- put_onode(fon);
- put_onode(ton);
- return -EEXIST;
- }
- ton = new_onode(to);
- assert(ton);
- _collection_add(cid, to);
-
- // copy easy bits
- ton->readonly = true;
- ton->object_size = fon->object_size;
- ton->alloc_blocks = fon->alloc_blocks;
- ton->last_block = fon->last_block;
- ton->attr = fon->attr;
-
- // collections
- for (set<coll_t>::iterator p = fon->collections.begin();
- p != fon->collections.end();
- p++)
- _collection_add(*p, to);
-
- // extents
- ton->extent_map = fon->extent_map;
- for (map<block_t, ExtentCsum>::iterator p = ton->extent_map.begin();
- p != ton->extent_map.end();
- ++p)
- if (p->second.ex.start)
- allocator.alloc_inc(p->second.ex);
-
- // clear uncommitted
- fon->uncommitted.clear();
-
- // muck with ObjectCache
- if (fon->oc)
- fon->oc->clone_to( ton );
-
- // ok!
- put_onode(ton);
- put_onode(fon);
- return 0;
-}
-
-
-int Ebofs::_clone_range(coll_t cid, pobject_t from, pobject_t to, uint64_t off, uint64_t len)
-{
- dout(7) << "_clone_range " << from << " -> " << to << " " << off << "~" << len << dendl;
-
- // bah.
- bufferlist bl;
- int r = _read(from, off, len, bl);
- if (r < 0)
- return r;
- r = _write(cid, to, off, len, bl);
- return r;
-}
-
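
_clone_range takes the simplest possible route: read the source range, write it to the destination, and propagate the first error. The same shape against a plain in-memory map, with hypothetical names:

#include <map>
#include <string>

using ToyObjects = std::map<std::string, std::string>;

// Returns bytes copied, or -1 if the source object does not exist.
long clone_range(ToyObjects& store, const std::string& from, const std::string& to,
                 size_t off, size_t len) {
  auto it = store.find(from);
  if (it == store.end())
    return -1;                                   // -ENOENT equivalent
  if (off >= it->second.size())
    return 0;                                    // nothing to copy past EOF
  std::string buf = it->second.substr(off, len); // "read"
  std::string& dst = store[to];
  if (dst.size() < off + buf.size())
    dst.resize(off + buf.size(), '\0');          // hole-fill with zeros
  dst.replace(off, buf.size(), buf);             // "write"
  return static_cast<long>(buf.size());
}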
-
-/*
- * pick object revision with rev < specified rev.
- * (oid.rev is a noninclusive upper bound.)
- *
- */
-int Ebofs::pick_object_revision_lt(coll_t cid, pobject_t& oid)
-{
- assert(oid.oid.snap > 0); // this is only useful for non-zero oid.rev
-
- int r = -EEXIST; // return code
- ebofs_lock.Lock();
- {
- pobject_t orig = oid;
- pobject_t live = oid;
- live.oid.snap = 0;
-
- if (object_tab->get_num_keys() > 0) {
- Table<pobject_t, ebofs_inode_ptr>::Cursor cursor(object_tab);
-
- object_tab->find(oid, cursor); // this will be just _past_ highest eligible rev
- if (cursor.move_left() > 0) {
- bool firstpass = true;
- while (1) {
- pobject_t t = cursor.current().key;
- if (t.oid.ino != oid.oid.ino ||
- t.oid.bno != oid.oid.bno) // passed to previous object
- break;
- if (oid.oid.snap < t.oid.snap) { // rev < desired. possible match.
- r = 0;
- oid = t;
- break;
- }
- if (firstpass && oid.oid.snap >= t.oid.snap) { // there is no old rev < desired. try live.
- r = 0;
- oid = live;
- break;
- }
- if (cursor.move_left() <= 0) break;
- firstpass = false;
- }
- }
- }
-
- dout(8) << "find_object_revision " << orig << " -> " << oid
- << " r=" << r << dendl;
- }
- ebofs_lock.Unlock();
- return r;
-}
-
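
pick_object_revision_lt is the usual "greatest key strictly below an upper bound" walk over an ordered table: position a cursor at the bound, step left once, and check that the neighbour still belongs to the same object. The same idiom over std::map, with illustrative key types:

#include <cstdint>
#include <map>
#include <optional>
#include <string>
#include <utility>

// Key: (object id, revision). Find the entry for 'id' with the largest
// revision strictly less than 'rev_upper_bound', if any.
std::optional<uint64_t>
pick_revision_lt(const std::map<std::pair<std::string, uint64_t>, int>& tab,
                 const std::string& id, uint64_t rev_upper_bound) {
  auto it = tab.lower_bound({id, rev_upper_bound}); // first key >= bound
  if (it == tab.begin())
    return std::nullopt;                            // nothing below the bound
  --it;                                             // step to the previous key
  if (it->first.first != id)
    return std::nullopt;                            // walked into another object
  return it->first.second;                          // rev < rev_upper_bound
}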
-
-
-
-bool Ebofs::exists(coll_t cid, pobject_t oid)
-{
- ebofs_lock.Lock();
- dout(8) << "exists " << oid << dendl;
- bool e = (object_tab->lookup(oid) == 0);
- ebofs_lock.Unlock();
- return e;
-}
-
-int Ebofs::stat(coll_t cid, pobject_t oid, struct stat *st)
-{
- ebofs_lock.Lock();
- int r = _stat(oid,st);
- ebofs_lock.Unlock();
- return r;
-}
-
-int Ebofs::_stat(pobject_t oid, struct stat *st)
-{
- dout(7) << "_stat " << oid << dendl;
-
- Onode *on = get_onode(oid);
- if (!on) return -ENOENT;
-
- // ??
- st->st_size = on->object_size;
-
- put_onode(on);
- return 0;
-}
-
-
-int Ebofs::_setattr(pobject_t oid, const char *name, const void *value, size_t size)
-{
- dout(8) << "setattr " << oid << " '" << name << "' len " << size << dendl;
-
- Onode *on = get_onode(oid);
- if (!on) return -ENOENT;
-
- string n(name);
- on->attr[n] = buffer::copy((char*)value, size);
- dirty_onode(on);
- put_onode(on);
-
- dout(8) << "setattr " << oid << " '" << name << "' len " << size << " success" << dendl;
-
- return 0;
-}
-
-int Ebofs::setattr(coll_t cid, pobject_t oid, const char *name, const void *value, size_t size, Context *onsafe)
-{
- ebofs_lock.Lock();
- int r = _setattr(oid, name, value, size);
-
- // journal, wait for commit
- if (r >= 0) {
- if (journal) {
- Transaction t;
- t.setattr(cid, oid, name, value, size);
- bufferlist bl;
- t.encode(bl);
- journal->submit_entry(++op_seq, bl, onsafe);
- } else
- queue_commit_waiter(onsafe);
- } else {
- if (onsafe) delete onsafe;
- }
-
- ebofs_lock.Unlock();
- return r;
-}
-
-int Ebofs::_setattrs(pobject_t oid, map<string,bufferptr>& attrset)
-{
- dout(8) << "setattrs " << oid << dendl;
-
- Onode *on = get_onode(oid);
- if (!on) return -ENOENT;
-
- on->attr = attrset;
- dirty_onode(on);
- put_onode(on);
- return 0;
-}
-
-int Ebofs::setattrs(coll_t cid, pobject_t oid, map<string,bufferptr>& attrset, Context *onsafe)
-{
- ebofs_lock.Lock();
- int r = _setattrs(oid, attrset);
-
- // journal, wait for commit
- if (r >= 0) {
- if (journal) {
- Transaction t;
- t.setattrs(cid, oid, attrset);
- bufferlist bl;
- t.encode(bl);
- journal->submit_entry(++op_seq, bl, onsafe);
- } else
- queue_commit_waiter(onsafe);
- } else {
- if (onsafe) delete onsafe;
- }
-
- ebofs_lock.Unlock();
- return r;
-}
-
-
-int Ebofs::get_object_collections(coll_t cid, pobject_t oid, set<coll_t>& ls)
-{
- ebofs_lock.Lock();
- int r = _get_object_collections(oid, ls);
- ebofs_lock.Unlock();
- return r;
-}
-
-int Ebofs::_get_object_collections(pobject_t oid, set<coll_t>& ls)
-{
- dout(8) << "_get_object_collections " << oid << dendl;
-
- Onode *on = get_onode(oid);
- if (!on) return -ENOENT;
- ls = on->collections;
- put_onode(on);
- return 0;
-}
-
-int Ebofs::getattr(coll_t cid, pobject_t oid, const char *name, void *value, size_t size)
-{
- ebofs_lock.Lock();
- int r = _getattr(oid, name, value, size);
- ebofs_lock.Unlock();
- return r;
-}
-int Ebofs::getattr(coll_t cid, pobject_t oid, const char *name, bufferptr& bp)
-{
- ebofs_lock.Lock();
- int r = _getattr(oid, name, bp);
- ebofs_lock.Unlock();
- return r;
-}
-
-int Ebofs::_getattr(pobject_t oid, const char *name, void *value, size_t size)
-{
- dout(8) << "_getattr " << oid << " '" << name << "' maxlen " << size << dendl;
-
- Onode *on = get_onode(oid);
- if (!on) return -ENOENT;
-
- string n(name);
- int r = 0;
- if (on->attr.count(n) == 0) {
- dout(10) << "_getattr " << oid << " '" << name << "' dne" << dendl;
- r = -ENODATA;
- } else {
- r = MIN( on->attr[n].length(), size );
- dout(10) << "_getattr " << oid << " '" << name << "' got len " << r << dendl;
- memcpy(value, on->attr[n].c_str(), r );
- }
- put_onode(on);
- return r;
-}
-int Ebofs::_getattr(pobject_t oid, const char *name, bufferptr& bp)
-{
- dout(8) << "_getattr " << oid << " '" << name << "'" << dendl;
-
- Onode *on = get_onode(oid);
- if (!on) return -ENOENT;
-
- string n(name);
- int r = 0;
- if (on->attr.count(n) == 0) {
- dout(10) << "_getattr " << oid << " '" << name << "' dne" << dendl;
- r = -ENODATA;
- } else {
- bp = on->attr[n];
- r = bp.length();
- dout(10) << "_getattr " << oid << " '" << name << "' got len " << r << dendl;
- }
- put_onode(on);
- return r;
-}
-
-int Ebofs::getattrs(coll_t cid, pobject_t oid, map<string,bufferptr> &aset)
-{
- ebofs_lock.Lock();
- int r = _getattrs(oid, aset);
- ebofs_lock.Unlock();
- return r;
-}
-
-int Ebofs::_getattrs(pobject_t oid, map<string,bufferptr> &aset)
-{
- dout(8) << "_getattrs " << oid << dendl;
-
- Onode *on = get_onode(oid);
- if (!on) return -ENOENT;
- aset = on->attr;
- put_onode(on);
- return 0;
-}
-
-
-
-int Ebofs::_rmattr(pobject_t oid, const char *name)
-{
- dout(8) << "_rmattr " << oid << " '" << name << "'" << dendl;
-
- Onode *on = get_onode(oid);
- if (!on) return -ENOENT;
-
- string n(name);
- on->attr.erase(n);
- dirty_onode(on);
- put_onode(on);
- return 0;
-}
-
-int Ebofs::rmattr(coll_t cid, pobject_t oid, const char *name, Context *onsafe)
-{
- ebofs_lock.Lock();
-
- int r = _rmattr(oid, name);
-
- // journal, wait for commit
- if (r >= 0) {
- if (journal) {
- Transaction t;
- t.rmattr(cid, oid, name);
- bufferlist bl;
- t.encode(bl);
- journal->submit_entry(++op_seq, bl, onsafe);
- } else
- queue_commit_waiter(onsafe);
- } else {
- if (onsafe) delete onsafe;
- }
-
- ebofs_lock.Unlock();
- return r;
-}
-
-int Ebofs::listattr(coll_t cid, pobject_t oid, vector<string>& attrs)
-{
- ebofs_lock.Lock();
- dout(8) << "listattr " << oid << dendl;
-
- Onode *on = get_onode(oid);
- if (!on) {
- ebofs_lock.Unlock();
- return -ENOENT;
- }
-
- attrs.clear();
- for (map<string,bufferptr>::iterator i = on->attr.begin();
- i != on->attr.end();
- i++) {
- attrs.push_back(i->first);
- }
-
- put_onode(on);
- ebofs_lock.Unlock();
- return 0;
-}
-
-
-/***************** collections ******************/
-
-int Ebofs::list_collections(vector<coll_t>& ls)
-{
- ebofs_lock.Lock();
- dout(9) << "list_collections " << dendl;
-
- Table<coll_t, ebofs_inode_ptr>::Cursor cursor(collection_tab);
-
- int num = 0;
- if (collection_tab->find(0, cursor) >= 0) {
- while (1) {
- ls.push_back(cursor.current().key);
- num++;
- if (cursor.move_right() <= 0) break;
- }
- }
-
- ebofs_lock.Unlock();
- return num;
-}
-
-int Ebofs::_create_collection(coll_t cid)
-{
- dout(9) << "_create_collection " << hex << cid << dec << dendl;
-
- if (_collection_exists(cid))
- return -EEXIST;
-
- Cnode *cn = new_cnode(cid);
- put_cnode(cn);
-
- return 0;
-}
-
-int Ebofs::create_collection(coll_t cid, Context *onsafe)
-{
- ebofs_lock.Lock();
-
- int r = _create_collection(cid);
-
- // journal, wait for commit
- if (r >= 0) {
- if (journal) {
- Transaction t;
- t.create_collection(cid);
- bufferlist bl;
- t.encode(bl);
- journal->submit_entry(++op_seq, bl, onsafe);
- } else
- queue_commit_waiter(onsafe);
- } else {
- if (onsafe) delete onsafe;
- }
-
- ebofs_lock.Unlock();
- return r;
-}
-
-int Ebofs::_destroy_collection(coll_t cid)
-{
- dout(9) << "_destroy_collection " << hex << cid << dec << dendl;
-
- if (!_collection_exists(cid))
- return -ENOENT;
-
- Cnode *cn = get_cnode(cid);
- assert(cn);
-
- // hose mappings
- vector<pobject_t> objects;
- _collection_list(cid, objects);
- for (vector<pobject_t>::iterator i = objects.begin();
- i != objects.end();
- i++) {
- co_tab->remove(coll_pobject_t(cid,*i));
-
- Onode *on = get_onode(*i);
- if (on) {
- on->collections.erase(cid);
- dirty_onode(on);
- put_onode(on);
- }
- }
-
- remove_cnode(cn);
- return 0;
-}
-
-int Ebofs::destroy_collection(coll_t cid, Context *onsafe)
-{
- ebofs_lock.Lock();
-
- int r = _destroy_collection(cid);
-
- // journal, wait for commit
- if (r >= 0) {
- if (journal) {
- Transaction t;
- t.remove_collection(cid);
- bufferlist bl;
- t.encode(bl);
- journal->submit_entry(++op_seq, bl, onsafe);
- } else
- queue_commit_waiter(onsafe);
- } else {
- if (onsafe) delete onsafe;
- }
-
- ebofs_lock.Unlock();
- return r;
-}
-
-bool Ebofs::collection_exists(coll_t cid)
-{
- ebofs_lock.Lock();
- dout(10) << "collection_exists " << hex << cid << dec << dendl;
- bool r = _collection_exists(cid);
- dout(10) << "collection_exists " << hex << cid << dec << " = " << r << dendl;
- ebofs_lock.Unlock();
- return r;
-}
-bool Ebofs::_collection_exists(coll_t cid)
-{
- return (collection_tab->lookup(cid) == 0);
-}
-
-int Ebofs::_collection_add(coll_t cid, pobject_t oid)
-{
- dout(9) << "_collection_add " << hex << cid << " object " << oid << dec << dendl;
-
- if (!_collection_exists(cid))
- return -ENOENT;
-
- Onode *on = get_onode(oid);
- if (!on) return -ENOENT;
-
- int r = 0;
-
- if (on->collections.count(cid) == 0) {
- on->collections.insert(cid);
- dirty_onode(on);
- co_tab->insert(coll_pobject_t(cid,oid), true);
- } else {
- r = -ENOENT; // FIXME? already in collection.
- }
-
- put_onode(on);
- return r;
-}
-
-int Ebofs::collection_add(coll_t cid, coll_t ocid, pobject_t oid, Context *onsafe)
-{
- ebofs_lock.Lock();
-
- int r = _collection_add(cid, oid);
-
- // journal, wait for commit
- if (r >= 0) {
- if (journal) {
- Transaction t;
- t.collection_add(cid, ocid, oid);
- bufferlist bl;
- t.encode(bl);
- journal->submit_entry(++op_seq, bl, onsafe);
- } else
- queue_commit_waiter(onsafe);
- } else {
- if (onsafe) delete onsafe;
- }
-
- ebofs_lock.Unlock();
- return 0;
-}
-
-int Ebofs::_collection_remove(coll_t cid, pobject_t oid)
-{
- dout(9) << "_collection_remove " << hex << cid << " object " << oid << dec << dendl;
-
- if (!_collection_exists(cid))
- return -ENOENT;
-
- Onode *on = get_onode(oid);
- if (!on) return -ENOENT;
-
- int r = 0;
-
- if (on->collections.count(cid)) {
- on->collections.erase(cid);
- dirty_onode(on);
- co_tab->remove(coll_pobject_t(cid,oid));
- } else {
- r = -ENOENT; // FIXME?
- }
-
- put_onode(on);
- return r;
-}
-
-int Ebofs::collection_remove(coll_t cid, pobject_t oid, Context *onsafe)
-{
- ebofs_lock.Lock();
-
- int r = _collection_remove(cid, oid);
-
- // journal, wait for commit
- if (r >= 0) {
- if (journal) {
- Transaction t;
- t.collection_remove(cid, oid);
- bufferlist bl;
- t.encode(bl);
- journal->submit_entry(++op_seq, bl, onsafe);
- } else
- queue_commit_waiter(onsafe);
- } else {
- if (onsafe) delete onsafe;
- }
-
- ebofs_lock.Unlock();
- return 0;
-}
-
-
-bool Ebofs::collection_empty(coll_t cid)
-{
- ebofs_lock.Lock();
-
- dout(9) << "collection_empty " << hex << cid << dec << dendl;
-
- if (!_collection_exists(cid)) {
- ebofs_lock.Unlock();
- return -ENOENT;
- }
-
- Table<coll_pobject_t, bool>::Cursor cursor(co_tab);
-
- bool empty = true;
- if (co_tab->find(coll_pobject_t(cid,pobject_t()), cursor) >= 0) {
- while (1) {
- const coll_t c = cursor.current().key.first;
- const pobject_t o = cursor.current().key.second;
- if (c != cid) break; // end!
- empty = false;
- break;
- }
- }
-
- ebofs_lock.Unlock();
- return empty;
-}
-
-
-
-int Ebofs::collection_list(coll_t cid, vector<pobject_t>& ls)
-{
- ebofs_lock.Lock();
- int num = _collection_list(cid, ls);
- ebofs_lock.Unlock();
- return num;
-}
-
-int Ebofs::_collection_list(coll_t cid, vector<pobject_t>& ls)
-{
- dout(9) << "collection_list " << hex << cid << dec << dendl;
-
- if (!_collection_exists(cid)) {
- ebofs_lock.Unlock();
- return -ENOENT;
- }
-
- Table<coll_pobject_t, bool>::Cursor cursor(co_tab);
-
- int num = 0;
- if (co_tab->find(coll_pobject_t(cid,pobject_t()), cursor) >= 0) {
- while (1) {
- const coll_t c = cursor.current().key.first;
- const pobject_t o = cursor.current().key.second;
- if (c != cid) break; // end!
- dout(10) << "collection_list " << hex << cid << " includes " << o << dec << dendl;
- ls.push_back(o);
- num++;
- if (cursor.move_right() < 0) break;
- }
- }
-
- return num;
-}
-
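
Collection membership is indexed twice: each onode carries its set of collections, and co_tab keys (collection, object) pairs so that _collection_list can do a prefix scan from (cid, first object) until the collection id changes. The prefix-scan idiom with std::map:

#include <map>
#include <string>
#include <utility>
#include <vector>

using CollObjKey = std::pair<std::string, std::string>; // (collection, object)

// List every object in 'cid' by scanning keys that share the collection prefix.
std::vector<std::string>
collection_list(const std::map<CollObjKey, bool>& co_tab, const std::string& cid) {
  std::vector<std::string> out;
  for (auto it = co_tab.lower_bound({cid, std::string()});   // first key in 'cid'
       it != co_tab.end() && it->first.first == cid;         // stop at next collection
       ++it)
    out.push_back(it->first.second);
  return out;
}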
-
-int Ebofs::_collection_setattr(coll_t cid, const char *name, const void *value, size_t size)
-{
- dout(10) << "_collection_setattr " << hex << cid << dec << " '" << name << "' len " << size << dendl;
-
- Cnode *cn = get_cnode(cid);
- if (!cn) return -ENOENT;
-
- string n(name);
- cn->attr[n] = buffer::copy((char*)value, size);
- dirty_cnode(cn);
- put_cnode(cn);
-
- return 0;
-}
-
-int Ebofs::collection_setattr(coll_t cid, const char *name, const void *value, size_t size, Context *onsafe)
-{
- ebofs_lock.Lock();
- dout(10) << "collection_setattr " << hex << cid << dec << " '" << name << "' len " << size << dendl;
-
- int r = _collection_setattr(cid, name, value, size);
-
- // journal, wait for commit
- if (r >= 0) {
- if (journal) {
- Transaction t;
- t.collection_setattr(cid, name, value, size);
- bufferlist bl;
- t.encode(bl);
- journal->submit_entry(++op_seq, bl, onsafe);
- } else
- queue_commit_waiter(onsafe);
- } else {
- if (onsafe) delete onsafe;
- }
-
- ebofs_lock.Unlock();
- return 0;
-}
-
-int Ebofs::collection_getattr(coll_t cid, const char *name, void *value, size_t size)
-{
- ebofs_lock.Lock();
- dout(10) << "collection_setattr " << hex << cid << dec << " '" << name << "' maxlen " << size << dendl;
-
- Cnode *cn = get_cnode(cid);
- if (!cn) {
- ebofs_lock.Unlock();
- return -ENOENT;
- }
-
- string n(name);
- int r;
- if (cn->attr.count(n) == 0) {
- r = -1;
- } else {
- r = MIN( cn->attr[n].length(), size );
- memcpy(value, cn->attr[n].c_str(), r);
- }
-
- put_cnode(cn);
- ebofs_lock.Unlock();
- return r;
-}
-
-int Ebofs::collection_getattr(coll_t cid, const char *name, bufferlist& bl)
-{
- ebofs_lock.Lock();
- dout(10) << "collection_setattr " << hex << cid << dec << " '" << name << dendl;
-
- Cnode *cn = get_cnode(cid);
- if (!cn) {
- ebofs_lock.Unlock();
- return -ENOENT;
- }
-
- string n(name);
- int r;
- if (cn->attr.count(n) == 0) {
- r = -1;
- } else {
- bl.push_back(cn->attr[n]);
- r = bl.length();
- }
-
- put_cnode(cn);
- ebofs_lock.Unlock();
- return r;
-}
-
-int Ebofs::collection_getattrs(coll_t cid, map<string,bufferptr> &aset)
-{
- ebofs_lock.Lock();
- int r = _collection_getattrs(cid, aset);
- ebofs_lock.Unlock();
- return r;
-}
-
-int Ebofs::_collection_getattrs(coll_t cid, map<string,bufferptr> &aset)
-{
- dout(8) << "_collection_getattrs " << cid << dendl;
-
- Cnode *cn = get_cnode(cid);
- if (!cn) return -ENOENT;
- aset = cn->attr;
- put_cnode(cn);
- return 0;
-}
-
-int Ebofs::collection_setattrs(coll_t cid, map<string,bufferptr> &aset)
-{
- ebofs_lock.Lock();
- int r = _collection_setattrs(cid, aset);
- ebofs_lock.Unlock();
- return r;
-}
-
-int Ebofs::_collection_setattrs(coll_t cid, map<string,bufferptr> &aset)
-{
- dout(8) << "_collection_setattrs " << cid << dendl;
-
- Cnode *cn = get_cnode(cid);
- if (!cn) return -ENOENT;
- cn->attr = aset;
- dirty_cnode(cn);
- put_cnode(cn);
- return 0;
-}
-
-
-int Ebofs::_collection_rmattr(coll_t cid, const char *name)
-{
- dout(10) << "_collection_rmattr " << hex << cid << dec << " '" << name << "'" << dendl;
-
- Cnode *cn = get_cnode(cid);
- if (!cn) return -ENOENT;
-
- string n(name);
- cn->attr.erase(n);
-
- dirty_cnode(cn);
- put_cnode(cn);
-
- return 0;
-}
-
-int Ebofs::collection_rmattr(coll_t cid, const char *name, Context *onsafe)
-{
- ebofs_lock.Lock();
-
- int r = _collection_rmattr(cid, name);
-
- // journal, wait for commit
- if (r >= 0) {
- if (journal) {
- Transaction t;
- t.collection_rmattr(cid, name);
- bufferlist bl;
- t.encode(bl);
- journal->submit_entry(++op_seq, bl, onsafe);
- } else
- queue_commit_waiter(onsafe);
- } else {
- if (onsafe) delete onsafe;
- }
-
- ebofs_lock.Unlock();
- return 0;
-}
-
-int Ebofs::collection_listattr(coll_t cid, vector<string>& attrs)
-{
- ebofs_lock.Lock();
- dout(10) << "collection_listattr " << hex << cid << dec << dendl;
-
- Cnode *cn = get_cnode(cid);
- if (!cn) {
- ebofs_lock.Unlock();
- return -ENOENT;
- }
-
- attrs.clear();
- for (map<string,bufferptr>::iterator i = cn->attr.begin();
- i != cn->attr.end();
- i++) {
- attrs.push_back(i->first);
- }
-
- put_cnode(cn);
- ebofs_lock.Unlock();
- return 0;
-}
-
-
-
-void Ebofs::_export_freelist(bufferlist& bl)
-{
- for (int b=0; b<=EBOFS_NUM_FREE_BUCKETS; b++) {
- Table<block_t,block_t> *tab;
- if (b < EBOFS_NUM_FREE_BUCKETS) {
- tab = free_tab[b];
- } else {
- tab = limbo_tab;
- }
-
- if (tab->get_num_keys() > 0) {
- Table<block_t,block_t>::Cursor cursor(tab);
- assert(tab->find(0, cursor) >= 0);
- while (1) {
- assert(cursor.current().value > 0);
-
- extent_t ex = {cursor.current().key, cursor.current().value};
- dout(10) << "_export_freelist " << ex << dendl;
- bl.append((char*)&ex, sizeof(ex));
- if (cursor.move_right() <= 0) break;
- }
- }
- }
-}
-
-void Ebofs::_import_freelist(bufferlist& bl)
-{
- // clear
- for (int b=0; b<EBOFS_NUM_FREE_BUCKETS; b++)
- free_tab[b]->clear();
- limbo_tab->clear();
-
- // import!
- int num = bl.length() / sizeof(extent_t);
- extent_t *p = (extent_t*)bl.c_str();
- for (int i=0; i<num; i++) {
- dout(10) << "_import_freelist " << p[i] << dendl;
- allocator._release_loner(p[i]);
- }
-}
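
_export_freelist / _import_freelist round-trip the free-space map through a flat buffer: each free extent is appended as a raw (start, length) record on export and re-released to the allocator on import. A minimal version using std::vector<uint8_t> as the buffer; the Extent struct here is illustrative, not the on-disk ebofs format:

#include <cstdint>
#include <cstring>
#include <vector>

struct Extent { uint64_t start, length; };   // illustrative, not the on-disk layout

// Append each free extent to 'buf' as a raw record.
void export_freelist(const std::vector<Extent>& free_extents, std::vector<uint8_t>& buf) {
  for (const Extent& ex : free_extents) {
    const uint8_t* p = reinterpret_cast<const uint8_t*>(&ex);
    buf.insert(buf.end(), p, p + sizeof(Extent));
  }
}

// Rebuild the extent list from a buffer produced by export_freelist().
std::vector<Extent> import_freelist(const std::vector<uint8_t>& buf) {
  std::vector<Extent> out(buf.size() / sizeof(Extent));
  if (!out.empty())
    std::memcpy(out.data(), buf.data(), out.size() * sizeof(Extent));
  return out;
}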
-
-void Ebofs::_get_frag_stat(FragmentationStat& st)
-{
- ebofs_lock.Lock();
-
- // free list is easy
- st.total = dev.get_num_blocks();
- st.total_free = get_free_blocks() + get_limbo_blocks();
- st.free_extent_dist.clear();
- st.num_free_extent = 0;
- st.avg_free_extent = 0;
-/*
- uint64_t tfree = 0;
- for (int b=0; b<=EBOFS_NUM_FREE_BUCKETS; b++) {
- Table<block_t,block_t> *tab;
- if (b < EBOFS_NUM_FREE_BUCKETS) {
- tab = free_tab[b];
- dout(30) << "dump bucket " << b << " " << tab->get_num_keys() << dendl;
- } else {
- tab = limbo_tab;
- dout(30) << "dump limbo " << tab->get_num_keys() << dendl;;
- }
-
- if (tab->get_num_keys() > 0) {
- Table<block_t,block_t>::Cursor cursor(tab);
- assert(tab->find(0, cursor) >= 0);
- while (1) {
- assert(cursor.current().value > 0);
-
- block_t l = cursor.current().value;
- tfree += l;
- int b = 0;
- do {
- l = l >> 1;
- b++;
- } while (l);
- st.free_extent_dist[b]++;
- st.free_extent_dist_sum[b] += cursor.current().value;
- st.num_free_extent++;
-
- if (cursor.move_right() <= 0) break;
- }
- }
- }
- st.avg_free_extent = tfree / st.num_free_extent;
-*/
-
- // used extents is harder. :(
- st.num_extent = 0;
- st.avg_extent = 0;
- st.extent_dist.clear();
- st.extent_dist_sum.clear();
- st.avg_extent_per_object = 0;
- st.avg_extent_jump = 0;
-
- Table<pobject_t,ebofs_inode_ptr>::Cursor cursor(object_tab);
- object_tab->find(pobject_t(), cursor);
- int nobj = 0;
- int njump = 0;
- while (object_tab->get_num_keys() > 0) {
- Onode *on = get_onode(cursor.current().key);
- assert(on);
-
- nobj++;
- st.avg_extent_per_object += on->extent_map.size();
-
- for (map<block_t,ExtentCsum>::iterator p = on->extent_map.begin();
- p != on->extent_map.end();
- p++) {
- if (p->second.ex.start == 0) continue; // ignore holes
- block_t l = p->second.ex.length;
-
- st.num_extent++;
- st.avg_extent += l;
- if (p->first > 0) {
- njump++;
- st.avg_extent_jump += l;
- }
-
- int b = 0;
- do {
- l = l >> 1;
- b++;
- } while (l);
- st.extent_dist[b]++;
- st.extent_dist_sum[b] += p->second.ex.length;
- }
- put_onode(on);
- if (cursor.move_right() <= 0) break;
- }
- if (njump) st.avg_extent_jump /= njump;
- if (nobj) st.avg_extent_per_object /= (float)nobj;
- if (st.num_extent) st.avg_extent /= st.num_extent;
-
- ebofs_lock.Unlock();
-}
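
_get_frag_stat buckets extent lengths by shifting right until the value reaches zero, i.e. bucket = floor(log2(length)) + 1, and keeps a histogram of extent counts and total blocks per bucket. The bucketing on its own, with illustrative names:

#include <cstdint>
#include <map>

// Power-of-two size bucket: 1 for length 1, 2 for 2-3, 3 for 4-7, and so on.
int size_bucket(uint64_t length) {
  int b = 0;
  do {
    length >>= 1;
    ++b;
  } while (length);
  return b;
}

struct FragHistogram {
  std::map<int, uint64_t> count;   // extents per bucket
  std::map<int, uint64_t> total;   // blocks per bucket

  void add(uint64_t extent_length) {
    int b = size_bucket(extent_length);
    count[b]++;
    total[b] += extent_length;
  }
};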
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-
-#include "include/Context.h"
-#include "include/buffer.h"
-#include "include/hash.h"
-
-#include "types.h"
-#include "Onode.h"
-#include "Cnode.h"
-#include "BlockDevice.h"
-#include "nodes.h"
-#include "Allocator.h"
-#include "Table.h"
-
-#include "os/Journal.h"
-#include "os/ObjectStore.h"
-
-#include "common/Mutex.h"
-#include "common/Cond.h"
-#include "common/Finisher.h"
-
-//typedef pair<object_t,coll_t> object_coll_t;
-typedef pair<coll_t,pobject_t> coll_pobject_t;
-
-
-class Ebofs : public ObjectStore {
-protected:
- Mutex ebofs_lock; // a beautiful global lock
-
- // ** debuggy **
- bool fake_writes;
-
- // ** super **
-public:
- BlockDevice dev;
-protected:
- bool mounted, unmounting, dirty;
- bool readonly;
- version_t super_epoch;
- uint64_t op_seq;
- bool commit_starting;
- bool commit_thread_started;
- Cond commit_cond; // to wake up the commit thread
- Cond sync_cond;
- uint64_t super_fsid;
-
- map<version_t, vector<Context*> > commit_waiters;
-
- void prepare_super(version_t epoch, bufferptr& bp);
- void write_super(version_t epoch, bufferptr& bp);
- int commit_thread_entry();
-
- class CommitThread : public Thread {
- Ebofs *ebofs;
- public:
- CommitThread(Ebofs *e) : ebofs(e) {}
- void *entry() {
- ebofs->commit_thread_entry();
- return 0;
- }
- } commit_thread;
-
-public:
- uint64_t get_fsid() { return super_fsid; }
- epoch_t get_super_epoch() { return super_epoch; }
-
- void queue_commit_waiter(Context *oncommit) {
- if (oncommit)
- commit_waiters[super_epoch].push_back(oncommit);
- }
-
-
-protected:
-
-
- // ** journal **
- char *journalfn;
- Journal *journal;
-
- // ** allocator **
- block_t free_blocks, limbo_blocks;
- Allocator allocator;
- friend class Allocator;
-
- block_t get_free_blocks() { return free_blocks; }
- block_t get_limbo_blocks() { return limbo_blocks; }
- block_t get_free_extents() {
- int n = 0;
- for (int i=0; i<EBOFS_NUM_FREE_BUCKETS; i++)
- n += free_tab[i]->get_num_keys();
- return n;
- }
- block_t get_limbo_extents() { return limbo_tab->get_num_keys(); }
-
-
- // ** tables and sets **
- // nodes
- NodePool nodepool; // for all tables...
-
- // tables
- Table<pobject_t, ebofs_inode_ptr> *object_tab;
- Table<block_t,block_t> *free_tab[EBOFS_NUM_FREE_BUCKETS];
- Table<block_t,block_t> *limbo_tab;
- Table<block_t,pair<block_t,int> > *alloc_tab;
-
- // collections
- Table<coll_t, ebofs_inode_ptr> *collection_tab;
- Table<coll_pobject_t, bool> *co_tab;
-
- void close_tables();
- void verify_tables();
-
-
- // ** onodes **
- hash_map<pobject_t, Onode*> onode_map; // onode cache
- LRU onode_lru;
- set<Onode*> dirty_onodes;
- map<pobject_t, list<Cond*> > waitfor_onode;
-
- Onode* new_onode(pobject_t oid); // make new onode. ref++.
- bool have_onode(pobject_t oid) {
- return onode_map.count(oid);
- }
- Onode* decode_onode(bufferlist& bl, unsigned& off, csum_t csum);
- csum_t encode_onode(Onode *on, bufferlist& bl, unsigned& off);
- Onode* get_onode(pobject_t oid); // get cached onode, or read from disk. ref++.
- void remove_onode(Onode *on);
- void put_onode(Onode* o); // put it back down. ref--.
- void dirty_onode(Onode* o);
- void write_onode(Onode *on);
-
- // ** cnodes **
- hash_map<coll_t, Cnode*> cnode_map;
- LRU cnode_lru;
- set<Cnode*> dirty_cnodes;
- map<coll_t, list<Cond*> > waitfor_cnode;
-
- Cnode* decode_cnode(bufferlist& bl, unsigned& off, csum_t csum);
- csum_t encode_cnode(Cnode *cn, bufferlist& bl, unsigned& off);
- Cnode* new_cnode(coll_t cid);
- Cnode* get_cnode(coll_t cid);
- void remove_cnode(Cnode *cn);
- void put_cnode(Cnode *cn);
- void dirty_cnode(Cnode *cn);
- void write_cnode(Cnode *cn);
-
- // ** onodes+cnodes = inodes **
- int inodes_flushing;
- Cond inode_commit_cond;
-
- void flush_inode_finish();
- void commit_inodes_start();
- void commit_inodes_wait();
- friend class C_E_InodeFlush;
-
- void trim_inodes(int max = -1);
-
- // ** buffer cache **
- BufferCache bc;
- pthread_t flushd_thread_id;
-
- version_t trigger_commit();
- void commit_bc_wait(version_t epoch);
- void trim_bc(int64_t max = -1);
-
- public:
- void kick_idle();
- void sync();
- void sync(Context *onsafe);
- void trim_buffer_cache();
-
- class IdleKicker : public BlockDevice::kicker {
- Ebofs *ebo;
- public:
- IdleKicker(Ebofs *t) : ebo(t) {}
- void kick() { ebo->kick_idle(); }
- } idle_kicker;
-
-
- protected:
- int check_partial_edges(Onode *on, uint64_t off, uint64_t len,
- bool &partial_head, bool &partial_tail);
-
- void alloc_write(Onode *on,
- block_t start, block_t len,
- interval_set<block_t>& alloc,
- block_t& old_bfirst, block_t& old_blast,
- csum_t& old_csum_first, csum_t& old_csum_last);
- int apply_write(Onode *on, uint64_t off, uint64_t len, const bufferlist& bl);
- int apply_zero(Onode *on, uint64_t off, size_t len);
- int attempt_read(Onode *on, uint64_t off, size_t len, bufferlist& bl,
- Cond *will_wait_on, bool *will_wait_on_bool);
-
- Finisher finisher;
-
- void alloc_more_node_space();
-
- void do_csetattrs(map<coll_t, map<const char*, pair<void*,int> > > &cmods);
- void do_setattrs(Onode *on, map<const char*, pair<void*,int> > &setattrs);
-
-
- public:
- Ebofs(const char *devfn, const char *jfn=0) :
- ebofs_lock("Ebofs::ebofs_lock"),
- fake_writes(false),
- dev(devfn),
- mounted(false), unmounting(false), dirty(false), readonly(false),
- super_epoch(0), op_seq(0),
- commit_starting(false), commit_thread_started(false),
- commit_thread(this),
- journal(0),
- free_blocks(0), limbo_blocks(0),
- allocator(this),
- nodepool(ebofs_lock),
- object_tab(0), limbo_tab(0), collection_tab(0), co_tab(0),
- onode_lru(g_conf.ebofs_oc_size),
- cnode_lru(g_conf.ebofs_cc_size),
- inodes_flushing(0),
- bc(dev, ebofs_lock),
- idle_kicker(this) {
- for (int i=0; i<EBOFS_NUM_FREE_BUCKETS; i++)
- free_tab[i] = 0;
- if (jfn) {
- journalfn = new char[strlen(jfn) + 1];
- strcpy(journalfn, jfn);
- } else {
- journalfn = new char[strlen(devfn) + 100];
- strcpy(journalfn, devfn);
- strcat(journalfn, ".journal");
- }
- }
- ~Ebofs() {
- }
-
- int mkfs();
- int mount();
- int umount();
-
- int statfs(struct statfs *buf);
-
- // atomic transaction
- unsigned apply_transaction(Transaction& t, Context *onjournal=0, Context *ondisk=0);
-
- int pick_object_revision_lt(coll_t cid, pobject_t& oid);
-
- // object interface
- bool exists(coll_t cid, pobject_t);
- int stat(coll_t cid, pobject_t, struct stat*);
- int read(coll_t cid, pobject_t, uint64_t off, size_t len, bufferlist& bl);
- int is_cached(coll_t cid, pobject_t oid, uint64_t off, size_t len);
-
- int write(coll_t cid, pobject_t oid, uint64_t off, size_t len, const bufferlist& bl, Context *onsafe);
- int zero(coll_t cid, pobject_t oid, uint64_t off, size_t len, Context *onsafe);
- int truncate(coll_t cid, pobject_t oid, uint64_t size, Context *onsafe=0);
- int remove(coll_t cid, pobject_t oid, Context *onsafe=0);
- bool write_will_block();
- void trim_from_cache(coll_t cid, pobject_t oid, uint64_t off, size_t len);
-
- int rename(pobject_t from, pobject_t to);
- int clone(coll_t cid, pobject_t from, pobject_t to, Context *onsafe);
-
- // object attr
- int setattr(coll_t cid, pobject_t oid, const char *name, const void *value, size_t size, Context *onsafe=0);
- int setattrs(coll_t cid, pobject_t oid, map<string,bufferptr>& attrset, Context *onsafe=0);
- int getattr(coll_t cid, pobject_t oid, const char *name, void *value, size_t size);
- int getattr(coll_t cid, pobject_t oid, const char *name, bufferptr& bp);
- int getattrs(coll_t cid, pobject_t oid, map<string,bufferptr> &aset);
- int rmattr(coll_t cid, pobject_t oid, const char *name, Context *onsafe=0);
- int listattr(coll_t cid, pobject_t oid, vector<string>& attrs);
-
- int get_object_collections(coll_t cid, pobject_t oid, set<coll_t>& ls);
-
- // collections
- int list_collections(vector<coll_t>& ls);
- bool collection_exists(coll_t c);
-
- int create_collection(coll_t c, Context *onsafe);
- int destroy_collection(coll_t c, Context *onsafe);
- int collection_add(coll_t c, coll_t cid, pobject_t o, Context *onsafe);
- int collection_remove(coll_t c, pobject_t o, Context *onsafe);
-
- bool collection_empty(coll_t c);
- int collection_list(coll_t c, vector<pobject_t>& o);
-
- int collection_setattr(coll_t cid, const char *name, const void *value, size_t size, Context *onsafe);
- int collection_setattrs(coll_t cid, map<string,bufferptr> &aset);
- int collection_getattr(coll_t cid, const char *name, void *value, size_t size);
- int collection_getattr(coll_t cid, const char *name, bufferlist& bl);
- int collection_getattrs(coll_t cid, map<string,bufferptr> &aset);
- int collection_rmattr(coll_t cid, const char *name, Context *onsafe);
- int collection_listattr(coll_t oid, vector<string>& attrs);
-
- // maps
- int map_lookup(pobject_t o, bufferlist& key, bufferlist& val);
- int map_insert(pobject_t o, bufferlist& key, bufferlist& val);
- int map_remove(pobject_t o, bufferlist& key);
- int map_list(pobject_t o, list<bufferlist>& keys);
- int map_list(pobject_t o, map<bufferlist,bufferlist>& vals);
- int map_list(pobject_t o,
- bufferlist& start, bufferlist& end,
- map<bufferlist,bufferlist>& vals);
-
- // crap
- void _fake_writes(bool b) { fake_writes = b; }
- void _get_frag_stat(FragmentationStat& st);
-
- void _import_freelist(bufferlist& bl);
- void _export_freelist(bufferlist& bl);
-
-
-private:
- // private interface -- use if caller already holds lock
- unsigned _apply_transaction(Transaction& t);
-
- int _read(pobject_t oid, uint64_t off, size_t len, bufferlist& bl);
- int _is_cached(pobject_t oid, uint64_t off, size_t len);
- int _stat(pobject_t oid, struct stat *st);
- int _getattr(pobject_t oid, const char *name, void *value, size_t size);
- int _getattr(pobject_t oid, const char *name, bufferptr& bp);
- int _getattrs(pobject_t oid, map<string,bufferptr> &aset);
- int _get_object_collections(pobject_t oid, set<coll_t>& ls);
-
- bool _write_will_block();
- int _touch(coll_t cid, pobject_t oid);
- int _write(coll_t cid, pobject_t oid, uint64_t off, size_t len, const bufferlist& bl);
- void _trim_from_cache(pobject_t oid, uint64_t off, size_t len);
- int _truncate(coll_t cid, pobject_t oid, uint64_t size);
- int _zero(coll_t cid, pobject_t oid, uint64_t offset, size_t length);
- int _remove(coll_t cid, pobject_t oid);
- int _clone(coll_t cid, pobject_t from, pobject_t to);
- int _clone_range(coll_t cid, pobject_t from, pobject_t to, uint64_t off, uint64_t len);
- int _setattr(pobject_t oid, const char *name, const void *value, size_t size);
- int _setattrs(pobject_t oid, map<string,bufferptr>& attrset);
- int _rmattr(pobject_t oid, const char *name);
- bool _collection_exists(coll_t c);
- int _collection_list(coll_t c, vector<pobject_t>& o);
- int _create_collection(coll_t c);
- int _destroy_collection(coll_t c);
- int _collection_add(coll_t c, pobject_t o);
- int _collection_remove(coll_t c, pobject_t o);
- int _collection_getattrs(coll_t oid, map<string,bufferptr> &aset);
- int _collection_setattr(coll_t oid, const char *name, const void *value, size_t size);
- int _collection_setattrs(coll_t oid, map<string,bufferptr> &aset);
- int _collection_rmattr(coll_t cid, const char *name);
-
-
-};
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-
-#ifndef CEPH_EBOFS_ONODE_H
-#define CEPH_EBOFS_ONODE_H
-
-#include "include/lru.h"
-
-#include "types.h"
-#include "BufferCache.h"
-
-#include "include/interval_set.h"
-
-/*
- * object node (like an inode)
- *
- * holds object metadata, including
- * size
- * allocation (extent list)
- * attributes
- *
- */
-
-struct ExtentCsum {
- extent_t ex;
- vector<csum_t> csum;
-
- void resize_tail() {
- unsigned old = csum.size();
- csum.resize(ex.length);
- for (block_t j=old; j<ex.length; j++)
- csum[j] = 0;
- }
- void resize_head() {
- if (ex.length < csum.size()) {
- memmove(&csum[0], &csum[csum.size()-ex.length], ex.length*sizeof(csum_t));
- csum.resize(ex.length);
- } else if (ex.length > csum.size()) {
- int old = csum.size();
- csum.resize(ex.length);
- memmove(&csum[ex.length-old], &csum[0], ex.length*sizeof(csum_t));
- for (block_t b = 0; b<ex.length-old; b++)
- csum[b] = 0;
- }
- }
-};
-inline ostream& operator<<(ostream& out, ExtentCsum &ec) {
- out << ec.ex;
- out << '=';
- out << hex << ec.csum << dec;
- return out;
-}
-
-class Onode : public LRUObject {
-private:
- int ref;
-
-public:
- pobject_t object_id;
- version_t version; // incremented on each modify.
-
- // data
- extent_t onode_loc;
- epoch_t last_alloc_epoch; // epoch i last allocated for
-
- uint64_t object_size;
- uint64_t alloc_blocks, last_block;
- csum_t data_csum;
- bool readonly;
-
- // onode
- set<coll_t> collections;
- map<string, bufferptr> attr;
-
- map<block_t, ExtentCsum> extent_map;
- interval_set<uint64_t> bad_byte_extents;
-
- interval_set<block_t> uncommitted;
-
- ObjectCache *oc;
-
- bool dirty;
- bool dangling; // not in onode_map
- bool deleted; // deleted
-
- //list<Context*> commit_waiters;
-
- public:
- Onode(pobject_t oid) : ref(0), object_id(oid), version(0), last_alloc_epoch(0),
- object_size(0), alloc_blocks(0), last_block(0), data_csum(0),
- readonly(0),
- oc(0),
- dirty(false), dangling(false), deleted(false) {
- onode_loc.start = 0;
- onode_loc.length = 0;
- }
- ~Onode() {
- if (oc) delete oc;
- }
-
- block_t get_onode_id() { return onode_loc.start; }
- int get_onode_len() { return onode_loc.length; }
-
- int get_ref_count() { return ref; }
- void get() {
- if (ref == 0) lru_pin();
- ref++;
- //cout << "ebofs.onode.get " << hex << object_id << dec << " " << ref << std::endl;
- }
- void put() {
- ref--;
- if (ref == 0) lru_unpin();
- //cout << "ebofs.onode.put " << hex << object_id << dec << " " << ref << std::endl;
- }
-
- void mark_dirty() {
- if (!dirty) {
- dirty = true;
- get();
- }
- }
- void mark_clean() {
- if (dirty) {
- dirty = false;
- put();
- }
- }
- bool is_dirty() { return dirty; }
- bool is_deleted() { return deleted; }
- bool is_dangling() { return dangling; }
-
-
- bool have_oc() {
- return oc != 0;
- }
- ObjectCache *get_oc(BufferCache *bc) {
- if (!oc) {
- oc = new ObjectCache(object_id, this, bc);
- oc->get();
- get();
- }
- return oc;
- }
- void close_oc() {
- if (oc) {
- //cout << "close_oc on " << object_id << std::endl;
- assert(oc->is_empty());
- if (oc->put() == 0){
- //cout << "************************* hosing oc" << std::endl;
- delete oc;
- }
- oc = 0;
- put();
- }
- }
-
-
- // allocation
- void verify_extents() {
- if (0) { // do crazy stupid sanity checking
- block_t count = 0, pos = 0;
- interval_set<block_t> is;
- csum_t csum = 0;
-
- set<block_t> s;
- //cout << "verify_extentsing. data_csum=" << hex << data_csum << dec << std::endl;
-
- for (map<block_t,ExtentCsum>::iterator p = extent_map.begin();
- p != extent_map.end();
- p++) {
- //cout << " verify_extents " << p->first << ": " << p->second << std::endl;
- assert(pos == p->first);
- pos += p->second.ex.length;
- if (p->second.ex.start) {
- count += p->second.ex.length;
- for (unsigned j=0;j<p->second.ex.length;j++) {
- assert(s.count(p->second.ex.start+j) == 0);
- s.insert(p->second.ex.start+j);
- csum += p->second.csum[j];
- }
- }
- }
- //cout << " verify_extents got csum " << hex << csum << " want " << data_csum << dec << std::endl;
-
- assert(s.size() == count);
- assert(count == alloc_blocks);
- assert(pos == last_block);
- assert(csum == data_csum);
- }
- }
-
- csum_t *get_extent_csum_ptr(block_t offset, block_t len) {
- map<block_t,ExtentCsum>::iterator p = extent_map.lower_bound(offset);
- if (p == extent_map.end() || p->first > offset)
- p--;
- assert(p->first <= offset);
- assert(p->second.ex.start != 0);
- assert(offset+len <= p->first + p->second.ex.length);
- return &p->second.csum[offset-p->first];
- }
-
- /*
- * set_extent - adjust extent map.
- * assume new extent will have csum of 0.
- * factor clobbered extents out of csums.
- */
- void set_extent(block_t offset, extent_t ex) {
- //cout << "set_extent " << offset << " -> " << ex << " ... " << last_block << std::endl;
-
- verify_extents();
-
- // at the end?
- if (offset == last_block) {
- //cout << " appending " << ex << std::endl;
- if (!extent_map.empty() &&
- ((extent_map.rbegin()->first &&
- ex.start &&
- extent_map.rbegin()->second.ex.end() == ex.start) ||
- (!extent_map.rbegin()->first &&
- !ex.start))) {
- extent_map.rbegin()->second.ex.length += ex.length;
- if (ex.start)
- extent_map.rbegin()->second.resize_tail();
- } else {
- extent_map[last_block].ex = ex;
- if (ex.start)
- extent_map[last_block].resize_tail();
- }
- last_block += ex.length;
- if (ex.start)
- alloc_blocks += ex.length;
- return;
- }
-
- // past the end?
- if (offset > last_block) {
- if (ex.start) {
- extent_map[last_block].ex.start = 0;
- extent_map[last_block].ex.length = offset - last_block;
- extent_map[offset].ex = ex;
- extent_map[offset].resize_tail();
- last_block = offset+ex.length;
- alloc_blocks += ex.length;
- } else {
- // ignore attempt to set a trailing "hole"
- }
- return;
- }
-
- // remove any extent bits we overwrite
- if (!extent_map.empty()) {
- // preceding extent?
- map<block_t,ExtentCsum>::iterator p = extent_map.lower_bound(offset);
- if (p != extent_map.begin()) {
- p--;
- ExtentCsum &left = p->second;
- if (p->first + left.ex.length > offset) {
-        //cout << " preceding left was " << left << std::endl;
- block_t newlen = offset - p->first;
- if (p->first + left.ex.length > offset+ex.length) {
- // cutting chunk out of middle, add trailing bit
- ExtentCsum &right = extent_map[offset+ex.length] = left;
- right.ex.length -= offset+ex.length - p->first;
- if (right.ex.start) {
- right.ex.start += offset+ex.length - p->first;
- alloc_blocks += right.ex.length;
- right.resize_head();
- for (unsigned j=0; j<right.ex.length; j++)
- data_csum += right.csum[j];
- }
- //cout << " tail right is " << right << std::endl;
- }
- if (left.ex.start) {
- alloc_blocks -= left.ex.length - newlen;
- for (unsigned i=newlen; i<left.ex.length; i++)
- data_csum -= left.csum[i];
- }
-        left.ex.length = newlen;   // cut tail off preceding extent
-        if (left.ex.start)
-          left.resize_tail();
-        //cout << " preceding left now " << left << std::endl;
- }
- p++;
- }
-
- // overlapping extents
- while (p != extent_map.end() &&
- p->first < offset + ex.length) {
- map<block_t,ExtentCsum>::iterator next = p;
- next++;
-
- // completely subsumed?
- ExtentCsum &o = p->second;
- if (p->first + o.ex.length <= offset+ex.length) {
- //cout << " erasing " << o << std::endl;
- if (o.ex.start) {
- alloc_blocks -= o.ex.length;
- for (unsigned i=0; i<o.ex.length; i++)
- data_csum -= o.csum[i];
- }
- extent_map.erase(p);
- p = next;
- continue;
- }
-
- // spans next extent, cut off head
- ExtentCsum &n = extent_map[ offset+ex.length ] = o;
- //cout << " cutting head off " << o;
- unsigned overlap = offset+ex.length - p->first;
- n.ex.length -= overlap;
- if (n.ex.start) {
- n.ex.start += overlap;
- alloc_blocks -= overlap;
- for (unsigned j=0; j<overlap; j++)
- data_csum -= n.csum[j];
- n.resize_head();
- }
- extent_map.erase(p);
- //cout << ", now " << n << std::endl;
- break;
- }
- }
-
- // add ourselves
- ExtentCsum &n = extent_map[ offset ];
- n.ex = ex;
- if (ex.start) {
- alloc_blocks += ex.length;
- n.resize_tail();
- }
-
- // extend object?
- if (offset + ex.length > last_block)
- last_block = offset+ex.length;
-
- verify_extents();
- }
-
- int truncate_extents(block_t len, vector<extent_t>& extra) {
- //cout << " truncate to " << len << " .. last_block " << last_block << std::endl;
-
- verify_extents();
-
- map<block_t,ExtentCsum>::iterator p = extent_map.lower_bound(len);
- if (p != extent_map.begin() &&
- (p == extent_map.end() ||( p->first > len && p->first))) {
- p--;
- ExtentCsum &o = p->second;
- if (o.ex.length > len - p->first) {
- int newlen = len - p->first;
- if (o.ex.start) {
- extent_t ex;
- ex.start = o.ex.start + newlen;
- ex.length = o.ex.length - newlen;
- //cout << " truncating ex " << p->second.ex << " to " << newlen << ", releasing " << ex << std::endl;
- for (unsigned i=newlen; i<o.ex.length; i++)
- data_csum -= o.csum[i];
- o.ex.length = newlen;
- o.resize_tail();
- extra.push_back(ex);
- alloc_blocks -= ex.length;
- } else
- o.ex.length = newlen;
- assert(o.ex.length > 0);
- }
- p++;
- }
-
- while (p != extent_map.end()) {
- assert(p->first >= len);
- ExtentCsum &o = p->second;
- if (o.ex.start) {
- for (unsigned i=0; i<o.ex.length; i++)
- data_csum -= o.csum[i];
- extra.push_back(o.ex);
- alloc_blocks -= o.ex.length;
- }
- map<block_t,ExtentCsum>::iterator n = p;
- n++;
- extent_map.erase(p);
- p = n;
- }
-
- last_block = len;
- verify_extents();
- return 0;
- }
-
-
- /* map_extents(start, len, ls)
-   * map the given page range into extents (and csums) on disk.
- */
- int map_extents(block_t start, block_t len, vector<extent_t>& ls, vector<csum_t> *csum) {
- //cout << "map_extents " << start << " " << len << std::endl;
- verify_extents();
-
- map<block_t,ExtentCsum>::iterator p;
-
- // hack hack speed up common cases!
- if (start == 0) {
- p = extent_map.begin();
- } else if (start+len == last_block && len == 1 && !extent_map.empty()) {
- // append hack.
- p = extent_map.end();
- p--;
- if (p->first < start) p++;
- } else {
- // normal
- p = extent_map.lower_bound(start);
- }
-
- if (p != extent_map.begin() &&
- (p == extent_map.end() || (p->first > start && p->first))) {
- p--;
- if (p->second.ex.length > start - p->first) {
- extent_t ex;
- int off = (start - p->first);
- ex.length = MIN(len, p->second.ex.length - off);
- if (p->second.ex.start) {
- ex.start = p->second.ex.start + off;
- if (csum)
- for (unsigned i=off; i<ex.length; i++)
- csum->push_back(p->second.csum[i]);
- } else
- ex.start = 0;
- ls.push_back(ex);
-
- //cout << " got (tail of?) " << p->second << " : " << ex << std::endl;
-
- start += ex.length;
- len -= ex.length;
- }
- p++;
- }
-
- while (len > 0 &&
- p != extent_map.end()) {
- assert(p->first == start);
- ExtentCsum e = p->second;
- e.ex.length = MIN(len, e.ex.length);
- ls.push_back(e.ex);
- if (e.ex.start && csum)
- for (unsigned i=0; i<e.ex.length; i++)
- csum->push_back(p->second.csum[i]);
- //cout << " got (head of?) " << p->second << " : " << ex << std::endl;
- start += e.ex.length;
- len -= e.ex.length;
- p++;
- }
-
- return 0;
- }
-
-
-
- /* map_alloc_regions(start, len, map)
- * map range into regions that need to be (re)allocated on disk
- * because they overlap "safe" (or unallocated) parts of the object
- */
- /*
- void map_alloc_regions(block_t start, block_t len,
- interval_set<block_t>& alloc) {
- interval_set<block_t> already_uncom;
-
- alloc.insert(start, len); // start with whole range
- already_uncom.intersection_of(alloc, uncommitted);
- alloc.subtract(already_uncom); // take out the bits that aren't yet committed
- }
- */
-
- block_t get_first_block() {
- if (!alloc_blocks) return 0;
- map<block_t,ExtentCsum>::iterator p = extent_map.begin();
- while (1) {
- if (p->second.ex.start)
- return p->second.ex.start;
- p++;
- }
- assert(0);
- }
-
-
-
- // pack/unpack
- int get_ondisk_bytes() {
- return sizeof(ebofs_onode) +
- get_collection_bytes() +
- get_attr_bytes() +
- get_extent_bytes() +
- get_bad_byte_bytes();
- }
- int get_collection_bytes() {
- return sizeof(coll_t) * collections.size();
- }
- int get_attr_bytes() {
- int s = 0;
- for (map<string, bufferptr>::iterator i = attr.begin();
- i != attr.end();
- i++) {
- s += i->first.length() + 1;
- s += i->second.length() + sizeof(int);
- }
- return s;
- }
- int get_extent_bytes() {
- return sizeof(extent_t) * extent_map.size() + sizeof(csum_t)*alloc_blocks;
- }
- int get_bad_byte_bytes() {
- return sizeof(extent_t) * bad_byte_extents.m.size();
- }
-};
-
-
-inline ostream& operator<<(ostream& out, Onode& on)
-{
- out << "onode(" << hex << on.object_id << dec << " len=" << on.object_size;
- out << " ref=" << on.get_ref_count();
- if (on.is_dirty()) out << " dirty";
- if (on.is_dangling()) out << " dangling";
- if (on.is_deleted()) out << " deleted";
- out << " uncom=" << on.uncommitted;
- if (!on.bad_byte_extents.empty()) out << " badbytes=" << on.bad_byte_extents;
- // out << " " << &on;
- out << ")";
- return out;
-}
-
-
-
-#endif
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-
-#ifndef CEPH_EBOFS_TABLE_H
-#define CEPH_EBOFS_TABLE_H
-
-#include "types.h"
-#include "nodes.h"
-
-/** table **/
-
-#define dbtout do { if (25 <= g_conf.debug_ebofs) {\
- _dout_begin_line(25);\
- *_dout << "ebofs.table(" << this << ")."
-
-
-template<class K, class V>
-class Table {
- private:
- NodePool &pool;
-
- ebofs_node_ptr root;
- int nkeys;
- int depth;
-
- public:
- Table(NodePool &p,
- struct ebofs_table& bts) :
- pool(p),
- root(bts.root), nkeys(bts.num_keys), depth(bts.depth) {
- dbtout << "cons" << dendl;
- }
-
- const ebofs_node_ptr &get_root() { return root; }
- int get_num_keys() { return nkeys; }
- int get_depth() { return depth; }
-
-
- /*
- */
- class _IndexItem { // i just need a struct size for below
- K k;
- nodeid_t n;
- };
- class IndexItem {
- public:
- K key;
- nodeid_t node;
- static const int MAX = Node::ITEM_LEN / (sizeof(_IndexItem));
- static const int MIN = MAX/2;
- };
- class _LeafItem { // i just need a struct size for below
- K k;
- V v;
- };
- class LeafItem {
- public:
- K key;
- V value;
- static const int MAX = Node::ITEM_LEN / (sizeof(_LeafItem));
- static const int MIN = MAX/2;
- };
-
- class Nodeptr {
- public:
- Node *node;
-
- Nodeptr() : node(0) {}
- Nodeptr(Node *n) : node(n) {}
- Nodeptr(NodePool& pool, nodeid_t nid) {
- open(pool, nid);
- }
- Nodeptr& operator=(Node *n) {
- node = n;
- return *this;
- }
-
- void open(NodePool& pool, nodeid_t nid) {
- node = pool.get_node(nid);
- if (is_index() && node->children.empty()) init_index(pool);
- }
-
- LeafItem& leaf_item(int i) { return (( LeafItem*)(node->item_ptr()))[i]; }
- IndexItem& index_item(int i) { return ((IndexItem*)(node->item_ptr()))[i]; }
- K key(int i) {
- if (node->is_index())
- return index_item(i).key;
- else
- return leaf_item(i).key;
- }
-
- bool is_leaf() { return node->is_leaf(); }
- bool is_index() { return node->is_index(); }
- void set_type(int t) { node->set_type(t); }
-
- int max_items() const {
- if (node->is_leaf())
- return LeafItem::MAX;
- else
- return IndexItem::MAX;
- }
- int min_items() const { return max_items() / 2; }
-
- nodeid_t get_id() { return node->get_id(); }
-
- int size() { return node->size(); }
- void set_size(int s) { node->set_size(s); }
-
- void init_index(NodePool& nodepool) {
- /*
- node->children = vector<Node*>(max_items());
- for (int i=0; i<max_items(); i++)
- if (i < size())
- node->children[i] = nodepool.get_node(index_item(i).node);
- else
- node->children[i] = 0;
- */
- }
-
-
- void remove_at_pos(int p) {
- if (node->is_index()) {
- for (int i=p; i<size()-1; i++) {
- index_item(i) = index_item(i+1);
- //node->children[i] = node->children[i+1];
- }
- } else {
- for (int i=p; i<size()-1; i++)
- leaf_item(i) = leaf_item(i+1);
- }
- set_size(size() - 1);
- dbtout << "remove_at_pos done, size now " << size() << " " << (node->is_index() ? "index":"leaf") << dendl;
- }
- void insert_at_leaf_pos(int p, K key, V value) {
- assert(is_leaf());
- for (int i=size(); i>p; i--)
- leaf_item(i) = leaf_item(i-1);
- leaf_item(p).key = key;
- leaf_item(p).value = value;
- set_size(size() + 1);
- }
- void insert_at_index_pos(int p, K key, nodeid_t nid) {
- assert(is_index());
- for (int i=size(); i>p; i--) {
- index_item(i) = index_item(i-1);
- //node->children[i] = node->children[i-1];
- }
- index_item(p).key = key;
- index_item(p).node = nid;
- set_size(size() + 1);
- }
-
- void append_item(LeafItem& i) {
- leaf_item(size()) = i;
- set_size(size() + 1);
- }
- void append_item(IndexItem& i) {
- index_item(size()) = i;
- set_size(size() + 1);
- }
-
- void split(Nodeptr& right) {
- if (node->is_index()) {
- for (int i=min_items(); i<size(); i++)
- right.append_item( index_item(i) );
- } else {
- for (int i=min_items(); i<size(); i++)
- right.append_item( leaf_item(i) );
- }
- set_size(min_items());
- }
-
- void merge(Nodeptr& right) {
- if (node->is_index())
- for (int i=0; i<right.size(); i++)
- append_item( right.index_item(i) );
- else
- for (int i=0; i<right.size(); i++)
- append_item( right.leaf_item(i) );
- right.set_size(0);
- }
-
- };
-
- /*
- */
- class Cursor {
- protected:
- public:
- static const int MATCH = 1; // on key
- static const int INSERT = 0; // before key
- static const int OOB = -1; // at end
-
- Table *table;
- vector<Nodeptr> open; // open nodes
- vector<int> pos; // position within the node
- //Nodeptr open[20];
- //int pos[20];
- int level;
-
- Cursor(Table *t) : table(t), open(t->depth), pos(t->depth), level(0) {}
-
- public:
-
- const LeafItem& current() {
- assert(open[level].is_leaf());
- return open[level].leaf_item(pos[level]);
- }
- V& dirty_current_value() {
- assert(open[level].is_leaf());
- dirty();
- return open[level].leaf_item(pos[level]).value;
- }
-
- // ** read-only bits **
- int move_left() {
- if (table->depth == 0) return OOB;
-
- // work up around branch
- int l;
- for (l = level; l >= 0; l--)
- if (pos[l] > 0) break;
- if (l < 0)
- return OOB; // we are the first item in the btree
-
- // move left one
- pos[l]--;
-
- // work back down right side
- for (; l<level; l++) {
- open[l+1].open(table->pool, open[l].index_item(pos[l]).node);
- pos[l+1] = open[l+1].size() - 1;
- }
- return 1;
- }
- int move_right() {
- if (table->depth == 0) return OOB;
-
- // work up branch
- int l;
- for (l=level; l>=0; l--)
- if (pos[l] < open[l].size() - 1) break;
- if (l < 0) {
- /* we are at last item in btree. */
- if (pos[level] < open[level].size()) {
- pos[level]++; /* move into add position! */
- return 0;
- }
- return -1;
- }
-
- /* move right one */
- assert( pos[l] < open[l].size() );
- pos[l]++;
-
- /* work back down */
- for (; l<level; l++) {
- open[l+1].open(table->pool, open[l].index_item(pos[l]).node );
- pos[l+1] = 0; // furthest left
- }
- return 1;
- }
-
- // ** modifications **
- void dirty() {
- for (int l=level; l>=0; l--) {
- if (open[l].node->is_dirty()) {
- dbtout << "dirty " << open[l].node->get_id() << " already dirty (thus parents are too)" << dendl;
- break; // already dirty! (and thus parents are too)
- }
-
- table->pool.dirty_node(open[l].node);
- if (l > 0)
- open[l-1].index_item( pos[l-1] ).node = open[l].get_id();
- else
- table->root.nodeid = open[0].get_id();
- }
- }
- private:
- void repair_parents() {
- // did i make a change at the start of a node?
- if (pos[level] == 0) {
- K key = open[level].key(0); // new key parents should have
- for (int j=level-1; j>=0; j--) {
- if (open[j].index_item(pos[j]).key == key)
- break; /* it's the same key, we can stop fixing */
- open[j].index_item(pos[j]).key = key;
- if (pos[j] > 0) break; /* last in position 0.. */
- }
- }
- }
-
- public:
- void remove() {
- dirty();
-
- // remove from node
- open[level].remove_at_pos( pos[level] );
- repair_parents();
-
- // was it a key?
- if (level == table->depth-1)
- table->nkeys--;
- }
-
- void insert(K key, V value) {
- dirty();
-
- // insert
- open[level].insert_at_leaf_pos(pos[level], key, value);
- repair_parents();
-
- // was it a key?
- if (level == table->depth-1)
- table->nkeys++;
- }
-
- int rotate_left() {
- if (level == 0) return -1; // i am root
- if (pos[level-1] == 0) return -1; // nothing to left
-
- Nodeptr here = open[level];
- Nodeptr parent = open[level-1];
- Nodeptr left(table->pool, parent.index_item(pos[level-1] - 1).node );
- if (left.size() == left.max_items()) return -1; // it's full
-
- // make both dirty
- dirty();
- if (!left.node->is_dirty()) {
- table->pool.dirty_node(left.node);
- parent.index_item(pos[level-1]-1).node = left.get_id();
- }
-
- dbtout << "rotating item " << here.key(0) << " left from " << here.get_id() << " to " << left.get_id() << dendl;
-
- /* add */
- if (here.node->is_leaf())
- left.append_item(here.leaf_item(0));
- else
- left.append_item(here.index_item(0));
-
- /* remove */
- here.remove_at_pos(0);
-
- /* fix parent index for me */
- parent.index_item( pos[level-1] ).key = here.key(0);
- // we never have to update past immediate parent, since we're not at pos 0
-
- /* adjust cursor */
- if (pos[level] > 0)
- pos[level]--;
- //else
- //assert(1); /* if we were positioned here, we're equal */
- /* if it was 0, then the shifted item == our key, and we can stay here safely. */
- return 0;
- }
- int rotate_right() {
- if (level == 0) return -1; // i am root
- if (pos[level-1] + 1 >= open[level-1].size()) return -1; // nothing to right
-
- Nodeptr here = open[level];
- Nodeptr parent = open[level-1];
- Nodeptr right(table->pool, parent.index_item( pos[level-1] + 1 ).node );
- if (right.size() == right.max_items()) return -1; // it's full
-
- // make both dirty
- dirty();
- if (!right.node->is_dirty()) {
- table->pool.dirty_node(right.node);
- parent.index_item( pos[level-1]+1 ).node = right.get_id();
- }
-
- if (pos[level] == here.size()) {
- /* let's just move the cursor over! */
- //if (sizeof(K) == 8)
- dbtout << "shifting cursor right from " << here.get_id() << " to less-full node " << right.get_id() << dendl;
- open[level] = right;
- pos[level] = 0;
- pos[level-1]++;
- return 0;
- }
-
- //if (sizeof(K) == 8)
- dbtout << "rotating item " << hex << here.key(here.size()-1) << dec << " right from "
- << here.get_id() << " to " << right.get_id() << dendl;
-
- /* add */
- if (here.is_index())
- right.insert_at_index_pos(0,
- here.index_item( here.size()-1 ).key,
- here.index_item( here.size()-1 ).node);
- else
- right.insert_at_leaf_pos(0,
- here.leaf_item( here.size()-1 ).key,
- here.leaf_item( here.size()-1 ).value);
-
- /* remove */
- here.set_size(here.size() - 1);
-
- /* fix parent index for right */
- parent.index_item( pos[level-1] + 1 ).key = right.key(0);
-
- return 0;
- }
- };
-
-
- public:
- bool almost_full() {
- if (2*(depth+1) > pool.get_num_free()) // worst case, plus some.
- return true;
- return false;
- }
-
- int find(K key, Cursor& cursor) {
- dbtout << "find " << key << " depth " << depth << dendl;
- verify("find");
-
- if (depth == 0)
- return Cursor::OOB;
-
- // init
- cursor.level = 0;
-
- // start at root
- Nodeptr curnode(pool, root.nodeid);
- cursor.open[0] = curnode;
-
- if (curnode.size() == 0) return -1; // empty!
-
- // find leaf
- for (cursor.level = 0; cursor.level < depth-1; cursor.level++) {
- /* if key=5, we want 2 3 [4] 6 7, or 3 4 [5] 5 6 (err to the left) */
- int left = 0; /* i >= left */
- int right = curnode.size()-1; /* i < right */
- while (left < right) {
- int i = left + (right - left) / 2;
- if (curnode.index_item(i).key < key) {
- left = i + 1;
- } else if (i && curnode.index_item(i-1).key >= key) {
- right = i;
- } else {
- left = right = i;
- break;
- }
- }
- int i = left;
- if (i && curnode.index_item(i).key > key) i--;
-
-#ifdef EBOFS_DEBUG_BTREE
- int j;
- for (j=0; j<curnode.size()-1; j++) {
- if (curnode.index_item(j).key == key) break; /* perfect */
- if (curnode.index_item(j+1).key > key) break;
- }
- if (i != j) {
- dbtout << "btree binary search failed" << dendl;
- i = j;
- }
-#endif
-
- cursor.pos[cursor.level] = i;
- dbtout << "find index level " << cursor.level << " node " << curnode.get_id() << " pos " << i
- << " key " << cursor.open[cursor.level].index_item(i).key
- << " value " << cursor.open[cursor.level].index_item(i).node << dendl;
- /* get child node */
- curnode.open(pool, cursor.open[cursor.level].index_item(i).node );
- cursor.open[cursor.level+1] = curnode;
- }
-
- /* search leaf */
- dbtout << "find leaf " << curnode.get_id() << " size " << curnode.size() << dendl;
-
- /* if key=5, we want 2 3 4 [6] 7, or 3 4 [5] 5 6 (err to the right) */
- int left = 0; /* i >= left */
- int right = curnode.size(); /* i < right */
- while (left < right) {
- int i = left + (right - left) / 2;
- if (curnode.leaf_item(i).key < key) {
- left = i + 1;
- } else if (i && curnode.leaf_item(i-1).key >= key) {
- right = i;
- } else {
- left = right = i;
- break;
- }
- }
- int i = left;
-
-
-#ifdef EBOFS_DEBUG_BTREE
- int j;
- for (j=0; j<curnode.size(); j++) {
- if (curnode.leaf_item(j).key >= key) break;
- }
- if (i != j) {
- dbtout << "btree binary search failed" << dendl;
- i = j;
- }
-#endif
-
- cursor.pos[cursor.level] = i; /* first key in this node, or key insertion point */
-
- if (curnode.size() >= i+1) {
- if (curnode.leaf_item(i).key == key) {
- dbtout << "find pos " << i << " match " << curnode.leaf_item(i).key << dendl;
- return Cursor::MATCH; /* it's the actual key */
- } else {
- dbtout << "find pos " << i << " insert " << curnode.leaf_item(i).key << dendl;
- return Cursor::INSERT; /* it's an insertion point */
- }
- }
- dbtout << "find pos " << i << " OOB (end of btree)" << dendl;
- return Cursor::OOB; /* it's the end of the btree (also a valid insertion point) */
- }
-
- int lookup(K key) {
- dbtout << "lookup" << dendl;
- Cursor cursor(this);
- if (find(key, cursor) == Cursor::MATCH)
- return 0;
- return -1;
- }
-
- int lookup(K key, V& value) {
- dbtout << "lookup" << dendl;
- Cursor cursor(this);
- if (find(key, cursor) == Cursor::MATCH) {
- value = cursor.current().value;
- return 0;
- }
- return -1;
- }
-
- int insert(K key, V value) {
- verify("pre-insert");
- dbtout << "insert " << key << " -> " << value << dendl;
- if (almost_full()) return -1;
-
- // empty?
- if (nkeys == 0) {
- if (root.nodeid == -1) {
- // create a root node (leaf!)
- assert(depth == 0);
- Nodeptr newroot( pool.new_node(Node::TYPE_LEAF) );
- root.nodeid = newroot.get_id();
- depth++;
- }
- assert(depth == 1);
- assert(root.nodeid >= 0);
- }
-
- // start at/near key
- Cursor cursor(this);
- find(key, cursor);
-
- // insert loop
- nodeid_t nodevalue = 0;
- while (1) {
-
- /* room in this node? */
- if (cursor.open[cursor.level].size() < cursor.open[cursor.level].max_items()) {
- if (cursor.open[cursor.level].is_leaf())
- cursor.insert( key, value ); // will dirty, etc.
- else {
- // indices are already dirty
- cursor.open[cursor.level].insert_at_index_pos(cursor.pos[cursor.level], key, nodevalue);
- }
- verify("insert 1");
- return 0;
- }
-
- /* this node is full. */
- assert( cursor.open[cursor.level].size() == cursor.open[cursor.level].max_items() );
-
- /* can we rotate? */
-      if (false)  // NO! there's a bug in here somewhere, don't do it.
- if (cursor.level > 0) {
- if ((cursor.pos[cursor.level-1] > 0
- && cursor.rotate_left() >= 0) ||
- (cursor.pos[cursor.level-1] + 1 < cursor.open[cursor.level-1].size()
- && cursor.rotate_right() >= 0)) {
-
- if (cursor.open[cursor.level].is_leaf())
- cursor.insert( key, value ); // will dirty, etc.
- else {
- // indices are already dirty
- cursor.open[cursor.level].insert_at_index_pos(cursor.pos[cursor.level], key, nodevalue);
- }
- verify("insert 2");
- return 0;
- }
- }
-
- /** split node **/
-
- if (cursor.level == depth-1) {
- dbtout << "splitting leaf " << cursor.open[cursor.level].get_id() << dendl;
- } else {
- dbtout << "splitting index " << cursor.open[cursor.level].get_id() << dendl;
- }
-
- cursor.dirty();
-
- // split
- Nodeptr leftnode = cursor.open[cursor.level];
- Nodeptr newnode( pool.new_node(leftnode.node->get_type()) );
- leftnode.split( newnode );
-
- /* insert our item */
- if (cursor.pos[cursor.level] > leftnode.size()) {
- // not with cursor, since this node isn't added yet!
- if (newnode.is_leaf()) {
- newnode.insert_at_leaf_pos( cursor.pos[cursor.level] - leftnode.size(),
- key, value );
- nkeys++;
- } else {
- newnode.insert_at_index_pos( cursor.pos[cursor.level] - leftnode.size(),
- key, nodevalue );
- }
- } else {
- // with cursor (if leaf)
- if (leftnode.is_leaf())
- cursor.insert( key, value );
- else
- leftnode.insert_at_index_pos( cursor.pos[cursor.level],
- key, nodevalue );
- }
-
- /* are we at the root? */
- if (cursor.level == 0) {
- /* split root. */
- dbtout << "that split was the root " << root.nodeid << dendl;
- Nodeptr newroot( pool.new_node(Node::TYPE_INDEX) );
-
- /* new root node */
- newroot.set_size(2);
- newroot.index_item(0).key = leftnode.key(0);
- newroot.index_item(0).node = root.nodeid;
- newroot.index_item(1).key = newnode.key(0);
- newroot.index_item(1).node = newnode.get_id();
-
- /* heighten tree */
- depth++;
- root.nodeid = newroot.get_id();
- verify("insert 3");
- return 0;
- }
-
- /* now insert newindex in level-1 */
- nodevalue = newnode.get_id();
- key = newnode.key(0);
- cursor.level--;
- cursor.pos[cursor.level]++; // ...to the right of leftnode!
- }
- }
-
-
- int remove(K key) {
- verify("pre-remove");
- dbtout << "remove " << key << dendl;
-
- if (almost_full()) {
- cout << "table almost full, failing" << std::endl;
- assert(0);
- return -1;
- }
-
- Cursor cursor(this);
- if (find(key, cursor) <= 0) {
- cerr << "remove " << key << " 0x" << hex << key << dec << " .. dne" << std::endl;
- g_conf.debug_ebofs = 33;
- g_conf.ebofs_verify = true;
- verify("remove dne");
- assert(0);
- return -1; // key dne
- }
-
-
- while (1) {
- dbtout << "preremove level " << cursor.level << " size " << cursor.open[cursor.level].size()
- << " ? min " << cursor.open[cursor.level].min_items() << dendl;
- cursor.remove();
- verify("post-remove");
- dbtout << "postremove level " << cursor.level << " size " << cursor.open[cursor.level].size()
- << " ? min " << cursor.open[cursor.level].min_items() << dendl;
-
- // balance + adjust
-
- if (cursor.level == 0) {
- // useless root index?
- if (cursor.open[0].size() == 1 &&
- depth > 1) {
- depth--;
- root.nodeid = cursor.open[0].index_item(0).node;
- pool.release( cursor.open[0].node );
- }
-
- // note: root can be small, but not empty
- else if (nkeys == 0) {
- assert(cursor.open[cursor.level].size() == 0);
- assert(depth == 1);
- root.nodeid = -1;
- depth = 0;
- if (cursor.open[0].node)
- pool.release(cursor.open[0].node);
- }
- verify("remove 1");
- return 0;
- }
-
- if (cursor.open[cursor.level].size() > cursor.open[cursor.level].min_items()) {
- verify("remove 2");
- dbtout << "remove size " << cursor.open[cursor.level].size()
- << " > min " << cursor.open[cursor.level].min_items() << dendl;
- return 0;
- }
-
- // borrow from siblings?
- Nodeptr left;
- Nodeptr right;
-
- // left?
- if (cursor.pos[cursor.level-1] > 0) {
- int left_loc = cursor.open[cursor.level-1].index_item( cursor.pos[cursor.level-1] - 1).node;
- left.open(pool, left_loc);
-
- if (left.size() > left.min_items()) {
- /* move cursor left, shift right */
- cursor.pos[cursor.level] = 0;
- cursor.open[cursor.level] = left;
- cursor.pos[cursor.level-1]--;
- cursor.rotate_right();
- verify("remove 3");
- return 0;
- }
-
- /* combine to left */
- right = cursor.open[cursor.level];
- }
- else {
- assert(cursor.pos[cursor.level-1] < cursor.open[cursor.level-1].size() - 1);
- int right_loc = cursor.open[cursor.level-1].index_item( cursor.pos[cursor.level-1] + 1 ).node;
- right.open(pool, right_loc );
-
- if (right.size() > right.min_items()) {
- /* move cursor right, shift an item left */
- cursor.pos[cursor.level] = 1;
- cursor.open[cursor.level] = right;
- cursor.pos[cursor.level-1]++;
- cursor.rotate_left();
- verify("remove 4");
- return 0;
- }
-
- /* combine to left */
- left = cursor.open[cursor.level];
- cursor.pos[cursor.level-1]++; /* move cursor to (soon-to-be-empty) right side item */
- }
-
- // note: cursor now points to _right_ node.
-
- /* combine (towards left)
- * (this makes it so our next delete will be in the index
- * interior, which is less scary.)
- */
- dbtout << "combining nodes " << left.get_id() << " and " << right.get_id() << dendl;
-
- left.merge(right);
-
- // dirty left + right
- cursor.dirty(); // right
- if (!left.node->is_dirty()) {
- pool.dirty_node(left.node);
- cursor.open[cursor.level-1].index_item( cursor.pos[cursor.level-1]-1 ).node = left.get_id();
- }
-
- pool.release(right.node);
-
-      cursor.level--;   // now point to the link to the obsolete (right-side) sib
- }
-
- }
-
- void clear(Cursor& cursor, int node_loc, int level) {
- dbtout << "clear" << dendl;
-
- Nodeptr node(pool, node_loc);
- cursor.open[level] = node;
-
- // hose children?
- if (level < depth-1) {
- for (int i=0; i<node.size(); i++) {
- // index
- cursor.pos[level] = i;
- nodeid_t child = cursor.open[level].index_item(i).node;
- clear( cursor, child, level+1 );
- }
- }
-
- // hose myself
- pool.release( node.node );
- }
-
- void clear() {
- Cursor cursor(this);
- if (root.nodeid == -1 && depth == 0) return; // already empty!
- clear(cursor, root.nodeid, 0);
- root.nodeid = -1;
- depth = 0;
- nkeys = 0;
- }
-
- int verify_sub(Cursor& cursor, int node_loc, int level, int& count, K& last, const char *on) {
- int err = 0;
-
- Nodeptr node(pool, node_loc);
- cursor.open[level] = node;
-
- // identify max, min, and validate key range
- K min = node.key(0);
- last = min;
- K max = min;
-
- // print it
- char s[1000];
- strcpy(s," ");
- s[level+1] = 0;
- if (1) {
- if (root.nodeid == node_loc) {
- dbtout << s << "root " << node_loc << ": "
- << node.size() << " / " << node.max_items() << " keys, " << hex << min << "-" << max << dec << dendl;
- } else if (level == depth-1) {
- dbtout << s << "leaf " << node_loc << ": "
- << node.size() << " / " << node.max_items() << " keys, " << hex << min << "-" << max << dec << dendl;
- } else {
- dbtout << s << "indx " << node_loc << ": "
- << node.size() << " / " << node.max_items() << " keys, " << hex << min << "-" << max << dec << dendl;
- }
- if (1) {
- for (int i=0; i<node.size(); i++) {
- if (level < depth-1) { // index
- dbtout << s << " " << hex << node.key(i) << " [" << node.index_item(i).node << "]" << dec << dendl;
- } else { // leaf
- dbtout << s << " " << hex << node.key(i) << " -> " << node.leaf_item(i).value << dec << dendl;
- }
- }
- }
- }
-
- for (int i=0; i<node.size(); i++) {
- if (i && node.key(i) <= last) {
- dbtout << ":: key " << i << " " << hex << node.key(i) << dec << " in node " << node_loc
- << " is out of order, last is " << hex << last << dec << dendl;
- err++;
- }
- if (node.key(i) > max)
- max = node.key(i);
-
- if (level < depth-1) {
- // index
- cursor.pos[level] = i;
- err += verify_sub( cursor, cursor.open[level].index_item(i).node, level+1, count, last, on );
- } else {
- // leaf
- count++;
- last = node.key(i);
- }
- }
-
- if (level) {
- // verify that parent's keys are appropriate
- if (min != cursor.open[level-1].index_item(cursor.pos[level-1]).key) {
- dbtout << ":: key in index node " << cursor.open[level-1].get_id()
- << " != min in child " << node_loc
- << "(key is " << hex << cursor.open[level-1].index_item(cursor.pos[level-1]).key
- << ", min is " << min << ")" << dec << dendl;
- err++;
- }
- if (cursor.pos[level-1] < cursor.open[level-1].size()-1) {
- if (max > cursor.open[level-1].index_item(1+cursor.pos[level-1]).key) {
- dbtout << ":: next key in index node " << cursor.open[level-1].get_id()
- << " < max in child " << node_loc
- << "(key is " << hex << cursor.open[level-1].index_item(1+cursor.pos[level-1]).key
- << ", max is " << max << ")" << dec << dendl;
- err++;
- }
- }
- }
-
- if (err == 0) return err;
-
- // print it
- /*
- char s[1000];
- strcpy(s," ");
- s[level+1] = 0;
- if (1) {
- if (root.nodeid == node_loc) {
- dbtout << s << "root " << node_loc << ": "
- << node.size() << " / " << node.max_items() << " keys, " << hex << min << "-" << max << dec << dendl;
- } else if (level == depth-1) {
- dbtout << s << "leaf " << node_loc << ": "
- << node.size() << " / " << node.max_items() << " keys, " << hex << min << "-" << max << dec << dendl;
- } else {
- dbtout << s << "indx " << node_loc << ": "
- << node.size() << " / " << node.max_items() << " keys, " << hex << min << "-" << max << dec << dendl;
- }
-
- if (1) {
- for (int i=0; i<node.size(); i++) {
- if (level < depth-1) { // index
- dbtout << s << " " << hex << node.key(i) << " [" << node.index_item(i).node << "]" << dec << dendl;
- } else { // leaf
- dbtout << s << " " << hex << node.key(i) << " -> " << node.leaf_item(i).value << dec << dendl;
- }
- }
- }
- }*/
-
- return err;
- }
-
- void verify(const char *on) {
- if (!g_conf.ebofs_verify)
- return;
-
- if (root.nodeid == -1 && depth == 0) {
- return; // empty!
- }
-
- int count = 0;
- Cursor cursor(this);
- K last;
-
- int before = g_conf.debug_ebofs;
- g_conf.debug_ebofs = 0;
-
- int err = verify_sub(cursor, root.nodeid, 0, count, last, on);
- if (count != nkeys) {
- cerr << "** count " << count << " != nkeys " << nkeys << std::endl;
- err++;
- }
-
- g_conf.debug_ebofs = before;
-
- // ok?
- if (err) {
- cerr << "verify failure, called by '" << on << "'" << std::endl;
- g_conf.debug_ebofs = 30;
- // do it again, so we definitely get the dump.
- int count = 0;
- Cursor cursor(this);
- K last;
- verify_sub(cursor, root.nodeid, 0, count, last, on);
- assert(err == 0);
- }
- }
-
-};
-
-
-#endif
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#ifndef CEPH_EBOFS_CSUM_H
-#define CEPH_EBOFS_CSUM_H
-
-typedef uint64_t csum_t;
-
-/*
- * physically and logically aligned buffer. yay.
- */
-inline uint64_t calc_csum(const char *start, int len) {
- // must be 64-bit aligned
- assert(((unsigned long)start & 7) == 0);
- assert((len & 7) == 0);
-
- uint64_t *p = (uint64_t*)start;
- uint64_t *end = (uint64_t*)(start + len);
- uint64_t csum = 0;
- while (p < end) {
- csum += *p;
- p++;
- }
- return csum;
-}
-
-/*
- * arbitrarily aligned buffer. buffer alignment must match logical alignment.
- * i.e., buffer content is aligned, but has non-aligned boundaries.
- */
-inline uint64_t calc_csum_unaligned(const char *start, int len) {
- const char *end = start + len;
- uint64_t csum = 0;
-
- // front
- while (start < end && (unsigned long)start & 7) {
- csum += (uint64_t)(*start) << (8*(8 - ((unsigned long)start & 7)));
- start++;
- }
- if (start == end)
- return csum;
-
- // middle, aligned
- const char *fastend = end - 7;
- while (start < fastend) {
- csum += *(uint64_t*)start;
- start += sizeof(uint64_t);
- }
-
- // tail
- while (start < end) {
- csum += (uint64_t)(*start) << (8*(8 - ((unsigned long)start & 7)));
- start++;
- }
- return csum;
-}
-
-
-/*
- * arbitrarily aligned buffer, with arbitrary logical alignment
- */
-inline uint64_t calc_csum_realign(const char *start, int len, int off) {
- const char *end = start + len;
- uint64_t csum = 0;
-
- if (((unsigned long)start & 7) == ((unsigned long)off & 7))
- return calc_csum_unaligned(start, len); // lucky us, start and off alignment matches.
-
- // do it the slow way. yucky!
- while (start < end) {
- csum += (uint64_t)(*start) << (8*(8 - (off & 7)));
- start++; off++;
- }
- return csum;
-}
-
-#endif
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-
-
-#include <iostream>
-#include "ebofs/Ebofs.h"
-
-
-int main(int argc, const char **argv)
-{
- // args
- vector<const char*> args;
- argv_to_vec(argc, argv, args);
- parse_config_options(args);
-
- if (args.size() < 1) {
- cerr << "usage: mkfs.ebofs [options] <device file>" << std::endl;
- return -1;
- }
- const char *filename = args[0];
-
- // mkfs
- Ebofs mfs(filename);
- int r = mfs.mkfs();
- if (r < 0) exit(r);
-
- if (args.size() > 1) { // pass an extra arg of some sort to trigger the test crapola
- // test-o-rama!
- Ebofs fs(filename);
- fs.mount();
-
- // zillion objects
- if (1) {
- char crap[1024*1024];
- memset(crap, 0, 1024*1024);
- bufferlist bl;
- int sz = 10000;
- bl.append(crap, sz);
-
- int n = 100000;
- utime_t start = g_clock.now();
- for (int i=0; i<n; i++) {
- if (i && i % 1000 == 0) {
- utime_t now = g_clock.now();
- utime_t end = now;
- end -= start;
- start = now;
- cout << i << " / " << n << " in " << end << std::endl;
- }
- pobject_t poid(0, 0, object_t(i,0));
- fs.write(0, poid, 0, sz, bl, (Context*)0);
- }
- }
-
- // streaming write test
- if (0) {
- char crap[1024*1024];
- memset(crap, 0, 1024*1024);
-
-
- pobject_t oid(0, 0, object_t(1,2));
- uint64_t pos = 0;
- uint64_t sz = 16;
-
- bufferlist bl;
- bl.append(crap, sz);
-
- struct timespec ts;
- ts.tv_sec = 0;
- ts.tv_nsec = 1000*1000*40; // ms -> nsec
-
- while (1) {
- cout << g_clock.now() << " writing " << pos << "~" << sz << std::endl;
- fs.write(0, oid, pos, sz, bl, (Context*)0);
- pos += sz;
- nanosleep(&ts, 0);
- }
-
- }
-
- /*
- if (1) {
- // partial write tests
- char crap[1024*1024];
- memset(crap, 0, 1024*1024);
-
- bufferlist small;
- small.append(crap, 10);
- bufferlist med;
- med.append(crap, 1000);
- bufferlist big;
- big.append(crap, 1024*1024);
-
- cout << "0" << std::endl;
- fs.write(10, 0, 1024*1024, big, (Context*)0);
- fs.sync();
- fs.trim_buffer_cache();
-
- cout << "1" << std::endl;
- fs.write(10, 10, 10, small, 0);
- fs.write(10, 1, 1000, med, 0);
- fs.sync();
- fs.trim_buffer_cache();
-
- cout << "2" << std::endl;
- fs.write(10, 10, 10, small, 0);
- //fs.sync();
- fs.write(10, 1, 1000, med, 0);
- fs.sync();
- fs.trim_buffer_cache();
-
- cout << "3" << std::endl;
- fs.write(10, 1, 1000, med, 0);
- fs.write(10, 10000, 10, small, 0);
- fs.truncate(10, 100, 0);
- fs.sync();
- fs.trim_buffer_cache();
-
- cout << "4" << std::endl;
- fs.remove(10);
- fs.sync();
- fs.write(10, 10, 10, small, 0);
- fs.sync();
- fs.write(10, 1, 1000, med, 0);
- fs.sync();
- fs.truncate(10, 100, 0);
- fs.write(10, 10, 10, small, 0);
- fs.trim_buffer_cache();
-
-
-
- }
-
- if (0) { // onode write+read test
- bufferlist bl;
- char crap[1024*1024];
- memset(crap, 0, 1024*1024);
- bl.append(crap, 10);
-
- fs.write(10, 10, 0, bl, (Context*)0);
- fs.umount();
-
- Ebofs fs2(filename);
- fs2.mount();
- fs2.read(10, 10, 0, bl);
- fs2.umount();
-
- return 0;
- }
-
-
- if (0) { // small write + read test
- bufferlist bl;
- char crap[1024*1024];
- memset(crap, 0, 1024*1024);
-
- object_t oid = 10;
- int n = 10000;
- int l = 128;
- bl.append(crap, l);
-
-
- char *p = bl.c_str();
- uint64_t o = 0;
- for (int i=0; i<n; i++) {
- cout << "write at " << o << std::endl;
- for (int j=0;j<l;j++)
- p[j] = (char)(oid^(o+j));
- fs.write(oid, l, o, bl, (Context*)0);
- o += l;
- }
-
- fs.sync();
- fs.trim_buffer_cache();
-
- o = 0;
- for (int i=0; i<n; i++) {
- cout << "read at " << o << std::endl;
- bl.clear();
- fs.read(oid, l, o, bl);
-
- char b[l];
- bl.copy(0, l, b);
- char *p = b;
- int left = l;
- while (left--) {
- assert(*p == (char)(o ^ oid));
- o++;
- p++;
- }
- }
-
- }
-
- if (0) { // big write speed test
- bufferlist bl;
- char crap[1024*1024];
- memset(crap, 0, 1024*1024);
- bl.append(crap, 1024*1024);
-
- int megs = 1000;
-
- utime_t start = g_clock.now();
-
- for (uint64_t m=0; m<megs; m++) {
- //if (m%100 == 0)
- cout << m << " / " << megs << std::endl;
- fs.write(10, bl.length(), 1024LL*1024LL*m, bl, (Context*)0);
- }
- fs.sync();
-
- utime_t end = g_clock.now();
- end -= start;
-
- cout << "elapsed " << end << std::endl;
-
- float mbs = (float)megs / (float)end;
- cout << "mb/s " << mbs << std::endl;
- }
-
- if (0) { // test
- bufferlist bl;
- char crap[10000];
- memset(crap, 0, 10000);
- bl.append(crap, 10000);
- fs.write(10, bl.length(), 200, bl, (Context*)0);
- fs.trim_buffer_cache();
- fs.write(10, bl.length(), 5222, bl, (Context*)0);
- sleep(1);
- fs.trim_buffer_cache();
- fs.write(10, 5000, 3222, bl, (Context*)0);
- }
-
- // test small writes
- if (0) {
- char crap[1024*1024];
- memset(crap, 0, 1024*1024);
- bufferlist bl;
- bl.append(crap, 1024*1024);
-
-      // random write
- if (1) {
- srand(0);
- for (int i=0; i<10000; i++) {
- uint64_t off = rand() % 1000000;
- size_t len = 1+rand() % 10000;
- cout << std::endl << i << " writing bit at " << off << " len " << len << std::endl;
- fs.write(10, len, off, bl, (Context*)0);
- //fs.sync();
- //fs.trim_buffer_cache();
- }
- fs.remove(10);
- for (int i=0; i<100; i++) {
- uint64_t off = rand() % 1000000;
- size_t len = 1+rand() % 10000;
- cout << std::endl << i << " writing bit at " << off << " len " << len << std::endl;
- fs.write(10, len, off, bl, (Context*)0);
- //fs.sync();
- //fs.trim_buffer_cache();
- }
- }
-
- if (0) {
- // sequential write
- srand(0);
- uint64_t off = 0;
- for (int i=0; i<10000; i++) {
- size_t len = 1024*1024;//1+rand() % 10000;
- cout << std::endl << i << " writing bit at " << off << " len " << len << std::endl;
- fs.write(10, len, off, bl, (Context*)0);
- off += len;
- }
-
- }
-
-
- if (0) {
- // read
- srand(0);
- for (int i=0; i<100; i++) {
- bufferlist bl;
- uint64_t off = rand() % 1000000;
- size_t len = rand() % 1000;
- cout << std::endl << "read bit at " << off << " len " << len << std::endl;
- int r = fs.read(10, len, off, bl);
- assert(bl.length() == len);
- assert(r == (int)len);
- }
- }
-
- // flush
- fs.sync();
- fs.trim_buffer_cache();
- //fs.trim_buffer_cache();
-
- if (0) {
- // read again
- srand(0);
- for (int i=0; i<100; i++) {
- bufferlist bl;
- uint64_t off = rand() % 1000000;
- size_t len = 100;
- cout << std::endl << "read bit at " << off << " len " << len << std::endl;
- int r = fs.read(10, len, off, bl);
- assert(bl.length() == len);
- assert(r == (int)len);
- }
-
- // flush
- fs.sync();
- fs.trim_buffer_cache();
- }
-
- if (0) {
- // write on empty cache
- srand(0);
- for (int i=0; i<100; i++) {
- uint64_t off = rand() % 1000000;
- size_t len = 100;
- cout << std::endl << "writing bit at " << off << " len " << len << std::endl;
- fs.write(10, len, off, bl, (Context*)0);
- }
- }
-
- }
- */
-
- fs.sync();
- fs.trim_buffer_cache();
-
- fs.umount();
- }
-
- return 0;
-}
-
-
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-
-#ifndef CEPH_EBOFS_NODES_H
-#define CEPH_EBOFS_NODES_H
-
-/** nodes, node regions **/
-
-#include "types.h"
-#include "BlockDevice.h"
-#include "include/xlist.h"
-#include "include/bitmapper.h"
-
-/*
-
- disk wire memory
-
- free free -> free can alloc
- free used -> dirty can modify
-
- free used used -> clean
- free used free -> limbo
-
- used used -> clean
- used free -> limbo
-
-
- // meaningless
- used free free -> free can alloc
- used free used __DNE__
-
-
-*/
-
-#undef debofs
-#define debofs(x) do { if (x <= g_conf.debug_ebofs) { \
- _dout_begin_line(x); *_dout << "ebofs.nodepool."
-
-
-class Node {
- public:
- // bit fields
- static const int STATE_CLEAN = 1;
- static const int STATE_DIRTY = 2;
-
- static const int ITEM_LEN = EBOFS_NODE_BYTES - sizeof(int) - sizeof(int) - sizeof(int);
-
- static const int TYPE_INDEX = 1;
- static const int TYPE_LEAF = 2;
-
- protected:
- nodeid_t id;
- int pos_in_bitmap; // position in bitmap
- int state; // use bit fields above!
-
- bufferptr bptr;
-
- // in disk buffer
- int *type;
- int *nrecs;
-
- public:
- xlist<Node*>::item xlist; // dirty
-
- vector<Node*> children;
-
- Node(nodeid_t i, int pib, bufferptr& b, int s) :
- id(i), pos_in_bitmap(pib),
- state(s), bptr(b), xlist(this) {
- setup_pointers();
- }
-
- void setup_pointers() {
- nrecs = (int*)(bptr.c_str());
- type = (int*)(bptr.c_str() + sizeof(*nrecs));
- }
-
- bool do_cow() {
- if (bptr.do_cow()) {
- setup_pointers();
- return true;
- }
- return false;
- }
-
-
- // id
- nodeid_t get_id() const { return id; }
- void set_id(nodeid_t n) { id = n; }
- int get_pos_in_bitmap() const { return pos_in_bitmap; }
- void set_pos_in_bitmap(int i) { pos_in_bitmap = i; }
-
- // buffer
- bufferptr& get_buffer() { return bptr; }
-
- char *item_ptr() { return bptr.c_str() + sizeof(*nrecs) + sizeof(*type); }
-
- // size
- int size() { return *nrecs; }
- void set_size(int s) { *nrecs = s; }
-
- // type
- int& get_type() { return *type; }
- void set_type(int t) { *type = t; }
- bool is_index() { return *type == TYPE_INDEX; }
- bool is_leaf() { return *type == TYPE_LEAF; }
-
-
- // state
- bool is_dirty() { return state == STATE_DIRTY; }
- bool is_clean() { return state == STATE_CLEAN; }
-
- void set_state(int s) { state = s; }
-
-};
-
-
-
-
-
-class NodePool {
- protected:
- //hash_map<nodeid_t, Node*, rjhash<uint64_t> > node_map; // open node map
- unordered_map<nodeid_t, Node*, rjhash<uint64_t> > node_map; // open node map
- //map<nodeid_t, Node*> node_map;
-
- public:
- vector<extent_t> region_loc; // region locations
- extent_t usemap_even;
- extent_t usemap_odd;
-
- buffer::ptr usemap_data;
- bitmapper usemap_bits;
-
- protected:
- // on-disk block states
- int num_nodes;
- int num_dirty;
- int num_clean;
- int num_free;
- int num_limbo;
-
- xlist<Node*> dirty_ls;
- interval_set<nodeid_t> free;
- interval_set<nodeid_t> limbo;
-
- Mutex &ebofs_lock;
- Cond commit_cond;
- int flushing;
-
- nodeid_t make_nodeid(int region, int offset) {
- return region_loc[region].start + (block_t)offset;
- }
- int nodeid_pos_in_bitmap(nodeid_t nid) {
- unsigned region;
- int num = 0;
- for (region = 0;
- (block_t)nid < region_loc[region].start || (block_t)nid > region_loc[region].end();
- region++) {
- //generic_dout(0) << "node " << nid << " not in " << region << " " << region_loc[region] << dendl;
- num += region_loc[region].length;
- }
- num += nid - region_loc[region].start;
- //generic_dout(0) << "node " << nid << " is in " << region << ", overall bitmap pos is " << num << dendl;
- return num;
- }
-
-
- public:
- NodePool(Mutex &el) :
- num_nodes(0),
- num_dirty(0), num_clean(0), num_free(0), num_limbo(0),
- ebofs_lock(el),
- flushing(0) {}
- ~NodePool() {
- // nodes
- release_all();
- }
-
- int get_num_free() { return num_free; }
- int get_num_dirty() { return num_dirty; }
- int get_num_limbo() { return num_limbo; }
- int get_num_clean() { return num_clean; }
- int get_num_total() { return num_nodes; }
- int get_num_used() { return num_clean + num_dirty; }
-
- int get_usemap_len(int n=0) {
- if (n == 0) n = num_nodes;
- return ((n-1) / 8 / EBOFS_BLOCK_SIZE) + 1;
- }
-
- unsigned num_regions() { return region_loc.size(); }
-
- // the caller had better adjust usemap locations...
- void add_region(extent_t ex) {
- assert(region_loc.size() < EBOFS_MAX_NODE_REGIONS);
- region_loc.push_back(ex);
- free.insert(ex.start, ex.length);
- num_free += ex.length;
- num_nodes += ex.length;
- }
-
- void init_usemap() {
- usemap_data = buffer::create_page_aligned(EBOFS_BLOCK_SIZE*usemap_even.length);
- usemap_data.zero();
- usemap_bits.set_data(usemap_data.c_str(), usemap_data.length());
- }
-
- void expand_usemap() {
- block_t have = usemap_data.length() / EBOFS_BLOCK_SIZE;
- if (have < usemap_even.length) {
- // use bufferlist to copy/merge two chunks
- bufferlist bl;
- bl.push_back(usemap_data);
- bufferptr newbit = buffer::create_page_aligned(EBOFS_BLOCK_SIZE*(usemap_even.length - have));
- newbit.zero();
- bl.push_back(newbit);
- bl.rebuild();
- assert(bl.buffers().size() == 1);
- usemap_data = bl.buffers().front();
- usemap_bits.set_data(usemap_data.c_str(), usemap_data.length());
- }
- }
-
-
-
- int init(struct ebofs_nodepool *np) {
- // regions
- assert(region_loc.empty());
- num_nodes = 0;
- for (unsigned i=0; i<np->num_regions; i++) {
- debofs(3) << "init region " << i << " at " << np->region_loc[i] << dendl;
- region_loc.push_back( np->region_loc[i] );
- num_nodes += np->region_loc[i].length;
- }
-
- // usemap
- usemap_even = np->node_usemap_even;
- usemap_odd = np->node_usemap_odd;
- debofs(3) << "init even map at " << usemap_even << dendl;
- debofs(3) << "init odd map at " << usemap_odd << dendl;
-
- init_usemap();
- return 0;
- }
-
- void close() {
- release_all();
-
- region_loc.clear();
-
- num_free = 0;
- num_dirty = 0;
- num_clean = 0;
- num_limbo = 0;
- dirty_ls.clear();
-
- free.clear();
- limbo.clear();
-
- flushing = 0;
- node_map.clear();
- }
-
-
- // *** blocking i/o routines ***
-
- int read_usemap_and_clean_nodes(BlockDevice& dev, version_t epoch) {
- // read map
- extent_t loc;
- if (epoch & 1)
- loc = usemap_odd;
- else
- loc = usemap_even;
-
- // usemap
- dev.read(loc.start, loc.length, usemap_data);
-
- // nodes
- unsigned region = 0;
- unsigned region_pos = 0;
- for (int i=0; i<num_nodes; i++) {
- nodeid_t nid = make_nodeid(region, region_pos);
- region_pos++;
- if (region_pos == region_loc[region].length) {
- region_pos = 0;
- region++;
- }
-
- if (usemap_bits[i]) {
- num_clean++;
- bufferptr bp = buffer::create_page_aligned(EBOFS_NODE_BYTES);
- dev.read((block_t)nid, EBOFS_NODE_BLOCKS, bp);
-
- Node *n = new Node(nid, i, bp, Node::STATE_CLEAN);
- node_map[nid] = n;
- debofs(10) << "ebofs.nodepool.read node " << nid << " at " << (void*)n << dendl;
-
- } else {
- //debofs(10) << "ebofs.nodepool.read node " << nid << " is free" << dendl;
- free.insert(nid);
- num_free++;
- }
- }
- debofs(10) << "ebofs.nodepool.read free is " << free.m << dendl;
- assert(num_dirty == 0);
- assert(num_limbo == 0);
- assert(num_clean + num_free == num_nodes);
-
- return 0;
- }
-
-
- // **** non-blocking i/o ****
-
- private:
- class C_NP_FlushUsemap : public BlockDevice::callback {
- NodePool *pool;
- public:
- C_NP_FlushUsemap(NodePool *p) :
- pool(p) {}
- void finish(ioh_t ioh, int r) {
- pool->flushed_usemap();
- }
- };
-
- void flushed_usemap() {
- ebofs_lock.Lock();
- flushing--;
- if (flushing == 0)
- commit_cond.Signal();
- ebofs_lock.Unlock();
- }
-
- public:
- int write_usemap(BlockDevice& dev, version_t version) {
- // alloc
- extent_t loc;
- if (version & 1)
- loc = usemap_odd;
- else
- loc = usemap_even;
-
- // write
- bufferlist bl;
- bufferptr bp = usemap_data.clone();
- bl.append(bp);
- dev.write(loc.start, loc.length, bl,
- new C_NP_FlushUsemap(this), "usemap");
- return 0;
- }
-
-
-
- // *** node commit ***
- private:
-
- class C_NP_FlushNode : public BlockDevice::callback {
- NodePool *pool;
- nodeid_t nid;
- public:
- C_NP_FlushNode(NodePool *p, nodeid_t n) :
- pool(p), nid(n) {}
- void finish(ioh_t ioh, int r) {
- pool->flushed_node(nid);
- }
- };
-
- void flushed_node(nodeid_t nid) {
- ebofs_lock.Lock();
- flushing--;
- if (flushing == 0)
- commit_cond.Signal();
- ebofs_lock.Unlock();
- }
-
- public:
- void commit_start(BlockDevice& dev, version_t version) {
- debofs(20) << "ebofs.nodepool.commit_start start dirty=" << dirty_ls.size() << dendl;
-
- assert(flushing == 0);
- /*if (0)
- for (unsigned i=0; i<region_loc.size(); i++) {
- int c = dev.count_io(region_loc[i].start, region_loc[i].length);
- generic_dout(20) << "ebofs.nodepool.commit_start region " << region_loc[i] << " has " << c << " ios" << dendl;
- assert(c == 0);
- }
- */
-
- // write map
- flushing++;
- write_usemap(dev, version & 1);
-
- // dirty -> clean (write to disk)
- while (!dirty_ls.empty()) {
- Node *n = dirty_ls.front();
- assert(n);
- assert(n->is_dirty());
- n->set_state(Node::STATE_CLEAN);
- dirty_ls.remove(&n->xlist);
- num_dirty--;
- num_clean++;
-
- bufferlist bl;
- if (1) {
- bufferptr bp = n->get_buffer().clone(); // dup it now
- bl.append(bp);
- } else {
- bl.append(n->get_buffer()); // this isn't working right .. fixme
- }
-
- debofs(20) << "ebofs.nodepool.commit_start writing node " << n->get_id()
- << " " << (void*)bl.c_str()
- << dendl;
-
- dev.write(n->get_id(), EBOFS_NODE_BLOCKS,
- bl,
- new C_NP_FlushNode(this, n->get_id()), "node");
- flushing++;
- }
-
- // limbo -> free
- for (map<nodeid_t,nodeid_t>::iterator i = limbo.m.begin();
- i != limbo.m.end();
- i++) {
- num_free += i->second;
- num_limbo -= i->second;
- free.insert(i->first, i->second);
- debofs(20) << "ebofs.nodepool.commit_finish " << i->first << "~" << i->second << " limbo->free" << dendl;
- }
- limbo.clear();
-
- debofs(20) << "ebofs.nodepool.commit_start finish" << dendl;
- }
-
- void commit_wait() {
- while (flushing > 0)
- commit_cond.Wait(ebofs_lock);
- debofs(20) << "ebofs.nodepool.commit_wait finish" << dendl;
- }
-
- void commit_finish() {
- }
-
-
-
-
-
- // *** nodes ***
- // opened node
- Node* get_node(nodeid_t nid) {
- //dbtout << "pool.get " << nid << dendl;
- assert(node_map.count(nid));
- return node_map[nid];
- }
-
- // allocate id/block on disk. always free -> dirty.
- nodeid_t alloc_id() {
- // pick node id
- assert(!free.empty());
- nodeid_t nid = free.start();
- free.erase(nid);
- num_free--;
- return nid;
- }
-
- // new node
- Node* new_node(int type) {
- nodeid_t nid = alloc_id();
- debofs(15) << "ebofs.nodepool.new_node " << nid << dendl;
-
- // alloc node
- bufferptr bp = buffer::create_page_aligned(EBOFS_NODE_BYTES);
- bp.zero();
- Node *n = new Node(nid, nodeid_pos_in_bitmap(nid), bp, Node::STATE_DIRTY);
- n->set_type(type);
- n->set_size(0);
-
- usemap_bits.set(n->get_pos_in_bitmap());
-
- n->set_state(Node::STATE_DIRTY);
- dirty_ls.push_back(&n->xlist);
- num_dirty++;
-
- assert(node_map.count(nid) == 0);
- node_map[nid] = n;
-
- return n;
- }
-
- void release(Node *n) {
- const nodeid_t nid = n->get_id();
- node_map.erase(nid);
-
- if (n->is_dirty()) {
- debofs(15) << "ebofs.nodepool.release on " << nid << " to free" << dendl;
- dirty_ls.remove(&n->xlist);
- num_dirty--;
- free.insert(nid);
- num_free++;
- usemap_bits.clear(n->get_pos_in_bitmap());
- } else if (n->is_clean()) {
- debofs(15) << "ebofs.nodepool.release on " << nid << " to limbo" << dendl;
- limbo.insert(nid);
- num_limbo++;
- num_clean--;
- usemap_bits.clear(n->get_pos_in_bitmap());
- } else {
- debofs(15) << "ebofs.nodepool.release on " << nid << " to nowhere?" << dendl;
- }
-
- delete n;
- assert(num_clean + num_dirty + num_limbo + num_free == num_nodes);
- }
-
- void release_all() {
- while (!node_map.empty()) {
- //hash_map<nodeid_t,Node*,rjhash<uint64_t> >::iterator i = node_map.begin();
- unordered_map<nodeid_t,Node*,rjhash<uint64_t> >::iterator i = node_map.begin();
- //map<nodeid_t,Node*>::iterator i = node_map.begin();
- debofs(2) << "ebofs.nodepool.release_all leftover " << i->first << " " << i->second << dendl;
- release( i->second );
- }
- assert(node_map.empty());
- }
-
- void dirty_node(Node *n) {
- // get new node id?
- nodeid_t oldid = n->get_id();
- nodeid_t newid = alloc_id();
- debofs(15) << "ebofs.nodepool.dirty_node on " << oldid << " now " << newid << dendl;
-
- // dup data?
- // this only does a memcpy if there are multiple references..
- // i.e. if we are still writing the old data
- if (n->do_cow()) {
- //assert(0); //i'm duping on write
- debofs(15) << "ebofs.nodepool.dirty_node did cow on " << oldid << " now " << newid << dendl;
- //cerr << "ebofs.nodepool.dirty_node did cow on " << oldid << " now " << newid << dendl;
- }
-
- // release old block
- assert(n->is_clean());
- debofs(15) << "ebofs.nodepool.dirty_node releasing old " << oldid << " to limbo" << dendl;
- num_clean--;
- limbo.insert(oldid);
- num_limbo++;
- usemap_bits.clear(n->get_pos_in_bitmap());
-
- // rename node
- node_map.erase(oldid);
- n->set_id(newid);
- n->set_pos_in_bitmap(nodeid_pos_in_bitmap(newid));
- node_map[newid] = n;
-
- // new block
- n->set_state(Node::STATE_DIRTY);
- dirty_ls.push_back(&n->xlist);
- debofs(15) << "ebofs.nodepool.dirty_node added to dirty list, len now " << dirty_ls.size() << dendl;
- num_dirty++;
- usemap_bits.set(n->get_pos_in_bitmap());
-
- assert(num_clean + num_dirty + num_limbo + num_free == num_nodes);
- }
-
-
-};
-
-#endif
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-
-
-#include <iostream>
-#include "ebofs/Ebofs.h"
-
-bool stop = false;
-
-
-char fingerprint_byte_at(int pos, int seed)
-{
- uint64_t big = ((pos & ~7) / 133) ^ seed;
- return ((char*)&big)[pos & 7];
-}
-
-
-int nt = 0;
-class Tester : public Thread {
- Ebofs &fs;
- int t;
-
- //char b[1024*1024];
-
-public:
- Tester(Ebofs &e) : fs(e), t(nt) { nt++; }
- void *entry() {
-
- while (!stop) {
- pobject_t oid;
- oid.oid.ino = (rand() % 1000) + 0x10000000;
- coll_t cid = rand() % 50;
- uint64_t off = rand() % 10000;//0;//rand() % 1000000;
- uint64_t len = 1+rand() % 100000;
- const char *a = "one";
- if (rand() % 2) a = "two";
- int l = 3;//rand() % 10;
-
- switch (rand() % 5) {//10) {
- case 0:
- {
- oid.oid.snap = rand() % 10;
- cout << t << " read " << hex << oid << dec << " at " << off << " len " << len << std::endl;
- bufferlist bl;
- fs.read(0, oid, off, len, bl);
- int l = MIN(len,bl.length());
- if (l) {
- cout << t << " got " << l << std::endl;
- char *p = bl.c_str();
- while (l--) {
- char want = fingerprint_byte_at(off, oid.oid.ino);
- if (*p != 0 && *p != want) {
- cout << t << " bad fingerprint at " << off << " got " << (int)*p << " want " << (int)want << std::endl;
- assert(0);
- }
- off++;
- p++;
- }
- }
- }
- break;
-
- case 1:
- {
- cout << t << " write " << hex << oid << dec << " at " << off << " len " << len << std::endl;
- char b[len];
- for (unsigned j=0;j<len;j++)
- b[j] = fingerprint_byte_at(off+j, oid.oid.ino);
- bufferlist w;
- w.append(b, len);
- fs.write(0, oid, off, len, w, 0);
- }
- break;
-
- case 2:
- {
- cout << t << " zero " << hex << oid << dec << " at " << off << " len " << len << std::endl;
- fs.zero(0, oid, off, len, 0);
- }
- break;
-
- case 3:
- {
- cout << t << " truncate " << hex << oid << dec << " " << off << std::endl;
- fs.truncate(0, oid, 0);
- }
- break;
-
- case 4:
- cout << t << " remove " << hex << oid << dec << std::endl;
- fs.remove(0, oid);
- break;
-
- case 5:
- cout << t << " collection_add " << hex << oid << dec << " to " << cid << std::endl;
- fs.collection_add(cid, 0, oid, 0);
- break;
-
- case 6:
- cout << t << " collection_remove " << hex << oid << dec << " from " << cid << std::endl;
- fs.collection_remove(cid, oid, 0);
- break;
-
- case 7:
- cout << t << " setattr " << hex << oid << dec << " " << a << " len " << l << std::endl;
- fs.setattr(0, oid, a, (void*)a, l, 0);
- break;
-
- case 8:
- cout << t << " rmattr " << hex << oid << dec << " " << a << std::endl;
- fs.rmattr(0, oid, a);
- break;
-
- case 9:
- {
- char v[4];
- cout << t << " getattr " << hex << oid << dec << " " << a << std::endl;
- if (fs.getattr(0, oid,a,(void*)v,3) == 0) {
- v[3] = 0;
- assert(strcmp(v,a) == 0);
- }
- }
- break;
-
- case 10:
- {
- pobject_t newoid = oid;
- newoid.oid.snap = rand() % 10;
- cout << t << " clone " << oid << " to " << newoid << std::endl;
- fs.clone(0, oid, newoid, 0);
- }
- }
-
-
- }
- cout << t << " done" << std::endl;
- return 0;
- }
-};
-
-int main(int argc, const char **argv)
-{
- vector<const char*> args;
- argv_to_vec(argc, argv, args);
- parse_config_options(args);
-
- // args
- if (args.size() != 3) return -1;
- const char *filename = args[0];
- int seconds = atoi(args[1]);
- int threads = atoi(args[2]);
- if (!threads) threads = 1;
-
- cout << "dev " << filename << " .. " << threads << " threads .. " << seconds << " seconds" << std::endl;
-
- Ebofs fs(filename);
- if (fs.mount() < 0) return -1;
-
-
- // explicit tests
- if (0) {
- // verify that clone() plays nice with partial writes
- pobject_t oid(0, 0, object_t(1,1));
- bufferptr bp(10000);
- bp.zero();
- bufferlist bl;
- bl.push_back(bp);
- fs.write(0, oid, 0, 10000, bl, 0);
-
- fs.sync();
- fs.trim_buffer_cache();
-
- // induce a partial write
- bufferlist bl2;
- bl2.substr_of(bl, 0, 100);
- fs.write(0, oid, 100, 100, bl2, 0);
-
- // clone it
- pobject_t oid2;
- oid2 = oid;
- oid2.oid.snap = 1;
- fs.clone(0, oid, oid2, 0);
-
- // ...
- if (0) {
- // make sure partial still behaves after orig is removed...
- fs.remove(0, oid, 0);
-
- // or i read for oid2...
- bufferlist rbl;
- fs.read(0, oid2, 0, 200, rbl);
- }
- if (1) {
- // make sure things behave if we remove the clone
- fs.remove(0, oid2,0);
- }
- }
- // /explicit tests
-
- list<Tester*> ls;
- for (int i=0; i<threads; i++) {
- Tester *t = new Tester(fs);
- t->create();
- ls.push_back(t);
- }
-
- utime_t now = g_clock.now();
- utime_t dur(seconds, 0);
- utime_t end = now + dur;
- cout << "stop at " << end << std::endl;
- while (now < end) {
- sleep(1);
- now = g_clock.now();
- //cout << now << std::endl;
- }
-
- cout << "stopping" << std::endl;
- stop = true;
-
- while (!ls.empty()) {
- Tester *t = ls.front();
- ls.pop_front();
- t->join();
- delete t;
- }
-
- fs.umount();
- return 0;
-}
-
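
(The stress tester deleted above never stores what it wrote: every byte is a pure function of (offset, object ino) via fingerprint_byte_at(), so the read case can verify data from first principles. A self-contained sketch of that pattern, with an illustrative seed standing in for oid.oid.ino:)

    // Fingerprint pattern from the deleted tester: bytes are derived from
    // (position, seed), so reads can be checked without a shadow copy.
    #include <cassert>
    #include <cstdint>
    #include <vector>

    static char fingerprint_byte_at(int pos, int seed)
    {
      uint64_t big = ((pos & ~7) / 133) ^ seed;   // same formula as above
      return ((char*)&big)[pos & 7];
    }

    int main()
    {
      const int seed = 0x10000042;                // stand-in for oid.oid.ino
      const int off = 4096, len = 128;

      // "write": generate the deterministic pattern for this extent
      std::vector<char> buf(len);
      for (int j = 0; j < len; j++)
        buf[j] = fingerprint_byte_at(off + j, seed);

      // "read back": verify byte-for-byte, as the tester's read case does
      for (int j = 0; j < len; j++)
        assert(buf[j] == fingerprint_byte_at(off + j, seed));
      return 0;
    }
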
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-
-#ifndef CEPH_EBOFS_TYPES_H
-#define CEPH_EBOFS_TYPES_H
-
-#include "include/buffer.h"
-#include "include/Context.h"
-#include "include/pobject.h"
-#include "common/Cond.h"
-
-#include <ext/hash_map>
-#include <set>
-#include <list>
-#include <vector>
-using namespace std;
-using namespace __gnu_cxx;
-
-#include <tr1/unordered_map>
-using std::tr1::unordered_map;
-
-
-#include "include/object.h"
-
-#include "csum.h"
-
-#include "include/intarith.h"
-
-// disk
-typedef uint64_t block_t; // disk location/sector/block
-
-static const unsigned EBOFS_BLOCK_SIZE = 4096;
-static const unsigned EBOFS_BLOCK_MASK = 4095;
-static const unsigned EBOFS_BLOCK_BITS = 12; // 1<<12 == 4096
-
-struct extent_t {
- block_t start, length;
-
- //extent_t() : start(0), length(0) {}
- //extent_t(block_t s, block_t l) : start(s), length(l) {}
-
- block_t last() const { return start + length - 1; }
- block_t end() const { return start + length; }
-} __attribute__ ((packed));
-
-inline ostream& operator<<(ostream& out, const extent_t& ex)
-{
- return out << ex.start << "~" << ex.length;
-}
-
-
-// objects
-
-struct ebofs_onode {
- csum_t onode_csum; // from after onode_csum to base + onode_bytes
- __u32 onode_bytes;
-
- extent_t onode_loc; /* this is actually the block we live in */
- pobject_t object_id; /* for kicks */
- uint64_t readonly;
-
- int64_t object_size; /* file size in bytes. should this be 64-bit? */
- __u32 alloc_blocks; // allocated
- csum_t data_csum;
-
- __u16 inline_bytes;
- __u16 num_collections;
- __u32 num_attr; // num attr in onode
- __u32 num_extents; /* number of extents used. if 0, data is in the onode */
- __u32 num_bad_byte_extents; // corrupt partial byte extents
-} __attribute__ ((packed));
-
-struct ebofs_cnode {
- csum_t cnode_csum;
- __u32 cnode_bytes;
-
- extent_t cnode_loc; /* this is actually the block we live in */
- coll_t coll_id;
- __u32 num_attr; // num attr in cnode
-} __attribute__ ((packed));
-
-struct ebofs_inode_ptr {
- extent_t loc;
- csum_t csum;
- ebofs_inode_ptr() {}
- ebofs_inode_ptr(const extent_t& l, csum_t c) : loc(l), csum(c) {}
-} __attribute__ ((packed));
-
-static inline ostream& operator<<(ostream& out, const ebofs_inode_ptr& ptr) {
- return out << ptr.loc << "=" << hex << ptr.csum << dec;
-}
-
-
-// tree/set nodes
-//typedef int nodeid_t;
-typedef int64_t nodeid_t; // actually, a block number. FIXME.
-
-static const unsigned EBOFS_NODE_BLOCKS = 1;
-static const unsigned EBOFS_NODE_BYTES = EBOFS_NODE_BLOCKS * EBOFS_BLOCK_SIZE;
-static const unsigned EBOFS_MAX_NODE_REGIONS = 10; // pick a better value!
-static const unsigned EBOFS_NODE_DUP = 3;
-
-struct ebofs_nodepool {
- extent_t node_usemap_even; // for even sb versions
- extent_t node_usemap_odd; // for odd sb versions
-
- __u32 num_regions;
- extent_t region_loc[EBOFS_MAX_NODE_REGIONS];
-} __attribute__ ((packed));
-
-// table
-
-struct ebofs_node_ptr {
- nodeid_t nodeid;
- //uint64_t start[EBOFS_NODE_DUP];
- //uint64_t length;
- csum_t csum;
-} __attribute__ ((packed));
-
-struct ebofs_table {
- ebofs_node_ptr root;
- __u32 num_keys;
- __u32 depth;
-} __attribute__ ((packed));
-
-
-// super
-typedef uint64_t version_t;
-
-static const uint64_t EBOFS_MAGIC = 0x000EB0F5;
-
-static const int EBOFS_NUM_FREE_BUCKETS = 5; /* see alloc.h for bucket constraints */
-static const int EBOFS_FREE_BUCKET_BITS = 2;
-
-struct ebofs_super {
- uint64_t s_magic;
- uint64_t fsid; /* _ebofs_ fsid, mind you, not ceph_fsid_t. */
-
- epoch_t epoch; // version of this superblock.
- uint64_t op_seq; // seq # of last operation we _did_ apply+commit to the store.
-
- uint64_t num_blocks; /* # blocks in filesystem */
-
- // some basic stats, for kicks
- uint64_t free_blocks; /* unused blocks */
- uint64_t limbo_blocks; /* limbo blocks */
- //unsigned num_objects;
- //unsigned num_fragmented;
-
- struct ebofs_nodepool nodepool;
-
- // tables
- struct ebofs_table free_tab[EBOFS_NUM_FREE_BUCKETS];
- struct ebofs_table limbo_tab;
- struct ebofs_table alloc_tab;
- struct ebofs_table object_tab; // object directory
- struct ebofs_table collection_tab; // collection directory
- struct ebofs_table co_tab;
-
- csum_t super_csum;
-
- csum_t calc_csum() {
- return ::calc_csum_unaligned((char*)this, (unsigned long)&super_csum-(unsigned long)this);
- }
- bool is_corrupt() {
- csum_t actual = calc_csum();
- if (actual != super_csum)
- return true;
- else
- return false;
- }
- bool is_valid_magic() { return s_magic == EBOFS_MAGIC; }
- bool is_valid() { return is_valid_magic() && !is_corrupt(); }
-} __attribute__ ((packed));
-
-
-#endif
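
(The on-disk structures deleted above protect themselves with a trailing checksum: ebofs_super::calc_csum() covers the struct from its start up to, but not including, super_csum. calc_csum_unaligned() lives in csum.h and is not part of this excerpt, so the sketch below uses a trivial additive checksum purely to illustrate the layout trick:)

    // Sketch of the self-checksumming-header pattern from ebofs_super:
    // the checksum covers the struct from its start up to (but not
    // including) the csum field itself. The additive toy_csum() below is
    // only a stand-in for calc_csum_unaligned().
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    typedef uint64_t csum_t;

    static csum_t toy_csum(const char *p, unsigned long len)
    {
      csum_t c = 0;
      for (unsigned long i = 0; i < len; i++)
        c += (unsigned char)p[i];
      return c;
    }

    struct toy_super {
      uint64_t magic;
      uint64_t num_blocks;
      csum_t   super_csum;                       // last field: excluded from the csum

      csum_t calc_csum() const {
        return toy_csum((const char*)this,
                        (unsigned long)&super_csum - (unsigned long)this);
      }
      bool is_corrupt() const { return calc_csum() != super_csum; }
    };

    int main()
    {
      toy_super s;
      memset(&s, 0, sizeof(s));
      s.magic = 0x000EB0F5;                       // EBOFS_MAGIC
      s.num_blocks = 1 << 20;
      s.super_csum = s.calc_csum();               // seal before writing out
      printf("corrupt? %d\n", (int)s.is_corrupt());  // 0

      s.num_blocks++;                             // simulate on-disk damage
      printf("corrupt? %d\n", (int)s.is_corrupt());  // 1
      return 0;
    }
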
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-
-#include <sys/stat.h>
-#include <iostream>
-#include <string>
-using namespace std;
-
-#include "common/config.h"
-
-#include "mon/MonMap.h"
-
-#include "ebofs/Ebofs.h"
-
-#include "osd/OSD.h"
-#include "mon/MonitorStore.h"
-
-int main(int argc, char **argv)
-{
- vector<char*> args;
- argv_to_vec(argc, argv, args);
-
- Ebofs eb("dev/osd0");
- eb.mount();
- MonitorStore ms("mondata/mon0");
- ms.mount();
-
- epoch_t e = 1;
- while (1) {
- bufferlist bl;
- object_t oid = OSD::get_osdmap_object_name(e);
- eb.read(oid, 0, 0, bl);
- if (bl.length() == 0) break;
- cout << "saving epoch " << e << std::endl;
-
- bufferlist ibl;
- oid = OSD::get_inc_osdmap_object_name(e);
- eb.read(oid, 0, 0, ibl);
-
- ms.put_bl_sn(ibl, "osdmap", e);
- ms.put_bl_sn(bl, "osdmap_full", e);
- e++;
- }
-
- eb.umount();
- //ms.umount();
-
- return 0;
-}
'n' => 16,
# parameters
- 'fs' => ['ebofs','fakestore'],
+ 'fs' => ['fakestore'],
'meta_log_ssize' => [ 128, 256, 1024, 1 << 15, 1 << 20 ],
'meta_log_scount' => 4,#[ 1, 2, 4, 8 ],
'n' => '1 + $cnode + $nummds + $numosd' ],
# parameters
- #'fs' => 'ebofs',
'fs' => 'fakestore',
'until' => 300, # --syn until $n ... when to stop clients
],
'writefile_mb' => 1000,
- 'ebofs_idle_commit_ms' => [ 100, 500 ],
- 'ebofs_abp_max_alloc' => [ 4096*16, 4096*64 ],
-
# 'custom' => '--tcp_skip_rank0',# --osd_maxthreads 0',
'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5',
'n' => 12,
# parameters
- 'fs' => 'fakestore',#['ebofs', 'fakestore','obfs'],
+ 'fs' => 'fakestore',#['fakestore','obfs'],
#'fs' => 'ebofs',
- #'ebofs_commit_ms' => [ 1000, 5000 ],
#'osd_maxthreads' => [ 0, 1, 2, 4, 8 ],
'until' => 100, # --syn until $n ... when to stop clients
'file_layout_num_rep'=> 1,#[1,2],
-
-# 'ebofs_idle_commit_ms' => [ 100, 500 ],
-# 'ebofs_abp_max_alloc' => [ 4096*16, 4096*64 ],
-
'custom' => '--debug_after 110 --debug_mds 15 --debug 5 --mds_shutdown_check 60',
# for final summation (script/sum.pl)
*/
-#ifndef CEPH_EBOFS_FILEJOURNAL_H
-#define CEPH_EBOFS_FILEJOURNAL_H
+#ifndef CEPH_FILEJOURNAL_H
+#define CEPH_FILEJOURNAL_H
#include <deque>
using std::deque;
*/
-#ifndef CEPH_EBOFS_JOURNAL_H
-#define CEPH_EBOFS_JOURNAL_H
+#ifndef CEPH_JOURNAL_H
+#define CEPH_JOURNAL_H
#include "include/buffer.h"
#include "include/Context.h"
#include "common/Clock.h"
// ick
-//#include "ebofs/Ebofs.h"
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
generic_dout(2) << "age_fill at " << free << " / " << avail << " / " << pc << " creating " << hex << poid << dec << " sz " << s << dendl;
- if (false && !g_conf.ebofs_verify && start_debug && wrote > 1000000ULL) {
+ if (false && start_debug && wrote > 1000000ULL) {
/*
// 99 1000500 ? 1000750 1006000
*/
- g_conf.debug_ebofs = 30;
- g_conf.ebofs_verify = true;
}
off_t off = 0;
int nper = 20;
int n = nper;
- //g_conf.ebofs_verify = true;
-
while (1) {
struct statfs st;
store->statfs(&st);
store->apply_transaction(t);
age_free_oids.push_back(poid);
}
-
- g_conf.ebofs_verify = false;
}
void pfrag(uint64_t written, ObjectStore::FragmentationStat &st)
if (::stat(dev.c_str(), &st) != 0)
return 0;
- //if (g_conf.ebofs)
- //return new Ebofs(dev, jdev);
if (g_conf.filestore)
return new FileStore(dev, jdev);
return new FileStore(dev, jdev);
else
return 0;
- //return new Ebofs(dev, jdev);
}
#undef dout_prefix
g_conf.debug_osd = 100;
g_conf.debug_journal = 100;
g_conf.debug_filestore = 100;
- g_conf.debug_ebofs = 100;
g_conf.debug_ms = 100;
derr << "OSD::shutdown" << dendl;
// we reached a new block. *p was the last entry with bytes in previous block
ondisklog.block_map[startoff] = p->version;
}
-
- /*
- if (g_conf.osd_pad_pg_log) { // pad to 4k, until i fix ebofs reallocation crap. FIXME.
- bufferptr bp(4096 - sizeof(*p));
- bl.push_back(bp);
- }
- */
}
ondisklog.head = bl.length();
ondisklog.has_checksums = true;
'numclient' => 400,#[10, 50, 100, 200, 400],
# parameters
- 'fs' => [ 'ebofs', 'fakestore' ],
+ 'fs' => [ 'fakestore' ],
'until' => 150, # --syn until $n ... when to stop clients
'writefile' => 1,
'writefile_size' => [ 4096, 65526, 256000, 1024000, 2560000 ],
$c .= " --syn readfile $h->{'readfile_mb'} $h->{'readfile_size'}" if $h->{'readfile'};
$c .= " --syn makedirs $h->{'makedirs_dirs'} $h->{'makedirs_files'} $h->{'makedirs_depth'}" if $h->{'makedirs'};
- if ($h->{'ebofs_freelist'}) {
- system "cp freelist/ebofs.freelist.$h->{'ebofs_freelist'} ebofs.freelist";
- $c .= " --osd_age_time -1";
- }
-
for my $k ('nummds', 'numclient', 'numosd', 'kill_after',
'osd_maxthreads', 'osd_object_layout', 'osd_pg_layout','osd_pg_bits',
'mds_bal_rep', 'mds_bal_interval', 'mds_bal_max','mds_decay_halflife',
'mds_local_osd',
'osd_age_time','osd_age',
'osd_rep',
- 'osd_pad_pg_log','ebofs_realloc',
+ 'osd_pad_pg_log',
'osd_balance_reads',
'tcp_multi_out',
'client_cache_stat_ttl','client_cache_readdir_ttl',
'client_oc',
'fake_osdmap_updates',
- 'bdev_el_bidir', 'ebofs_idle_commit_ms', 'ebofs_commit_ms',
- 'ebofs_oc_size','ebofs_cc_size','ebofs_bc_size','ebofs_bc_max_dirty','ebofs_abp_max_alloc',
+ 'bdev_el_bidir',
'file_layout_ssize','file_layout_scount','file_layout_osize','file_layout_num_rep',
'meta_dir_layout_ssize','meta_dir_layout_scount','meta_dir_layout_osize','meta_dir_layout_num_rep',
'meta_log_layout_ssize','meta_log_layout_scount','meta_log_layout_osize','meta_log_layout_num_rep') {
*/
#include <iostream>
-//#include "ebofs/Ebofs.h"
#include "os/FileStore.h"
#include "common/common_init.h"
#include "common/ceph_argparse.h"
cout << "#dev " << filename
<< ", " << seconds << " seconds, " << bytes << " bytes per write" << std::endl;
- //ObjectStore *fs = new Ebofs(filename, journal);
ObjectStore *fs = new FileStore(filename, journal);
if (fs->mount() < 0) {
+++ /dev/null
-/* testos.cc -- simple ObjectStore test harness.
- Copyright (C) 2007 Casey Marshall <csm@soe.ucsc.edu>
-
-Ceph - scalable distributed file system
-
-This is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License version 2.1, as published by the Free Software
-Foundation. See file COPYING. */
-
-
-#include "osd/ObjectStore.h"
-#include "ebofs/Ebofs.h"
-#include "include/buffer.h"
-
-#include <iostream>
-#include <cerrno>
-#include <vector>
-
-#include <fcntl.h>
-#include <sys/mount.h>
-
-using namespace std;
-
-static inline unsigned long long
-to_usec (struct timeval &time)
-{
- return (((unsigned long long) time.tv_sec * 1000000)
- + ((unsigned long long) time.tv_usec));
-}
-
-static inline unsigned long long
-to_msec (struct timeval &time)
-{
- return (((unsigned long long) time.tv_sec * 1000)
- + ((unsigned long long) time.tv_usec / 1000));
-}
-
-int main (int argc, char **argv)
-{
- vector<char *> args;
- char *osd_name = "ebofs";
- unsigned object_size = 1024;
- unsigned object_count = 1024;
- unsigned write_iter = 64;
- unsigned random_seed = ::time(NULL);
- char *device = "/tmp/testos";
- char *mountcmd = "mount /tmp/testos";
- char *umountcmd = "umount /tmp/testos";
-
- bool ebofs_raw_device = false;
- bool inhibit_remount = (getenv("TESTOS_INHIBIT_REMOUNT") != NULL);
-
- if (argc > 1
- && (strcmp (argv[1], "-h") == 0
- || strcmp (argv[1], "-help") == 0
- || strcmp (argv[1], "--help") == 0))
- {
- cout << "usage: " << argv[0] << " [store [object-size [object-count [iterations [seed]]]]]" << endl;
- cout << endl;
- cout << "Where the arguments are:" << endl << endl;
- cout << " store -- store type; default \"ebofs\"" << endl;
- cout << " object-size -- size of objects; default 1024" << endl;
- cout << " object-count -- number of objects to write; default 1024"
- << endl;
- cout << " iterations -- write the objects that many times; default 5"
- << endl;
- cout << " seed -- random seed; default current time" << endl;
- exit (0);
- }
-
- argv_to_vec (argc, argv, args);
- for (vector<char*>::iterator it = args.begin(); it != args.end();
- it++)
- cout << *it << " ";
- cout << endl;
- parse_config_options (args);
- for (vector<char*>::iterator it = args.begin(); it != args.end();
- it++)
- cout << *it << " ";
- cout << endl;
-
- argc = args.size();
- if (argc > 0)
- osd_name = args[0];
- if (argc > 1)
- object_size = (unsigned) atol (args[1]);
- if (argc > 2)
- object_count = (unsigned) atol (args[2]);
- if (argc > 3)
- write_iter = (unsigned) atol (args[3]);
- if (argc > 4)
- random_seed = (unsigned) atol (args[4]);
-
- // align object size to 'long'
- object_size = ((object_size + (sizeof (long) - 1)) / sizeof (long)) * sizeof (long);
-
- char *osd_file = new char[32];
- strcpy (osd_file, "/tmp/testos/testos.XXXXXX");
- mktemp (osd_file);
-
- if (strcasecmp (osd_name, "ebofs") == 0)
- {
- char *dev_env = getenv ("TESTOS_EBOFS_DEV");
- if (dev_env != NULL)
- {
- // Assume it is a true device.
- strncpy (osd_file, dev_env, 32);
- inhibit_remount = true;
- ebofs_raw_device = true;
- }
- }
-
- if (!inhibit_remount)
- {
- if (system (mountcmd) != 0)
- {
- cerr << "mount failed" << endl;
- exit (1);
- }
- }
-
- ObjectStore *os = NULL;
- if (strcasecmp (osd_name, "ebofs") == 0)
- {
- if (!ebofs_raw_device)
- {
- FILE *f = fopen (osd_file, "w");
- if (f == NULL)
- {
- cerr << "failed to open " << osd_file << ": " << strerror (errno)
- << endl;
- exit (1);
- }
- // 1G file.
- fseek (f, 1024 * 1024 * 1024, SEEK_SET);
- fputc ('\0', f);
- fclose (f);
- }
- os = new Ebofs (osd_file);
- }
- else
- {
- cerr << "I don't know about object store \"" << osd_name << "\""
- << endl;
- exit (1);
- }
-
- cout << "Writing " << object_count << " objects of size "
- << object_size << " to " << osd_name << endl;
-
- char *val = (char *) malloc (object_size);
- char *val2 = (char *) malloc (object_size);
- auto_ptr<char> valptr (val);
- auto_ptr<char> valptr2(val2);
- if (getenv ("TESTOS_UNALIGNED") != NULL)
- {
- val = val + 1;
- val2 = val2 + 1;
- }
-
- for (unsigned i = 0; i < object_size; i++)
- {
- val[i] = (char) i;
- val2[i] = (char) i;
- }
- object_t *oids = new object_t[object_count];
-
- utime_t writes[write_iter];
- utime_t total_write;
- utime_t reads[write_iter];
- utime_t total_read;
- for (unsigned i = 0; i < write_iter; i++)
- {
- cerr << "Iteration " << i << endl;
-
- int ret = os->mkfs();
- if (ret != 0)
- {
- cerr << "mkfs(" << osd_file << "): " << strerror (-ret) << endl;
- exit (1);
- }
- ret = os->mount();
- if (ret != 0)
- {
- cerr << "mount(): " << strerror (-ret) << endl;
- exit (1);
- }
-
- srandom (random_seed + i);
-
- for (unsigned j = 0; j < object_count; j++)
- {
- oids[j].ino = (uint64_t) random() << 32 | random();
- oids[j].bno = random();
- }
-
- utime_t begin = g_clock.now();
- for (unsigned o = 0; o < object_count; o++)
- {
- bufferptr bp (val, object_size);
- bufferlist bl;
- bl.push_back (bp);
- int ret;
- if ((ret = os->write (oids[o], 0L, object_size, bl, NULL)) < 0)
- cerr << "write " << oids[o] << " failed: "
- << strerror (-ret) << endl;
- }
- os->sync();
-
- utime_t end = g_clock.now() - begin;
-
- cerr << "Write finished in " << end << endl;
- total_write += end;
- writes[i] = end;
-
- os->umount();
- sync();
-
- if (!inhibit_remount)
- {
- if (system (umountcmd) != 0)
- {
- cerr << "umount failed" << endl;
- exit (1);
- }
-
- if (system (mountcmd) != 0)
- {
- cerr << "mount(2) failed" << endl;
- exit (1);
- }
- }
-
- os->mount();
-
- // Shuffle the OIDs.
- for (int j = 0; j < object_count; j++)
- {
- int x = random() % object_count;
- if (x < 0)
- x = -x;
- object_t o = oids[j];
- oids[j] = oids[x];
- oids[x] = o;
- }
-
- begin = g_clock.now();
- for (unsigned o = 0; o < object_count; o++)
- {
- bufferptr bp (val2, object_size);
- bufferlist bl;
- bl.push_back (bp);
-
- if (os->read (oids[o], 0L, object_size, bl) < 0)
- {
- cerr << "object " << oids[o] << " not found!" << endl;
- }
- }
- end = g_clock.now() - begin;
-
- cerr << "Read finished in " << end << endl;
- total_read += end;
- reads[i] = end;
-
- os->umount();
- sync();
-
- if (!inhibit_remount)
- {
- if (system (umountcmd) != 0)
- {
- cerr << "umount(2) failed" << endl;
- exit (1);
- }
-
- if (system (mountcmd) != 0)
- {
- cerr << "mount(3) failed" << endl;
- exit (1);
- }
- }
- }
-
- cerr << "Finished in " << (total_write + total_read) << endl;
-
- double write_mean = ((double) total_write) / ((double) write_iter);
- double write_sd = 0.0;
- for (unsigned i = 0; i < write_iter; i++)
- {
- double x = ((double) writes[i]) - write_mean;
- write_sd += x * x;
- }
- write_sd = sqrt (write_sd / ((double) write_iter));
-
- double read_mean = ((double) total_read) / ((double) write_iter);
- double read_sd = 0.0;
- for (unsigned i = 0; i < write_iter; i++)
- {
- double x = ((double) reads[i]) - read_mean;
- read_sd += x * x;
- }
- read_sd = sqrt (read_sd / ((double) write_iter));
-
- cout << "TESTOS: write " << osd_name << ":" << object_size << ":"
- << object_count << ":" << write_iter << ":" << random_seed
- << " -- " << write_mean << " " << write_sd << endl;
-
- cout << "TESTOS: write.raw -- ";
- for (int i = 0; i < write_iter; i++)
- cout << ((double) writes[i]) << " ";
- cout << endl;
-
- cout << "TESTOS: read " << osd_name << ":" << object_size << ":"
- << object_count << ":" << write_iter << ":" << random_seed
- << " -- " << read_mean << " " << read_sd << endl;
-
- cout << "TESTOS: read.raw -- ";
- for (int i = 0; i < write_iter; i++)
- cout << ((double) reads[i]) << " ";
- cout << endl;
-
- unlink (osd_file);
- if (!inhibit_remount)
- {
- if (system (umountcmd) != 0)
- {
- cerr << "umount(3) failed" << endl;
- exit (1);
- }
- }
- exit (0);
-}
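
(testos.cc, deleted above, summarised each run as the mean and standard deviation of the per-iteration write and read times. A minimal standalone version of that summary arithmetic, with plain doubles standing in for utime_t and made-up timings:)

    // Mean / standard-deviation summary as printed by the deleted harness;
    // the sample values here are illustrative only.
    #include <cmath>
    #include <cstdio>

    int main()
    {
      const double samples[] = {1.20, 1.35, 1.28, 1.31};   // per-iteration times (seconds)
      const unsigned n = sizeof(samples) / sizeof(samples[0]);

      double total = 0.0;
      for (unsigned i = 0; i < n; i++)
        total += samples[i];
      double mean = total / n;

      double sd = 0.0;
      for (unsigned i = 0; i < n; i++) {
        double x = samples[i] - mean;
        sd += x * x;
      }
      sd = sqrt(sd / n);            // population standard deviation, as in the harness

      printf("mean %.4f sd %.4f\n", mean, sd);
      return 0;
    }
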
*/
#include <iostream>
-//#include "ebofs/Ebofs.h"
#include "common/ceph_argparse.h"
#include "os/FileStore.h"
#include "common/common_init.h"