# mpicxx if you get paranoid.
#CC = g++
-#CFLAGS = -g -Wall -I. -D_FILE_OFFSET_BITS=64 -DMPICH_IGNORE_CXX_SEEK -D_REENTRANT -D_THREAD_SAFE
+#CFLAGS = -g -fPIC -Wall -I. -D_FILE_OFFSET_BITS=64 -DMPICH_IGNORE_CXX_SEEK -D_REENTRANT -D_THREAD_SAFE
#LIBS = -lpthread
# Hook for extra -I options, etc.
LDINC = ar -rc
else
# For linux
-CFLAGS = -g -Wall -I. -D_FILE_OFFSET_BITS=64 -DMPICH_IGNORE_CXX_SEEK -D_REENTRANT -D_THREAD_SAFE
+CFLAGS = -g -fPIC -Wall -I. -D_FILE_OFFSET_BITS=64 -DMPICH_IGNORE_CXX_SEEK -D_REENTRANT -D_THREAD_SAFE
LDINC = ld -i -o
endif
common/Timer.o\
config.o
-
CLIENT_OBJS= \
client/FileCache.o\
client/Client.o\
client/SyntheticClient.o\
client/Trace.o
+
ifeq ($(want_bdb),yes)
OSBDB_OBJS = \
osbdb/OSBDB.o
OSBDB_OBJ = osbdb.o
endif
-TARGETS = cmon cosd cmds cfuse csyn newsyn fakesyn mkmonmap
+TARGETS = cmon cosd cmds csyn newsyn fakesyn mkmonmap cfuse fakefuse
+NO_FUSE = cmon cosd cmds csyn newsyn fakesyn mkmonmap
+
SRCS=*.cc */*.cc *.h */*.h */*/*.h
all: depend ${TARGETS}
+nofuse: depend ${NO_FUSE}
+
test: depend ${TEST_TARGETS}
obfs: depend obfstest
${MPICC} -DUSE_OBFS ${MPICFLAGS} ${MPILIBS} $^ -o $@ ../uofs/uofs.a
+# hadoop
+libhadoopcephfs.so: client/hadoop/CephFSInterface.cc client.o osdc.o msg/SimpleMessenger.o common.o
+ ${CC} -fPIC -shared -Wl,-soname,$@.1 ${CFLAGS} -I/usr/local/java/include -I/usr/local/java/include/linux ${LIBS} $^ -o $@
# libceph
libceph.o: client/ldceph.o client/Client.o msg/SimpleMessenger.o ${COMMON_OBJS} ${SYN_OBJS} ${OSDC_OBJS}
// args for fuse
vec_to_argv(args, argc, argv);
+ // FUSE will chdir("/"); be ready.
+ g_conf.use_abspaths = true;
+
+ if (g_conf.clock_tare) g_clock.tare();
+
// load monmap
MonMap monmap;
int r = monmap.read(".ceph_monmap");
Client::~Client()
{
- if (messenger) { delete messenger; messenger = 0; }
+ // drop cached inodes/dentries first, while the subsystems they may
+ // reference (objectcacher, filer, objecter, messenger) are still alive
+ tear_down_cache();
+
+ if (objectcacher) {
+ delete objectcacher;
+ objectcacher = 0;
+ }
+
  if (filer) { delete filer; filer = 0; }
- if (objectcacher) { delete objectcacher; objectcacher = 0; }
  if (objecter) { delete objecter; objecter = 0; }
  if (osdmap) { delete osdmap; osdmap = 0; }
+ if (mdsmap) { delete mdsmap; mdsmap = 0; }
+ // messenger goes last: everything above may still send through it
+ if (messenger) { delete messenger; messenger = 0; }
}
if (cap_reap_queue[in->ino()].empty())
cap_reap_queue.erase(in->ino());
}
+ delete m;
return;
}
} else {
//dout(0) << "didn't put_inode" << endl;
}
-
+ delete m;
return;
}
<< " was " << cap_string(old_caps) << endl;
// did file size decrease?
- if ((old_caps & new_caps & CAP_FILE_RDCACHE) &&
+ if ((old_caps & (CAP_FILE_RD|CAP_FILE_WR)) == 0 &&
+ (new_caps & (CAP_FILE_RD|CAP_FILE_WR)) != 0 &&
in->inode.size > m->get_inode().size) {
- dout(10) << "**** file size decreased from " << in->inode.size << " to " << m->get_inode().size << " FIXME" << endl;
- // must have been a truncate() by someone.
- // trim the buffer cache
- // ***** fixme write me ****
+ dout(10) << "*** file size decreased from " << in->inode.size << " to " << m->get_inode().size << endl;
+
+ // trim filecache?
+ if (g_conf.client_oc)
+ in->fc.truncate(in->inode.size, m->get_inode().size);
- in->file_wr_size = m->get_inode().size; //??
+ in->inode.size = in->file_wr_size = m->get_inode().size;
}
// update inode
if (in->file_wr_mtime > in->inode.mtime)
m->get_inode().mtime = in->inode.mtime = in->file_wr_mtime;
+
+
if (g_conf.client_oc) {
// caching on, use FileCache.
Context *onimplement = 0;
}
}
in->fc.set_caps(new_caps, onimplement);
-
} else {
// caching off.
assert(diri);
assert(diri->inode.mode & INODE_MODE_DIR);
+ // add . and ..?
+ string dot(".");
+ contents[dot] = diri->inode;
+ if (diri != root) {
+ string dotdot("..");
+ contents[dotdot] = diri->dn->dir->parent_inode->inode;
+ }
+
if (!reply->get_dir_in().empty()) {
// only open dir if we're actually adding stuff to it!
Dir *dir = diri->open_dir();
for (list<InodeStat*>::const_iterator pin = reply->get_dir_in().begin();
pin != reply->get_dir_in().end();
++pin, ++pdn) {
- // count entries
+ if (*pdn == ".")
+ continue;
+
+ // count entries
res++;
// put in cache
// contents to caller too!
contents[*pdn] = in->inode;
}
+ if (dir->is_empty())
+ close_dir(dir);
}
- // add .. too?
- if (diri != root && diri->dn && diri->dn->dir) {
- Inode *parent = diri->dn->dir->parent_inode;
- contents[".."] = parent->inode;
- }
// FIXME: remove items in cache that weren't in my readdir?
// ***
dout(7) << "open got caps " << cap_string(new_caps)
<< " for " << f->inode->ino()
<< " seq " << reply->get_file_caps_seq()
- << " from mds" << mds << endl;
+ << " from mds" << mds
+ << endl;
int old_caps = f->inode->caps[mds].caps;
f->inode->caps[mds].caps = new_caps;
dout(7) << "open got SAME caps " << cap_string(new_caps)
<< " for " << f->inode->ino()
<< " seq " << reply->get_file_caps_seq()
- << " from mds" << mds << endl;
+ << " from mds" << mds
+ << endl;
}
// put in map
// ------------
// read, write
+
+off_t Client::lseek(fh_t fh, off_t offset, int whence)
+{
+ // Reposition the file offset of an open handle (POSIX lseek semantics)
+ // and return the resulting absolute position. Invalid fh aborts.
+ client_lock.Lock();
+ dout(3) << "op: client->lseek(" << fh << ", " << offset << ", " << whence << ");" << endl;
+
+ assert(fh_map.count(fh));
+ Fh *f = fh_map[fh];
+ Inode *in = f->inode;
+
+ switch (whence) {
+ case SEEK_SET:
+ f->pos = offset;
+ break;
+
+ case SEEK_CUR:
+ f->pos += offset;
+ break;
+
+ case SEEK_END:
+ // NOTE(review): uses the locally cached inode.size, which may lag the
+ // true size if another client extended the file — confirm acceptable.
+ f->pos = in->inode.size + offset;
+ break;
+
+ default:
+ assert(0);
+ }
+
+ off_t pos = f->pos;
+ client_lock.Unlock();
+
+ return pos;
+}
+
+
// blocking osd interface
int Client::read(fh_t fh, char *buf, off_t size, off_t offset)
tout << size << endl;
tout << offset << endl;
- assert(offset >= 0);
assert(fh_map.count(fh));
Fh *f = fh_map[fh];
Inode *in = f->inode;
- if (offset < 0)
+ bool movepos = false;
+ if (offset < 0) {
offset = f->pos;
+ movepos = true;
+ }
bool lazy = f->mode == FILE_MODE_LAZY;
- // do we have read file cap?
- while (!lazy && (in->file_caps() & CAP_FILE_RD) == 0) {
- dout(7) << " don't have read cap, waiting" << endl;
- Cond cond;
- in->waitfor_read.push_back(&cond);
- cond.Wait(client_lock);
- }
- // lazy cap?
- while (lazy && (in->file_caps() & CAP_FILE_LAZYIO) == 0) {
- dout(7) << " don't have lazy cap, waiting" << endl;
- Cond cond;
- in->waitfor_lazy.push_back(&cond);
- cond.Wait(client_lock);
- }
-
// determine whether read range overlaps with file
// ...ONLY if we're doing async io
if (!lazy && (in->file_caps() & (CAP_FILE_WRBUFFER|CAP_FILE_RDCACHE))) {
// we're doing buffered i/o. make sure we're inside the file.
// we can trust size info bc we get accurate info when buffering/caching caps are issued.
- dout(10) << "file size: " << in->inode.size << endl;
+ dout(-10) << "file size: " << in->inode.size << endl;
if (offset > 0 && offset >= in->inode.size) {
client_lock.Unlock();
return 0;
}
- if (offset + size > (unsigned)in->inode.size) size = (unsigned)in->inode.size - offset;
+ if (offset + size > (off_t)in->inode.size)
+ size = (off_t)in->inode.size - offset;
if (size == 0) {
- dout(10) << "read is size=0, returning 0" << endl;
+ dout(-10) << "read is size=0, returning 0" << endl;
client_lock.Unlock();
return 0;
}
}
bufferlist blist; // data will go here
- int rvalue = 0;
int r = 0;
+ int rvalue = 0;
if (g_conf.client_oc) {
// object cache ON
rvalue = r = in->fc.read(offset, size, blist, client_lock); // may block.
} else {
// object cache OFF -- legacy inconsistent way.
+
+ // do we have read file cap?
+ while (!lazy && (in->file_caps() & CAP_FILE_RD) == 0) {
+ dout(7) << " don't have read cap, waiting" << endl;
+ Cond cond;
+ in->waitfor_read.push_back(&cond);
+ cond.Wait(client_lock);
+ }
+ // lazy cap?
+ while (lazy && (in->file_caps() & CAP_FILE_LAZYIO) == 0) {
+ dout(7) << " don't have lazy cap, waiting" << endl;
+ Cond cond;
+ in->waitfor_lazy.push_back(&cond);
+ cond.Wait(client_lock);
+ }
+
+ // do sync read
Cond cond;
bool done = false;
C_Cond *onfinish = new C_Cond(&cond, &done, &rvalue);
while (!done)
cond.Wait(client_lock);
}
-
- // adjust fd pos
- f->pos = offset+blist.length();
+
+ if (movepos) {
+ // adjust fd pos
+ f->pos = offset+blist.length();
+ }
// copy data into caller's char* buf
blist.copy(0, blist.length(), buf);
tout << size << endl;
tout << offset << endl;
- assert(offset >= 0);
assert(fh_map.count(fh));
Fh *f = fh_map[fh];
Inode *in = f->inode;
- if (offset < 0)
+ if (offset < 0) {
offset = f->pos;
+ // adjust fd pos
+ f->pos = offset+size;
+ }
bool lazy = f->mode == FILE_MODE_LAZY;
dout(10) << "cur file size is " << in->inode.size << " wr size " << in->file_wr_size << endl;
- // do we have write file cap?
- while (!lazy && (in->file_caps() & CAP_FILE_WR) == 0) {
- dout(7) << " don't have write cap, waiting" << endl;
- Cond cond;
- in->waitfor_write.push_back(&cond);
- cond.Wait(client_lock);
- }
- while (lazy && (in->file_caps() & CAP_FILE_LAZYIO) == 0) {
- dout(7) << " don't have lazy cap, waiting" << endl;
- Cond cond;
- in->waitfor_lazy.push_back(&cond);
- cond.Wait(client_lock);
- }
-
- // adjust fd pos
- f->pos = offset+size;
-
// time it.
utime_t start = g_clock.now();
// write (this may block!)
in->fc.write(offset, size, blist, client_lock);
-
+
} else {
// legacy, inconsistent synchronous write.
dout(7) << "synchronous write" << endl;
+ // do we have write file cap?
+ while (!lazy && (in->file_caps() & CAP_FILE_WR) == 0) {
+ dout(7) << " don't have write cap, waiting" << endl;
+ Cond cond;
+ in->waitfor_write.push_back(&cond);
+ cond.Wait(client_lock);
+ }
+ while (lazy && (in->file_caps() & CAP_FILE_LAZYIO) == 0) {
+ dout(7) << " don't have lazy cap, waiting" << endl;
+ Cond cond;
+ in->waitfor_lazy.push_back(&cond);
+ cond.Wait(client_lock);
+ }
+
// prepare write
Cond cond;
bool done = false;
}
+// =========================================
+// layout
+
+
+int Client::describe_layout(int fh, FileLayout *lp)
+{
+ // Copy the striping layout of the open file into *lp.
+ // Always returns 0; an invalid fh aborts via the assert below.
+ client_lock.Lock();
+ dout(3) << "op: client->describe_layout(" << fh << ");" << endl;
+
+ assert(fh_map.count(fh));
+ Fh *f = fh_map[fh];
+ Inode *in = f->inode;
+
+ *lp = in->inode.layout;
+
+ client_lock.Unlock();
+ return 0;
+}
+
+int Client::get_stripe_unit(int fd)
+{
+ // stripe unit (bytes) of fd's layout; fd validity is enforced
+ // inside describe_layout (assert).
+ FileLayout layout;
+ describe_layout(fd, &layout);
+ return layout.stripe_unit;
+}
+
+int Client::get_stripe_width(int fd)
+{
+ // stripe width of fd's layout; fd validity is enforced
+ // inside describe_layout (assert).
+ FileLayout layout;
+ describe_layout(fd, &layout);
+ return layout.stripe_width();
+}
+
+int Client::get_stripe_period(int fd)
+{
+ // stripe period of fd's layout; fd validity is enforced
+ // inside describe_layout (assert).
+ FileLayout layout;
+ describe_layout(fd, &layout);
+ return layout.period();
+}
+
+int Client::enumerate_layout(int fh, list<ObjectExtent>& result,
+ off_t length, off_t offset)
+{
+ // Map the byte range [offset, offset+length) of the open file onto the
+ // object extents that back it. Always returns 0; invalid fh aborts.
+ client_lock.Lock();
+ dout(3) << "op: client->enumerate_layout(" << fh << ", " << length << ", " << offset << ");" << endl;
+
+ assert(fh_map.count(fh));
+ Fh *f = fh_map[fh];
+ Inode *in = f->inode;
+
+ // map to a list of extents
+ filer->file_to_extents(in->inode, offset, length, result);
+
+ client_lock.Unlock();
+ return 0;
+}
+
+
+
void Client::ms_handle_failure(Message *m, const entity_inst_t& inst)
{
entity_name_t dest = inst.name;
if (num_open_rd) w |= CAP_FILE_RD|CAP_FILE_RDCACHE;
if (num_open_wr) w |= CAP_FILE_WR|CAP_FILE_WRBUFFER;
if (num_open_lazy) w |= CAP_FILE_LAZYIO;
+ if (fc.is_dirty()) w |= CAP_FILE_WRBUFFER;
+ if (fc.is_cached()) w |= CAP_FILE_RDCACHE;
return w;
}
// crap
int chdir(const char *s);
+ const string getcwd() { return cwd; }
// namespace ops
int getdir(const char *path, list<string>& contents);
int mknod(const char *path, mode_t mode);
int open(const char *path, int mode);
int close(fh_t fh);
+ off_t lseek(fh_t fh, off_t offset, int whence);
int read(fh_t fh, char *buf, off_t size, off_t offset=-1);
int write(fh_t fh, const char *buf, off_t size, off_t offset=-1);
int truncate(const char *file, off_t size);
//int truncate(fh_t fh, long long size);
int fsync(fh_t fh, bool syncdataonly);
+
// hpc lazyio
int lazyio_propogate(int fd, off_t offset, size_t count);
int lazyio_synchronize(int fd, off_t offset, size_t count);
- int describe_layout(char *fn, list<ObjectExtent>& result);
+ // expose file layout
+ int describe_layout(int fd, FileLayout* layout);
+ int get_stripe_unit(int fd);
+ int get_stripe_width(int fd);
+ int get_stripe_period(int fd);
+ int enumerate_layout(int fd, list<ObjectExtent>& result,
+ off_t length, off_t offset);
+ // failure
void ms_handle_failure(Message*, const entity_inst_t& inst);
};
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
#include "config.h"
#include "include/types.h"
#include "msg/Messenger.h"
#undef dout
-#define dout(x) if (x <= g_conf.debug_client) cout << g_clock.now() << " " << oc->objecter->messenger->get_myaddr() << ".filecache "
-#define derr(x) if (x <= g_conf.debug_client) cout << g_clock.now() << " " << oc->objecter->messenger->get_myaddr() << ".filecache "
+#define dout(x) if (x <= g_conf.debug_client) cout << g_clock.now() << " " << oc->objecter->messenger->get_myname() << ".filecache "
+#define derr(x) if (x <= g_conf.debug_client) cout << g_clock.now() << " " << oc->objecter->messenger->get_myname() << ".filecache "
// flush/release/clean
}
+void FileCache::tear_down()
+{
+ // Drop all cached data for this file at teardown. Presumably
+ // release_clean() returns the byte count it could NOT free (still
+ // dirty/uncommitted); those bytes are forcibly discarded via purge —
+ // TODO confirm against the ObjectCacher interface.
+ off_t unclean = release_clean();
+ if (unclean) {
+ dout(0) << "tear_down " << unclean << " unclean bytes, purging" << endl;
+ oc->purge_set(inode.ino);
+ }
+}
+
+// truncate
+
+void FileCache::truncate(off_t olds, off_t news)
+{
+ // Invalidate cached data in [news, olds) after the file size dropped
+ // from olds to news; assumes olds >= news (caller checks).
+ dout(5) << "truncate " << olds << " -> " << news << endl;
+
+ // map range to objects
+ list<ObjectExtent> ls;
+ oc->filer.file_to_extents(inode, news, olds-news, ls);
+ oc->truncate_set(inode.ino, ls);
+}
+
// caps
+// completion callback: re-run check_caps() once an async flush finishes
+class C_FC_CheckCaps : public Context {
+ FileCache *fc;
+public:
+ C_FC_CheckCaps(FileCache *f) : fc(f) {}
+ void finish(int r) {
+ fc->check_caps();
+ }
+};
+
void FileCache::set_caps(int caps, Context *onimplement)
{
  if (onimplement) {
+ dout(10) << "set_caps setting onimplement context for " << cap_string(caps) << endl;
    assert(latest_caps & ~caps);  // we should be losing caps.
    caps_callbacks[caps].push_back(onimplement);
  }
  latest_caps = caps;
  check_caps();
-}
+ // kick waiters? (did we gain caps?)
+ if (can_read() && !waitfor_read.empty())
+ for (set<Cond*>::iterator p = waitfor_read.begin();
+ p != waitfor_read.end();
+ ++p)
+ (*p)->Signal();
+ if (can_write() && !waitfor_write.empty())
+ for (set<Cond*>::iterator p = waitfor_write.begin();
+ p != waitfor_write.end();
+ ++p)
+ (*p)->Signal();
+
+ // note: woken read()/write() callers re-test latest_caps in a loop,
+ // so spurious wakeups here are harmless
+}
-void FileCache::check_caps()
+int FileCache::get_used_caps()
{
+ // Bitmask of capabilities the cache is actively relying on right now:
+ // in-flight reads/writes plus any cached clean or dirty data.
  int used = 0;
  if (num_reading) used |= CAP_FILE_RD;
  if (oc->set_is_cached(inode.ino)) used |= CAP_FILE_RDCACHE;
  if (num_writing) used |= CAP_FILE_WR;
  if (oc->set_is_dirty_or_committing(inode.ino)) used |= CAP_FILE_WRBUFFER;
- dout(10) << "check_caps used " << cap_string(used) << endl;
+ return used;
+}
+
+void FileCache::check_caps()
+{
+ // calc used
+ int used = get_used_caps();
+ dout(10) << "check_caps used was " << cap_string(used) << endl;
+
+ // try to implement caps?
+ // BUG? latest_caps, not least caps i've seen?
+ if ((latest_caps & CAP_FILE_RDCACHE) == 0 &&
+ (used & CAP_FILE_RDCACHE))
+ release_clean();
+ if ((latest_caps & CAP_FILE_WRBUFFER) == 0 &&
+ (used & CAP_FILE_WRBUFFER))
+ flush_dirty(new C_FC_CheckCaps(this));
+
+ used = get_used_caps();
+ dout(10) << "check_caps used now " << cap_string(used) << endl;
// check callbacks
map<int, list<Context*> >::iterator p = caps_callbacks.begin();
{
int r = 0;
+ // can i read?
+ while ((latest_caps & CAP_FILE_RD) == 0) {
+ dout(10) << "read doesn't have RD cap, blocking" << endl;
+ Cond c;
+ waitfor_read.insert(&c);
+ c.Wait(client_lock);
+ waitfor_read.erase(&c);
+ }
+
// inc reading counter
num_reading++;
void FileCache::write(off_t offset, size_t size, bufferlist& blist, Mutex& client_lock)
{
+ // can i write
+ while ((latest_caps & CAP_FILE_WR) == 0) {
+ dout(10) << "write doesn't have WR cap, blocking" << endl;
+ Cond c;
+ waitfor_write.insert(&c);
+ c.Wait(client_lock);
+ waitfor_write.erase(&c);
+ }
+
// inc writing counter
num_writing++;
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
#ifndef __FILECACHE_H
#define __FILECACHE_H
//int num_unsafe;
// waiters
- list<Cond*> waitfor_read;
- list<Cond*> waitfor_write;
- //list<Context*> waitfor_safe;
+ set<Cond*> waitfor_read;
+ set<Cond*> waitfor_write;
+
bool waitfor_release;
public:
latest_caps(0),
num_reading(0), num_writing(0),// num_unsafe(0),
waitfor_release(false) {}
+ ~FileCache() {
+ tear_down();
+ }
// waiters/waiting
bool can_read() { return latest_caps & CAP_FILE_RD; }
bool can_write() { return latest_caps & CAP_FILE_WR; }
bool all_safe();// { return num_unsafe == 0; }
- void add_read_waiter(Cond *c) { waitfor_read.push_back(c); }
- void add_write_waiter(Cond *c) { waitfor_write.push_back(c); }
- void add_safe_waiter(Context *c);// { waitfor_safe.push_back(c); }
+ void add_safe_waiter(Context *c);
+
+ void truncate(off_t olds, off_t news);
// ...
void flush_dirty(Context *onflush=0);
bool is_cached();
bool is_dirty();
+ void tear_down();
+
int get_caps() { return latest_caps; }
+ int get_used_caps();
void set_caps(int caps, Context *onimplement=0);
void check_caps();
} else if (strcmp(args[i],"optest") == 0) {
syn_modes.push_back( SYNCLIENT_MODE_OPTEST );
syn_iargs.push_back( atoi(args[++i]) );
+
+ } else if (strcmp(args[i],"truncate") == 0) {
+ syn_modes.push_back( SYNCLIENT_MODE_TRUNCATE );
+ syn_sargs.push_back(args[++i]);
+ syn_iargs.push_back(atoi(args[++i]));
} else {
cerr << "unknown syn arg " << args[i] << endl;
assert(0);
}
}
break;
+
+ case SYNCLIENT_MODE_TRUNCATE:
+ {
+ string file = get_sarg(0);
+ sargs.push_front(file);
+ int iarg1 = iargs.front(); iargs.pop_front();
+ if (run_me())
+ client->truncate(file.c_str(), iarg1);
+ }
+ break;
default:
assert(0);
char *buf = new char[size];
client->read(fh, buf, size, off);
delete[] buf;
+ } else if (strcmp(op, "lseek") == 0) {
+ __int64_t id = t.get_int();
+ __int64_t fh = open_files[id];
+ int off = t.get_int();
+ int whence = t.get_int();
+ client->lseek(fh, off, whence);
} else if (strcmp(op, "write") == 0) {
__int64_t id = t.get_int();
__int64_t fh = open_files[id];
for (map<string, inode_t>::iterator it = contents.begin();
it != contents.end();
it++) {
+ if (it->first == ".") continue;
+ if (it->first == "..") continue;
string file = basedir + "/" + it->first;
if (time_to_stop()) break;
continue;
}
+ // print
+ char *tm = ctime(&st.st_mtime);
+ tm[strlen(tm)-1] = 0;
+ printf("%c%c%c%c%c%c%c%c%c%c %2d %5d %5d %8d %12s %s\n",
+ S_ISDIR(st.st_mode) ? 'd':'-',
+ (st.st_mode & 0400) ? 'r':'-',
+ (st.st_mode & 0200) ? 'w':'-',
+ (st.st_mode & 0100) ? 'x':'-',
+ (st.st_mode & 040) ? 'r':'-',
+ (st.st_mode & 020) ? 'w':'-',
+ (st.st_mode & 010) ? 'x':'-',
+ (st.st_mode & 04) ? 'r':'-',
+ (st.st_mode & 02) ? 'w':'-',
+ (st.st_mode & 01) ? 'x':'-',
+ (int)st.st_nlink,
+ st.st_uid, st.st_gid,
+ (int)st.st_size,
+ tm,
+ file.c_str());
+
+
if ((st.st_mode & INODE_TYPE_MASK) == INODE_MODE_DIR) {
dirq.push_back(file);
}
}
dout(2) << "writing block " << i << "/" << chunks << endl;
- // fill buf with a fingerprint
- int *p = (int*)buf;
+ // fill buf with a 16 byte fingerprint
+ // 64 bits : file offset
+ // 64 bits : client id
+ // = 128 bits (16 bytes)
+ __uint64_t *p = (__uint64_t*)buf;
while ((char*)p < buf + wrsize) {
- *p = (char*)p - buf;
- p++;
- *p = i;
+ *p = i*wrsize + (char*)p - buf;
p++;
*p = client->get_nodeid();
p++;
- *p = 0;
- p++;
}
client->write(fd, buf, wrsize, i*wrsize);
for (unsigned i=0; i<chunks; i++) {
if (time_to_stop()) break;
dout(2) << "reading block " << i << "/" << chunks << endl;
- client->read(fd, buf, rdsize, i*rdsize);
+ int r = client->read(fd, buf, rdsize, i*rdsize);
+ if (r < rdsize) {
+ dout(1) << "read_file got r = " << r << ", probably end of file" << endl;
+ break;
+ }
// verify fingerprint
- int *p = (int*)buf;
int bad = 0;
- int boff, bgoff, bchunk, bclient, bzero;
+ __int64_t *p = (__int64_t*)buf;
+ __int64_t readoff, readclient;
while ((char*)p + 32 < buf + rdsize) {
- boff = *p;
- bgoff = (int)((char*)p - buf);
+ readoff = *p;
+ __int64_t wantoff = i*rdsize + (__int64_t)((char*)p - buf);
p++;
- bchunk = *p;
+ readclient = *p;
p++;
- bclient = *p;
- p++;
- bzero = *p;
- p++;
- if (boff != bgoff ||
- bchunk != (int)i ||
- bclient != client->get_nodeid() ||
- bzero != 0) {
+ if (readoff != wantoff ||
+ readclient != client->get_nodeid()) {
if (!bad)
- dout(0) << "WARNING: wrong data from OSD, it should be "
- << "(block=" << i
- << " offset=" << bgoff
- << " client=" << client->get_nodeid() << ")"
- << " .. but i read back .. "
- << "(block=" << bchunk
- << " offset=" << boff
- << " client=" << bclient << " zero=" << bzero << ")" << endl;
-
+ dout(0) << "WARNING: wrong data from OSD, block says fileoffset=" << readoff << " client=" << readclient
+ << ", should be offset " << wantoff << " clietn " << client->get_nodeid()
+ << endl;
bad++;
}
}
if (bad)
dout(0) << " + " << (bad-1) << " other bad 16-byte bits in this block" << endl;
-
}
client->close(fd);
#define SYNCLIENT_MODE_RANDOMSLEEP 61
#define SYNCLIENT_MODE_SLEEP 62
+#define SYNCLIENT_MODE_TRUNCATE 200
+
int play_trace(Trace& t, string& prefix);
void make_dir_mess(const char *basedir, int n);
-
};
#endif
return client->write(fh, buf, size, offset);
}
-/*
static int ceph_flush(const char *path, struct fuse_file_info *fi)
{
- fh_t fh = fi->fh;
- return client->flush(fh);
+//fh_t fh = fi->fh;
+ //return client->flush(fh);
+ // stubbed to report success so close(2) through FUSE doesn't fail;
+ // the real flush path is commented out above — presumably still TODO.
+ return 0;
}
-*/
static int ceph_statfs(const char *path, struct statvfs *stbuf)
read: ceph_read,
write: ceph_write,
statfs: ceph_statfs,
- flush: 0, //ceph_flush,
+ flush: ceph_flush,
release: ceph_release,
fsync: ceph_fsync
};
// go fuse go
cout << "ok, calling fuse_main" << endl;
- return fuse_main(newargc, newargv, &ceph_oper);
+ int r = fuse_main(newargc, newargv, &ceph_oper);
+ return r;
}
--- /dev/null
+#include "CephFSInterface.h"
+
+using namespace std;
+
+/*
+ * Class: org_apache_hadoop_fs_ceph_CephFileSystem
+ * Method: ceph_initializeClient
+ * Signature: ()J
+ * Initializes a ceph client.
+ */
+JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1initializeClient
+ (JNIEnv *, jobject)
+{
+ // Boot a Ceph client (config from CEPH_ARGS, monmap from ./.ceph_monmap),
+ // mount it, and hand the Client* back to Java as an opaque jlong handle.
+
+ cout << "Initializing Ceph client:" << endl;
+
+ // parse args from CEPH_ARGS
+ vector<char*> args;
+ env_to_vec(args);
+ parse_config_options(args);
+
+ if (g_conf.clock_tare) g_clock.tare();
+
+ // be safe
+ g_conf.use_abspaths = true;
+
+ // load monmap
+ MonMap monmap;
+ int r = monmap.read(".ceph_monmap");
+ if (r < 0) {
+ cout << "could not find .ceph_monmap" << endl;
+ // 0 signals failure to the Java side
+ return 0;
+ }
+ assert(r >= 0);
+
+ // start up network
+ rank.start_rank();
+
+ // start client
+ Client *client;
+ client = new Client(rank.register_entity(MSG_ADDR_CLIENT_NEW), &monmap);
+ client->init();
+
+ // mount
+ client->mount();
+
+ // NOTE(review): punning the pointer into a jlong assumes
+ // sizeof(Client*) <= sizeof(jlong) — confirm for all target platforms.
+ jlong clientp = *(jlong*)&client;
+ return clientp;
+}
+
+
+
+/*
+ * Class: org_apache_hadoop_fs_ceph_CephFileSystem
+ * Method: ceph_copyFromLocalFile
+ * Signature: (JLjava/lang/String;Ljava/lang/String;)Z
+ */
+JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1copyFromLocalFile
+(JNIEnv * env, jobject obj, jlong clientp, jstring j_local_path, jstring j_ceph_path) {
+
+ // Copy a local file into Ceph, 1MB at a time.
+ // NOTE(review): every failure path is an assert(), which aborts the whole
+ // JVM — TODO: return JNI_FALSE on error instead.
+ cout << "In copyFromLocalFile" << endl;
+ cout.flush();
+ Client* client;
+ //client = (Client*) clientp;
+ client = *(Client**)&clientp;
+
+ const char* c_local_path = env->GetStringUTFChars(j_local_path, 0);
+ const char* c_ceph_path = env->GetStringUTFChars(j_ceph_path, 0);
+
+ cout << "Local source file is "<< c_local_path << " and Ceph destination file is " << c_ceph_path << endl;
+ struct stat st;
+ int r = ::stat(c_local_path, &st);
+ assert (r == 0);
+
+ // open the files
+ int fh_local = ::open(c_local_path, O_RDONLY);
+ int fh_ceph = client->open(c_ceph_path, O_WRONLY|O_CREAT|O_TRUNC);
+ assert (fh_local > -1);
+ assert (fh_ceph > -1);
+ cout << "local fd is " << fh_local << " and Ceph fd is " << fh_ceph << endl;
+
+ // get the source file size
+ off_t remaining = st.st_size;
+
+ // copy the file a MB at a time
+ const int chunk = 1048576;
+ bufferptr bp(chunk);
+
+ while (remaining > 0) {
+ off_t got = ::read(fh_local, bp.c_str(), MIN(remaining,chunk));
+ assert(got > 0);
+ remaining -= got;
+ // offset -1 = write at the fd's current position
+ off_t wrote = client->write(fh_ceph, bp.c_str(), got, -1);
+ assert (got == wrote);
+ }
+ client->close(fh_ceph);
+ ::close(fh_local);
+
+ env->ReleaseStringUTFChars(j_local_path, c_local_path);
+ env->ReleaseStringUTFChars(j_ceph_path, c_ceph_path);
+
+ return JNI_TRUE;
+}
+
+/*
+ * Class: org_apache_hadoop_fs_ceph_CephFileSystem
+ * Method: ceph_copyToLocalFile
+ * Signature: (JLjava/lang/String;Ljava/lang/String;)Z
+ */
+JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1copyToLocalFile
+(JNIEnv *env, jobject obj, jlong clientp, jstring j_ceph_path, jstring j_local_path)
+{
+
+ // Copy a Ceph file out to the local filesystem, 1MB at a time.
+ // NOTE(review): failures assert() and abort the JVM — TODO: return
+ // JNI_FALSE on error instead.
+
+ Client* client;
+ client = *(Client**)&clientp;
+ const char* c_ceph_path = env->GetStringUTFChars(j_ceph_path, 0);
+ const char* c_local_path = env->GetStringUTFChars(j_local_path, 0);
+
+ cout << "In copyToLocalFile, copying from Ceph file " << c_ceph_path <<
+ " to local file " << c_local_path << endl;
+ cout.flush();
+
+
+ // get source file size
+ struct stat st;
+ cout << "Attempting lstat with file " << c_ceph_path << ":" << endl;
+ int r = client->lstat(c_ceph_path, &st);
+ assert (r == 0);
+
+ cout << "Opening Ceph source file for read: " << endl;
+ cout.flush();
+ int fh_ceph = client->open(c_ceph_path, O_RDONLY);
+ assert (fh_ceph > -1);
+
+ cout << " Opened Ceph file! Opening local destination file: " << endl;
+ cout.flush();
+ int fh_local = ::open(c_local_path, O_WRONLY|O_CREAT|O_TRUNC, 0644);
+ assert (fh_local > -1);
+
+ // copy the file a chunk at a time
+ const int chunk = 1048576;
+ bufferptr bp(chunk);
+
+ off_t remaining = st.st_size;
+ while (remaining > 0) {
+ // offset -1 = read from the fd's current position
+ off_t got = client->read(fh_ceph, bp.c_str(), MIN(remaining,chunk), -1);
+ assert(got > 0);
+ remaining -= got;
+ off_t wrote = ::write(fh_local, bp.c_str(), got);
+ assert (got == wrote);
+ }
+ client->close(fh_ceph);
+ ::close(fh_local);
+
+ env->ReleaseStringUTFChars(j_local_path, c_local_path);
+ env->ReleaseStringUTFChars(j_ceph_path, c_ceph_path);
+
+ return JNI_TRUE;
+}
+
+/*
+ * Class: org_apache_hadoop_fs_ceph_CephFileSystem
+ * Method: ceph_getcwd
+ * Signature: (J)Ljava/lang/String;
+ * Returns the current working directory.
+ */
+JNIEXPORT jstring JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1getcwd
+ (JNIEnv *env, jobject obj, jlong clientp)
+{
+ // Return the client's current working directory as a Java string.
+ // The std::string temporary from getcwd() lives through the full
+ // expression, so c_str() is safe inside NewStringUTF.
+ //cout << "In getcwd" << endl;
+ //cout.flush();
+
+ Client* client;
+ client = *(Client**)&clientp;
+
+ return (env->NewStringUTF(client->getcwd().c_str()));
+}
+
+/*
+ * Class: org_apache_hadoop_fs_ceph_CephFileSystem
+ * Method: ceph_setcwd
+ * Signature: (JLjava/lang/String;)Z
+ *
+ * Changes the working directory.
+ */
+JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1setcwd
+(JNIEnv *env, jobject obj, jlong clientp, jstring j_path)
+{
+  // Change the client's working directory; JNI_TRUE on success.
+  //cout << "In setcwd" << endl;
+  //cout.flush();
+
+  Client* client;
+  client = *(Client**)&clientp;
+
+  const char* c_path = env->GetStringUTFChars(j_path, 0);
+  // Fix: compute the result BEFORE releasing the UTF buffer. The old code
+  // returned first, leaving ReleaseStringUTFChars unreachable and leaking
+  // the string copy on every call.
+  jboolean ok = (0 <= client->chdir(c_path)) ? JNI_TRUE : JNI_FALSE;
+  env->ReleaseStringUTFChars(j_path, c_path);
+  return ok;
+}
+
+/*
+ * Class: org_apache_hadoop_fs_ceph_CephFileSystem
+ * Method: ceph_rmdir
+ * Signature: (JLjava/lang/String;)Z
+ * Removes an empty directory.
+ */
+JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1rmdir
+  (JNIEnv *env, jobject, jlong clientp, jstring j_path)
+{
+  // Remove an empty directory; JNI_TRUE on success.
+  cout << "In rmdir" << endl;
+  cout.flush();
+
+  Client* client;
+  client = *(Client**)&clientp;
+
+  const char* c_path = env->GetStringUTFChars(j_path, 0);
+  // Fix: release the UTF buffer before returning. The old code returned
+  // first, so ReleaseStringUTFChars was unreachable — a per-call leak.
+  jboolean ok = (0 == client->rmdir(c_path)) ? JNI_TRUE : JNI_FALSE;
+  env->ReleaseStringUTFChars(j_path, c_path);
+  return ok;
+}
+
+
+/*
+ * Class: org_apache_hadoop_fs_ceph_CephFileSystem
+ * Method: ceph_mkdir
+ * Signature: (JLjava/lang/String;)Z
+ * Creates a directory with full permissions.
+ */
+JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1mkdir
+  (JNIEnv * env, jobject, jlong clientp, jstring j_path)
+{
+  // Create a directory; JNI_TRUE on success.
+  //cout << "In mkdir" << endl;
+  //cout.flush();
+
+  Client* client;
+  client = *(Client**)&clientp;
+
+  const char* c_path = env->GetStringUTFChars(j_path, 0);
+  // Fix: release the UTF buffer before returning. The old code returned
+  // first, so ReleaseStringUTFChars was unreachable — a per-call leak.
+  // NOTE(review): mode 0xFF is an odd permission mask (not 0777) — confirm
+  // it is intentional.
+  jboolean ok = (0 == client->mkdir(c_path, 0xFF)) ? JNI_TRUE : JNI_FALSE;
+  env->ReleaseStringUTFChars(j_path, c_path);
+  return ok;
+}
+
+/*
+ * Class: org_apache_hadoop_fs_ceph_CephFileSystem
+ * Method: ceph_unlink
+ * Signature: (JLjava/lang/String;)Z
+ * Unlinks a path.
+ */
+JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1unlink
+  (JNIEnv * env, jobject, jlong clientp, jstring j_path)
+{
+  // Remove a path: rmdir() for directories, unlink() for regular files.
+  // JNI_TRUE on success, JNI_FALSE if the path is missing or removal fails.
+  cout.flush();
+
+  Client* client;
+  client = *(Client**)&clientp;
+
+  const char* c_path = env->GetStringUTFChars(j_path, 0);
+  cout << "In unlink for path " << c_path << ":" << endl;
+
+  // is it a file or a directory?
+  struct stat stbuf;
+  int stat_result = client->lstat(c_path, &stbuf);
+  if (stat_result < 0) { // then the path doesn't even exist
+    cout << "ceph_unlink: path " << c_path << " does not exist" << endl;
+    // Fix: the old early return skipped ReleaseStringUTFChars (leaking the
+    // UTF buffer) and returned C++ 'false' instead of JNI_FALSE.
+    env->ReleaseStringUTFChars(j_path, c_path);
+    return JNI_FALSE;
+  }
+  int result;
+  if (0 != S_ISDIR(stbuf.st_mode)) { // it's a directory
+    cout << "ceph_unlink: path " << c_path << " is a directory. Calling client->rmdir()" << endl;
+    result = client->rmdir(c_path);
+  }
+  else if (0 != S_ISREG(stbuf.st_mode)) { // it's a file
+    cout << "ceph_unlink: path " << c_path << " is a file. Calling client->unlink()" << endl;
+    result = client->unlink(c_path);
+  }
+  else {
+    cout << "ceph_unlink: path " << c_path << " is not a file or a directory. Failing:" << endl;
+    result = -1;
+  }
+
+  cout << "In ceph_unlink for path " << c_path <<
+    ": got result "
+       << result << ". Returning..."<< endl;
+
+  env->ReleaseStringUTFChars(j_path, c_path);
+  return (0 == result) ? JNI_TRUE : JNI_FALSE;
+}
+
+/*
+ * Class: org_apache_hadoop_fs_ceph_CephFileSystem
+ * Method: ceph_rename
+ * Signature: (JLjava/lang/String;Ljava/lang/String;)Z
+ * Renames a file.
+ */
+JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1rename
+  (JNIEnv *env, jobject, jlong clientp, jstring j_from, jstring j_to)
+{
+  // Rename a file; JNI_TRUE on success.
+  cout << "In rename" << endl;
+  cout.flush();
+
+
+  Client* client;
+  client = *(Client**)&clientp;
+
+  const char* c_from = env->GetStringUTFChars(j_from, 0);
+  const char* c_to = env->GetStringUTFChars(j_to, 0);
+
+  // Fix: release BOTH UTF buffers before returning. The old code returned
+  // first, so both ReleaseStringUTFChars calls were unreachable — two
+  // leaked string copies per call.
+  jboolean ok = (0 <= client->rename(c_from, c_to)) ? JNI_TRUE : JNI_FALSE;
+  env->ReleaseStringUTFChars(j_from, c_from);
+  env->ReleaseStringUTFChars(j_to, c_to);
+  return ok;
+}
+
+/*
+ * Class: org_apache_hadoop_fs_ceph_CephFileSystem
+ * Method: ceph_exists
+ * Signature: (JLjava/lang/String;)Z
+ * Returns true if the path exists.
+ */
+JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1exists
+(JNIEnv *env, jobject, jlong clientp, jstring j_path)
+{
+
+ // JNI_TRUE iff lstat() on the path succeeds. Unlike some siblings, this
+ // function correctly releases the UTF buffer before either return.
+ //cout << "In exists" << endl;
+ //cout.flush();
+
+ Client* client;
+ struct stat stbuf;
+ client = *(Client**)&clientp;
+
+ const char* c_path = env->GetStringUTFChars(j_path, 0);
+ cout << "Attempting lstat with file " << c_path << ":" ;
+ //int i = (int) (*c_path);
+ //cout << "First character value is " << i;
+ // cout.flush();
+ int result = client->lstat(c_path, &stbuf);
+ cout << "result is " << result << endl;
+ // cout << "Attempting to release string \"" << c_path << "\"" << endl;
+ //cout.flush();
+ env->ReleaseStringUTFChars(j_path, c_path);
+ //cout << "String released!" << endl;
+ if (result < 0) {
+ //cout << "Returning false (file does not exist)" << endl;
+ //cout.flush();
+ return JNI_FALSE;
+ }
+ else {
+ //cout << "Returning true (file exists)" << endl;
+ //cout.flush();
+ return JNI_TRUE;
+ }
+
+}
+
+/*
+ * Class: org_apache_hadoop_fs_ceph_CephFileSystem
+ * Method: ceph_getblocksize
+ * Signature: (JLjava/lang/String;)J
+ * Returns the block size. Size is -1 if the file
+ * does not exist.
+ * TODO: see if Hadoop wants something more like stripe size
+ */
+JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1getblocksize
+  (JNIEnv *env, jobject obj, jlong clientp, jstring j_path)
+{
+  cout << "In getblocksize" << endl;
+  cout.flush();
+
+  // Recover the Client pointer from the jlong handle.
+  Client* client;
+  struct stat stbuf;
+  client = *(Client**)&clientp;
+
+  // Use jlong, not jint: the function returns jlong, and a jint
+  // intermediate would silently truncate (matches getfilesize).
+  jlong result;
+
+  // Stat the path; report st_blksize, or -1 if it doesn't exist.
+  const char* c_path = env->GetStringUTFChars(j_path, 0);
+  if (0 > client->lstat(c_path, &stbuf))
+    result = -1;
+  else
+    result = stbuf.st_blksize;
+
+  env->ReleaseStringUTFChars(j_path, c_path);
+  return result;
+}
+
+/*
+ * Class: org_apache_hadoop_fs_ceph_CephFileSystem
+ * Method: ceph_getfilesize
+ * Signature: (JLjava/lang/String;)J
+ * Returns the file size, or -1 on failure.
+ */
+JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1getfilesize
+  (JNIEnv *env, jobject, jlong clientp, jstring j_path)
+{
+  cout << "In getfilesize" << endl;
+  cout.flush();
+
+  // Recover the Client pointer from the jlong handle.
+  Client* client = *(Client**)&clientp;
+
+  // Stat the path; on success report st_size, otherwise -1.
+  struct stat st;
+  jlong size = -1;
+  const char* c_path = env->GetStringUTFChars(j_path, 0);
+  if (client->lstat(c_path, &st) >= 0)
+    size = st.st_size;
+  env->ReleaseStringUTFChars(j_path, c_path);
+
+  return size;
+}
+
+/*
+ * Class: org_apache_hadoop_fs_ceph_CephFileSystem
+ * Method: ceph_isfile
+ * Signature: (JLjava/lang/String;)Z
+ */
+JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1isfile
+  (JNIEnv *env, jobject obj, jlong clientp, jstring j_path)
+{
+  // Recover the Client pointer from the jlong handle.
+  Client* client = *(Client**)&clientp;
+
+  struct stat st;
+  const char* c_path = env->GetStringUTFChars(j_path, 0);
+  int r = client->lstat(c_path, &st);
+  env->ReleaseStringUTFChars(j_path, c_path);
+
+  // A failed stat means it certainly isn't a regular file.
+  if (r < 0)
+    return JNI_FALSE;
+
+  // Otherwise consult the mode bits.
+  return S_ISREG(st.st_mode) ? JNI_TRUE : JNI_FALSE;
+}
+
+
+/*
+ * Class: org_apache_hadoop_fs_ceph_CephFileSystem
+ * Method: ceph_isdirectory
+ * Signature: (JLjava/lang/String;)Z
+ * Returns true if the path is a directory.
+ */
+JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1isdirectory
+  (JNIEnv *env, jobject, jlong clientp, jstring j_path)
+{
+  // Recover the Client pointer from the jlong handle.
+  Client* client = *(Client**)&clientp;
+
+  struct stat st;
+  const char* c_path = env->GetStringUTFChars(j_path, 0);
+  int r = client->lstat(c_path, &st);
+  env->ReleaseStringUTFChars(j_path, c_path);
+
+  // A failed stat means it certainly isn't a directory.
+  if (r < 0)
+    return JNI_FALSE;
+
+  // Otherwise consult the mode bits.
+  return S_ISDIR(st.st_mode) ? JNI_TRUE : JNI_FALSE;
+}
+
+/*
+ * Class: org_apache_hadoop_fs_ceph_CephFileSystem
+ * Method: ceph_getdir
+ * Signature: (JLjava/lang/String;)[Ljava/lang/String;
+ * Returns a Java array of Strings with the directory contents
+ */
+JNIEXPORT jobjectArray JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1getdir
+(JNIEnv *env, jobject obj, jlong clientp, jstring j_path) {
+
+  // Recover the Client pointer from the jlong handle.
+  Client* client;
+  client = *(Client**)&clientp;
+
+  // get the directory listing
+  map<string, inode_t> contents;
+  const char* c_path = env->GetStringUTFChars(j_path, 0);
+  int result = client->getdir(c_path, contents);
+
+  // Record whether this is the root directory *before* releasing the
+  // UTF string: the previous code read c_path after
+  // ReleaseStringUTFChars, which is a use-after-free.
+  bool is_root = ('/' == c_path[0]) && (0 == c_path[1]);
+  env->ReleaseStringUTFChars(j_path, c_path);
+
+  if (result < 0) return NULL;
+
+  jint dir_size = contents.size();
+
+  // Hadoop doesn't want . or .. in the listing, so we shrink the
+  // listing size by two, or by one if the directory's the root
+  // (which has no "..").
+  if (is_root)
+    dir_size -= 1;
+  else
+    dir_size -= 2;
+  assert (dir_size >= 0);
+
+  // Create a Java String array of the size of the directory listing
+  jclass stringClass = env->FindClass("java/lang/String");
+  if (NULL == stringClass) {
+    cout << "ERROR: java String class not found; dying a horrible, painful death" << endl;
+    assert(0);
+  }
+  jobjectArray dirListingStringArray = (jobjectArray) env->NewObjectArray(dir_size, stringClass, NULL);
+
+  // populate the array with the elements of the directory list,
+  // omitting . and ..
+  int i = 0;
+  string dot(".");
+  string dotdot ("..");
+  for (map<string, inode_t>::iterator it = contents.begin();
+       it != contents.end();
+       it++) {
+    // skip "." and ".."
+    if (it->first == dot) continue;
+    if (it->first == dotdot) continue;
+
+    if (0 == dir_size)
+      cout << "WARNING: adding stuff to an empty array" << endl;
+    assert (i < dir_size);
+    env->SetObjectArrayElement(dirListingStringArray, i,
+                               env->NewStringUTF(it->first.c_str()));
+    ++i;
+  }
+
+  return dirListingStringArray;
+}
+
+
+/*
+ * Class: org_apache_hadoop_fs_ceph_CephFileSystem
+ * Method: ceph_open_for_read
+ * Signature: (JLjava/lang/String;)I
+ * Open a file for reading.
+ */
+JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1open_1for_1read
+  (JNIEnv *env, jobject obj, jlong clientp, jstring j_path)
+
+{
+  // Recover the Client pointer from the jlong handle.
+  Client* client = *(Client**)&clientp;
+
+  // Open read-only; Client::open hands back a file handle, or -1.
+  const char* c_path = env->GetStringUTFChars(j_path, 0);
+  jint fh = client->open(c_path, O_RDONLY);
+  env->ReleaseStringUTFChars(j_path, c_path);
+
+  return fh;
+}
+
+
+/*
+ * Class: org_apache_hadoop_fs_ceph_CephFileSystem
+ * Method: ceph_open_for_overwrite
+ * Signature: (JLjava/lang/String;)I
+ * Opens a file for overwriting; creates it if necessary.
+ */
+JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1open_1for_1overwrite
+  (JNIEnv *env, jobject obj, jlong clientp, jstring j_path)
+{
+  // Recover the Client pointer from the jlong handle.
+  Client* client = *(Client**)&clientp;
+
+  // Open for writing, creating and truncating as needed; returns the
+  // file handle, or -1 on failure.
+  const char* c_path = env->GetStringUTFChars(j_path, 0);
+  jint fh = client->open(c_path, O_WRONLY|O_CREAT|O_TRUNC);
+  env->ReleaseStringUTFChars(j_path, c_path);
+
+  return fh;
+}
+
+/*
+ * Class: org_apache_hadoop_fs_ceph_CephFileSystem
+ * Method: ceph_kill_client
+ * Signature: (J)Z
+ *
+ * Closes the Ceph client.
+ */
+JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1kill_1client
+  (JNIEnv *env, jobject obj, jlong clientp)
+{
+  // Recover the Client pointer from the jlong handle.
+  Client* client;
+  client = *(Client**)&clientp;
+
+  // Cleanly unmount and shut the client down before destroying it.
+  client->unmount();
+  client->shutdown();
+  delete client;
+
+  // wait for messenger to finish
+  rank.wait();
+
+  // Return the JNI boolean constant, not a C++ bool, to match the
+  // declared jboolean return type.
+  return JNI_TRUE;
+}
+
+
+
+/*
+ * Class: org_apache_hadoop_fs_ceph_CephInputStream
+ * Method: ceph_read
+ * Signature: (JI[BII)I
+ * Reads into the given byte array from the current position.
+ */
+JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephInputStream_ceph_1read
+  (JNIEnv *env, jobject obj, jlong clientp, jint fh, jbyteArray j_buffer, jint buffer_offset, jint length)
+{
+  // NOTE: Hadoop read semantics differ from POSIX: the read is always
+  // from the current file position, and buffer_offset is where in the
+  // *buffer* we start depositing bytes.
+
+  // Recover the Client pointer from the jlong handle.
+  Client* client = *(Client**)&clientp;
+
+  // Pin the Java byte array and locate the requested offset in it.
+  jbyte* j_buffer_ptr = env->GetByteArrayElements(j_buffer, NULL);
+  char* c_buffer = (char*) j_buffer_ptr + (int)buffer_offset;
+
+  // Read from the current file position (offset -1).
+  jint result = client->read((int)fh, c_buffer, length, -1);
+
+  // Unpin the array, copying any changes back to the JVM.
+  env->ReleaseByteArrayElements(j_buffer, j_buffer_ptr, 0);
+
+  return result;
+}
+
+
+/*
+ * Class: org_apache_hadoop_fs_ceph_CephInputStream
+ * Method: ceph_seek_from_start
+ * Signature: (JIJ)J
+ * Seeks to the given position.
+ */
+JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephInputStream_ceph_1seek_1from_1start
+  (JNIEnv *env, jobject obj, jlong clientp, jint fh, jlong pos)
+{
+  // Recover the Client pointer from the jlong handle.
+  Client* client;
+  client = *(Client**)&clientp;
+
+  // Use jlong, not jint: pos is 64-bit and the function returns
+  // jlong, so a jint intermediate would truncate large offsets.
+  jlong result;
+
+  // Absolute seek; returns the resulting offset, or negative error.
+  result = client->lseek(fh, pos, SEEK_SET);
+
+  return result;
+}
+
+
+JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephInputStream_ceph_1getpos
+  (JNIEnv *env, jobject obj, jlong clientp, jint fh)
+{
+  cout << "In CephInputStream::ceph_getpos" << endl;
+  cout.flush();
+
+  // Recover the Client pointer from the jlong handle.
+  Client* client;
+  client = *(Client**)&clientp;
+
+  // Use jlong, not jint: the current offset can exceed 32 bits and
+  // the function returns jlong.
+  jlong result;
+
+  // seek a distance of 0 to get current offset
+  result = client->lseek(fh, 0, SEEK_CUR);
+
+  return result;
+}
+
+
+/*
+ * Class: org_apache_hadoop_fs_ceph_CephInputStream
+ * Method: ceph_close
+ * Signature: (JI)I
+ * Closes the file.
+ */
+JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephInputStream_ceph_1close
+  (JNIEnv *env, jobject obj, jlong clientp, jint fh)
+{
+  cout << "In CephInputStream::ceph_close" << endl;
+  cout.flush();
+
+  // Recover the Client pointer and close the handle, propagating
+  // Client::close's return code.
+  Client* client = *(Client**)&clientp;
+  return client->close(fh);
+}
+
+/*
+ * Class: org_apache_hadoop_fs_ceph_CephOutputStream
+ * Method: ceph_seek_from_start
+ * Signature: (JIJ)J
+ */
+JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephOutputStream_ceph_1seek_1from_1start
+  (JNIEnv *env, jobject obj, jlong clientp, jint fh, jlong pos)
+{
+  cout << "In CephOutputStream::ceph_seek_from_start" << endl;
+  cout.flush();
+
+  // Recover the Client pointer from the jlong handle.
+  Client* client;
+  client = *(Client**)&clientp;
+
+  // Use jlong, not jint: pos is 64-bit and the function returns
+  // jlong, so a jint intermediate would truncate large offsets.
+  jlong result;
+
+  // Absolute seek; returns the resulting offset, or negative error.
+  result = client->lseek(fh, pos, SEEK_SET);
+
+  return result;
+}
+
+
+/*
+ * Class: org_apache_hadoop_fs_ceph_CephOutputStream
+ * Method: ceph_getpos
+ * Signature: (JI)J
+ */
+JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephOutputStream_ceph_1getpos
+  (JNIEnv *env, jobject obj, jlong clientp, jint fh)
+{
+  cout << "In CephOutputStream::ceph_getpos" << endl;
+  cout.flush();
+
+  // Recover the Client pointer from the jlong handle.
+  Client* client;
+  client = *(Client**)&clientp;
+
+  // Use jlong, not jint: the current offset can exceed 32 bits and
+  // the function returns jlong.
+  jlong result;
+
+  // seek a distance of 0 to get current offset
+  result = client->lseek(fh, 0, SEEK_CUR);
+
+  return result;
+}
+
+/*
+ * Class: org_apache_hadoop_fs_ceph_CephOutputStream
+ * Method: ceph_close
+ * Signature: (JI)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephOutputStream_ceph_1close
+  (JNIEnv *env, jobject obj, jlong clientp, jint fh)
+{
+  cout << "In CephOutputStream::ceph_close" << endl;
+  cout.flush();
+
+  // Recover the Client pointer and close the handle, propagating
+  // Client::close's return code.
+  Client* client = *(Client**)&clientp;
+  return client->close(fh);
+}
+
+
+
+/*
+ * Class: org_apache_hadoop_fs_ceph_CephOutputStream
+ * Method: ceph_write
+ * Signature: (JI[BII)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephOutputStream_ceph_1write
+  (JNIEnv *env, jobject obj, jlong clientp, jint fh, jbyteArray j_buffer, jint buffer_offset, jint length)
+{
+  // NOTE: Hadoop write semantics differ from POSIX: the write is
+  // always at the current file position, and buffer_offset is where
+  // in the *buffer* we start reading bytes from.
+
+  // Recover the Client pointer from the jlong handle.
+  Client* client = *(Client**)&clientp;
+
+  // Pin the Java byte array and locate the requested offset in it.
+  jbyte* j_buffer_ptr = env->GetByteArrayElements(j_buffer, NULL);
+  char* c_buffer = (char*) j_buffer_ptr + (int)buffer_offset;
+
+  // Write at the current file position (offset -1).
+  jint result = client->write((int)fh, c_buffer, length, -1);
+
+  // Unpin the array.
+  env->ReleaseByteArrayElements(j_buffer, j_buffer_ptr, 0);
+
+  return result;
+}
+
--- /dev/null
+/* DO NOT EDIT THIS FILE - it is machine generated */
+#include <jni.h>
+/* Header for class org_apache_hadoop_fs_ceph_CephFileSystem */
+
+#include <sys/stat.h>
+#include "client/Client.h"
+#include "config.h"
+#include "client/fuse.h"
+#include "msg/SimpleMessenger.h"
+#include "common/Timer.h"
+
+#ifndef _Included_org_apache_hadoop_fs_ceph_CephFileSystem
+#define _Included_org_apache_hadoop_fs_ceph_CephFileSystem
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#undef org_apache_hadoop_fs_ceph_CephFileSystem_DEFAULT_BLOCK_SIZE
+#define org_apache_hadoop_fs_ceph_CephFileSystem_DEFAULT_BLOCK_SIZE 1048576LL
+/*
+ * Class: org_apache_hadoop_fs_ceph_CephFileSystem
+ * Method: ceph_initializeClient
+ * Signature: ()J
+ * Initializes a ceph client.
+ */
+JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1initializeClient
+(JNIEnv *, jobject);
+
+/*
+ * Class: org_apache_hadoop_fs_ceph_CephFileSystem
+ * Method: ceph_copyFromLocalFile
+ * Signature: (JLjava/lang/String;Ljava/lang/String;)Z
+ */
+JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1copyFromLocalFile
+ (JNIEnv *, jobject, jlong, jstring, jstring);
+
+/*
+ * Class: org_apache_hadoop_fs_ceph_CephFileSystem
+ * Method: ceph_copyToLocalFile
+ * Signature: (JLjava/lang/String;Ljava/lang/String;)Z
+ */
+JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1copyToLocalFile
+ (JNIEnv *, jobject, jlong, jstring, jstring);
+
+/*
+ * Class: org_apache_hadoop_fs_ceph_CephFileSystem
+ * Method: ceph_getcwd
+ * Signature: (J)Ljava/lang/String;
+ */
+JNIEXPORT jstring JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1getcwd
+ (JNIEnv *, jobject, jlong);
+
+/*
+ * Class: org_apache_hadoop_fs_ceph_CephFileSystem
+ * Method: ceph_setcwd
+ * Signature: (JLjava/lang/String;)Z
+ */
+JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1setcwd
+ (JNIEnv *, jobject, jlong, jstring);
+
+/*
+ * Class: org_apache_hadoop_fs_ceph_CephFileSystem
+ * Method: ceph_rmdir
+ * Signature: (JLjava/lang/String;)Z
+ */
+JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1rmdir
+ (JNIEnv *, jobject, jlong, jstring);
+
+/*
+ * Class: org_apache_hadoop_fs_ceph_CephFileSystem
+ * Method: ceph_mkdir
+ * Signature: (JLjava/lang/String;)Z
+ */
+JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1mkdir
+ (JNIEnv *, jobject, jlong, jstring);
+
+/*
+ * Class: org_apache_hadoop_fs_ceph_CephFileSystem
+ * Method: ceph_unlink
+ * Signature: (JLjava/lang/String;)Z
+ */
+JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1unlink
+ (JNIEnv *, jobject, jlong, jstring);
+
+/*
+ * Class: org_apache_hadoop_fs_ceph_CephFileSystem
+ * Method: ceph_rename
+ * Signature: (JLjava/lang/String;Ljava/lang/String;)Z
+ */
+JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1rename
+ (JNIEnv *, jobject, jlong, jstring, jstring);
+
+/*
+ * Class: org_apache_hadoop_fs_ceph_CephFileSystem
+ * Method: ceph_exists
+ * Signature: (JLjava/lang/String;)Z
+ */
+JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1exists
+ (JNIEnv *, jobject, jlong, jstring);
+
+
+/*
+ * Class: org_apache_hadoop_fs_ceph_CephFileSystem
+ * Method: ceph_getblocksize
+ * Signature: (JLjava/lang/String;)J
+ */
+JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1getblocksize
+ (JNIEnv *, jobject, jlong, jstring);
+
+/*
+ * Class: org_apache_hadoop_fs_ceph_CephFileSystem
+ * Method: ceph_getfilesize
+ * Signature: (JLjava/lang/String;)J
+ */
+JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1getfilesize
+ (JNIEnv *, jobject, jlong, jstring);
+
+/*
+ * Class: org_apache_hadoop_fs_ceph_CephFileSystem
+ * Method: ceph_isdirectory
+ * Signature: (JLjava/lang/String;)Z
+ */
+JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1isdirectory
+ (JNIEnv *, jobject, jlong, jstring);
+
+/*
+ * Class: org_apache_hadoop_fs_ceph_CephFileSystem
+ * Method: ceph_isfile
+ * Signature: (JLjava/lang/String;)Z
+ */
+JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1isfile
+ (JNIEnv *, jobject, jlong, jstring);
+
+/*
+ * Class: org_apache_hadoop_fs_ceph_CephFileSystem
+ * Method: ceph_getdir
+ * Signature: (JLjava/lang/String;)[Ljava/lang/String;
+ */
+JNIEXPORT jobjectArray JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1getdir
+ (JNIEnv *, jobject, jlong, jstring);
+
+/*
+ * Class: org_apache_hadoop_fs_ceph_CephFileSystem
+ * Method: ceph_open_for_read
+ * Signature: (JLjava/lang/String;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1open_1for_1read
+ (JNIEnv *, jobject, jlong, jstring);
+
+/*
+ * Class: org_apache_hadoop_fs_ceph_CephFileSystem
+ * Method: ceph_open_for_overwrite
+ * Signature: (JLjava/lang/String;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1open_1for_1overwrite
+ (JNIEnv *, jobject, jlong, jstring);
+
+/*
+ * Class: org_apache_hadoop_fs_ceph_CephFileSystem
+ * Method: ceph_kill_client
+ * Signature: (J)Z
+ */
+JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1kill_1client
+ (JNIEnv *, jobject, jlong);
+
+#undef org_apache_hadoop_fs_ceph_CephInputStream_SKIP_BUFFER_SIZE
+#define org_apache_hadoop_fs_ceph_CephInputStream_SKIP_BUFFER_SIZE 2048L
+/*
+ * Class: org_apache_hadoop_fs_ceph_CephInputStream
+ * Method: ceph_read
+ * Signature: (JI[BII)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephInputStream_ceph_1read
+ (JNIEnv *, jobject, jlong, jint, jbyteArray, jint, jint);
+
+/*
+ * Class: org_apache_hadoop_fs_ceph_CephInputStream
+ * Method: ceph_seek_from_start
+ * Signature: (JIJ)J
+ */
+JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephInputStream_ceph_1seek_1from_1start
+ (JNIEnv *, jobject, jlong, jint, jlong);
+
+/*
+ * Class: org_apache_hadoop_fs_ceph_CephInputStream
+ * Method: ceph_getpos
+ * Signature: (JI)J
+ */
+JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephInputStream_ceph_1getpos
+ (JNIEnv *, jobject, jlong, jint);
+
+/*
+ * Class: org_apache_hadoop_fs_ceph_CephInputStream
+ * Method: ceph_close
+ * Signature: (JI)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephInputStream_ceph_1close
+ (JNIEnv *, jobject, jlong, jint);
+
+/* Header for class org_apache_hadoop_fs_ceph_CephOutputStream */
+
+/*
+ * Class: org_apache_hadoop_fs_ceph_CephOutputStream
+ * Method: ceph_seek_from_start
+ * Signature: (JIJ)J
+ */
+JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephOutputStream_ceph_1seek_1from_1start
+ (JNIEnv *, jobject, jlong, jint, jlong);
+
+/*
+ * Class: org_apache_hadoop_fs_ceph_CephOutputStream
+ * Method: ceph_getpos
+ * Signature: (JI)J
+ */
+JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephOutputStream_ceph_1getpos
+ (JNIEnv *, jobject, jlong, jint);
+
+/*
+ * Class: org_apache_hadoop_fs_ceph_CephOutputStream
+ * Method: ceph_close
+ * Signature: (JI)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephOutputStream_ceph_1close
+ (JNIEnv *, jobject, jlong, jint);
+
+/*
+ * Class: org_apache_hadoop_fs_ceph_CephOutputStream
+ * Method: ceph_write
+ * Signature: (JI[BII)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephOutputStream_ceph_1write
+ (JNIEnv *, jobject, jlong, jint, jbyteArray, jint, jint);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
}
}
+ if (g_conf.clock_tare) g_clock.tare();
// load monmap
MonMap monmap;
return -1;
}
}
-
+
+ if (g_conf.clock_tare) g_clock.tare();
+
MonMap monmap;
if (whoami < 0) {
public:
Clock() {
// set offset
- tare();
+ //tare();
}
// real time.
void tare() {
gettimeofday(&zero.timeval(), NULL);
}
+ void tare(utime_t z) {
+ zero = z;
+ }
utime_t now() {
//lock.Lock();
utime_t n;
{
logger_lock.Lock();
{
+  filename = "";
+  if (g_conf.use_abspaths) {
+    // get_current_dir_name() returns malloc'd memory; pair it with
+    // free(), not delete (mismatched allocator is undefined behavior).
+    char *cwd = get_current_dir_name();
+    filename = cwd;
+    free(cwd);
+    filename += "/";
+  }
+
-  filename = "log/";
+  // Append, don't assign: plain '=' would clobber the absolute-path
+  // prefix built above.
+  filename += "log/";
if (g_conf.log_name) {
filename += g_conf.log_name;
//cout << "log " << filename << endl;
interval = g_conf.log_interval;
- //start = g_clock.now(); // time 0!
+ if (!g_conf.clock_tare)
+ start = g_clock.now(); // time 0! otherwise g_clock does it for us.
+
last_logged = 0;
wrote_header = -1;
open = false;
}
friend class Cond;
+
+
+public:
+  // RAII scoped-lock helper: acquires the Mutex on construction and
+  // releases it on destruction, so early returns and exceptions
+  // cannot leave the mutex held.
+  class Locker {
+    Mutex &mutex;
+
+  public:
+    Locker(Mutex& m) : mutex(m) {
+      mutex.Lock();
+    }
+    ~Locker() {
+      mutex.Unlock();
+    }
+  };
};
#endif
#define __THREAD_H
#include <pthread.h>
+#include <errno.h>
class Thread {
private:
}
int join(void **prval = 0) {
- if (thread_id == 0) return -1; // never started.
+ if (thread_id == 0) {
+ cerr << "WARNING: join on thread that was never started" << endl;
+ //assert(0);
+ return -EINVAL; // never started.
+ }
+
int status = pthread_join(thread_id, prval);
if (status == 0)
thread_id = 0;
scheduled.erase(tp);
lock.Unlock();
+
+ // delete the canceled event.
+ delete callback;
+
return true;
}
if (g_timer.cancel_event(scheduled[c])) {
// hosed wrapper. hose original event too.
- delete scheduled[c];
+ delete c;
} else {
// clean up later.
canceled[c] = scheduled[c];
Mutex bufferlock;
#include "osd/osd_types.h"
+Mutex _dout_lock;
FileLayout g_OSD_FileLayout( 1<<20, 1, 1<<20, pg_t::TYPE_REP, 2 ); // stripe over 1M objects, 2x replication
//FileLayout g_OSD_FileLayout( 1<<17, 4, 1<<20 ); // 128k stripes over sets of 4
debug_after: 0,
+ // -- misc --
+ use_abspaths: false, // make monitorstore et al use absolute path (to workaround FUSE chdir("/"))
+
// --- clock ---
clock_lock: false,
+ clock_tare: true,
// --- messenger ---
ms_single_dispatch: false,
mds_decay_halflife: 30,
mds_beacon_interval: 5.0,
- mds_beacon_grace: 10.0,
+ mds_beacon_grace: 100.0,
mds_log: true,
mds_log_max_len: MDS_CACHE_SIZE / 3,
bdbstore_ffactor: 0,
bdbstore_nelem: 0,
bdbstore_pagesize: 0,
- bdbstore_cachesize: 0
+ bdbstore_cachesize: 0,
+ bdbstore_transactional: false
#endif // USE_OSBDB
};
else if (strcmp(args[i], "--clock_lock") == 0)
g_conf.clock_lock = atoi(args[++i]);
+ else if (strcmp(args[i], "--clock_tare") == 0)
+ g_conf.clock_tare = atoi(args[++i]);
else if (strcmp(args[i], "--objecter_buffer_uncommitted") == 0)
g_conf.objecter_buffer_uncommitted = atoi(args[++i]);
else if (strcmp(args[i], "--mds_cache_size") == 0)
g_conf.mds_cache_size = atoi(args[++i]);
+ else if (strcmp(args[i], "--mds_beacon_interval") == 0)
+ g_conf.mds_beacon_interval = atoi(args[++i]);
+ else if (strcmp(args[i], "--mds_beacon_grace") == 0)
+ g_conf.mds_beacon_grace = atoi(args[++i]);
+
else if (strcmp(args[i], "--mds_log") == 0)
g_conf.mds_log = atoi(args[++i]);
else if (strcmp(args[i], "--mds_log_before_reply") == 0)
else if (strcmp(args[i], "--bdbstore-cachesize") == 0) {
g_conf.bdbstore_cachesize = atoi(args[++i]);
}
+ else if (strcmp(args[i], "--bdbstore-transactional") == 0) {
+ g_conf.bdbstore_transactional = true;
+ }
+ else if (strcmp(args[i], "--debug-bdbstore") == 0) {
+ g_conf.debug_bdbstore = atoi(args[++i]);
+ }
#endif // USE_OSBDB
else {
#include <vector>
#include <map>
+#include "common/Mutex.h"
+
extern std::map<int,float> g_fake_osd_down;
extern std::map<int,float> g_fake_osd_out;
int debug_after;
+ // misc
+ bool use_abspaths;
+
// clock
bool clock_lock;
+ bool clock_tare;
// messenger
int bdbstore_nelem;
int bdbstore_pagesize;
int bdbstore_cachesize;
+ bool bdbstore_transactional;
#endif // USE_OSBDB
};
extern md_config_t g_conf;
extern md_config_t g_debug_after_conf;
+
+/**
+ * debug output framework
+ */
#define dout(x) if ((x) <= g_conf.debug) std::cout
#define dout2(x) if ((x) <= g_conf.debug) std::cout
+/**
+ * for cleaner output, bracket each line with
+ * dbeginl (in the dout macro) and dendl (in place of endl).
+ */
+extern Mutex _dout_lock;
+// Tag types for the dbeginl/dendl stream manipulators below; the
+// dummy int constructor argument just permits initialization from 0.
+struct _dbeginl_t {
+  _dbeginl_t(int) {}
+};
+struct _dendl_t {
+  _dendl_t(int) {}
+};
+static const _dbeginl_t dbeginl = 0;
+static const _dendl_t dendl = 0;
+
+// Streaming dbeginl takes the global _dout_lock; the matching dendl
+// emits endl and releases it.  The lock is held for the entire
+// bracketed line, so every dbeginl MUST be paired with a dendl or
+// the lock is never released.
+inline ostream& operator<<(ostream& out, _dbeginl_t) {
+  _dout_lock.Lock();
+  return out;
+}
+inline ostream& operator<<(ostream& out, _dendl_t) {
+  out << endl;
+  _dout_lock.Unlock();
+  return out;
+}
+
+
+/**
+ * command line / environment argument parsing
+ */
void env_to_vec(std::vector<char*>& args);
void argv_to_vec(int argc, char **argv,
std::vector<char*>& args);
#include "osd/OSD.h"
#include "ebofs/Ebofs.h"
-#include "msg/NewMessenger.h"
+#include "msg/SimpleMessenger.h"
#include "common/Timer.h"
if (g_conf.debug_after)
g_timer.add_event_after(g_conf.debug_after, new C_Debug);
+ if (g_conf.clock_tare) g_clock.tare();
+
// osd specific args
char *dev;
int whoami = -1;
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
#ifndef __crush_BINARYTREE_H
#define __crush_BINARYTREE_H
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
#ifndef __crush_BUCKET_H
#define __crush_BUCKET_H
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
// Robert Jenkins' function for mixing 32-bit values
// http://burtleburtle.net/bob/hash/evahash.html
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
#ifndef __crush_CRUSH_H
#define __crush_CRUSH_H
int bucketno;
Hash h;
- hash_map<int, int> parent_map; // what bucket each leaf/bucket lives in
+ hash_map<int, int> parent_map; // what bucket each leaf/bucket lives in
public:
map<int, Rule> rules;
off += sizeof(r);
rules[r]._decode(bl,off);
}
-
- // index
- build_parent_map();
+
+ // index
+ build_parent_map();
}
- void build_parent_map() {
- parent_map.clear();
-
- // index every bucket
- for (map<int, Bucket*>::iterator bp = buckets.begin();
- bp != buckets.end();
- ++bp) {
- // index bucket items
- vector<int> items;
- bp->second->get_items(items);
- for (vector<int>::iterator ip = items.begin();
- ip != items.end();
- ++ip)
- parent_map[*ip] = bp->first;
- }
- }
-
+ void build_parent_map() {
+ parent_map.clear();
+
+ // index every bucket
+ for (map<int, Bucket*>::iterator bp = buckets.begin();
+ bp != buckets.end();
+ ++bp) {
+ // index bucket items
+ vector<int> items;
+ bp->second->get_items(items);
+ for (vector<int>::iterator ip = items.begin();
+ ip != items.end();
+ ++ip)
+ parent_map[*ip] = bp->first;
+ }
+ }
+
public:
vector<int>& outvec,
bool firstn,
set<int>& outset, map<int,float>& overloadmap,
- bool forcefeed=false,
- int forcefeedval=-1) {
+ bool forcefeed=false,
+ int forcefeedval=-1) {
int off = outvec.size();
// for each replica
for (int rep=0; rep<numrep; rep++) {
int outv = -1; // my result
- // forcefeed?
- if (forcefeed) {
- forcefeed = false;
- outvec.push_back(forcefeedval);
- continue;
- }
-
+ // forcefeed?
+ if (forcefeed) {
+ forcefeed = false;
+ outvec.push_back(forcefeedval);
+ continue;
+ }
+
// keep trying until we get a non-out, non-colliding item
int ftotal = 0;
bool skip_rep = false;
-
+
while (1) {
// start with the input bucket
Bucket *in = inbucket;
//int numresult = 0;
result.clear();
- // determine hierarchical context for first.
- list<int> force_stack;
- if (forcefeed >= 0) {
- int t = forcefeed;
- while (1) {
- force_stack.push_front(t);
- if (parent_map.count(t) == 0) break; // reached root, presumably.
- //cout << " " << t << " parent is " << parent_map[t] << endl;
- t = parent_map[t];
- }
- }
-
+ // determine hierarchical context for first.
+ list<int> force_stack;
+ if (forcefeed >= 0) {
+ int t = forcefeed;
+ while (1) {
+ force_stack.push_front(t);
+ if (parent_map.count(t) == 0) break; // reached root, presumably.
+ //cout << " " << t << " parent is " << parent_map[t] << endl;
+ t = parent_map[t];
+ }
+ }
+
// working vector
vector<int> w; // working variable
-
+
// go through each statement
for (vector<RuleStep>::iterator pc = rule.steps.begin();
pc != rule.steps.end();
{
const int arg = pc->args[0];
//cout << "take " << arg << endl;
-
- if (!force_stack.empty()) {
- int forceval = force_stack.front();
- force_stack.pop_front();
- assert(arg == forceval);
- }
-
+
+ if (!force_stack.empty()) {
+ int forceval = force_stack.front();
+ force_stack.pop_front();
+ assert(arg == forceval);
+ }
+
w.clear();
w.push_back(arg);
}
vector<int> out;
// forcefeeding?
- bool forcing = false;
- int forceval;
- if (!force_stack.empty()) {
- forceval = force_stack.front();
- force_stack.pop_front();
- //cout << "priming out with " << forceval << endl;
- forcing = true;
- }
-
+ bool forcing = false;
+ int forceval;
+ if (!force_stack.empty()) {
+ forceval = force_stack.front();
+ force_stack.pop_front();
+ //cout << "priming out with " << forceval << endl;
+ forcing = true;
+ }
+
// do each row independently
for (vector<int>::iterator i = w.begin();
i != w.end();
i++) {
assert(buckets.count(*i));
Bucket *b = buckets[*i];
- choose(x, numrep, type, b, out, firstn,
- outset, overloadmap,
- forcing,
- forceval);
- forcing = false; // only once
+ choose(x, numrep, type, b, out, firstn,
+ outset, overloadmap,
+ forcing,
+ forceval);
+ forcing = false; // only once
} // for inrow
// put back into w
// args for fuse
vec_to_argv(args, argc, argv);
+ if (g_conf.clock_tare) g_clock.tare();
+
// load monmap
MonMap monmap;
int r = monmap.read(".ceph_monmap");
// start up network
rank.start_rank();
- // start client
- Client *client = new Client(rank.register_entity(MSG_ADDR_CLIENT_NEW), &monmap);
- client->init();
+ list<Client*> clients;
+ list<SyntheticClient*> synclients;
+
+ cout << "mounting and starting " << g_conf.num_client << " syn client(s)" << endl;
+ for (int i=0; i<g_conf.num_client; i++) {
+ // start client
+ Client *client = new Client(rank.register_entity(MSG_ADDR_CLIENT_NEW), &monmap);
+ client->init();
- // start syntheticclient
- SyntheticClient *syn = new SyntheticClient(client);
+ // start syntheticclient
+ SyntheticClient *syn = new SyntheticClient(client);
- // start up fuse
- // use my argc, argv (make sure you pass a mount point!)
- cout << "mounting" << endl;
- client->mount();
-
- cout << "starting syn client" << endl;
- syn->start_thread();
+ client->mount();
+
+ syn->start_thread();
+
+ clients.push_back(client);
+ synclients.push_back(syn);
+ }
+
+ cout << "waiting for client(s) to finish" << endl;
+ while (!clients.empty()) {
+ Client *client = clients.front();
+ SyntheticClient *syn = synclients.front();
+ clients.pop_front();
+ synclients.pop_front();
+
+ // wait
+ syn->join_thread();
- // wait
- syn->join_thread();
+ // unmount
+ client->unmount();
+ client->shutdown();
- // unmount
- client->unmount();
- cout << "unmounted" << endl;
- client->shutdown();
-
- delete client;
-
+ delete syn;
+ delete client;
+ }
+
// wait for messenger to finish
rank.wait();
--- /dev/null
+OBJECT STORE ON BERKELEY DB
+---------------------------
+
+OSBDB is an implementation of an object store that uses Berkeley DB as
+the underlying storage. It is meant to be an alternative to EBOFS.
+
+BUILDING
+--------
+
+You will need to have Berkeley DB installed, including the development
+packages. We've tested this with Berkeley DB 4.4.20 on Ubuntu 6.10.
+
+To compile OSBDB support, you need to pass the argument "want_bdb=yes"
+to "make." If you don't specify this, OSBDB and all its associated
+support is not included in the executables.
+
+RUNNING
+-------
+
+To use OSBDB in Ceph, simply pass the --bdbstore flag to programs. You
+don't need to create a "device" for OSBDB ahead of time; Berkeley DB
+will take care of creating the files. You also *cannot* use a raw
+device as your store -- it must be a regular file.
+
+OSBDB additionally accepts the following flags:
+
+ --bdbstore-btree Configures OSBDB to use the "Btree"
+ database type for Berkeley DB. The default
+ database type is "Hash".
+
+ --bdbstore-hash-ffactor Sets the "fill factor" for the hash
+ database type. Takes an integer argument.
+
+ --bdbstore-hash-nelem Sets the "nelem" parameter for the hash
+ database type. Takes an integer argument.
+
+ --bdbstore-hash-pagesize Sets the page size for the hash database
+ type. Takes an integer argument.
+
+ --bdbstore-cachesize Sets the cache size. Takes an integer
+ argument, which must be a power of two, and
+ no less than 20 KiB.
+
+ --bdbstore-transactional Enable (in-memory-only) transactions for
+ all operations in the OSBDB store.
+
+ --debug-bdbstore Set the debug level. Takes an integer
+ argument.
BarrierQueue(BlockDevice *bd, const char *d) : bdev(bd), dev(d) {
barrier();
}
+ ~BarrierQueue() {
+ for (list<Queue*>::iterator p = qls.begin();
+ p != qls.end();
+ ++p)
+ delete *p;
+ qls.clear();
+ }
int size() {
// this isn't perfectly accurate.
if (!qls.empty())
}
+int ObjectCache::try_map_read(block_t start, block_t len)
+{
+ map<block_t, BufferHead*>::iterator p = data.lower_bound(start);
+
+ block_t cur = start;
+ block_t left = len;
+
+ if (p != data.begin() &&
+ (p == data.end() || p->first > cur)) {
+ p--; // might overlap!
+ if (p->first + p->second->length() <= cur)
+ p++; // doesn't overlap.
+ }
+
+ int num_missing = 0;
+
+ while (left > 0) {
+ // at end?
+ if (p == data.end()) {
+ // rest is a miss.
+ vector<Extent> exv;
+ on->map_extents(cur,
+ left, // no prefetch here!
+ exv);
+
+ num_missing += exv.size();
+ left = 0;
+ cur = start+len;
+ break;
+ }
+
+ if (p->first <= cur) {
+ // have it (or part of it)
+ BufferHead *e = p->second;
+
+ if (e->is_clean() ||
+ e->is_dirty() ||
+ e->is_tx()) {
+ dout(20) << "try_map_read hit " << *e << endl;
+ }
+ else if (e->is_rx()) {
+ dout(20) << "try_map_read rx " << *e << endl;
+ num_missing++;
+ }
+ else if (e->is_partial()) {
+ dout(-20) << "try_map_read partial " << *e << endl;
+ num_missing++;
+ }
+ else {
+ dout(0) << "try_map_read got unexpected " << *e << endl;
+ assert(0);
+ }
+
+ block_t lenfromcur = MIN(e->end() - cur, left);
+ cur += lenfromcur;
+ left -= lenfromcur;
+ p++;
+ continue; // more?
+ } else if (p->first > cur) {
+ // gap.. miss
+ block_t next = p->first;
+ vector<Extent> exv;
+ on->map_extents(cur,
+ MIN(next-cur, left), // no prefetch
+ exv);
+
+ dout(-20) << "try_map_read gap of " << p->first-cur << " blocks, "
+ << exv.size() << " extents" << endl;
+ num_missing += exv.size();
+ left -= (p->first - cur);
+ cur = p->first;
+ continue; // more?
+ }
+ else
+ assert(0);
+ }
+
+ assert(left == 0);
+ assert(cur == start+len);
+ return num_missing;
+}
+
+
+
+
/*
* map a range of blocks into buffer_heads.
dout(20) << "map_read partial " << *e << endl;
}
else {
- dout(0) << "map_read ??? " << *e << endl;
+ dout(0) << "map_read ??? got unexpected " << *e << endl;
assert(0);
}
{
dout(10) << "bh_read " << *on << " on " << *bh << endl;
- if (bh->is_missing()) {
+ if (bh->is_missing()) {
mark_rx(bh);
} else {
assert(bh->is_partial());
// this should be empty!!
assert(bh->rx_ioh == 0);
- dout(20) << "bh_read " << *bh << " from " << ex << endl;
+ dout(20) << "bh_read " << *on << " " << *bh << " from " << ex << endl;
C_OC_RxFinish *fin = new C_OC_RxFinish(ebofs_lock, on->oc,
bh->start(), bh->length(),
if (shouldbe)
assert(ex.length == 1 && ex.start == shouldbe);
- dout(20) << "bh_write " << *bh << " to " << ex << endl;
+ dout(20) << "bh_write " << *on << " " << *bh << " to " << ex << endl;
//assert(bh->tx_ioh == 0);
map<block_t, BufferHead*>& missing, // read these from disk
map<block_t, BufferHead*>& rx, // wait for these to finish reading from disk
map<block_t, BufferHead*>& partial); // (maybe) wait for these to read from disk
+ int try_map_read(block_t start, block_t len); // just tell us how many extents we're missing.
+
int map_write(block_t start, block_t len,
interval_set<block_t>& alloc,
#define dout(x) if (x <= g_conf.debug_ebofs) cout << "ebofs(" << dev.get_device_name() << ")."
#define derr(x) if (x <= g_conf.debug_ebofs) cerr << "ebofs(" << dev.get_device_name() << ")."
+
char *nice_blocks(block_t b)
{
static char s[20];
{
while (1) {
// in cache?
- if (onode_map.count(oid)) {
+ if (have_onode(oid)) {
// yay
Onode *on = onode_map[oid];
on->get();
int Ebofs::_is_cached(object_t oid, off_t off, size_t len)
{
- Onode *on = 0;
- if (onode_map.count(oid) == 0) {
+ if (!have_onode(oid)) {
dout(7) << "_is_cached " << oid << " " << off << "~" << len << " ... onode " << endl;
return -1; // object dne?
}
+ Onode *on = get_onode(oid);
if (!on->have_oc()) {
// nothing is cached. return # of extents in file.
+ dout(10) << "_is_cached have onode but no object cache, returning extent count" << endl;
return on->extent_map.size();
}
map<block_t, BufferHead*> missing; // read these
map<block_t, BufferHead*> rx; // wait for these
map<block_t, BufferHead*> partials; // ??
- on->get_oc(&bc)->map_read(bstart, blen, hits, missing, rx, partials);
- return missing.size() + rx.size() + partials.size();
+
+ int num_missing = on->get_oc(&bc)->try_map_read(bstart, blen);
+ dout(7) << "_is_cached try_map_read reports " << num_missing << " missing extents" << endl;
+ return num_missing;
// FIXME: actually, we should calculate if these extents are contiguous.
// and not using map_read, probably...
void Ebofs::_trim_from_cache(object_t oid, off_t off, size_t len)
{
- Onode *on = 0;
- if (onode_map.count(oid) == 0) {
+ // be careful not to load it if we don't have it
+ if (!have_onode(oid)) {
dout(7) << "_trim_from_cache " << oid << " " << off << "~" << len << " ... onode not in cache " << endl;
return;
}
+ // ok, we have it, get a pointer.
+ Onode *on = get_onode(oid);
+
if (!on->have_oc())
return; // nothing is cached.
map<object_t, list<Cond*> > waitfor_onode;
Onode* new_onode(object_t oid); // make new onode. ref++.
+ bool have_onode(object_t oid) {
+ return onode_map.count(oid);
+ }
Onode* get_onode(object_t oid); // get cached onode, or read from disk. ref++.
void remove_onode(Onode *on);
void put_onode(Onode* o); // put it back down. ref--.
assert(cursor.open[cursor.level].size() == 0);
assert(depth == 1);
root = -1;
- depth = 0;
- pool.release(cursor.open[0].node);
+ depth = 0;
+ if (cursor.open[0].node)
+ pool.release(cursor.open[0].node);
}
verify("remove 1");
return 0;
args = nargs;
vec_to_argv(args, argc, argv);
+ // FUSE will chdir("/"); be ready.
+ g_conf.use_abspaths = true;
+
+ if (g_conf.clock_tare) g_clock.tare();
+
MonMap *monmap = new MonMap(g_conf.num_mon);
Monitor *mon[g_conf.num_mon];
// use my argc, argv (make sure you pass a mount point!)
cout << "starting fuse on pid " << getpid() << endl;
client[i]->mount();
+
+ char *oldcwd = get_current_dir_name(); // note previous wd
ceph_fuse_main(client[i], argc, argv);
+ ::chdir(oldcwd); // return to previous wd
+
client[i]->unmount();
cout << "fuse finished on pid " << getpid() << endl;
client[i]->shutdown();
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-
-
-#include <sys/stat.h>
-#include <iostream>
-#include <string>
-using namespace std;
-
-#include "config.h"
-
-#include "mds/MDCluster.h"
-
-#include "mds/MDS.h"
-#include "osd/OSD.h"
-#include "mon/Monitor.h"
-#include "client/Client.h"
-
-#include "client/SyntheticClient.h"
-
-#include "msg/FakeMessenger.h"
-
-#include "common/Timer.h"
-
-#define NUMMDS g_conf.num_mds
-#define NUMOSD g_conf.num_osd
-#define NUMCLIENT g_conf.num_client
-
-class C_Test : public Context {
-public:
- void finish(int r) {
- cout << "C_Test->finish(" << r << ")" << endl;
- }
-};
-
-
-int main(int argc, char **argv)
-{
- cerr << "fakesyn start" << endl;
-
- //cerr << "inode_t " << sizeof(inode_t) << endl;
-
- vector<char*> args;
- argv_to_vec(argc, argv, args);
-
- parse_config_options(args);
-
- int start = 0;
-
- parse_syn_options(args);
-
- vector<char*> nargs;
-
- for (unsigned i=0; i<args.size(); i++) {
- // unknown arg, pass it on.
- cerr << " stray arg " << args[i] << endl;
- nargs.push_back(args[i]);
- }
- assert(nargs.empty());
-
-
- MDCluster *mdc = new MDCluster(NUMMDS, NUMOSD);
-
-
- char hostname[100];
- gethostname(hostname,100);
- //int pid = getpid();
-
- // create mon
- Monitor *mon[g_conf.num_mon];
- for (int i=0; i<g_conf.num_mon; i++) {
- mon[i] = new Monitor(i, new FakeMessenger(MSG_ADDR_MON(i)));
- }
-
- // create mds
- MDS *mds[NUMMDS];
- OSD *mdsosd[NUMMDS];
- for (int i=0; i<NUMMDS; i++) {
- //cerr << "mds" << i << " on rank " << myrank << " " << hostname << "." << pid << endl;
- mds[i] = new MDS(mdc, i, new FakeMessenger(MSG_ADDR_MDS(i)));
- if (g_conf.mds_local_osd)
- mdsosd[i] = new OSD(i+10000, new FakeMessenger(MSG_ADDR_OSD(i+10000)));
- start++;
- }
-
- // create osd
- OSD *osd[NUMOSD];
- for (int i=0; i<NUMOSD; i++) {
- //cerr << "osd" << i << " on rank " << myrank << " " << hostname << "." << pid << endl;
- osd[i] = new OSD(i, new FakeMessenger(MSG_ADDR_OSD(i)));
- start++;
- }
-
- // create client
- Client *client[NUMCLIENT];
- SyntheticClient *syn[NUMCLIENT];
- for (int i=0; i<NUMCLIENT; i++) {
- //cerr << "client" << i << " on rank " << myrank << " " << hostname << "." << pid << endl;
- client[i] = new Client(new FakeMessenger(MSG_ADDR_CLIENT(i)));
- start++;
- }
-
-
- // start message loop
- fakemessenger_startthread();
-
- // init
- for (int i=0; i<g_conf.num_mon; i++) {
- mon[i]->init();
- }
- for (int i=0; i<NUMMDS; i++) {
- mds[i]->init();
- if (g_conf.mds_local_osd)
- mdsosd[i]->init();
- }
-
- for (int i=0; i<NUMOSD; i++) {
- osd[i]->init();
- }
-
-
- // create client(s)
- for (int i=0; i<NUMCLIENT; i++) {
- client[i]->init();
-
- // use my argc, argv (make sure you pass a mount point!)
- //cout << "mounting" << endl;
- client[i]->mount();
-
- //cout << "starting synthetic client " << endl;
- syn[i] = new SyntheticClient(client[i]);
-
- syn[i]->start_thread();
- }
-
-
- for (int i=0; i<NUMCLIENT; i++) {
-
- cout << "waiting for synthetic client " << i << " to finish" << endl;
- syn[i]->join_thread();
- delete syn[i];
-
- client[i]->unmount();
- //cout << "unmounted" << endl;
- client[i]->shutdown();
- }
-
-
- // wait for it to finish
- fakemessenger_wait();
-
- // cleanup
- for (int i=0; i<NUMMDS; i++) {
- delete mds[i];
- }
- for (int i=0; i<NUMOSD; i++) {
- delete osd[i];
- }
- for (int i=0; i<NUMCLIENT; i++) {
- delete client[i];
- }
- delete mdc;
-
- cout << "fakesyn done" << endl;
- return 0;
-}
-
#include "common/Timer.h"
-#define NUMMDS g_conf.num_mds
-#define NUMOSD g_conf.num_osd
-#define NUMCLIENT g_conf.num_client
class C_Test : public Context {
public:
if (g_conf.kill_after)
g_timer.add_event_after(g_conf.kill_after, new C_Die);
-
- g_clock.tare();
+ if (g_conf.clock_tare) g_clock.tare();
MonMap *monmap = new MonMap(g_conf.num_mon);
entity_addr_t a;
}
// create mds
- MDS *mds[NUMMDS];
- OSD *mdsosd[NUMMDS];
- for (int i=0; i<NUMMDS; i++) {
+ MDS *mds[g_conf.num_mds];
+ OSD *mdsosd[g_conf.num_mds];
+ for (int i=0; i<g_conf.num_mds; i++) {
//cerr << "mds" << i << " on rank " << myrank << " " << hostname << "." << pid << endl;
mds[i] = new MDS(-1, new FakeMessenger(MSG_ADDR_MDS_NEW), monmap);
if (g_conf.mds_local_osd)
}
// create osd
- OSD *osd[NUMOSD];
- for (int i=0; i<NUMOSD; i++) {
+ OSD *osd[g_conf.num_osd];
+ for (int i=0; i<g_conf.num_osd; i++) {
//cerr << "osd" << i << " on rank " << myrank << " " << hostname << "." << pid << endl;
osd[i] = new OSD(i, new FakeMessenger(MSG_ADDR_OSD(i)), monmap);
start++;
}
// create client
- Client *client[NUMCLIENT];
- SyntheticClient *syn[NUMCLIENT];
- for (int i=0; i<NUMCLIENT; i++) {
+ Client *client[g_conf.num_client];
+ SyntheticClient *syn[g_conf.num_client];
+ for (int i=0; i<g_conf.num_client; i++) {
//cerr << "client" << i << " on rank " << myrank << " " << hostname << "." << pid << endl;
client[i] = new Client(new FakeMessenger(MSG_ADDR_CLIENT(i)), monmap);
start++;
for (int i=0; i<g_conf.num_mon; i++) {
mon[i]->init();
}
- for (int i=0; i<NUMMDS; i++) {
+ for (int i=0; i<g_conf.num_mds; i++) {
mds[i]->init();
if (g_conf.mds_local_osd)
mdsosd[i]->init();
}
- for (int i=0; i<NUMOSD; i++) {
+ for (int i=0; i<g_conf.num_osd; i++) {
osd[i]->init();
}
// create client(s)
- for (int i=0; i<NUMCLIENT; i++) {
+ for (int i=0; i<g_conf.num_client; i++) {
client[i]->init();
// use my argc, argv (make sure you pass a mount point!)
}
- for (int i=0; i<NUMCLIENT; i++) {
+ for (int i=0; i<g_conf.num_client; i++) {
cout << "waiting for synthetic client " << i << " to finish" << endl;
syn[i]->join_thread();
fakemessenger_wait();
// cleanup
- for (int i=0; i<NUMMDS; i++) {
+ for (int i=0; i<g_conf.num_mon; i++) {
+ delete mon[i];
+ }
+ for (int i=0; i<g_conf.num_mds; i++) {
delete mds[i];
}
- for (int i=0; i<NUMOSD; i++) {
+ for (int i=0; i<g_conf.num_osd; i++) {
delete osd[i];
}
- for (int i=0; i<NUMCLIENT; i++) {
+ for (int i=0; i<g_conf.num_client; i++) {
delete client[i];
}
*/
class C_Gather : public Context {
public:
+ bool sub_finish(int r) {
+ //cout << "C_Gather sub_finish " << this << endl;
+ assert(waitfor.count(r));
+ waitfor.erase(r);
+ if (!waitfor.empty())
+ return false; // more subs left
+
+ // last one
+ onfinish->finish(0);
+ delete onfinish;
+ onfinish = 0;
+ return true;
+ }
+
class C_GatherSub : public Context {
C_Gather *gather;
int num;
public:
C_GatherSub(C_Gather *g, int n) : gather(g), num(n) {}
void finish(int r) {
- gather->finish(num);
+ if (gather->sub_finish(num))
+ delete gather; // last one!
}
};
+ Context *new_sub() {
+ num++;
+ waitfor.insert(num);
+ return new C_GatherSub(this, num);
+ }
+
private:
Context *onfinish;
std::set<int> waitfor;
int num;
public:
- C_Gather(Context *f) : onfinish(f), num(0) {}
-
+ C_Gather(Context *f) : onfinish(f), num(0) {
+ //cout << "C_Gather new " << this << endl;
+ }
+ ~C_Gather() {
+ //cout << "C_Gather delete " << this << endl;
+ assert(!onfinish);
+ }
void finish(int r) {
- assert(waitfor.count(r));
- waitfor.erase(r);
- if (waitfor.empty()) {
- onfinish->finish(0);
- delete onfinish;
- }
+ // nobody should ever call me.
+ assert(0);
}
- Context *new_sub() {
- num++;
- waitfor.insert(num);
- return new C_GatherSub(this, num);
- }
};
#endif
out << '.' << o.rev;
return out;
}
+
+
namespace __gnu_cxx {
+#ifndef __LP64__
template<> struct hash<__uint64_t> {
size_t operator()(__uint64_t __x) const {
static hash<__uint32_t> H;
return H((__x >> 32) ^ (__x & 0xffffffff));
}
};
+#endif
template<> struct hash<object_t> {
size_t operator()(const object_t &r) const {
};
}
+
#endif
}
};
+#ifndef __LP64__
template<> struct hash<__int64_t> {
size_t operator()(__int64_t __x) const {
static hash<__int32_t> H;
return H((__x >> 32) ^ (__x & 0xffffffff));
}
};
+#endif
}
// -- file -> object mapping --
int stripe_unit; // stripe unit, in bytes
int stripe_count; // over this many objects
- int object_size; // until objects are this big, then use a new set of objects.
+ int object_size; // until objects are this big, then move to new objects
int stripe_width() { return stripe_unit * stripe_count; }
int period() { return object_size * stripe_count; }
// -- object -> pg layout --
- char pg_type; // pg type (replicated, raid, etc.) (pg_t::TYPE_*)
+ char pg_type; // pg type (replicated, raid, etc.) (see pg_t::TYPE_*)
char pg_size; // pg size (num replicas, or raid4 stripe width)
int preferred; // preferred primary osd?
--- /dev/null
+#!/usr/bin/perl
+# hi there
+{
+ # startup
+ 'n' => 30, # number of mpi nodes
+ 'sleep' => 3, # seconds to sleep between runs (so you have time to control-c out)
+ 'nummds' => 1,
+ 'numosd' => 6,
+ 'numclient' => 100,
+
+ 'until' => 100, # --syn until $n ... synthetic client will stop itself after this many seconds.
+ 'kill_after' => 300, # seconds before everything commits suicide (in case something hangs)
+
+ # stuff i want to vary
+ # here's a simple example:
+
+ # do --syn writefile command
+ 'writefile' => 1,
+ # and vary the write size
+ 'writefile_size' => [ # vary
+# 2048*1024,
+ 1024*1024,
+ 512*1024,
+ 256*1024,
+ 128*1024,
+ 64*1024,
+ 48*1024,
+ 32*1024,
+ 28*1024,
+ 24*1024,
+ 16*1024,
+ 12*1024,
+ 8*1024,
+ 4096,
+# 256,
+# 16,
+# 1
+ ],
+ 'writefile_mb' => 1000, # each client should write 1GB (or more likely, keep going until time runs out)
+
+ 'file_layout_num_rep'=> [1,2], # also vary the replication level
+
+ # pass some other random things to newsyn
+ 'custom' => '--',
+
+ # for final summation (script/sum.pl)
+ # specify time period to look at the results
+ 'start' => 30, # skip first 30 seconds, so that caches are full etc.
+ 'end' => 90, # go for 60 seconds
+
+ # what should i parse/plot?
+ 'comb' => {
+ 'x' => 'writefile_size',
+ 'vars' => [ 'osd.c_wrb', 'osd.r_wrb' ],
+ }
+};
bool is_suppress() { return suppress; }
void set_suppress(bool b) { suppress = b; }
- bool is_null() { return cap_history.empty(); }
+ bool is_null() { return cap_history.empty() && wanted_caps == 0; }
// most recently issued caps.
int pending() {
#define LOCK_GLOCKR 2 // AR R . / C . . . . . . . / C . . . . .
// file lock states
-#define LOCK_GLOCKL 3 // A . . / . . . . . . loner -> lock
+#define LOCK_GLOCKL 3 // A . . / C . . . . . loner -> lock
#define LOCK_GLOCKM 4 // A . . / . . . . . .
#define LOCK_MIXED 5 // AR . . / . R W A . L . . / . R . . . L
#define LOCK_GMIXEDR 6 // AR R . / . R . . . L . . / . R . . . L
return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_LAZYIO;
case LOCK_LOCK:
case LOCK_GLOCKR:
+ case LOCK_GLOCKL:
return CAP_FILE_RDCACHE;
- case LOCK_GLOCKL:
case LOCK_GLOCKM:
return 0;
bool Locker::inode_file_write_start(CInode *in, MClientRequest *m)
{
+ dout(7) << "inode_file_write_start on " << *in << endl;
+
// can't write?
if (!in->filelock.can_write(in->is_auth())) {
void Locker::inode_file_write_finish(CInode *in)
{
// drop ref
- assert(in->filelock.can_write(in->is_auth()));
+ //assert(in->filelock.can_write(in->is_auth()));
in->filelock.put_write();
+ in->auth_unpin();
dout(7) << "inode_file_write_finish on " << *in << ", filelock=" << in->filelock << endl;
// drop lock?
case LOCK_GLOCKR:
case LOCK_GLOCKM:
case LOCK_GLOCKL:
- if (issued == 0) {
+ if ((issued & ~CAP_FILE_RDCACHE) == 0) {
in->filelock.set_state(LOCK_LOCK);
// waiters
// unpin dir
dn->dir->auth_unpin();
+
+ // kick waiters
+ list<Context*> finished;
+ dn->dir->take_waiting(CDIR_WAIT_DNREAD, finished);
+ mds->queue_finished(finished);
}
+
/*
* onfinish->finish() will be called with
* 0 on successful xlock,
journaler = new Journaler(log_inode, mds->objecter, logger);
}
+void MDLog::flush_logger()
+{
+ if (logger)
+ logger->flush(true);
+}
+
void MDLog::reset()
MDLog(MDS *m);
~MDLog();
-
+
+ void flush_logger();
void set_max_events(size_t max) { max_events = max; }
size_t get_max_events() { return max_events; }
if (anchormgr) { delete anchormgr; anchormgr = NULL; }
if (anchorclient) { delete anchorclient; anchorclient = NULL; }
if (osdmap) { delete osdmap; osdmap = 0; }
+ if (mdsmap) { delete mdsmap; mdsmap = 0; }
+
+ if (server) { delete server; server = 0; }
+ if (locker) { delete locker; locker = 0; }
if (filer) { delete filer; filer = 0; }
if (objecter) { delete objecter; objecter = 0; }
// schedule tick
reset_tick();
+ // init logger
+ reopen_logger();
+
mds_lock.Unlock();
return 0;
}
// update messenger.
messenger->reset_myname(MSG_ADDR_MDS(whoami));
- // tell objecter my incarnation
- objecter->set_client_incarnation(mdsmap->get_inc(whoami));
-
reopen_logger();
- dout(1) << "handle_mds_map i am now mds" << whoami << endl;
+ dout(1) << "handle_mds_map i am now mds" << whoami
+ << " incarnation " << mdsmap->get_inc(whoami)
+ << endl;
// do i need an osdmap?
if (oldwhoami < 0) {
}
}
+ // tell objecter my incarnation
+ if (objecter->get_client_incarnation() < 0 &&
+ mdsmap->have_inst(whoami)) {
+ assert(mdsmap->get_inc(whoami) > 0);
+ objecter->set_client_incarnation(mdsmap->get_inc(whoami));
+ }
+
// update my state
state = mdsmap->get_state(whoami);
{
version_t had = osdmap->get_epoch();
+ dout(10) << "handle_osd_map had " << had << endl;
+
+ // pass on to clients
+ for (set<int>::iterator it = clientmap.get_mount_set().begin();
+ it != clientmap.get_mount_set().end();
+ it++) {
+ MOSDMap *n = new MOSDMap;
+ n->maps = m->maps;
+ n->incremental_maps = m->incremental_maps;
+ messenger->send_message(n, clientmap.get_inst(*it));
+ }
+
// process locally
objecter->handle_osd_map(m);
- if (had == 0) {
+ if (had == 0 && osdmap->get_epoch() > 0) {
if (is_creating())
boot_create(); // new tables, journal
else if (is_starting())
assert(is_standby());
}
- // pass on to clients
- for (set<int>::iterator it = clientmap.get_mount_set().begin();
- it != clientmap.get_mount_set().end();
- it++) {
- MOSDMap *n = new MOSDMap;
- n->maps = m->maps;
- n->incremental_maps = m->incremental_maps;
- messenger->send_message(n, clientmap.get_inst(*it));
- }
}
{
dout(1) << "shutdown_final" << endl;
+ // flush loggers
+ if (logger) logger->flush(true);
+ if (logger2) logger2->flush(true);
+ mdlog->flush_logger();
+
// send final down:out beacon (it doesn't matter if this arrives)
set_want_state(MDSMap::STATE_OUT);
return;
}
+ // O_TRUNC
+ if (flags & O_TRUNC) {
+ // write
+ if (!mds->locker->inode_file_write_start(cur, req))
+ return; // fw or (wait for) lock
+
+ // do update
+ cur->inode.size = req->get_sizearg();
+ cur->_mark_dirty(); // fixme
+
+ mds->locker->inode_file_write_finish(cur);
+ }
+
// hmm, check permissions or something.
CDentry *dn = 0;
// make dentry and inode, xlock dentry.
- int r = prepare_mknod(req, diri, &in, &dn);
+ bool excl = req->get_iarg() & O_EXCL;
+ int r = prepare_mknod(req, diri, &in, &dn, !excl);
if (!r)
return; // wait on something
assert(in);
eversion_t pg_trim_to; // primary->replica: trim to here
int op;
- size_t length, offset;
+ size_t length;
+ off_t offset;
+
eversion_t version;
eversion_t old_version;
void set_op(int o) { st.op = o; }
const size_t get_length() { return st.length; }
- const size_t get_offset() { return st.offset; }
+ const off_t get_offset() { return st.offset; }
map<string,bufferptr>& get_attrset() { return attrset; }
void set_attrset(map<string,bufferptr> &as) { attrset = as; }
void set_layout(const ObjectLayout& l) { st.layout = l; }
void set_length(size_t l) { st.length = l; }
- void set_offset(size_t o) { st.offset = o; }
+ void set_offset(off_t o) { st.offset = o; }
void set_version(eversion_t v) { st.version = v; }
void set_old_version(eversion_t ov) { st.old_version = ov; }
sprintf(s, "mondata/mon%d", whoami);
store = new MonitorStore(s);
- if (g_conf.mkfs)
+ if (g_conf.mkfs)
store->mkfs();
- else
- store->mount();
+
+ store->mount();
// create
osdmon = new OSDMonitor(this, messenger, lock);
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
-
+#include <errno.h>
void MonitorStore::mount()
{
assert(0);
}
::closedir(d);
+
+ if (g_conf.use_abspaths) {
+ // combine it with the cwd, in case fuse screws things up (i.e. fakefuse)
+ string old = dir;
+ char *cwd = get_current_dir_name();
+ dir = cwd;
+ delete cwd;
+ dir += "/";
+ dir += old;
+ }
}
}
char vs[30];
+#ifdef __LP64__
+ sprintf(vs, "%ld\n", val);
+#else
sprintf(vs, "%lld\n", val);
+#endif
char tfn[200];
sprintf(tfn, "%s.new", fn);
return 0;
}
- // read size
- __int32_t len = 0;
- ::read(fd, &len, sizeof(len));
-
+ // get size
+ struct stat st;
+ int rc = ::fstat(fd, &st);
+ assert(rc == 0);
+ __int32_t len = st.st_size;
+
// read buffer
bl.clear();
bufferptr bp(len);
- ::read(fd, bp.c_str(), len);
+ int off = 0;
+ while (off < len) {
+ dout(20) << "reading at off " << off << " of " << len << endl;
+ int r = ::read(fd, bp.c_str()+off, len-off);
+ if (r < 0) derr(0) << "errno on read " << strerror(errno) << endl;
+ assert(r>0);
+ off += r;
+ }
bl.append(bp);
::close(fd);
int fd = ::open(tfn, O_WRONLY|O_CREAT);
assert(fd);
- // write size
- __int32_t len = bl.length();
- ::write(fd, &len, sizeof(len));
+ // chmod
+ ::fchmod(fd, 0644);
// write data
for (list<bufferptr>::const_iterator it = bl.buffers().begin();
it != bl.buffers().end();
- it++)
- ::write(fd, it->c_str(), it->length());
+ it++) {
+ int r = ::write(fd, it->c_str(), it->length());
+ if (r != (int)it->length())
+ derr(0) << "put_bl_ss ::write() returned " << r << " not " << it->length() << endl;
+ if (r < 0)
+ derr(0) << "put_bl_ss ::write() errored out, errno is " << strerror(errno) << endl;
+ }
- ::fchmod(fd, 0644);
::fsync(fd);
::close(fd);
::rename(tfn, fn);
int put_bl_ss(bufferlist& bl, const char *a, const char *b);
bool exists_bl_sn(const char *a, version_t b) {
char bs[20];
+#ifdef __LP64__
+ sprintf(bs, "%lu", b);
+#else
sprintf(bs, "%llu", b);
+#endif
return exists_bl_ss(a, bs);
}
int get_bl_sn(bufferlist& bl, const char *a, version_t b) {
char bs[20];
+#ifdef __LP64__
+ sprintf(bs, "%lu", b);
+#else
sprintf(bs, "%llu", b);
+#endif
return get_bl_ss(bl, a, bs);
}
int put_bl_sn(bufferlist& bl, const char *a, version_t b) {
char bs[20];
+#ifdef __LP64__
+ sprintf(bs, "%lu", b);
+#else
sprintf(bs, "%llu", b);
+#endif
return put_bl_ss(bl, a, bs);
}
if (g_conf.osd_pg_bits) {
osdmap.set_pg_bits(g_conf.osd_pg_bits);
} else {
- int osdbits = 1;
+ // figure out how many bits worth of osds we have.
+ // 1 osd -> 0 bits
+ // <= 2 osds -> 1 bit
+ // <= 4 osds -> 2 bits
+ int osdbits = -1;
int n = g_conf.num_osd;
+ assert(n > 0);
while (n) {
n = n >> 1;
osdbits++;
}
- // 2 bits per osd.
- osdmap.set_pg_bits(osdbits + 2);
+ // 7 bits per osd.
+ osdmap.set_pg_bits(osdbits + 4); // FIXME
}
// start at epoch 0 until all osds boot
<< (osdmap.osds.size() - osdmap.osd_inst.size())
<< " osds to boot" << endl;
}
+
+ delete m;
return;
}
dout(18) << "messenger " << mgr << " at " << mgr->get_myname() << " has " << mgr->num_incoming() << " queued" << endl;
-
if (!mgr->is_ready()) {
dout(18) << "messenger " << mgr << " at " << mgr->get_myname() << " has no dispatcher, skipping" << endl;
it++;
FakeMessenger::~FakeMessenger()
{
-
+ // hose any undelivered messages
+ for (list<Message*>::iterator p = incoming.begin();
+ p != incoming.end();
+ ++p)
+ delete *p;
}
assert(directory.count(_myinst.addr) == 1);
shutdown_set.insert(_myinst.addr);
- /*
- directory.erase(myaddr);
- if (directory.empty()) {
- dout(1) << "fakemessenger: last shutdown" << endl;
- ::fm_shutdown = true;
- cond.Signal(); // why not
- }
- */
-
/*
if (loggers[myaddr]) {
delete loggers[myaddr];
lock.Lock();
- // deliver
- try {
#ifdef LOG_MESSAGES
- // stats
- loggers[get_myaddr()]->inc("+send",1);
- loggers[dest]->inc("-recv",1);
-
- char s[20];
- sprintf(s,"+%s", m->get_type_name());
- loggers[get_myaddr()]->inc(s);
- sprintf(s,"-%s", m->get_type_name());
- loggers[dest]->inc(s);
+ // stats
+ loggers[get_myaddr()]->inc("+send",1);
+ loggers[dest]->inc("-recv",1);
+
+ char s[20];
+ sprintf(s,"+%s", m->get_type_name());
+ loggers[get_myaddr()]->inc(s);
+ sprintf(s,"-%s", m->get_type_name());
+ loggers[dest]->inc(s);
#endif
- // queue
- FakeMessenger *dm = directory[inst.addr];
- if (!dm) {
- dout(1) << "** destination " << inst << " dne" << endl;
- for (map<entity_addr_t, FakeMessenger*>::iterator p = directory.begin();
- p != directory.end();
- ++p) {
- dout(1) << "** have " << p->first << " to " << p->second << endl;
- }
- //assert(dm);
- }
- dm->queue_incoming(m);
-
+ // queue
+ if (directory.count(inst.addr)) {
dout(1) << "--> " << get_myname() << " -> " << inst.name << " --- " << *m << endl;
-
- }
- catch (...) {
- cout << "no destination " << dest << endl;
- assert(0);
+ directory[inst.addr]->queue_incoming(m);
+ } else {
+ dout(0) << "--> " << get_myname() << " -> " << inst.name << " " << *m
+ << " *** destination DNE ***" << endl;
+ for (map<entity_addr_t, FakeMessenger*>::iterator p = directory.begin();
+ p != directory.end();
+ ++p) {
+ dout(0) << "** have " << p->first << " to " << p->second << endl;
+ }
+ //assert(dm);
+ delete m;
}
-
// wake up loop?
if (!awake) {
dout(10) << "waking up fakemessenger thread" << endl;
*/
Rank::Rank() :
- single_dispatcher(this) {
+ single_dispatcher(this),
+ started(false) {
// default to any listen_addr
memset((char*)&listen_addr, 0, sizeof(listen_addr));
listen_addr.sin_family = AF_INET;
int Rank::start_rank()
{
+ lock.Lock();
+ if (started) {
+ dout(10) << "start_rank already started" << endl;
+ lock.Unlock();
+ return 0;
+ }
dout(10) << "start_rank" << endl;
+ lock.Unlock();
// bind to a socket
if (accepter.start() < 0)
lock.Lock();
dout(1) << "start_rank at " << listen_addr << endl;
-
+ started = true;
lock.Unlock();
return 0;
}
EntityMessenger *msgr = new EntityMessenger(name);
// add to directory
+ assert(local.count(name) == 0);
local[name] = msgr;
lock.Unlock();
}
} else {
derr(0) << "submit_message " << *m << " dest " << dest << " " << dest_addr << " local but not in local map?" << endl;
- assert(0); // hmpf
+ //assert(0); // hmpf, this is probably mds->mon beacon from newsyn.
}
}
else {
public:
Mutex lock;
Cond wait_cond; // for wait()
-
+ bool started;
+
// where i listen
tcpaddr_t listen_addr;
entity_addr_t my_addr;
MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
// first, synchronize clocks.
- MPI_Barrier(MPI_COMM_WORLD);
- //dout(-10) << "tare" << endl;
- g_clock.tare();
+ if (g_conf.clock_tare) {
+ if (1) {
+ // use an MPI barrier. probably not terribly precise.
+ MPI_Barrier(MPI_COMM_WORLD);
+ g_clock.tare();
+ } else {
+ // use wall clock; assume NTP has all nodes synchronized already.
+ // FIXME someday: this hangs for some reason. whatever.
+ utime_t z = g_clock.now();
+ MPI_Bcast( &z, sizeof(z), MPI_CHAR,
+ 0, MPI_COMM_WORLD);
+ cout << "z is " << z << endl;
+ g_clock.tare(z);
+ }
+ }
// start up all monitors at known addresses.
entity_inst_t moninst[mpi_world]; // only care about first g_conf.num_mon of these.
set<int> clientlist;
map<int,Client *> client;//[NUMCLIENT];
map<int,SyntheticClient *> syn;//[NUMCLIENT];
+ int nclients = 0;
for (int i=0; i<NUMCLIENT; i++) {
//if (myrank != NUMMDS + NUMOSD + i % client_nodes) continue;
if (myrank != g_conf.ms_skip_rank0+NUMMDS + skip_osd + i / clients_per_node) continue;
started++;
syn[i] = new SyntheticClient(client[i]);
+
+ client[i]->mount();
+ nclients++;
}
if (!clientlist.empty()) dout(2) << "i have " << clientlist << endl;
- int nclients = 0;
for (set<int>::iterator it = clientlist.begin();
it != clientlist.end();
it++) {
int i = *it;
//cerr << "starting synthetic client" << i << " on rank " << myrank << endl;
- client[i]->mount();
syn[i]->start_thread();
- nclients++;
}
if (nclients) {
cerr << nclients << " clients at " << rank.my_addr << " " << hostname << "." << pid << endl;
Foundation. See file COPYING. */
+#include <map>
+#include <string>
#include <cerrno>
#include "OSBDB.h"
+#include "common/Timer.h"
using namespace std;
#undef dout
-#define dout(x) if (x <= g_conf.debug_bdbstore) cout << "bdbstore(" << device << ")."
+#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_bdbstore) cout << "bdbstore(" << device << ")@" << __LINE__ << "."
#undef derr
-#define derr(x) if (x <= g_conf.debug_bdbstore) cerr << "bdbstore(" << device << ")."
+#define derr(x) if (x <= g_conf.debug || x <= g_conf.debug_bdbstore) cerr << "bdbstore(" << device << ")@" << __LINE__ << "."
+
+#define CLEANUP(onsafe) do { \
+ dout(6) << "DELETE " << hex << onsafe << dec << endl; \
+ delete onsafe; \
+ } while (0)
+#define COMMIT(onsafe) do { \
+ dout(6) << "COMMIT " << hex << onsafe << dec << endl; \
+ sync(onsafe); \
+ } while (0)
+
+\f // Have a lock, already.
+
+class scoped_lock
+{
+private:
+ Mutex *m;
+public:
+ scoped_lock(Mutex *m) : m(m) { m->Lock(); }
+ ~scoped_lock() { m->Unlock(); }
+};
\f // Utilities.
\f // Management.
-int OSBDB::opendb(DBTYPE type, int flags)
+DbEnv *OSBDB::getenv ()
+{
+ DbEnv *envp = new DbEnv (DB_CXX_NO_EXCEPTIONS);
+ if (g_conf.debug > 1 || g_conf.debug_bdbstore > 1)
+ envp->set_error_stream (&std::cerr);
+ if (g_conf.debug > 2 || g_conf.debug_bdbstore > 2)
+ envp->set_message_stream (&std::cout);
+ envp->set_flags (DB_LOG_INMEMORY, 1);
+ //env->set_flags (DB_DIRECT_DB, 1);
+ int env_flags = (DB_CREATE
+ | DB_THREAD
+ //| DB_INIT_LOCK
+ | DB_INIT_MPOOL
+ //| DB_INIT_TXN
+ //| DB_INIT_LOG
+ | DB_PRIVATE);
+ if (envp->open (NULL, env_flags, 0) != 0)
+ {
+ std::cerr << "failed to open environment " << std::endl;
+ assert(0);
+ }
+ return envp;
+}
+
+int OSBDB::opendb(DBTYPE type, int flags, bool new_env)
{
+ env = getenv();
db = new Db(env, 0);
db->set_error_stream (&std::cerr);
db->set_message_stream (&std::cout);
db->set_cachesize (0, g_conf.bdbstore_cachesize, 0);
}
+ flags = flags | DB_THREAD;
+ if (transactional)
+ flags = flags | DB_AUTO_COMMIT;
+
int ret;
if ((ret = db->open (NULL, device.c_str(), NULL, type, flags, 0)) != 0)
{
derr(1) << "failed to open database: " << device << ": "
- << strerror(ret) << std::endl;
+ << db_strerror(ret) << std::endl;
return -EINVAL;
}
opened = true;
dout(2) << "mount " << device << endl;
if (mounted)
- return 0;
+ {
+ dout(4) << "..already mounted" << endl;
+ return 0;
+ }
if (!opened)
{
int ret;
if ((ret = opendb ()) != 0)
- return ret;
+ {
+ dout(4) << "..returns " << ret << endl;
+ return ret;
+ }
}
// XXX Do we want anything else in the superblock?
value.set_flags (DB_DBT_USERMEM | DB_DBT_PARTIAL);
if (db->get (NULL, &key, &value, 0) != 0)
- return -EINVAL; // XXX how to say "badly formed fs?"
+ {
+ dout(4) << "..get superblock fails" << endl;
+ return -EINVAL; // XXX how to say "badly formed fs?"
+ }
- dout(2) << ".mount " << super << endl;
+ dout(3) << ".mount " << super << endl;
if (super.version != OSBDB_THIS_VERSION)
- return -EINVAL;
+ {
+ dout(4) << "version mismatch (" << super.version << ")" << endl;
+ return -EINVAL;
+ }
DBTYPE t;
db->get_type (&t);
db->get_flags (&flags);
dout(1) << "mounted version " << OSBDB_THIS_VERSION << "; Btree; "
<< "min keys per page: " << minkey << "; flags: "
- << hex << flags << endl;
+ << hex << flags << dec << endl;
cout << dec;
}
else
dout(1) << "mounted version " << OSBDB_THIS_VERSION << "; Hash; "
<< "fill factor: " << ffactor
<< " table size: " << nelem << "; flags: "
- << hex << flags << endl;
+ << hex << flags << dec << endl;
cout << dec;
}
mounted = true;
+ dout(4) << "..mounted" << endl;
return 0;
}
{
if (!mounted)
return -EINVAL;
- sync();
+
+ dout(2) << "umount" << endl;
+
int ret;
if (opened)
{
+ if (transactional)
+ {
+ env->log_flush (NULL);
+ if ((ret = env->lsn_reset (device.c_str(), 0)) != 0)
+ {
+ derr(1) << "lsn_reset: " << db_strerror (ret) << endl;
+ }
+ }
+
+ db->sync (0);
+
if ((ret = db->close (0)) != 0)
{
derr(1) << "close: " << db_strerror(ret) << endl;
}
delete db;
db = NULL;
+
+ if (env)
+ {
+ env->close (0);
+ delete env;
+ env = NULL;
+ }
}
mounted = false;
opened = false;
+ dout(4) << "..unmounted" << endl;
return 0;
}
dout(2) << "mkfs" << endl;
- unlink (device.c_str());
+ string d = env_dir;
+ d += device;
+ unlink (d.c_str());
+
int ret;
- if ((ret = opendb((g_conf.bdbstore_btree ? DB_BTREE : DB_HASH), DB_CREATE)) != 0)
+ if ((ret = opendb((g_conf.bdbstore_btree ? DB_BTREE : DB_HASH),
+ DB_CREATE, true)) != 0)
{
derr(1) << "failed to open database: " << device << ": "
- << strerror(ret) << std::endl;
+ << db_strerror(ret) << std::endl;
return -EINVAL;
}
opened = true;
ret = db->truncate (NULL, &c, 0);
if (ret != 0)
{
+ derr(1) << "db truncate failed: " << db_strerror (ret) << endl;
return -EIO; // ???
}
Dbt value (&sb, sizeof (sb));
dout(3) << "..writing superblock" << endl;
- if (db->put (NULL, &key, &value, 0) != 0)
+ if ((ret = db->put (NULL, &key, &value, 0)) != 0)
{
- return -EIO; // ???
+ derr(1) << "failed to write superblock: " << db_strerror (ret)
+ << endl;
+ return -EIO;
}
dout(3) << "..wrote superblock" << endl;
-
+ dout(4) << "..mkfs done" << endl;
return 0;
}
int OSBDB::pick_object_revision_lt(object_t& oid)
{
- if (!mounted)
- return -EINVAL;
-
- // XXX this is pretty lame. Can we do better?
- assert(oid.rev > 0);
- oid.rev--;
- while (oid.rev > 0)
- {
- if (exists (oid))
- {
- return 0;
- }
- oid.rev--;
- }
- return -EEXIST; // FIXME
+ // Not really needed.
+ dout(0) << "pick_object_revision_lt " << oid << endl;
+ return -ENOSYS;
}
bool OSBDB::exists(object_t oid)
{
dout(2) << "exists " << oid << endl;
struct stat st;
- return (stat (oid, &st) == 0);
+ bool ret = (stat (oid, &st) == 0);
+ dout(4) << "..returns " << ret << endl;
+ return ret;
}
int OSBDB::statfs (struct statfs *st)
{
- return -ENOSYS;
+ // Hacky?
+ if (::statfs (device.c_str(), st) != 0)
+ {
+ int ret = -errno;
+ derr(1) << "statfs returns " << ret << endl;
+ return ret;
+ }
+ st->f_type = OSBDB_MAGIC;
+ dout(4) << "..statfs OK" << endl;
+ return 0;
}
int OSBDB::stat(object_t oid, struct stat *st)
{
if (!mounted)
- return -EINVAL;
+ {
+ dout(4) << "not mounted!" << endl;
+ return -EINVAL;
+ }
dout(2) << "stat " << oid << endl;
st->st_size = obj.length;
dout(3) << "stat length:" << obj.length << endl;
+ dout(4) << "..stat OK" << endl;
return 0;
}
int OSBDB::remove(object_t oid, Context *onsafe)
{
if (!mounted)
- return -EINVAL;
+ {
+ derr(1) << "not mounted!" << endl;
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ return -EINVAL;
+ }
+ dout(6) << "Context " << hex << onsafe << dec << endl;
+ scoped_lock __lock(&lock);
dout(2) << "remove " << oid << endl;
+ DbTxn *txn = NULL;
+ if (transactional)
+ env->txn_begin (NULL, &txn, 0);
+
oid_t id;
mkoid(id, oid);
Dbt key (&id, sizeof (oid_t));
- db->del (NULL, &key, 0);
+ int ret;
+ if ((ret = db->del (txn, &key, 0)) != 0)
+ {
+ derr(1) << ".del returned error: " << db_strerror (ret) << endl;
+ if (txn != NULL)
+ txn->abort();
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ return -EIO;
+ }
+
object_inode_key _ikey = new_object_inode_key (oid);
Dbt ikey (&_ikey, sizeof_object_inode_key());
- db->del (NULL, &ikey, 0);
+ if ((ret = db->del (txn, &ikey, 0)) != 0)
+ {
+ derr(1) << ".del returned error: " << db_strerror (ret) << endl;
+ if (txn != NULL)
+ txn->abort();
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ return -EIO;
+ }
attrs_id aids = new_attrs_id (oid);
Dbt askey (&aids, sizeof_attrs_id());
Dbt asval;
asval.set_flags (DB_DBT_MALLOC);
- if (db->get (NULL, &askey, &asval, 0) == 0)
+ if (db->get (txn, &askey, &asval, 0) == 0)
{
// We have attributes; remove them.
stored_attrs *sap = (stored_attrs *) asval.get_data();
{
attr_id aid = new_attr_id (oid, sap->names[i].name);
Dbt akey (&aid, sizeof (aid));
- db->del (NULL, &akey, 0);
+ if ((ret = db->del (txn, &akey, 0)) != 0)
+ {
+ derr(1) << ".del returns error: " << db_strerror (ret) << endl;
+ if (txn != NULL)
+ txn->abort();
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ return -EIO;
+ }
+ }
+ if ((ret = db->del (txn, &askey, 0)) != 0)
+ {
+ derr(1) << ".del returns error: " << db_strerror (ret) << endl;
+ if (txn != NULL)
+ txn->abort();
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ return -EIO;
}
- db->del (NULL, &askey, 0);
}
+ // XXX check del return value
+
+ if (txn != NULL)
+ txn->commit (0);
+ if (onsafe != NULL)
+ COMMIT(onsafe);
+ dout(4) << "..remove OK" << endl;
return 0;
}
int OSBDB::truncate(object_t oid, off_t size, Context *onsafe)
{
+ dout(6) << "Context " << hex << onsafe << dec << endl;
+
if (!mounted)
- return -EINVAL;
+ {
+ derr(1) << "not mounted!" << endl;
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ return -EINVAL;
+ }
+ scoped_lock __lock(&lock);
dout(2) << "truncate " << size << endl;
if (size > 0xFFFFFFFF)
- return -ENOSPC;
+ {
+ derr(1) << "object size too big!" << endl;
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ return -ENOSPC;
+ }
+
+ DbTxn *txn = NULL;
+
+ if (transactional)
+ env->txn_begin (NULL, &txn, 0);
object_inode_key ikey = new_object_inode_key (oid);
stored_object obj;
value.set_ulen (sizeof (obj));
value.set_flags (DB_DBT_USERMEM);
- if (db->get (NULL, &key, &value, 0) != 0)
- return -ENOENT;
+ if (db->get (txn, &key, &value, 0) != 0)
+ {
+ if (txn)
+ txn->abort();
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ dout(4) << "..returns -ENOENT" << endl;
+ return -ENOENT;
+ }
if (obj.length < size)
{
newVal.set_dlen (1);
newVal.set_ulen (1);
newVal.set_flags (DB_DBT_PARTIAL);
- if (db->put (NULL, &okey, &newVal, 0) != 0)
- return -EIO;
+ if (db->put (txn, &okey, &newVal, 0) != 0)
+ {
+ if (txn)
+ txn->abort();
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ derr(1) << ".updating object failed" << endl;
+ return -EIO;
+ }
obj.length = size;
value.set_ulen (sizeof (obj));
- if (db->put (NULL, &key, &value, 0) != 0)
- return -EIO;
+ if (db->put (txn, &key, &value, 0) != 0)
+ {
+ if (txn)
+ txn->abort();
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ derr(1) << ".updating object info failed" << endl;
+ return -EIO;
+ }
}
else if (obj.length > size)
{
Dbt tval (&obj, sizeof (obj));
tval.set_ulen (sizeof (obj));
tval.set_flags (DB_DBT_USERMEM);
- if (db->put (NULL, &key, &tval, 0) != 0)
- return -EIO;
+ if (db->put (txn, &key, &tval, 0) != 0)
+ {
+ if (txn)
+ txn->abort();
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ derr(1) << ".updating object info failed" << endl;
+ return -EIO;
+ }
if (size == 0)
{
char x[1];
mkoid (id, oid);
Dbt okey (&id, sizeof (oid_t));
Dbt oval (&x, 0);
- if (db->put (NULL, &okey, &oval, 0) != 0)
- return -EIO;
+ if (db->put (txn, &okey, &oval, 0) != 0)
+ {
+ if (txn)
+ txn->abort();
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ derr(1) << ".updating object failed" << endl;
+ return -EIO;
+ }
}
else
{
Dbt okey (&id, sizeof (oid_t));
Dbt oval;
oval.set_flags (DB_DBT_MALLOC);
- if (db->get (NULL, &okey, &oval, 0) != 0)
- return -EIO;
+ if (db->get (txn, &okey, &oval, 0) != 0)
+ {
+ if (txn)
+ txn->abort();
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ derr(1) << ".getting old object failed" << endl;
+ return -EIO;
+ }
auto_ptr<char> ovalPtr ((char *) oval.get_data());
oval.set_size ((size_t) size);
oval.set_ulen ((size_t) size);
- if (db->put (NULL, &okey, &oval, 0) != 0)
- return -EIO;
+ if (db->put (txn, &okey, &oval, 0) != 0)
+ {
+ if (txn)
+ txn->abort();
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ derr(1) << ".putting new object failed" << endl;
+ return -EIO;
+ }
}
}
+ if (txn)
+ txn->commit (0);
+ if (onsafe != NULL)
+ COMMIT(onsafe);
+
+ dout(4) << "..truncate OK" << endl;
return 0;
}
int OSBDB::read(object_t oid, off_t offset, size_t len, bufferlist& bl)
{
if (!mounted)
- return -EINVAL;
+ {
+ derr(1) << "not mounted!" << endl;
+ return -EINVAL;
+ }
dout(2) << "read " << oid << " " << offset << " "
<< len << endl;
+ if (bl.length() < len)
+ {
+ int remain = len - bl.length();
+ bufferptr ptr (remain);
+ bl.push_back(ptr);
+ }
+
DbTxn *txn = NULL;
- //env->txn_begin (NULL, &txn, 0);
+ if (transactional)
+ env->txn_begin (NULL, &txn, 0);
object_inode_key _ikey = new_object_inode_key (oid);
stored_object obj;
ival.set_flags (DB_DBT_USERMEM);
ival.set_ulen (sizeof(obj));
- dout(3) << " get " << _ikey << endl;
+ dout(3) << "..get " << _ikey << endl;
int ret;
if ((ret = db->get (txn, &ikey, &ival, 0)) != 0)
{
- //txn->abort();
+ if (txn)
+ txn->abort();
derr(1) << "get returned " << db_strerror (ret) << endl;
return -ENOENT;
}
+ dout(3) << "..object has size " << obj.length << endl;
+
if (offset == 0 && len >= obj.length)
{
len = obj.length;
- dout(3) << " doing full read of " << len << endl;
+ dout(3) << "..doing full read of " << len << endl;
oid_t id;
mkoid (id, oid);
Dbt key (&id, sizeof (oid_t));
Dbt value (bl.c_str(), len);
value.set_ulen (len);
value.set_flags (DB_DBT_USERMEM);
- dout(3) << " getting " << oid << endl;
+ dout(3) << "..getting " << oid << endl;
if ((ret = db->get (txn, &key, &value, 0)) != 0)
{
- derr(1) << " get returned " << db_strerror (ret) << endl;
- //txn->abort();
+ derr(1) << ".get returned " << db_strerror (ret) << endl;
+ if (txn)
+ txn->abort();
return -EIO;
}
}
else
{
if (offset > obj.length)
- return 0;
+ {
+ dout(2) << "..offset out of range" << endl;
+ return 0;
+ }
if (offset + len > obj.length)
len = obj.length - (size_t) offset;
- dout(3) << " doing partial read of " << len << endl;
+ dout(3) << "..doing partial read of " << len << endl;
oid_t id;
mkoid (id, oid);
Dbt key (&id, sizeof (oid));
- Dbt value (bl.c_str(), len);
+ Dbt value;
+ char *data = bl.c_str();
+ dout(3) << ".bufferlist c_str returned " << ((void*) data) << endl;
+ value.set_data (data);
value.set_doff ((size_t) offset);
value.set_dlen (len);
value.set_ulen (len);
value.set_flags (DB_DBT_USERMEM | DB_DBT_PARTIAL);
- dout(3) << " getting " << oid << endl;
- if ((ret = db->get (NULL, &key, &value, 0)) != 0)
+ dout(3) << "..getting " << oid << endl;
+ if ((ret = db->get (txn, &key, &value, 0)) != 0)
{
- derr(1) << "get returned " << db_strerror (ret) << endl;
- //txn->abort();
+ derr(1) << ".get returned " << db_strerror (ret) << endl;
+ if (txn)
+ txn->abort();
return -EIO;
}
}
- //txn->commit (0);
+ if (txn)
+ txn->commit (0);
+ dout(4) << "..read OK, returning " << len << endl;
return len;
}
int OSBDB::write(object_t oid, off_t offset, size_t len,
bufferlist& bl, Context *onsafe)
{
+ dout(6) << "Context " << hex << onsafe << dec << endl;
if (!mounted)
- return -EINVAL;
+ {
+ derr(1) << "not mounted!" << endl;
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ return -EINVAL;
+ }
+ scoped_lock __lock(&lock);
dout(2) << "write " << oid << " " << offset << " "
<< len << endl;
if (offset > 0xFFFFFFFFL || offset + len > 0xFFFFFFFFL)
- return -ENOSPC;
+ {
+ derr(1) << "object too big" << endl;
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ return -ENOSPC;
+ }
DbTxn *txn = NULL;
- //env->txn_begin (NULL, &txn, 0);
+ if (transactional)
+ env->txn_begin (NULL, &txn, 0);
object_inode_key _ikey = new_object_inode_key (oid);
stored_object obj;
ival.set_flags (DB_DBT_USERMEM);
int ret;
- dout(3) << " getting " << _ikey << endl;
+ dout(3) << "..getting " << _ikey << endl;
if (db->get (txn, &ikey, &ival, 0) != 0)
{
- dout(3) << " writing new object" << endl;
+ dout(3) << "..writing new object" << endl;
// New object.
obj.length = (size_t) offset + len;
- dout(3) << " mapping " << _ikey << " => "
+ dout(3) << "..mapping " << _ikey << " => "
<< obj << endl;
if ((ret = db->put (txn, &ikey, &ival, 0)) != 0)
{
- derr(1) << " put returned " << db_strerror (ret) << endl;
+ derr(1) << "..put returned " << db_strerror (ret) << endl;
+ if (txn)
+ txn->abort();
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
return -EIO;
}
value.set_doff ((size_t) offset);
value.set_dlen (len);
}
- dout(3) << " mapping " << oid << " => ("
+ dout(3) << "..mapping " << oid << " => ("
<< obj.length << " bytes)" << endl;
if ((ret = db->put (txn, &key, &value, 0)) != 0)
{
- derr(1) << " put returned " << db_strerror (ret) << endl;
+ derr(1) << "..put returned " << db_strerror (ret) << endl;
+ if (txn)
+ txn->abort();
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
return -EIO;
}
+
+ if (txn != NULL)
+ txn->commit (0);
+ if (onsafe != NULL)
+ COMMIT(onsafe);
+
+ dout(4) << "..write OK, returning " << len << endl;
return len;
}
obj.length = len;
if ((ret = db->put (txn, &ikey, &ival, 0)) != 0)
{
- derr(1) << " put returned " << db_strerror (ret) << endl;
+ derr(1) << " put returned " << db_strerror (ret) << endl;
+ if (txn)
+ txn->abort();
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
return -EIO;
}
}
Dbt value (bl.c_str(), len);
if (db->put (txn, &key, &value, 0) != 0)
{
+ if (txn)
+ txn->abort();
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ derr(1) << "..writing object failed!" << endl;
return -EIO;
}
}
if (offset + len > obj.length)
{
obj.length = (size_t) offset + len;
- if (db->put (NULL, &ikey, &ival, 0) != 0)
+ if (db->put (txn, &ikey, &ival, 0) != 0)
{
+ if (txn)
+ txn->abort();
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ derr(1) << "..writing object info failed!" << endl;
return -EIO;
}
}
value.set_dlen (len);
value.set_ulen (len);
value.set_flags (DB_DBT_PARTIAL);
- if (db->put (NULL, &key, &value, 0) != 0)
+ if (db->put (txn, &key, &value, 0) != 0)
{
+ if (txn)
+ txn->abort();
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ derr(1) << "..writing object failed!" << endl;
return -EIO;
}
}
+ if (txn != NULL)
+ txn->commit (0);
+ if (onsafe != NULL)
+ COMMIT(onsafe);
+
+ dout(4) << "..write OK, returning " << len << endl;
return len;
}
int OSBDB::clone(object_t oid, object_t noid)
{
if (!mounted)
- return -EINVAL;
+ {
+ derr(1) << "not mounted!" << endl;
+ return -EINVAL;
+ }
dout(2) << "clone " << oid << ", " << noid << endl;
if (exists (noid))
- return -EEXIST;
+ {
+ dout(4) << "..target exists; returning -EEXIST" << endl;
+ return -EEXIST;
+ }
+
+ DbTxn *txn = NULL;
+ if (transactional)
+ env->txn_begin (NULL, &txn, 0);
object_inode_key _ikey = new_object_inode_key (oid);
object_inode_key _nikey = new_object_inode_key (noid);
Dbt value;
value.set_flags (DB_DBT_MALLOC);
- if (db->get (NULL, &ikey, &ival, 0) != 0)
- return -ENOENT;
- if (db->get (NULL, &key, &value, 0) != 0)
- return -ENOENT;
+ if (db->get (txn, &ikey, &ival, 0) != 0)
+ {
+ if (txn)
+ txn->abort();
+ derr(1) << "..getting object info failed!" << endl;
+ return -ENOENT;
+ }
+ if (db->get (txn, &key, &value, 0) != 0)
+ {
+ if (txn)
+ txn->abort();
+ derr(1) << "..getting original object failed" << endl;
+ return -ENOENT;
+ }
auto_ptr<char> valueptr ((char *) value.get_data());
- if (db->put (NULL, &nikey, &ival, 0) != 0)
- return -EIO;
- if (db->put (NULL, &nkey, &value, 0) != 0)
- return -EIO;
+ if (db->put (txn, &nikey, &ival, 0) != 0)
+ {
+ if (txn)
+ txn->abort();
+ derr(1) << "..putting object info failed" << endl;
+ return -EIO;
+ }
+ if (db->put (txn, &nkey, &value, 0) != 0)
+ {
+ if (txn)
+ txn->abort();
+ derr(1) << "..putting new object failed" << endl;
+ return -EIO;
+ }
+
+ if (txn)
+ txn->commit (0);
+ dout(4) << "..clone OK" << endl;
return 0;
}
int OSBDB::list_collections(list<coll_t>& ls)
{
if (!mounted)
- return -EINVAL;
+ {
+ derr(1) << "not mounted!" << endl;
+ return -EINVAL;
+ }
dout(2) << "list_collections" << endl;
value.set_flags (DB_DBT_MALLOC);
if (db->get (NULL, &key, &value, 0) != 0)
- return 0; // no collections.
+ {
+ dout(4) << "..no collections" << endl;
+ return 0; // no collections.
+ }
auto_ptr<stored_colls> sc ((stored_colls *) value.get_data());
stored_colls *scp = sc.get();
for (uint32_t i = 0; i < sc->count; i++)
ls.push_back (scp->colls[i]);
+ dout(4) << "..list_collections returns " << scp->count << endl;
return scp->count;
}
int OSBDB::create_collection(coll_t c, Context *onsafe)
{
+ dout(6) << "Context " << hex << onsafe << dec << endl;
if (!mounted)
- return -EINVAL;
+ {
+ derr(1) << "not mounted" << endl;
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ return -EINVAL;
+ }
- dout(2) << "create_collection " << c << endl;
+ scoped_lock __lock(&lock);
+ dout(2) << "create_collection " << hex << c << dec << endl;
Dbt key (COLLECTIONS_KEY, 1);
Dbt value;
value.set_flags (DB_DBT_MALLOC);
+ DbTxn *txn = NULL;
+ if (transactional)
+ env->txn_begin (NULL, &txn, 0);
+
stored_colls *scp = NULL;
size_t sz = 0;
bool created = false;
- if (db->get (NULL, &key, &value, 0) != 0)
+ if (db->get (txn, &key, &value, 0) != 0)
{
sz = sizeof (stored_colls) + sizeof (coll_t);
scp = (stored_colls *) malloc (sz);
int ins = 0;
if (scp->count > 0)
ins = binary_search<coll_t> (scp->colls, scp->count, c);
- if (scp->colls[ins] == c)
- return -EEXIST;
+ if (ins < scp->count && scp->colls[ins] == c)
+ {
+ if (txn != NULL)
+ txn->abort();
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ derr(1) << ".collection " << c << " already exists " << endl;
+ return -EEXIST;
+ }
dout(3) << "..insertion point: " << ins << endl;
// Put the modified collection list back.
{
Dbt value2 (scp, sz);
- if (db->put (NULL, &key, &value2, 0) != 0)
+ if (db->put (txn, &key, &value2, 0) != 0)
{
+ if (txn != NULL)
+ txn->abort();
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ derr(1) << ".writing new collections list failed" << endl;
return -EIO;
}
}
new_coll.count = 0;
Dbt coll_key (&c, sizeof (coll_t));
Dbt coll_value (&new_coll, sizeof (stored_coll));
- if (db->put (NULL, &coll_key, &coll_value, 0) != 0)
+ if (db->put (txn, &coll_key, &coll_value, 0) != 0)
{
+ if (txn != NULL)
+ txn->abort();
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ derr(1) << ".writing new collection failed" << endl;
return -EIO;
}
}
+ if (txn != NULL)
+ txn->commit (0);
+ if (onsafe != NULL)
+ COMMIT(onsafe);
+
+ dout(4) << "..create_collection OK" << endl;
return 0;
}
int OSBDB::destroy_collection(coll_t c, Context *onsafe)
{
+ dout(6) << "Context " << hex << onsafe << dec << endl;
if (!mounted)
- return -EINVAL;
+ {
+ derr(1) << "not mounted" << endl;
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ return -EINVAL;
+ }
- dout(2) << "destroy_collection " << c << endl;
+ scoped_lock __lock(&lock);
+ dout(2) << "destroy_collection " << hex << c << dec << endl;
Dbt key (COLLECTIONS_KEY, 1);
Dbt value;
value.set_flags (DB_DBT_MALLOC);
+ DbTxn *txn = NULL;
- if (db->get (NULL, &key, &value, 0) != 0)
+ if (transactional)
+ env->txn_begin (NULL, &txn, 0);
+
+ if (db->get (txn, &key, &value, 0) != 0)
{
+ if (txn != NULL)
+ txn->abort();
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ derr(1) << ".collection list doesn't exist" << endl;
return -ENOENT; // XXX
}
auto_ptr<stored_colls> valueBuf (scp);
if (scp->count == 0)
{
+ if (txn != NULL)
+ txn->abort();
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ derr(1) << ".collection " << c << " not listed" << endl;
return -ENOENT;
}
uint32_t ins = binary_search<coll_t> (scp->colls, scp->count, c);
- if (scp->colls[ins] != c)
+ dout(4) << "..insertion point is " << ins << endl;
+ if (ins >= scp->count || scp->colls[ins] != c)
{
+ if (txn != NULL)
+ txn->abort();
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ derr(1) << ".collection " << c << " not listed" << endl;
return -ENOENT;
}
+ dout(4) << "..collections list is " << scp << endl;
+
// Move the rest of the list down in memory, if needed.
- if (ins < scp->count - 1)
+ if (ins < scp->count)
{
size_t n = scp->count - ins - 1;
+ dout(4) << "..shift list down " << n << endl;
memmove (&scp->colls[ins], &scp->colls[ins + 1], n);
}
+ dout(4) << "..collections list is " << scp << endl;
+
// Modify the record size to be one less.
Dbt nvalue (scp, value.get_size() - sizeof (coll_t));
nvalue.set_flags (DB_DBT_USERMEM);
- if (db->put (NULL, &key, &nvalue, 0) != 0)
+ if (db->put (txn, &key, &nvalue, 0) != 0)
{
+ if (txn != NULL)
+ txn->abort();
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ derr(1) << ".putting modified collection list failed" << endl;
return -EIO;
}
// Delete the collection.
Dbt collKey (&c, sizeof (coll_t));
- if (db->del (NULL, &collKey, 0) != 0)
+ if (db->del (txn, &collKey, 0) != 0)
{
+ if (txn != NULL)
+ txn->abort();
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ derr(1) << ".deleting collection failed" << endl;
return -EIO;
}
+ if (txn != NULL)
+ txn->commit (0);
+ if (onsafe != NULL)
+ COMMIT(onsafe);
+ dout(4) << "..destroy_collection OK" << endl;
return 0;
}
bool OSBDB::collection_exists(coll_t c)
{
if (!mounted)
- return -EINVAL;
+ {
+ derr(1) << "not mounted" << endl;
+ return -EINVAL;
+ }
- dout(2) << "collection_exists " << c << endl;
+ dout(2) << "collection_exists " << hex << c << dec << endl;
- Dbt key (COLLECTIONS_KEY, 1);
+ /*Dbt key (COLLECTIONS_KEY, 1);
Dbt value;
value.set_flags (DB_DBT_MALLOC);
if (db->get (NULL, &key, &value, 0) != 0)
- return false;
+ {
+ dout(4) << "..no collection list; return false" << endl;
+ return false;
+ }
stored_colls *scp = (stored_colls *) value.get_data();
auto_ptr<stored_colls> sc (scp);
+ dout(5) << "..collection list is " << scp << endl;
if (scp->count == 0)
- return false;
+ {
+ dout(4) << "..empty collection list; return false" << endl;
+ return false;
+ }
uint32_t ins = binary_search<coll_t> (scp->colls, scp->count, c);
+ dout(4) << "..insertion point is " << ins << endl;
- return (scp->colls[ins] == c);
+ int ret = (scp->colls[ins] == c);
+ dout(4) << "..returns " << ret << endl;
+ return ret;*/
+
+ Dbt key (&c, sizeof (coll_t));
+ Dbt value;
+ value.set_flags (DB_DBT_MALLOC);
+ if (db->get (NULL, &key, &value, 0) != 0)
+ {
+ dout(4) << "..no collection, return false" << endl;
+ return false;
+ }
+ void *val = value.get_data();
+ free (val);
+ dout(4) << "..collection exists; return true" << endl;
+ return true;
}
int OSBDB::collection_stat(coll_t c, struct stat *st)
{
if (!mounted)
- return -EINVAL;
+ {
+ derr(1) << "not mounted" << endl;
+ return -EINVAL;
+ }
dout(2) << "collection_stat " << c << endl;
+ // XXX is this needed?
return -ENOSYS;
}
int OSBDB::collection_add(coll_t c, object_t o, Context *onsafe)
{
+ dout(6) << "Context " << hex << onsafe << dec << endl;
if (!mounted)
- return -EINVAL;
+ {
+ dout(2) << "not mounted" << endl;
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ return -EINVAL;
+ }
- dout(2) << "collection_add " << c << " " << o << endl;
+ scoped_lock __lock(&lock);
+ dout(2) << "collection_add " << hex << c << dec << " " << o << endl;
Dbt key (&c, sizeof (coll_t));
Dbt value;
value.set_flags (DB_DBT_MALLOC);
+ DbTxn *txn = NULL;
- if (db->get (NULL, &key, &value, 0) != 0)
+ if (transactional)
+ env->txn_begin (NULL, &txn, 0);
+
+ if (db->get (txn, &key, &value, 0) != 0)
{
+ if (txn != NULL)
+ txn->abort();
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ derr(1) << "failed to find collection" << endl;
return -ENOENT;
}
{
ins = binary_search<object_t> (scp->objects, scp->count, o);
// Already there?
- if (scp->objects[ins] == o)
+ if (ins < scp->count && scp->objects[ins] == o)
{
+ if (txn != NULL)
+ txn->abort();
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ derr(1) << "collection already has object" << endl;
return -EEXIST;
}
}
scp = (stored_coll *) realloc (scp, sz);
sc.release();
sc.reset (scp);
- if (ins < scp->count)
+ dout(3) << "..current collection: " << scp << endl;
+ if (ins < scp->count)
{
size_t n = (scp->count - ins) * sizeof (object_t);
+ dout(3) << "..move up " << n << " bytes" << endl;
memmove (&scp->objects[ins + 1], &scp->objects[ins], n);
}
scp->count++;
dout(3) << "..collection: " << scp << endl;
Dbt nvalue (scp, sz);
- if (db->put (NULL, &key, &nvalue, 0) != 0)
+ if (db->put (txn, &key, &nvalue, 0) != 0)
{
+ if (txn != NULL)
+ txn->abort();
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ derr(1) << "..putting modified collection failed" << endl;
return -EIO;
}
+ if (txn != NULL)
+ txn->commit (0);
+ if (onsafe != NULL)
+ COMMIT(onsafe);
+ dout(4) << "..collection add OK" << endl;
return 0;
}
int OSBDB::collection_remove(coll_t c, object_t o, Context *onsafe)
{
+ dout(6) << "Context " << hex << onsafe << dec << endl;
if (!mounted)
- return -EINVAL;
+ {
+ derr(1) << "not mounted" << endl;
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ return -EINVAL;
+ }
- dout(2) << "collection_remove " << c << " " << o << endl;
+ scoped_lock __lock(&lock);
+ dout(2) << "collection_remove " << hex << c << dec << " " << o << endl;
Dbt key (&c, sizeof (coll_t));
Dbt value;
value.set_flags (DB_DBT_MALLOC);
+ DbTxn *txn = NULL;
+
+ if (transactional)
+ env->txn_begin (NULL, &txn, 0);
- if (db->get (NULL, &key, &value, 0) != 0)
+ if (db->get (txn, &key, &value, 0) != 0)
{
+ if (txn != NULL)
+ txn->abort();
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ dout(1) << "..collection doesn't exist" << endl;
return -ENOENT;
}
stored_coll *scp = (stored_coll *) value.get_data();
auto_ptr<stored_coll> sc (scp);
+ dout(5) << "..collection is " << scp << endl;
if (scp->count == 0)
{
+ if (txn != NULL)
+ txn->abort();
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ dout(1) << "..collection is empty" << endl;
return -ENOENT;
}
uint32_t ins = binary_search<object_t> (scp->objects, scp->count, o);
- if (scp->objects[ins] != o)
+ dout(4) << "..insertion point is " << ins << endl;
+ if (ins >= scp->count || scp->objects[ins] != o)
{
+ if (txn != NULL)
+ txn->abort();
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ dout(1) << "..object not in collection" << endl;
return -ENOENT;
}
if (ins < scp->count - 1)
{
size_t n = (scp->count - ins - 1) * sizeof (object_t);
+ dout(5) << "..moving " << n << " bytes down" << endl;
memmove (&scp->objects[ins], &scp->objects[ins + 1], n);
}
scp->count--;
dout(3) << "..collection " << scp << endl;
Dbt nval (scp, value.get_size() - sizeof (object_t));
- if (db->put (NULL, &key, &nval, 0) != 0)
+ if (db->put (txn, &key, &nval, 0) != 0)
{
+ if (txn != NULL)
+ txn->abort();
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ derr(1) << "..putting modified collection failed" << endl;
return -EIO;
}
+ if (txn != NULL)
+ txn->commit (0);
+ if (onsafe != NULL)
+ COMMIT(onsafe);
+ dout(4) << "..collection remove OK" << endl;
return 0;
}
int OSBDB::collection_list(coll_t c, list<object_t>& o)
{
if (!mounted)
- return -EINVAL;
+ {
+ derr(1) << "not mounted" << endl;
+ return -EINVAL;
+ }
Dbt key (&c, sizeof (coll_t));
Dbt value;
- if (db->get (NULL, &key, &value, 0) != 0)
- return -ENOENT;
+ DbTxn *txn = NULL;
+
+ if (transactional)
+ env->txn_begin (NULL, &txn, 0);
+
+ if (db->get (txn, &key, &value, 0) != 0)
+ {
+ if (txn != NULL)
+ txn->abort();
+ return -ENOENT;
+ }
stored_coll *scp = (stored_coll *) value.get_data();
auto_ptr<stored_coll> sc (scp);
for (uint32_t i = 0; i < scp->count; i++)
o.push_back (scp->objects[i]);
+ if (txn != NULL)
+ txn->commit (0);
return 0;
}
\f // Attributes
int OSBDB::_setattr(object_t oid, const char *name,
- const void *value, size_t size, Context *onsafe)
+ const void *value, size_t size, Context *onsafe,
+ DbTxn *txn)
{
+ dout(6) << "Context " << hex << onsafe << dec << endl;
if (!mounted)
- return -EINVAL;
+ {
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ return -EINVAL;
+ }
if (strlen (name) >= OSBDB_MAX_ATTR_LEN)
- return -ENAMETOOLONG;
+ {
+ derr(1) << "name too long: " << name << endl;
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ return -ENAMETOOLONG;
+ }
+
+ scoped_lock __lock(&lock);
// Add name to attribute list, if needed.
attrs_id aids = new_attrs_id (oid);
size_t sz = 0;
dout(3) << " getting " << aids << endl;
- if (db->get (NULL, &attrs_key, &attrs_val, 0) != 0)
+ if (db->get (txn, &attrs_key, &attrs_val, 0) != 0)
{
dout(2) << " first attribute" << endl;
sz = sizeof (stored_attrs);
{
sz = attrs_val.get_size();
sap = (stored_attrs *) attrs_val.get_data();
- dout(2) << " add to list of " << sap->count << " attrs" << endl;
+ dout(2) << "..add to list of " << sap->count << " attrs" << endl;
}
auto_ptr<stored_attrs> sa (sap);
int ins = 0;
if (sap->count > 0)
ins = binary_search<attr_name> (sap->names, sap->count, _name);
- dout(3) << " insertion point is " << ins << endl;
- if (sap->count == 0 || strcmp (sap->names[ins].name, name) != 0)
+ dout(3) << "..insertion point is " << ins << endl;
+ if (sap->count == 0 ||
+ (ins >= sap->count || strcmp (sap->names[ins].name, name) != 0))
{
sz += sizeof (attr_name);
- dout(3) << " realloc 0x" << hex << ((void *) sap) << " to "
+ dout(3) << "..realloc " << ((void *) sap) << " to "
<< dec << sz << endl;
sap = (stored_attrs *) realloc (sap, sz);
- dout(3) << " returns 0x" << hex << ((void *) sap) << endl;
+ dout(3) << "..returns " << ((void *) sap) << endl;
sa.release ();
sa.reset (sap);
int n = (sap->count - ins) * sizeof (attr_name);
if (n > 0)
{
- dout(3) << " move " << n << " bytes from 0x"
+ dout(3) << "..move " << n << " bytes from 0x"
<< hex << (&sap->names[ins]) << " to 0x"
- << hex << (&sap->names[ins+1]) << endl;
+ << hex << (&sap->names[ins+1]) << dec << endl;
memmove (&sap->names[ins+1], &sap->names[ins], n);
}
memset (&sap->names[ins], 0, sizeof (attr_name));
Dbt newAttrs_val (sap, sz);
newAttrs_val.set_ulen (sz);
newAttrs_val.set_flags (DB_DBT_USERMEM);
- dout(3) << " putting " << aids << endl;
- if (db->put (NULL, &attrs_key, &newAttrs_val, 0) != 0)
- return -EIO;
+ dout(3) << "..putting " << aids << endl;
+ if (db->put (txn, &attrs_key, &newAttrs_val, 0) != 0)
+ {
+ derr(1) << ".writing attributes list failed" << endl;
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ return -EIO;
+ }
}
else
{
- dout(3) << " attribute " << name << " already exists" << endl;
+ dout(3) << "..attribute " << name << " already exists" << endl;
}
- dout(3) << " attributes list: " << sap << endl;
+ dout(5) << "..attributes list: " << sap << endl;
// Add the attribute.
attr_id aid = new_attr_id (oid, name);
Dbt attr_key (&aid, sizeof (aid));
Dbt attr_val ((void *) value, size);
- dout(3) << " writing attribute key " << aid << endl;
- if (db->put (NULL, &attr_key, &attr_val, 0) != 0)
- return -EIO;
+ dout(3) << "..writing attribute key " << aid << endl;
+ if (db->put (txn, &attr_key, &attr_val, 0) != 0)
+ {
+ derr(1) << ".writing attribute key failed" << endl;
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ return -EIO;
+ }
+ dout(4) << "..setattr OK" << endl;
+ if (onsafe != NULL)
+ COMMIT(onsafe);
return 0;
}
const void *value, size_t size,
Context *onsafe)
{
+ dout(6) << "Context " << hex << onsafe << dec << endl;
if (!mounted)
- return -EINVAL;
+ {
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ return -EINVAL;
+ }
+
+ DbTxn *txn = NULL;
+ if (transactional)
+ env->txn_begin (NULL, &txn, 0);
dout(2) << "setattr " << oid << ":" << name << " => ("
<< size << " bytes)" << endl;
- int ret = _setattr (oid, name, value, size, onsafe);
+ int ret = _setattr (oid, name, value, size, onsafe, txn);
+ if (ret == 0)
+ {
+ if (txn != NULL)
+ txn->commit (0);
+ }
+ else
+ {
+ if (txn != NULL)
+ txn->abort();
+ }
return ret;
}
int OSBDB::setattrs(object_t oid, map<string,bufferptr>& aset,
Context *onsafe)
{
+ dout(6) << "Context " << hex << onsafe << dec << endl;
if (!mounted)
- return -EINVAL;
+ {
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ return -EINVAL;
+ }
+
+ DbTxn *txn = NULL;
+
+ if (transactional)
+ env->txn_begin (NULL, &txn, 0);
map<string,bufferptr>::iterator it;
for (it = aset.begin(); it != aset.end(); it++)
string name = it->first;
bufferptr value = it->second;
int ret = _setattr (oid, name.c_str(), value.c_str(),
- value.length(), onsafe);
+ value.length(), NULL, txn);
if (ret != 0)
{
+ if (txn != NULL)
+ txn->abort();
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
return ret;
}
}
+
+ if (txn != NULL)
+ txn->commit (0);
+ if (onsafe != NULL)
+ COMMIT(onsafe);
return 0;
}
if (!mounted)
return -EINVAL;
+ dout(2) << "_getattr " << oid << " " << name << " " << size << endl;
+
attr_id aid = new_attr_id (oid, name);
Dbt key (&aid, sizeof (aid));
Dbt val (value, size);
val.set_ulen (size);
+ val.set_doff (0);
+ val.set_dlen (size);
val.set_flags (DB_DBT_USERMEM | DB_DBT_PARTIAL);
- if (db->get (NULL, &key, &val, 0) != 0)
+ int ret;
+ if ((ret = db->get (NULL, &key, &val, 0)) != 0)
{
+ derr(1) << ".getting value failed: " << db_strerror (ret) << endl;
return -ENOENT;
}
+ dout(4) << ".._getattr OK; returns " << val.get_size() << endl;
return val.get_size();
}
if (!mounted)
return -EINVAL;
- int count = 0;
for (map<string,bufferptr>::iterator it = aset.begin();
it != aset.end(); it++)
{
(*it).second.length());
if (ret < 0)
return ret;
- count += ret;
}
- return count;
+ return 0;
}
int OSBDB::rmattr(object_t oid, const char *name, Context *onsafe)
{
+ dout(6) << "Context " << hex << onsafe << dec << endl;
if (!mounted)
- return -EINVAL;
+ {
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ return -EINVAL;
+ }
+
+ scoped_lock __lock(&lock);
+ dout(2) << "rmattr " << oid << " " << name << endl;
+
attrs_id aids = new_attrs_id (oid);
Dbt askey (&aids, sizeof_attrs_id());
Dbt asvalue;
asvalue.set_flags (DB_DBT_MALLOC);
- if (db->get (NULL, &askey, &asvalue, 0) != 0)
- return -ENOENT;
+ DbTxn *txn = NULL;
+
+ if (transactional)
+ env->txn_begin (NULL, &txn, 0);
+
+ if (db->get (txn, &askey, &asvalue, 0) != 0)
+ {
+ if (txn != NULL)
+ txn->abort();
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ return -ENOENT;
+ }
stored_attrs *sap = (stored_attrs *) asvalue.get_data();
auto_ptr<stored_attrs> sa (sap);
+ dout(5) << "..attributes list " << sap << endl;
+
if (sap->count == 0)
- return -ENOENT;
+ {
+ if (txn != NULL)
+ txn->abort();
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ derr(1) << ".empty attribute list" << endl;
+ return -ENOENT;
+ }
attr_name _name;
- memset(&name, 0, sizeof (_name));
+ memset(&_name, 0, sizeof (_name));
strncpy (_name.name, name, OSBDB_MAX_ATTR_LEN);
int ins = binary_search<attr_name> (sap->names, sap->count, _name);
- if (strcmp (sap->names[ins].name, name) != 0)
- return -ENOENT;
+ dout(4) << "..insertion point is " << ins << endl;
+ if (ins >= sap->count || strcmp (sap->names[ins].name, name) != 0)
+ {
+ if (txn != NULL)
+ txn->abort();
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ derr(1) << ".attribute not found in list" << endl;
+ return -ENOENT;
+ }
// Shift the later elements down by one, if needed.
- int n = (sap->count - ins) * sizeof (attr_name);
+ int n = (sap->count - ins - 1) * sizeof (attr_name);
if (n > 0)
- memmove (&(sap->names[ins]), &(sap->names[ins + 1]), n);
+ {
+ dout(4) << "..shift down by " << n << endl;
+ memmove (&(sap->names[ins]), &(sap->names[ins + 1]), n);
+ }
sap->count--;
+ dout(5) << "..attributes list now " << sap << endl;
+
asvalue.set_size(asvalue.get_size() - sizeof (attr_name));
int ret;
- if ((ret = db->put (NULL, &askey, &asvalue, 0)) != 0)
+ if ((ret = db->put (txn, &askey, &asvalue, 0)) != 0)
{
derr(1) << "put stored_attrs " << db_strerror (ret) << endl;
+ if (txn != NULL)
+ txn->abort();
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
return -EIO;
}
// Remove the attribute.
attr_id aid = new_attr_id (oid, name);
Dbt key (&aid, sizeof (aid));
- if ((ret = db->del (NULL, &key, 0)) != 0)
- derr(1) << "deleting " << aid << ": " << db_strerror(ret) << endl;
+ if ((ret = db->del (txn, &key, 0)) != 0)
+ {
+ derr(1) << "deleting " << aid << ": " << db_strerror(ret) << endl;
+ if (txn != NULL)
+ txn->abort();
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ return -EIO;
+ }
+ if (txn != NULL)
+ txn->commit (0);
+ if (onsafe != NULL)
+ COMMIT(onsafe);
+ dout(4) << "..rmattr OK" << endl;
return 0;
}
Dbt value;
value.set_flags (DB_DBT_MALLOC);
+ // XXX Transactions for read atomicity???
+
int ret;
if ((ret = db->get (NULL, &key, &value, 0)) != 0)
{
p[n] = '\0';
p = p + n + 1;
}
+
+ dout(4) << "listattr OK" << endl;
return 0;
}
const void *value, size_t size,
Context *onsafe)
{
+ dout(6) << "Context " << hex << onsafe << dec << endl;
if (!mounted)
- return -EINVAL;
+ {
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ return -EINVAL;
+ }
- dout(2) << "collection_setattr" << cid << " " << name
+ scoped_lock __lock(&lock);
+ dout(2) << "collection_setattr " << hex << cid << dec << " " << name
<< " (" << size << " bytes)" << endl;
if (strlen (name) >= OSBDB_MAX_ATTR_LEN)
- return -ENAMETOOLONG;
+ {
+ derr(1) << "name too long" << endl;
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ return -ENAMETOOLONG;
+ }
// Add name to attribute list, if needed.
coll_attrs_id aids = new_coll_attrs_id (cid);
stored_attrs *sap = NULL;
size_t sz = 0;
+ DbTxn *txn = NULL;
+ if (transactional)
+ env->txn_begin (NULL, &txn, 0);
+
dout(3) << " getting " << aids << endl;
- if (db->get (NULL, &attrs_key, &attrs_val, 0) != 0)
+ if (db->get (txn, &attrs_key, &attrs_val, 0) != 0)
{
dout(2) << " first attribute" << endl;
sz = sizeof (stored_attrs);
if (sap->count > 0)
ins = binary_search<attr_name> (sap->names, sap->count, _name);
dout(3) << " insertion point is " << ins << endl;
- if (sap->count == 0 || strcmp (sap->names[ins].name, name) != 0)
+ if (ins >= sap->count || strcmp (sap->names[ins].name, name) != 0)
{
sz += sizeof (attr_name);
- dout(3) << " realloc 0x" << hex << ((void *) sap) << " to "
+ dout(3) << " realloc " << hex << ((void *) sap) << " to "
<< dec << sz << endl;
sap = (stored_attrs *) realloc (sap, sz);
- dout(3) << " returns 0x" << hex << ((void *) sap) << endl;
+ dout(3) << " returns " << hex << ((void *) sap) << dec << endl;
sa.release ();
sa.reset (sap);
int n = (sap->count - ins) * sizeof (attr_name);
{
dout(3) << " move " << n << " bytes from 0x"
<< hex << (&sap->names[ins]) << " to 0x"
- << hex << (&sap->names[ins+1]) << endl;
+ << hex << (&sap->names[ins+1]) << dec << endl;
memmove (&sap->names[ins+1], &sap->names[ins], n);
}
memset (&sap->names[ins], 0, sizeof (attr_name));
newAttrs_val.set_ulen (sz);
newAttrs_val.set_flags (DB_DBT_USERMEM);
dout(3) << " putting " << aids << endl;
- if (db->put (NULL, &attrs_key, &newAttrs_val, 0) != 0)
- return -EIO;
+ if (db->put (txn, &attrs_key, &newAttrs_val, 0) != 0)
+ {
+ if (txn != NULL)
+ txn->abort();
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ derr(1) << ".putting new attributes failed" << endl;
+ return -EIO;
+ }
}
else
{
- dout(3) << " attribute " << name << " already exists" << endl;
+ dout(3) << "..attribute " << name << " already exists" << endl;
}
- dout(3) << " attributes list: " << sap << endl;
+ dout(3) << "..attributes list: " << sap << endl;
// Add the attribute.
coll_attr_id aid = new_coll_attr_id (cid, name);
Dbt attr_key (&aid, sizeof (aid));
Dbt attr_val ((void *) value, size);
dout(3) << " writing attribute key " << aid << endl;
- if (db->put (NULL, &attr_key, &attr_val, 0) != 0)
- return -EIO;
+ if (db->put (txn, &attr_key, &attr_val, 0) != 0)
+ {
+ if (txn != NULL)
+ txn->abort();
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ derr(1) << ".putting attribute failed" << endl;
+ return -EIO;
+ }
+
+ if (txn != NULL)
+ txn->commit (0);
+ if (onsafe != NULL)
+ COMMIT(onsafe);
+ dout(4) << "..collection setattr OK" << endl;
return 0;
}
int OSBDB::collection_rmattr(coll_t cid, const char *name,
Context *onsafe)
{
+ dout(6) << "Context " << hex << onsafe << dec << endl;
if (!mounted)
- return -EINVAL;
+ {
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ return -EINVAL;
+ }
+
+ scoped_lock __lock(&lock);
+ dout(2) << "collection_rmattr " << hex << cid << dec
+ << " " << name << endl;
coll_attrs_id aids = new_coll_attrs_id (cid);
Dbt askey (&aids, sizeof_coll_attrs_id());
Dbt asvalue;
asvalue.set_flags (DB_DBT_MALLOC);
- if (db->get (NULL, &askey, &asvalue, 0) != 0)
- return -ENOENT;
+ DbTxn *txn = NULL;
+ if (transactional)
+ env->txn_begin (NULL, &txn, 0);
+
+ if (db->get (txn, &askey, &asvalue, 0) != 0)
+ {
+ if (txn != NULL)
+ txn->abort();
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ derr(1) << ".no attributes list" << endl;
+ return -ENOENT;
+ }
stored_attrs *sap = (stored_attrs *) asvalue.get_data();
auto_ptr<stored_attrs> sa (sap);
+ dout(5) << "..attributes list " << sap << endl;
if (sap->count == 0)
- return -ENOENT;
+ {
+ if (txn != NULL)
+ txn->abort();
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ derr(1) << ".empty attributes list" << endl;
+ return -ENOENT;
+ }
attr_name _name;
- memset(&name, 0, sizeof (_name));
+ memset(&_name, 0, sizeof (_name));
strncpy (_name.name, name, OSBDB_MAX_ATTR_LEN);
int ins = binary_search<attr_name> (sap->names, sap->count, _name);
- if (strcmp (sap->names[ins].name, name) != 0)
- return -ENOENT;
+ if (ins >= sap->count || strcmp (sap->names[ins].name, name) != 0)
+ {
+ if (txn != NULL)
+ txn->abort();
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ derr(1) << ".attribute not listed" << endl;
+ return -ENOENT;
+ }
// Shift the later elements down by one, if needed.
- int n = (sap->count - ins) * sizeof (attr_name);
+ int n = (sap->count - ins - 1) * sizeof (attr_name);
if (n > 0)
- memmove (&(sap->names[ins]), &(sap->names[ins + 1]), n);
+ {
+ dout(4) << "..shift down by " << n << endl;
+ memmove (&(sap->names[ins]), &(sap->names[ins + 1]), n);
+ }
sap->count--;
+ dout(5) << "..attributes list now " << sap << endl;
+
asvalue.set_size(asvalue.get_size() - sizeof (attr_name));
int ret;
- if ((ret = db->put (NULL, &askey, &asvalue, 0)) != 0)
+ if ((ret = db->put (txn, &askey, &asvalue, 0)) != 0)
{
derr(1) << "put stored_attrs " << db_strerror (ret) << endl;
+ if (txn != NULL)
+ txn->abort();
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
return -EIO;
}
// Remove the attribute.
coll_attr_id aid = new_coll_attr_id (cid, name);
Dbt key (&aid, sizeof (aid));
- if ((ret = db->del (NULL, &key, 0)) != 0)
- derr(1) << "deleting " << aid << ": " << db_strerror(ret) << endl;
+ if ((ret = db->del (txn, &key, 0)) != 0)
+ {
+ derr(1) << "deleting " << aid << ": " << db_strerror(ret) << endl;
+ if (txn != NULL)
+ txn->abort();
+ if (onsafe != NULL)
+ CLEANUP(onsafe);
+ return -EIO;
+ }
+ if (txn != NULL)
+ txn->commit (0);
+ if (onsafe != NULL)
+ COMMIT(onsafe);
+
+ dout(4) << "..collection rmattr OK" << endl;
return 0;
}
if (!mounted)
return -EINVAL;
- dout(2) << "collection_getattr " << cid << " " << name << endl;
+ dout(2) << "collection_getattr " << hex << cid << dec
+ << " " << name << endl;
+
+ // XXX transactions/read isolation?
coll_attr_id caid = new_coll_attr_id (cid, name);
Dbt key (&caid, sizeof (caid));
val.set_flags (DB_DBT_USERMEM | DB_DBT_PARTIAL);
if (db->get (NULL, &key, &val, 0) != 0)
- return -ENOENT;
+ {
+ derr(1) << ".no attribute entry" << endl;
+ return -ENOENT;
+ }
+ dout(4) << "..collection getattr OK; returns " << val.get_size() << endl;
return val.get_size();
}
if (!mounted)
return -EINVAL;
- dout(2) << "collection_listattr " << cid << endl;
+ dout(2) << "collection_listattr " << hex << cid << dec << endl;
+
+ // XXX transactions/read isolation?
coll_attrs_id caids = new_coll_attrs_id (cid);
Dbt key (&caids, sizeof_coll_attrs_id());
return;
sync();
- // huh?
+
+ if (onsync != NULL)
+ {
+ g_timer.add_event_after(0.1, onsync);
+ }
}
void OSBDB::sync()
if (!mounted)
return;
+ if (transactional)
+ {
+ env->log_flush (NULL);
+ env->lsn_reset (device.c_str(), 0);
+ }
db->sync(0);
}
#include <db_cxx.h>
#include "osd/ObjectStore.h"
-// Redefine this to use a different BDB access type. DB_BTREE is
-// probably the only other one that makes sense.
-#ifndef OSBDB_DB_TYPE
-#define OSBDB_DB_TYPE DB_HASH
-#endif // OSBDB_DB_TYPE
+#define OSBDB_MAGIC 0x05BDB
/*
* Maximum length of an attribute name.
return out;
}
+class OSBDBException : public std::exception
+{
+ const char *msg;
+
+public:
+ OSBDBException(const char *msg) : msg(msg) { }
+ const char *what() const throw() { return msg; }
+};
+
/*
* The object store interface for Berkeley DB.
*/
class OSBDB : public ObjectStore
{
private:
+ Mutex lock;
DbEnv *env;
Db *db;
string device;
+ string env_dir;
bool mounted;
bool opened;
+ bool transactional;
public:
- OSBDB(const char *dev)
- : env(0), db (0), device (dev), mounted(false), opened(false)
+ OSBDB(const char *dev) throw(OSBDBException)
+ : lock(true), env(0), db (0), device (dev), mounted(false), opened(false),
+ transactional(g_conf.bdbstore_transactional)
{
- /*env = new DbEnv (DB_CXX_NO_EXCEPTIONS);
- env->set_error_stream (&std::cerr);
- // WTF? You can't open an env if you set this flag here, but BDB
- // says you also can't set it after you open the env.
- //env->set_flags (DB_LOG_INMEMORY, 1);
- char *p = strrchr (dev, '/');
- int env_flags = (DB_CREATE | DB_THREAD | DB_INIT_LOCK
- | DB_INIT_MPOOL | DB_INIT_TXN | DB_INIT_LOG);
- if (p != NULL)
- {
- *p = '\0';
- if (env->open (dev, env_flags, 0) != 0)
- {
- std::cerr << "failed to open environment: "
- << dev << std::endl;
- ::abort();
- }
- *p = '/';
- dev = p+1;
- }
- else
- {
- if (env->open (NULL, env_flags, 0) != 0)
- {
- std::cerr << "failed to open environment: ." << std::endl;
- ::abort();
- }
- }
-
- // Double WTF: if you remove the DB_LOG_INMEMORY bit, db->open
- // fails, inexplicably, with EINVAL!*/
- // env->set_flags (DB_DIRECT_DB | /*DB_AUTO_COMMIT |*/ DB_LOG_INMEMORY, 1);
}
~OSBDB()
{
umount();
}
- if (env != NULL)
- {
- env->close (0);
- delete env;
- }
}
int mount();
void sync();
private:
- int opendb (DBTYPE type=DB_UNKNOWN, int flags=0);
+ int opendb (DBTYPE type=DB_UNKNOWN, int flags=0, bool new_env=false);
int _setattr(object_t oid, const char *name, const void *value,
- size_t size, Context *onsync);
+ size_t size, Context *onsync, DbTxn *txn);
int _getattr(object_t oid, const char *name, void *value, size_t size);
+ DbEnv *getenv();
};
{
static hash<object_t> H;
assert(sizeof(oid) == 16);
+#ifdef __LP64__
+ sprintf(s, "%s/objects/%02lx/%016lx.%016lx", basedir.c_str(), H(oid) & HASH_MASK,
+ *((__uint64_t*)&oid),
+ *(((__uint64_t*)&oid) + 1));
+#else
sprintf(s, "%s/objects/%02x/%016llx.%016llx", basedir.c_str(), H(oid) & HASH_MASK,
*((__uint64_t*)&oid),
*(((__uint64_t*)&oid) + 1));
+#endif
}
void FakeStore::get_cdir(coll_t cid, char *s)
{
assert(sizeof(cid) == 8);
+#ifdef __LP64__
+ sprintf(s, "%s/collections/%016lx", basedir.c_str(),
+ cid);
+#else
sprintf(s, "%s/collections/%016llx", basedir.c_str(),
cid);
+#endif
}
void FakeStore::get_coname(coll_t cid, object_t oid, char *s)
{
assert(sizeof(oid) == 16);
+#ifdef __LP64__
+ sprintf(s, "%s/collections/%016lx/%016lx.%016lx", basedir.c_str(), cid,
+ *((__uint64_t*)&oid),
+ *(((__uint64_t*)&oid) + 1));
+#else
sprintf(s, "%s/collections/%016llx/%016llx.%016llx", basedir.c_str(), cid,
*((__uint64_t*)&oid),
*(((__uint64_t*)&oid) + 1));
+#endif
}
#include "config.h"
#undef dout
-#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_osd) cout << g_clock.now() << " osd" << whoami << " " << (osdmap ? osdmap->get_epoch():0) << " "
-#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_osd) cerr << g_clock.now() << " osd" << whoami << " " << (osdmap ? osdmap->get_epoch():0) << " "
+#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_osd) cout << dbeginl << g_clock.now() << " osd" << whoami << " " << (osdmap ? osdmap->get_epoch():0) << " "
+#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_osd) cerr << dbeginl << g_clock.now() << " osd" << whoami << " " << (osdmap ? osdmap->get_epoch():0) << " "
char *osd_base_path = "./osddata";
char *ebofs_base_path = "./dev";
void OSD::force_remount()
{
- dout(0) << "forcing remount" << endl;
+ dout(0) << "forcing remount" << dendl;
osd_lock.Lock();
{
store->umount();
store->mount();
}
osd_lock.Unlock();
- dout(0) << "finished remount" << endl;
+ dout(0) << "finished remount" << dendl;
}
// </hack>
gethostname(hostname,100);
sprintf(dev_path, "%s/osd%d", ebofs_base_path, whoami);
-
+
struct stat sta;
if (::lstat(dev_path, &sta) != 0)
sprintf(dev_path, "%s/osd.%s", ebofs_base_path, hostname);
{
// mkfs?
if (g_conf.osd_mkfs) {
- dout(2) << "mkfs" << endl;
+ dout(2) << "mkfs" << dendl;
store->mkfs();
// make up a superblock
}
// mount.
- dout(2) << "mounting " << dev_path << endl;
+ dout(2) << "mounting " << dev_path << dendl;
int r = store->mount();
assert(r>=0);
if (g_conf.osd_mkfs) {
// age?
if (g_conf.osd_age_time != 0) {
- dout(2) << "age" << endl;
+ dout(2) << "age" << dendl;
Ager ager(store);
if (g_conf.osd_age_time < 0)
ager.load_freelist();
}
}
else {
- dout(2) << "boot" << endl;
+ dout(2) << "boot" << dendl;
// read superblock
read_superblock();
// load up pgs (as they previously existed)
load_pgs();
- dout(2) << "superblock: i am osd" << superblock.whoami << endl;
+ dout(2) << "superblock: i am osd" << superblock.whoami << dendl;
assert(whoami == superblock.whoami);
}
}
osd_lock.Unlock();
- //dout(0) << "osd_rep " << g_conf.osd_rep << endl;
+ //dout(0) << "osd_rep " << g_conf.osd_rep << dendl;
return 0;
}
int OSD::shutdown()
{
- dout(1) << "shutdown" << endl;
+ dout(1) << "shutdown" << dendl;
state = STATE_STOPPING;
void OSD::write_superblock(ObjectStore::Transaction& t)
{
- dout(10) << "write_superblock " << superblock << endl;
+ dout(10) << "write_superblock " << superblock << dendl;
bufferlist bl;
bl.append((char*)&superblock, sizeof(superblock));
bufferlist bl;
int r = store->read(SUPERBLOCK_OBJECT, 0, sizeof(superblock), bl);
if (bl.length() != sizeof(superblock)) {
- dout(10) << "read_superblock failed, r = " << r << ", i got " << bl.length() << " bytes, not " << sizeof(superblock) << endl;
+ dout(10) << "read_superblock failed, r = " << r << ", i got " << bl.length() << " bytes, not " << sizeof(superblock) << dendl;
return -1;
}
bl.copy(0, sizeof(superblock), (char*)&superblock);
- dout(10) << "read_superblock " << superblock << endl;
+ dout(10) << "read_superblock " << superblock << dendl;
// load up "current" osdmap
assert(!osdmap);
PG *OSD::_create_lock_pg(pg_t pgid, ObjectStore::Transaction& t)
{
- dout(10) << "_create_lock_pg " << pgid << endl;
+ dout(10) << "_create_lock_pg " << pgid << dendl;
if (pg_map.count(pgid))
- dout(0) << "_create_lock_pg on " << pgid << ", already have " << *pg_map[pgid] << endl;
+ dout(0) << "_create_lock_pg on " << pgid << ", already have " << *pg_map[pgid] << dendl;
// create
PG *pg;
// wait?
if (pg_lock.count(pgid)) {
Cond c;
- dout(15) << "lock_pg " << pgid << " waiting as " << &c << endl;
- //cerr << "lock_pg " << pgid << " waiting as " << &c << endl;
+ dout(15) << "lock_pg " << pgid << " waiting as " << &c << dendl;
+ //cerr << "lock_pg " << pgid << " waiting as " << &c << dendl;
list<Cond*>& ls = pg_lock_waiters[pgid]; // this is commit, right?
ls.push_back(&c);
pg_lock_waiters.erase(pgid);
}
- dout(15) << "lock_pg " << pgid << endl;
+ dout(15) << "lock_pg " << pgid << dendl;
pg_lock.insert(pgid);
PG *pg = pg_map[pgid];
// someone is in line
Cond *c = pg_lock_waiters[pgid].front();
assert(c);
- dout(15) << "unlock_pg " << pgid << " waking up next guy " << c << endl;
+ dout(15) << "unlock_pg " << pgid << " waking up next guy " << c << dendl;
c->Signal();
} else {
// nobody waiting
- dout(15) << "unlock_pg " << pgid << endl;
+ dout(15) << "unlock_pg " << pgid << dendl;
}
}
{
pg_t pgid = pg->info.pgid;
- dout(10) << "_remove_unlock_pg " << pgid << endl;
+ dout(10) << "_remove_unlock_pg " << pgid << dendl;
// there shouldn't be any waiters, since we're a stray, and pg is presumably clean0.
assert(pg_lock_waiters.count(pgid) == 0);
void OSD::load_pgs()
{
- dout(10) << "load_pgs" << endl;
+ dout(10) << "load_pgs" << dendl;
assert(pg_map.empty());
list<coll_t> ls;
int role = osdmap->calc_pg_role(whoami, pg->acting, nrep);
pg->set_role(role);
- dout(10) << "load_pgs loaded " << *pg << " " << pg->log << endl;
+ dout(10) << "load_pgs loaded " << *pg << " " << pg->log << dendl;
}
}
dout(15) << "project_pg_history " << pgid
<< " from " << from << " to " << osdmap->get_epoch()
<< ", start " << h
- << endl;
+ << dendl;
vector<int> last;
osdmap->pg_to_acting_osds(pgid, last);
if (acting != last &&
e <= h.same_since) {
dout(15) << "project_pg_history " << pgid << " changed in " << e+1
- << " from " << acting << " -> " << last << endl;
+ << " from " << acting << " -> " << last << dendl;
h.same_since = e+1;
}
// primary change?
if (!(!acting.empty() && !last.empty() && acting[0] == last[0]) &&
e <= h.same_primary_since) {
- dout(15) << "project_pg_history " << pgid << " primary changed in " << e+1 << endl;
+ dout(15) << "project_pg_history " << pgid << " primary changed in " << e+1 << dendl;
h.same_primary_since = e+1;
if (g_conf.osd_rep == OSD_REP_PRIMARY)
if (g_conf.osd_rep != OSD_REP_PRIMARY) {
if (!(!acting.empty() && !last.empty() && acting[acting.size()-1] == last[last.size()-1]) &&
e <= h.same_acker_since) {
- dout(15) << "project_pg_history " << pgid << " acker changed in " << e+1 << endl;
+ dout(15) << "project_pg_history " << pgid << " acker changed in " << e+1 << dendl;
h.same_acker_since = e+1;
}
}
h.same_acker_since > e) break;
}
- dout(15) << "project_pg_history end " << h << endl;
+ dout(15) << "project_pg_history end " << h << dendl;
}
void OSD::activate_pg(pg_t pgid, epoch_t epoch)
dout(5) << "heartbeat " << now
<< ": ops " << hb_stat_ops
<< ", avg qlen " << avg_qlen
- << endl;
+ << dendl;
// reset until next time around
hb_stat_ops = 0;
// does client have old map?
if (inst.name.is_client()) {
if (epoch < osdmap->get_epoch()) {
- dout(10) << inst.name << " has old map " << epoch << " < " << osdmap->get_epoch() << endl;
+ dout(10) << inst.name << " has old map " << epoch << " < " << osdmap->get_epoch() << dendl;
send_incremental_map(epoch, inst, true);
shared = true;
}
// older?
if (peer_map_epoch[inst.name] < osdmap->get_epoch()) {
- dout(10) << inst.name << " has old map " << epoch << " < " << osdmap->get_epoch() << endl;
+ dout(10) << inst.name << " has old map " << epoch << " < " << osdmap->get_epoch() << dendl;
send_incremental_map(epoch, inst, true);
peer_map_epoch[inst.name] = osdmap->get_epoch(); // so we don't send it again.
shared = true;
// -- don't need lock --
case MSG_PING:
- dout(10) << "ping from " << m->get_source() << endl;
+ dout(10) << "ping from " << m->get_source() << dendl;
delete m;
break;
{
// no map? starting up?
if (!osdmap) {
- dout(7) << "no OSDMap, not booted" << endl;
+ dout(7) << "no OSDMap, not booted" << dendl;
waiting_for_osdmap.push_back(m);
break;
}
// down?
if (osdmap->is_down(whoami)) {
- dout(7) << "i am marked down, dropping " << *m << endl;
+ dout(7) << "i am marked down, dropping " << *m << dendl;
delete m;
break;
}
default:
- dout(1) << " got unknown message " << m->get_type() << endl;
+ dout(1) << " got unknown message " << m->get_type() << dendl;
assert(0);
}
}
entity_name_t dest = inst.name;
if (g_conf.ms_die_on_failure) {
- dout(0) << "ms_handle_failure " << inst << " on " << *m << endl;
+ dout(0) << "ms_handle_failure " << inst << " on " << *m << dendl;
exit(0);
}
if (dest.is_osd()) {
// failed osd. drop message, report to mon.
int mon = monmap->pick_mon();
- dout(0) << "ms_handle_failure " << dest << " inst " << inst
+ dout(0) << "ms_handle_failure " << inst
<< ", dropping and reporting to mon" << mon
- << endl;
+ << " " << *m
+ << dendl;
messenger->send_message(new MOSDFailure(inst, osdmap->get_epoch()),
monmap->get_inst(mon));
delete m;
} else if (dest.is_mon()) {
// resend to a different monitor.
int mon = monmap->pick_mon(true);
- dout(0) << "ms_handle_failure " << dest << " inst " << inst
+ dout(0) << "ms_handle_failure " << inst
<< ", resending to mon" << mon
- << endl;
+ << " " << *m
+ << dendl;
messenger->send_message(m, monmap->get_inst(mon));
}
else {
// client?
- dout(0) << "ms_handle_failure " << dest << " inst " << inst
- << ", dropping" << endl;
+ dout(0) << "ms_handle_failure " << inst
+ << ", dropping " << *m << dendl;
delete m;
}
}
void OSD::handle_osd_ping(MOSDPing *m)
{
- dout(20) << "osdping from " << m->get_source() << endl;
+ dout(20) << "osdping from " << m->get_source() << dendl;
_share_map_incoming(m->get_source_inst(), ((MOSDPing*)m)->map_epoch);
int from = m->get_source().num();
dout(3) << "handle_osd_map epochs ["
<< m->get_first() << "," << m->get_last()
<< "], i have " << osdmap->get_epoch()
- << endl;
+ << dendl;
} else {
dout(3) << "handle_osd_map epochs ["
<< m->get_first() << "," << m->get_last()
<< "], i have none"
- << endl;
+ << dendl;
osdmap = new OSDMap;
boot_epoch = m->get_last(); // hrm...?
}
p++) {
object_t oid = get_osdmap_object_name(p->first);
if (store->exists(oid)) {
- dout(10) << "handle_osd_map already had full map epoch " << p->first << endl;
+ dout(10) << "handle_osd_map already had full map epoch " << p->first << dendl;
logger->inc("mapfdup");
bufferlist bl;
get_map_bl(p->first, bl);
- dout(10) << " .. it is " << bl.length() << " bytes" << endl;
+ dout(10) << " .. it is " << bl.length() << " bytes" << dendl;
continue;
}
- dout(10) << "handle_osd_map got full map epoch " << p->first << endl;
+ dout(10) << "handle_osd_map got full map epoch " << p->first << dendl;
//t.write(oid, 0, p->second.length(), p->second);
store->write(oid, 0, p->second.length(), p->second, 0);
p++) {
object_t oid = get_inc_osdmap_object_name(p->first);
if (store->exists(oid)) {
- dout(10) << "handle_osd_map already had incremental map epoch " << p->first << endl;
+ dout(10) << "handle_osd_map already had incremental map epoch " << p->first << dendl;
logger->inc("mapidup");
bufferlist bl;
get_inc_map_bl(p->first, bl);
- dout(10) << " .. it is " << bl.length() << " bytes" << endl;
+ dout(10) << " .. it is " << bl.length() << " bytes" << dendl;
continue;
}
- dout(10) << "handle_osd_map got incremental map epoch " << p->first << endl;
+ dout(10) << "handle_osd_map got incremental map epoch " << p->first << dendl;
//t.write(oid, 0, p->second.length(), p->second);
store->write(oid, 0, p->second.length(), p->second, 0);
bufferlist bl;
if (m->incremental_maps.count(cur+1) ||
store->exists(get_inc_osdmap_object_name(cur+1))) {
- dout(10) << "handle_osd_map decoding inc map epoch " << cur+1 << endl;
+ dout(10) << "handle_osd_map decoding inc map epoch " << cur+1 << dendl;
bufferlist bl;
if (m->incremental_maps.count(cur+1))
}
else if (m->maps.count(cur+1) ||
store->exists(get_osdmap_object_name(cur+1))) {
- dout(10) << "handle_osd_map decoding full map epoch " << cur+1 << endl;
+ dout(10) << "handle_osd_map decoding full map epoch " << cur+1 << dendl;
bufferlist bl;
if (m->maps.count(cur+1))
bl = m->maps[cur+1];
// FIXME BUG: need to notify messenger of ups/downs!!
}
else {
- dout(10) << "handle_osd_map missing epoch " << cur+1 << endl;
+ dout(10) << "handle_osd_map missing epoch " << cur+1 << dendl;
int mon = monmap->pick_mon();
messenger->send_message(new MOSDGetMap(cur), monmap->get_inst(mon));
break;
{
dout(7) << "advance_map epoch " << osdmap->get_epoch()
<< " " << pg_map.size() << " pgs"
- << endl;
+ << dendl;
if (osdmap->is_mkfs()) {
ps_t maxps = 1ULL << osdmap->get_pg_bits();
ps_t maxlps = 1ULL << osdmap->get_localized_pg_bits();
- dout(1) << "mkfs on " << osdmap->get_pg_bits() << " bits, " << maxps << " pgs" << endl;
+ dout(1) << "mkfs on " << osdmap->get_pg_bits() << " bits, " << maxps << " pgs" << dendl;
assert(osdmap->get_epoch() == 1);
- //cerr << "osdmap " << osdmap->get_ctime() << " logger start " << logger->get_start() << endl;
+ //cerr << "osdmap " << osdmap->get_ctime() << " logger start " << logger->get_start() << dendl;
logger->set_start( osdmap->get_ctime() );
assert(g_conf.osd_mkfs); // make sure we did a mkfs!
pg->info.history.same_acker_since = osdmap->get_epoch();
pg->activate(t);
- dout(7) << "created " << *pg << endl;
+ dout(7) << "created " << *pg << dendl;
_unlock_pg(pgid);
}
pg->info.history.same_since = osdmap->get_epoch();
pg->activate(t);
- dout(7) << "created " << *pg << endl;
+ dout(7) << "created " << *pg << dendl;
_unlock_pg(pgid);
}
}
pg->info.history.same_acker_since = osdmap->get_epoch();
pg->activate(t);
- dout(7) << "created " << *pg << endl;
+ dout(7) << "created " << *pg << dendl;
_unlock_pg(pgid);
}
pg->info.history.same_since = osdmap->get_epoch();
pg->activate(t);
- dout(7) << "created " << *pg << endl;
+ dout(7) << "created " << *pg << dendl;
_unlock_pg(pgid);
}
}
- dout(1) << "mkfs done, created " << pg_map.size() << " pgs" << endl;
+ dout(1) << "mkfs done, created " << pg_map.size() << " pgs" << dendl;
} else {
// scan existing pg's
if (nrep == 0) {
pg->state_set(PG::STATE_CRASHED);
- dout(1) << *pg << " is crashed" << endl;
+ dout(1) << *pg << " is crashed" << dendl;
}
}
// my role changed.
dout(10) << *pg << " " << oldacting << " -> " << pg->acting
- << ", role " << oldrole << " -> " << role << endl;
+ << ", role " << oldrole << " -> " << role << dendl;
} else {
// no role change.
dout(10) << *pg << " " << oldacting << " -> " << pg->acting
<< ", acting primary "
<< oldprimary << " -> " << pg->get_primary()
- << endl;
+ << dendl;
} else {
// primary is the same.
if (role == 0) {
pg->state_clear(PG::STATE_REPLAY);
dout(10) << *pg << " " << oldacting << " -> " << pg->acting
- << ", replicas changed" << endl;
+ << ", replicas changed" << dendl;
}
}
}
void OSD::activate_map(ObjectStore::Transaction& t)
{
- dout(7) << "activate_map version " << osdmap->get_epoch() << endl;
+ dout(7) << "activate_map version " << osdmap->get_epoch() << dendl;
map< int, list<PG::Info> > notify_list; // primary -> list
map< int, map<pg_t,PG::Query> > query_map; // peer -> PG -> get_summary_since
void OSD::send_incremental_map(epoch_t since, const entity_inst_t& inst, bool full)
{
dout(10) << "send_incremental_map " << since << " -> " << osdmap->get_epoch()
- << " to " << inst << endl;
+ << " to " << inst << dendl;
MOSDMap *m = new MOSDMap;
for (e = epoch; e > 0; e--) {
bufferlist bl;
if (get_map_bl(e, bl)) {
- //dout(10) << "get_map " << epoch << " full " << e << endl;
+ //dout(10) << "get_map " << epoch << " full " << e << dendl;
m.decode(bl);
break;
} else {
// apply incrementals
for (e++; e <= epoch; e++) {
- //dout(10) << "get_map " << epoch << " inc " << e << endl;
+ //dout(10) << "get_map " << epoch << " inc " << e << dendl;
m.apply_incremental( incs.front() );
incs.pop_front();
}
{
// older map?
if (ep < osdmap->get_epoch()) {
- dout(7) << "require_current_map epoch " << ep << " < " << osdmap->get_epoch() << endl;
+ dout(7) << "require_current_map epoch " << ep << " < " << osdmap->get_epoch() << dendl;
delete m; // discard and ignore.
return false;
}
// newer map?
if (ep > osdmap->get_epoch()) {
- dout(7) << "require_current_map epoch " << ep << " > " << osdmap->get_epoch() << endl;
+ dout(7) << "require_current_map epoch " << ep << " > " << osdmap->get_epoch() << dendl;
wait_for_new_map(m);
return false;
}
*/
bool OSD::require_same_or_newer_map(Message *m, epoch_t epoch)
{
- dout(10) << "require_same_or_newer_map " << epoch << " (i am " << osdmap->get_epoch() << ")" << endl;
+ dout(10) << "require_same_or_newer_map " << epoch << " (i am " << osdmap->get_epoch() << ")" << dendl;
// newer map?
if (epoch > osdmap->get_epoch()) {
- dout(7) << " from newer map epoch " << epoch << " > " << osdmap->get_epoch() << endl;
+ dout(7) << " from newer map epoch " << epoch << " > " << osdmap->get_epoch() << dendl;
wait_for_new_map(m);
return false;
}
if (epoch < boot_epoch) {
- dout(7) << " from pre-boot epoch " << epoch << " < " << boot_epoch << endl;
+ dout(7) << " from pre-boot epoch " << epoch << " < " << boot_epoch << dendl;
delete m;
return false;
}
it != notify_list.end();
it++) {
if (it->first == whoami) {
- dout(7) << "do_notify osd" << it->first << " is self, skipping" << endl;
+ dout(7) << "do_notify osd" << it->first << " is self, skipping" << dendl;
continue;
}
- dout(7) << "do_notify osd" << it->first << " on " << it->second.size() << " PGs" << endl;
+ dout(7) << "do_notify osd" << it->first << " on " << it->second.size() << " PGs" << dendl;
MOSDPGNotify *m = new MOSDPGNotify(osdmap->get_epoch(), it->second);
_share_map_outgoing(osdmap->get_inst(it->first));
messenger->send_message(m, osdmap->get_inst(it->first));
pit++) {
int who = pit->first;
dout(7) << "do_queries querying osd" << who
- << " on " << pit->second.size() << " PGs" << endl;
+ << " on " << pit->second.size() << " PGs" << dendl;
MOSDPGQuery *m = new MOSDPGQuery(osdmap->get_epoch(),
pit->second);
*/
void OSD::handle_pg_notify(MOSDPGNotify *m)
{
- dout(7) << "handle_pg_notify from " << m->get_source() << endl;
+ dout(7) << "handle_pg_notify from " << m->get_source() << dendl;
int from = m->get_source().num();
if (!require_same_or_newer_map(m, m->get_epoch())) return;
if (m->get_epoch() < history.same_primary_since) {
dout(10) << "handle_pg_notify pg " << pgid << " dne, and primary changed in "
- << history.same_primary_since << " (msg from " << m->get_epoch() << ")" << endl;
+ << history.same_primary_since << " (msg from " << m->get_epoch() << ")" << dendl;
continue;
}
t.collection_setattr(pgid, "info", (char*)&pg->info, sizeof(pg->info));
- dout(10) << *pg << " is new" << endl;
+ dout(10) << *pg << " is new" << dendl;
// kick any waiters
if (waiting_for_pg.count(pgid)) {
if (m->get_epoch() < pg->info.history.same_primary_since) {
dout(10) << *pg << " handle_pg_notify primary changed in "
<< pg->info.history.same_primary_since
- << " (msg from " << m->get_epoch() << ")" << endl;
+ << " (msg from " << m->get_epoch() << ")" << dendl;
_unlock_pg(pgid);
continue;
}
// stray?
bool acting = pg->is_acting(from);
if (!acting && (*it).last_epoch_started > 0) {
- dout(10) << *pg << " osd" << from << " has stray content: " << *it << endl;
+ dout(10) << *pg << " osd" << from << " has stray content: " << *it << dendl;
pg->stray_set.insert(from);
pg->state_clear(PG::STATE_CLEAN);
}
(*it).is_clean() && acting) {
pg->clean_set.insert(from);
dout(10) << *pg << " osd" << from << " now clean (" << pg->clean_set
- << "): " << *it << endl;
+ << "): " << *it << dendl;
if (pg->is_all_clean()) {
- dout(-10) << *pg << " now clean on all replicas" << endl;
+ dout(10) << *pg << " now clean on all replicas" << dendl;
pg->state_set(PG::STATE_CLEAN);
pg->clean_replicas();
}
} else {
// hmm, maybe keep an eye out for cases where we see this, but peer should happen.
- dout(10) << *pg << " already had notify info from osd" << from << ": " << *it << endl;
+ dout(10) << *pg << " already had notify info from osd" << from << ": " << *it << dendl;
}
} else {
// adjust prior?
if (!require_same_or_newer_map(m, m->get_epoch())) return;
if (pg_map.count(pgid) == 0) {
- dout(10) << "handle_pg_log don't have pg " << pgid << ", dropping" << endl;
+ dout(10) << "handle_pg_log don't have pg " << pgid << ", dropping" << dendl;
assert(m->get_epoch() < osdmap->get_epoch());
delete m;
return;
dout(10) << "handle_pg_log " << *pg
<< " from " << m->get_source()
<< " is old, discarding"
- << endl;
+ << dendl;
delete m;
return;
}
dout(7) << "handle_pg_log " << *pg
<< " got " << m->log << " " << m->missing
- << " from " << m->get_source() << endl;
+ << " from " << m->get_source() << dendl;
//m->log.print(cout);
} else {
// i am REPLICA
- dout(10) << *pg << " got " << m->log << " " << m->missing << endl;
+ dout(10) << *pg << " got " << m->log << " " << m->missing << dendl;
// merge log
pg->merge_log(m->log, m->missing, from);
*/
void OSD::handle_pg_query(MOSDPGQuery *m)
{
- dout(7) << "handle_pg_query from " << m->get_source() << " epoch " << m->get_epoch() << endl;
+ dout(7) << "handle_pg_query from " << m->get_source() << " epoch " << m->get_epoch() << dendl;
int from = m->get_source().num();
if (!require_same_or_newer_map(m, m->get_epoch())) return;
if (m->get_epoch() < history.same_since) {
dout(10) << " pg " << pgid << " dne, and pg has changed in "
- << history.same_primary_since << " (msg from " << m->get_epoch() << ")" << endl;
+ << history.same_primary_since << " (msg from " << m->get_epoch() << ")" << dendl;
continue;
}
int role = osdmap->calc_pg_role(whoami, acting, nrep);
if (role < 0) {
- dout(10) << " pg " << pgid << " dne, and i am not an active replica" << endl;
+ dout(10) << " pg " << pgid << " dne, and i am not an active replica" << dendl;
PG::Info empty(pgid);
notify_list[from].push_back(empty);
continue;
t.collection_setattr(pgid, "info", (char*)&pg->info, sizeof(pg->info));
store->apply_transaction(t);
- dout(10) << *pg << " dne (before), but i am role " << role << endl;
+ dout(10) << *pg << " dne (before), but i am role " << role << dendl;
} else {
pg = _lock_pg(pgid);
if (m->get_epoch() < pg->info.history.same_since) {
dout(10) << *pg << " handle_pg_query primary changed in "
<< pg->info.history.same_since
- << " (msg from " << m->get_epoch() << ")" << endl;
+ << " (msg from " << m->get_epoch() << ")" << dendl;
_unlock_pg(pgid);
continue;
}
if (it->second.type == PG::Query::INFO) {
// info
- dout(10) << *pg << " sending info" << endl;
+ dout(10) << *pg << " sending info" << dendl;
notify_list[from].push_back(pg->info);
} else {
MOSDPGLog *m = new MOSDPGLog(osdmap->get_epoch(), pg->get_pgid());
if (it->second.type == PG::Query::LOG) {
dout(10) << *pg << " sending info+missing+log since split " << it->second.split
<< " from floor " << it->second.floor
- << endl;
+ << dendl;
if (!m->log.copy_after_unless_divergent(pg->log, it->second.split, it->second.floor)) {
- dout(10) << *pg << " divergent, sending backlog" << endl;
+ dout(10) << *pg << " divergent, sending backlog" << dendl;
it->second.type = PG::Query::BACKLOG;
}
}
if (it->second.type == PG::Query::BACKLOG) {
- dout(10) << *pg << " sending info+missing+backlog" << endl;
+ dout(10) << *pg << " sending info+missing+backlog" << dendl;
if (pg->log.backlog) {
m->log = pg->log;
} else {
}
}
else if (it->second.type == PG::Query::FULLLOG) {
- dout(10) << *pg << " sending info+missing+full log" << endl;
+ dout(10) << *pg << " sending info+missing+full log" << dendl;
m->log.copy_non_backlog(pg->log);
}
- dout(10) << *pg << " sending " << m->log << " " << m->missing << endl;
+ dout(10) << *pg << " sending " << m->log << " " << m->missing << dendl;
//m->log.print(cout);
_share_map_outgoing(osdmap->get_inst(from));
void OSD::handle_pg_remove(MOSDPGRemove *m)
{
- dout(7) << "handle_pg_remove from " << m->get_source() << endl;
+ dout(7) << "handle_pg_remove from " << m->get_source() << dendl;
if (!require_same_or_newer_map(m, m->get_epoch())) return;
PG *pg;
if (pg_map.count(pgid) == 0) {
- dout(10) << " don't have pg " << pgid << endl;
+ dout(10) << " don't have pg " << pgid << dendl;
continue;
}
pg = _lock_pg(pgid);
- dout(10) << *pg << " removing." << endl;
+ dout(10) << *pg << " removing." << dendl;
assert(pg->get_role() == -1);
_remove_unlock_pg(pg);
if (!pg) {
dout(7) << "hit non-existent pg "
<< pgid
- << ", waiting" << endl;
+ << ", waiting" << dendl;
waiting_for_pg[pgid].push_back(op);
_unlock_pg(pgid);
return;
if (read && !pg->same_for_read_since(op->get_map_epoch())) {
dout(7) << "handle_rep_op pg changed " << pg->info.history
<< " after " << op->get_map_epoch()
- << ", dropping" << endl;
+ << ", dropping" << dendl;
assert(op->get_map_epoch() < osdmap->get_epoch());
_unlock_pg(pgid);
delete op;
!pg->same_for_modify_since(op->get_map_epoch()))) {
dout(7) << "handle_rep_op pg changed " << pg->info.history
<< " after " << op->get_map_epoch()
- << ", dropping" << endl;
+ << ", dropping" << dendl;
assert(op->get_map_epoch() < osdmap->get_epoch());
_unlock_pg(pgid);
delete op;
if (op->get_version().version > 0) {
if (op->get_version() > pg->info.last_update) {
dout(7) << *pg << " queueing replay at " << op->get_version()
- << " for " << *op << endl;
+ << " for " << *op << dendl;
pg->replay_queue[op->get_version()] = op;
_unlock_pg(pgid);
return;
} else {
dout(7) << *pg << " replay at " << op->get_version() << " <= " << pg->info.last_update
<< " for " << *op
- << ", will queue for WRNOOP" << endl;
+ << ", will queue for WRNOOP" << dendl;
}
}
- dout(7) << *pg << " not active (yet)" << endl;
+ dout(7) << *pg << " not active (yet)" << dendl;
pg->waiting_for_active.push_back(op);
_unlock_pg(pgid);
return;
dout(10) << "handle_op read on " << op->get_oid()
<< ", have " << loid
<< ", but need missing " << moid
- << ", pulling" << endl;
+ << ", pulling" << dendl;
pull(pg, moid);
pg->waiting_for_missing_object[moid].push_back(op);
return;
dout(10) << "handle_op read on " << op->get_oid()
<< ", have " << loid
<< ", don't need missing " << moid
- << endl;
+ << dendl;
}
} else {
// live revision. easy.
}
*/
- dout(7) << "handle_op " << *op << " in " << *pg << endl;
+ dout(7) << "handle_op " << *op << " in " << *pg << dendl;
// balance reads?
if (false) {
if (pg->acting.size() > 1) {
int peer = pg->acting[1];
- dout(-10) << "fwd client read op to osd" << peer << " for " << op->get_client() << " " << op->get_client_inst() << endl;
+ dout(-10) << "fwd client read op to osd" << peer << " for " << op->get_client() << " " << op->get_client_inst() << dendl;
messenger->send_message(op, osdmap->get_inst(peer));
_unlock_pg(pgid);
return;
<< ", p=" << p
<< ", fwd to peer w/ qlen " << peer_qlen[peer]
<< " osd" << peer
- << endl;
+ << dendl;
messenger->send_message(op, osdmap->get_inst(peer));
_unlock_pg(pgid);
return;
// have pg?
if (!pg) {
derr(-7) << "handle_rep_op " << *op
- << " pgid " << pgid << " dne" << endl;
+ << " pgid " << pgid << " dne" << dendl;
delete op;
//assert(0); // wtf, shouldn't happen.
return;
if (!pg->same_for_rep_modify_since(op->get_map_epoch())) {
dout(10) << "handle_rep_op pg changed " << pg->info.history
<< " after " << op->get_map_epoch()
- << ", dropping" << endl;
+ << ", dropping" << dendl;
_unlock_pg(pgid);
delete op;
return;
}
assert(pg->get_role() >= 0);
- dout(7) << "handle_rep_op " << op << " in " << *pg << endl;
+ dout(7) << "handle_rep_op " << op << " in " << *pg << dendl;
}
if (g_conf.osd_maxthreads < 1) {
void OSD::handle_op_reply(MOSDOpReply *op)
{
if (op->get_map_epoch() < boot_epoch) {
- dout(3) << "replica op reply from before boot" << endl;
+ dout(3) << "replica op reply from before boot" << dendl;
delete op;
return;
}
void OSD::enqueue_op(pg_t pgid, Message *op)
{
while (pending_ops > g_conf.osd_max_opq) {
- dout(10) << "enqueue_op waiting for pending_ops " << pending_ops << " to drop to " << g_conf.osd_max_opq << endl;
+ dout(10) << "enqueue_op waiting for pending_ops " << pending_ops << " to drop to " << g_conf.osd_max_opq << dendl;
op_queue_cond.Wait(osd_lock);
}
if (pgid) {
dout(10) << "dequeue_op " << op << " write pg " << pgid
- << ls.size() << " / " << (pending_ops-1) << " more pending" << endl;
+ << ls.size() << " / " << (pending_ops-1) << " more pending" << dendl;
} else {
dout(10) << "dequeue_op " << op << " read "
- << ls.size() << " / " << (pending_ops-1) << " more pending" << endl;
+ << ls.size() << " / " << (pending_ops-1) << " more pending" << dendl;
}
if (ls.empty())
_unlock_pg(pgid);
}
- dout(10) << "dequeue_op " << op << " finish" << endl;
+ dout(10) << "dequeue_op " << op << " finish" << dendl;
assert(pending_ops > 0);
if (pending_ops > g_conf.osd_max_opq)
void OSD::wait_for_no_ops()
{
if (pending_ops > 0) {
- dout(7) << "wait_for_no_ops - waiting for " << pending_ops << endl;
+ dout(7) << "wait_for_no_ops - waiting for " << pending_ops << dendl;
waiting_for_no_ops = true;
while (pending_ops > 0)
no_pending_ops.Wait(osd_lock);
waiting_for_no_ops = false;
assert(pending_ops == 0);
}
- dout(7) << "wait_for_no_ops - none" << endl;
+ dout(7) << "wait_for_no_ops - none" << dendl;
}
entity_name_t source;
int len = store->getattr(oid, "wrlock", &source, sizeof(entity_name_t));
- //cout << "getattr returns " << len << " on " << oid << endl;
+ //cout << "getattr returns " << len << " on " << oid << dendl;
if (len == sizeof(source) &&
source != op->get_client()) {
#include "ObjectStore.h"
#include "PG.h"
-#include "messages/MOSDOp.h"
-
#include <map>
using namespace std;
class ObjectStore;
class OSDMap;
-
class OSD : public Dispatcher {
public:
// -- states --
// -- object locking --
hash_map<object_t, list<Message*> > waiting_for_wr_unlock;
- bool block_if_wrlocked(MOSDOp* op);
+ bool block_if_wrlocked(class MOSDOp* op);
// -- op queue --
#include "common/Timer.h"
+#include "messages/MOSDOp.h"
#include "messages/MOSDPGNotify.h"
#include "messages/MOSDPGLog.h"
#include "messages/MOSDPGRemove.h"
#undef dout
-#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_osd) cout << g_clock.now() << " osd" << osd->whoami << " " << (osd->osdmap ? osd->osdmap->get_epoch():0) << " " << *this << " "
+#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_osd) cout << dbeginl << g_clock.now() << " osd" << osd->whoami << " " << (osd->osdmap ? osd->osdmap->get_epoch():0) << " " << *this << " "
/******* PGLog ********/
void PG::trim_write_ahead()
{
if (info.last_update < log.top) {
- dout(10) << "trim_write_ahead (" << info.last_update << "," << log.top << "]" << endl;
+ dout(10) << "trim_write_ahead (" << info.last_update << "," << log.top << "]" << dendl;
log.trim_write_ahead(info.last_update);
} else {
assert(info.last_update == log.top);
- dout(10) << "trim_write_ahead last_update=top=" << info.last_update << endl;
+ dout(10) << "trim_write_ahead last_update=top=" << info.last_update << dendl;
}
}
void PG::proc_replica_log(Log &olog, Missing& omissing, int from)
{
- dout(10) << "proc_replica_log for osd" << from << ": " << olog << " " << omissing << endl;
+ dout(10) << "proc_replica_log for osd" << from << ": " << olog << " " << omissing << dendl;
assert(!is_active());
if (!have_master_log) {
eversion_t lu = peer_info[from].last_update;
while (pp != olog.log.rend()) {
if (!log.objects.count(pp->oid)) {
- dout(10) << " divergent " << *pp << " not in our log, generating backlog" << endl;
+ dout(10) << " divergent " << *pp << " not in our log, generating backlog" << dendl;
generate_backlog();
}
if (!log.objects.count(pp->oid)) {
- dout(10) << " divergent " << *pp << " dne, must have been new, ignoring" << endl;
+ dout(10) << " divergent " << *pp << " dne, must have been new, ignoring" << dendl;
++pp;
continue;
}
if (log.objects[pp->oid]->version > pp->version) {
dout(10) << " divergent " << *pp
<< " superceded by " << log.objects[pp->oid]
- << ", ignoring" << endl;
+ << ", ignoring" << dendl;
} else {
- dout(10) << " divergent " << *pp << ", adding to missing" << endl;
+ dout(10) << " divergent " << *pp << ", adding to missing" << dendl;
peer_missing[from].add(pp->oid, pp->version);
}
}
if (lu < peer_info[from].last_update) {
- dout(10) << " peer osd" << from << " last_update now " << lu << endl;
+ dout(10) << " peer osd" << from << " last_update now " << lu << dendl;
peer_info[from].last_update = lu;
if (lu < oldest_update) {
- dout(10) << " oldest_update now " << lu << endl;
+ dout(10) << " oldest_update now " << lu << dendl;
oldest_update = lu;
}
}
void PG::merge_log(Log &olog, Missing &omissing, int fromosd)
{
dout(10) << "merge_log " << olog << " from osd" << fromosd
- << " into " << log << endl;
+ << " into " << log << dendl;
- //cout << "log" << endl;
+ //cout << "log" << dendl;
//log.print(cout);
- //cout << "olog" << endl;
+ //cout << "olog" << dendl;
//olog.print(cout);
if (log.empty() ||
// was our old log divergent?
if (log.top > p->version) {
- dout(10) << "merge_log i was (possibly) divergent for (" << p->version << "," << log.top << "]" << endl;
+ dout(10) << "merge_log i was (possibly) divergent for (" << p->version << "," << log.top << "]" << dendl;
if (p->version < oldest_update)
oldest_update = p->version;
if (log.objects[oe.oid]->version < oe.version) {
dout(10) << "merge_log divergent entry " << oe
<< " not superceded by " << *log.objects[oe.oid]
- << ", adding to missing" << endl;
+ << ", adding to missing" << dendl;
missing.add(oe.oid, oe.version);
} else {
dout(10) << "merge_log divergent entry " << oe
<< " superceded by " << *log.objects[oe.oid]
- << ", ignoring" << endl;
+ << ", ignoring" << dendl;
}
} else {
- dout(10) << "merge_log divergent entry " << oe << ", adding to missing" << endl;
+ dout(10) << "merge_log divergent entry " << oe << ", adding to missing" << dendl;
missing.add(oe.oid, oe.version);
}
olog.log.pop_back(); // discard divergent entry
}
if (p->is_delete()) {
- dout(10) << "merge_log merging " << *p << ", not missing" << endl;
+ dout(10) << "merge_log merging " << *p << ", not missing" << dendl;
missing.rm(p->oid, p->version);
} else {
- dout(10) << "merge_log merging " << *p << ", now missing" << endl;
+ dout(10) << "merge_log merging " << *p << ", now missing" << dendl;
missing.add(p->oid, p->version);
}
}
if (olog.bottom < log.bottom && olog.top >= log.bottom && !log.backlog) {
dout(10) << "merge_log extending bottom to " << olog.bottom
<< (olog.backlog ? " +backlog":"")
- << endl;
+ << dendl;
// ok
list<Log::Entry>::iterator from = olog.log.begin();
// update our index while we're here
log.index(*to);
- dout(15) << *to << endl;
+ dout(15) << *to << dendl;
// new missing object?
if (to->version > info.last_complete) {
// extend on top?
if (olog.top > log.top &&
olog.bottom <= log.top) {
- dout(10) << "merge_log extending top to " << olog.top << endl;
+ dout(10) << "merge_log extending top to " << olog.top << dendl;
list<Log::Entry>::iterator to = olog.log.end();
list<Log::Entry>::iterator from = olog.log.end();
while (1) {
if (from == olog.log.begin()) break;
from--;
- //dout(0) << "? " << *from << endl;
+ //dout(0) << "? " << *from << dendl;
if (from->version < log.top) {
from++;
break;
}
log.index(*from);
- dout(10) << "merge_log " << *from << endl;
+ dout(10) << "merge_log " << *from << dendl;
// add to missing
if (from->is_update()) {
if (log.objects[oldtail->oid]->version == oldtail->version) {
// and significant.
- dout(10) << "merge_log had divergent " << *oldtail << ", adding to missing" << endl;
+ dout(10) << "merge_log had divergent " << *oldtail << ", adding to missing" << dendl;
//missing.add(oldtail->oid);
assert(0);
} else {
- dout(10) << "merge_log had divergent " << *oldtail << ", already missing" << endl;
+ dout(10) << "merge_log had divergent " << *oldtail << ", already missing" << dendl;
assert(missing.is_missing(oldtail->oid));
}
log.log.pop_back();
}
}
- dout(10) << "merge_log result " << log << " " << missing << endl;
+ dout(10) << "merge_log result " << log << " " << missing << dendl;
//log.print(cout);
}
assert(omissing.is_missing(p->first, p->second));
if (omissing.loc.count(p->first)) {
dout(10) << "proc_missing missing " << p->first << " " << p->second
- << " on osd" << omissing.loc[p->first] << endl;
+ << " on osd" << omissing.loc[p->first] << dendl;
missing.loc[p->first] = omissing.loc[p->first];
} else {
dout(10) << "proc_missing missing " << p->first << " " << p->second
- << " also LOST on source, osd" << fromosd << endl;
+ << " also LOST on source, osd" << fromosd << dendl;
}
}
else if (p->second <= olog.top) {
dout(10) << "proc_missing missing " << p->first << " " << p->second
- << " on source, osd" << fromosd << endl;
+ << " on source, osd" << fromosd << dendl;
missing.loc[p->first] = fromosd;
} else {
dout(10) << "proc_missing " << p->first << " " << p->second
<< " > olog.top " << olog.top << ", not found...."
- << endl;
+ << dendl;
}
}
- dout(10) << "proc_missing missing " << missing.missing << endl;
+ dout(10) << "proc_missing missing " << missing.missing << dendl;
}
void PG::generate_backlog()
{
- dout(10) << "generate_backlog to " << log << endl;
+ dout(10) << "generate_backlog to " << log << dendl;
assert(!log.backlog);
log.backlog = true;
"version",
&e.version, sizeof(e.version));
add[e.version] = e;
- dout(10) << "generate_backlog found " << e << endl;
+ dout(10) << "generate_backlog found " << e << dendl;
}
for (map<eversion_t,Log::Entry>::reverse_iterator i = add.rbegin();
dout(10) << local << " local objects, "
<< add.size() << " objects added to backlog, "
- << log.objects.size() << " in pg" << endl;
+ << log.objects.size() << " in pg" << dendl;
//log.print(cout);
}
void PG::drop_backlog()
{
- dout(10) << "drop_backlog for " << log << endl;
+ dout(10) << "drop_backlog for " << log << dendl;
//log.print(cout);
assert(log.backlog);
Log::Entry &e = *log.log.begin();
if (e.version > log.bottom) break;
- dout(15) << "drop_backlog trimming " << e.version << endl;
+ dout(15) << "drop_backlog trimming " << e.version << dendl;
log.unindex(e);
log.log.pop_front();
}
ostream& PG::Log::print(ostream& out) const
{
+ // NOTE(review): print() targets an arbitrary caller-supplied ostream, not the
+ // dout logging stream. dendl is the dout terminator (paired with dbeginl in
+ // the dout macro), so the mechanical endl->dendl sweep does not apply here;
+ // keep plain std::endl.
- out << *this << endl;
+ out << *this << std::endl;
for (list<Entry>::const_iterator p = log.begin();
p != log.end();
p++)
- out << *p << endl;
+ out << *p << std::endl;
return out;
}
omap.pg_to_acting_osds(get_pgid(), acting);
for (unsigned i=0; i<acting.size(); i++) {
- //dout(10) << "build prior considering epoch " << epoch << " osd" << acting[i] << endl;
+ //dout(10) << "build prior considering epoch " << epoch << " osd" << acting[i] << dendl;
if (osd->osdmap->is_up(acting[i]) && // is up now
acting[i] != osd->whoami) // and is not me
prior_set.insert(acting[i]);
}
}
- dout(10) << "build_prior built " << prior_set << endl;
+ dout(10) << "build_prior built " << prior_set << dendl;
}
void PG::adjust_prior()
}
dout(10) << "adjust_prior last_epoch_started_any "
- << last_epoch_started_any << " -> " << max << endl;
+ << last_epoch_started_any << " -> " << max << dendl;
assert(max > last_epoch_started_any);
last_epoch_started_any = max;
void PG::clear_primary_state()
{
- dout(10) << "clear_primary_state" << endl;
+ dout(10) << "clear_primary_state" << dendl;
// clear peering state
have_master_log = false;
map< int, map<pg_t,Query> >& query_map)
{
dout(10) << "peer. acting is " << acting
- << ", prior_set is " << prior_set << endl;
+ << ", prior_set is " << prior_set << dendl;
/** GET ALL PG::Info *********/
if (peer_info.count(*it)) {
dout(10) << " have info from osd" << *it
<< ": " << peer_info[*it]
- << endl;
+ << dendl;
continue;
}
missing_info = true;
if (peer_info_requested.count(*it)) {
- dout(10) << " waiting for osd" << *it << endl;
+ dout(10) << " waiting for osd" << *it << dendl;
continue;
}
- dout(10) << " querying info from osd" << *it << endl;
+ dout(10) << " querying info from osd" << *it << dendl;
query_map[*it][info.pgid] = Query(Query::INFO, info.history);
peer_info_requested.insert(*it);
}
// -- ok, we have all (prior_set) info. (and maybe others.)
// did we crash?
- dout(10) << " last_epoch_started_any " << last_epoch_started_any << endl;
+ dout(10) << " last_epoch_started_any " << last_epoch_started_any << dendl;
if (last_epoch_started_any) {
OSDMap omap;
osd->get_map(last_epoch_started_any, omap);
for (set<int>::iterator i = last_started.begin();
i != last_started.end();
i++) {
- //dout(10) << " down in epoch " << e << " is " << omap.get_down_osds() << endl;
+ //dout(10) << " down in epoch " << e << " is " << omap.get_down_osds() << dendl;
if (omap.is_up(*i))
still_up.insert(*i);
}
last_started.swap(still_up);
- //dout(10) << " still active as of epoch " << e << ": " << last_started << endl;
+ //dout(10) << " still active as of epoch " << e << ": " << last_started << dendl;
}
if (last_started.empty()) {
- dout(10) << " crashed since epoch " << last_epoch_started_any << endl;
+ dout(10) << " crashed since epoch " << last_epoch_started_any << dendl;
state_set(STATE_CRASHED);
} else {
- dout(10) << " still active from last started: " << last_started << endl;
+ dout(10) << " still active from last started: " << last_started << dendl;
}
} else if (osd->osdmap->get_epoch() > 1) {
- dout(10) << " crashed since epoch " << last_epoch_started_any << endl;
+ dout(10) << " crashed since epoch " << last_epoch_started_any << dendl;
state_set(STATE_CRASHED);
}
- dout(10) << " peers_complete_thru " << peers_complete_thru << endl;
+ dout(10) << " peers_complete_thru " << peers_complete_thru << dendl;
dout(10) << " newest update on osd" << newest_update_osd
<< " v " << newest_update
<< ", already queried"
- << endl;
+ << dendl;
} else {
// we'd like it back to oldest_update, but will settle for log_bottom
eversion_t since = MAX(peer_info[newest_update_osd].log_bottom,
dout(10) << " newest update on osd" << newest_update_osd
<< " v " << newest_update
<< ", querying since " << since
- << endl;
+ << dendl;
query_map[newest_update_osd][info.pgid] = Query(Query::LOG, log.top, since, info.history);
peer_log_requested.insert(newest_update_osd);
} else {
dout(10) << " newest update on osd" << newest_update_osd
<< " v " << newest_update
<< ", querying entire summary/backlog"
- << endl;
+ << dendl;
assert((peer_info[newest_update_osd].last_complete >=
peer_info[newest_update_osd].log_bottom) ||
peer_info[newest_update_osd].log_backlog); // or else we're in trouble.
}
return;
} else {
- dout(10) << " newest_update " << info.last_update << " (me)" << endl;
+ dout(10) << " newest_update " << info.last_update << " (me)" << dendl;
}
- dout(10) << " oldest_update " << oldest_update << endl;
+ dout(10) << " oldest_update " << oldest_update << dendl;
have_master_log = true;
if (oldest_update < log.bottom && !log.backlog) {
dout(10) << "generating backlog for some peers, bottom "
<< log.bottom << " > " << oldest_update
- << endl;
+ << dendl;
generate_backlog();
}
peer_summary_requested.count(peer)) continue;
dout(10) << " pulling log+missing from osd" << peer
- << endl;
+ << dendl;
query_map[peer][info.pgid] = Query(Query::FULLLOG, info.history);
peer_log_requested.insert(peer);
}
if (peer_info[peer].is_empty()) continue;
if (peer_missing.count(peer)) continue;
- dout(10) << " waiting for log+missing from osd" << peer << endl;
+ dout(10) << " waiting for log+missing from osd" << peer << dendl;
have_missing = false;
}
if (!have_missing) return;
- dout(10) << " peers_complete_thru " << peers_complete_thru << endl;
+ dout(10) << " peers_complete_thru " << peers_complete_thru << dendl;
// -- ok. and have i located all pg contents?
if (missing.num_lost() > 0) {
- dout(10) << "there are still " << missing.num_lost() << " lost objects" << endl;
+ dout(10) << "there are still " << missing.num_lost() << " lost objects" << dendl;
// *****
// FIXME: i don't think this actually accomplishes anything!
int peer = it->first;
if (peer_summary_requested.count(peer)) {
- dout(10) << " already requested summary/backlog from osd" << peer << endl;
+ dout(10) << " already requested summary/backlog from osd" << peer << dendl;
waiting = true;
continue;
}
- dout(10) << " requesting summary/backlog from osd" << peer << endl;
+ dout(10) << " requesting summary/backlog from osd" << peer << dendl;
query_map[peer][info.pgid] = Query(Query::BACKLOG, info.history);
peer_summary_requested.insert(peer);
waiting = true;
}
if (!waiting) {
- dout(10) << missing.num_lost() << " objects are still lost, waiting+hoping for a notify from someone else!" << endl;
+ dout(10) << missing.num_lost() << " objects are still lost, waiting+hoping for a notify from someone else!" << dendl;
}
return;
}
// -- crash recovery?
if (is_crashed()) {
- dout(10) << "crashed, allowing op replay for " << g_conf.osd_replay_window << endl;
+ dout(10) << "crashed, allowing op replay for " << g_conf.osd_replay_window << dendl;
state_set(STATE_REPLAY);
osd->timer.add_event_after(g_conf.osd_replay_window,
new OSD::C_Activate(osd, info.pgid, osd->osdmap->get_epoch()));
// init complete pointer
if (info.last_complete == info.last_update) {
- dout(10) << "activate - complete" << endl;
+ dout(10) << "activate - complete" << dendl;
log.complete_to == log.log.end();
log.requested_to = log.log.end();
}
//else if (is_primary()) {
else if (true) {
- dout(10) << "activate - not complete, " << missing << ", starting recovery" << endl;
+ dout(10) << "activate - not complete, " << missing << ", starting recovery" << dendl;
// init complete_to
log.complete_to = log.log.begin();
log.requested_to = log.complete_to;
do_recovery();
} else {
- dout(10) << "activate - not complete, " << missing << endl;
+ dout(10) << "activate - not complete, " << missing << dendl;
}
}
dout(10) << "activate sending " << m->log << " " << m->missing
- << " to osd" << peer << endl;
+ << " to osd" << peer << dendl;
//m->log.print(cout);
osd->messenger->send_message(m, osd->osdmap->get_inst(peer));
// update our missing
if (peer_missing[peer].num_missing() == 0) {
- dout(10) << "activate peer osd" << peer << " already clean, " << peer_info[peer] << endl;
+ dout(10) << "activate peer osd" << peer << " already clean, " << peer_info[peer] << dendl;
assert(peer_info[peer].last_complete == info.last_update);
clean_set.insert(peer);
} else {
dout(10) << "activate peer osd" << peer << " " << peer_info[peer]
- << " missing " << peer_missing[peer] << endl;
+ << " missing " << peer_missing[peer] << dendl;
}
}
// all clean?
if (is_all_clean()) {
state_set(STATE_CLEAN);
- dout(10) << "activate all replicas clean" << endl;
+ dout(10) << "activate all replicas clean" << dendl;
clean_replicas();
}
}
p != replay_queue.end();
p++) {
if (p->first <= info.last_update) {
- dout(10) << "activate will WRNOOP " << p->first << " " << *p->second << endl;
+ dout(10) << "activate will WRNOOP " << p->first << " " << *p->second << dendl;
replay.push_back(p->second);
continue;
}
dout(10) << "activate replay " << p->first
<< " skipping " << c.version+1 - p->first.version
<< " ops"
- << endl;
+ << dendl;
}
- dout(10) << "activate replay " << p->first << " " << *p->second << endl;
+ dout(10) << "activate replay " << p->first << " " << *p->second << dendl;
replay.push_back(p->second);
c = p->first;
}
void PG::write_log(ObjectStore::Transaction& t)
{
- dout(10) << "write_log" << endl;
+ dout(10) << "write_log" << dendl;
// assemble buffer
bufferlist bl;
void PG::trim_ondisklog_to(ObjectStore::Transaction& t, eversion_t v)
{
- dout(15) << " trim_ondisk_log_to v " << v << endl;
+ dout(15) << " trim_ondisk_log_to v " << v << dendl;
map<off_t,eversion_t>::iterator p = ondisklog.block_map.begin();
while (p != ondisklog.block_map.end()) {
- dout(15) << " " << p->first << " -> " << p->second << endl;
+ dout(15) << " " << p->first << " -> " << p->second << dendl;
p++;
if (p == ondisklog.block_map.end() ||
p->second > v) { // too far!
break;
}
}
- dout(15) << " * " << p->first << " -> " << p->second << endl;
+ dout(15) << " * " << p->first << " -> " << p->second << dendl;
if (p == ondisklog.block_map.begin())
return; // can't trim anything!
// we can trim!
off_t trim = p->first;
- dout(10) << " trimming ondisklog to [" << ondisklog.bottom << "," << ondisklog.top << ")" << endl;
+ dout(10) << " trimming ondisklog to [" << ondisklog.bottom << "," << ondisklog.top << ")" << dendl;
ondisklog.bottom = trim;
void PG::append_log(ObjectStore::Transaction& t, PG::Log::Entry& logentry,
eversion_t trim_to)
{
- dout(10) << "append_log " << ondisklog.top << " " << logentry << endl;
+ dout(10) << "append_log " << ondisklog.top << " " << logentry << dendl;
// write entry on disk
bufferlist bl;
// trim?
if (trim_to > log.bottom) {
- dout(10) << " trimming " << log << " to " << trim_to << endl;
+ dout(10) << " trimming " << log << " to " << trim_to << dendl;
log.trim(t, trim_to);
info.log_bottom = log.bottom;
info.log_backlog = log.backlog;
trim_ondisklog_to(t, trim_to);
}
- dout(10) << " ondisklog [" << ondisklog.bottom << "," << ondisklog.top << ")" << endl;
+ dout(10) << " ondisklog [" << ondisklog.bottom << "," << ondisklog.top << ")" << dendl;
}
void PG::read_log(ObjectStore *store)
r = store->collection_getattr(info.pgid, "ondisklog_top", &ondisklog.top, sizeof(ondisklog.top));
assert(r == sizeof(ondisklog.top));
- dout(10) << "read_log [" << ondisklog.bottom << "," << ondisklog.top << ")" << endl;
+ dout(10) << "read_log [" << ondisklog.bottom << "," << ondisklog.top << ")" << dendl;
log.backlog = info.log_backlog;
log.bottom = info.log_bottom;
assert(log.log.empty());
while (pos < ondisklog.top) {
bl.copy(pos-ondisklog.bottom, sizeof(e), (char*)&e);
- dout(10) << "read_log " << pos << " " << e << endl;
+ dout(10) << "read_log " << pos << " " << e << dendl;
if (e.version > log.bottom || log.backlog) { // ignore items below log.bottom
if (pos % 4096 == 0)
ondisklog.block_map[pos] = e.version;
log.log.push_back(e);
} else {
- dout(10) << "read_log ignoring entry at " << pos << endl;
+ dout(10) << "read_log ignoring entry at " << pos << dendl;
}
if (g_conf.osd_pad_pg_log) // pad to 4k, until i fix ebofs reallocation crap. FIXME.
int r = osd->store->getattr(t, "crev", &crev, sizeof(crev));
assert(r >= 0);
if (crev <= oid.rev) {
- dout(10) << "pick_object_rev choosing " << t << " crev " << crev << " for " << oid << endl;
+ dout(10) << "pick_object_rev choosing " << t << " crev " << crev << " for " << oid << dendl;
oid = t;
return true;
}
friend class OSD;
- // [primary|tail]
-
-
- // [primary|replica]
// pg waiters
list<class Message*> waiting_for_active;
hash_map<object_t,
#include "config.h"
#undef dout
-#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_osd) cout << g_clock.now() << " osd" << osd->get_nodeid() << " " << (osd->osdmap ? osd->osdmap->get_epoch():0) << " " << *this << " "
+#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_osd) cout << dbeginl << g_clock.now() << " osd" << osd->get_nodeid() << " " << (osd->osdmap ? osd->osdmap->get_epoch():0) << " " << *this << " "
#include <errno.h>
#include <sys/stat.h>
void RAID4PG::note_failed_osd(int o)
{
- dout(10) << "note_failed_osd osd" << o << endl;
+ dout(10) << "note_failed_osd osd" << o << dendl;
assert(0);
}
void RAID4PG::on_acker_change()
{
- dout(10) << "on_acker_change" << endl;
+ dout(10) << "on_acker_change" << dendl;
assert(0);
}
void RAID4PG::on_role_change()
{
- dout(10) << "on_role_change" << endl;
+ dout(10) << "on_role_change" << dendl;
assert(0);
}
#include "config.h"
#undef dout
-#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_osd) cout << g_clock.now() << " osd" << osd->get_nodeid() << " " << (osd->osdmap ? osd->osdmap->get_epoch():0) << " " << *this << " "
+#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_osd) cout << dbeginl << g_clock.now() << " osd" << osd->get_nodeid() << " " << (osd->osdmap ? osd->osdmap->get_epoch():0) << " " << *this << " "
#include <errno.h>
#include <sys/stat.h>
<< oid
<< " v " << v
<< ", already pulling"
- << endl;
+ << dendl;
} else {
dout(7) << "missing "
<< oid
<< " v " << v
<< ", pulling"
- << endl;
+ << dendl;
pull(oid);
}
waiting_for_missing_object[oid].push_back(op);
*/
void ReplicatedPG::do_op(MOSDOp *op)
{
- //dout(15) << "do_op " << *op << endl;
+ //dout(15) << "do_op " << *op << dendl;
osd->logger->inc("op");
dout(10) << "op_read " << oid
<< " " << op->get_offset() << "~" << op->get_length()
//<< " in " << *pg
- << endl;
+ << dendl;
long r = 0;
bufferlist bl;
reply->set_length(0);
}
- dout(10) << " read got " << r << " / " << op->get_length() << " bytes from obj " << oid << endl;
+ dout(10) << " read got " << r << " / " << op->get_length() << " bytes from obj " << oid << dendl;
// send it
osd->messenger->send_message(reply, op->get_client_inst());
<< " r = " << r
<< " size = " << st.st_size
//<< " in " << *pg
- << endl;
+ << dendl;
MOSDOpReply *reply = new MOSDOpReply(op, r, osd->osdmap->get_epoch(), true);
reply->set_object_size(st.st_size);
dout(10) << "prepare_log_transaction " << op->get_op()
<< " " << cloneentry
- << endl;
+ << dendl;
}
// actual op
dout(10) << "prepare_log_transaction " << op->get_op()
<< " " << logentry
- << endl;
+ << dendl;
// append to log
assert(version > log.top);
log.add(logentry);
assert(log.top == version);
- dout(10) << "prepare_log_transaction appended" << endl;
+ dout(10) << "prepare_log_transaction appended" << dendl;
// write to pg log on disk
append_log(t, logentry, trim_to);
<< " v " << version
<< " crev " << crev
<< " rev " << rev
- << endl;
+ << dendl;
// WRNOOP does nothing.
if (op->get_op() == OSD_OP_WRNOOP)
if (crev && rev && rev > crev) {
object_t noid = oid;
noid.rev = rev;
- dout(10) << "prepare_op_transaction cloning " << oid << " crev " << crev << " to " << noid << endl;
+ dout(10) << "prepare_op_transaction cloning " << oid << " crev " << crev << " to " << noid << dendl;
t.clone(oid, noid);
did_clone = true;
}
}
} else {
// noop?
- dout(10) << "apply_transaction zero on " << oid << ", but dne? stat returns " << r << endl;
+ dout(10) << "apply_transaction zero on " << oid << ", but dne? stat returns " << r << dendl;
}
}
break;
void ReplicatedPG::get_rep_gather(RepGather *repop)
{
//repop->lock.Lock();
- dout(10) << "get_repop " << *repop << endl;
+ dout(10) << "get_repop " << *repop << dendl;
}
void ReplicatedPG::apply_repop(RepGather *repop)
{
- dout(10) << "apply_repop applying update on " << *repop << endl;
+ dout(10) << "apply_repop applying update on " << *repop << dendl;
assert(!repop->applied);
Context *oncommit = new C_OSD_ModifyCommit(this, repop->rep_tid, repop->pg_local_last_complete);
unsigned r = osd->store->apply_transaction(repop->t, oncommit);
if (r)
- dout(-10) << "apply_repop apply transaction return " << r << " on " << *repop << endl;
+ dout(-10) << "apply_repop apply transaction return " << r << " on " << *repop << dendl;
// discard my reference to the buffer
repop->op->get_data().clear();
void ReplicatedPG::put_rep_gather(RepGather *repop)
{
- dout(10) << "put_repop " << *repop << endl;
+ dout(10) << "put_repop " << *repop << dendl;
// commit?
if (repop->can_send_commit() &&
repop->op->wants_commit()) {
// send commit.
MOSDOpReply *reply = new MOSDOpReply(repop->op, 0, osd->osdmap->get_epoch(), true);
- dout(10) << "put_repop sending commit on " << *repop << " " << reply << endl;
+ dout(10) << "put_repop sending commit on " << *repop << " " << reply << dendl;
osd->messenger->send_message(reply, repop->op->get_client_inst());
repop->sent_commit = true;
}
// send ack
MOSDOpReply *reply = new MOSDOpReply(repop->op, 0, osd->osdmap->get_epoch(), false);
- dout(10) << "put_repop sending ack on " << *repop << " " << reply << endl;
+ dout(10) << "put_repop sending ack on " << *repop << " " << reply << dendl;
osd->messenger->send_message(reply, repop->op->get_client_inst());
repop->sent_ack = true;
if (min > peers_complete_thru) {
dout(10) << "put_repop peers_complete_thru "
<< peers_complete_thru << " -> " << min
- << endl;
+ << dendl;
peers_complete_thru = min;
}
}
- dout(10) << "put_repop deleting " << *repop << endl;
+ dout(10) << "put_repop deleting " << *repop << dendl;
//repop->lock.Unlock();
assert(rep_gather.count(repop->rep_tid));
dout(7) << " issue_repop rep_tid " << op->get_rep_tid()
<< " o " << oid
<< " to osd" << dest
- << endl;
+ << dendl;
// forward the write/update/whatever
MOSDOp *wr = new MOSDOp(op->get_client_inst(), op->get_client_inc(), op->get_reqid().tid,
ReplicatedPG::RepGather *ReplicatedPG::new_rep_gather(MOSDOp *op)
{
- dout(10) << "new_rep_gather rep_tid " << op->get_rep_tid() << " on " << *op << endl;
+ dout(10) << "new_rep_gather rep_tid " << op->get_rep_tid() << " on " << *op << dendl;
int whoami = osd->get_nodeid();
RepGather *repop = new RepGather(op, op->get_rep_tid(),
dout(7) << "repop_ack rep_tid " << repop->rep_tid << " op " << *op
<< " result " << result << " commit " << commit << " from osd" << fromosd
- << endl;
+ << dendl;
get_rep_gather(repop);
{
if (rep_gather.count(rep_tid)) {
RepGather *repop = rep_gather[rep_tid];
- dout(10) << "op_modify_commit " << *repop->op << endl;
+ dout(10) << "op_modify_commit " << *repop->op << dendl;
get_rep_gather(repop);
{
assert(repop->waitfor_commit.count(osd->get_nodeid()));
repop->pg_complete_thru[osd->get_nodeid()] = pg_complete_thru;
}
put_rep_gather(repop);
- dout(10) << "op_modify_commit done on " << repop << endl;
+ dout(10) << "op_modify_commit done on " << repop << dendl;
} else {
- dout(10) << "op_modify_commit rep_tid " << rep_tid << " dne" << endl;
+ dout(10) << "op_modify_commit rep_tid " << rep_tid << " dne" << dendl;
}
}
// dup op?
if (is_dup(op->get_reqid())) {
dout(-3) << "op_modify " << opname << " dup op " << op->get_reqid()
- << ", doing WRNOOP" << endl;
+ << ", doing WRNOOP" << dendl;
op->set_op(OSD_OP_WRNOOP);
opname = MOSDOp::get_opname(op->get_op());
}
//<< " crev " << crev
<< " rev " << op->get_rev()
<< " " << op->get_offset() << "~" << op->get_length()
- << endl;
+ << dendl;
// issue replica writes
RepGather *repop = 0;
unsigned r = osd->store->apply_transaction(t, oncommit);
if (r != 0 && // no errors
r != 2) { // or error on collection_add
- cerr << "error applying transaction: r = " << r << endl;
+ cerr << "error applying transaction: r = " << r << dendl;
assert(r == 0);
}
<< " " << oid
<< " v " << nv
<< " " << op->get_offset() << "~" << op->get_length()
- << endl;
+ << dendl;
// we better not be missing this.
assert(!missing.is_missing(oid));
unsigned tr = osd->store->apply_transaction(t, oncommit);
if (tr != 0 && // no errors
tr != 2) { // or error on collection_add
- cerr << "error applying transaction: r = " << tr << endl;
+ cerr << "error applying transaction: r = " << tr << dendl;
assert(tr == 0);
}
}
// send commit.
dout(10) << "rep_modify_commit on op " << *op
<< ", sending commit to osd" << ackerosd
- << endl;
+ << dendl;
MOSDOpReply *commit = new MOSDOpReply(op, 0, osd->osdmap->get_epoch(), true);
commit->set_pg_complete_thru(last_complete);
osd->messenger->send_message(commit, osd->osdmap->get_inst(ackerosd));
dout(7) << "pull " << oid
<< " v " << v
<< " from osd" << fromosd
- << endl;
+ << dendl;
// send op
tid_t tid = osd->get_tid();
dout(7) << "push " << oid << " v " << v
<< " size " << bl.length()
<< " to osd" << dest
- << endl;
+ << dendl;
osd->logger->inc("r_push");
osd->logger->inc("r_pushb", bl.length());
dout(7) << "op_pull " << oid << " v " << op->get_version()
<< " from " << op->get_source()
- << endl;
+ << dendl;
// is a replica asking? are they missing it?
if (is_primary()) {
assert(peer_missing.count(from)); // we had better know this, from the peering process.
if (!peer_missing[from].is_missing(oid)) {
- dout(7) << "op_pull replica isn't actually missing it, we must have already pushed to them" << endl;
+ dout(7) << "op_pull replica isn't actually missing it, we must have already pushed to them" << dendl;
delete op;
return;
}
} else {
// non-primary
if (missing.is_missing(oid)) {
- dout(7) << "op_pull not primary, and missing " << oid << ", ignoring" << endl;
+ dout(7) << "op_pull not primary, and missing " << oid << ", ignoring" << dendl;
delete op;
return;
}
eversion_t v = op->get_version();
if (!is_missing_object(oid)) {
- dout(7) << "op_push not missing " << oid << endl;
+ dout(7) << "op_push not missing " << oid << dendl;
return;
}
<< oid
<< " v " << v
<< " size " << op->get_length() << " " << op->get_data().length()
- << endl;
+ << dendl;
assert(op->get_data().length() == op->get_length());
info.last_complete = log.complete_to->version;
log.complete_to++;
}
- dout(10) << "last_complete now " << info.last_complete << endl;
+ dout(10) << "last_complete now " << info.last_complete << dendl;
// apply to disk!
void ReplicatedPG::note_failed_osd(int o)
{
- dout(10) << "note_failed_osd " << o << endl;
+ dout(10) << "note_failed_osd " << o << dendl;
// do async; repop_ack() may modify pg->repop_gather
list<RepGather*> ls;
for (map<tid_t,RepGather*>::iterator p = rep_gather.begin();
p != rep_gather.end();
p++) {
- //dout(-1) << "checking repop tid " << p->first << endl;
+ //dout(-1) << "checking repop tid " << p->first << dendl;
if (p->second->waitfor_ack.count(o) ||
p->second->waitfor_commit.count(o))
ls.push_back(p->second);
void ReplicatedPG::on_acker_change()
{
- dout(10) << "on_acker_change" << endl;
+ dout(10) << "on_acker_change" << dendl;
// apply repops
for (map<tid_t,RepGather*>::iterator p = rep_gather.begin();
void ReplicatedPG::on_role_change()
{
- dout(10) << "on_role_change" << endl;
+ dout(10) << "on_role_change" << dendl;
// take object waiters
for (hash_map<object_t, list<Message*> >::iterator it = waiting_for_missing_object.begin();
*/
void ReplicatedPG::clean_up_local(ObjectStore::Transaction& t)
{
- dout(10) << "clean_up_local" << endl;
+ dout(10) << "clean_up_local" << dendl;
assert(info.last_update >= log.bottom); // otherwise we need some help!
if (p->is_delete()) {
if (s.count(p->oid)) {
dout(10) << " deleting " << p->oid
- << " when " << p->version << endl;
+ << " when " << p->version << dendl;
t.remove(p->oid);
}
s.erase(p->oid);
for (set<object_t>::iterator i = s.begin();
i != s.end();
i++) {
- dout(10) << " deleting stray " << *i << endl;
+ dout(10) << " deleting stray " << *i << dendl;
t.remove(*i);
}
if (p->is_delete()) {
dout(10) << " deleting " << p->oid
- << " when " << p->version << endl;
+ << " when " << p->version << dendl;
t.remove(p->oid);
} else {
// keep old(+missing) objects, just for kicks.
{
dout(-10) << "do_recovery pulling " << objects_pulling.size() << " in pg, "
<< osd->num_pulling << "/" << g_conf.osd_max_pull << " total"
- << endl;
- dout(10) << "do_recovery " << missing << endl;
+ << dendl;
+ dout(10) << "do_recovery " << missing << dendl;
// can we slow down on this PG?
if (osd->num_pulling >= g_conf.osd_max_pull && !objects_pulling.empty()) {
- dout(-10) << "do_recovery already pulling max, waiting" << endl;
+ dout(-10) << "do_recovery already pulling max, waiting" << dendl;
return true;
}
dout(10) << "do_recovery "
<< *log.requested_to
<< (objects_pulling.count(latest->oid) ? " (pulling)":"")
- << endl;
+ << dendl;
if (latest->is_update() &&
!objects_pulling.count(latest->oid) &&
}
if (!objects_pulling.empty()) {
- dout(7) << "do_recovery requested everything, still waiting" << endl;
+ dout(7) << "do_recovery requested everything, still waiting" << dendl;
return false;
}
if (is_primary()) {
// i am primary
- dout(7) << "do_recovery complete, cleaning strays" << endl;
+ dout(7) << "do_recovery complete, cleaning strays" << dendl;
clean_set.insert(osd->whoami);
if (is_all_clean()) {
state_set(PG::STATE_CLEAN);
}
} else {
// tell primary
- dout(7) << "do_recovery complete, telling primary" << endl;
+ dout(7) << "do_recovery complete, telling primary" << dendl;
list<PG::Info> ls;
ls.push_back(info);
osd->messenger->send_message(new MOSDPGNotify(osd->osdmap->get_epoch(),
void ReplicatedPG::do_peer_recovery()
{
- dout(10) << "do_peer_recovery" << endl;
+ dout(10) << "do_peer_recovery" << dendl;
for (unsigned i=0; i<acting.size(); i++) {
int peer = acting[i];
void ReplicatedPG::clean_replicas()
{
- dout(10) << "clean_replicas. strays are " << stray_set << endl;
+ dout(10) << "clean_replicas. strays are " << stray_set << dendl;
for (set<int>::iterator p = stray_set.begin();
p != stray_set.end();
p++) {
- dout(10) << "sending PGRemove to osd" << *p << endl;
+ dout(10) << "sending PGRemove to osd" << *p << dendl;
set<pg_t> ls;
ls.insert(info.pgid);
MOSDPGRemove *m = new MOSDPGRemove(osd->osdmap->get_epoch(), ls);
#include "config.h"
#undef dout
-#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_filer) cout << g_clock.now() << " " << objecter->messenger->get_myaddr() << ".filer "
+#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_filer) cout << g_clock.now() << " " << objecter->messenger->get_myname() << ".filer "
class Filer::C_Probe : public Context {
#include "config.h"
#undef dout
-#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_objecter) cout << g_clock.now() << " " << objecter->messenger->get_myaddr() << ".journaler "
-#define derr(x) if (x <= g_conf.debug || x <= g_conf.debug_objecter) cerr << g_clock.now() << " " << objecter->messenger->get_myaddr() << ".journaler "
+#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_objecter) cout << g_clock.now() << " " << objecter->messenger->get_myname() << ".journaler "
+#define derr(x) if (x <= g_conf.debug || x <= g_conf.debug_objecter) cerr << g_clock.now() << " " << objecter->messenger->get_myname() << ".journaler "
if (!g_conf.journaler_allow_split_entries) {
// will we span a stripe boundary?
int p = inode.layout.stripe_unit;
- if (write_pos / p != (write_pos + bl.length() + sizeof(s)) / p) {
+ if (write_pos / p != (write_pos + (off_t)(bl.length() + sizeof(s))) / p) {
// yes.
// move write_pos forward.
off_t owp = write_pos;
// start reading some more?
if (!_is_reading()) {
if (s)
- fetch_len = MAX(fetch_len, sizeof(s)+s-read_buf.length());
+ fetch_len = MAX(fetch_len, (off_t)(sizeof(s)+s-read_buf.length()));
_issue_read(fetch_len);
}
/*** ObjectCacher::Object ***/
#undef dout
-#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_objectcacher) cout << oc->objecter->messenger->get_myaddr() << ".objectcacher.object(" << oid << ") "
+#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_objectcacher) cout << g_clock.now() << " " << oc->objecter->messenger->get_myname() << ".objectcacher.object(" << oid << ") "
ObjectCacher::BufferHead *ObjectCacher::Object::split(BufferHead *bh, off_t off)
dout(10) << "merge_left result " << *left << endl;
}
-/* buggy possibly, but more importnatly, unnecessary.
-void ObjectCacher::Object::merge_right(BufferHead *left, BufferHead *right)
-{
- assert(left->end() == right->start());
- assert(left->get_state() == right->get_state());
- dout(10) << "merge_right " << *left << " + " << *right << endl;
- oc->bh_remove(this, left);
- oc->bh_stat_sub(right);
- data.erase(right->start());
- right->set_start( left->start() );
- data[right->start()] = right;
- right->set_length( left->length() + right->length());
- oc->bh_stat_add(right);
-
- // data
- bufferlist nbl;
- nbl.claim(left->bl);
- nbl.claim_append(right->bl);
- right->bl.claim(nbl);
-
- // version
- // note: this is sorta busted, but should only be used for dirty buffers
- right->last_write_tid = MAX( left->last_write_tid, right->last_write_tid );
-
- // waiters
- map<off_t,list<Context*> > old;
- old.swap(right->waitfor_read);
-
- // take left's waiters
- right->waitfor_read.swap(left->waitfor_read);
-
- // shift old waiters
- for (map<off_t, list<Context*> >::iterator p = old.begin();
- p != old.end();
- p++)
- right->waitfor_read[p->first + left->length()].swap( p->second );
-
- // hose left
- delete left;
-
- dout(10) << "merge_right result " << *right << endl;
-}
-*/
/*
* map a range of bytes into buffer_heads.
}
+void ObjectCacher::Object::truncate(off_t s)  // discard cached buffers at or beyond offset s
+{
+ dout(10) << "truncate to " << s << endl;
+
+ while (!data.empty()) {
+ BufferHead *bh = data.rbegin()->second;  // highest-offset buffer still cached
+ if (bh->end() <= s)
+ break;  // everything remaining lies entirely below the truncation point
+
+ // split bh at truncation point?
+ if (bh->start() < s) {  // bh straddles s: keep [start,s); the tail is removed next pass
+ split(bh, s);
+ continue;
+ }
+
+ // remove bh entirely
+ assert(bh->start() >= s);
+ oc->bh_remove(this, bh);  // must also unlink bh from our data map (loop relies on this to terminate)
+ delete bh;
+ }
+}
+
/*** ObjectCacher ***/
#undef dout
-#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_objectcacher) cout << objecter->messenger->get_myaddr() << ".objectcacher "
+#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_objectcacher) cout << g_clock.now() << " " << objecter->messenger->get_myname() << ".objectcacher "
+
/* private */
+void ObjectCacher::close_object(Object *ob)  // drop an empty Object from both indices and free it
+{
+ dout(10) << "close_object " << *ob << endl;
+ assert(ob->can_close());  // caller guarantees no buffers remain on this object
+
+ // ok!
+ objects.erase(ob->get_oid());
+ objects_by_ino[ob->get_ino()].erase(ob);
+ if (objects_by_ino[ob->get_ino()].empty())
+ objects_by_ino.erase(ob->get_ino());  // drop the per-ino set once its last object is gone
+ delete ob;
+}
+
+
+
+
void ObjectCacher::bh_read(BufferHead *bh)
{
dout(7) << "bh_read on " << *bh << endl;
dout(10) << "flush " << amount << endl;
+ /*
+ * NOTE: we aren't actually pulling things off the LRU here, just looking at the
+ * tail item. Then we call bh_write, which moves it to the other LRU, so that we
+ * can call lru_dirty.lru_get_next_expire() again.
+ */
off_t did = 0;
while (amount == 0 || did < amount) {
BufferHead *bh = (BufferHead*) lru_dirty.lru_get_next_expire();
}
dout(10) << "readx result is " << rd->bl->length() << endl;
+ // done with read.
+ delete rd;
+
trim();
return pos;
}
+// purge. non-blocking. violently removes dirty buffers from cache.
+void ObjectCacher::purge(Object *ob)
+{
+ dout(10) << "purge " << *ob << endl;
+
+ // NOTE: bh_remove() unlinks the bh from ob->data (Object::truncate's
+ // termination depends on that), so walking the map with a live iterator
+ // and erasing under it is undefined behavior. Instead, repeatedly pop
+ // the first buffer until the map is empty.
+ while (!ob->data.empty()) {
+ BufferHead *bh = ob->data.begin()->second;
+ if (!bh->is_clean())
+ dout(0) << "purge forcibly removing " << *ob << " " << *bh << endl;  // dirty data is dropped on the floor
+ bh_remove(ob, bh);
+ delete bh;
+ }
+
+ if (ob->can_close()) {
+ dout(10) << "purge trimming " << *ob << endl;
+ close_object(ob);
+ }
+}
+
// flush. non-blocking. no callback.
// true if clean, already flushed.
// false if we wrote something.
return false;
}
+void ObjectCacher::purge_set(inodeno_t ino)  // violently drop all cached buffers for an inode
+{
+ if (objects_by_ino.count(ino) == 0) {
+ dout(10) << "purge_set on " << ino << " dne" << endl;  // nothing cached for this inode
+ return;
+ }
+
+ dout(10) << "purge_set " << ino << endl;
+
+ // Iterate over a COPY of the set: purge() may close_object(), which
+ // erases entries from objects_by_ino[ino] and would invalidate
+ // iterators on a reference (same fix as release_set).
+ set<Object*> s = objects_by_ino[ino];
+ for (set<Object*>::iterator i = s.begin();
+ i != s.end();
+ i++) {
+ Object *ob = *i;
+ purge(ob);
+ }
+}
+
off_t ObjectCacher::release(Object *ob)
{
for (list<BufferHead*>::iterator p = clean.begin();
p != clean.end();
- p++)
+ p++) {
bh_remove(ob, *p);
+ delete *p;
+ }
+
+ if (ob->can_close()) {
+ dout(10) << "trim trimming " << *ob << endl;
+ close_object(ob);
+ }
return o_unclean;
}
dout(10) << "release_set " << ino << endl;
- set<Object*>& s = objects_by_ino[ino];
+ set<Object*> s = objects_by_ino[ino];
for (set<Object*>::iterator i = s.begin();
i != s.end();
i++) {
return unclean;
}
+void ObjectCacher::truncate_set(inodeno_t ino, list<ObjectExtent>& exls)  // trim cached data for ino per the given extents
+{
+ if (objects_by_ino.count(ino) == 0) {
+ dout(10) << "truncate_set on " << ino << " dne" << endl;  // nothing cached for this inode
+ return;
+ }
+
+ dout(10) << "truncate_set " << ino << endl;
+
+ for (list<ObjectExtent>::iterator p = exls.begin();
+ p != exls.end();
+ ++p) {
+ ObjectExtent &ex = *p;
+ if (objects.count(ex.oid) == 0) continue;  // object not in cache; nothing to do
+ Object *ob = objects[ex.oid];
+
+ // purge or truncate?
+ if (ex.start == 0) {
+ dout(10) << "truncate_set purging " << *ob << endl;  // whole object truncated away: remove it entirely
+ purge(ob);
+ } else {
+ // hrm, truncate object
+ dout(10) << "truncate_set truncating " << *ob << " at " << ex.start << endl;
+ ob->truncate(ex.start);  // drop cached data at or beyond ex.start
+
+ if (ob->can_close()) {
+ dout(10) << "truncate_set trimming " << *ob << endl;  // truncate left it empty; close it
+ close_object(ob);
+ }
+ }
+ }
+}
+
void ObjectCacher::kick_sync_writers(inodeno_t ino)
{
last_write_tid(0), last_ack_tid(0), last_commit_tid(0),
lock_state(LOCK_NONE), wrlock_ref(0), rdlock_ref(0)
{}
+ ~Object() {
+ assert(data.empty());
+ }
object_t get_oid() { return oid; }
inodeno_t get_ino() { return ino; }
map<off_t, BufferHead*>& rx);
BufferHead *map_write(Objecter::OSDWrite *wr);
+ void truncate(off_t s);
};
// ******* ObjectCacher *********
objects_by_ino[ino].insert(o);
return o;
}
- void close_object(Object *ob) {
- assert(ob->can_close());
-
- // ok!
- objects.erase(ob->get_oid());
- objects_by_ino[ob->get_ino()].erase(ob);
- if (objects_by_ino[ob->get_ino()].empty())
- objects_by_ino.erase(ob->get_ino());
- delete ob;
- }
+ void close_object(Object *ob);
// bh stats
Cond stat_cond;
void bh_add(Object *ob, BufferHead *bh) {
ob->add_bh(bh);
- if (bh->is_dirty())
+ if (bh->is_dirty()) {
lru_dirty.lru_insert_top(bh);
- else
+ dirty_bh.insert(bh);
+ } else {
lru_rest.lru_insert_top(bh);
+ }
bh_stat_add(bh);
}
void bh_remove(Object *ob, BufferHead *bh) {
ob->remove_bh(bh);
- if (bh->is_dirty())
+ if (bh->is_dirty()) {
lru_dirty.lru_remove(bh);
- else
+ dirty_bh.erase(bh);
+ } else {
lru_rest.lru_remove(bh);
+ }
bh_stat_sub(bh);
}
bool flush(Object *o);
off_t release(Object *o);
+ void purge(Object *o);
void rdlock(Object *o);
void rdunlock(Object *o);
flusher_thread.create();
}
~ObjectCacher() {
- //lock.Lock(); // hmm.. watch out for deadlock!
+ // we should be empty.
+ assert(objects.empty());
+ assert(lru_rest.lru_get_size() == 0);
+ assert(lru_dirty.lru_get_size() == 0);
+ assert(dirty_bh.empty());
+
+ assert(flusher_thread.is_started());
+ lock.Lock(); // hmm.. watch out for deadlock!
flusher_stop = true;
flusher_cond.Signal();
- //lock.Unlock();
+ lock.Unlock();
flusher_thread.join();
}
bool commit_set(inodeno_t ino, Context *oncommit);
void commit_all(Context *oncommit=0);
+ void purge_set(inodeno_t ino);
+
off_t release_set(inodeno_t ino); // returns # of bytes not released (ie non-clean)
+ void truncate_set(inodeno_t ino, list<ObjectExtent>& ex);
+
void kick_sync_writers(inodeno_t ino);
void kick_sync_readers(inodeno_t ino);
#include "config.h"
#undef dout
-#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_objecter) cout << g_clock.now() << " " << messenger->get_myaddr() << ".objecter "
-#define derr(x) if (x <= g_conf.debug || x <= g_conf.debug_objecter) cerr << g_clock.now() << " " << messenger->get_myaddr() << ".objecter "
+#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_objecter) cout << g_clock.now() << " " << messenger->get_myname() << ".objecter "
+#define derr(x) if (x <= g_conf.debug || x <= g_conf.debug_objecter) cerr << g_clock.now() << " " << messenger->get_myname() << ".objecter "
// messages ------------------------------
ObjectExtent &ex = st->extents.front();
PG &pg = get_pg( ex.layout.pgid );
- // send
+ // pick tid
last_tid++;
assert(client_inc >= 0);
- MOSDOp *m = new MOSDOp(messenger->get_myinst(), client_inc, last_tid,
- ex.oid, ex.layout, osdmap->get_epoch(),
- OSD_OP_STAT);
+
+ // add to gather set
+ st->tid = last_tid;
+ op_stat[last_tid] = st;
+
+ pg.active_tids.insert(last_tid);
+
+ // send?
dout(10) << "stat_submit " << st << " tid " << last_tid
<< " oid " << ex.oid
<< " osd" << pg.acker()
<< endl;
- if (pg.acker() >= 0)
+ if (pg.acker() >= 0) {
+ MOSDOp *m = new MOSDOp(messenger->get_myinst(), client_inc, last_tid,
+ ex.oid, ex.layout, osdmap->get_epoch(),
+ OSD_OP_STAT);
+
messenger->send_message(m, osdmap->get_inst(pg.acker()));
+ }
- // add to gather set
- st->tid = last_tid;
- op_stat[last_tid] = st;
-
- pg.active_tids.insert(last_tid);
-
return last_tid;
}
// find OSD
PG &pg = get_pg( ex.layout.pgid );
- // send
+ // pick tid
last_tid++;
assert(client_inc >= 0);
- MOSDOp *m = new MOSDOp(messenger->get_myinst(), client_inc, last_tid,
- ex.oid, ex.layout, osdmap->get_epoch(),
- OSD_OP_READ);
- m->set_length(ex.length);
- m->set_offset(ex.start);
+
+ // add to gather set
+ rd->ops[last_tid] = ex;
+ op_read[last_tid] = rd;
+
+ pg.active_tids.insert(last_tid);
+
+ // send?
dout(10) << "readx_submit " << rd << " tid " << last_tid
<< " oid " << ex.oid << " " << ex.start << "~" << ex.length
<< " (" << ex.buffer_extents.size() << " buffer fragments)"
<< " osd" << pg.acker()
<< endl;
- if (pg.acker() >= 0)
+ if (pg.acker() >= 0) {
+ MOSDOp *m = new MOSDOp(messenger->get_myinst(), client_inc, last_tid,
+ ex.oid, ex.layout, osdmap->get_epoch(),
+ OSD_OP_READ);
+ m->set_length(ex.length);
+ m->set_offset(ex.start);
+
messenger->send_message(m, osdmap->get_inst(pg.acker()));
+ }
- // add to gather set
- rd->ops[last_tid] = ex;
- op_read[last_tid] = rd;
-
- pg.active_tids.insert(last_tid);
-
return last_tid;
}
// find
PG &pg = get_pg( ex.layout.pgid );
- // send
+ // pick tid
tid_t tid;
if (usetid > 0)
tid = usetid;
else
tid = ++last_tid;
- MOSDOp *m = new MOSDOp(messenger->get_myinst(), client_inc, tid,
- ex.oid, ex.layout, osdmap->get_epoch(),
- wr->op);
- m->set_length(ex.length);
- m->set_offset(ex.start);
- m->set_rev(ex.rev);
-
- if (wr->tid_version.count(tid))
- m->set_version(wr->tid_version[tid]); // we're replaying this op!
-
- // what type of op?
- switch (wr->op) {
- case OSD_OP_WRITE:
- {
- // map buffer segments into this extent
- // (may be fragmented bc of striping)
- bufferlist cur;
- for (map<size_t,size_t>::iterator bit = ex.buffer_extents.begin();
- bit != ex.buffer_extents.end();
- bit++) {
- bufferlist thisbit;
- thisbit.substr_of(((OSDWrite*)wr)->bl, bit->first, bit->second);
- cur.claim_append(thisbit);
- }
- assert(cur.length() == ex.length);
- m->set_data(cur);//.claim(cur);
- }
- break;
- }
-
// add to gather set
wr->waitfor_ack[tid] = ex;
wr->waitfor_commit[tid] = ex;
++num_unacked;
++num_uncommitted;
- // send
+ // send?
dout(10) << "modifyx_submit " << MOSDOp::get_opname(wr->op) << " tid " << tid
<< " oid " << ex.oid
<< " " << ex.start << "~" << ex.length
<< " " << ex.layout
<< " osd" << pg.primary()
<< endl;
- if (pg.primary() >= 0)
+ if (pg.primary() >= 0) {
+ MOSDOp *m = new MOSDOp(messenger->get_myinst(), client_inc, tid,
+ ex.oid, ex.layout, osdmap->get_epoch(),
+ wr->op);
+ m->set_length(ex.length);
+ m->set_offset(ex.start);
+ m->set_rev(ex.rev);
+
+ if (wr->tid_version.count(tid))
+ m->set_version(wr->tid_version[tid]); // we're replaying this op!
+
+ // what type of op?
+ switch (wr->op) {
+ case OSD_OP_WRITE:
+ {
+ // map buffer segments into this extent
+ // (may be fragmented bc of striping)
+ bufferlist cur;
+ for (map<size_t,size_t>::iterator bit = ex.buffer_extents.begin();
+ bit != ex.buffer_extents.end();
+ bit++) {
+ bufferlist thisbit;
+ thisbit.substr_of(((OSDWrite*)wr)->bl, bit->first, bit->second);
+ cur.claim_append(thisbit);
+ }
+ assert(cur.length() == ex.length);
+ m->set_data(cur);//.claim(cur);
+ }
+ break;
+ }
+
messenger->send_message(m, osdmap->get_inst(pg.primary()));
+ }
dout(5) << num_unacked << " unacked, " << num_uncommitted << " uncommitted" << endl;
return !(op_read.empty() && op_modify.empty());
}
+ int get_client_incarnation() { return client_inc; }
void set_client_incarnation(int inc) {
client_inc = inc;
}
#$e = './tcpsynobfs' if $h->{'fs'} eq 'obfs';
my $c = "$e";
$c .= " --mkfs" unless $h->{'no_mkfs'};
- $c .= " --$h->{'fs'}";
+ $c .= " --$h->{'fs'}" if $h->{'fs'};
$c .= " --syn until $h->{'until'}" if $h->{'until'};
$c .= " --syn writefile $h->{'writefile_mb'} $h->{'writefile_size'}" if $h->{'writefile'};
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-
-
-#include <sys/stat.h>
-#include <iostream>
-#include <string>
-using namespace std;
-
-#include "config.h"
-
-#include "mds/MDCluster.h"
-#include "mds/MDS.h"
-#include "osd/OSD.h"
-#include "client/Client.h"
-#include "client/fuse.h"
-
-#include "msg/TCPMessenger.h"
-
-#include "common/Timer.h"
-
-#include <envz.h>
-
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-
-int main(int argc, char **argv, char *envp[]) {
-
- //cerr << "tcpfuse starting " << myrank << "/" << world << endl;
- vector<char*> args;
- argv_to_vec(argc, argv, args);
- parse_config_options(args);
-
- // args for fuse
- vec_to_argv(args, argc, argv);
-
- // start up tcpmessenger
- tcpaddr_t nsa;
- if (tcpmessenger_findns(nsa) < 0) exit(1);
- tcpmessenger_init();
- tcpmessenger_start();
- tcpmessenger_start_rankserver(nsa);
-
- Client *client = new Client(new TCPMessenger(MSG_ADDR_CLIENT_NEW));
- client->init();
-
- // start up fuse
- // use my argc, argv (make sure you pass a mount point!)
- cout << "mounting" << endl;
- client->mount();
-
- cerr << "starting fuse on pid " << getpid() << endl;
- ceph_fuse_main(client, argc, argv);
- cerr << "fuse finished on pid " << getpid() << endl;
-
- client->unmount();
- cout << "unmounted" << endl;
- client->shutdown();
-
- delete client;
-
- // wait for it to finish
- tcpmessenger_wait();
- tcpmessenger_shutdown(); // shutdown MPI
-
- return 0;
-}
-
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-
-#include <sys/stat.h>
-#include <iostream>
-#include <string>
-using namespace std;
-
-#include "config.h"
-
-#include "mds/MDCluster.h"
-#include "mds/MDS.h"
-#include "osd/OSD.h"
-#include "mon/Monitor.h"
-#include "client/Client.h"
-#include "client/SyntheticClient.h"
-
-#include "msg/TCPMessenger.h"
-
-#include "common/Timer.h"
-
-#define NUMMDS g_conf.num_mds
-#define NUMOSD g_conf.num_osd
-#define NUMCLIENT g_conf.num_client
-
-class C_Test : public Context {
-public:
- void finish(int r) {
- cout << "C_Test->finish(" << r << ")" << endl;
- }
-};
-
-
-#include "msg/mpistarter.cc"
-
-utime_t tick_start;
-int tick_count = 0;
-
-class C_Tick : public Context {
-public:
- void finish(int) {
- utime_t now = g_clock.now() - tick_start;
- dout(0) << "tick +" << g_conf.tick << " -> " << now << " (" << tick_count << ")" << endl;
- tick_count += g_conf.tick;
- utime_t next = tick_start;
- next.sec_ref() += tick_count;
- g_timer.add_event_at(next, new C_Tick);
- }
-};
-
-class C_Die : public Context {
-public:
- void finish(int) {
- cerr << "die" << endl;
- exit(1);
- }
-};
-
-class C_Debug : public Context {
- public:
- void finish(int) {
- int size = &g_conf.debug_after - &g_conf.debug;
- memcpy((char*)&g_conf.debug, (char*)&g_debug_after_conf.debug, size);
- dout(0) << "debug_after flipping debug settings" << endl;
- }
-};
-
-
-int main(int argc, char **argv)
-{
- vector<char*> args;
- argv_to_vec(argc, argv, args);
-
- parse_config_options(args);
-
- parse_syn_options(args);
-
- if (g_conf.kill_after)
- g_timer.add_event_after(g_conf.kill_after, new C_Die);
- if (g_conf.debug_after)
- g_timer.add_event_after(g_conf.debug_after, new C_Debug);
-
- if (g_conf.tick) {
- tick_start = g_clock.now();
- g_timer.add_event_after(g_conf.tick, new C_Tick);
- }
-
- vector<char*> nargs;
- for (unsigned i=0; i<args.size(); i++) {
- //cout << "a " << args[i] << endl;
- // unknown arg, pass it on.
- nargs.push_back(args[i]);
- }
-
- args = nargs;
- if (!args.empty()) {
- for (unsigned i=0; i<args.size(); i++)
- cerr << "stray arg " << args[i] << endl;
- }
- assert(args.empty());
-
-
- // start up tcp messenger via MPI
- pair<int,int> mpiwho = mpi_bootstrap_tcp(argc, argv);
- int myrank = mpiwho.first;
- int world = mpiwho.second;
-
- int need = 0;
- if (g_conf.tcp_skip_rank0) need++;
- need += NUMMDS;
- need += NUMOSD;
- if (NUMCLIENT) {
- if (!g_conf.tcp_overlay_clients)
- need += 1;
- }
- assert(need <= world);
-
- if (myrank == 0)
- cerr << "nummds " << NUMMDS << " numosd " << NUMOSD << " numclient " << NUMCLIENT << " .. need " << need << ", have " << world << endl;
-
- MDCluster *mdc = new MDCluster(NUMMDS, NUMOSD);
-
-
- char hostname[100];
- gethostname(hostname,100);
- int pid = getpid();
-
- int started = 0;
-
- //if (myrank == 0) g_conf.debug = 20;
-
- // create mon
- if (myrank == 0) {
- Monitor *mon = new Monitor(0, new TCPMessenger(MSG_ADDR_MON(0)));
- mon->init();
- }
-
- // create mds
- MDS *mds[NUMMDS];
- OSD *mdsosd[NUMMDS];
- for (int i=0; i<NUMMDS; i++) {
- if (myrank != g_conf.tcp_skip_rank0+i) continue;
- TCPMessenger *m = new TCPMessenger(MSG_ADDR_MDS(i));
- cerr << "mds" << i << " on tcprank " << tcpmessenger_get_rank() << " " << hostname << "." << pid << endl;
- mds[i] = new MDS(mdc, i, m);
- mds[i]->init();
- started++;
-
- if (g_conf.mds_local_osd) {
- mdsosd[i] = new OSD(i+10000, new TCPMessenger(MSG_ADDR_OSD(i+10000)));
- mdsosd[i]->init();
- }
- }
-
- // create osd
- OSD *osd[NUMOSD];
- for (int i=0; i<NUMOSD; i++) {
- if (myrank != g_conf.tcp_skip_rank0+NUMMDS + i) continue;
- TCPMessenger *m = new TCPMessenger(MSG_ADDR_OSD(i));
- cerr << "osd" << i << " on tcprank " << tcpmessenger_get_rank() << " " << hostname << "." << pid << endl;
- osd[i] = new OSD(i, m);
- osd[i]->init();
- started++;
- }
-
- if (g_conf.tcp_overlay_clients) sleep(5);
-
- // create client
- int skip_osd = NUMOSD;
- if (g_conf.tcp_overlay_clients)
- skip_osd = 0; // put clients with osds too!
- int client_nodes = world - NUMMDS - skip_osd - g_conf.tcp_skip_rank0;
- int clients_per_node = 1;
- if (NUMCLIENT) clients_per_node = (NUMCLIENT-1) / client_nodes + 1;
- set<int> clientlist;
- Client *client[NUMCLIENT];
- SyntheticClient *syn[NUMCLIENT];
- for (int i=0; i<NUMCLIENT; i++) {
- //if (myrank != NUMMDS + NUMOSD + i % client_nodes) continue;
- if (myrank != g_conf.tcp_skip_rank0+NUMMDS + skip_osd + i / clients_per_node) continue;
- clientlist.insert(i);
- client[i] = new Client(new TCPMessenger(MSG_ADDR_CLIENT_NEW));//(i)) );
-
- // logger?
- if (client_logger == 0) {
- char s[80];
- sprintf(s,"clnode.%d", myrank);
- client_logger = new Logger(s, &client_logtype);
-
- client_logtype.add_inc("lsum");
- client_logtype.add_inc("lnum");
- client_logtype.add_inc("lwsum");
- client_logtype.add_inc("lwnum");
- client_logtype.add_inc("lrsum");
- client_logtype.add_inc("lrnum");
- client_logtype.add_inc("trsum");
- client_logtype.add_inc("trnum");
- client_logtype.add_inc("wrlsum");
- client_logtype.add_inc("wrlnum");
- client_logtype.add_inc("lstatsum");
- client_logtype.add_inc("lstatnum");
- client_logtype.add_inc("ldirsum");
- client_logtype.add_inc("ldirnum");
- client_logtype.add_inc("readdir");
- client_logtype.add_inc("stat");
- }
-
- client[i]->init();
- started++;
-
- syn[i] = new SyntheticClient(client[i]);
- }
-
- if (!clientlist.empty()) dout(2) << "i have " << clientlist << endl;
-
- int nclients = 0;
- for (set<int>::iterator it = clientlist.begin();
- it != clientlist.end();
- it++) {
- int i = *it;
-
- //cerr << "starting synthetic client" << i << " on rank " << myrank << endl;
- client[i]->mount();
- syn[i]->start_thread();
-
- nclients++;
- }
- if (nclients) {
- cerr << nclients << " clients on tcprank " << tcpmessenger_get_rank() << " " << hostname << "." << pid << endl;
- }
-
- for (set<int>::iterator it = clientlist.begin();
- it != clientlist.end();
- it++) {
- int i = *it;
-
- // cout << "waiting for synthetic client" << i << " to finish" << endl;
- syn[i]->join_thread();
- delete syn[i];
-
- client[i]->unmount();
- //cout << "client" << i << " unmounted" << endl;
- client[i]->shutdown();
- }
-
-
- if (myrank && !started) {
- //dout(1) << "IDLE" << endl;
- cerr << "idle on tcprank " << tcpmessenger_get_rank() << " " << hostname << "." << pid << endl;
- tcpmessenger_stop_rankserver();
- }
-
- // wait for everything to finish
- tcpmessenger_wait();
-
- if (started) cerr << "tcpsyn finishing" << endl;
-
- tcpmessenger_shutdown();
-
-
- /*
- // cleanup
- for (int i=0; i<NUMMDS; i++) {
- if (myrank != MPI_DEST_TO_RANK(MSG_ADDR_MDS(i),world)) continue;
- delete mds[i];
- }
- for (int i=0; i<NUMOSD; i++) {
- if (myrank != MPI_DEST_TO_RANK(MSG_ADDR_OSD(i),world)) continue;
- delete osd[i];
- }
- for (int i=0; i<NUMCLIENT; i++) {
- if (myrank != MPI_DEST_TO_RANK(MSG_ADDR_CLIENT(i),world)) continue;
- delete client[i];
- }
- */
- delete mdc;
-
-
- return 0;
-}
-
#include <iostream>
#include <cerrno>
+#include <vector>
#include <fcntl.h>
#include <sys/mount.h>
int main (int argc, char **argv)
{
+ vector<char *> args;
char *osd_name = "ebofs";
unsigned object_size = 1024;
unsigned object_count = 1024;
char *mountcmd = "mount /tmp/testos";
char *umountcmd = "umount /tmp/testos";
+ bool ebofs_raw_device = false;
bool inhibit_remount = (getenv("TESTOS_INHIBIT_REMOUNT") != NULL);
if (argc > 1
&& (strcmp (argv[1], "-h") == 0
|| strcmp (argv[1], "-help") == 0
- || strcmp (argv[1], "--help") == 0
- || argc > 6))
+ || strcmp (argv[1], "--help") == 0))
{
cout << "usage: " << argv[0] << " [store [object-size [object-count [iterations [seed]]]]]" << endl;
cout << endl;
exit (0);
}
+ argv_to_vec (argc, argv, args);
+ for (vector<char*>::iterator it = args.begin(); it != args.end();
+ it++)
+ cout << *it << " ";
+ cout << endl;
+ parse_config_options (args);
+ for (vector<char*>::iterator it = args.begin(); it != args.end();
+ it++)
+ cout << *it << " ";
+ cout << endl;
+
+ argc = args.size();
+ if (argc > 0)
+ osd_name = args[0];
if (argc > 1)
- osd_name = argv[1];
+ object_size = (unsigned) atol (args[1]);
if (argc > 2)
- object_size = (unsigned) atol (argv[2]);
+ object_count = (unsigned) atol (args[2]);
if (argc > 3)
- object_count = (unsigned) atol (argv[3]);
+ write_iter = (unsigned) atol (args[3]);
if (argc > 4)
- write_iter = (unsigned) atol (argv[4]);
- if (argc > 5)
- random_seed = (unsigned) atol (argv[5]);
+ random_seed = (unsigned) atol (args[4]);
  // align object size to 'long'
object_size = ((object_size + (sizeof (long) - 1)) / sizeof (long)) * sizeof (long);
strcpy (osd_file, "/tmp/testos/testos.XXXXXX");
mktemp (osd_file);
+ if (strcasecmp (osd_name, "ebofs") == 0)
+ {
+ char *dev_env = getenv ("TESTOS_EBOFS_DEV");
+ if (dev_env != NULL)
+ {
+ // Assume it is a true device.
+ strncpy (osd_file, dev_env, 32);
+ inhibit_remount = true;
+ ebofs_raw_device = true;
+ }
+ }
+
if (!inhibit_remount)
{
if (system (mountcmd) != 0)
ObjectStore *os = NULL;
if (strcasecmp (osd_name, "ebofs") == 0)
{
- FILE *f = fopen (osd_file, "w");
- if (f == NULL)
+ if (!ebofs_raw_device)
{
- cerr << "failed to open " << osd_file << ": " << strerror (errno)
- << endl;
- exit (1);
+ FILE *f = fopen (osd_file, "w");
+ if (f == NULL)
+ {
+ cerr << "failed to open " << osd_file << ": " << strerror (errno)
+ << endl;
+ exit (1);
+ }
+ // 1G file.
+ fseek (f, 1024 * 1024 * 1024, SEEK_SET);
+ fputc ('\0', f);
+ fclose (f);
}
- // 1G file.
- fseek (f, 1024 * 1024 * 1024, SEEK_SET);
- fputc ('\0', f);
- fclose (f);
- // 20K cache
- g_conf.ebofs_bc_size = 5; // times 4K
os = new Ebofs (osd_file);
}
else if (strcasecmp (osd_name, "osbdb") == 0)
{
- char *e = getenv ("OSBDB_FFACTOR");
- if (e != NULL)
- g_conf.bdbstore_ffactor = atol(e);
- e = getenv ("OSBDB_NELEM");
- if (e != NULL)
- g_conf.bdbstore_nelem = atol(e);
- e = getenv ("OSBDB_PAGESIZE");
- if (e != NULL)
- g_conf.bdbstore_pagesize = atol(e);
- g_conf.debug_bdbstore = 1;
- // 20K cache
- g_conf.bdbstore_cachesize = 20 * 1024;
os = new OSBDB (osd_file);
}
else if (strcasecmp (osd_name, "osbdb-btree") == 0)
{
g_conf.bdbstore_btree = true;
- // 20K cache
- g_conf.bdbstore_cachesize = 20 * 1024;
os = new OSBDB (osd_file);
}
else
cerr << "write " << oids[o] << " failed: "
<< strerror (-ret) << endl;
}
+ os->sync();
+
utime_t end = g_clock.now() - begin;
cerr << "Write finished in " << end << endl;
total_write += end;
writes[i] = end;
- os->sync();
os->umount();
sync();
os->mount();
+ // Shuffle the OIDs.
+ for (int j = 0; j < object_count; j++)
+ {
+ int x = random() % object_count;
+ if (x < 0)
+ x = -x;
+ object_t o = oids[j];
+ oids[j] = oids[x];
+ oids[x] = o;
+ }
+
begin = g_clock.now();
for (unsigned o = 0; o < object_count; o++)
{
cerr << "Finished in " << (total_write + total_read) << endl;
- double write_mean = (double) total_write / write_iter;
+ double write_mean = ((double) total_write) / ((double) write_iter);
double write_sd = 0.0;
for (unsigned i = 0; i < write_iter; i++)
{
- double x = (double) writes[i] - write_mean;
+ double x = ((double) writes[i]) - write_mean;
write_sd += x * x;
}
- write_sd = sqrt (write_sd / write_iter);
+ write_sd = sqrt (write_sd / ((double) write_iter));
- double read_mean = (double) total_read / write_iter;
+ double read_mean = ((double) total_read) / ((double) write_iter);
double read_sd = 0.0;
for (unsigned i = 0; i < write_iter; i++)
{
- double x = (double) reads[i] - read_mean;
+ double x = ((double) reads[i]) - read_mean;
write_sd += x * x;
}
- read_sd = sqrt (read_sd / write_iter);
+ read_sd = sqrt (read_sd / ((double) write_iter));
cout << "TESTOS: write " << osd_name << ":" << object_size << ":"
<< object_count << ":" << write_iter << ":" << random_seed
<< " -- " << write_mean << " " << write_sd << endl;
+ cout << "TESTOS: write.raw -- ";
+ for (int i = 0; i < write_iter; i++)
+ cout << ((double) writes[i]) << " ";
+ cout << endl;
+
cout << "TESTOS: read " << osd_name << ":" << object_size << ":"
<< object_count << ":" << write_iter << ":" << random_seed
<< " -- " << read_mean << " " << read_sd << endl;
+ cout << "TESTOS: read.raw -- ";
+ for (int i = 0; i < write_iter; i++)
+ cout << ((double) reads[i]) << " ";
+ cout << endl;
+
unlink (osd_file);
if (!inhibit_remount)
{
--- /dev/null
+/* testosbdb.cc -- test OSBDB.
+ Copyright (C) 2007 Casey Marshall <csm@soe.ucsc.edu> */
+
+
+#include <iostream>
+#include "osbdb/OSBDB.h"
+
+using namespace std;
+
+int
+main (int argc, char **argv)
+{
+ vector<char *> args;
+ argv_to_vec (argc, argv, args);
+ parse_config_options (args);
+
+ g_conf.debug_bdbstore = 10;
+ //g_conf.bdbstore_btree = true;
+ char dbfile[256];
+ strncpy (dbfile, "/tmp/testosbdb/db.XXXXXX", 256);
+ mktemp (dbfile);
+ OSBDB *os = new OSBDB(dbfile);
+ auto_ptr<OSBDB> osPtr (os);
+ os->mkfs();
+ os->mount();
+
+ // Put an object.
+ object_t oid (0xDEADBEEF00000000ULL, 0xFEEDFACE);
+
+ cout << "sizeof oid_t is " << sizeof (oid_t) << endl;
+ cout << "offsetof oid_t.id " << offsetof (oid_t, id) << endl;
+
+ cout << sizeof (object_t) << endl;
+ cout << sizeof (oid.ino) << endl;
+ cout << sizeof (oid.bno) << endl;
+ cout << sizeof (oid.rev) << endl;
+
+ // Shouldn't be there.
+ if (os->exists (oid))
+ {
+ cout << "FAIL: oid shouldn't be there " << oid << endl;
+ }
+
+ // Write an object.
+ char *x = (char *) malloc (1024);
+ memset(x, 0xaa, 1024);
+ bufferptr bp (x, 1024);
+ bufferlist bl;
+ bl.push_back (bp);
+
+ if (os->write (oid, 0L, 1024, bl, NULL) != 1024)
+ {
+ cout << "FAIL: writing object" << endl;
+ }
+
+ os->sync();
+
+ // Should be there.
+ if (!os->exists (oid))
+ {
+ cout << "FAIL: oid should be there: " << oid << endl;
+ }
+
+ memset(x, 0, 1024);
+ if (os->read (oid, 0, 1024, bl) != 1024)
+ {
+ cout << "FAIL: reading object" << endl;
+ }
+
+ for (int i = 0; i < 1024; i++)
+ {
+ if ((x[i] & 0xFF) != 0xaa)
+ {
+ cout << "FAIL: data read out is different" << endl;
+ break;
+ }
+ }
+
+ // Set some attributes
+ if (os->setattr (oid, "alpha", "value", strlen ("value")) != 0)
+ {
+ cout << "FAIL: set attribute" << endl;
+ }
+ if (os->setattr (oid, "beta", "value", strlen ("value")) != 0)
+ {
+ cout << "FAIL: set attribute" << endl;
+ }
+ if (os->setattr (oid, "gamma", "value", strlen ("value")) != 0)
+ {
+ cout << "FAIL: set attribute" << endl;
+ }
+ if (os->setattr (oid, "fred", "value", strlen ("value")) != 0)
+ {
+ cout << "FAIL: set attribute" << endl;
+ }
+
+ char *attrs = (char *) malloc (1024);
+ if (os->listattr (oid, attrs, 1024) != 0)
+ {
+ cout << "FAIL: listing attributes" << endl;
+ }
+ else
+ {
+ char *p = attrs;
+ if (strcmp (p, "alpha") != 0)
+ {
+ cout << "FAIL: should be \"alpha:\" \"" << p << "\"" << endl;
+ }
+ p = p + strlen (p) + 1;
+ if (strcmp (p, "beta") != 0)
+ {
+ cout << "FAIL: should be \"beta:\" \"" << p << "\"" << endl;
+ }
+ p = p + strlen (p) + 1;
+ if (strcmp (p, "fred") != 0)
+ {
+ cout << "FAIL: should be \"fred:\" \"" << p << "\"" << endl;
+ }
+ p = p + strlen (p) + 1;
+ if (strcmp (p, "gamma") != 0)
+ {
+ cout << "FAIL: should be \"gamma:\" \"" << p << "\"" << endl;
+ }
+ }
+
+ char attrvalue[256];
+ memset(attrvalue, 0, sizeof (attrvalue));
+ if (os->getattr (oid, "alpha", attrvalue, sizeof(attrvalue)) < 0)
+ {
+ cout << "FAIL: getattr alpha" << endl;
+ }
+ else if (strncmp ("value", attrvalue, strlen("value")) != 0)
+ {
+ cout << "FAIL: read attribute value differs" << endl;
+ }
+ memset(attrvalue, 0, sizeof (attrvalue));
+ if (os->getattr (oid, "fred", attrvalue, sizeof(attrvalue)) < 0)
+ {
+ cout << "FAIL: getattr fred" << endl;
+ }
+ else if (strncmp ("value", attrvalue, strlen("value")) != 0)
+ {
+ cout << "FAIL: read attribute value differs" << endl;
+ }
+ memset(attrvalue, 0, sizeof (attrvalue));
+ if (os->getattr (oid, "beta", attrvalue, sizeof(attrvalue)) < 0)
+ {
+ cout << "FAIL: getattr beta" << endl;
+ }
+ else if (strncmp ("value", attrvalue, strlen("value")) != 0)
+ {
+ cout << "FAIL: read attribute value differs" << endl;
+ }
+ memset(attrvalue, 0, sizeof (attrvalue));
+ if (os->getattr (oid, "gamma", attrvalue, sizeof(attrvalue)) < 0)
+ {
+ cout << "FAIL: getattr gamma" << endl;
+ }
+ else if (strncmp ("value", attrvalue, strlen("value")) != 0)
+ {
+ cout << "FAIL: read attribute value differs" << endl;
+ }
+
+ if (os->setattr (oid, "alpha", "different", strlen("different")) != 0)
+ cout << "FAIL: setattr overwrite" << endl;
+ memset(attrvalue, 0, sizeof (attrvalue));
+ if (os->getattr (oid, "alpha", attrvalue, sizeof(attrvalue)) < 0)
+ {
+ cout << "FAIL: getattr alpha" << endl;
+ }
+ else if (strncmp ("different", attrvalue, strlen("different")) != 0)
+ {
+ cout << "FAIL: read attribute value differs" << endl;
+ }
+
+ if (os->rmattr (oid, "alpha") != 0)
+ {
+ cout << "FAIL: rmattr alpha" << endl;
+ }
+ if (os->rmattr (oid, "fred") != 0)
+ {
+ cout << "FAIL: rmattr fred" << endl;
+ }
+ if (os->rmattr (oid, "beta") != 0)
+ {
+ cout << "FAIL: rmattr beta" << endl;
+ }
+ if (os->rmattr (oid, "gamma") != 0)
+ {
+ cout << "FAIL: rmattr gamma" << endl;
+ }
+
+ coll_t cid = 0xCAFEBABE;
+ if (os->create_collection (cid) != 0)
+ {
+ cout << "FAIL: create_collection" << endl;
+ }
+ if (os->create_collection (cid + 10) != 0)
+ {
+ cout << "FAIL: create_collection" << endl;
+ }
+ if (os->create_collection (cid + 5) != 0)
+ {
+ cout << "FAIL: create_collection" << endl;
+ }
+ if (os->create_collection (42) != 0)
+ {
+ cout << "FAIL: create_collection" << endl;
+ }
+
+ if (os->collection_add (cid, oid) != 0)
+ {
+ cout << "FAIL: collection_add" << endl;
+ }
+
+ list<coll_t> ls;
+ if (os->list_collections (ls) < 0)
+ {
+ cout << "FAIL: list_collections" << endl;
+ }
+ cout << "collections: ";
+ for (list<coll_t>::iterator it = ls.begin(); it != ls.end(); it++)
+ {
+ cout << *it << ", ";
+ }
+ cout << endl;
+
+ if (os->destroy_collection (0xCAFEBABE + 10) != 0)
+ {
+ cout << "FAIL: destroy_collection" << endl;
+ }
+
+ if (os->destroy_collection (0xCAFEBADE + 10) == 0)
+ {
+ cout << "FAIL: destroy_collection" << endl;
+ }
+
+ object_t oid2 (12345, 12345);
+ for (int i = 0; i < 8; i++)
+ {
+ oid2.rev++;
+ if (os->collection_add (cid, oid2) != 0)
+ {
+ cout << "FAIL: collection_add" << endl;
+ }
+ }
+ for (int i = 0; i < 8; i++)
+ {
+ if (os->collection_remove (cid, oid2) != 0)
+ {
+ cout << "FAIL: collection_remove" << endl;
+ }
+ oid2.rev--;
+ }
+
+ if (os->collection_setattr (cid, "alpha", "value", 5) != 0)
+ cout << "FAIL: collection_setattr" << endl;
+ if (os->collection_setattr (cid, "beta", "value", 5) != 0)
+ cout << "FAIL: collection_setattr" << endl;
+ if (os->collection_setattr (cid, "gamma", "value", 5) != 0)
+ cout << "FAIL: collection_setattr" << endl;
+ if (os->collection_setattr (cid, "fred", "value", 5) != 0)
+ cout << "FAIL: collection_setattr" << endl;
+
+ memset (attrvalue, 0, sizeof (attrvalue));
+ if (os->collection_getattr (cid, "alpha", attrvalue, sizeof (attrvalue)) < 0)
+ cout << "FAIL: collection_getattr" << endl;
+ else if (strncmp (attrvalue, "value", 5) != 0)
+ cout << "FAIL: collection attribute value different" << endl;
+ memset (attrvalue, 0, sizeof (attrvalue));
+ if (os->collection_getattr (cid, "beta", attrvalue, sizeof (attrvalue)) < 0)
+ cout << "FAIL: collection_getattr" << endl;
+ else if (strncmp (attrvalue, "value", 5) != 0)
+ cout << "FAIL: collection attribute value different" << endl;
+ memset (attrvalue, 0, sizeof (attrvalue));
+ if (os->collection_getattr (cid, "gamma", attrvalue, sizeof (attrvalue)) < 0)
+ cout << "FAIL: collection_getattr" << endl;
+ else if (strncmp (attrvalue, "value", 5) != 0)
+ cout << "FAIL: collection attribute value different" << endl;
+ memset (attrvalue, 0, sizeof (attrvalue));
+ if (os->collection_getattr (cid, "fred", attrvalue, sizeof (attrvalue)) < 0)
+ cout << "FAIL: collection_getattr" << endl;
+ else if (strncmp (attrvalue, "value", 5) != 0)
+ cout << "FAIL: collection attribute value different" << endl;
+
+ if (os->collection_setattr (cid, "alpha", "eulavvalue", 10) != 0)
+ cout << "FAIL: collection setattr overwrite" << endl;
+ memset (attrvalue, 0, sizeof (attrvalue));
+ if (os->collection_getattr (cid, "alpha", attrvalue, sizeof (attrvalue)) < 0)
+ cout << "FAIL: collection_getattr" << endl;
+ else if (strncmp (attrvalue, "eulavvalue", 10) != 0)
+ cout << "FAIL: collection attribute value different" << endl;
+ memset (attrvalue, 0, sizeof (attrvalue));
+ if (os->collection_getattr (cid, "beta", attrvalue, sizeof (attrvalue)) < 0)
+ cout << "FAIL: collection_getattr" << endl;
+ else if (strncmp (attrvalue, "value", 5) != 0)
+ cout << "FAIL: collection attribute value different" << endl;
+ memset (attrvalue, 0, sizeof (attrvalue));
+ if (os->collection_getattr (cid, "gamma", attrvalue, sizeof (attrvalue)) < 0)
+ cout << "FAIL: collection_getattr" << endl;
+ else if (strncmp (attrvalue, "value", 5) != 0)
+ cout << "FAIL: collection attribute value different" << endl;
+ memset (attrvalue, 0, sizeof (attrvalue));
+ if (os->collection_getattr (cid, "fred", attrvalue, sizeof (attrvalue)) < 0)
+ cout << "FAIL: collection_getattr" << endl;
+ else if (strncmp (attrvalue, "value", 5) != 0)
+ cout << "FAIL: collection attribute value different" << endl;
+
+ if (os->collection_rmattr (cid, "alpha") != 0)
+ cout << "FAIL: collection_rmattr" << endl;
+ if (os->collection_rmattr (cid, "fred") != 0)
+ cout << "FAIL: collection_rmattr" << endl;
+ if (os->collection_rmattr (cid, "beta") != 0)
+ cout << "FAIL: collection_rmattr" << endl;
+ if (os->collection_rmattr (cid, "gamma") != 0)
+ cout << "FAIL: collection_rmattr" << endl;
+
+ if (os->collection_rmattr (cid, "alpha") == 0)
+ cout << "FAIL: collection_rmattr (nonexistent)" << endl;
+
+ // Truncate the object.
+ if (os->truncate (oid, 512, NULL) != 0)
+ {
+ cout << "FAIL: truncate" << endl;
+ }
+
+ // Expand the object.
+ if (os->truncate (oid, 1200, NULL) != 0)
+ {
+ cout << "FAIL: expand" << endl;
+ }
+
+ // Delete the object.
+ if (os->remove (oid) != 0)
+ {
+ cout << "FAIL: could not remove object" << endl;
+ }
+
+ // Shouldn't be there
+ if (os->exists (oid))
+ {
+ cout << "FAIL: should not be there" << endl;
+ }
+
+ os->sync();
+ exit (0);
+}
--- /dev/null
+# some valgrind suppressions
+# to load these automagically,
+# cat > ~/.valgrindrc
+# --suppressions=valgrind.supp
+# <control-d>
+
+
+# this one makes valgrind shut up about what appears to be a bug in libc's writev.
+{
+ writev uninit bytes thing -sage
+ Memcheck:Param
+ writev(vector[...])
+ fun:writev
+ fun:_ZN11BlockDevice6_writeEijjRN6buffer4listE
+ fun:_ZN11BlockDevice5do_ioEiRSt4listIPNS_6biovecESaIS2_EE
+ fun:_ZN11BlockDevice15io_thread_entryEv
+ fun:_ZN11BlockDevice8IOThread5entryEv
+ fun:_ZN6Thread11_entry_funcEPv
+ fun:start_thread
+ fun:clone
+ obj:*
+ obj:*
+ obj:*
+ obj:*
+}